#!/usr/bin/env python3
"""
Philippine PRC (Professional Regulation Commission) License Information Extraction Script
Purpose:
Extracts structured information from PRC license images using OCR.
Handles various PRC license formats including UMID-style cards.
Why this script exists:
- PRC licenses have complex layouts with many information fields
- Profession-specific details must be extracted
- Both traditional PRC licenses and UMID-style PRC cards must be handled
- Needed for professional verification workflows
Key Features:
- Extracts CRN (Common Reference Number) - 12-digit format
- Processes registration numbers and dates
- Extracts profession information
- Handles GSIS/SSS number extraction
- Supports validity date tracking
Dependencies:
- PaddleOCR: High-accuracy OCR engine (https://github.com/PaddlePaddle/PaddleOCR)
- Pillow (PIL): Image processing (https://pillow.readthedocs.io/)
- requests: HTTP library (https://docs.python-requests.org/)
Usage:
python extract_prc.py "https://example.com/prc_license.jpg"
Output:
JSON with extracted information: crn, registration_number, profession, valid_until, etc.
"""
import sys, json, os, re, requests
from PIL import Image
from io import BytesIO
from datetime import datetime
from contextlib import redirect_stdout, redirect_stderr
# Immediately redirect all output to stderr except for our final JSON
original_stdout = sys.stdout
sys.stdout = sys.stderr
# Suppress PaddleOCR logging and force headless (offscreen) rendering
os.environ['PADDLEOCR_LOG_LEVEL'] = 'ERROR'
os.environ['QT_QPA_PLATFORM'] = 'offscreen'
os.environ['DISPLAY'] = ':99'
# Import PaddleOCR after setting environment variables
from paddleocr import PaddleOCR
def dprint(msg, obj=None):
"""
Debug print function that safely handles object serialization.
Args:
msg (str): Debug message
obj (any): Object to print (optional)
Why this approach:
- Centralized debug logging
- Safe object serialization
- Consistent debug output format
"""
try:
print(f"DEBUG: {msg}" + (f": {obj}" if obj is not None else ""), file=sys.stderr)
except Exception:
pass
def clean_cache():
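    """
    Remove temporary files left over from a previous run: the downloaded
    image, PaddleOCR's intermediate artifacts, and the output/ directory.
    Keeps stale results from leaking into the current request.
    """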
cache_files = ['temp_image.jpg', 'temp_image_ocr_res_img.jpg', 'temp_image_preprocessed_img.jpg', 'temp_image_res.json']
for f in cache_files:
if os.path.exists(f):
os.remove(f)
dprint("Removed cache file", f)
if os.path.exists("output"):
import shutil
shutil.rmtree("output")
dprint("Removed output directory")
def download_image(url, output_path='temp_image.jpg'):
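    """
    Download the image at `url` and save it locally as a JPEG.

    RGBA images are composited onto a white background and other modes are
    converted to RGB so the OCR engine always receives a 3-channel image.
    Returns the local file path (default: temp_image.jpg).
    """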
dprint("Starting download", url)
clean_cache()
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
r = requests.get(url, headers=headers, timeout=30)
dprint("HTTP status", r.status_code)
r.raise_for_status()
img = Image.open(BytesIO(r.content))
if img.mode == 'RGBA':
bg = Image.new('RGB', img.size, (255,255,255))
bg.paste(img, mask=img.split()[-1])
img = bg
elif img.mode != 'RGB':
img = img.convert('RGB')
img.save(output_path, 'JPEG', quality=95)
dprint("Saved image", output_path)
return output_path
def format_date(s):
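    """
    Normalize an OCR-read date string to YYYY-MM-DD.

    Handled inputs (illustrative): "2025-07-31", "2025/07/31", "07/31/2025",
    "July 31, 2025", "Jul 31, 2025". Strings that match no known pattern are
    returned unchanged so callers can still surface the raw value.
    """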
if not s: return None
raw = s.strip()
t = raw.replace(' ', '').replace('\\','/').replace('.','/')
if re.match(r'^\d{4}[-/]\d{2}[-/]\d{2}$', t):
return t.replace('/', '-')
# Accept mm/dd/yyyy style
if re.match(r'^\d{2}/\d{2}/\d{4}$', raw):
m, d, y = raw.split('/')
return f"{y}-{int(m):02d}-{int(d):02d}"
    # Month-name variants, e.g. "January 5, 2025" or "Jan 5, 2025"
    m = re.match(r'([A-Za-z]+)\s*\d{1,2},\s*\d{4}', raw)
    if m:
        normalized = re.sub(r'\s+', ' ', raw)  # collapse irregular OCR spacing
        for fmt in ("%B %d, %Y", "%b %d, %Y"):
            try:
                return datetime.strptime(normalized, fmt).strftime("%Y-%m-%d")
            except Exception:
                pass
return raw
def cap_words(name):
return None if not name else ' '.join(w.capitalize() for w in name.split())
def normalize_name_from_parts(last, first_block):
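    """
    Compose "given name(s) + last name" from the separately OCR'd name parts,
    keeping at most two given names. Illustrative example (made-up names):
    normalize_name_from_parts("DELA CRUZ", "JUAN MIGUEL CARLOS")
    returns "Juan Miguel Dela Cruz".
    """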
last = (last or '').strip()
tokens = [t for t in (first_block or '').strip().split(' ') if t]
given_kept = tokens[:2] # keep up to two given names
composed = ' '.join(given_kept + [last]).strip()
return cap_words(composed) if composed else None
def normalize_full_name_from_three(first, middle, last):
# keep first + optional second from "first" block; ignore middle completely
tokens = [t for t in (first or '').strip().split(' ') if t]
given_kept = tokens[:2]
composed = ' '.join(given_kept + [last or '']).strip()
return cap_words(composed) if composed else None
def take_within(lines, i, k=5):
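    """
    Return the non-empty lines among the next k lines after index i. This is
    the small lookahead window used to find the value that follows a label.
    """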
out = []
for j in range(1, k+1):
if i+j < len(lines):
t = str(lines[i+j]).strip()
if t:
out.append(t)
return out
def is_numeric_id(t):
return bool(re.match(r'^\d{5,}$', str(t).replace(' ', '')))
def is_crn(t):
# UMID CRN commonly 12 digits
return bool(re.match(r'^\d{12}$', t.replace(' ', '')))
def is_date(t):
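    # Accepts yyyy-mm-dd / yyyy/mm/dd, mm/dd/yyyy, and "Month D, YYYY" forms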
t1 = t.replace(' ', '').replace('\\','/').replace('.','/')
return bool(re.match(r'^\d{4}[-/]\d{2}[-/]\d{2}$', t1)) or bool(re.match(r'^\d{2}/\d{2}/\d{4}$', t)) or bool(re.match(r'^[A-Za-z]+\s*\d{1,2},\s*\d{4}$', t))
def extract_prc_info(lines):
"""
Extract PRC license information from OCR text lines.
Args:
lines (list): List of text lines from OCR processing
Returns:
dict: Extracted PRC information with keys: crn, registration_number, profession, etc.
Why this approach:
- PRC licenses have complex layouts with multiple fields
- Need to handle various license formats (traditional and UMID-style)
- Extracts profession-specific information
- Handles both traditional PRC licenses and UMID-style PRC cards
- Uses lookahead pattern matching for field extraction
"""
dprint("Lines to extract", lines)
# Initialize variables for extracted information
crn = None
full_name = None
birth_date = None
gsis_number = None
sss_number = None
registration_number = None
registration_date = None
valid_until = None
profession = None
# Collect name parts separately for composition
last_name_txt = None
first_name_txt = None
L = [str(x or '').strip() for x in lines]
i = 0
while i < len(L):
line = L[i]
low = line.lower()
dprint("Line", {"i": i, "text": line})
# Extract CRN (UMID format) - 12 digits
if crn is None and is_crn(line):
crn = line.replace(' ', '')
dprint("Found CRN", crn)
# Extract Last Name using lookahead pattern
if 'last name' in low:
ahead = take_within(L, i, 3)
for t in ahead:
tl = t.lower()
if not any(k in tl for k in ['first', 'middle', 'registration', 'valid', 'date', 'no']):
last_name_txt = t
break
# Extract First Name
if 'firstname' in low or 'first name' in low:
if i+1 < len(L):
first_name_txt = L[i+1]
# Extract Date of Birth
if ('date of birth' in low) or ('birth' in low and 'date' in low):
ahead = take_within(L, i, 4)
for t in ahead:
if is_date(t):
birth_date = format_date(t)
break
# Extract Registration Number - handles split labels
if low == 'registration' and i+1 < len(L) and L[i+1].lower() in ('no', 'no.', 'number'):
ahead = take_within(L, i+1, 4)
for t in ahead:
if is_numeric_id(t):
registration_number = t.replace(' ', '')
break
# Also handle fused label forms
if ('registration no' in low) or ('registration number' in low):
ahead = take_within(L, i, 4)
for t in ahead:
if is_numeric_id(t):
registration_number = t.replace(' ', '')
break
# Extract Registration Date
if low == 'registration' and i+1 < len(L) and L[i+1].lower() == 'date':
ahead = take_within(L, i+1, 4)
for t in ahead:
if is_date(t):
registration_date = format_date(t)
break
if 'registration date' in low:
ahead = take_within(L, i, 3)
for t in ahead:
if is_date(t):
registration_date = format_date(t)
break
# Extract Valid Until Date
if 'valid until' in low or 'validity' in low:
ahead = take_within(L, i, 3)
for t in ahead:
if is_date(t):
valid_until = format_date(t)
break
        # Extract profession from lines containing known profession keywords
if any(k in low for k in ['occupational','technician','engineer','teacher','nurse']):
if len(line.split()) >= 2:
profession = cap_words(line)
dprint("Found profession", profession)
# Extract SSS Number
if sss_number is None and ('sss' in low or 'social security' in low):
ahead = take_within(L, i, 3)
for t in ahead:
if is_numeric_id(t):
sss_number = t.replace(' ', '')
dprint("Found sss_number", sss_number)
break
# Extract GSIS Number
if gsis_number is None and ('gsis' in low):
ahead = take_within(L, i, 3)
for t in ahead:
if is_numeric_id(t):
gsis_number = t.replace(' ', '')
dprint("Found gsis_number", gsis_number)
break
i += 1
# Compose full name from parts
if full_name is None:
full_name = normalize_name_from_parts(last_name_txt, first_name_txt)
# Return structured result
result = {
"id_type": "PRC ID",
"crn": crn,
"id_number": registration_number or crn, # Frontend expects id_number
"registration_number": registration_number,
"registration_date": registration_date,
"valid_until": valid_until,
"full_name": full_name,
"birth_date": birth_date,
"sss_number": sss_number,
"gsis_number": gsis_number,
"profession": profession
}
dprint("Final result", result)
return result
def extract_ocr_lines(image_path):
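    """
    Run PaddleOCR on the image at image_path and pass the recognized text
    lines to extract_prc_info(). Returns a minimal PRC payload with null
    fields when no text is recognized.
    """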
os.makedirs("output", exist_ok=True)
dprint("Initializing PaddleOCR")
with redirect_stdout(sys.stderr), redirect_stderr(sys.stderr):
ocr = PaddleOCR(
use_doc_orientation_classify=False,
use_doc_unwarping=False,
use_textline_orientation=False,
lang='en'
)
dprint("OCR initialized")
dprint("Running OCR predict", image_path)
results = ocr.predict(image_path)
dprint("OCR predict done, results_count", len(results))
    # Process OCR results. Newer PaddleOCR (3.x) pipelines return result
    # objects that expose the recognized strings under "rec_texts"; older
    # versions return (box, (text, score)) tuples. Handle both.
    all_text = []
    try:
        for res in (results or []):
            rec_texts = getattr(res, 'rec_texts', None)
            if rec_texts is None and isinstance(res, dict):
                rec_texts = res.get('rec_texts')
            if rec_texts:
                all_text.extend(str(t) for t in rec_texts)
                continue
            items = res if isinstance(res, list) else [res]
            for item in items:
                if isinstance(item, (list, tuple)) and len(item) >= 2:
                    meta = item[1]
                    if isinstance(meta, (list, tuple)) and len(meta) >= 1:
                        all_text.append(str(meta[0]))
    except Exception as e:
        dprint("Error processing OCR results", str(e))
    dprint("All direct texts", all_text)
return extract_prc_info(all_text) if all_text else {
"id_type": "PRC ID",
"crn": None,
"full_name": None,
"birth_date": None
}
if len(sys.argv) < 2:
sys.stdout = original_stdout
print(json.dumps({"error": "No image URL provided"}))
sys.exit(1)
image_url = sys.argv[1]
dprint("Processing image URL", image_url)
try:
image_path = download_image(image_url)
dprint("Image downloaded to", image_path)
ocr_results = extract_ocr_lines(image_path)
dprint("OCR results ready")
# Restore stdout and print only the JSON response
sys.stdout = original_stdout
sys.stdout.write(json.dumps({"success": True, "ocr_results": ocr_results}))
sys.stdout.flush()
except Exception as e:
dprint("Exception", str(e))
# Restore stdout for error JSON
sys.stdout = original_stdout
sys.stdout.write(json.dumps({"error": str(e)}))
sys.stdout.flush()
sys.exit(1)
finally:
# Clean up
try:
clean_cache()
    except Exception:
pass