#!/usr/bin/env python3
"""
Philippine PRC (Professional Regulation Commission) License Information
Extraction Script

Purpose:
    Extracts structured information from PRC license images using OCR.
    Handles various PRC license formats including UMID-style cards.

Why this script exists:
    - PRC licenses have complex layouts with multiple information fields
    - Need to extract profession-specific information
    - Handles both traditional PRC licenses and UMID-style PRC cards
    - Required for professional verification workflows

Key Features:
    - Extracts CRN (Common Reference Number) - 12-digit format
    - Processes registration numbers and dates
    - Extracts profession information
    - Handles GSIS/SSS number extraction
    - Supports validity date tracking

Dependencies:
    - PaddleOCR: High-accuracy OCR engine (https://github.com/PaddlePaddle/PaddleOCR)
    - Pillow (PIL): Image processing (https://pillow.readthedocs.io/)
    - requests: HTTP library (https://docs.python-requests.org/)

Usage:
    python extract_prc.py "https://example.com/prc_license.jpg"

Output:
    JSON with extracted information: crn, registration_number, profession,
    valid_until, etc.
"""
import sys
import json
import os
import glob
import re
import requests
from PIL import Image
from io import BytesIO
from datetime import datetime
from contextlib import redirect_stdout, redirect_stderr

# Immediately redirect all output to stderr except for our final JSON.
# The caller parses this script's stdout as JSON, and PaddleOCR prints
# progress chatter to stdout, so stdout must stay clean until the end.
original_stdout = sys.stdout
sys.stdout = sys.stderr

# Suppress PaddleOCR logging and force headless operation. These must be
# set BEFORE importing paddleocr, which reads them at import time.
os.environ['PADDLEOCR_LOG_LEVEL'] = 'ERROR'
os.environ['QT_QPA_PLATFORM'] = 'offscreen'
os.environ['DISPLAY'] = ':99'

# Import PaddleOCR after setting environment variables
from paddleocr import PaddleOCR


def dprint(msg, obj=None):
    """Print a debug message (optionally with an object) to stderr.

    Args:
        msg (str): Debug message.
        obj: Optional object appended after the message.

    Never raises: debug logging must not break the extraction flow.
    """
    try:
        print(f"DEBUG: {msg}" + (f": {obj}" if obj is not None else ""), file=sys.stderr)
    except Exception:
        pass


def clean_cache():
    """Remove temporary image/OCR artifacts left over from previous runs."""
    cache_files = [
        'temp_image.jpg',
        'temp_image_ocr_res_img.jpg',
        'temp_image_preprocessed_img.jpg',
        'temp_image_res.json',
    ]
    for f in cache_files:
        if os.path.exists(f):
            os.remove(f)
            dprint("Removed cache file", f)
    if os.path.exists("output"):
        import shutil
        shutil.rmtree("output")
        dprint("Removed output directory")


def download_image(url, output_path='temp_image.jpg'):
    """Download an image from *url* and save it locally as an RGB JPEG.

    Args:
        url (str): HTTP(S) URL of the license image.
        output_path (str): Local path for the saved JPEG.

    Returns:
        str: The path the image was saved to.

    Raises:
        requests.HTTPError: On non-2xx responses (via raise_for_status).
    """
    dprint("Starting download", url)
    clean_cache()
    # Browser-like UA: some hosts reject default requests user agents.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    r = requests.get(url, headers=headers, timeout=30)
    dprint("HTTP status", r.status_code)
    r.raise_for_status()
    img = Image.open(BytesIO(r.content))
    # Fix: palette ('P') and greyscale+alpha ('LA') images can also carry
    # transparency; route them through RGBA so alpha is composited onto
    # white instead of being discarded by a plain convert('RGB').
    if img.mode in ('P', 'LA'):
        img = img.convert('RGBA')
    if img.mode == 'RGBA':
        # JPEG has no alpha channel: flatten onto a white background.
        bg = Image.new('RGB', img.size, (255, 255, 255))
        bg.paste(img, mask=img.split()[-1])
        img = bg
    elif img.mode != 'RGB':
        img = img.convert('RGB')
    img.save(output_path, 'JPEG', quality=95)
    dprint("Saved image", output_path)
    return output_path


def format_date(s):
    """Normalize a date string to ISO 'YYYY-MM-DD' where possible.

    Accepts 'YYYY-MM-DD' / 'YYYY/MM/DD', 'M/D/YYYY' (1- or 2-digit
    month/day), and month-name forms such as 'January 5, 2024'.
    Returns the input unchanged if no known format matches; None for
    empty input.
    """
    if not s:
        return None
    raw = s.strip()
    # Normalize common OCR artifacts: stray spaces, backslashes, dots.
    t = raw.replace(' ', '').replace('\\', '/').replace('.', '/')
    if re.match(r'^\d{4}[-/]\d{2}[-/]\d{2}$', t):
        return t.replace('/', '-')
    # Accept mm/dd/yyyy style (fix: also allow single-digit month/day)
    m = re.match(r'^(\d{1,2})/(\d{1,2})/(\d{4})$', raw)
    if m:
        mo, d, y = m.groups()
        return f"{y}-{int(mo):02d}-{int(d):02d}"
    # Month name variants, e.g. "January 5, 2024" or "Jan 5, 2024".
    if re.match(r'([A-Za-z]+)\s*\d{1,2},\s*\d{4}', raw):
        # Fix: the original replace(' ', ' ') was a no-op; collapse runs
        # of whitespace so strptime sees single-space separators.
        collapsed = ' '.join(raw.split())
        for fmt in ("%B %d, %Y", "%b %d, %Y"):
            try:
                return datetime.strptime(collapsed, fmt).strftime("%Y-%m-%d")
            except ValueError:
                pass
    return raw


def cap_words(name):
    """Capitalize each whitespace-separated word; None for empty input."""
    return None if not name else ' '.join(w.capitalize() for w in name.split())


def normalize_name_from_parts(last, first_block):
    """Compose 'First [Second] Last' from separate OCR name fields.

    Keeps at most two given names from *first_block*; returns None when
    both parts are empty.
    """
    last = (last or '').strip()
    tokens = [t for t in (first_block or '').strip().split(' ') if t]
    given_kept = tokens[:2]  # keep up to two given names
    composed = ' '.join(given_kept + [last]).strip()
    return cap_words(composed) if composed else None


def normalize_full_name_from_three(first, middle, last):
    """Compose a full name from first/middle/last fields.

    The middle name is intentionally ignored; at most two given names
    from the *first* block are kept.
    """
    tokens = [t for t in (first or '').strip().split(' ') if t]
    given_kept = tokens[:2]
    composed = ' '.join(given_kept + [last or '']).strip()
    return cap_words(composed) if composed else None


def take_within(lines, i, k=5):
    """Return up to *k* non-empty lines following index *i* (lookahead)."""
    out = []
    for j in range(1, k + 1):
        if i + j < len(lines):
            t = str(lines[i + j]).strip()
            if t:
                out.append(t)
    return out


def is_numeric_id(t):
    """True if *t* is 5+ digits once spaces are removed (generic ID)."""
    return bool(re.match(r'^\d{5,}$', str(t).replace(' ', '')))


def is_crn(t):
    """True for a UMID CRN: exactly 12 digits, spaces ignored."""
    return bool(re.match(r'^\d{12}$', t.replace(' ', '')))


def is_date(t):
    """True if *t* looks like any date format format_date() understands."""
    t1 = t.replace(' ', '').replace('\\', '/').replace('.', '/')
    return (bool(re.match(r'^\d{4}[-/]\d{2}[-/]\d{2}$', t1))
            or bool(re.match(r'^\d{2}/\d{2}/\d{4}$', t))
            or bool(re.match(r'^[A-Za-z]+\s*\d{1,2},\s*\d{4}$', t)))


def extract_prc_info(lines):
    """Extract PRC license information from OCR text lines.

    Args:
        lines (list): Text lines from OCR processing, in reading order.

    Returns:
        dict: Extracted PRC fields (crn, registration_number, profession,
        valid_until, full_name, birth_date, sss_number, gsis_number, ...).

    Why this approach:
        PRC licenses have complex layouts and come in both traditional
        and UMID-style formats; values usually appear on the lines that
        follow their field label, so each label triggers a short
        lookahead scan for a value of the expected shape.
    """
    dprint("Lines to extract", lines)
    crn = None
    full_name = None
    birth_date = None
    gsis_number = None
    sss_number = None
    registration_number = None
    registration_date = None
    valid_until = None
    profession = None
    # Name parts are collected separately and composed at the end.
    last_name_txt = None
    first_name_txt = None

    L = [str(x or '').strip() for x in lines]
    i = 0
    while i < len(L):
        line = L[i]
        low = line.lower()
        dprint("Line", {"i": i, "text": line})

        # CRN (UMID format) - 12 digits anywhere in the card text.
        if crn is None and is_crn(line):
            crn = line.replace(' ', '')
            dprint("Found CRN", crn)

        # Last name: first following line that is not another field label.
        if 'last name' in low:
            for t in take_within(L, i, 3):
                tl = t.lower()
                # Fix: the original rejected any candidate *containing*
                # the substring 'no', which wrongly skipped real surnames
                # like "Bueno" or "Mariano". Only exact label tokens and
                # other field-label words are excluded now.
                if tl in ('no', 'no.', 'number'):
                    continue
                if any(k in tl for k in ['first', 'middle', 'registration', 'valid', 'date']):
                    continue
                last_name_txt = t
                break

        # First name: value is expected on the immediately following line.
        if 'firstname' in low or 'first name' in low:
            if i + 1 < len(L):
                first_name_txt = L[i + 1]

        # Date of birth.
        if ('date of birth' in low) or ('birth' in low and 'date' in low):
            for t in take_within(L, i, 4):
                if is_date(t):
                    birth_date = format_date(t)
                    break

        # Registration number - handles split labels ("REGISTRATION" / "NO").
        if low == 'registration' and i + 1 < len(L) and L[i + 1].lower() in ('no', 'no.', 'number'):
            for t in take_within(L, i + 1, 4):
                if is_numeric_id(t):
                    registration_number = t.replace(' ', '')
                    break
        # Also handle fused label forms.
        if ('registration no' in low) or ('registration number' in low):
            for t in take_within(L, i, 4):
                if is_numeric_id(t):
                    registration_number = t.replace(' ', '')
                    break

        # Registration date - split and fused label forms.
        if low == 'registration' and i + 1 < len(L) and L[i + 1].lower() == 'date':
            for t in take_within(L, i + 1, 4):
                if is_date(t):
                    registration_date = format_date(t)
                    break
        if 'registration date' in low:
            for t in take_within(L, i, 3):
                if is_date(t):
                    registration_date = format_date(t)
                    break

        # Validity date.
        if 'valid until' in low or 'validity' in low:
            for t in take_within(L, i, 3):
                if is_date(t):
                    valid_until = format_date(t)
                    break

        # Profession: detected from keyword lines with at least two words
        # (avoids matching a lone keyword fragment).
        if any(k in low for k in ['occupational', 'technician', 'engineer', 'teacher', 'nurse']):
            if len(line.split()) >= 2:
                profession = cap_words(line)
                dprint("Found profession", profession)

        # SSS number.
        if sss_number is None and ('sss' in low or 'social security' in low):
            for t in take_within(L, i, 3):
                if is_numeric_id(t):
                    sss_number = t.replace(' ', '')
                    dprint("Found sss_number", sss_number)
                    break

        # GSIS number.
        if gsis_number is None and ('gsis' in low):
            for t in take_within(L, i, 3):
                if is_numeric_id(t):
                    gsis_number = t.replace(' ', '')
                    dprint("Found gsis_number", gsis_number)
                    break

        i += 1

    # Compose full name from the collected parts.
    if full_name is None:
        full_name = normalize_name_from_parts(last_name_txt, first_name_txt)

    result = {
        "id_type": "PRC ID",
        "crn": crn,
        "id_number": registration_number or crn,  # Frontend expects id_number
        "registration_number": registration_number,
        "registration_date": registration_date,
        "valid_until": valid_until,
        "full_name": full_name,
        "birth_date": birth_date,
        "sss_number": sss_number,
        "gsis_number": gsis_number,
        "profession": profession,
    }
    dprint("Final result", result)
    return result


def extract_ocr_lines(image_path):
    """Run PaddleOCR on *image_path* and extract PRC info from the text.

    Returns the dict from extract_prc_info(), or a minimal null-filled
    dict when OCR produced no usable text.
    """
    os.makedirs("output", exist_ok=True)
    dprint("Initializing PaddleOCR")
    # PaddleOCR prints to stdout during init; keep stdout clean for JSON.
    with redirect_stdout(sys.stderr), redirect_stderr(sys.stderr):
        ocr = PaddleOCR(
            use_doc_orientation_classify=False,
            use_doc_unwarping=False,
            use_textline_orientation=False,
            lang='en'
        )
    dprint("OCR initialized")
    dprint("Running OCR predict", image_path)
    results = ocr.predict(image_path)
    dprint("OCR predict done, results_count", len(results))

    all_text = []
    try:
        # Fix: predict() (PaddleOCR 3.x) returns result objects exposing
        # recognized strings under 'rec_texts'; the original only parsed
        # the legacy ocr() shape [[box, (text, score)], ...] and would
        # silently drop all text from the new API. Handle both shapes.
        for res in results:
            rec_texts = None
            try:
                rec_texts = res['rec_texts']
            except Exception:
                rec_texts = None
            if rec_texts is not None:
                all_text.extend(str(t) for t in rec_texts)
                continue
            # Legacy nested-list format.
            items = res if isinstance(res, list) else [res]
            for item in items:
                if isinstance(item, (list, tuple)) and len(item) >= 2:
                    meta = item[1]
                    if isinstance(meta, (list, tuple)) and len(meta) >= 1:
                        all_text.append(str(meta[0]))
    except Exception as e:
        dprint("Error processing OCR results", str(e))
    dprint("All direct texts", all_text)

    return extract_prc_info(all_text) if all_text else {
        "id_type": "PRC ID",
        "crn": None,
        "full_name": None,
        "birth_date": None
    }


if len(sys.argv) < 2:
    sys.stdout = original_stdout
    print(json.dumps({"error": "No image URL provided"}))
    sys.exit(1)

image_url = sys.argv[1]
dprint("Processing image URL", image_url)
try:
    image_path = download_image(image_url)
    dprint("Image downloaded to", image_path)
    ocr_results = extract_ocr_lines(image_path)
    dprint("OCR results ready")
    # Restore stdout and print only the JSON response.
    sys.stdout = original_stdout
    sys.stdout.write(json.dumps({"success": True, "ocr_results": ocr_results}))
    sys.stdout.flush()
except Exception as e:
    dprint("Exception", str(e))
    # Restore stdout for error JSON.
    sys.stdout = original_stdout
    sys.stdout.write(json.dumps({"error": str(e)}))
    sys.stdout.flush()
    sys.exit(1)
finally:
    # Best-effort cleanup; never mask the primary result/error.
    try:
        clean_cache()
    except Exception:
        pass