Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """ | |
| Philippine PRC (Professional Regulation Commission) License Information Extraction Script | |
| Purpose: | |
| Extracts structured information from PRC license images using OCR. | |
| Handles various PRC license formats including UMID-style cards. | |
| Why this script exists: | |
| - PRC licenses have complex layouts with multiple information fields | |
| - Need to extract profession-specific information | |
| - Handles both traditional PRC licenses and UMID-style PRC cards | |
| - Required for professional verification workflows | |
| Key Features: | |
| - Extracts CRN (Common Reference Number) - 12-digit format | |
| - Processes registration numbers and dates | |
| - Extracts profession information | |
| - Handles GSIS/SSS number extraction | |
| - Supports validity date tracking | |
| Dependencies: | |
| - PaddleOCR: High-accuracy OCR engine (https://github.com/PaddlePaddle/PaddleOCR) | |
| - Pillow (PIL): Image processing (https://pillow.readthedocs.io/) | |
| - requests: HTTP library (https://docs.python-requests.org/) | |
| Usage: | |
| python extract_prc.py "https://example.com/prc_license.jpg" | |
| Output: | |
| JSON with extracted information: crn, registration_number, profession, valid_until, etc. | |
| """ | |
import sys, json, os, glob, re, requests
from PIL import Image
from io import BytesIO
from datetime import datetime
from contextlib import redirect_stdout, redirect_stderr

# Immediately redirect all output to stderr except for our final JSON:
# third-party libraries (notably PaddleOCR) print to stdout, which would
# corrupt the single JSON payload this script must emit on stdout.
original_stdout = sys.stdout
sys.stdout = sys.stderr

# Suppress all PaddleOCR output and force headless operation.
os.environ['PADDLEOCR_LOG_LEVEL'] = 'ERROR'
os.environ['QT_QPA_PLATFORM'] = 'offscreen'  # headless Qt backend (no GUI)
os.environ['DISPLAY'] = ':99'  # dummy X display for headless servers

# Import PaddleOCR after setting environment variables so they take effect
# during the library's own import-time initialization.
from paddleocr import PaddleOCR
def dprint(msg, obj=None):
    """
    Emit a debug line to stderr, never raising.

    Args:
        msg (str): Debug message.
        obj (any): Optional payload appended after the message.

    Why this approach:
        - Centralized debug logging that cannot break the main flow
        - stdout is reserved for the final JSON, so debug goes to stderr
    """
    try:
        suffix = "" if obj is None else f": {obj}"
        sys.stderr.write(f"DEBUG: {msg}{suffix}\n")
    except Exception:
        # Debug output must never take the script down.
        pass
def clean_cache():
    """Remove temp images/JSON and the OCR output directory from prior runs."""
    leftovers = (
        'temp_image.jpg',
        'temp_image_ocr_res_img.jpg',
        'temp_image_preprocessed_img.jpg',
        'temp_image_res.json',
    )
    for name in leftovers:
        if os.path.exists(name):
            os.remove(name)
            dprint("Removed cache file", name)
    if os.path.exists("output"):
        import shutil
        shutil.rmtree("output")
        dprint("Removed output directory")
def download_image(url, output_path='temp_image.jpg'):
    """
    Fetch an image over HTTP, normalize it to RGB, and save it as JPEG.

    Args:
        url (str): Image URL to download.
        output_path (str): Local path for the saved JPEG.

    Returns:
        str: The output_path that was written.

    Raises:
        requests.HTTPError: On non-2xx responses (via raise_for_status).
    """
    dprint("Starting download", url)
    clean_cache()
    # Browser-like UA: some hosts reject default library user agents.
    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    response = requests.get(url, headers=browser_headers, timeout=30)
    dprint("HTTP status", response.status_code)
    response.raise_for_status()
    picture = Image.open(BytesIO(response.content))
    if picture.mode == 'RGBA':
        # JPEG has no alpha channel: flatten transparency onto white.
        canvas = Image.new('RGB', picture.size, (255, 255, 255))
        canvas.paste(picture, mask=picture.split()[-1])
        picture = canvas
    elif picture.mode != 'RGB':
        picture = picture.convert('RGB')
    picture.save(output_path, 'JPEG', quality=95)
    dprint("Saved image", output_path)
    return output_path
def format_date(s):
    """
    Normalize an OCR-extracted date string to ISO 'YYYY-MM-DD'.

    Accepted forms:
        - yyyy-mm-dd / yyyy/mm/dd (also with OCR'd '\\' or '.' separators)
        - mm/dd/yyyy (also with '\\' or '.' separators)
        - English month names: "January 5, 2021" / "Jan 5, 2021"

    Args:
        s (str): Raw date text from OCR; may contain stray whitespace.

    Returns:
        str | None: ISO date when a pattern matches, None for falsy input,
        otherwise the whitespace-normalized input unchanged.
    """
    if not s:
        return None
    # Collapse runs of whitespace — OCR frequently doubles spaces.
    # (The original `raw.replace(' ', ' ')` was a no-op; this is the intent.)
    raw = ' '.join(s.split())
    # Normalize separators: OCR often reads '/' as '\' or '.'.
    t = raw.replace(' ', '').replace('\\', '/').replace('.', '/')
    if re.match(r'^\d{4}[-/]\d{2}[-/]\d{2}$', t):
        return t.replace('/', '-')
    # Accept mm/dd/yyyy style; match against the normalized form so that
    # '\\' and '.' separators are handled too (the original tested `raw`).
    if re.match(r'^\d{2}/\d{2}/\d{4}$', t):
        m, d, y = t.split('/')
        return f"{y}-{int(m):02d}-{int(d):02d}"
    # Month-name variants: try full month name first, then abbreviation.
    if re.match(r'([A-Za-z]+)\s*\d{1,2},\s*\d{4}', raw):
        for fmt in ("%B %d, %Y", "%b %d, %Y"):
            try:
                return datetime.strptime(raw, fmt).strftime("%Y-%m-%d")
            except ValueError:
                continue
    # Unrecognized: hand back the cleaned text for the caller to store as-is.
    return raw
def cap_words(name):
    """Capitalize each whitespace-separated word; falsy input yields None."""
    if not name:
        return None
    return ' '.join(word.capitalize() for word in name.split())
def normalize_name_from_parts(last, first_block):
    """
    Compose "Given [Given2] Last" from separate OCR name fields.

    At most two given-name tokens are kept, which drops OCR noise that often
    trails the first-name block. Returns None when nothing remains.
    """
    surname = (last or '').strip()
    given = [tok for tok in (first_block or '').strip().split(' ') if tok]
    combined = ' '.join(given[:2] + [surname]).strip()
    if not combined:
        return None
    return ' '.join(w.capitalize() for w in combined.split())
def normalize_full_name_from_three(first, middle, last):
    """
    Build "First [Second] Last" from three OCR name fields.

    The middle name is ignored by design; only up to two tokens from the
    first-name block are kept. Returns None when nothing remains.
    """
    given = [tok for tok in (first or '').strip().split(' ') if tok]
    joined = ' '.join(given[:2] + [last or '']).strip()
    if not joined:
        return None
    return ' '.join(w.capitalize() for w in joined.split())
def take_within(lines, i, k=5):
    """Collect up to k non-empty stripped lines that follow index i."""
    collected = []
    # Look at indices i+1 .. i+k, clamped to the end of the list.
    last = min(i + k, len(lines) - 1)
    for idx in range(i + 1, last + 1):
        text = str(lines[idx]).strip()
        if text:
            collected.append(text)
    return collected
def is_numeric_id(t):
    """True when the token (spaces removed) is 5 or more digits only."""
    compact = str(t).replace(' ', '')
    return re.match(r'^\d{5,}$', compact) is not None
def is_crn(t):
    """True when the token is exactly 12 digits (UMID CRN), ignoring spaces."""
    digits = t.replace(' ', '')
    return re.match(r'^\d{12}$', digits) is not None
def is_date(t):
    """Heuristic: does the token look like any date format format_date accepts?"""
    # Normalize OCR separator noise before testing the ISO-like shape.
    norm = t.replace(' ', '').replace('\\', '/').replace('.', '/')
    if re.match(r'^\d{4}[-/]\d{2}[-/]\d{2}$', norm):
        return True
    if re.match(r'^\d{2}/\d{2}/\d{4}$', t):
        return True
    return re.match(r'^[A-Za-z]+\s*\d{1,2},\s*\d{4}$', t) is not None
def extract_prc_info(lines):
    """
    Parse OCR text lines from a PRC license into structured fields.

    Args:
        lines (list): OCR text lines in reading order; items are coerced to
            stripped strings before matching.

    Returns:
        dict: Keys id_type, crn, id_number, registration_number,
        registration_date, valid_until, full_name, birth_date, sss_number,
        gsis_number, profession. Fields not found are None.

    Why this approach:
        - PRC licenses have complex layouts with multiple fields
        - Need to handle various license formats (traditional and UMID-style)
        - Labels and their values land on separate OCR lines, so each label
          match looks ahead a few lines (take_within) for a plausible value
        - Most fields are overwritten by later matches; crn, sss_number and
          gsis_number keep the FIRST match only (guarded by `is None`)
    """
    dprint("Lines to extract", lines)
    # Initialize variables for extracted information
    crn = None
    full_name = None
    birth_date = None
    gsis_number = None
    sss_number = None
    registration_number = None
    registration_date = None
    valid_until = None
    profession = None
    # Collect name parts separately for composition
    last_name_txt = None
    first_name_txt = None
    # Normalize once up front: None -> '' and strip surrounding whitespace.
    L = [str(x or '').strip() for x in lines]
    i = 0
    while i < len(L):
        line = L[i]
        low = line.lower()
        dprint("Line", {"i": i, "text": line})
        # Extract CRN (UMID format) - a bare 12-digit line; first hit wins.
        if crn is None and is_crn(line):
            crn = line.replace(' ', '')
            dprint("Found CRN", crn)
        # Extract Last Name: first lookahead line that doesn't look like
        # another label. NOTE(review): the 'no' keyword also rejects any
        # candidate merely containing "no" (e.g. a surname like "Nonato").
        if 'last name' in low:
            ahead = take_within(L, i, 3)
            for t in ahead:
                tl = t.lower()
                if not any(k in tl for k in ['first', 'middle', 'registration', 'valid', 'date', 'no']):
                    last_name_txt = t
                    break
        # Extract First Name: taken unconditionally from the very next line.
        if 'firstname' in low or 'first name' in low:
            if i+1 < len(L):
                first_name_txt = L[i+1]
        # Extract Date of Birth: first date-looking value within 4 lines.
        if ('date of birth' in low) or ('birth' in low and 'date' in low):
            ahead = take_within(L, i, 4)
            for t in ahead:
                if is_date(t):
                    birth_date = format_date(t)
                    break
        # Extract Registration Number - handles the label split across two
        # OCR lines ("registration" then "no"/"no."/"number").
        if low == 'registration' and i+1 < len(L) and L[i+1].lower() in ('no', 'no.', 'number'):
            ahead = take_within(L, i+1, 4)
            for t in ahead:
                if is_numeric_id(t):
                    registration_number = t.replace(' ', '')
                    break
        # Also handle fused label forms on a single line
        if ('registration no' in low) or ('registration number' in low):
            ahead = take_within(L, i, 4)
            for t in ahead:
                if is_numeric_id(t):
                    registration_number = t.replace(' ', '')
                    break
        # Extract Registration Date (label split across two lines)
        if low == 'registration' and i+1 < len(L) and L[i+1].lower() == 'date':
            ahead = take_within(L, i+1, 4)
            for t in ahead:
                if is_date(t):
                    registration_date = format_date(t)
                    break
        # Fused "registration date" label form
        if 'registration date' in low:
            ahead = take_within(L, i, 3)
            for t in ahead:
                if is_date(t):
                    registration_date = format_date(t)
                    break
        # Extract Valid Until Date
        if 'valid until' in low or 'validity' in low:
            ahead = take_within(L, i, 3)
            for t in ahead:
                if is_date(t):
                    valid_until = format_date(t)
                    break
        # Extract Profession: keyword match; require at least two words so a
        # lone keyword fragment isn't mistaken for the profession line.
        if any(k in low for k in ['occupational','technician','engineer','teacher','nurse']):
            if len(line.split()) >= 2:
                profession = cap_words(line)
                dprint("Found profession", profession)
        # Extract SSS Number (first match only)
        if sss_number is None and ('sss' in low or 'social security' in low):
            ahead = take_within(L, i, 3)
            for t in ahead:
                if is_numeric_id(t):
                    sss_number = t.replace(' ', '')
                    dprint("Found sss_number", sss_number)
                    break
        # Extract GSIS Number (first match only)
        if gsis_number is None and ('gsis' in low):
            ahead = take_within(L, i, 3)
            for t in ahead:
                if is_numeric_id(t):
                    gsis_number = t.replace(' ', '')
                    dprint("Found gsis_number", gsis_number)
                    break
        i += 1
    # Compose full name from parts
    if full_name is None:
        full_name = normalize_name_from_parts(last_name_txt, first_name_txt)
    # Return structured result
    result = {
        "id_type": "PRC ID",
        "crn": crn,
        "id_number": registration_number or crn,  # Frontend expects id_number
        "registration_number": registration_number,
        "registration_date": registration_date,
        "valid_until": valid_until,
        "full_name": full_name,
        "birth_date": birth_date,
        "sss_number": sss_number,
        "gsis_number": gsis_number,
        "profession": profession
    }
    dprint("Final result", result)
    return result
def extract_ocr_lines(image_path):
    """
    Run PaddleOCR on a local image and parse the recognized text lines.

    Args:
        image_path (str): Path to a local image file.

    Returns:
        dict: extract_prc_info() result, or a minimal all-None dict when no
        text could be recovered from the OCR output.
    """
    os.makedirs("output", exist_ok=True)
    dprint("Initializing PaddleOCR")
    # PaddleOCR prints progress to stdout; shunt everything to stderr so the
    # process's stdout stays reserved for the final JSON payload.
    with redirect_stdout(sys.stderr), redirect_stderr(sys.stderr):
        ocr = PaddleOCR(
            use_doc_orientation_classify=False,
            use_doc_unwarping=False,
            use_textline_orientation=False,
            lang='en'
        )
        dprint("OCR initialized")
        dprint("Running OCR predict", image_path)
        results = ocr.predict(image_path)
        dprint("OCR predict done, results_count", len(results))
    # Process OCR results directly
    all_text = []
    try:
        # NOTE(review): this parsing assumes the classic result shape
        # [[box, (text, score)], ...]. Newer PaddleOCR predict() pipelines
        # return result objects/dicts (e.g. with a `rec_texts` field), in
        # which case nothing matches and all_text stays empty — confirm
        # against the installed PaddleOCR version.
        lines = results[0] if results and isinstance(results[0], list) else results
        for item in lines:
            if isinstance(item, (list, tuple)) and len(item) >= 2:
                meta = item[1]
                if isinstance(meta, (list, tuple)) and len(meta) >= 1:
                    all_text.append(str(meta[0]))
    except Exception as e:
        # Best-effort: a malformed result set degrades to the empty fallback.
        dprint("Error processing OCR results", str(e))
    dprint("All direct texts", all_text)
    # Fallback keeps the response shape predictable for the caller.
    return extract_prc_info(all_text) if all_text else {
        "id_type": "PRC ID",
        "crn": None,
        "full_name": None,
        "birth_date": None
    }
# ---- Script entry point ----
# Usage: python extract_prc.py <image_url>
# Emits exactly one JSON object on stdout; all diagnostics go to stderr.
if len(sys.argv) < 2:
    sys.stdout = original_stdout
    print(json.dumps({"error": "No image URL provided"}))
    sys.exit(1)

image_url = sys.argv[1]
dprint("Processing image URL", image_url)
try:
    image_path = download_image(image_url)
    dprint("Image downloaded to", image_path)
    ocr_results = extract_ocr_lines(image_path)
    dprint("OCR results ready")
    # Restore stdout and print only the JSON response
    sys.stdout = original_stdout
    sys.stdout.write(json.dumps({"success": True, "ocr_results": ocr_results}))
    sys.stdout.flush()
except Exception as e:
    dprint("Exception", str(e))
    # Restore stdout for error JSON
    sys.stdout = original_stdout
    sys.stdout.write(json.dumps({"error": str(e)}))
    sys.stdout.flush()
    sys.exit(1)
finally:
    # Best-effort cleanup; never let cleanup failures mask the real result.
    # (Was a bare `except:`, which would also swallow KeyboardInterrupt.)
    try:
        clean_cache()
    except Exception:
        pass