Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """ | |
| Philippine Passport Information Extraction Script | |
| Purpose: | |
| Extracts structured information from Philippine passport images using OCR. | |
| Handles complex passport layouts with multiple information fields. | |
| Why this script exists: | |
| - Passports have complex layouts with multiple information fields | |
| - Need to extract international-standard passport information | |
| - Handles bilingual labels (English/Filipino) | |
| - Required for passport verification workflows | |
| Key Features: | |
| - Extracts passport number (format: X0000000A) | |
| - Handles complex name structures (surname, given names, middle name) | |
| - Processes multiple date fields (birth, issue, expiration) | |
| - Extracts nationality and place of birth | |
| - Handles OCR digit correction | |
| Dependencies: | |
| - PaddleOCR: High-accuracy OCR engine (https://github.com/PaddlePaddle/PaddleOCR) | |
| - Pillow (PIL): Image processing (https://pillow.readthedocs.io/) | |
| - requests: HTTP library (https://docs.python-requests.org/) | |
| Usage: | |
| python extract_passport.py "https://example.com/passport.jpg" | |
| Output: | |
| JSON with extracted information: passport_number, full_name, birth_date, valid_until, etc. | |
| """ | |
| import sys, json, os, glob, re, requests | |
| from PIL import Image | |
| from io import BytesIO | |
| from datetime import datetime | |
| from contextlib import redirect_stdout, redirect_stderr | |
# Route any non-JSON prints to stderr by default.
# The real stdout is remembered in _ORIG_STDOUT so the final JSON result can
# still be written to it; until then every print() lands on stderr.
_ORIG_STDOUT = sys.stdout
sys.stdout = sys.stderr
# Environment variables set before PaddleOCR is imported (the import follows
# immediately below, so the order of these statements matters).
# NOTE(review): presumably PADDLEOCR_LOG_LEVEL quiets PaddleOCR logging and
# QT_QPA_PLATFORM / DISPLAY let GUI-backed dependencies load on a headless
# host - confirm against the deployed PaddleOCR / OpenCV versions.
os.environ['PADDLEOCR_LOG_LEVEL'] = 'ERROR'
os.environ['QT_QPA_PLATFORM'] = 'offscreen'
os.environ['DISPLAY'] = ':99'
| # Import PaddleOCR after setting environment variables | |
| from paddleocr import PaddleOCR | |
def dprint(msg, obj=None):
    """
    Write a "DEBUG:"-prefixed diagnostic line to stderr.
    Args:
        msg: Message text placed after the "DEBUG: " prefix
        obj: Optional value appended as ": <obj>" when it is not None
    Why this never raises:
        - Debug output is best-effort; a value that cannot be formatted or a
          broken stderr must not abort the extraction.
    """
    try:
        # Formatting stays inside the try so a failing __str__ is swallowed too.
        if obj is None:
            text = f"DEBUG: {msg}"
        else:
            text = f"DEBUG: {msg}" + f": {obj}"
        print(text, file=sys.stderr)
    except Exception:
        pass
def clean_cache():
    """
    Remove leftovers of a previous run from the current working directory.
    Deletes the known temp_image* files if they exist and the whole "output"
    directory tree if it exists, logging each removal through dprint.
    """
    leftovers = (
        'temp_image.jpg',
        'temp_image_ocr_res_img.jpg',
        'temp_image_preprocessed_img.jpg',
        'temp_image_res.json',
    )
    for leftover in leftovers:
        if not os.path.exists(leftover):
            continue
        os.remove(leftover)
        dprint("Removed cache file", leftover)

    if not os.path.exists("output"):
        return
    import shutil
    shutil.rmtree("output")
    dprint("Removed output directory")
def download_image(url, output_path='temp_image.jpg', timeout=30):
    """
    Download an image and store it locally as an RGB JPEG.
    Args:
        url (str): Address of the image to fetch
        output_path (str): Where the converted JPEG is written
        timeout (float | tuple): Seconds passed to requests.get as its
            timeout; without it a stalled server would block the script
            indefinitely
    Returns:
        str: output_path, once the file has been written
    Raises:
        requests.RequestException: On connection problems, timeouts or a
            non-2xx HTTP status (via raise_for_status)
    Side effects:
        Calls clean_cache() first, so files from an earlier run are removed
        before the new image is saved.
    """
    dprint("Starting download", url)
    clean_cache()
    r = requests.get(url, timeout=timeout)
    dprint("HTTP status", r.status_code)
    r.raise_for_status()
    img = Image.open(BytesIO(r.content))
    if img.mode == 'RGBA':
        # JPEG has no alpha channel: flatten onto white, using the alpha band
        # (the last band of an RGBA image) as the paste mask.
        bg = Image.new('RGB', img.size, (255, 255, 255))
        bg.paste(img, mask=img.split()[-1])
        img = bg
    elif img.mode != 'RGB':
        img = img.convert('RGB')
    img.save(output_path, 'JPEG', quality=95)
    dprint("Saved image", output_path)
    return output_path
def cap_words(s):
    """
    Capitalize every whitespace-separated word and join them with one space.
    Args:
        s (str): Text to re-case; may be None or empty
    Returns:
        str | None: Re-cased text, or None when s is empty/None
    """
    if not s:
        return None
    words = [word.capitalize() for word in s.split()]
    return ' '.join(words)
def normalize_digits(s):
    """
    Replace letters that OCR commonly confuses with digits.
    Args:
        s: Value to clean; it is converted with str() first
    Returns:
        str: Text where O/o became 0, I/l became 1, S became 5 and B became 8
    Why this is needed:
        - OCR often misreads similar-looking characters
        - ID numbers and dates must contain real digits to be matched later
    """
    lookalikes = str.maketrans({
        'O': '0',
        'o': '0',
        'I': '1',
        'l': '1',
        'S': '5',
        'B': '8',
    })
    text = str(s)
    return text.translate(lookalikes)
def normalize_full_name(surname, given_names, middle_name=None):
    """
    Build a display name in "Given [Second] Surname" order.
    Args:
        surname (str): Family name, may be None/empty
        given_names (str): Space-separated given names, may be None/empty
        middle_name (str): Accepted for interface symmetry; it is stripped
            but not included in the result
    Returns:
        str | None: Word-capitalized name (via cap_words), or None when both
        surname and given_names are empty
    """
    if not (surname or given_names):
        return None

    family = surname.strip() if surname else ""
    givens = (given_names.strip() if given_names else "").split()
    # The middle name is validated/stripped like the other parts but is
    # deliberately left out of the composed name.
    middle_name = middle_name.strip() if middle_name else ""

    # At most the first two given names are kept, followed by the surname.
    ordered = givens[:2] + [family]
    return cap_words(' '.join(ordered))
# Month abbreviations as printed on the passport data page (English).
_DATE_MONTHS = {
    "JAN": 1, "FEB": 2, "MAR": 3, "APR": 4, "MAY": 5, "JUN": 6,
    "JUL": 7, "AUG": 8, "SEP": 9, "OCT": 10, "NOV": 11, "DEC": 12,
}
# Letter -> digit fixes for numeric fields (same mapping normalize_digits uses).
_DATE_LETTER_TO_DIGIT = str.maketrans("OoIlSB", "001158")
# Digit -> letter fixes for the month token; within month abbreviations a
# misread "1" can only be an L (JUL), never an I.
_DATE_DIGIT_TO_LETTER = str.maketrans("0158", "OLSB")
# day (1-2 chars) + month (3 chars) + year (4 chars), optional whitespace
# between the parts; digit positions also accept the usual OCR look-alikes.
_DATE_DMY_RE = re.compile(
    r'([0-9OoIlSB]{1,2})\s*([A-Za-z0158]{3})\s*([0-9OoIlSB]{4})'
)


def format_date(s):
    """
    Convert a date as printed/OCR-ed on a passport to ISO "YYYY-MM-DD".
    Args:
        s: Date text such as "16MAR1980", "16 MAR 1980", "2016/06/27" or
            "06/27/2016" (month/day/year); may be None or empty
    Returns:
        str | None: ISO date when the text is recognised; None for empty
        input; otherwise the stripped, digit-normalised text unchanged
    Why the month is handled separately:
        - Applying the letter->digit OCR fixes to the whole string turns
          SEP into 5EP, OCT into 0CT and FEB into FE8, so those dates could
          never be parsed. Only the day and year are digit-normalised here,
          and the month token gets the reverse (digit->letter) repair.
        - Months are looked up in a fixed English table instead of
          strptime("%b"), which depends on the process locale.
    """
    if not s:
        return None
    text = str(s).strip()

    # Day / month-name / year, with or without spaces ("16MAR1980",
    # "16 MAR 1980").
    m = _DATE_DMY_RE.fullmatch(text)
    if m:
        day_txt, month_txt, year_txt = m.groups()
        month = _DATE_MONTHS.get(
            month_txt.translate(_DATE_DIGIT_TO_LETTER).upper()
        )
        if month is not None:
            try:
                parsed = datetime(
                    int(year_txt.translate(_DATE_LETTER_TO_DIGIT)),
                    month,
                    int(day_txt.translate(_DATE_LETTER_TO_DIGIT)),
                )
            except ValueError:
                # Not a real calendar date (e.g. 31 FEB): fall through to the
                # generic handling below.
                pass
            else:
                return f"{parsed.year:04d}-{parsed.month:02d}-{parsed.day:02d}"

    # Purely numeric layouts contain no letters by design, so the OCR digit
    # fixes can safely be applied to the whole string.
    raw = text.translate(_DATE_LETTER_TO_DIGIT)
    t = raw.replace(' ', '').replace('\\', '/').replace('.', '/')
    if re.match(r'^\d{4}[-/]\d{2}[-/]\d{2}$', t):
        return t.replace('/', '-')
    if re.match(r'^\d{2}/\d{2}/\d{4}$', raw):
        # Interpreted as month/day/year.
        m, d, y = raw.split('/')
        return f"{y}-{int(m):02d}-{int(d):02d}"
    return raw
# One letter, seven digit positions (accepting OCR look-alikes), one trailing
# letter or digit. Lowercase o/l are allowed in the last position only because
# they can be nothing but misread 0/1.
_PASSPORT_CANDIDATE_RE = re.compile(
    r'\b([A-Z])([0-9OoIlSB]{7})([A-Z0-9ol])\b'
)
# Letter -> digit fixes (same mapping normalize_digits uses).
_PASSPORT_LETTER_TO_DIGIT = str.maketrans("OoIlSB", "001158")


def extract_passport_number(text):
    """
    Find a passport number of the form X0000000A in a line of OCR text.
    Args:
        text: OCR text to search; converted with str() first
    Returns:
        str | None: The first matching number with its seven-digit serial
        cleaned of OCR letter/digit confusions, or None when nothing matches
    Why only the serial is normalised:
        - Running the letter->digit fixes over the whole text turned a
          leading S, B, O or I into a digit, so such numbers never matched,
          and it rewrote a trailing B or S into 8 or 5. The prefix letter and
          an uppercase trailing character are therefore kept as read.
    """
    match = _PASSPORT_CANDIDATE_RE.search(str(text))
    if match is None:
        return None
    prefix, serial, suffix = match.groups()
    if suffix in ('o', 'l'):
        suffix = suffix.translate(_PASSPORT_LETTER_TO_DIGIT)
    return prefix + serial.translate(_PASSPORT_LETTER_TO_DIGIT) + suffix
def take_within(lines, i, k=5):
    """
    Collect the non-blank entries that follow position i.
    Args:
        lines (list): OCR lines (any values; each is passed through str())
        i (int): Index of the reference line
        k (int): How many following positions to inspect
    Returns:
        list[str]: Stripped text of lines[i+1] .. lines[i+k] that exist and
        are not empty after stripping, in original order
    """
    following = (
        str(lines[i + step]).strip()
        for step in range(1, k + 1)
        if i + step < len(lines)
    )
    return [text for text in following if text]
_NAME_RUN_RE = re.compile(r'[A-Z]{2,}')
_ANY_DIGIT_RE = re.compile(r'[0-9]')
_COMPACT_DATE_RE = re.compile(r'\d{1,2}[A-Z]{3}\d{4}')
_SPACED_DATE_RE = re.compile(r'\d{1,2}\s+[A-Z]{3}\s+\d{4}')
_SEX_TOKENS = ('M', 'F', 'MALE', 'FEMALE')


def _is_name_like(text):
    """Return True for text with a run of 2+ capitals and no digits."""
    return bool(_NAME_RUN_RE.search(text)) and not _ANY_DIGIT_RE.search(text)


def _is_date_like(text):
    """Return True when text contains a compact or spaced DDMONYYYY date."""
    return bool(_COMPACT_DATE_RE.search(text) or _SPACED_DATE_RE.search(text))


def _first_matching(candidates, predicate):
    """Return the first candidate accepted by predicate, or None."""
    return next((c for c in candidates if predicate(c)), None)


def extract_passport_info(lines):
    """
    Extract passport information from OCR text lines.
    Args:
        lines (list): List of text lines from OCR processing
    Returns:
        dict: id_type, passport_number, id_number, full_name, surname,
        given_names, middle_name, birth_date, sex, nationality,
        place_of_birth, date_of_issue, valid_until and issuing_authority;
        fields that were not found are None
    How it works:
        - Every line is checked against English and Filipino label keywords;
          when a label is seen, the next few lines are searched for the first
          value of the expected shape (lookahead).
        - "Direct" fallbacks fill a still-empty field from a line that looks
          like a value by itself; a later label match may overwrite them.
    Label collisions handled here:
        - "Panggitnang apelyido" (middle name) contains "apelyido", so it is
          not treated as the surname label; otherwise the middle name would
          overwrite the surname.
        - "Place of birth" / "Lugar ng kapanganakan" is not treated as the
          date-of-birth label; otherwise a date following the place of birth
          could overwrite the birth date.
    """
    dprint("Lines to extract", lines)
    surname = None
    given_names = None
    middle_name = None
    passport_number = None
    birth_date = None
    sex = None
    nationality = None
    place_of_birth = None
    date_of_issue = None
    valid_until = None
    issuing_authority = None

    cleaned = [str(x or '').strip() for x in lines]
    for i, line in enumerate(cleaned):
        low = line.lower()
        dprint("Line", {"i": i, "text": line})

        is_middle_label = 'middle' in low or 'panggitnang' in low
        mentions_place = 'place' in low or 'lugar' in low
        mentions_birth = 'birth' in low or 'kapanganakan' in low
        mentions_date = 'date' in low or 'petsa' in low

        # Passport number: first line that contains the number pattern.
        if not passport_number:
            passport_number = extract_passport_number(line)
            if passport_number:
                dprint("Found passport number", passport_number)

        # Surname (the Filipino middle-name label also contains "apelyido").
        if 'surname' in low or ('apelyido' in low and not is_middle_label):
            found = _first_matching(take_within(cleaned, i, 3), _is_name_like)
            if found is not None:
                surname = found
                dprint("Found surname", surname)
        # NOTE(review): fallback tied to one specific surname; looks like it
        # was written for a sample document - confirm it is still wanted.
        if not surname and 'dela' in low and 'cruz' in low:
            surname = line
            dprint("Found surname (direct)", surname)

        # Given names
        if ('given' in low and 'name' in low) or 'pangalan' in low:
            found = _first_matching(take_within(cleaned, i, 3), _is_name_like)
            if found is not None:
                given_names = found
                dprint("Found given names", given_names)
        # NOTE(review): sample-specific fallback, see the surname note above.
        if not given_names and line == 'MARIA':
            given_names = line
            dprint("Found given names (direct)", given_names)

        # Middle name
        if is_middle_label:
            found = _first_matching(take_within(cleaned, i, 3), _is_name_like)
            if found is not None:
                middle_name = found
                dprint("Found middle name", middle_name)
        # NOTE(review): sample-specific fallback, see the surname note above.
        if not middle_name and line == 'SANTOS':
            middle_name = line
            dprint("Found middle name (direct)", middle_name)

        # Date of birth: a birth label that is not the place-of-birth label
        # (a line naming both a date and a place still counts).
        if mentions_birth and (mentions_date or not mentions_place):
            found = _first_matching(take_within(cleaned, i, 3), _is_date_like)
            if found is not None:
                birth_date = format_date(found)
                dprint("Found birth date", birth_date)
        # Fallback: the first compact date seen while no birth date is known.
        if not birth_date and _COMPACT_DATE_RE.search(line):
            birth_date = format_date(line)
            dprint("Found birth date (direct)", birth_date)

        # Sex
        if 'sex' in low or 'kasarian' in low:
            found = _first_matching(
                take_within(cleaned, i, 2),
                lambda t: t.upper() in _SEX_TOKENS,
            )
            if found is not None:
                sex = 'M' if found.upper().startswith('M') else 'F'
                dprint("Found sex", sex)
        if not sex and line == 'F':
            sex = 'F'
            dprint("Found sex (direct)", sex)

        # Nationality
        if 'nationality' in low or 'nasyonalidad' in low:
            found = _first_matching(take_within(cleaned, i, 3), _is_name_like)
            if found is not None:
                nationality = found
                dprint("Found nationality", nationality)
        if not nationality and line == 'FILIPINO':
            nationality = line
            dprint("Found nationality (direct)", nationality)

        # Place of birth
        if ('place' in low and 'birth' in low) or 'lugar' in low:
            found = _first_matching(take_within(cleaned, i, 3), _is_name_like)
            if found is not None:
                place_of_birth = found
                dprint("Found place of birth", place_of_birth)
        # NOTE(review): sample-specific fallback, see the surname note above.
        if not place_of_birth and line == 'MANILA':
            place_of_birth = line
            dprint("Found place of birth (direct)", place_of_birth)

        # Date of issue
        if 'issue' in low or 'pagkakaloob' in low:
            found = _first_matching(take_within(cleaned, i, 3), _is_date_like)
            if found is not None:
                date_of_issue = format_date(found)
                dprint("Found date of issue", date_of_issue)
        # Fallback: the first compact date seen while no issue date is known.
        if not date_of_issue and _COMPACT_DATE_RE.search(line):
            date_of_issue = format_date(line)
            dprint("Found date of issue (direct)", date_of_issue)

        # Valid until: only dates written with spaces are accepted here, so a
        # compact date listed right after the label is skipped.
        if 'valid' in low or 'pagkawalang' in low:
            found = _first_matching(
                take_within(cleaned, i, 3),
                lambda t: bool(_SPACED_DATE_RE.search(t)),
            )
            if found is not None:
                valid_until = format_date(found)
                dprint("Found valid until", valid_until)
        if not valid_until and _SPACED_DATE_RE.search(line):
            valid_until = format_date(line)
            dprint("Found valid until (direct)", valid_until)

        # Issuing authority: containing "DFA" already implies a run of
        # capitals, so no separate name-shape check is needed.
        if 'authority' in low or 'maykapangyarihang' in low:
            found = _first_matching(
                take_within(cleaned, i, 3),
                lambda t: 'DFA' in t,
            )
            if found is not None:
                issuing_authority = found
                dprint("Found issuing authority", issuing_authority)
        if not issuing_authority and 'DFA' in line:
            issuing_authority = line
            dprint("Found issuing authority (direct)", issuing_authority)

    # Compose full name from separate fields
    full_name = normalize_full_name(surname, given_names, middle_name)
    dprint("Composed full name", {"surname": surname, "given": given_names, "middle": middle_name, "full": full_name})

    result = {
        "id_type": "passport",
        "passport_number": passport_number,
        "id_number": passport_number,
        "full_name": full_name,
        "surname": surname,
        "given_names": given_names,
        "middle_name": middle_name,
        "birth_date": birth_date,
        "sex": sex,
        "nationality": nationality,
        "place_of_birth": place_of_birth,
        "date_of_issue": date_of_issue,
        "valid_until": valid_until,
        "issuing_authority": issuing_authority
    }
    dprint("Final result", result)
    return result
def extract_ocr_lines(image_path):
    """
    Run PaddleOCR on an image and extract passport fields from its text.
    Args:
        image_path (str): Path of the image file to recognise
    Returns:
        dict: Result of extract_passport_info for the recognised text lines.
        When no text is recognised the same function is called with an empty
        list, so the result always carries the full set of keys (all None
        except id_type) instead of a shorter ad-hoc dict.
    Side effects:
        Creates the "output" directory if it does not exist.
    """
    os.makedirs("output", exist_ok=True)
    dprint("Initializing PaddleOCR")
    # Ensure any internal downloader/progress writes go to stderr, not stdout
    with redirect_stdout(sys.stderr), redirect_stderr(sys.stderr):
        ocr = PaddleOCR(
            use_doc_orientation_classify=False,
            use_doc_unwarping=False,
            use_textline_orientation=False,
            lang='en',
            show_log=False
        )
    dprint("OCR initialized")
    dprint("Running OCR ocr", image_path)
    results = ocr.ocr(image_path, cls=False)

    # The count is only used for the debug log; any unexpected result shape
    # is reported as 0 rather than aborting.
    try:
        count = len(results[0]) if results and isinstance(results[0], list) else len(results)
    except Exception:
        count = 0
    dprint("OCR ocr done, results_count", count)

    # Each usable entry is expected to look like (box, (text, ...)); entries
    # of any other shape are skipped.
    all_text = []
    try:
        lines = results[0] if results and isinstance(results[0], list) else results
        for item in lines:
            if isinstance(item, (list, tuple)) and len(item) >= 2:
                meta = item[1]
                if isinstance(meta, (list, tuple)) and len(meta) >= 1:
                    all_text.append(str(meta[0]))
    except Exception as e:
        dprint("Error processing OCR results", str(e))
    dprint("All direct texts", all_text)
    return extract_passport_info(all_text)
# Script entry: read the image URL from argv, run the pipeline and emit exactly
# one JSON document on the real stdout.
# sys.stdout was pointed at stderr near the top of the file, so every result -
# including the error cases - is written explicitly to _ORIG_STDOUT; a plain
# print() here would send the JSON to stderr where callers do not look for it.
if len(sys.argv) < 2:
    print(
        json.dumps({"error": "No image URL provided", "success": False}),
        file=_ORIG_STDOUT,
        flush=True,
    )
    sys.exit(1)
image_url = sys.argv[1]
dprint("Processing image URL", image_url)
try:
    image_path = download_image(image_url)
    dprint("Image downloaded to", image_path)
    ocr_results = extract_ocr_lines(image_path)
    dprint("OCR results ready")
    # Ensure only the final JSON goes to stdout
    print(
        json.dumps({"success": True, "ocr_results": ocr_results}),
        file=_ORIG_STDOUT,
        flush=True,
    )
except Exception as e:
    # Top-level boundary: report any failure as JSON and exit non-zero.
    import traceback
    error_msg = str(e)
    traceback_msg = traceback.format_exc()
    dprint("Exception", error_msg)
    dprint("Traceback", traceback_msg)
    print(
        json.dumps({
            "error": error_msg,
            "traceback": traceback_msg,
            "success": False
        }),
        file=_ORIG_STDOUT,
        flush=True,
    )
    sys.exit(1)