#!/usr/bin/env python3
"""
Philippine Passport Information Extraction Script

Purpose:
    Extracts structured information from Philippine passport images using OCR.
    Handles complex passport layouts with multiple information fields.

Why this script exists:
    - Passports have complex layouts with multiple information fields
    - Need to extract international-standard passport information
    - Handles bilingual labels (English/Filipino)
    - Required for passport verification workflows

Key Features:
    - Extracts passport number (format: X0000000A)
    - Handles complex name structures (surname, given names, middle name)
    - Processes multiple date fields (birth, issue, expiration)
    - Extracts nationality and place of birth
    - Handles OCR digit correction

Dependencies:
    - PaddleOCR: High-accuracy OCR engine (https://github.com/PaddlePaddle/PaddleOCR)
    - Pillow (PIL): Image processing (https://pillow.readthedocs.io/)
    - requests: HTTP library (https://docs.python-requests.org/)

Usage:
    python extract_passport.py "https://example.com/passport.jpg"

Output:
    A single JSON document on stdout. On success:
    {"success": true, "ocr_results": {passport_number, full_name, birth_date,
    valid_until, ...}}; on failure: {"success": false, "error": ...,
    "traceback": ...} (or {"error": ...} when no URL was given) with exit
    status 1. All diagnostics go to stderr.
"""

# NOTE(review): `glob` is not used in this file; the import is kept only so
# this edit does not drop a module-level name something else may rely on.
import glob
import json
import os
import re
import shutil
import sys
import traceback
from contextlib import redirect_stderr, redirect_stdout
from datetime import datetime
from io import BytesIO

# Third-party packages (requests, Pillow, PaddleOCR) are imported lazily inside
# the functions that need them.  That keeps the pure parsing helpers importable
# without the OCR stack and guarantees the environment tweaks and the stdout
# redirection are in place before PaddleOCR is loaded.

# Scratch files written next to the script by a previous run.
_TEMP_FILES = (
    'temp_image.jpg',
    'temp_image_ocr_res_img.jpg',
    'temp_image_preprocessed_img.jpg',
    'temp_image_res.json',
)

# Letters OCR commonly produces in place of digits.
_OCR_DIGIT_FIXES = str.maketrans({
    'O': '0', 'o': '0',
    'I': '1', 'l': '1',
    'S': '5',
    'B': '8',
})

_MONTHS = {
    'JAN': 1, 'FEB': 2, 'MAR': 3, 'APR': 4, 'MAY': 5, 'JUN': 6,
    'JUL': 7, 'AUG': 8, 'SEP': 9, 'OCT': 10, 'NOV': 11, 'DEC': 12,
}

# "16MAR1980" / "16 MAR 1980"; day and year may contain digit look-alikes.
_TEXT_DATE_RE = re.compile(
    r'([0-9OoIlSB]{1,2})\s*([A-Za-z]{3})\s*([0-9OoIlSB]{4})'
)

# Series letter, seven digits (possibly misread as look-alike letters), suffix.
_PASSPORT_RE = re.compile(r'\b([A-Z])([0-9OoIlSB]{7})([A-Z0-9])\b')

# Detection patterns applied to raw OCR lines while scanning fields.
_UPPER_WORD_RE = re.compile(r'[A-Z]{2,}')
_DIGIT_RE = re.compile(r'[0-9]')
_COMPACT_DATE_RE = re.compile(r'\d{1,2}[A-Z]{3}\d{4}')
_SPACED_DATE_RE = re.compile(r'\d{1,2}\s+[A-Z]{3}\s+\d{4}')
_LABELLED_DATE_RE = re.compile(r'\d{1,2}\s*[A-Z]{3}\s*\d{4}')

_SEX_TOKENS = ('M', 'F', 'MALE', 'FEMALE')


def _configure_headless_environment():
    """Set the environment variables the script needs before loading OCR."""
    os.environ['PADDLEOCR_LOG_LEVEL'] = 'ERROR'
    # Presumably these keep GUI-backed dependencies from trying to open a real
    # display on a server -- TODO confirm they are still required.
    os.environ['QT_QPA_PLATFORM'] = 'offscreen'
    os.environ['DISPLAY'] = ':99'


def dprint(msg, obj=None):
    """Write a ``DEBUG:`` line (optionally followed by ``obj``) to stderr."""
    try:
        suffix = f": {obj}" if obj is not None else ""
        print(f"DEBUG: {msg}{suffix}", file=sys.stderr)
    except Exception:
        # Deliberately best-effort: a logging failure must never abort the run.
        pass


def clean_cache():
    """Delete scratch files and the ``output`` directory left by earlier runs."""
    for path in _TEMP_FILES:
        if os.path.exists(path):
            os.remove(path)
            dprint("Removed cache file", path)
    if os.path.exists("output"):
        shutil.rmtree("output")
        dprint("Removed output directory")


def download_image(url, output_path='temp_image.jpg', timeout=30):
    """
    Download an image and store it as an RGB JPEG.

    Args:
        url (str): Location of the image.
        output_path (str): Where the converted JPEG is written.
        timeout (float): Seconds to wait for the server before giving up.

    Returns:
        str: ``output_path``.

    Raises:
        requests.RequestException: On network errors, timeouts or HTTP errors.
    """
    import requests
    from PIL import Image

    dprint("Starting download", url)
    clean_cache()
    # Without a timeout a stalled server would block the script forever.
    response = requests.get(url, timeout=timeout)
    dprint("HTTP status", response.status_code)
    response.raise_for_status()

    img = Image.open(BytesIO(response.content))
    if img.mode == 'RGBA':
        # JPEG has no alpha channel: flatten onto a white background.
        background = Image.new('RGB', img.size, (255, 255, 255))
        background.paste(img, mask=img.split()[-1])
        img = background
    elif img.mode != 'RGB':
        img = img.convert('RGB')
    img.save(output_path, 'JPEG', quality=95)
    dprint("Saved image", output_path)
    return output_path


def cap_words(s):
    """Capitalize every whitespace-separated word; return None for empty input."""
    return None if not s else ' '.join(w.capitalize() for w in s.split())


def normalize_digits(s):
    """
    Fix common OCR digit confusions.

    Args:
        s (str): Text string that may contain OCR errors

    Returns:
        str: Text with O/o→0, I/l→1, S→5 and B→8 replaced.

    Why this is needed:
        - OCR often misreads similar-looking characters
        - Critical for accurate ID number extraction

    The replacement is unconditional, so only apply it to text that is
    expected to be numeric (it would otherwise corrupt real letters).
    """
    return str(s).translate(_OCR_DIGIT_FIXES)


def normalize_full_name(surname, given_names, middle_name=None):
    """
    Build a display name of the form "First [Second] Surname".

    Args:
        surname (str | None): Family name.
        given_names (str | None): Space-separated given names; only the first
            two are used.
        middle_name (str | None): Accepted for interface compatibility but
            intentionally not included in the result.

    Returns:
        str | None: Word-capitalized full name, or None when both surname and
        given names are missing.
    """
    if not surname and not given_names:
        return None
    surname = surname.strip() if surname else ""
    given_parts = given_names.split() if given_names else []
    # Keep at most the first two given names, then the surname.
    return cap_words(' '.join(given_parts[:2] + [surname]))


def _parse_text_date(raw):
    """Return the first valid "DD MON YYYY"-style date in ``raw`` as ISO, or None."""
    for match in _TEXT_DATE_RE.finditer(raw):
        day, month_name, year = match.groups()
        month = _MONTHS.get(month_name.upper())
        if month is None:
            continue
        try:
            # Only the numeric parts get OCR digit correction; correcting the
            # month would turn SEP/OCT/NOV into 5EP/0CT/N0V.
            parsed = datetime(
                int(normalize_digits(year)), month, int(normalize_digits(day))
            )
        except ValueError:
            continue
        return parsed.date().isoformat()
    return None


def format_date(s):
    """
    Convert an OCR date string to ``YYYY-MM-DD``.

    Supported inputs: "16MAR1980", "16 MAR 1980" (also embedded in a longer
    line), "YYYY-MM-DD"/"YYYY/MM/DD", and "MM/DD/YYYY" (``.`` and ``\\`` are
    accepted as separators).

    Args:
        s: Raw text; falsy values yield None.

    Returns:
        str | None: ISO date when recognised. Otherwise the stripped input,
        digit-corrected only if that leaves no letters in it.
    """
    if not s:
        return None
    raw = str(s).strip()

    iso = _parse_text_date(raw)
    if iso:
        return iso

    # Purely numeric layouts: letters here can only be misread digits.
    digits = normalize_digits(raw)
    compact = digits.replace(' ', '').replace('\\', '/').replace('.', '/')
    if re.fullmatch(r'\d{4}[-/]\d{2}[-/]\d{2}', compact):
        return compact.replace('/', '-')
    if re.fullmatch(r'\d{2}/\d{2}/\d{4}', compact):
        month, day, year = compact.split('/')
        return f"{year}-{int(month):02d}-{int(day):02d}"

    # Unrecognised: never hand back text whose real letters were "corrected".
    return raw if any(ch.isalpha() for ch in digits) else digits


def extract_passport_number(text):
    """
    Find a passport number shaped like "P0000000A" in ``text``.

    OCR digit correction is applied only to the seven-digit body so the series
    letter and the trailing character are preserved as read.

    Returns:
        str | None: The corrected passport number, or None if none is found.
    """
    for match in _PASSPORT_RE.finditer(str(text)):
        series, body, suffix = match.groups()
        # Require at least one real digit so an ordinary word made only of
        # look-alike letters is not mistaken for a number.
        if any(ch.isdigit() for ch in body):
            return series + normalize_digits(body) + suffix
    return None


def take_within(lines, i, k=5):
    """Return the non-empty, stripped texts of up to ``k`` lines after index ``i``."""
    window = (str(item).strip() for item in lines[i + 1:i + 1 + k])
    return [text for text in window if text]


def _looks_like_value(text):
    """True for text with a run of capitals and no digits (names, places)."""
    return bool(_UPPER_WORD_RE.search(text)) and not _DIGIT_RE.search(text)


def _has_date(text):
    """True when ``text`` contains a compact or spaced "DD MON YYYY" date."""
    return bool(_LABELLED_DATE_RE.search(text))


def _lookahead(lines, i, predicate, k=3):
    """Return the first of the ``k`` lines after ``i`` accepted by ``predicate``."""
    return next((t for t in take_within(lines, i, k) if predicate(t)), None)


def extract_passport_info(lines):
    """
    Extract passport information from OCR text lines.

    Args:
        lines (list): List of text lines from OCR processing

    Returns:
        dict: Extracted passport information; fields that were not found are
        None.

    Why this approach:
        - Complex name structure requires separate handling
        - Multiple date fields need individual processing
        - A bilingual label line triggers a lookahead over the next few lines
          for a value of the expected shape; a value found via a label
          overrides one guessed by the unlabelled fallbacks.
    """
    dprint("Lines to extract", lines)

    surname = None
    given_names = None
    middle_name = None
    passport_number = None
    birth_date = None
    sex = None
    nationality = None
    place_of_birth = None
    date_of_issue = None
    valid_until = None
    issuing_authority = None

    texts = [str(x or '').strip() for x in (lines or [])]

    for i, line in enumerate(texts):
        low = line.lower()
        dprint("Line", {"i": i, "text": line})

        # Labels that share a keyword with another field's label.
        is_middle_label = 'middle' in low or 'panggitnang' in low
        is_place_label = ('place' in low and 'birth' in low) or 'lugar' in low

        # Passport number: first match wins.
        if not passport_number:
            passport_number = extract_passport_number(line)
            if passport_number:
                dprint("Found passport number", passport_number)

        # Surname.  "Panggitnang apelyido / Middle name" also contains
        # "apelyido" and must not overwrite the surname.
        if ('surname' in low or 'apelyido' in low) and not is_middle_label:
            found = _lookahead(texts, i, _looks_like_value)
            if found:
                surname = found
                dprint("Found surname", surname)
        # NOTE(review): the literal-value fallbacks below ("DELA CRUZ",
        # "MARIA", "SANTOS", "FILIPINO", "MANILA") look like sample-document
        # shortcuts; kept for compatibility -- confirm they are still wanted.
        if not surname and 'dela' in low and 'cruz' in low:
            surname = line
            dprint("Found surname (direct)", surname)

        # Given names.
        if ('given' in low and 'name' in low) or 'pangalan' in low:
            found = _lookahead(texts, i, _looks_like_value)
            if found:
                given_names = found
                dprint("Found given names", given_names)
        if not given_names and line == 'MARIA':
            given_names = line
            dprint("Found given names (direct)", given_names)

        # Middle name.
        if is_middle_label:
            found = _lookahead(texts, i, _looks_like_value)
            if found:
                middle_name = found
                dprint("Found middle name", middle_name)
        if not middle_name and line == 'SANTOS':
            middle_name = line
            dprint("Found middle name (direct)", middle_name)

        # Date of birth.  "Lugar ng kapanganakan / Place of birth" shares the
        # keywords, so it is excluded to avoid picking up a later date.
        if ('birth' in low or 'kapanganakan' in low) and not is_place_label:
            found = _lookahead(texts, i, _has_date)
            if found:
                birth_date = format_date(found)
                dprint("Found birth date", birth_date)
        if not birth_date and _COMPACT_DATE_RE.search(line):
            birth_date = format_date(line)
            dprint("Found birth date (direct)", birth_date)

        # Sex.
        if 'sex' in low or 'kasarian' in low:
            found = _lookahead(
                texts, i, lambda t: t.upper() in _SEX_TOKENS, k=2
            )
            if found:
                sex = 'M' if found.upper().startswith('M') else 'F'
                dprint("Found sex", sex)
        if not sex and line in ('M', 'F'):
            sex = line
            dprint("Found sex (direct)", sex)

        # Nationality.
        if 'nationality' in low or 'nasyonalidad' in low:
            found = _lookahead(texts, i, _looks_like_value)
            if found:
                nationality = found
                dprint("Found nationality", nationality)
        if not nationality and line == 'FILIPINO':
            nationality = line
            dprint("Found nationality (direct)", nationality)

        # Place of birth.
        if is_place_label:
            found = _lookahead(texts, i, _looks_like_value)
            if found:
                place_of_birth = found
                dprint("Found place of birth", place_of_birth)
        if not place_of_birth and line == 'MANILA':
            place_of_birth = line
            dprint("Found place of birth (direct)", place_of_birth)

        # Date of issue.
        if 'issue' in low or 'pagkakaloob' in low:
            found = _lookahead(texts, i, _has_date)
            if found:
                date_of_issue = format_date(found)
                dprint("Found date of issue", date_of_issue)
        if not date_of_issue and _COMPACT_DATE_RE.search(line):
            date_of_issue = format_date(line)
            dprint("Found date of issue (direct)", date_of_issue)

        # Valid until.
        if 'valid' in low or 'pagkawalang' in low:
            found = _lookahead(texts, i, _has_date)
            if found:
                valid_until = format_date(found)
                dprint("Found valid until", valid_until)
        if not valid_until and _SPACED_DATE_RE.search(line):
            valid_until = format_date(line)
            dprint("Found valid until (direct)", valid_until)

        # Issuing authority.
        if 'authority' in low or 'maykapangyarihang' in low:
            found = _lookahead(texts, i, lambda t: 'DFA' in t)
            if found:
                issuing_authority = found
                dprint("Found issuing authority", issuing_authority)
        if not issuing_authority and 'DFA' in line:
            issuing_authority = line
            dprint("Found issuing authority (direct)", issuing_authority)

    full_name = normalize_full_name(surname, given_names, middle_name)
    dprint("Composed full name", {"surname": surname, "given": given_names,
                                  "middle": middle_name, "full": full_name})

    result = {
        "id_type": "passport",
        "passport_number": passport_number,
        "id_number": passport_number,
        "full_name": full_name,
        "surname": surname,
        "given_names": given_names,
        "middle_name": middle_name,
        "birth_date": birth_date,
        "sex": sex,
        "nationality": nationality,
        "place_of_birth": place_of_birth,
        "date_of_issue": date_of_issue,
        "valid_until": valid_until,
        "issuing_authority": issuing_authority,
    }
    dprint("Final result", result)
    return result


def _collect_ocr_texts(results):
    """Pull the recognised strings out of a nested OCR result structure."""
    texts = []
    try:
        rows = results[0] if results and isinstance(results[0], list) else results
        for item in rows:
            # Only entries shaped like (box, (text, ...)) are used.
            if isinstance(item, (list, tuple)) and len(item) >= 2:
                meta = item[1]
                if isinstance(meta, (list, tuple)) and len(meta) >= 1:
                    texts.append(str(meta[0]))
    except Exception as e:
        # Best-effort: an unexpected result shape yields whatever was collected.
        dprint("Error processing OCR results", str(e))
    return texts


def extract_ocr_lines(image_path):
    """
    Run OCR on ``image_path`` and extract the passport fields.

    Returns:
        dict: Same keys as :func:`extract_passport_info`; every field is None
        when OCR produced no text.
    """
    _configure_headless_environment()
    # Imported here so the environment above is set before PaddleOCR loads.
    from paddleocr import PaddleOCR

    os.makedirs("output", exist_ok=True)
    dprint("Initializing PaddleOCR")
    # Ensure any internal downloader/progress writes go to stderr, not stdout
    with redirect_stdout(sys.stderr), redirect_stderr(sys.stderr):
        # NOTE(review): these option names look like a mix from different
        # PaddleOCR releases -- confirm against the pinned version.
        ocr = PaddleOCR(
            use_doc_orientation_classify=False,
            use_doc_unwarping=False,
            use_textline_orientation=False,
            lang='en',
            show_log=False
        )
    dprint("OCR initialized")

    dprint("Running OCR ocr", image_path)
    results = ocr.ocr(image_path, cls=False)
    try:
        count = len(results[0]) if results and isinstance(results[0], list) else len(results)
    except Exception:
        count = 0
    dprint("OCR ocr done, results_count", count)

    all_text = _collect_ocr_texts(results)
    dprint("All direct texts", all_text)
    # Always go through the extractor so the result has one consistent schema.
    return extract_passport_info(all_text)


def main(argv=None):
    """
    Command-line entry point.

    Args:
        argv (list[str] | None): Argument vector; defaults to ``sys.argv``.

    Returns:
        int: Process exit status (0 on success, 1 on any failure).
    """
    argv = sys.argv if argv is None else argv
    json_out = sys.stdout
    _configure_headless_environment()
    # Route any non-JSON prints to stderr; only the final JSON document is
    # written to the real stdout, on success and on failure alike.
    sys.stdout = sys.stderr
    try:
        if len(argv) < 2:
            print(json.dumps({"error": "No image URL provided"}),
                  file=json_out, flush=True)
            return 1

        image_url = argv[1]
        dprint("Processing image URL", image_url)
        try:
            image_path = download_image(image_url)
            dprint("Image downloaded to", image_path)
            ocr_results = extract_ocr_lines(image_path)
            dprint("OCR results ready")
        except Exception as e:
            # Top-level boundary: report every failure as structured JSON.
            error_msg = str(e)
            traceback_msg = traceback.format_exc()
            dprint("Exception", error_msg)
            dprint("Traceback", traceback_msg)
            print(json.dumps({
                "error": error_msg,
                "traceback": traceback_msg,
                "success": False
            }), file=json_out, flush=True)
            return 1

        print(json.dumps({"success": True, "ocr_results": ocr_results}),
              file=json_out, flush=True)
        return 0
    finally:
        sys.stdout = json_out


if __name__ == "__main__":
    sys.exit(main())