#!/usr/bin/env python3
"""
Philippine Passport Information Extraction Script

Purpose:
    Extracts structured information from Philippine passport images using OCR.
    Handles complex passport layouts with multiple information fields.

Why this script exists:
    - Passports have complex layouts with multiple information fields
    - Need to extract international-standard passport information
    - Handles bilingual labels (English/Filipino)
    - Required for passport verification workflows

Key Features:
    - Extracts passport number (format: X0000000A)
    - Handles complex name structures (surname, given names, middle name)
    - Processes multiple date fields (birth, issue, expiration)
    - Extracts nationality and place of birth
    - Handles OCR digit correction

Dependencies:
    - PaddleOCR: High-accuracy OCR engine (https://github.com/PaddlePaddle/PaddleOCR)
    - Pillow (PIL): Image processing (https://pillow.readthedocs.io/)
    - requests: HTTP library (https://docs.python-requests.org/)

Usage:
    python extract_passport.py "https://example.com/passport.jpg"

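Output:
    On success, a single JSON object {"success": true, "ocr_results": {...}} is
    written to stdout; ocr_results holds the extracted fields (passport_number,
    full_name, birth_date, valid_until, etc.). On failure, a JSON object with an
    "error" field is printed instead and the process exits with status 1.
    Debug messages go to stderr.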
"""

import sys, json, os, glob, re, requests
from PIL import Image
from io import BytesIO
from datetime import datetime
from contextlib import redirect_stdout, redirect_stderr

# Keep a handle on the real stdout, then point sys.stdout at stderr so that any
# stray print() from this script or an imported library lands on stderr. The
# script body restores sys.stdout from _ORIG_STDOUT right before it prints the
# success payload.
_ORIG_STDOUT = sys.stdout
sys.stdout = sys.stderr

# Environment tweaks; these must be set before paddleocr is imported below.
# PADDLEOCR_LOG_LEVEL is intended to quiet the OCR library's logging
# (NOTE(review): confirm the installed PaddleOCR version honours this variable).
# QT_QPA_PLATFORM / DISPLAY presumably let GUI-capable dependencies load on a
# headless host. DISPLAY is overwritten unconditionally - verify that ':99'
# matches the deployment's virtual display.
os.environ['PADDLEOCR_LOG_LEVEL'] = 'ERROR'
os.environ['QT_QPA_PLATFORM'] = 'offscreen'
os.environ['DISPLAY'] = ':99'

# Import PaddleOCR after setting environment variables
from paddleocr import PaddleOCR

def dprint(msg, obj=None):
    """
    Write a "DEBUG: ..." line to stderr.

    Args:
        msg: Message text placed after the "DEBUG: " prefix.
        obj: Optional value appended as ": <obj>"; omitted when None.

    Logging is best-effort: any exception raised while formatting or
    writing is swallowed so diagnostics can never break extraction.
    """
    try:
        suffix = "" if obj is None else f": {obj}"
        print(f"DEBUG: {msg}{suffix}", file=sys.stderr)
    except Exception:
        pass

def clean_cache():
    """
    Delete leftovers from a previous run in the current working directory.

    Removes the temporary image files this script (and the OCR run) may have
    written, plus the "output" directory, logging each removal via dprint.
    """
    stale_files = (
        'temp_image.jpg',
        'temp_image_ocr_res_img.jpg',
        'temp_image_preprocessed_img.jpg',
        'temp_image_res.json',
    )
    for path in stale_files:
        if not os.path.exists(path):
            continue
        os.remove(path)
        dprint("Removed cache file", path)

    output_dir = "output"
    if os.path.exists(output_dir):
        import shutil
        shutil.rmtree(output_dir)
        dprint("Removed output directory")

def download_image(url, output_path='temp_image.jpg', timeout=30):
    """
    Download an image and store it locally as an RGB JPEG.

    Args:
        url (str): Location of the image to fetch.
        output_path (str): Where the converted JPEG is written.
        timeout (float | tuple): Seconds to wait for the server (passed to
            requests.get). Without a timeout a stalled server would block
            the script forever.

    Returns:
        str: output_path, for convenience.

    Raises:
        requests.RequestException: On connection errors, timeouts or a
            non-2xx HTTP status.
        OSError: If the payload cannot be decoded or saved as an image.
    """
    dprint("Starting download", url)
    clean_cache()
    r = requests.get(url, timeout=timeout)
    dprint("HTTP status", r.status_code)
    r.raise_for_status()
    img = Image.open(BytesIO(r.content))

    # JPEG has no alpha channel. Flatten every kind of transparency (RGBA, LA,
    # or a palette image with a transparency entry) onto white; a plain
    # convert('RGB') would just drop the alpha, which typically leaves a dark
    # background behind the text.
    has_alpha = img.mode in ('RGBA', 'LA') or (
        img.mode == 'P' and 'transparency' in img.info
    )
    if has_alpha:
        rgba = img.convert('RGBA')
        bg = Image.new('RGB', rgba.size, (255, 255, 255))
        bg.paste(rgba, mask=rgba.split()[-1])
        img = bg
    elif img.mode != 'RGB':
        img = img.convert('RGB')

    img.save(output_path, 'JPEG', quality=95)
    dprint("Saved image", output_path)
    return output_path

def cap_words(s):
    """Capitalize every whitespace-separated word of s; None/empty input gives None."""
    if not s:
        return None
    words = [word.capitalize() for word in s.split()]
    return ' '.join(words)

def normalize_digits(s):
    """
    Replace letters that OCR commonly confuses with digits.

    Args:
        s: Value to clean; it is converted with str() first.

    Returns:
        str: The text with O/o -> 0, I/l -> 1, S -> 5 and B -> 8.

    Why this is needed:
    - OCR often misreads similar-looking characters
    - The mapping is applied to every character, so callers should pass
      text that is expected to be numeric
    """
    confusable_to_digit = str.maketrans({
        'O': '0',
        'o': '0',
        'I': '1',
        'l': '1',
        'S': '5',
        'B': '8',
    })
    return str(s).translate(confusable_to_digit)

def normalize_full_name(surname, given_names, middle_name=None):
    """
    Build a display name of the form "<given> [<second given>] <surname>".

    Args:
        surname (str | None): Family name as read from the document.
        given_names (str | None): Given names; only the first two words are kept.
        middle_name (str | None): Accepted for interface symmetry; it is not
            included in the composed name.

    Returns:
        str | None: The name with each word capitalized via cap_words, or
        None when both surname and given_names are missing.
    """
    if not (surname or given_names):
        return None

    family = surname.strip() if surname else ""
    # Stripped like the other fields, but deliberately left out of the result.
    middle_name = middle_name.strip() if middle_name else ""

    forenames = given_names.split() if given_names else []
    # At most two given names, followed by the surname.
    return cap_words(' '.join(forenames[:2] + [family]))

# Month abbreviations as printed on the data page; a fixed table keeps the
# parsing independent of the process locale.
_MONTH_NUMBERS = {
    'JAN': 1, 'FEB': 2, 'MAR': 3, 'APR': 4, 'MAY': 5, 'JUN': 6,
    'JUL': 7, 'AUG': 8, 'SEP': 9, 'OCT': 10, 'NOV': 11, 'DEC': 12,
}

# "16MAR1980" or "16 MAR 1980". The day and year groups also accept the
# letters that normalize_digits maps back to digits (O, o, I, l, S, B).
_DMY_DATE_RE = re.compile(
    r'([0-9OoIlSB]{1,2})\s*([A-Za-z]{3})\s*([0-9OoIlSB]{4})'
)


def format_date(s):
    """
    Convert an OCR'd date string to ISO "YYYY-MM-DD".

    Supported inputs:
    - day / month abbreviation / year, with or without spaces
      ("16MAR1980", "16 MAR 1980", "6 Mar 1980")
    - "YYYY-MM-DD" or "YYYY/MM/DD" (also with '.' or '\\' separators)
    - "MM/DD/YYYY" (also with '.' or '\\' separators)

    Args:
        s: Raw date text; falsy values yield None.

    Returns:
        str | None: ISO date when a format is recognized; otherwise the
        digit-normalized input text.

    Why the month is handled separately:
    - OCR digit correction must only touch the day and year. Applying it to
      the whole string turns OCT/SEP/FEB/NOV into 0CT/5EP/FE8/N0V, which can
      never be parsed as a month.
    """
    if not s:
        return None
    raw = str(s).strip()

    dmy = _DMY_DATE_RE.fullmatch(raw)
    if dmy:
        day_text, month_text, year_text = dmy.groups()
        month = _MONTH_NUMBERS.get(month_text.upper())
        if month is not None:
            try:
                parsed = datetime(
                    int(normalize_digits(year_text)),
                    month,
                    int(normalize_digits(day_text)),
                )
            except ValueError:
                # Impossible calendar date (e.g. day 32): fall through.
                pass
            else:
                return parsed.date().isoformat()

    # Purely numeric formats: here every character is expected to be a digit
    # or a separator, so whole-string digit correction is safe.
    raw = normalize_digits(raw)
    t = raw.replace(' ', '').replace('\\', '/').replace('.', '/')
    if re.fullmatch(r'\d{4}[-/]\d{2}[-/]\d{2}', t):
        return t.replace('/', '-')
    if re.fullmatch(r'\d{2}/\d{2}/\d{4}', t):
        m, d, y = t.split('/')
        return f"{y}-{int(m):02d}-{int(d):02d}"

    return raw

# Letter, seven digits, then a letter or digit (e.g. "P0000000A"). The digit
# positions also accept the letters OCR confuses with digits; a lowercase
# 'o'/'l' in the last position can only be a misread digit.
_PASSPORT_NUMBER_RE = re.compile(r'\b([A-Z])([0-9OoIlSB]{7})([A-Z0-9ol])\b')


def extract_passport_number(text):
    """
    Find a passport number of the form "P0000000A" in OCR text.

    Args:
        text: A line of OCR output (converted with str()).

    Returns:
        str | None: The passport number, or None when no candidate is found.

    Why only the middle is digit-corrected:
    - The first and last characters are letters of the number itself.
      Running OCR digit correction over the whole string would rewrite a
      genuine "P1234567B" to "P12345678" and would prevent any number that
      starts with B, S, O or I from matching at all.
    """
    match = _PASSPORT_NUMBER_RE.search(str(text))
    if match is None:
        return None

    prefix, digits, suffix = match.groups()
    if suffix.islower():
        suffix = normalize_digits(suffix)
    return prefix + normalize_digits(digits) + suffix

def take_within(lines, i, k=5):
    """
    Collect the non-empty lines that follow position i.

    Args:
        lines (list): OCR lines (any values; each is converted with str()).
        i (int): Index of the reference line.
        k (int): How many following positions to inspect.

    Returns:
        list[str]: Stripped, non-empty texts from positions i+1 .. i+k that
        exist in lines, in order.
    """
    last = min(i + k, len(lines) - 1)
    candidates = (str(lines[pos]).strip() for pos in range(i + 1, last + 1))
    return [text for text in candidates if text]

_UPPERCASE_RUN_RE = re.compile(r'[A-Z]{2,}')
_ANY_DIGIT_RE = re.compile(r'[0-9]')
_COMPACT_DATE_RE = re.compile(r'\d{1,2}[A-Z]{3}\d{4}')
_SPACED_DATE_RE = re.compile(r'\d{1,2}\s+[A-Z]{3}\s+\d{4}')


def _is_upper_value(text):
    """Return True for text with a run of 2+ capitals and no digits."""
    return bool(_UPPERCASE_RUN_RE.search(text)) and not _ANY_DIGIT_RE.search(text)


def _is_any_date(text):
    """Return True when text contains a compact or spaced DDMMMYYYY date."""
    return bool(_COMPACT_DATE_RE.search(text) or _SPACED_DATE_RE.search(text))


def _first_ahead(lines, i, k, accept):
    """Return the first non-empty line within k lines after i that accept() approves, else None."""
    return next((t for t in take_within(lines, i, k) if accept(t)), None)


def extract_passport_info(lines):
    """
    Extract passport information from OCR text lines.

    Args:
        lines (list): List of text lines from OCR processing

    Returns:
        dict: Extracted passport information. Every field that could not be
        found is None; "id_number" mirrors "passport_number".

    How it works:
    - Each line is checked for field labels (English or Filipino keywords).
      When a label is seen, the next few non-empty lines are searched for a
      plausible value; a label match always overrides an earlier guess.
    - Each field also has a "direct" fallback that fires only while the
      field is still empty. Several of them match literal sample values
      ("DELA CRUZ", "MARIA", "SANTOS", "MANILA", "FILIPINO"); they are kept
      as-is for backward compatibility.

    Label disambiguation:
    - The Filipino middle-name label ("Panggitnang apelyido") contains
      'apelyido', so 'apelyido' counts as a surname label only when
      'panggitnang' is absent; otherwise the surname would be overwritten
      by the middle name.
    - The place-of-birth label contains 'birth'/'kapanganakan', so it is
      not treated as a date-of-birth label unless the line also mentions
      'date'/'petsa'; otherwise a later date (e.g. the issue date) could
      replace the birth date.
    """
    dprint("Lines to extract", lines)

    # Initialize variables for extracted information
    surname = None
    given_names = None
    middle_name = None
    passport_number = None
    birth_date = None
    sex = None
    nationality = None
    place_of_birth = None
    date_of_issue = None
    valid_until = None
    issuing_authority = None

    L = [str(x or '').strip() for x in lines]
    for i, line in enumerate(L):
        low = line.lower()
        dprint("Line", {"i": i, "text": line})

        # Classify the labels that share keywords before using them.
        is_surname_label = 'surname' in low or (
            'apelyido' in low and 'panggitnang' not in low
        )
        is_place_label = ('place' in low and 'birth' in low) or 'lugar' in low
        is_birth_date_label = ('birth' in low or 'kapanganakan' in low) and (
            'date' in low or 'petsa' in low or not is_place_label
        )

        # Passport number: first pattern match wins.
        if not passport_number:
            passport_number = extract_passport_number(line)
            if passport_number:
                dprint("Found passport number", passport_number)

        # Surname
        if is_surname_label:
            hit = _first_ahead(L, i, 3, _is_upper_value)
            if hit is not None:
                surname = hit
                dprint("Found surname", surname)
        # Sample-specific fallback: "DELA CRUZ"
        if not surname and 'dela' in low and 'cruz' in low:
            surname = line
            dprint("Found surname (direct)", surname)

        # Given names
        if ('given' in low and 'name' in low) or 'pangalan' in low:
            hit = _first_ahead(L, i, 3, _is_upper_value)
            if hit is not None:
                given_names = hit
                dprint("Found given names", given_names)
        # Sample-specific fallback: "MARIA"
        if not given_names and line == 'MARIA':
            given_names = line
            dprint("Found given names (direct)", given_names)

        # Middle name
        if 'middle' in low or 'panggitnang' in low:
            hit = _first_ahead(L, i, 3, _is_upper_value)
            if hit is not None:
                middle_name = hit
                dprint("Found middle name", middle_name)
        # Sample-specific fallback: "SANTOS"
        if not middle_name and line == 'SANTOS':
            middle_name = line
            dprint("Found middle name (direct)", middle_name)

        # Date of birth
        if is_birth_date_label:
            hit = _first_ahead(L, i, 3, _is_any_date)
            if hit is not None:
                birth_date = format_date(hit)
                dprint("Found birth date", birth_date)
        # Fallback: first compact date (e.g. "16MAR1980") seen anywhere
        if not birth_date and _COMPACT_DATE_RE.search(line):
            birth_date = format_date(line)
            dprint("Found birth date (direct)", birth_date)

        # Sex
        if 'sex' in low or 'kasarian' in low:
            hit = _first_ahead(
                L, i, 2, lambda t: t.upper() in ('M', 'F', 'MALE', 'FEMALE')
            )
            if hit is not None:
                sex = 'M' if hit.upper().startswith('M') else 'F'
                dprint("Found sex", sex)
        # Fallback: a line that is exactly "M" or "F"
        if not sex and line in ('M', 'F'):
            sex = line
            dprint("Found sex (direct)", sex)

        # Nationality
        if 'nationality' in low or 'nasyonalidad' in low:
            hit = _first_ahead(L, i, 3, _is_upper_value)
            if hit is not None:
                nationality = hit
                dprint("Found nationality", nationality)
        # Fallback: "FILIPINO"
        if not nationality and line == 'FILIPINO':
            nationality = line
            dprint("Found nationality (direct)", nationality)

        # Place of birth
        if is_place_label:
            hit = _first_ahead(L, i, 3, _is_upper_value)
            if hit is not None:
                place_of_birth = hit
                dprint("Found place of birth", place_of_birth)
        # Sample-specific fallback: "MANILA"
        if not place_of_birth and line == 'MANILA':
            place_of_birth = line
            dprint("Found place of birth (direct)", place_of_birth)

        # Date of issue
        if 'issue' in low or 'pagkakaloob' in low:
            hit = _first_ahead(L, i, 3, _is_any_date)
            if hit is not None:
                date_of_issue = format_date(hit)
                dprint("Found date of issue", date_of_issue)
        # Fallback: first compact date (e.g. "27JUN2016") seen anywhere
        if not date_of_issue and _COMPACT_DATE_RE.search(line):
            date_of_issue = format_date(line)
            dprint("Found date of issue (direct)", date_of_issue)

        # Valid until: only spaced dates ("26 JUN 2021") are accepted here.
        if 'valid' in low or 'pagkawalang' in low:
            hit = _first_ahead(
                L, i, 3, lambda t: bool(_SPACED_DATE_RE.search(t))
            )
            if hit is not None:
                valid_until = format_date(hit)
                dprint("Found valid until", valid_until)
        # Fallback: first spaced date seen anywhere
        if not valid_until and _SPACED_DATE_RE.search(line):
            valid_until = format_date(line)
            dprint("Found valid until (direct)", valid_until)

        # Issuing authority: a value mentioning "DFA" (e.g. "DFAMANILA")
        if 'authority' in low or 'maykapangyarihang' in low:
            hit = _first_ahead(L, i, 3, lambda t: 'DFA' in t)
            if hit is not None:
                issuing_authority = hit
                dprint("Found issuing authority", issuing_authority)
        if not issuing_authority and 'DFA' in line:
            issuing_authority = line
            dprint("Found issuing authority (direct)", issuing_authority)

    # Compose full name from separate fields
    full_name = normalize_full_name(surname, given_names, middle_name)
    dprint("Composed full name", {"surname": surname, "given": given_names, "middle": middle_name, "full": full_name})

    # Return structured result
    result = {
        "id_type": "passport",
        "passport_number": passport_number,
        "id_number": passport_number,
        "full_name": full_name,
        "surname": surname,
        "given_names": given_names,
        "middle_name": middle_name,
        "birth_date": birth_date,
        "sex": sex,
        "nationality": nationality,
        "place_of_birth": place_of_birth,
        "date_of_issue": date_of_issue,
        "valid_until": valid_until,
        "issuing_authority": issuing_authority
    }
    dprint("Final result", result)
    return result

def extract_ocr_lines(image_path):
    """
    Run PaddleOCR on an image and extract passport fields from the text.

    Args:
        image_path (str): Path of the image file to recognize.

    Returns:
        dict: The result of extract_passport_info for the recognized text
        lines. When OCR yields no text the same dict shape is returned with
        every field set to None, so consumers always see one schema.

    Side effects:
        Creates an "output" directory in the current working directory.
    """
    os.makedirs("output", exist_ok=True)
    dprint("Initializing PaddleOCR")

    # Ensure any internal downloader/progress writes go to stderr, not stdout
    with redirect_stdout(sys.stderr), redirect_stderr(sys.stderr):
        ocr = PaddleOCR(
            use_doc_orientation_classify=False, 
            use_doc_unwarping=False, 
            use_textline_orientation=False, 
            lang='en',
            show_log=False
        )
        dprint("OCR initialized")
        dprint("Running OCR ocr", image_path)
        results = ocr.ocr(image_path, cls=False)

    # The result may be wrapped in one extra list (one entry per image);
    # unwrap it once and reuse it for both the count and the parsing.
    try:
        detections = results[0] if results and isinstance(results[0], list) else results
        count = len(detections)
    except Exception:
        detections, count = [], 0
    dprint("OCR ocr done, results_count", count)

    # Each usable detection looks like (box, (text, ...)); anything else is skipped.
    all_text = []
    try:
        for item in detections:
            if isinstance(item, (list, tuple)) and len(item) >= 2:
                meta = item[1]
                if isinstance(meta, (list, tuple)) and len(meta) >= 1:
                    all_text.append(str(meta[0]))
    except Exception as e:
        # Best effort: keep whatever text was collected before the failure.
        dprint("Error processing OCR results", str(e))

    dprint("All direct texts", all_text)
    # With no text this returns the full result dict with all fields None.
    return extract_passport_info(all_text)

def _emit_json(payload):
    """Write the machine-readable result to the real stdout.

    sys.stdout is redirected to stderr at import time, so a plain print()
    would send the JSON to stderr where the calling process will not see it.
    """
    print(json.dumps(payload), file=_ORIG_STDOUT)


if len(sys.argv) < 2:
    _emit_json({"error": "No image URL provided"})
    sys.exit(1)

image_url = sys.argv[1]
dprint("Processing image URL", image_url)
try:
    image_path = download_image(image_url)
    dprint("Image downloaded to", image_path)
    ocr_results = extract_ocr_lines(image_path)
    dprint("OCR results ready")
    # Ensure only the final JSON goes to stdout
    sys.stdout = _ORIG_STDOUT
    _emit_json({"success": True, "ocr_results": ocr_results})
except Exception as e:
    # Top-level boundary: report every failure as JSON and exit non-zero.
    import traceback
    error_msg = str(e)
    traceback_msg = traceback.format_exc()
    dprint("Exception", error_msg)
    dprint("Traceback", traceback_msg)
    _emit_json({
        "error": error_msg,
        "traceback": traceback_msg,
        "success": False
    })
    sys.exit(1)