Spaces:
Sleeping
Sleeping
File size: 12,930 Bytes
db10255 7908d00 db10255 6916300 db10255 7908d00 db10255 7908d00 db10255 6916300 db10255 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 |
#!/usr/bin/env python3
"""
Philippine PRC (Professional Regulation Commission) License Information Extraction Script
Purpose:
Extracts structured information from PRC license images using OCR.
Handles various PRC license formats including UMID-style cards.
Why this script exists:
- PRC licenses have complex layouts with multiple information fields
- Need to extract profession-specific information
- Handles both traditional PRC licenses and UMID-style PRC cards
- Required for professional verification workflows
Key Features:
- Extracts CRN (Common Reference Number) - 12-digit format
- Processes registration numbers and dates
- Extracts profession information
- Handles GSIS/SSS number extraction
- Supports validity date tracking
Dependencies:
- PaddleOCR: High-accuracy OCR engine (https://github.com/PaddlePaddle/PaddleOCR)
- Pillow (PIL): Image processing (https://pillow.readthedocs.io/)
- requests: HTTP library (https://docs.python-requests.org/)
Usage:
python extract_prc.py "https://example.com/prc_license.jpg"
Output:
JSON with extracted information: crn, registration_number, profession, valid_until, etc.
"""
import sys, json, os, glob, re, requests
from PIL import Image
from io import BytesIO
from datetime import datetime
from contextlib import redirect_stdout, redirect_stderr
# Immediately redirect all output to stderr except for our final JSON.
# The caller parses stdout as JSON, so any library chatter on stdout would
# corrupt the response; original_stdout is restored just before emitting JSON.
original_stdout = sys.stdout
sys.stdout = sys.stderr
# Suppress all PaddleOCR output and force headless operation.
os.environ['PADDLEOCR_LOG_LEVEL'] = 'ERROR'
os.environ['QT_QPA_PLATFORM'] = 'offscreen'  # no GUI backend available server-side
os.environ['DISPLAY'] = ':99'  # dummy display for libraries that require one
# Import PaddleOCR after setting environment variables so the settings
# above take effect during its import-time initialization.
from paddleocr import PaddleOCR
def dprint(msg, obj=None):
    """Emit a debug line on stderr; swallows all errors so logging
    can never break the extraction pipeline.

    Args:
        msg (str): Debug message.
        obj (any): Optional payload appended after the message.
    """
    try:
        suffix = "" if obj is None else f": {obj}"
        print(f"DEBUG: {msg}{suffix}", file=sys.stderr)
    except Exception:
        # Logging is best-effort only; never propagate.
        pass
def clean_cache():
    """Remove temp images/JSON and the OCR output directory left by a prior run."""
    leftovers = (
        'temp_image.jpg',
        'temp_image_ocr_res_img.jpg',
        'temp_image_preprocessed_img.jpg',
        'temp_image_res.json',
    )
    for path in leftovers:
        if os.path.exists(path):
            os.remove(path)
            dprint("Removed cache file", path)
    if os.path.exists("output"):
        import shutil
        shutil.rmtree("output")
        dprint("Removed output directory")
def download_image(url, output_path='temp_image.jpg'):
    """Download an image, flatten any transparency onto white, save as JPEG.

    Args:
        url (str): Image URL to fetch.
        output_path (str): Destination JPEG path (default 'temp_image.jpg').

    Returns:
        str: Path of the saved JPEG.

    Raises:
        requests.HTTPError: When the server responds with an error status.
    """
    dprint("Starting download", url)
    clean_cache()
    # A browser-like User-Agent avoids trivial bot blocking on some hosts.
    ua = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
          '(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36')
    response = requests.get(url, headers={'User-Agent': ua}, timeout=30)
    dprint("HTTP status", response.status_code)
    response.raise_for_status()
    image = Image.open(BytesIO(response.content))
    if image.mode == 'RGBA':
        # Composite the alpha channel over a white background so JPEG
        # (which has no alpha) renders cleanly.
        flattened = Image.new('RGB', image.size, (255, 255, 255))
        flattened.paste(image, mask=image.split()[-1])
        image = flattened
    elif image.mode != 'RGB':
        image = image.convert('RGB')
    image.save(output_path, 'JPEG', quality=95)
    dprint("Saved image", output_path)
    return output_path
def format_date(s):
    """Normalize a date string to ISO ``YYYY-MM-DD``.

    Args:
        s (str | None): Raw date text from OCR. May use '/', '-', '.', or
            '\\' separators and contain stray/duplicate spaces.

    Returns:
        str | None: ISO date when the input matches a known layout,
        ``None`` for empty input, otherwise the whitespace-normalized
        input unchanged so callers can still surface the raw value.
    """
    if not s:
        return None
    # Collapse runs of whitespace — OCR often injects doubled spaces.
    # (The original code did raw.replace(' ', ' '), a no-op, which made
    # month-name dates with double spaces fail strptime.)
    raw = ' '.join(s.split())
    # Unify separators so one regex covers '\', '.' and '/' variants.
    t = raw.replace(' ', '').replace('\\', '/').replace('.', '/')
    # yyyy-mm-dd or yyyy/mm/dd
    if re.match(r'^\d{4}[-/]\d{2}[-/]\d{2}$', t):
        return t.replace('/', '-')
    # mm/dd/yyyy — matched against the normalized string so dotted or
    # backslashed variants (e.g. "01.02.2024") are accepted too.
    if re.match(r'^\d{2}/\d{2}/\d{4}$', t):
        m, d, y = t.split('/')
        return f"{y}-{int(m):02d}-{int(d):02d}"
    # Month-name forms, e.g. "January 5, 2024" / "Jan 5, 2024"
    if re.match(r'^[A-Za-z]+\s*\d{1,2},\s*\d{4}$', raw):
        for fmt in ("%B %d, %Y", "%b %d, %Y"):
            try:
                return datetime.strptime(raw, fmt).strftime("%Y-%m-%d")
            except ValueError:
                pass
    # Unknown layout: hand back the cleaned text rather than guessing.
    return raw
def cap_words(name):
    """Title-case each whitespace-separated word; falsy input yields None."""
    if not name:
        return None
    return ' '.join(word.capitalize() for word in name.split())
def normalize_name_from_parts(last, first_block):
    """Compose "Given [Given2] Last" from OCR'd name fields.

    Keeps at most the first two tokens of the given-names block (middle
    names beyond that are dropped), appends the surname, then title-cases
    the result. Returns None when both inputs are empty.
    """
    surname = (last or '').strip()
    given = [tok for tok in (first_block or '').strip().split(' ') if tok]
    combined = ' '.join(given[:2] + [surname]).strip()
    return cap_words(combined) if combined else None
def normalize_full_name_from_three(first, middle, last):
    """Compose a display name from first/middle/last fields.

    The middle name is deliberately ignored; at most two tokens from the
    first-name block are kept, matching normalize_name_from_parts.
    """
    given = [tok for tok in (first or '').strip().split(' ') if tok]
    combined = ' '.join(given[:2] + [last or '']).strip()
    return cap_words(combined) if combined else None
def take_within(lines, i, k=5):
    """Return up to k non-empty, stripped lines that follow index i.

    Used as a lookahead when an OCR label ("Registration No", etc.) and its
    value land on separate lines.
    """
    upper = min(i + k + 1, len(lines))
    collected = []
    for idx in range(i + 1, upper):
        text = str(lines[idx]).strip()
        if text:
            collected.append(text)
    return collected
def is_numeric_id(t):
    """True when t (spaces removed) is a run of at least five digits."""
    digits = str(t).replace(' ', '')
    return bool(re.match(r'^\d{5,}$', digits))
def is_crn(t):
    """Check whether t looks like a UMID CRN (exactly 12 digits).

    Spaces are stripped first so grouped OCR output such as
    "1234 5678 9012" still matches. The input is coerced to str for
    consistency with is_numeric_id, so a non-string OCR token no longer
    raises AttributeError.
    """
    return bool(re.match(r'^\d{12}$', str(t).replace(' ', '')))
def is_date(t):
    """True when t matches any date layout this script knows how to parse."""
    normalized = t.replace(' ', '').replace('\\', '/').replace('.', '/')
    hit = (
        re.match(r'^\d{4}[-/]\d{2}[-/]\d{2}$', normalized)
        or re.match(r'^\d{2}/\d{2}/\d{4}$', t)
        or re.match(r'^[A-Za-z]+\s*\d{1,2},\s*\d{4}$', t)
    )
    return bool(hit)
def extract_prc_info(lines):
    """
    Extract PRC license information from OCR text lines.

    Args:
        lines (list): List of text lines from OCR processing, in reading
            order. Items are coerced to str; None entries become ''.

    Returns:
        dict: Extracted PRC information with keys: id_type, crn, id_number,
        registration_number, registration_date, valid_until, full_name,
        birth_date, sss_number, gsis_number, profession. Fields not found
        remain None.

    Why this approach:
    - PRC licenses have complex layouts with multiple fields
    - Need to handle various license formats (traditional and UMID-style)
    - Labels and values often land on separate OCR lines, so each label
      match does a bounded lookahead (take_within) for a plausible value
    - First match wins only for CRN/SSS/GSIS (guarded by `is None`);
      other fields keep the LAST match encountered while scanning
    """
    dprint("Lines to extract", lines)
    # Initialize variables for extracted information
    crn = None
    full_name = None
    birth_date = None
    gsis_number = None
    sss_number = None
    registration_number = None
    registration_date = None
    valid_until = None
    profession = None
    # Collect name parts separately; composed into full_name after the scan
    last_name_txt = None
    first_name_txt = None
    # Normalize once: stringify and strip every line (None -> '')
    L = [str(x or '').strip() for x in lines]
    i = 0
    while i < len(L):
        line = L[i]
        low = line.lower()
        dprint("Line", {"i": i, "text": line})
        # Extract CRN (UMID format) - a bare 12-digit line; first one wins
        if crn is None and is_crn(line):
            crn = line.replace(' ', '')
            dprint("Found CRN", crn)
        # Extract Last Name: take the first lookahead line that is not
        # itself another field label
        if 'last name' in low:
            ahead = take_within(L, i, 3)
            for t in ahead:
                tl = t.lower()
                if not any(k in tl for k in ['first', 'middle', 'registration', 'valid', 'date', 'no']):
                    last_name_txt = t
                    break
        # Extract First Name: assumes the value is the very next line
        if 'firstname' in low or 'first name' in low:
            if i+1 < len(L):
                first_name_txt = L[i+1]
        # Extract Date of Birth: first date-shaped value within 4 lines
        if ('date of birth' in low) or ('birth' in low and 'date' in low):
            ahead = take_within(L, i, 4)
            for t in ahead:
                if is_date(t):
                    birth_date = format_date(t)
                    break
        # Extract Registration Number - handles the label split across two
        # lines ("Registration" / "No")
        if low == 'registration' and i+1 < len(L) and L[i+1].lower() in ('no', 'no.', 'number'):
            ahead = take_within(L, i+1, 4)
            for t in ahead:
                if is_numeric_id(t):
                    registration_number = t.replace(' ', '')
                    break
        # Also handle fused label forms ("Registration No" on one line)
        if ('registration no' in low) or ('registration number' in low):
            ahead = take_within(L, i, 4)
            for t in ahead:
                if is_numeric_id(t):
                    registration_number = t.replace(' ', '')
                    break
        # Extract Registration Date - split label ("Registration" / "Date")
        if low == 'registration' and i+1 < len(L) and L[i+1].lower() == 'date':
            ahead = take_within(L, i+1, 4)
            for t in ahead:
                if is_date(t):
                    registration_date = format_date(t)
                    break
        # Fused form ("Registration Date")
        if 'registration date' in low:
            ahead = take_within(L, i, 3)
            for t in ahead:
                if is_date(t):
                    registration_date = format_date(t)
                    break
        # Extract Valid Until Date
        if 'valid until' in low or 'validity' in low:
            ahead = take_within(L, i, 3)
            for t in ahead:
                if is_date(t):
                    valid_until = format_date(t)
                    break
        # Extract Profession: keyword match on the line itself; requires at
        # least two words so a lone keyword fragment is not taken as a title
        if any(k in low for k in ['occupational','technician','engineer','teacher','nurse']):
            if len(line.split()) >= 2:
                profession = cap_words(line)
                dprint("Found profession", profession)
        # Extract SSS Number (first match wins)
        if sss_number is None and ('sss' in low or 'social security' in low):
            ahead = take_within(L, i, 3)
            for t in ahead:
                if is_numeric_id(t):
                    sss_number = t.replace(' ', '')
                    dprint("Found sss_number", sss_number)
                    break
        # Extract GSIS Number (first match wins)
        if gsis_number is None and ('gsis' in low):
            ahead = take_within(L, i, 3)
            for t in ahead:
                if is_numeric_id(t):
                    gsis_number = t.replace(' ', '')
                    dprint("Found gsis_number", gsis_number)
                    break
        i += 1
    # Compose full name from the collected parts
    if full_name is None:
        full_name = normalize_name_from_parts(last_name_txt, first_name_txt)
    # Return structured result
    result = {
        "id_type": "PRC ID",
        "crn": crn,
        "id_number": registration_number or crn,  # Frontend expects id_number
        "registration_number": registration_number,
        "registration_date": registration_date,
        "valid_until": valid_until,
        "full_name": full_name,
        "birth_date": birth_date,
        "sss_number": sss_number,
        "gsis_number": gsis_number,
        "profession": profession
    }
    dprint("Final result", result)
    return result
def extract_ocr_lines(image_path):
    """Run PaddleOCR on an image and extract PRC fields from the text.

    Args:
        image_path (str): Path to the JPEG produced by download_image.

    Returns:
        dict: Result of extract_prc_info, or a minimal all-None dict when
        no text could be recovered.
    """
    os.makedirs("output", exist_ok=True)
    dprint("Initializing PaddleOCR")
    # Redirect both streams: PaddleOCR prints progress to stdout, which
    # would otherwise pollute the JSON-only stdout contract of this script.
    with redirect_stdout(sys.stderr), redirect_stderr(sys.stderr):
        ocr = PaddleOCR(
            use_doc_orientation_classify=False,
            use_doc_unwarping=False,
            use_textline_orientation=False,
            lang='en'
        )
        dprint("OCR initialized")
        dprint("Running OCR predict", image_path)
        results = ocr.predict(image_path)
        dprint("OCR predict done, results_count", len(results))
    # Process OCR results directly
    all_text = []
    try:
        # NOTE(review): this assumes the classic result shape — a list of
        # [box, (text, score)] items, possibly nested one level. Newer
        # PaddleOCR pipeline versions return dicts (e.g. 'rec_texts');
        # confirm against the installed PaddleOCR version.
        lines = results[0] if results and isinstance(results[0], list) else results
        for item in lines:
            if isinstance(item, (list, tuple)) and len(item) >= 2:
                meta = item[1]
                if isinstance(meta, (list, tuple)) and len(meta) >= 1:
                    all_text.append(str(meta[0]))
    except Exception as e:
        # Best-effort: an unexpected result shape degrades to "no text".
        dprint("Error processing OCR results", str(e))
    dprint("All direct texts", all_text)
    return extract_prc_info(all_text) if all_text else {
        "id_type": "PRC ID",
        "crn": None,
        "full_name": None,
        "birth_date": None
    }
# --- Script entry point ---------------------------------------------------
# Contract: exactly one JSON object is written to the real stdout; all other
# output goes to stderr. Exit code 0 on success, 1 on any failure.
if len(sys.argv) < 2:
    # Restore stdout so the error JSON is the only thing the caller reads.
    sys.stdout = original_stdout
    print(json.dumps({"error": "No image URL provided"}))
    sys.exit(1)
image_url = sys.argv[1]
dprint("Processing image URL", image_url)
try:
    image_path = download_image(image_url)
    dprint("Image downloaded to", image_path)
    ocr_results = extract_ocr_lines(image_path)
    dprint("OCR results ready")
    # Restore stdout and print only the JSON response
    sys.stdout = original_stdout
    sys.stdout.write(json.dumps({"success": True, "ocr_results": ocr_results}))
    sys.stdout.flush()
except Exception as e:
    dprint("Exception", str(e))
    # Restore stdout for error JSON
    sys.stdout = original_stdout
    sys.stdout.write(json.dumps({"error": str(e)}))
    sys.stdout.flush()
    sys.exit(1)
finally:
    # Clean up temp files regardless of outcome; best-effort only so a
    # cleanup failure cannot mask the real result/error already written.
    try:
        clean_cache()
    except:
        pass
|