Spaces:

takomattyy
/

handyhome-ocr-api

Sleeping

File size: 10,591 Bytes

db10255

import sys, json, os, glob, re, requests
from PIL import Image
from io import BytesIO
from datetime import datetime
from contextlib import redirect_stdout, redirect_stderr

# Send everything printed during the run to stderr so that stdout carries
# nothing but the final JSON payload. The real stdout is kept in
# `original_stdout` and swapped back in just before that payload is written.
original_stdout = sys.stdout
sys.stdout = sys.stderr

# Environment tweaks applied before PaddleOCR is imported.
# PADDLEOCR_LOG_LEVEL: presumably lowers the library's log verbosity -- TODO
#   confirm the installed PaddleOCR version honors this variable.
# QT_QPA_PLATFORM / DISPLAY: presumably let GUI-linked dependencies load on a
#   headless host -- verify against the deployment image.
os.environ['PADDLEOCR_LOG_LEVEL'] = 'ERROR'
os.environ['QT_QPA_PLATFORM'] = 'offscreen'
os.environ['DISPLAY'] = ':99'

# Deliberately imported here rather than with the other imports: the
# environment variables above must already be set when the package loads.
from paddleocr import PaddleOCR

def dprint(msg, obj=None):
    """Write a ``DEBUG:``-prefixed diagnostic line to stderr.

    Args:
        msg: Message text placed after the ``DEBUG: `` prefix.
        obj: Optional value appended as ``": <obj>"``. Only ``None`` is
            treated as "absent", so falsy values such as ``0`` are printed.

    Any exception raised while formatting or printing is swallowed on
    purpose: debug output must never abort the run.
    """
    try:
        text = f"DEBUG: {msg}"
        if obj is not None:
            text += f": {obj}"
        print(text, file=sys.stderr)
    except Exception:
        pass

def clean_cache():
    """Delete the temporary files and the ``output`` directory of a prior run.

    Removes each known temp artefact in the current working directory if it
    exists, then removes the ``output`` directory tree if it exists. Each
    removal is reported through ``dprint``. Errors from ``os.remove`` or
    ``shutil.rmtree`` propagate to the caller.
    """
    for stale in (
        'temp_image.jpg',
        'temp_image_ocr_res_img.jpg',
        'temp_image_preprocessed_img.jpg',
        'temp_image_res.json',
    ):
        if not os.path.exists(stale):
            continue
        os.remove(stale)
        dprint("Removed cache file", stale)

    if not os.path.exists("output"):
        return
    import shutil
    shutil.rmtree("output")
    dprint("Removed output directory")

def download_image(url, output_path='temp_image.jpg', timeout=30):
    """Download an image and store it locally as an RGB JPEG.

    Leftovers from a previous run are removed first (``clean_cache``), then
    the image is fetched, flattened to RGB and written to ``output_path``.

    Args:
        url: HTTP(S) location of the image.
        output_path: Destination file for the converted JPEG.
        timeout: Seconds to wait for the server (passed to ``requests.get``).
            Without a timeout a stalled server would block the script
            indefinitely.

    Returns:
        ``output_path``, once the JPEG has been written.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-success HTTP status (via ``raise_for_status``).
        Exception: Whatever PIL raises if the payload cannot be opened or
            saved as an image.
    """
    dprint("Starting download", url)
    clean_cache()
    r = requests.get(url, timeout=timeout)
    dprint("HTTP status", r.status_code)
    r.raise_for_status()
    img = Image.open(BytesIO(r.content))
    if img.mode == 'RGBA':
        # JPEG has no alpha channel: composite onto white, using the alpha
        # band as the paste mask, instead of letting transparency turn black.
        bg = Image.new('RGB', img.size, (255, 255, 255))
        bg.paste(img, mask=img.split()[-1])
        img = bg
    elif img.mode != 'RGB':
        img = img.convert('RGB')
    img.save(output_path, 'JPEG', quality=95)
    dprint("Saved image", output_path)
    return output_path

def format_date(s):
    """Normalise a date string to ``YYYY-MM-DD`` where the format is known.

    Recognised inputs (after trimming):
      * ``YYYY/MM/DD`` or ``YYYY-MM-DD`` (dots or backslashes as separators
        and stray spaces are tolerated),
      * ``MM/DD/YYYY`` (same separator/space tolerance),
      * ``January 28, 1960`` and ``Jan 28, 1960``.

    Args:
        s: Date text; any value is converted with ``str``.

    Returns:
        ``None`` for a falsy input, the ISO date string when the input is
        recognised, otherwise the trimmed input unchanged. Callers must not
        treat a non-``None`` result as proof that parsing succeeded.
    """
    if not s:
        return None
    raw = str(s).strip()
    # Canonical separator form: no spaces, backslashes and dots become '/'.
    t = raw.replace(' ', '').replace('\\', '/').replace('.', '/')
    # 1960/01/28 or 1960-01-28
    if re.match(r'^\d{4}[-/]\d{2}[-/]\d{2}$', t):
        return t.replace('/', '-')
    # 01/28/1960 -- matched against the canonical form so that variants such
    # as "01.28.1960" or "01 / 28 / 1960" are recognised as well.
    if re.match(r'^\d{2}/\d{2}/\d{4}$', t):
        m, d, y = t.split('/')
        return f"{y}-{int(m):02d}-{int(d):02d}"
    # Month-name variants need the original spacing, so they use `raw`.
    for fmt in ("%B %d, %Y", "%b %d, %Y"):
        try:
            return datetime.strptime(raw, fmt).strftime("%Y-%m-%d")
        except ValueError:
            continue
    return raw

def cap_words(s):
    """Capitalise every whitespace-separated word of ``s``.

    Returns ``None`` for a falsy input; otherwise the words joined by single
    spaces, each with its first letter upper-cased and the rest lower-cased.
    """
    if not s:
        return None
    return ' '.join(word.capitalize() for word in s.split())

def take_within(lines, i, k=5):
    """Return the non-blank entries among the ``k`` items following index ``i``.

    Each candidate is converted with ``str`` and stripped; entries that are
    empty after stripping are dropped. Positions past the end of ``lines``
    are ignored.
    """
    following = (
        str(lines[pos]).strip()
        for pos in range(i + 1, i + k + 1)
        if pos < len(lines)
    )
    return [text for text in following if text]

def is_crn_text(t):
    """Tell whether ``t`` is, in its entirety, a UMID CRN.

    Accepted shapes after trimming: the hyphenated form ``0028-1215160-9``
    or a compact run of 12-13 digits. Always returns a ``bool``.
    """
    candidate = str(t).strip()
    if re.match(r'^\d{4}-\d{7}-\d$', candidate):
        return True
    return re.match(r'^\d{12,13}$', candidate) is not None

def normalize_name(last, given):
    """Build a display name of the form ``First [Second] Last``.

    At most the first two space-separated tokens of ``given`` are kept and
    the trimmed ``last`` is appended. The result is passed through
    ``cap_words``. Returns ``None`` when both inputs are empty or missing.
    """
    surname = (last or '').strip()
    # Splitting on a literal space can yield empty pieces; drop them.
    given_tokens = list(filter(None, (given or '').strip().split(' ')))[:2]
    combined = ' '.join([*given_tokens, surname]).strip()
    if not combined:
        return None
    return cap_words(combined)

def glue_address(lines, start_idx):
    """Join the lines that follow an ADDRESS label into a single string.

    Looks at up to five entries after ``start_idx``. Blank entries are
    skipped; collection stops at the end of ``lines`` or at the first entry
    containing another field label. The pieces are joined with ``", "``,
    runs of whitespace are collapsed and spaces before commas removed.

    Returns:
        The assembled address, or ``None`` if nothing was collected.
    """
    stop_labels = ('crn', 'surname', 'given', 'middle', 'sex', 'date', 'birth')
    collected = []
    last_exclusive = min(start_idx + 6, len(lines))
    for pos in range(start_idx + 1, last_exclusive):
        text = str(lines[pos]).strip()
        if not text:
            continue
        lowered = text.lower()
        if any(label in lowered for label in stop_labels):
            break
        collected.append(text)

    joined = re.sub(r'\s{2,}', ' ', ', '.join(collected)).replace(' ,', ',')
    return joined if joined else None

def extract_umid_info(lines):
    """Pull the UMID card fields out of a sequence of OCR text lines.

    The lines are scanned once, top to bottom. For each line the function
    looks for a CRN, the SURNAME / GIVEN NAME labels, the SEX label, the
    DATE OF BIRTH label (or a bare ``YYYY/MM/DD`` line) and the ADDRESS
    label, and reads the value from the same line or from the lines that
    follow.

    Args:
        lines: Iterable of OCR texts; each entry is converted with ``str``
            and stripped, falsy entries become empty strings.

    Returns:
        dict with the keys ``id_type`` (always ``"umid"``), ``crn``,
        ``id_number`` (same value as ``crn``), ``full_name``,
        ``birth_date``, ``sex`` and ``address``. Fields that were not found
        are ``None``.
    """
    dprint("Lines to extract", lines)

    crn = None
    birth_date = None
    sex = None
    address = None

    last_name_txt = None
    given_name_txt = None

    # Field labels that must never be taken for a surname value.
    name_stop_words = ['given', 'middle', 'sex', 'date', 'birth', 'address', 'crn']

    L = [str(x or '').strip() for x in lines]
    for i, line in enumerate(L):
        low = line.lower()
        dprint("Line", {"i": i, "text": line})

        # CRN: first on this line, otherwise peek at the next one.
        if crn is None:
            crn = extract_crn_from_text(line)
            if crn:
                dprint("Found CRN", crn)
            elif i + 1 < len(L):
                crn = extract_crn_from_text(L[i + 1])
                if crn:
                    dprint("Found CRN (next)", crn)

        # Surname: first of the next three non-blank lines that is not
        # itself a field label.
        if 'surname' in low:
            for t in take_within(L, i, 3):
                tl = t.lower()
                if not any(k in tl for k in name_stop_words):
                    last_name_txt = t
                    dprint("Captured last_name", last_name_txt)
                    break

        # Given name: usually on the next line, rarely on the label line.
        if 'given' in low:
            next_val = L[i + 1] if i + 1 < len(L) else ''
            # Strip the label case-insensitively; a case-sensitive strip
            # leaves an upper-case "GIVEN NAME" label in place, so the label
            # itself would be stored as the name.
            inline_val = re.sub(
                r'given(\s*name)?', '', line, flags=re.IGNORECASE
            ).strip(' :')
            val = next_val or inline_val
            # Keep an earlier capture when this line yields nothing.
            given_name_txt = val or given_name_txt
            dprint("Captured given_name", given_name_txt)

        # Sex (handles same-line case like "SEXM DATEOFBIRTH 196O/O1/28")
        if 'sex' in low:
            # Value directly after the label, with or without a separator.
            # A \b-anchored search alone cannot see the "M" in "SEXM".
            glued = re.search(
                r'sex\s*[:.\-]?\s*(female|male|f|m)\b', line, flags=re.I
            )
            if glued:
                sex = glued.group(1)[0].upper()
            else:
                # Stand-alone token elsewhere on the line.
                if re.search(r'\bF(EMALE)?\b', line, flags=re.I):
                    sex = 'F'
                if re.search(r'\bM(ALE)?\b', line, flags=re.I):
                    sex = 'M'
            # lookahead fallback
            if sex is None:
                for t in take_within(L, i, 2):
                    tt = t.strip().upper()
                    if tt in ('M', 'F', 'MALE', 'FEMALE'):
                        sex = 'M' if tt.startswith('M') else 'F'
                        break

        # Date of Birth inline or in the following lines
        if 'date' in low and 'birth' in low:
            # inline date; OCR look-alike letters are allowed in the digits
            m = re.search(r'(\d[0-9OIl]{3}[/\-][0-1OIl]\d[/\-]\d{2,4})', line)
            if m:
                birth_date = format_date(normalize_digits(m.group(1)))
                dprint("Found birth_date (inline)", birth_date)
            if birth_date is None:
                for t in take_within(L, i, 3):
                    # Try the text as read first (digit repair would mangle
                    # month names such as "October"), then the repaired form.
                    for cand in (t, normalize_digits(t)):
                        parsed = format_date(cand)
                        # format_date echoes unparseable input back, so only
                        # an ISO-shaped result counts as a real date. Without
                        # this check any line containing a digit (address,
                        # CRN) would be accepted as the birth date.
                        if parsed and re.fullmatch(r'\d{4}-\d{2}-\d{2}', parsed):
                            birth_date = parsed
                            break
                    if birth_date is not None:
                        dprint("Found birth_date (lookahead)", birth_date)
                        break

        # Also catch standalone yyyy/mm/dd line
        if birth_date is None and re.match(r'^\d{4}[-/]\d{2}[-/]\d{2}$', normalize_digits(line)):
            birth_date = format_date(normalize_digits(line))
            dprint("Found standalone birth_date", birth_date)

        # Address block (first ADDRESS label wins)
        if 'address' in low and address is None:
            address = glue_address(L, i)
            dprint("Found address", address)

    # Compose final name
    full_name = normalize_name(last_name_txt, given_name_txt)
    dprint("Composed full_name", {"last": last_name_txt, "given": given_name_txt, "full": full_name})

    result = {
        "id_type": "umid",
        "crn": crn,
        "id_number": crn,   # frontend expects this
        "full_name": full_name,
        "birth_date": birth_date,
        "sex": sex,
        "address": address
    }
    dprint("Final result", result)
    return result

def extract_ocr_lines(image_path):
    """Run PaddleOCR on an image and return the parsed UMID fields.

    Creates the ``output`` directory if needed, builds a PaddleOCR instance
    and runs it on ``image_path`` with stdout redirected to stderr, collects
    the recognised text of every result entry, and hands the texts to
    ``extract_umid_info``.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The dict produced by ``extract_umid_info``. When no text was
        recognised, a dict with the same keys is returned with every field
        except ``id_type`` set to ``None``.

    Raises:
        Exception: Errors from constructing or running PaddleOCR propagate.
            Errors while walking the result structure are logged and treated
            as "no text found".
    """
    os.makedirs("output", exist_ok=True)
    dprint("Initializing PaddleOCR")

    with redirect_stdout(sys.stderr), redirect_stderr(sys.stderr):
        # NOTE(review): these keyword arguments and the `cls=` argument below
        # depend on the installed PaddleOCR version -- confirm they match it.
        ocr = PaddleOCR(
            use_doc_orientation_classify=False,
            use_doc_unwarping=False,
            use_textline_orientation=False,
            lang='en',
            show_log=False
        )
        dprint("OCR initialized")
        dprint("Running OCR predict", image_path)
        results = ocr.ocr(image_path, cls=False)
    # Guard against a missing result: len(None) would raise here.
    results = results or []
    dprint("OCR predict done, results_count", len(results))

    # Process OCR results directly
    all_text = []
    try:
        # Results are either one list of entries per image or the entries
        # themselves; each entry is expected to look like
        # (box, (text, score)).
        lines = results[0] if results and isinstance(results[0], list) else results
        for item in lines:
            if isinstance(item, (list, tuple)) and len(item) >= 2:
                meta = item[1]
                if isinstance(meta, (list, tuple)) and len(meta) >= 1:
                    all_text.append(str(meta[0]))
    except Exception as e:
        dprint("Error processing OCR results", str(e))

    dprint("All direct texts", all_text)
    if all_text:
        return extract_umid_info(all_text)
    # Same keys as extract_umid_info so consumers always see one schema.
    return {
        "id_type": "umid",
        "crn": None,
        "id_number": None,
        "full_name": None,
        "birth_date": None,
        "sex": None,
        "address": None
    }

def normalize_digits(s):
    """Replace letters that OCR commonly confuses with digits.

    Mapping: ``O``/``o`` -> ``0``, ``I``/``l`` -> ``1``, ``S`` -> ``5``,
    ``B`` -> ``8``. The input is converted with ``str`` first and every
    occurrence is replaced, including those inside ordinary words.
    """
    lookalikes = str.maketrans({
        'O': '0', 'o': '0',
        'I': '1', 'l': '1',
        'S': '5',
        'B': '8',
    })
    return str(s).translate(lookalikes)

def extract_crn_from_text(t):
    """Return the CRN contained in ``t``, or ``None`` if there is none.

    Two candidates are tried in order: the run of digit-like characters that
    follows a ``CRN`` label (e.g. "CRN-0028-1215160-9"), then the whole
    trimmed text. Each candidate is passed through ``normalize_digits`` and
    accepted if it is the hyphenated ``dddd-ddddddd-d`` form or 12-13 plain
    digits.
    """
    def _looks_like_crn(value):
        # Hyphens are kept; compact digit runs are accepted as well.
        return bool(
            re.match(r'^\d{4}-\d{7}-\d$', value)
            or re.match(r'^\d{12,13}$', value)
        )

    candidates = []
    labelled = re.search(r'crn[^0-9]*([0-9OIl\-]{10,})', t, flags=re.IGNORECASE)
    if labelled:
        candidates.append(labelled.group(1))
    candidates.append(t.strip())

    for raw_value in candidates:
        fixed = normalize_digits(raw_value)
        if _looks_like_crn(fixed):
            return fixed
    return None

# Script entry: expects the image URL as the first command-line argument and
# writes exactly one JSON document to the real stdout. Exit status is 1 on a
# missing argument or on any processing error.
if len(sys.argv) < 2:
    sys.stdout = original_stdout
    print(json.dumps({"error": "No image URL provided"}))
    sys.exit(1)

image_url = sys.argv[1]
dprint("Processing image URL", image_url)
try:
    image_path = download_image(image_url)
    dprint("Image downloaded to", image_path)
    ocr_results = extract_ocr_lines(image_path)
    dprint("OCR results ready")

    # Restore stdout and print only the JSON response
    sys.stdout = original_stdout
    sys.stdout.write(json.dumps({"success": True, "ocr_results": ocr_results}))
    sys.stdout.flush()

except Exception as e:
    dprint("Exception", str(e))
    # Restore stdout for error JSON
    sys.stdout = original_stdout
    sys.stdout.write(json.dumps({"error": str(e)}))
    sys.stdout.flush()
    sys.exit(1)
finally:
    # Best-effort clean up; runs on the sys.exit(1) path above as well.
    # Only ordinary errors are suppressed (and logged to stderr), so
    # KeyboardInterrupt / SystemExit are no longer swallowed here.
    try:
        clean_cache()
    except Exception as cleanup_err:
        dprint("Cleanup failed", str(cleanup_err))