# Spaces: takomattyy / handyhome-ocr-api (status: Sleeping)
# File size: 19,247 Bytes

import sys, json, os, glob, requests
import re
import time
from contextlib import redirect_stdout, redirect_stderr
from datetime import datetime

# Point sys.stdout at stderr for the whole run so that any print() issued by
# this script or by imported libraries goes to stderr.  The real stdout is kept
# in `original_stdout` and restored only when the final JSON payload is written.
original_stdout = sys.stdout
sys.stdout = sys.stderr

# Environment tweaks applied before PaddleOCR is imported.
# NOTE(review): presumably PADDLEOCR_LOG_LEVEL lowers PaddleOCR's log verbosity
# and QT_QPA_PLATFORM / DISPLAY let GUI-dependent libraries load on a headless
# host -- confirm against the installed PaddleOCR/OpenCV versions.
os.environ['PADDLEOCR_LOG_LEVEL'] = 'ERROR'
os.environ['QT_QPA_PLATFORM'] = 'offscreen'
os.environ['DISPLAY'] = ':99'

# Imported here (not at the top of the file) so the environment variables set
# above are already in place when the package is loaded.
from paddleocr import PaddleOCR

def download_image(url, output_path='temp_postal_image.jpg'):
    """Download the image at ``url`` and write it to ``output_path``.

    A ``t=<unix timestamp>`` query parameter is appended to the URL and
    no-cache request headers are sent, so each call asks for a fresh copy.
    Any file already present at ``output_path`` is deleted before the request.

    Args:
        url: HTTP(S) address of the image.
        output_path: Destination file path for the downloaded bytes.

    Returns:
        ``output_path``, once the file has been written.

    Raises:
        Whatever ``requests.get`` / ``raise_for_status`` raise on network
        errors, timeouts (30 s) or non-success HTTP status codes.
    """
    # Drop a stale file first so a failed download cannot leave an old image.
    if os.path.exists(output_path):
        os.remove(output_path)

    # Cache-busting: extend an existing query string or start a new one.
    separator = '&' if '?' in url else '?'
    fresh_url = f'{url}{separator}t={int(time.time())}'

    no_cache_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Cache-Control': 'no-cache, no-store, must-revalidate',
        'Pragma': 'no-cache',
        'Expires': '0'
    }

    reply = requests.get(fresh_url, headers=no_cache_headers, timeout=30)
    reply.raise_for_status()
    payload = reply.content

    with open(output_path, 'wb') as image_file:
        image_file.write(payload)

    return output_path

def _parse_date_text(text):
    """Return ``text`` as ``YYYY-MM-DD``, or ``None`` if no known format fits."""
    # "14 Aug 88" / "14 Aug88" / "14Aug1988": day, 3-letter month, year.
    match = re.match(r'(\d{1,2})\s*([A-Za-z]{3})\s*(\d{2,4})', text)
    if match:
        day, month_str, year = match.groups()
        try:
            # Two-digit years: 51-99 -> 19xx, 00-50 -> 20xx.
            if len(year) == 2:
                year = f"19{year}" if int(year) > 50 else f"20{year}"
            month = datetime.strptime(month_str, '%b').month
            return f"{year}-{month:02d}-{int(day):02d}"
        except ValueError as e:
            # Unknown month abbreviation: fall through to the strptime formats.
            print(f"DEBUG: Date parsing error: {e}", file=sys.stderr)

    for fmt in ["%d %b %Y", "%d %B %Y", "%Y-%m-%d", "%m/%d/%Y", "%d/%m/%Y", "%d%b%Y", "%d%B%Y"]:
        try:
            return datetime.strptime(text, fmt).strftime("%Y-%m-%d")
        except ValueError:
            continue

    return None


def format_date(date_str):
    """Format date from various formats to YYYY-MM-DD.

    Common OCR confusions are repaired before parsing: ``Ol``/``O1`` -> ``01``,
    ``O0``/``OO`` -> ``00`` and lowercase ``l`` -> ``1``.  The ``l`` repair is
    first applied only where the ``l`` does not follow a letter, so month names
    such as "Jul" or "April" stay intact.  If that does not yield a parseable
    date, the blanket ``l`` -> ``1`` replacement is tried as a fallback (this
    covers glued text such as "Augl988").

    Args:
        date_str: Raw date text, e.g. "14 Aug 88", "OlDec17", "1988-08-14".

    Returns:
        ``None`` for empty input, the date as ``YYYY-MM-DD`` when it can be
        parsed, otherwise the cleaned-up input text unchanged.
    """
    if not date_str:
        return None

    cleaned = date_str.strip()

    # Letter "O" misread for zero, in the digit pairs seen on the cards.
    for wrong, right in (('Ol', '01'), ('O1', '01'), ('O0', '00'), ('OO', '00')):
        cleaned = cleaned.replace(wrong, right)

    # Conservative: an "l" that follows a letter is part of a word (Jul, April).
    conservative = re.sub(r'(?<![A-Za-z])l', '1', cleaned)
    # Aggressive: every "l" becomes "1" (the historical behaviour).
    aggressive = cleaned.replace('l', '1')

    candidates = [conservative]
    if aggressive != conservative:
        candidates.append(aggressive)

    for candidate in candidates:
        parsed = _parse_date_text(candidate)
        if parsed is not None:
            return parsed

    return conservative

def format_name(name):
    """Normalise a name: single spaces between words, each word capitalised.

    Returns ``None`` when ``name`` is empty or ``None``; a whitespace-only
    string yields ``''``.
    """
    if not name:
        return None

    # split() with no argument collapses any run of whitespace, so joining the
    # pieces with one space both normalises spacing and trims the ends.
    words = name.split()
    capitalised = (word.capitalize() for word in words)
    return ' '.join(capitalised).strip()

def format_address(address_lines):
    """Join OCR address lines into one string and repair missing spaces.

    Repairs applied, in order:
      * digit directly followed by a capital:    "585Gen."     -> "585 Gen."
      * lowercase directly followed by a capital: "PasayCity"   -> "Pasay City"
      * abbreviation period glued to next word:   "Brgy.Rivera" -> "Brgy. Rivera"
      * runs of whitespace collapsed to a single space

    Args:
        address_lines: Iterable of address text lines; blank lines are ignored.

    Returns:
        ``None`` when ``address_lines`` is empty, otherwise the cleaned address
        (``''`` if every line was blank).
    """
    if not address_lines:
        return None

    # Join address lines and clean up
    address = ' '.join([line.strip() for line in address_lines if line.strip()])

    # Fix missing spaces: "585Gen." -> "585 Gen."
    address = re.sub(r'(\d+)([A-Z])', r'\1 \2', address)

    # Fix missing spaces between glued words: "PasayCity" -> "Pasay City"
    address = re.sub(r'([a-z])([A-Z])', r'\1 \2', address)

    # Fix missing spaces after abbreviations: "Brgy.Rivera" -> "Brgy. Rivera".
    # Only a lowercase letter before the period qualifies, so initialisms such
    # as "P.O." are left untouched.
    address = re.sub(r'(?<=[a-z])\.(?=[A-Z])', '. ', address)

    # Remove extra spaces
    address = ' '.join(address.split())

    return address.strip()

# Short label tokens that also occur inside ordinary names (DAVID, EUGENE,
# RICARDO).  They only count as labels when not embedded in a longer
# alphabetic word; "GEN" additionally accepts the spelled-out "GENERAL".
_STANDALONE_LABEL_PATTERNS = {
    'ID': re.compile(r'(?<![A-Z])ID(?![A-Z])'),
    'GEN': re.compile(r'(?<![A-Z])GEN(?:ERAL)?(?![A-Z])'),
    'CARD': re.compile(r'(?<![A-Z])CARD(?![A-Z])'),
}


def _has_label(text_upper, labels):
    """Return True when any of ``labels`` occurs in the upper-cased text.

    Labels listed in ``_STANDALONE_LABEL_PATTERNS`` must stand alone (not be
    surrounded by other letters); every other label is a plain substring test,
    which tolerates OCR output with missing spaces such as "DATEOFBIRTH".
    """
    for label in labels:
        pattern = _STANDALONE_LABEL_PATTERNS.get(label)
        if pattern is not None:
            if pattern.search(text_upper):
                return True
        elif label in text_upper:
            return True
    return False


def _is_valid_until_label(text_upper):
    """Return True when the upper-cased text is a "Valid Until" label line."""
    return any(variant in text_upper
               for variant in ('VALID UNTIL', 'VALIDUNTIL', 'VALD URT', 'VALDURT'))


def extract_postal_details(lines):
    """Extract the Postal ID fields from OCR text lines.

    Each non-blank line is scanned once, in order; for every field the first
    match wins.  The heuristics rely on label keywords (including a few known
    OCR misreadings) and on some values hard-coded from a sample card
    ('TUAZON', '585', 'PASAY'), so they are tuned rather than general.

    Args:
        lines: Iterable of recognised text lines (converted with ``str``).

    Returns:
        dict with keys ``id_type``, ``prn``, ``full_name``, ``address``,
        ``birth_date``, ``nationality``, ``issuing_post_office``,
        ``valid_until`` and ``success``.  Missing fields are ``None``;
        ``success`` is True when a PRN or a full name was found.  Name and
        dates are passed through ``format_name`` / ``format_date``.
    """
    details = {
        'id_type': 'Postal ID',
        'prn': None,
        'full_name': None,
        'address': None,
        'birth_date': None,
        'nationality': None,
        'issuing_post_office': None,
        'valid_until': None,
        'success': False
    }

    # Clean lines - convert to strings and strip
    cleaned_lines = [str(line).strip() for line in lines if str(line).strip()]

    for i, line in enumerate(cleaned_lines):
        line_upper = line.upper().strip()
        line_stripped = line.strip()

        # Extract PRN (Postal Registration Number)
        # Format: "PRN 100141234567 P POSTAL" or "PRN100141234567P" or
        # "PAN100141234567P" (OCR might misread PRN as PAN); PRN is preferred.
        if not details['prn']:
            prn_match = (re.search(r'PRN\s*(\d{10,15})', line_upper)
                         or re.search(r'PAN\s*(\d{10,15})', line_upper))
            if prn_match:
                details['prn'] = prn_match.group(1)

        # Extract Full Name - combine separate name parts
        # Look for label "First Name Middle Name Surname, Suffix" or name parts
        if not details['full_name']:
            # Check if this line is the label
            if ("FIRST NAME" in line_upper or "FINT NAME" in line_upper) and ("SURNAME" in line_upper or "SUMAME" in line_upper):
                # Collect name parts from next few lines
                name_parts = []
                for j in range(1, min(5, len(cleaned_lines) - i)):
                    next_line = cleaned_lines[i+j].strip()
                    next_upper = next_line.upper()
                    # Stop if we hit address or other labels
                    if _has_label(next_upper, ['ADDRESS', 'DATE', 'BIRTH', 'NATIONALITY', 'ISSUING', 'VALID', 'GEN', 'TUAZON', 'BLVD', 'BRGY', '585', 'PASAY']):
                        break
                    # Add if it looks like a name part (all caps, letters and spaces only, not too short)
                    if next_line and re.match(r'^[A-Z\s,]+$', next_line) and len(next_line) > 1:
                        # Skip if it's clearly not a name (like "ID", "C", etc.)
                        if next_line not in ['ID', 'C', 'P', 'POSTAL']:
                            name_parts.append(next_line)
                if name_parts:
                    details['full_name'] = ' '.join(name_parts)
            # Also check if line is a name part (all caps, not a label)
            elif re.match(r'^[A-Z\s,]+$', line_stripped) and len(line_stripped) > 2:
                # Make sure it's not a label or common words
                if not _has_label(line_upper, ['FIRST NAME', 'MIDDLE NAME', 'SURNAME', 'ADDRESS', 'DATE', 'BIRTH', 'NATIONALITY', 'ISSUING', 'VALID', 'POSTAL', 'IDENTITY', 'CARD', 'PHCPOST', 'PHILIPPINE', 'PREMIUM']):
                    # Check if previous line is the name label
                    if i > 0:
                        prev_line = cleaned_lines[i-1].strip().upper()
                        if "FIRST NAME" in prev_line or "FINT NAME" in prev_line or "SUMAME" in prev_line or "SURNAME" in prev_line:
                            # Collect consecutive name parts
                            name_parts = [line_stripped]
                            for j in range(1, min(4, len(cleaned_lines) - i)):
                                next_line = cleaned_lines[i+j].strip()
                                if (next_line and re.match(r'^[A-Z\s,]+$', next_line) and
                                    len(next_line) > 2 and
                                    not _has_label(next_line.upper(), ['ADDRESS', 'DATE', 'BIRTH', 'GEN', 'TUAZON', 'BLVD', 'BRGY', '585', 'PASAY', 'ID', 'POSTAL', 'PREMIUM'])):
                                    name_parts.append(next_line)
                                else:
                                    break
                            if len(name_parts) >= 2:
                                details['full_name'] = ' '.join(name_parts)
                            elif len(name_parts) == 1 and len(name_parts[0].split()) >= 2:
                                details['full_name'] = name_parts[0]

        # Extract Address - look for address parts (street numbers, Gen., Blvd., Brgy., City)
        if not details['address']:
            # Look for address indicators
            if _has_label(line_upper, ['GEN', 'TUAZON', 'BLVD', 'BRGY', 'PASAY', 'CITY']) or (re.match(r'^\d+', line_stripped) and len(line_stripped) > 2):
                address_lines = []
                # Check backwards a bit to see if we missed address start
                start_idx = max(0, i - 1)

                # Collect address lines forward
                for j in range(0, min(7, len(cleaned_lines) - start_idx)):
                    idx = start_idx + j
                    if idx >= len(cleaned_lines):
                        break
                    addr_line = cleaned_lines[idx].strip()
                    addr_upper = addr_line.upper()

                    # Stop if we hit date, nationality, or other labels
                    if any(label in addr_upper for label in ['DATE', 'BIRTH', 'NATIONALITY', 'FILIPINO', 'ISSUING', 'VALID', 'PAN', 'NOCON']):
                        break

                    # Skip very short non-numeric lines that are likely OCR noise
                    if len(addr_line) <= 2 and not re.match(r'^\d+$', addr_line):
                        continue

                    # Add if it looks like address content
                    if addr_line and len(addr_line) > 1:
                        # Check if it's a number, street name, barangay, city, etc.
                        if (re.match(r'^\d+', addr_line) or
                            _has_label(addr_upper, ['GEN', 'TUAZON', 'BLVD', 'BRGY', 'PASAY', 'CITY', 'STREET', 'AVE', 'BOULEVARD']) or
                            len(address_lines) > 0):  # Continue if we've started collecting
                            # Skip obvious OCR errors like "o00"
                            if addr_line.lower() not in ['o00', 'o0', '00']:
                                address_lines.append(addr_line)

                if address_lines:
                    details['address'] = format_address(address_lines)

        # Extract Date of Birth - handle OCR errors
        if not details['birth_date']:
            # Look for date patterns: "14 Aug88" or "14 Aug 88"
            date_match = re.search(r'(\d{1,2})\s*([A-Za-z]{3})\s*(\d{2,4})', line_stripped)
            # The line right after a "Valid Until" label holds the expiry date,
            # so it must never be taken as the birth date.
            follows_expiry_label = i > 0 and _is_valid_until_label(cleaned_lines[i-1].upper())
            if date_match and not follows_expiry_label:
                # Check if it's not the valid until date
                if "VALID" not in line_upper and "UNTIL" not in line_upper:
                    # Fix spacing
                    day, month, year = date_match.groups()
                    details['birth_date'] = f"{day} {month} {year}"

        # Extract Nationality
        if not details['nationality']:
            if "NATIONALITY" in line_upper or line_upper == "FILIPINO":
                if line_upper == "FILIPINO":
                    details['nationality'] = "Filipino"
                elif i + 1 < len(cleaned_lines):
                    next_line = cleaned_lines[i+1].strip()
                    if next_line and len(next_line) < 20:
                        details['nationality'] = next_line

        # Extract Issuing Post Office.  The pattern accepts "ISSUING POST",
        # "ISSUINGPOST" and the OCR variant with the second "I" dropped
        # ("IssungPostOmce" -> "ISSUNGPOST...").
        if not details['issuing_post_office']:
            if re.search(r'ISSUI?NG\s*POST', line_upper):
                if i + 1 < len(cleaned_lines):
                    next_line = cleaned_lines[i+1].strip()
                    if next_line and len(next_line) < 20:
                        # Fix OCR errors: MNL.QE -> MNL-QE
                        next_line = next_line.replace('.', '-')
                        details['issuing_post_office'] = next_line

        # Extract Valid Until - handle OCR errors like "Vald Urt" and "OlDec17"
        if not details['valid_until']:
            if _is_valid_until_label(line_upper):
                if i + 1 < len(cleaned_lines):
                    next_line = cleaned_lines[i+1].strip()
                    # Fix OCR errors: OlDec17 -> 01 Dec 17
                    next_line = next_line.replace('Ol', '01').replace('O1', '01')
                    next_line = next_line.replace('O0', '00').replace('OO', '00')
                    # Try to extract date pattern
                    date_match = re.search(r'(\d{1,2})\s*([A-Za-z]{3})\s*(\d{2,4})', next_line)
                    if date_match:
                        day, month, year = date_match.groups()
                        details['valid_until'] = f"{day} {month} {year}"
                    elif next_line:
                        details['valid_until'] = next_line

    # Format extracted fields
    if details['full_name']:
        details['full_name'] = format_name(details['full_name'])
    if details['birth_date']:
        details['birth_date'] = format_date(details['birth_date'])
    if details['valid_until']:
        details['valid_until'] = format_date(details['valid_until'])

    if details['prn'] or details['full_name']:
        details['success'] = True

    return details

def extract_ocr_lines(image_path):
    """Run PaddleOCR on ``image_path`` and return the parsed Postal ID fields.

    The recognised text lines are collected from either result layout this
    code knows about (an ``OCRResult``-style mapping carrying ``rec_texts``,
    or the older nested-list layout) and handed to ``extract_postal_details``.

    Returns:
        ``{'success': False, 'error': 'File not found'}`` when the file is
        missing; otherwise the details dict, with every field ``None`` and
        ``success`` False when no text could be extracted.
    """
    if not os.path.exists(image_path):
        return {'success': False, 'error': 'File not found'}

    print(f"DEBUG: Image file size: {os.path.getsize(image_path)} bytes", file=sys.stderr)

    # Keep everything PaddleOCR prints away from the real stdout.
    with redirect_stdout(sys.stderr), redirect_stderr(sys.stderr):
        engine = PaddleOCR(
            use_doc_orientation_classify=False,
            use_doc_unwarping=False,
            use_textline_orientation=False,
            lang='en'
        )
        try:
            results = engine.ocr(image_path)
        except Exception as e:
            # Fall back to predict() when ocr() is unavailable or fails.
            print(f"DEBUG: ocr() failed: {e}, trying predict()", file=sys.stderr)
            results = engine.predict(image_path) if hasattr(engine, 'predict') else None

    print(f"DEBUG: Raw OCR results type: {type(results)}", file=sys.stderr)

    recognised = []
    try:
        if results and isinstance(results, list) and len(results) > 0:
            head = results[0]
            head_type_name = type(head).__name__
            looks_like_ocr_result = (
                'OCRResult' in head_type_name
                or 'ocr_result' in str(type(head)).lower()
            )

            if looks_like_ocr_result:
                print(f"DEBUG: Detected OCRResult object format (type: {head_type_name})", file=sys.stderr)
                # The result object is read through its mapping interface.
                try:
                    if hasattr(head, 'keys'):
                        rec_texts = dict(head).get('rec_texts')
                        if isinstance(rec_texts, list):
                            recognised = [str(text) for text in rec_texts if text]
                            print(f"DEBUG: Extracted {len(recognised)} text lines from rec_texts", file=sys.stderr)
                except Exception as e:
                    print(f"DEBUG: Error accessing OCRResult: {e}", file=sys.stderr)
            else:
                # Older layout: entries shaped like [box, (text, score)],
                # optionally wrapped in one more list.
                entries = head if isinstance(head, list) else results
                for entry in entries:
                    if not (isinstance(entry, (list, tuple)) and len(entry) >= 2):
                        continue
                    text_and_score = entry[1]
                    if isinstance(text_and_score, (list, tuple)) and len(text_and_score) >= 1:
                        recognised.append(str(text_and_score[0]))
    except Exception as e:
        print(f"DEBUG: Error processing OCR results: {str(e)}", file=sys.stderr)
        import traceback
        print(f"DEBUG: Traceback: {traceback.format_exc()}", file=sys.stderr)

    print(f"DEBUG: Extracted text lines: {recognised}", file=sys.stderr)

    if recognised:
        return extract_postal_details(recognised)

    # Nothing recognised: same shape as a normal result, all fields empty.
    return {
        'id_type': 'Postal ID',
        'prn': None,
        'full_name': None,
        'address': None,
        'birth_date': None,
        'nationality': None,
        'issuing_post_office': None,
        'valid_until': None,
        'success': False
    }

# Main Execution
#
# Usage: <script> IMAGE_URL
# Exactly one JSON document is written to the real stdout:
#   success/partial: {"success": <bool>, "data": {...extracted fields...}}
#   failure:         {"success": false, "error": "<message>"}  (exit status 1)
# All diagnostics go to stderr.
TEMP_IMAGE_PATH = 'temp_postal_image.jpg'

if len(sys.argv) < 2:
    sys.stdout = original_stdout
    print(json.dumps({"success": False, "error": "No image URL provided"}))
    sys.exit(1)

image_url = sys.argv[1]
print(f"DEBUG: Processing Postal ID image URL: {image_url}", file=sys.stderr)

try:
    image_path = download_image(image_url, TEMP_IMAGE_PATH)
    print(f"DEBUG: Image downloaded to: {image_path}", file=sys.stderr)

    ocr_results = extract_ocr_lines(image_path)
    print(f"DEBUG: OCR results: {ocr_results}", file=sys.stderr)

    response = {
        "success": ocr_results['success'],
        "data": ocr_results
    }

    # Restore the real stdout only now, so the JSON is the sole thing on it.
    sys.stdout = original_stdout
    sys.stdout.write(json.dumps(response))
    sys.stdout.flush()

except Exception as e:
    # Top-level boundary: report any failure as JSON and signal it via exit code.
    sys.stdout = original_stdout
    sys.stdout.write(json.dumps({"success": False, "error": str(e)}))
    sys.stdout.flush()
    sys.exit(1)
finally:
    # Best-effort removal of the temporary image on every path.  Only
    # filesystem errors are ignored; a cleanup failure must not turn an
    # already-reported result into a different outcome.
    try:
        if os.path.exists(TEMP_IMAGE_PATH):
            os.remove(TEMP_IMAGE_PATH)
    except OSError:
        pass