# Spaces: takomattyy / handyhome-ocr-api
# Sleeping
# File size: 23,558 Bytes

import sys, json, os, glob, requests
import re
import time
import shutil
from contextlib import redirect_stdout, redirect_stderr

# Keep a handle on the real stdout, then point sys.stdout at stderr so that
# every print() issued from here on (including output from imported
# libraries that write via sys.stdout) lands on stderr. The main section
# restores original_stdout just before writing the single JSON response.
original_stdout = sys.stdout
sys.stdout = sys.stderr

# Environment settings that must be in place before paddleocr is imported
# below.
# NOTE(review): PADDLEOCR_LOG_LEVEL is presumably read by PaddleOCR to
# reduce its logging - confirm against the installed version.
os.environ['PADDLEOCR_LOG_LEVEL'] = 'ERROR'
# NOTE(review): these two look like headless-GUI settings (Qt offscreen
# platform, X display :99) for running without a desktop - confirm that a
# matching display/server setup exists in the deployment.
os.environ['QT_QPA_PLATFORM'] = 'offscreen'
os.environ['DISPLAY'] = ':99'

# Import PaddleOCR after setting environment variables
from paddleocr import PaddleOCR

def download_image(url, output_path='temp_image.jpg'):
    """Fetch the image at ``url`` and store its bytes at ``output_path``.

    Any file already present at ``output_path`` is deleted before the
    request is made. A ``t=<unix seconds>`` query parameter is appended to
    the URL and no-cache headers are sent with the request.

    Args:
        url: Address of the image to fetch.
        output_path: File the response body is written to.

    Returns:
        ``output_path``.

    Raises:
        requests.HTTPError: Raised by ``raise_for_status()`` when the
            server answers with an error status.
    """
    # Drop whatever a previous run may have left behind.
    if os.path.exists(output_path):
        os.remove(output_path)

    # Cache-busting query parameter: join with '&' when the URL already
    # carries a query string, otherwise start one with '?'.
    joiner = '&' if '?' in url else '?'
    busted_url = f'{url}{joiner}t={int(time.time())}'

    no_cache_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Cache-Control': 'no-cache, no-store, must-revalidate',
        'Pragma': 'no-cache',
        'Expires': '0'
    }

    reply = requests.get(busted_url, headers=no_cache_headers, timeout=30)
    reply.raise_for_status()
    payload = reply.content

    with open(output_path, 'wb') as image_file:
        image_file.write(payload)

    return output_path

# NBI ID shapes recognised on a single OCR line. The patterns are
# case-sensitive: only upper-case letters and digits are accepted.
# Example IDs: B450JRLR0B-RC248667, HGUR87H38D-U47204A873.
_HYPHEN_ID_RE = re.compile(r'\b([A-Z0-9]{8,12}-[A-Z0-9]{8,12})\b')
_SPACED_ID_RE = re.compile(r'\b([A-Z0-9]{8,12})\s+([A-Z0-9]{8,12})\b')
_SOLID_ID_RE = re.compile(r'\b([A-Z0-9]{17,25})\b')
_NEXT_LINE_ID_RE = re.compile(r'^[A-Z0-9-]{15,25}$')

# Date shapes accepted for the birth date value.
_MONTH_RE = re.compile(
    r'(JANUARY|FEBRUARY|MARCH|APRIL|MAY|JUNE|JULY|AUGUST|SEPTEMBER|OCTOBER|'
    r'NOVEMBER|DECEMBER|JAN|FEB|MAR|APR|JUN|JUL|AUG|SEP|OCT|NOV|DEC)'
)
_NUMERIC_DATE_RE = re.compile(r'\d{1,2}[/-]\d{1,2}[/-]\d{4}')
_DAY_MON_YEAR_RE = re.compile(r'\d{1,2}\s+[A-Z]{3}\s+\d{4}')

# Line markers that introduce the NBI ID value.
_ID_LABELS = ('NBI ID NO', 'NBIIDNO', 'NBI ID NUMBER', 'NBIID NUMBER')
# Labels that disqualify a following line from being the ID value.
_ID_VALUE_SKIP_LABELS = ('NAME', 'DATE', 'BIRTH', 'CLEARANCE')

# Labels that disqualify a following line from being a field value.
# 'IDNO' covers an "ID NO" label that OCR fused into one word, which the
# whole-word rule for 'ID' / 'NO' would otherwise let through.
_NAME_SKIP_LABELS = ('NBI', 'ID', 'NO', 'IDNO', 'DATE', 'BIRTH', 'CLEARANCE',
                     'REPUBLIC', 'PHILIPPINES', 'NATIONAL')
_DATE_SKIP_LABELS = ('NBI', 'ID', 'NO', 'IDNO', 'NAME', 'CLEARANCE',
                     'REPUBLIC', 'PHILIPPINES', 'NATIONAL')
_LIT_SKIP_LABELS = ('NBI', 'ID', 'NO', 'IDNO', 'NAME', 'DATE', 'BIRTH',
                    'CLEARANCE', 'REPUBLIC', 'PHILIPPINES', 'NATIONAL',
                    'VALID', 'UNTIL')

# Substrings that mark an ID candidate as address text instead.
_ADDRESS_WORDS = ('STREET', 'ST', 'AVENUE', 'AVE', 'BRGY', 'BARANGAY', 'CITY',
                  'PHASE', 'DOMINGO', 'CAINTA', 'RIZAL')
_SOLID_ADDRESS_WORDS = _ADDRESS_WORDS + ('ADDRESS', 'ATRSTORUARPHASEABRGY')


def _mentions_label(text_upper, labels):
    """Return True when the upper-cased text contains one of ``labels``.

    Labels of one or two characters ('ID', 'NO') must appear as a whole
    word; longer labels match as substrings. Matching the short ones as
    substrings would reject real values such as "DAVID", "ANTONIO" or
    "NOVEMBER 12, 1990".
    """
    words = set(re.findall(r'\w+', text_upper))
    return any(
        (label in words) if len(label) <= 2 else (label in text_upper)
        for label in labels
    )


def _has_letters_and_digits(candidate):
    """Return True when ``candidate`` has an upper-case letter and a digit."""
    return bool(re.search(r'[A-Z]', candidate)) and bool(re.search(r'[0-9]', candidate))


def _contains_any(candidate, words):
    """Return True when any of ``words`` occurs inside ``candidate``."""
    return any(word in candidate for word in words)


def _looks_like_date(text):
    """Return True when ``text`` holds a month name or a numeric date."""
    return bool(
        _MONTH_RE.search(text.upper())
        or _NUMERIC_DATE_RE.search(text)
        or _DAY_MON_YEAR_RE.search(text)
    )


def extract_nbi_id(lines):
    """Pull the NBI ID number, full name, birth date and LIT value from OCR text.

    Args:
        lines: Iterable of recognised text lines. Each item is converted with
            ``str()`` and stripped before use.

    Returns:
        A dict with the keys ``clearance_type`` (always ``'nbi'``),
        ``id_number``, ``full_name``, ``birth_date``, ``lit`` (each a string
        or ``None``) and ``success``, which is True when an ID number or a
        full name was found.
    """
    nbi_id = None
    full_name = None
    birth_date = None
    lit = None

    cleaned_lines = [str(line).strip() for line in lines]

    # First pass: a hyphenated ID on a line of at most three words is the
    # most reliable signal, so it wins over anything found via labels.
    for line in cleaned_lines:
        match = _HYPHEN_ID_RE.search(line)
        if (match and len(line.split()) <= 3
                and _has_letters_and_digits(match.group(1))):
            nbi_id = match.group(1)
            print(f"DEBUG: Found NBI ID (first pass, hyphen): {nbi_id}", file=sys.stderr)
            break

    # Second pass: remaining fields, plus fallback ID detection.
    # A line that yields a value on the line itself is not examined for the
    # remaining fields (hence the `continue` statements).
    for i, line in enumerate(cleaned_lines):
        line_upper = line.upper()

        if not nbi_id:
            if any(label in line_upper for label in _ID_LABELS):
                # Value on the label line, after the colon.
                if ":" in line:
                    id_candidate = re.sub(r'\s+', '', line.split(':', 1)[1])
                    if len(id_candidate) > 5:
                        nbi_id = id_candidate
                        print(f"DEBUG: Found NBI ID (same line): {nbi_id}", file=sys.stderr)
                        continue

                # Value on one of the next two lines.
                for next_line in cleaned_lines[i + 1:i + 3]:
                    if len(next_line) < 5 or _contains_any(next_line.upper(), _ID_VALUE_SKIP_LABELS):
                        continue
                    compact = next_line.replace(' ', '')
                    if _NEXT_LINE_ID_RE.match(compact):
                        nbi_id = compact
                        print(f"DEBUG: Found NBI ID (next line): {nbi_id}", file=sys.stderr)
                        break
                if nbi_id:
                    continue

            # The hyphenated form needs no check here: every line that
            # could qualify was already accepted by the first pass.

            # Two halves separated by whitespace instead of a hyphen.
            match = _SPACED_ID_RE.search(line)
            if match:
                candidate = f"{match.group(1)}-{match.group(2)}"
                if (_has_letters_and_digits(candidate)
                        and not _contains_any(candidate, _ADDRESS_WORDS)):
                    nbi_id = candidate
                    print(f"DEBUG: Found NBI ID (space pattern): {nbi_id}", file=sys.stderr)
                    continue

            # One unbroken run of 17-25 characters: split it near the middle
            # into two halves of 8-12 characters each.
            match = _SOLID_ID_RE.search(line)
            if match:
                candidate = match.group(1)
                if (_has_letters_and_digits(candidate)
                        and not _contains_any(candidate, _SOLID_ADDRESS_WORDS)):
                    mid = len(candidate) // 2
                    for split_point in range(mid - 2, mid + 3):
                        if 8 <= split_point <= len(candidate) - 8:
                            part1 = candidate[:split_point]
                            part2 = candidate[split_point:]
                            if 8 <= len(part1) <= 12 and 8 <= len(part2) <= 12:
                                nbi_id = f"{part1}-{part2}"
                                print(f"DEBUG: Found NBI ID (no hyphen, split): {nbi_id}", file=sys.stderr)
                                break
                    if nbi_id:
                        continue

        # Full name: a "NAME" label that is not the "NBI ID ..." label line.
        if not full_name:
            if "NAME" in line_upper and not ("NBI" in line_upper and "ID" in line_upper):
                if ":" in line:
                    name_part = line.split(':', 1)[1].strip()
                    if re.search(r'[A-Za-z]{2,}', name_part) and len(name_part) > 2:
                        full_name = name_part
                        print(f"DEBUG: Found full name (same line): {full_name}", file=sys.stderr)
                        continue

                # Otherwise take the first name-like line among the next four.
                for next_line in cleaned_lines[i + 1:i + 5]:
                    if _mentions_label(next_line.upper(), _NAME_SKIP_LABELS):
                        continue
                    if (re.search(r'[A-Za-z]{2,}', next_line)
                            and not re.match(r'^\d+$', next_line)
                            and len(next_line) > 3):
                        full_name = next_line
                        print(f"DEBUG: Found full name: {full_name}", file=sys.stderr)
                        break

        # Birth date: any label mentioning both DATE and BIRTH (this covers
        # "DATE OF BIRTH", "BIRTH DATE" and "BIRTHDATE").
        if not birth_date:
            if "DATE" in line_upper and "BIRTH" in line_upper:
                if ":" in line:
                    date_part = line.split(':', 1)[1].strip()
                    if _looks_like_date(date_part):
                        birth_date = date_part
                        print(f"DEBUG: Found birth date (same line): {birth_date}", file=sys.stderr)
                        continue

                for next_line in cleaned_lines[i + 1:i + 5]:
                    if _mentions_label(next_line.upper(), _DATE_SKIP_LABELS):
                        continue
                    if _looks_like_date(next_line):
                        birth_date = next_line
                        print(f"DEBUG: Found birth date: {birth_date}", file=sys.stderr)
                        break

        # LIT field: a line containing "LIT" that is not the NBI ID label.
        if not lit:
            if "LIT" in line_upper and not ("ID" in line_upper and "NBI" in line_upper):
                if ":" in line:
                    lit_part = line.split(':', 1)[1].strip()
                    if lit_part:
                        lit = lit_part
                        print(f"DEBUG: Found LIT (same line): {lit}", file=sys.stderr)
                        continue

                # Otherwise take the first non-label, non-empty line among
                # the next three.
                for next_line in cleaned_lines[i + 1:i + 4]:
                    if _mentions_label(next_line.upper(), _LIT_SKIP_LABELS):
                        continue
                    if next_line:
                        lit = next_line
                        print(f"DEBUG: Found LIT: {lit}", file=sys.stderr)
                        break

    return {
        'clearance_type': 'nbi',
        'id_number': nbi_id,
        'full_name': full_name,
        'birth_date': birth_date,
        'lit': lit,
        'success': nbi_id is not None or full_name is not None
    }

def extract_ocr_lines_simple(image_path):
    """Run PaddleOCR with orientation/unwarping enabled and parse the text.

    Used by the main section as the second attempt when the plain
    configuration did not produce a successful result.

    Args:
        image_path: Path of the image file handed to PaddleOCR.

    Returns:
        The dict produced by ``extract_nbi_id`` for the recognised lines, or
        an all-``None`` result with ``success`` False when no text was
        recognised.
    """
    # Everything the engine prints while loading and predicting goes to
    # stderr.
    with redirect_stdout(sys.stderr), redirect_stderr(sys.stderr):
        engine = PaddleOCR(
            use_doc_orientation_classify=True,  # Enable orientation detection
            use_doc_unwarping=True,            # Enable document unwarping
            use_textline_orientation=True,     # Enable text line orientation
            lang='en'                          # Set language to English
        )
        try:
            results = engine.predict(image_path)
        except Exception as e:
            # Fall back to the ocr() entry point when predict() fails.
            print(f"DEBUG: predict() failed: {e}, trying ocr()", file=sys.stderr)
            results = engine.ocr(image_path) if hasattr(engine, 'ocr') else None

    texts = []
    try:
        if results and isinstance(results, list):
            head = results[0]
            type_name = type(head).__name__
            if 'OCRResult' in type_name or 'ocr_result' in str(type(head)).lower():
                # Result object: read it as a mapping and take 'rec_texts'.
                print(f"DEBUG: Detected OCRResult object format (type: {type_name})", file=sys.stderr)
                try:
                    if hasattr(head, 'keys'):
                        as_dict = dict(head)
                        recognised = as_dict['rec_texts'] if 'rec_texts' in as_dict else None
                        if isinstance(recognised, list):
                            texts = [str(entry) for entry in recognised if entry]
                            print(f"DEBUG: Extracted {len(texts)} text lines from rec_texts", file=sys.stderr)
                except Exception as e:
                    print(f"DEBUG: Error accessing OCRResult: {e}", file=sys.stderr)
            else:
                # List format: each row's second element is a sequence whose
                # first item is the text.
                rows = head if isinstance(head, list) else results
                for row in rows:
                    if not (isinstance(row, (list, tuple)) and len(row) >= 2):
                        continue
                    detail = row[1]
                    if isinstance(detail, (list, tuple)) and len(detail) >= 1:
                        texts.append(str(detail[0]))
    except Exception as e:
        print(f"DEBUG: Error processing OCR results: {str(e)}", file=sys.stderr)

    if not texts:
        return {'clearance_type': 'nbi', 'id_number': None, 'full_name': None, 'birth_date': None, 'lit': None, 'success': False}
    return extract_nbi_id(texts)

def extract_ocr_lines(image_path):
    """Run PaddleOCR with its plain configuration and parse the text.

    Orientation classification, unwarping and text-line orientation are all
    switched off. Regular files found in the local ``output`` directory are
    removed before the OCR run.

    Args:
        image_path: Path of the image file handed to PaddleOCR.

    Returns:
        The dict produced by ``extract_nbi_id`` for the recognised lines. When
        the file does not exist or no text was recognised, a dict with the
        same keys (``clearance_type``, ``id_number``, ``full_name``,
        ``birth_date``, ``lit``, ``success``) whose fields are ``None`` and
        whose ``success`` is False.
    """
    # Same shape on every failure path, including the 'lit' key, so callers
    # can read any field without a KeyError.
    empty_result = {'clearance_type': 'nbi', 'id_number': None, 'full_name': None, 'birth_date': None, 'lit': None, 'success': False}

    if not os.path.exists(image_path):
        return empty_result

    # Ensure output directory exists
    os.makedirs("output", exist_ok=True)

    # Clear previous output files. Only regular files are removed:
    # os.remove() raises on a directory, which would abort the whole run.
    for old_file in glob.glob("output/*"):
        if os.path.isfile(old_file):
            os.remove(old_file)

    with redirect_stdout(sys.stderr), redirect_stderr(sys.stderr):
        ocr = PaddleOCR(
            use_doc_orientation_classify=False,
            use_doc_unwarping=False,
            use_textline_orientation=False,
            lang='en'
        )
        try:
            results = ocr.predict(image_path)
        except Exception as e:
            # Fall back to the ocr() entry point when predict() fails.
            print(f"DEBUG: predict() failed: {e}, trying ocr()", file=sys.stderr)
            if hasattr(ocr, 'ocr'):
                results = ocr.ocr(image_path)
            else:
                results = None

    # Two result layouts are handled: a list whose first item is an
    # OCRResult-like mapping with a 'rec_texts' list, and a list of rows
    # whose second element starts with the recognised text.
    all_text = []
    try:
        if results and isinstance(results, list):
            first_item = results[0]
            item_type_name = type(first_item).__name__
            is_ocr_result = 'OCRResult' in item_type_name or 'ocr_result' in str(type(first_item)).lower()

            if is_ocr_result:
                print(f"DEBUG: Detected OCRResult object format (type: {item_type_name})", file=sys.stderr)
                try:
                    if hasattr(first_item, 'keys'):
                        ocr_dict = dict(first_item)
                        rec_texts = ocr_dict.get('rec_texts')
                        if isinstance(rec_texts, list):
                            all_text = [str(t) for t in rec_texts if t]
                            print(f"DEBUG: Extracted {len(all_text)} text lines from rec_texts", file=sys.stderr)
                except Exception as e:
                    print(f"DEBUG: Error accessing OCRResult: {e}", file=sys.stderr)
            else:
                lines = first_item if isinstance(first_item, list) else results
                for item in lines:
                    if isinstance(item, (list, tuple)) and len(item) >= 2:
                        meta = item[1]
                        if isinstance(meta, (list, tuple)) and len(meta) >= 1:
                            all_text.append(str(meta[0]))
    except Exception as e:
        print(f"DEBUG: Error processing OCR results: {str(e)}", file=sys.stderr)
        import traceback
        print(f"DEBUG: Traceback: {traceback.format_exc()}", file=sys.stderr)

    print(f"DEBUG: Extracted text lines: {all_text}", file=sys.stderr)
    return extract_nbi_id(all_text) if all_text else empty_result
    
# Main: download the image named on the command line, run OCR on it and
# write exactly one JSON document to the real stdout. All diagnostics go to
# stderr. Exit status is 1 when no URL was given or processing raised.
if len(sys.argv) < 2:
    sys.stdout = original_stdout
    print(json.dumps({"success": False, "error": "No image URL provided"}))
    sys.exit(1)

image_url = sys.argv[1]
print(f"DEBUG: Processing NBI image URL: {image_url}", file=sys.stderr)

try:
    image_path = download_image(image_url, 'temp_image.jpg')
    print(f"DEBUG: Image downloaded to: {image_path}", file=sys.stderr)

    # Try the original OCR method first
    ocr_results = extract_ocr_lines(image_path)
    print(f"DEBUG: OCR results from extract_ocr_lines: {ocr_results}", file=sys.stderr)

    # If original method fails, try simple method
    if not ocr_results['success']:
        print("DEBUG: Original method failed, trying simple method", file=sys.stderr)
        ocr_results = extract_ocr_lines_simple(image_path)
        print(f"DEBUG: OCR results from extract_ocr_lines_simple: {ocr_results}", file=sys.stderr)

    # The temporary image is removed in the `finally` block below. Removing
    # it here as well would let a failed delete replace a successful OCR
    # result with an error response.

    # Create the response object
    response = {
        "success": ocr_results['success'],
        "ocr_results": ocr_results
    }

    # Restore stdout and print only the JSON response
    sys.stdout = original_stdout
    sys.stdout.write(json.dumps(response))
    sys.stdout.flush()

except Exception as e:
    # Restore stdout for error JSON
    sys.stdout = original_stdout
    sys.stdout.write(json.dumps({"success": False, "error": str(e)}))
    sys.stdout.flush()
    sys.exit(1)
finally:
    # Best-effort cleanup of the downloaded image; runs on success, on
    # failure and while sys.exit() is unwinding. Only filesystem errors are
    # ignored so that other exceptions are not swallowed.
    try:
        if os.path.exists('temp_image.jpg'):
            os.remove('temp_image.jpg')
    except OSError:
        pass