Spaces:
Sleeping
Sleeping
| import sys, json, os, glob, requests | |
| import re | |
| import time | |
| from contextlib import redirect_stdout, redirect_stderr | |
# Route every print to stderr from here on; the real stdout handle is kept in
# original_stdout so that only the final JSON needs to go through it.
original_stdout = sys.stdout
sys.stdout = sys.stderr

# These must be in place before paddleocr is imported: quieten PaddleOCR's
# logging and select headless Qt / display settings.
os.environ.update({
    'PADDLEOCR_LOG_LEVEL': 'ERROR',
    'QT_QPA_PLATFORM': 'offscreen',
    'DISPLAY': ':99',
})
| # Import PaddleOCR after setting environment variables | |
| from paddleocr import PaddleOCR | |
def download_image(url, output_path='temp_police_image.jpg'):
    """Download ``url`` and write the response body to ``output_path``.

    A ``t=<unix time>`` query parameter and no-cache request headers are
    added so that intermediaries do not serve a stale copy.

    Args:
        url: Address of the image to fetch.
        output_path: File to (re)create with the downloaded bytes.

    Returns:
        ``output_path``.

    Raises:
        requests.HTTPError: if the server answers with an error status
            (raised by ``raise_for_status``).
    """
    # Drop any leftover file first so a failed download cannot be mistaken
    # for a fresh one.
    if os.path.exists(output_path):
        os.remove(output_path)

    # Cache-busting parameter: '&' when the URL already has a query string.
    separator = '&' if '?' in url else '?'
    url = f'{url}{separator}t={int(time.time())}'

    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Cache-Control': 'no-cache, no-store, must-revalidate',
        'Pragma': 'no-cache',
        'Expires': '0',
    }
    response = requests.get(url, headers=request_headers, timeout=30)
    response.raise_for_status()

    with open(output_path, 'wb') as image_file:
        image_file.write(response.content)
    return output_path
def format_name(name):
    """Normalise spacing in an OCR'd personal name.

    What is repaired:
      * runs of whitespace collapse to one space;
      * a comma directly followed by a capital gets one space after it
        ("JAVA,ALBERT" -> "JAVA, ALBERT");
      * fused words are split where the letter case reveals the boundary
        ("JuanMariaClara" -> "Juan Maria Clara", "JAVAAlbert" -> "JAVA Albert").

    Fused words written entirely in capitals ("JAVAALBERT") carry no case
    boundary and are returned unchanged.

    Args:
        name: Raw name text, or a falsy value.

    Returns:
        The cleaned name, or ``None`` when ``name`` is falsy.
    """
    if not name:
        return None

    name = ' '.join(name.split())
    # One pattern covers both ",X" and ",   X".
    name = re.sub(r',\s*([A-Z])', r', \1', name)

    if ',' in name:
        formatted_parts = []
        for part in name.split(','):
            part = part.strip()
            # Capital run followed by a capitalised word: "ALBERTJoy" -> "ALBERT Joy".
            part = re.sub(r'([A-Z]+)([A-Z][a-z])', r'\1 \2', part)
            # Capitalised word followed by another one.  The second word is
            # only looked ahead at, not consumed, so it can itself be the
            # left side of the next split; consuming it left every third
            # word fused ("JuanMariaClara" -> "Juan MariaClara").
            part = re.sub(r'([A-Z][a-z]+)(?=[A-Z][a-z])', r'\1 ', part)
            formatted_parts.append(part)
        name = ', '.join(formatted_parts)
    else:
        # lower-case letter followed by a capital: "JuanDela" -> "Juan Dela".
        name = re.sub(r'([a-z])([A-Z])', r'\1 \2', name)
        # Capital run followed by a capitalised word: "JAVAAlbert" -> "JAVA Albert".
        name = re.sub(r'([A-Z]{2,})([A-Z][a-z])', r'\1 \2', name)
        name = re.sub(r'([A-Z]+)([A-Z][a-z]+)', r'\1 \2', name)

    # The substitutions may leave a trailing blank (e.g. after a final comma).
    return ' '.join(name.split()).strip()
def format_address(address):
    """Normalise spacing in an OCR'd address.

    Applied in order:
      * whitespace runs collapse to a single space;
      * "#" + letters + digits gets a space before the digits
        ("#BLK11" -> "#BLK 11");
      * a capital followed by a capitalised word is split
        ("PUROKSan" -> "PUROK San"); text written entirely in capitals
        has no such boundary and is left as is;
      * a comma followed by a capital gets exactly one space after it
        ("CITY,NUEVA" -> "CITY, NUEVA").

    Args:
        address: Raw address text, or a falsy value.

    Returns:
        The cleaned address, or ``None`` when ``address`` is falsy.
    """
    if not address:
        return None

    rewrites = (
        (r'#([A-Z]+)(\d+)', r'#\1 \2'),
        (r'([A-Z])([A-Z][a-z]+)', r'\1 \2'),
        (r',([A-Z])', r', \1'),
        (r',\s*([A-Z])', r', \1'),
    )
    cleaned = ' '.join(address.split())
    for pattern, replacement in rewrites:
        cleaned = re.sub(pattern, replacement, cleaned)
    return ' '.join(cleaned.split()).strip()
def format_birth_place(place):
    """Normalise spacing in an OCR'd place of birth.

    Whitespace runs collapse to one space, a comma followed by a capital
    gets exactly one space after it ("DILASAG,AURORA" -> "DILASAG, AURORA"),
    and a capital followed by a capitalised word is split
    ("QUEZONCity" -> "QUEZON City").  All-capital text is not split.

    Args:
        place: Raw birth-place text, or a falsy value.

    Returns:
        The cleaned text, or ``None`` when ``place`` is falsy.
    """
    if not place:
        return None

    rewrites = (
        (r',([A-Z])', r', \1'),
        (r',\s*([A-Z])', r', \1'),
        (r'([A-Z])([A-Z][a-z]+)', r'\1 \2'),
    )
    cleaned = ' '.join(place.split())
    for pattern, replacement in rewrites:
        cleaned = re.sub(pattern, replacement, cleaned)
    return ' '.join(cleaned.split()).strip()
def format_birth_date(date):
    """Repair common OCR damage in a "Month DD, YYYY" birth date.

    Steps:
      1. Known garbled month spellings ("Juy", "Marc", "Jun", "Augu", ...)
         are replaced by the full month name.  The replacement is applied
         only when the garbled text is a complete alphabetic token, so a
         correctly read "March" or "June" is left alone (a plain substring
         replace turned them into "Marchh" / "Junee").
      2. "Month 19DD, YYYY", where DD is at most 31, is taken to be a day
         that picked up a spurious "19" prefix and becomes "Month DD, YYYY".
      3. Spacing around the comma is normalised and whitespace collapsed.

    Args:
        date: Raw date text, or a falsy value.

    Returns:
        The cleaned date string, or ``None`` when ``date`` is falsy.
    """
    if not date:
        return None

    month_fixes = (
        ('Juy', 'July'),
        ('Januay', 'January'),
        ('Februay', 'February'),
        ('Marc', 'March'),
        ('Apil', 'April'),
        ('Jun', 'June'),
        ('Augu', 'August'),
        ('Septemb', 'September'),
        ('Octob', 'October'),
        ('Novemb', 'November'),
        ('Decemb', 'December'),
    )
    for garbled, month in month_fixes:
        # Letter look-arounds rather than \b so that "Juy05" is still fixed
        # while "June" / "March" are not touched.
        date = re.sub(rf'(?<![A-Za-z]){garbled}(?![A-Za-z])', month, date)

    def _restore_day(match):
        """Turn 'Month 19DD, YYYY' back into 'Month DD, YYYY' when DD <= 31."""
        day = match.group(2)
        if int(day) > 31:
            return match.group(0)
        return f'{match.group(1)} {day}, {match.group(3)}'

    date = re.sub(r'(\w+)\s+19(\d{2}),\s*(\d{4})', _restore_day, date)

    # Canonical "Month DD, YYYY" spacing.
    date = re.sub(r'(\w+)\s+(\d{1,2})\s*,\s*(\d{4})', r'\1 \2, \3', date)
    return ' '.join(date.split()).strip()
# Words that show a candidate "name" is really boilerplate text.
_NAME_NOISE_WORDS = (
    'THUMBMARK', 'APPEARING', 'HEREIN', 'HASUNDERGONE', 'RECORD',
    'VERIFICATION', 'THROUGH', 'CRIME',
)
_NAME_SAME_LINE_REJECT = _NAME_NOISE_WORDS + ('DATABASES', 'RESULT')
_NAME_LOOKAHEAD_SKIP = (
    ('ADDRESS', 'BIRTH', 'CITIZEN', 'GENDER')
    + _NAME_SAME_LINE_REJECT
    + ('CERTIFY', 'PERSON', 'WHOSE', 'PHOTO', 'SIGNATURE')
)
_NAME_COLON_REJECT = _NAME_NOISE_WORDS + ('ADDRESS', 'BIRTH')
_GENDER_LABEL_WORDS = (
    'NAME', 'ADDRESS', 'BIRTH', 'CITIZEN', 'DATE', 'PLACE', 'PICTURE',
    'SIGNATURE', 'THUMBMARK',
)
_GENDER_VALUES = ('MALE', 'FEMALE', 'M', 'F')
# "ID" as a label ("ID", "ID NO", "IDNO"), not as letters inside a name
# such as "DAVID".
_ID_LABEL_RE = re.compile(r'\bID(?:\b|\s*NO)')
# Month-name date such as "July 05, 1991"; "Juy" is a known misread of "July".
_MONTH_DATE_RE = re.compile(
    r'\b(?:January|February|March|April|May|June|July|August|September|'
    r'October|November|December|Juy|Jan|Feb|Mar|Apr|Jun|Jul|Aug|Sep|Oct|'
    r'Nov|Dec)\s+\d{1,2}[,\s]+\d{4}',
    re.IGNORECASE,
)


def _contains_any(text, words):
    """Return True when any of ``words`` occurs as a substring of ``text``."""
    return any(word in text for word in words)


def _normalize_gender(value):
    """Return 'Male'/'Female'/'M'/'F' for a recognised value, else None."""
    value = value.strip().upper()
    if value not in _GENDER_VALUES:
        return None
    return value.capitalize() if len(value) > 1 else value


def extract_police_details(lines):
    """Pull the labelled fields of a police clearance out of OCR text lines.

    Each field is looked for by its label ("NAME", "ADDRESS", "BIRTH DATE",
    "BIRTH PLACE", "CITIZENSHIP", "GENDER", "ID NO").  The value is taken
    from after a colon on the label line or, when the label stands alone,
    from the following line(s).  The first value found for a field wins,
    except the labelled ID number and the status, which later lines may
    overwrite.

    Changes from the previous revision:
      * A name candidate is no longer skipped merely because it contains
        the letters "ID" (e.g. "DAVID"); only an "ID" label is skipped.
      * Two-digit numbers in the birth date are no longer prefixed with
        "19"; that rewrote the day of month ("05" -> "1905").
      * The month-name date pattern used to be run against upper-cased text
        and so never matched.  It is now case-insensitive and is used only
        on the line directly after a "BIRTH DATE" label, so that other
        dates on the document are not picked up.
      * Non-string entries in ``lines`` no longer crash the look-ahead.
      * "BIRTHPLACE" is accepted, like "BIRTHDATE".

    Args:
        lines: OCR text lines in reading order.  Non-string entries are
            treated as empty lines.

    Returns:
        dict with keys ``clearance_type``, ``id_number``, ``full_name``,
        ``address``, ``birth_date``, ``birth_place``, ``citizenship``,
        ``gender``, ``status`` and ``success``.  ``success`` is True when a
        name or an ID number was found.  Name, address, birth place and
        birth date are passed through the module's ``format_*`` helpers.
    """
    details = {
        'clearance_type': 'police',
        'id_number': None,
        'full_name': None,
        'address': None,
        'birth_date': None,
        'birth_place': None,
        'citizenship': None,
        'gender': None,
        'status': None,
        'success': False
    }

    # Blank out (rather than drop) non-strings so neighbour look-ups by
    # index stay aligned and never call .strip() on a non-string.
    lines = [entry if isinstance(entry, str) else '' for entry in (lines or [])]
    total = len(lines)

    for i, line in enumerate(lines):
        line_stripped = line.strip()
        line_upper = line_stripped.upper()

        # ---- Name: value after the colon, or on one of the next lines ----
        if "NAME" in line_upper and not details['full_name']:
            if ":" in line:
                name_part = line.split(':', 1)[1].strip()
                if len(name_part) > 2 and not _contains_any(name_part.upper(), _NAME_SAME_LINE_REJECT):
                    details['full_name'] = name_part
                    print(f"DEBUG: Found full name (same line): {details['full_name']}", file=sys.stderr)
                    # Nothing else is read from a line that carried the name.
                    continue
            for j in range(1, min(5, total - i)):
                next_line = lines[i + j].strip()
                next_upper = next_line.upper()
                # Other labels and boilerplate are not names.
                if _contains_any(next_upper, _NAME_LOOKAHEAD_SKIP) or _ID_LABEL_RE.search(next_upper):
                    continue
                if next_line.startswith(':') and len(next_line) > 1:
                    # Most reliable shape: ":IRENE TIMBAL VILLAFUERTE".
                    name_part = next_line[1:].strip()
                    if len(name_part) > 3 and re.search(r'[A-Za-z]{2,}', name_part):
                        details['full_name'] = name_part
                        print(f"DEBUG: Found full name (colon line): {details['full_name']}", file=sys.stderr)
                        break
                elif (re.match(r'^[A-Z\s,]+$', next_line) and
                      len(next_line.split()) >= 2 and
                      5 < len(next_line) < 50):
                    # Fallback: an all-capital, multi-word line of plausible length.
                    details['full_name'] = next_line
                    print(f"DEBUG: Found full name (all caps line): {details['full_name']}", file=sys.stderr)
                    break

        # ---- Name: a ":VALUE" line whose label was split off by OCR ----
        if not details['full_name'] and line_stripped.startswith(':') and len(line_stripped) > 5:
            name_candidate = line_stripped[1:].strip()
            if (re.search(r'[A-Za-z]{2,}', name_candidate) and
                    3 < len(name_candidate) < 50 and
                    not _contains_any(name_candidate.upper(), _NAME_COLON_REJECT)):
                # Only accept it when the previous line is not another field's label.
                if i > 0:
                    prev_line = lines[i - 1].strip().upper()
                    if "ADDRESS" not in prev_line and "BIRTH" not in prev_line and "CITIZEN" not in prev_line:
                        details['full_name'] = name_candidate
                        print(f"DEBUG: Found full name (colon pattern): {details['full_name']}", file=sys.stderr)

        # ---- Address ----
        if "ADDRESS" in line_upper and not details['address']:
            if ":" in line:
                addr_part = line.split(':')[1].strip()
                if addr_part:
                    details['address'] = addr_part
            elif i + 1 < total:
                # Collect value lines until the next field label shows up.
                addr_parts = []
                for j in range(1, min(4, total - i)):
                    next_line = lines[i + j].strip()
                    if next_line.startswith(':') and len(next_line) > 1:
                        addr_parts.append(next_line[1:].strip())
                    elif "BIRTH" not in next_line.upper() and "CITIZEN" not in next_line.upper():
                        if ":" in next_line:
                            addr_parts.append(next_line.split(':', 1)[1].strip())
                        elif len(next_line) > 2:
                            addr_parts.append(next_line)
                    else:
                        break
                if addr_parts:
                    details['address'] = ' '.join(addr_parts).strip()

        # ---- Birth date ----
        if ("BIRTH DATE" in line_upper or "BIRTHDATE" in line_upper) and not details['birth_date']:
            date_part = ''
            if ":" in line:
                date_part = line.split(':', 1)[1].strip()
            elif i + 1 < total:
                next_line = lines[i + 1].strip()
                if ":" in next_line:
                    date_part = next_line.split(':', 1)[1].strip()
                else:
                    # Value on the next line without a colon: accept it only
                    # when it looks like a month-name date.
                    found = _MONTH_DATE_RE.search(next_line)
                    if found:
                        date_part = found.group()
            if date_part:
                # Known OCR misreads seen on these documents.
                date_part = date_part.replace('Juy', 'July')
                date_part = re.sub(r'\b1001\b', '1991', date_part)
                details['birth_date'] = date_part

        # ---- Birth place ----
        if ("BIRTH PLACE" in line_upper or "BIRTHPLACE" in line_upper) and not details['birth_place']:
            if ":" in line:
                details['birth_place'] = line.split(':', 1)[1].strip()
            elif i + 1 < total:
                next_line = lines[i + 1].strip()
                if next_line.startswith(':') and len(next_line) > 1:
                    details['birth_place'] = next_line[1:].strip()
                elif ":" in next_line and "CITIZEN" not in next_line.upper():
                    details['birth_place'] = next_line.split(':', 1)[1].strip()

        # ---- Citizenship ----
        if "CITIZENSHIP" in line_upper and not details['citizenship']:
            if ":" in line:
                details['citizenship'] = line.split(':', 1)[1].strip()
            elif i + 1 < total:
                next_line = lines[i + 1].strip()
                if next_line.startswith(':') and len(next_line) > 1:
                    details['citizenship'] = next_line[1:].strip()
                elif ":" in next_line:
                    details['citizenship'] = next_line.split(':', 1)[1].strip()

        # ---- Gender: value after the colon, or on one of the next lines ----
        if "GENDER" in line_upper and not details['gender']:
            if ":" in line:
                gender = _normalize_gender(line.split(':', 1)[1])
                if gender:
                    details['gender'] = gender
                    print(f"DEBUG: Found gender (same line): {details['gender']}", file=sys.stderr)
                    # Nothing else is read from a line that carried the gender.
                    continue
            for j in range(1, min(4, total - i)):
                next_line = lines[i + j].strip()
                next_upper = next_line.upper()
                if _contains_any(next_upper, _GENDER_LABEL_WORDS):
                    continue
                if next_line.startswith(':') and len(next_line) > 1:
                    gender = _normalize_gender(next_line[1:])
                    if gender:
                        details['gender'] = gender
                        print(f"DEBUG: Found gender (colon line): {details['gender']}", file=sys.stderr)
                        break
                elif next_upper in _GENDER_VALUES:
                    details['gender'] = _normalize_gender(next_line)
                    print(f"DEBUG: Found gender (direct): {details['gender']}", file=sys.stderr)
                    break
                elif ":" in next_line:
                    gender = _normalize_gender(next_line.split(':', 1)[1])
                    if gender:
                        details['gender'] = gender
                        print(f"DEBUG: Found gender (colon in line): {details['gender']}", file=sys.stderr)
                        break

        # ---- ID number: labelled value, else an ID-shaped token ----
        if "ID NO" in line_upper:
            parts = line.split(':')
            if len(parts) > 1:
                details['id_number'] = parts[1].strip()
        if not details['id_number']:
            # 4-5 letters followed by 10-15 digits, e.g. "TRARH" + digits.
            id_match = re.search(r'\b[A-Z]{4,5}\d{10,15}\b', line_upper)
            if id_match:
                details['id_number'] = id_match.group()

        # ---- Status ----
        if "NO RECORD ON FILE" in line_upper:
            details['status'] = "NO RECORD ON FILE"
        elif "HAS A RECORD" in line_upper or "WITH RECORD" in line_upper:
            details['status'] = "HAS RECORD"

    if details['full_name'] or details['id_number']:
        details['success'] = True

    # Tidy up the free-text fields.
    if details['full_name']:
        details['full_name'] = format_name(details['full_name'])
    if details['address']:
        details['address'] = format_address(details['address'])
    if details['birth_place']:
        details['birth_place'] = format_birth_place(details['birth_place'])
    if details['birth_date']:
        details['birth_date'] = format_birth_date(details['birth_date'])
    return details
# Keys under which recognised text has been looked for in a result mapping.
# 'rec_texts' (a flat list of strings) is tried first.
_OCR_TEXT_KEYS = (
    'rec_texts', 'rec_text', 'dt_polys', 'ocr_text', 'text', 'texts',
    'result', 'results', 'ocr_result', 'dt_boxes',
)


def _ocr_texts_from_field(key, value):
    """Return the text strings held by one field of an OCR result."""
    if isinstance(value, str):
        return [value] if value else []
    if not isinstance(value, list):
        return []
    texts = []
    for item in value:
        if key == 'rec_texts':
            # Flat list of recognised strings.
            if isinstance(item, str):
                if item.strip():
                    texts.append(item.strip())
            elif item:
                texts.append(str(item))
        elif isinstance(item, (list, tuple)) and len(item) >= 2:
            # Nested shape: [[coords], (text, confidence)].
            text_part = item[1]
            if isinstance(text_part, (list, tuple)) and len(text_part) >= 1:
                texts.append(str(text_part[0]))
        elif isinstance(item, str):
            texts.append(item)
    return texts


def _ocr_texts_from_mapping(mapping):
    """Return the texts of the first known key in ``mapping`` that yields any."""
    for key in _OCR_TEXT_KEYS:
        if key not in mapping:
            continue
        value = mapping[key]
        print(f"DEBUG: Found key '{key}': {type(value)}, length: {len(value) if isinstance(value, list) else 'N/A'}", file=sys.stderr)
        texts = _ocr_texts_from_field(key, value)
        # Only stop on a key that actually produced text; a geometry-only
        # key must not hide a later text key.
        if texts:
            return texts
    return []


def _ocr_texts_from_result(ocr_result):
    """Extract text from one OCRResult-style object, trying several access paths."""
    # 1. Mapping access (the object exposes keys()).
    if hasattr(ocr_result, 'keys'):
        try:
            ocr_dict = dict(ocr_result)
            print(f"DEBUG: OCRResult as dict keys: {list(ocr_dict.keys())}", file=sys.stderr)
            texts = _ocr_texts_from_mapping(ocr_dict)
            if texts:
                return texts
        except Exception as e:
            print(f"DEBUG: Error accessing OCRResult as dict: {e}", file=sys.stderr)

    # 2. JSON view.  NOTE(review): `json` appears to be a property in some
    # releases and a method in others, and may nest the payload under
    # 'res' - confirm against the installed version.  Both are handled.
    if hasattr(ocr_result, 'json'):
        try:
            json_data = ocr_result.json
            if callable(json_data):
                json_data = json_data()
            print(f"DEBUG: OCRResult.json() type: {type(json_data)}", file=sys.stderr)
            if isinstance(json_data, dict):
                print(f"DEBUG: OCRResult.json() keys: {list(json_data.keys())}", file=sys.stderr)
                texts = _ocr_texts_from_mapping(json_data)
                if not texts and isinstance(json_data.get('res'), dict):
                    texts = _ocr_texts_from_mapping(json_data['res'])
                if texts:
                    return texts
        except Exception as e:
            print(f"DEBUG: Error calling json(): {e}", file=sys.stderr)

    # 3. Plain attributes.
    for attr in ('rec_text', 'text'):
        if not hasattr(ocr_result, attr):
            continue
        value = getattr(ocr_result, attr)
        print(f"DEBUG: Found {attr} attribute: {type(value)}", file=sys.stderr)
        if isinstance(value, list):
            texts = [str(t) for t in value if t]
        else:
            texts = [str(value)] if value else []
        if texts:
            return texts

    # 4. Nothing worked: dump what the object looks like to help debugging.
    print(f"DEBUG: Could not find text in OCRResult, trying to inspect structure", file=sys.stderr)
    try:
        print(f"DEBUG: OCRResult repr: {repr(ocr_result)[:500]}", file=sys.stderr)
        if hasattr(ocr_result, 'keys'):
            all_keys = list(ocr_result.keys())
            print(f"DEBUG: All OCRResult keys: {all_keys}", file=sys.stderr)
            for key in all_keys:
                try:
                    val = ocr_result[key]
                    print(f"DEBUG: Key '{key}' type: {type(val)}, value preview: {str(val)[:100]}", file=sys.stderr)
                except Exception as e:
                    print(f"DEBUG: Could not read key '{key}': {e}", file=sys.stderr)
    except Exception as e:
        print(f"DEBUG: Error inspecting structure: {e}", file=sys.stderr)
    return []


def extract_ocr_lines(image_path):
    """Run PaddleOCR on ``image_path`` and extract the police clearance fields.

    The OCR engine is created with document orientation, unwarping and
    text-line orientation disabled.  ``ocr()`` is tried first; if it raises,
    ``predict()`` is used when the engine has it.  Both result shapes are
    understood: OCRResult-style objects and the older list of
    ``[box, (text, confidence)]`` entries.

    Args:
        image_path: Path of the image file to read.

    Returns:
        ``{'success': False, 'error': 'File not found'}`` when the file does
        not exist; otherwise the dict produced by ``extract_police_details``
        for the recognised lines, or that dict's all-``None`` /
        ``success=False`` form when no text was recognised.
    """
    if not os.path.exists(image_path):
        return {'success': False, 'error': 'File not found'}

    file_size = os.path.getsize(image_path)
    print(f"DEBUG: Image file size: {file_size} bytes", file=sys.stderr)

    # Keep any library chatter away from stdout.
    with redirect_stdout(sys.stderr), redirect_stderr(sys.stderr):
        ocr = PaddleOCR(
            use_doc_orientation_classify=False,
            use_doc_unwarping=False,
            use_textline_orientation=False,
            lang='en'
        )
        try:
            results = ocr.ocr(image_path)
        except Exception as e:
            print(f"DEBUG: ocr() failed: {e}, trying predict()", file=sys.stderr)
            results = ocr.predict(image_path) if hasattr(ocr, 'predict') else None

    # Describe the raw result shape.
    print(f"DEBUG: Raw OCR results type: {type(results)}", file=sys.stderr)
    print(f"DEBUG: Raw OCR results is None: {results is None}", file=sys.stderr)
    if results is not None:
        print(f"DEBUG: Raw OCR results length: {len(results) if isinstance(results, list) else 'N/A'}", file=sys.stderr)
        if isinstance(results, list) and len(results) > 0:
            print(f"DEBUG: First level item type: {type(results[0])}", file=sys.stderr)
            print(f"DEBUG: First level item: {str(results[0])[:200] if results[0] else 'None'}", file=sys.stderr)
            if isinstance(results[0], list) and len(results[0]) > 0:
                print(f"DEBUG: Second level first item: {str(results[0][0])[:200] if results[0][0] else 'None'}", file=sys.stderr)

    all_text = []
    try:
        if results and isinstance(results, list):
            first_item = results[0]
            item_type_name = type(first_item).__name__
            is_ocr_result = 'OCRResult' in item_type_name or 'ocr_result' in str(type(first_item)).lower()
            if is_ocr_result:
                print(f"DEBUG: Detected OCRResult object format (type: {item_type_name})", file=sys.stderr)
                attrs = dir(first_item)
                print(f"DEBUG: OCRResult attributes: {[a for a in attrs if not a.startswith('_')]}", file=sys.stderr)
                for ocr_result in results:
                    all_text.extend(_ocr_texts_from_result(ocr_result))
            else:
                # Old format: results[0] is the list of detections for the page.
                lines = first_item if isinstance(first_item, list) else results
                print(f"DEBUG: Processing lines (old format), count: {len(lines) if isinstance(lines, list) else 'N/A'}", file=sys.stderr)
                for item in lines:
                    if isinstance(item, (list, tuple)) and len(item) >= 2:
                        meta = item[1]
                        if isinstance(meta, (list, tuple)) and len(meta) >= 1:
                            all_text.append(str(meta[0]))
    except Exception as e:
        # Best effort: report and fall through with whatever was collected.
        print(f"DEBUG: Error processing OCR results: {str(e)}", file=sys.stderr)
        import traceback
        print(f"DEBUG: Traceback: {traceback.format_exc()}", file=sys.stderr)
        if results and isinstance(results, list) and len(results) > 0:
            first_item = results[0]
            print(f"DEBUG: First item attributes: {dir(first_item)}", file=sys.stderr)
            if hasattr(first_item, '__dict__'):
                print(f"DEBUG: First item dict: {first_item.__dict__}", file=sys.stderr)

    print(f"DEBUG: Extracted text lines: {all_text}", file=sys.stderr)
    if all_text:
        return extract_police_details(all_text)
    return {'clearance_type': 'police', 'id_number': None, 'full_name': None, 'address': None, 'birth_date': None, 'birth_place': None, 'citizenship': None, 'gender': None, 'status': None, 'success': False}
| def extract_ocr_lines_simple(image_path): | |
| # Fallback method with advanced features (matching NBI script fallback) | |
| with redirect_stdout(sys.stderr), redirect_stderr(sys.stderr): | |
| ocr = PaddleOCR( | |
| use_doc_orientation_classify=True, | |
| use_doc_unwarping=True, | |
| use_textline_orientation=True, | |
| lang='en' | |
| ) | |
| results = ocr.ocr(image_path) | |
| # Debug: Print raw results structure for fallback method | |
| print(f"DEBUG (fallback): Raw OCR results type: {type(results)}", file=sys.stderr) | |
| print(f"DEBUG (fallback): Raw OCR results is None: {results is None}", file=sys.stderr) | |
| if results is not None: | |
| print(f"DEBUG (fallback): Raw OCR results length: {len(results) if isinstance(results, list) else 'N/A'}", file=sys.stderr) | |
| if isinstance(results, list) and len(results) > 0: | |
| print(f"DEBUG (fallback): First level item type: {type(results[0])}", file=sys.stderr) | |
| if isinstance(results[0], list) and len(results[0]) > 0: | |
| print(f"DEBUG (fallback): Second level first item: {str(results[0][0])[:200] if results[0][0] else 'None'}", file=sys.stderr) | |
| all_text = [] | |
| try: | |
| # Check if results contain OCRResult objects (new PaddleX format) | |
| if results and isinstance(results, list) and len(results) > 0: | |
| first_item = results[0] | |
| # Check if it's an OCRResult object by type name | |
| item_type_name = type(first_item).__name__ | |
| is_ocr_result = 'OCRResult' in item_type_name or 'ocr_result' in str(type(first_item)).lower() | |
| if is_ocr_result: | |
| print(f"DEBUG (fallback): Detected OCRResult object format (type: {item_type_name})", file=sys.stderr) | |
| # Inspect attributes | |
| attrs = dir(first_item) | |
| print(f"DEBUG (fallback): OCRResult attributes: {[a for a in attrs if not a.startswith('_')]}", file=sys.stderr) | |
| for ocr_result in results: | |
| # Try various possible attribute names for text | |
| text_found = False | |
| # First, try accessing as dictionary (OCRResult is dict-like) | |
| try: | |
| if hasattr(ocr_result, 'keys'): | |
| ocr_dict = dict(ocr_result) | |
| print(f"DEBUG (fallback): OCRResult as dict keys: {list(ocr_dict.keys())}", file=sys.stderr) | |
| # Look for common OCR result keys (rec_texts is the actual key in PaddleX OCRResult) | |
| for key in ['rec_texts', 'rec_text', 'dt_polys', 'ocr_text', 'text', 'texts', 'result', 'results', 'ocr_result', 'dt_boxes']: | |
| if key in ocr_dict: | |
| val = ocr_dict[key] | |
| print(f"DEBUG (fallback): Found key '{key}': {type(val)}, length: {len(val) if isinstance(val, list) else 'N/A'}", file=sys.stderr) | |
| if isinstance(val, list): | |
| # rec_texts is a list of strings directly | |
| if key == 'rec_texts': | |
| for text_item in val: | |
| if isinstance(text_item, str) and text_item.strip(): | |
| all_text.append(text_item.strip()) | |
| elif text_item: | |
| all_text.append(str(text_item)) | |
| if val: | |
| text_found = True | |
| else: | |
| # For other keys, try to extract text from nested structures | |
| for item in val: | |
| if isinstance(item, (list, tuple)) and len(item) >= 2: | |
| # Format: [[coords], (text, confidence)] | |
| text_part = item[1] | |
| if isinstance(text_part, (list, tuple)) and len(text_part) >= 1: | |
| all_text.append(str(text_part[0])) | |
| elif isinstance(item, str): | |
| all_text.append(item) | |
| if val: | |
| text_found = True | |
| elif isinstance(val, str) and val: | |
| all_text.append(val) | |
| text_found = True | |
| if text_found: | |
| break | |
| except Exception as e: | |
| print(f"DEBUG (fallback): Error accessing OCRResult as dict: {e}", file=sys.stderr) | |
| # Try json() method | |
| if not text_found: | |
| try: | |
| if hasattr(ocr_result, 'json'): | |
| json_data = ocr_result.json() | |
| print(f"DEBUG (fallback): OCRResult.json() type: {type(json_data)}", file=sys.stderr) | |
| if isinstance(json_data, dict): | |
| print(f"DEBUG (fallback): OCRResult.json() keys: {list(json_data.keys())}", file=sys.stderr) | |
| # Look for text in JSON (rec_texts is the actual key) | |
| for key in ['rec_texts', 'rec_text', 'dt_polys', 'ocr_text', 'text', 'texts', 'result', 'results']: | |
| if key in json_data: | |
| val = json_data[key] | |
| if isinstance(val, list): | |
| # rec_texts is a list of strings directly | |
| if key == 'rec_texts': | |
| for text_item in val: | |
| if isinstance(text_item, str) and text_item.strip(): | |
| all_text.append(text_item.strip()) | |
| elif text_item: | |
| all_text.append(str(text_item)) | |
| if val: | |
| text_found = True | |
| else: | |
| for item in val: | |
| if isinstance(item, (list, tuple)) and len(item) >= 2: | |
| text_part = item[1] | |
| if isinstance(text_part, (list, tuple)) and len(text_part) >= 1: | |
| all_text.append(str(text_part[0])) | |
| elif isinstance(item, str): | |
| all_text.append(item) | |
| if val: | |
| text_found = True | |
| elif isinstance(val, str) and val: | |
| all_text.append(val) | |
| text_found = True | |
| if text_found: | |
| break | |
| except Exception as e: | |
| print(f"DEBUG (fallback): Error calling json(): {e}", file=sys.stderr) | |
| # Try rec_text attribute | |
| if not text_found and hasattr(ocr_result, 'rec_text'): | |
| rec_text = ocr_result.rec_text | |
| print(f"DEBUG (fallback): Found rec_text attribute: {type(rec_text)}", file=sys.stderr) | |
| if isinstance(rec_text, list): | |
| all_text.extend([str(t) for t in rec_text if t]) | |
| text_found = True | |
| elif rec_text: | |
| all_text.append(str(rec_text)) | |
| text_found = True | |
| # Try text attribute | |
| if not text_found and hasattr(ocr_result, 'text'): | |
| text = ocr_result.text | |
| print(f"DEBUG (fallback): Found text attribute: {type(text)}", file=sys.stderr) | |
| if isinstance(text, list): | |
| all_text.extend([str(t) for t in text if t]) | |
| text_found = True | |
| elif text: | |
| all_text.append(str(text)) | |
| text_found = True | |
| # If still no text, print full structure for debugging | |
| if not text_found: | |
| print(f"DEBUG (fallback): Could not find text in OCRResult, trying to inspect structure", file=sys.stderr) | |
| try: | |
| print(f"DEBUG (fallback): OCRResult repr: {repr(ocr_result)[:500]}", file=sys.stderr) | |
| # Try to get all keys/items | |
| if hasattr(ocr_result, 'keys'): | |
| try: | |
| all_keys = list(ocr_result.keys()) | |
| print(f"DEBUG (fallback): All OCRResult keys: {all_keys}", file=sys.stderr) | |
| for key in all_keys: | |
| try: | |
| val = ocr_result[key] | |
| print(f"DEBUG (fallback): Key '{key}' type: {type(val)}, value preview: {str(val)[:100]}", file=sys.stderr) | |
| except: | |
| pass | |
| except: | |
| pass | |
| except Exception as e: | |
| print(f"DEBUG (fallback): Error inspecting structure: {e}", file=sys.stderr) | |
| else: | |
| # Old format - list of lists | |
| lines = results[0] if results and isinstance(results[0], list) else results | |
| print(f"DEBUG (fallback): Processing lines (old format), count: {len(lines) if isinstance(lines, list) else 'N/A'}", file=sys.stderr) | |
| for item in lines: | |
| if isinstance(item, (list, tuple)) and len(item) >= 2: | |
| meta = item[1] | |
| if isinstance(meta, (list, tuple)) and len(meta) >= 1: | |
| all_text.append(str(meta[0])) | |
| except Exception as e: | |
| print(f"DEBUG (fallback): Error processing OCR results: {str(e)}", file=sys.stderr) | |
| import traceback | |
| print(f"DEBUG (fallback): Traceback: {traceback.format_exc()}", file=sys.stderr) | |
| # Try to inspect the object attributes | |
| if results and isinstance(results, list) and len(results) > 0: | |
| first_item = results[0] | |
| print(f"DEBUG (fallback): First item attributes: {dir(first_item)}", file=sys.stderr) | |
| if hasattr(first_item, '__dict__'): | |
| print(f"DEBUG (fallback): First item dict: {first_item.__dict__}", file=sys.stderr) | |
| print(f"DEBUG (fallback): Extracted text lines: {all_text}", file=sys.stderr) | |
| return extract_police_details(all_text) if all_text else {'clearance_type': 'police', 'id_number': None, 'full_name': None, 'address': None, 'birth_date': None, 'birth_place': None, 'citizenship': None, 'gender': None, 'status': None, 'success': False} | |
# Main Execution
#
# Script entry point: takes the image URL from argv[1], downloads it to a
# temporary file, runs OCR on it and writes exactly one JSON document to the
# real stdout. sys.stdout was pointed at stderr at the top of this file, so it
# is switched back to `original_stdout` only at the moment the JSON is emitted;
# all DEBUG output goes to stderr.
#
# Exit status: 1 when no URL is supplied or when any step raises; otherwise
# the script falls through and exits normally (even if OCR reports
# success == False in the JSON).
if len(sys.argv) < 2:
    sys.stdout = original_stdout
    print(json.dumps({"success": False, "error": "No image URL provided"}))
    sys.exit(1)

image_url = sys.argv[1]
print(f"DEBUG: Processing Police Clearance image URL: {image_url}", file=sys.stderr)

# Single source of truth for the temp file so download and cleanup agree.
temp_image_path = 'temp_police_image.jpg'

try:
    image_path = download_image(image_url, temp_image_path)
    print(f"DEBUG: Image downloaded to: {image_path}", file=sys.stderr)

    # Try the original OCR method first
    ocr_results = extract_ocr_lines(image_path)
    print(f"DEBUG: OCR results from extract_ocr_lines: {ocr_results}", file=sys.stderr)

    # If original method fails, try simple method with advanced features
    if not ocr_results['success']:
        print("DEBUG: Original method failed, trying simple method with advanced features", file=sys.stderr)
        ocr_results = extract_ocr_lines_simple(image_path)
        print(f"DEBUG: OCR results from extract_ocr_lines_simple: {ocr_results}", file=sys.stderr)

    response = {
        "success": ocr_results['success'],
        "data": ocr_results
    }
    # Serialise before touching stdout so a serialisation error cannot leave
    # a partial document on the real stdout.
    payload = json.dumps(response)

    sys.stdout = original_stdout
    sys.stdout.write(payload)
    sys.stdout.flush()
except Exception as e:
    # Top-level boundary: report any failure as a JSON error document.
    sys.stdout = original_stdout
    sys.stdout.write(json.dumps({"success": False, "error": str(e)}))
    sys.stdout.flush()
    sys.exit(1)
finally:
    # Best-effort cleanup, done in one place only. A failure to delete the
    # temp file must never turn a successful OCR run into an error response,
    # so it is logged to stderr instead of being raised.
    try:
        os.remove(temp_image_path)
    except FileNotFoundError:
        # Nothing was downloaded (or it was already removed).
        pass
    except OSError as cleanup_error:
        print(f"DEBUG: Could not remove temp image {temp_image_path}: {cleanup_error}", file=sys.stderr)