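# NBI clearance OCR extractor: downloads a clearance image from the URL passed as the
# first command-line argument, runs PaddleOCR on it, extracts the NBI ID number,
# full name, birth date, and LIT fields, and writes a single JSON object to stdout
# (all diagnostic output goes to stderr).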
import sys, json, os, glob, requests
import re
import time
from contextlib import redirect_stdout, redirect_stderr

# Immediately redirect all output to stderr except for our final JSON
original_stdout = sys.stdout
sys.stdout = sys.stderr

# Suppress all PaddleOCR output
os.environ['PADDLEOCR_LOG_LEVEL'] = 'ERROR'
os.environ['QT_QPA_PLATFORM'] = 'offscreen'
os.environ['DISPLAY'] = ':99'

# Import PaddleOCR after setting the environment variables
from paddleocr import PaddleOCR
def download_image(url, output_path='temp_image.jpg'):
    # Remove any existing temp file
    if os.path.exists(output_path):
        os.remove(output_path)
    # Add a cache-busting query parameter
    timestamp = int(time.time())
    if '?' in url:
        url += f'&t={timestamp}'
    else:
        url += f'?t={timestamp}'
    # Add headers to prevent caching
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Cache-Control': 'no-cache, no-store, must-revalidate',
        'Pragma': 'no-cache',
        'Expires': '0'
    }
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()
    image_data = response.content
    # Save the downloaded image to disk
    with open(output_path, 'wb') as f:
        f.write(image_data)
    return output_path
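# Example (hypothetical URL): download_image("https://example.com/clearance.jpg")
# fetches the image with cache-busting headers and returns 'temp_image.jpg'.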
# OCR helper to extract the NBI ID NO, Name, Birth Date, and LIT fields from recognized text lines
def extract_nbi_id(lines):
    nbi_id = None
    full_name = None
    birth_date = None
    lit = None  # LIT field (Last Issued To or similar)

    # Clean lines - convert to strings and strip whitespace
    cleaned_lines = [str(line).strip() for line in lines]

    # First pass: look for the NBI ID pattern in all lines (prioritize exact matches).
    # This helps catch IDs that appear on lines without labels.
    for line in cleaned_lines:
        line_clean = line.strip()
        # Look for the NBI ID pattern with a hyphen first (most reliable)
        if not nbi_id:
            hyphen_pattern = r'\b([A-Z0-9]{8,12}-[A-Z0-9]{8,12})\b'
            match = re.search(hyphen_pattern, line_clean)
            if match:
                candidate = match.group(1)
                # Validate the length and that it's not part of an address
                if 17 <= len(candidate) <= 25:
                    # The line shouldn't have too many words (NBI IDs are usually standalone)
                    line_words = line_clean.split()
                    if len(line_words) <= 3:  # Usually 1-2 words max (the ID itself)
                        # Additional validation: should have a mix of letters and numbers
                        has_letters = bool(re.search(r'[A-Z]', candidate))
                        has_numbers = bool(re.search(r'[0-9]', candidate))
                        if has_letters and has_numbers:
                            nbi_id = candidate
                            print(f"DEBUG: Found NBI ID (first pass, hyphen): {nbi_id}", file=sys.stderr)
                            break

    # Second pass: extract the other fields and refine the ID if needed
    for i, line in enumerate(cleaned_lines):
        line_upper = line.upper().strip()
        line_clean = line.strip()

        # Extract the NBI ID number (if not found in the first pass)
        if not nbi_id:
            # Look for the "NBI ID NO:" label in its various formats
            if ("NBI ID NO:" in line_upper or "NBIIDNO" in line_upper or "NBI ID NO" in line_upper or
                    "NBI ID NUMBER" in line_upper or "NBIID NUMBER" in line_upper):
                # Extract the ID after the colon
                if ":" in line:
                    parts = line.split(':', 1)
                    if len(parts) > 1:
                        id_candidate = parts[1].strip()
                        # Clean up the ID (remove internal spaces)
                        id_candidate = re.sub(r'\s+', '', id_candidate)
                        if len(id_candidate) > 5:  # A valid ID should be longer
                            nbi_id = id_candidate
                            print(f"DEBUG: Found NBI ID (same line): {nbi_id}", file=sys.stderr)
                            continue
                # Also check whether one of the next lines contains the ID
                # (in case it was recognized as a separate line)
                if i < len(cleaned_lines) - 1:
                    for j in range(1, min(3, len(cleaned_lines) - i)):
                        next_line = cleaned_lines[i + j].strip()
                        # Skip if it's clearly not an ID (too short, contains labels)
                        if len(next_line) < 5 or any(label in next_line.upper() for label in ['NAME', 'DATE', 'BIRTH', 'CLEARANCE']):
                            continue
                        # Check if it looks like an NBI ID (alphanumeric, reasonable length)
                        if re.match(r'^[A-Z0-9-]{15,25}$', next_line.replace(' ', '')):
                            nbi_id = next_line.replace(' ', '')
                            print(f"DEBUG: Found NBI ID (next line): {nbi_id}", file=sys.stderr)
                            break
                    if nbi_id:
                        continue

            # Look for the NBI ID pattern: alphanumeric with one hyphen.
            # Format examples: B450JRLR0B-RC248667, HGUR87H38D-U47204A873
            # First part: 8-12 chars, hyphen, second part: 8-12 chars
            # Total length: 17-25 characters (including the hyphen)

            # Priority 1: pattern with a hyphen (most common format),
            # e.g. B450JRLR0B-RC248667
            hyphen_pattern = r'\b([A-Z0-9]{8,12}-[A-Z0-9]{8,12})\b'
            match = re.search(hyphen_pattern, line_clean)
            if match:
                candidate = match.group(1)
                # Validate: should be 17-25 chars total
                if 17 <= len(candidate) <= 25:
                    # Make sure it isn't matching address parts or other text, and that the
                    # line doesn't have too many words (NBI IDs are usually standalone)
                    line_words = line_clean.split()
                    # Additional validation: should have a mix of letters and numbers
                    has_letters = bool(re.search(r'[A-Z]', candidate))
                    has_numbers = bool(re.search(r'[0-9]', candidate))
                    if (has_letters and has_numbers and
                            not any(word in candidate.upper() for word in ['STREET', 'ST', 'AVENUE', 'AVE', 'BRGY', 'BARANGAY', 'CITY', 'PHASE', 'DOMINGO', 'CAINTA', 'RIZAL']) and
                            len(line_words) <= 3):  # An NBI ID is usually on its own line or with 1-2 other words
                        nbi_id = candidate
                        print(f"DEBUG: Found NBI ID (hyphen pattern): {nbi_id}", file=sys.stderr)
                        continue

            # Priority 2: pattern with a space instead of a hyphen
            space_pattern = r'\b([A-Z0-9]{8,12})\s+([A-Z0-9]{8,12})\b'
            match = re.search(space_pattern, line_clean)
            if match:
                part1, part2 = match.groups()
                candidate = f"{part1}-{part2}"
                if 17 <= len(candidate) <= 25:
                    has_letters = bool(re.search(r'[A-Z]', candidate))
                    has_numbers = bool(re.search(r'[0-9]', candidate))
                    if (has_letters and has_numbers and
                            not any(word in candidate.upper() for word in ['STREET', 'ST', 'AVENUE', 'AVE', 'BRGY', 'BARANGAY', 'CITY', 'PHASE', 'DOMINGO', 'CAINTA', 'RIZAL'])):
                        nbi_id = candidate
                        print(f"DEBUG: Found NBI ID (space pattern): {nbi_id}", file=sys.stderr)
                        continue

            # Priority 3: pattern without a hyphen or space (all run together),
            # only if we haven't found one yet and it's a reasonable length
            no_hyphen_pattern = r'\b([A-Z0-9]{17,25})\b'
            match = re.search(no_hyphen_pattern, line_clean)
            if match:
                candidate = match.group(1)
                # Make sure it doesn't contain common address words and has both letters and numbers
                has_letters = bool(re.search(r'[A-Z]', candidate))
                has_numbers = bool(re.search(r'[0-9]', candidate))
                if (has_letters and has_numbers and
                        not any(word in candidate.upper() for word in ['STREET', 'ST', 'AVENUE', 'AVE', 'BRGY', 'BARANGAY', 'CITY', 'PHASE', 'ADDRESS', 'DOMINGO', 'CAINTA', 'RIZAL', 'ATRSTORUARPHASEABRGY'])):
                    # Try to split it intelligently (usually split in the middle)
                    mid = len(candidate) // 2
                    # Try splitting at various points around the middle
                    for split_point in range(mid - 2, mid + 3):
                        if 8 <= split_point <= len(candidate) - 8:
                            part1 = candidate[:split_point]
                            part2 = candidate[split_point:]
                            if 8 <= len(part1) <= 12 and 8 <= len(part2) <= 12:
                                nbi_id = f"{part1}-{part2}"
                                print(f"DEBUG: Found NBI ID (no hyphen, split): {nbi_id}", file=sys.stderr)
                                break
                    if nbi_id:
                        continue

        # Extract the full name - look for name values after the "NAME" label.
        # Handle cases where the name is on the same line or on one of the next lines.
        if not full_name:
            # Check if the line contains the "NAME" label
            if "NAME" in line_upper and ("NBI" not in line_upper or "ID" not in line_upper):
                # First, check if the name is on the same line after a colon
                if ":" in line:
                    parts = line.split(':', 1)
                    if len(parts) > 1:
                        name_part = parts[1].strip()
                        if re.search(r'[A-Za-z]{2,}', name_part) and len(name_part) > 2:
                            full_name = name_part
                            print(f"DEBUG: Found full name (same line): {full_name}", file=sys.stderr)
                            continue
                # Check the next few lines for the name value
                for j in range(1, min(5, len(cleaned_lines) - i)):
                    next_line = cleaned_lines[i + j].strip()
                    next_upper = next_line.upper()
                    # Skip if it's another label or an ID number
                    if any(label in next_upper for label in ['NBI', 'ID', 'NO', 'DATE', 'BIRTH', 'CLEARANCE', 'REPUBLIC', 'PHILIPPINES', 'NATIONAL']):
                        continue
                    # Check if it looks like a name (has letters, may have commas, not all numbers)
                    if re.search(r'[A-Za-z]{2,}', next_line) and not re.match(r'^\d+$', next_line) and len(next_line) > 2:
                        # Additional check: make sure it's not just a single word that's too short
                        if len(next_line.split()) >= 1 and len(next_line) > 3:
                            full_name = next_line
                            print(f"DEBUG: Found full name: {full_name}", file=sys.stderr)
                            break

        # Extract the birth date - look for date values after the "DATE OF BIRTH" or "BIRTH DATE" label
        if not birth_date:
            if ("DATE OF BIRTH" in line_upper or "BIRTH DATE" in line_upper or "BIRTHDATE" in line_upper or
                    ("DATE" in line_upper and "BIRTH" in line_upper)):
                # First, check if the date is on the same line after a colon
                if ":" in line:
                    parts = line.split(':', 1)
                    if len(parts) > 1:
                        date_part = parts[1].strip()
                        if (re.search(r'(JANUARY|FEBRUARY|MARCH|APRIL|MAY|JUNE|JULY|AUGUST|SEPTEMBER|OCTOBER|NOVEMBER|DECEMBER|JAN|FEB|MAR|APR|JUN|JUL|AUG|SEP|OCT|NOV|DEC)', date_part.upper()) or
                                re.search(r'\d{1,2}[/-]\d{1,2}[/-]\d{4}', date_part) or
                                re.search(r'\d{1,2}\s+[A-Z]{3}\s+\d{4}', date_part)):
                            birth_date = date_part
                            print(f"DEBUG: Found birth date (same line): {birth_date}", file=sys.stderr)
                            continue
                # Check the next few lines for the date value
                for j in range(1, min(5, len(cleaned_lines) - i)):
                    next_line = cleaned_lines[i + j].strip()
                    next_upper = next_line.upper()
                    # Skip if it's another label
                    if any(label in next_upper for label in ['NBI', 'ID', 'NO', 'NAME', 'CLEARANCE', 'REPUBLIC', 'PHILIPPINES', 'NATIONAL']):
                        continue
                    # Check if it looks like a date (contains a month name or a date pattern)
                    if (re.search(r'(JANUARY|FEBRUARY|MARCH|APRIL|MAY|JUNE|JULY|AUGUST|SEPTEMBER|OCTOBER|NOVEMBER|DECEMBER|JAN|FEB|MAR|APR|JUN|JUL|AUG|SEP|OCT|NOV|DEC)', next_upper) or
                            re.search(r'\d{1,2}[/-]\d{1,2}[/-]\d{4}', next_line) or
                            re.search(r'\d{1,2}\s+[A-Z]{3}\s+\d{4}', next_line)):
                        birth_date = next_line
                        print(f"DEBUG: Found birth date: {birth_date}", file=sys.stderr)
                        break

        # Extract the LIT field - look for the "LIT" label
        if not lit:
            # Look for the "LIT" label (could be "LIT:", "LIT", or part of another label)
            if "LIT" in line_upper and ("ID" not in line_upper or "NBI" not in line_upper):
                # Check if the LIT value is on the same line after a colon
                if ":" in line:
                    parts = line.split(':', 1)
                    if len(parts) > 1:
                        lit_part = parts[1].strip()
                        if len(lit_part) > 0:
                            lit = lit_part
                            print(f"DEBUG: Found LIT (same line): {lit}", file=sys.stderr)
                            continue
                # Check the next few lines for the LIT value
                for j in range(1, min(4, len(cleaned_lines) - i)):
                    next_line = cleaned_lines[i + j].strip()
                    next_upper = next_line.upper()
                    # Skip if it's another label
                    if any(label in next_upper for label in ['NBI', 'ID', 'NO', 'NAME', 'DATE', 'BIRTH', 'CLEARANCE', 'REPUBLIC', 'PHILIPPINES', 'NATIONAL', 'VALID', 'UNTIL']):
                        continue
                    # Accept the first non-label, non-empty line as the LIT value
                    # (it could be a date, a name, or other text)
                    if len(next_line) > 0:
                        lit = next_line
                        print(f"DEBUG: Found LIT: {lit}", file=sys.stderr)
                        break

    return {
        'clearance_type': 'nbi',
        'id_number': nbi_id,
        'full_name': full_name,
        'birth_date': birth_date,
        'lit': lit,
        'success': nbi_id is not None or full_name is not None
    }
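# Illustrative call with hypothetical OCR lines (the name and date are made up;
# the ID follows the format examples documented above):
#   extract_nbi_id(["NBI CLEARANCE", "NBI ID NO:", "B450JRLR0B-RC248667",
#                   "NAME: DELA CRUZ, JUAN", "DATE OF BIRTH: JANUARY 1, 1990"])
# would return:
#   {'clearance_type': 'nbi', 'id_number': 'B450JRLR0B-RC248667',
#    'full_name': 'DELA CRUZ, JUAN', 'birth_date': 'JANUARY 1, 1990',
#    'lit': None, 'success': True}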
def extract_ocr_lines_simple(image_path):
    # Fallback OCR: retry with PaddleOCR's document preprocessing enabled
    with redirect_stdout(sys.stderr), redirect_stderr(sys.stderr):
        ocr = PaddleOCR(
            use_doc_orientation_classify=True,  # Enable orientation detection
            use_doc_unwarping=True,             # Enable document unwarping
            use_textline_orientation=True,      # Enable text line orientation
            lang='en'                           # Set language to English
        )
        try:
            results = ocr.predict(image_path)
        except Exception as e:
            print(f"DEBUG: predict() failed: {e}, trying ocr()", file=sys.stderr)
            if hasattr(ocr, 'ocr'):
                results = ocr.ocr(image_path)
            else:
                results = None

    all_text = []
    try:
        # Handle both the old format (nested list) and the new format (OCRResult object)
        if results and isinstance(results, list) and len(results) > 0:
            first_item = results[0]
            item_type_name = type(first_item).__name__
            is_ocr_result = 'OCRResult' in item_type_name or 'ocr_result' in str(type(first_item)).lower()
            if is_ocr_result:
                print(f"DEBUG: Detected OCRResult object format (type: {item_type_name})", file=sys.stderr)
                # Access the OCRResult like a dictionary
                try:
                    if hasattr(first_item, 'keys'):
                        ocr_dict = dict(first_item)
                        # Look for the rec_texts key
                        if 'rec_texts' in ocr_dict:
                            rec_texts = ocr_dict['rec_texts']
                            if isinstance(rec_texts, list):
                                all_text = [str(t) for t in rec_texts if t]
                                print(f"DEBUG: Extracted {len(all_text)} text lines from rec_texts", file=sys.stderr)
                except Exception as e:
                    print(f"DEBUG: Error accessing OCRResult: {e}", file=sys.stderr)
            else:
                # Old format - a nested list of [box, (text, score)] entries
                lines = results[0] if results and isinstance(results[0], list) else results
                for item in lines:
                    if isinstance(item, (list, tuple)) and len(item) >= 2:
                        meta = item[1]
                        if isinstance(meta, (list, tuple)) and len(meta) >= 1:
                            all_text.append(str(meta[0]))
    except Exception as e:
        print(f"DEBUG: Error processing OCR results: {str(e)}", file=sys.stderr)

    return extract_nbi_id(all_text) if all_text else {'clearance_type': 'nbi', 'id_number': None, 'full_name': None, 'birth_date': None, 'lit': None, 'success': False}
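# Note: the main block below calls extract_ocr_lines (preprocessing disabled) first and
# only falls back to extract_ocr_lines_simple above when that pass finds neither an ID nor a name.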
def extract_ocr_lines(image_path):
    # Check that the file exists
    if not os.path.exists(image_path):
        return {'clearance_type': 'nbi', 'id_number': None, 'full_name': None, 'birth_date': None, 'lit': None, 'success': False}

    # Ensure the output directory exists and clear any previous output files
    os.makedirs("output", exist_ok=True)
    for old_file in glob.glob("output/*"):
        os.remove(old_file)

    with redirect_stdout(sys.stderr), redirect_stderr(sys.stderr):
        ocr = PaddleOCR(
            use_doc_orientation_classify=False,
            use_doc_unwarping=False,
            use_textline_orientation=False,
            lang='en'
        )
        try:
            results = ocr.predict(image_path)
        except Exception as e:
            print(f"DEBUG: predict() failed: {e}, trying ocr()", file=sys.stderr)
            if hasattr(ocr, 'ocr'):
                results = ocr.ocr(image_path)
            else:
                results = None

    # Process the OCR results - handle both the old format (nested list) and the new format (OCRResult object)
    all_text = []
    try:
        if results and isinstance(results, list) and len(results) > 0:
            first_item = results[0]
            item_type_name = type(first_item).__name__
            is_ocr_result = 'OCRResult' in item_type_name or 'ocr_result' in str(type(first_item)).lower()
            if is_ocr_result:
                print(f"DEBUG: Detected OCRResult object format (type: {item_type_name})", file=sys.stderr)
                # Access the OCRResult like a dictionary
                try:
                    if hasattr(first_item, 'keys'):
                        ocr_dict = dict(first_item)
                        # Look for the rec_texts key
                        if 'rec_texts' in ocr_dict:
                            rec_texts = ocr_dict['rec_texts']
                            if isinstance(rec_texts, list):
                                all_text = [str(t) for t in rec_texts if t]
                                print(f"DEBUG: Extracted {len(all_text)} text lines from rec_texts", file=sys.stderr)
                except Exception as e:
                    print(f"DEBUG: Error accessing OCRResult: {e}", file=sys.stderr)
            else:
                # Old format - a nested list of [box, (text, score)] entries
                lines = results[0] if results and isinstance(results[0], list) else results
                for item in lines:
                    if isinstance(item, (list, tuple)) and len(item) >= 2:
                        meta = item[1]
                        if isinstance(meta, (list, tuple)) and len(meta) >= 1:
                            all_text.append(str(meta[0]))
    except Exception as e:
        print(f"DEBUG: Error processing OCR results: {str(e)}", file=sys.stderr)
        import traceback
        print(f"DEBUG: Traceback: {traceback.format_exc()}", file=sys.stderr)

    print(f"DEBUG: Extracted text lines: {all_text}", file=sys.stderr)
    return extract_nbi_id(all_text) if all_text else {'clearance_type': 'nbi', 'id_number': None, 'full_name': None, 'birth_date': None, 'lit': None, 'success': False}
# Main
if len(sys.argv) < 2:
    sys.stdout = original_stdout
    print(json.dumps({"success": False, "error": "No image URL provided"}))
    sys.exit(1)

image_url = sys.argv[1]
print(f"DEBUG: Processing NBI image URL: {image_url}", file=sys.stderr)

try:
    image_path = download_image(image_url, 'temp_image.jpg')
    print(f"DEBUG: Image downloaded to: {image_path}", file=sys.stderr)

    # Try the default OCR method first
    ocr_results = extract_ocr_lines(image_path)
    print(f"DEBUG: OCR results from extract_ocr_lines: {ocr_results}", file=sys.stderr)

    # If the default method fails, fall back to the preprocessing-enabled method
    if not ocr_results['success']:
        print("DEBUG: Original method failed, trying simple method", file=sys.stderr)
        ocr_results = extract_ocr_lines_simple(image_path)
        print(f"DEBUG: OCR results from extract_ocr_lines_simple: {ocr_results}", file=sys.stderr)

    # Clean up the temporary file
    if os.path.exists(image_path):
        os.remove(image_path)

    # Create the response object
    response = {
        "success": ocr_results['success'],
        "ocr_results": ocr_results
    }

    # Restore stdout and print only the JSON response
    sys.stdout = original_stdout
    sys.stdout.write(json.dumps(response))
    sys.stdout.flush()
except Exception as e:
    # Restore stdout for the error JSON
    sys.stdout = original_stdout
    sys.stdout.write(json.dumps({"success": False, "error": str(e)}))
    sys.stdout.flush()
    sys.exit(1)
finally:
    # Remove any leftover temp file
    try:
        if os.path.exists('temp_image.jpg'):
            os.remove('temp_image.jpg')
    except OSError:
        pass
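# Example invocation (the script file name is assumed here; use the Space's actual entry point):
#   python app.py "https://example.com/nbi_clearance.jpg"
# Expected stdout on success is a single JSON object, for example:
#   {"success": true, "ocr_results": {"clearance_type": "nbi",
#    "id_number": "B450JRLR0B-RC248667", "full_name": "DELA CRUZ, JUAN",
#    "birth_date": "JANUARY 1, 1990", "lit": null, "success": true}}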