"""Extract fields from an NBI clearance image and print them as JSON.

Usage: pass the image URL as the first command-line argument. The image is
downloaded, run through PaddleOCR, and the recognised text lines are parsed
for the NBI ID number, full name, birth date and LIT field.

Everything except the final JSON document is written to stderr so that a
caller reading stdout receives exactly one JSON object.
"""
import glob
import json
import os
import re
import shutil
import sys
import time
import traceback
from contextlib import redirect_stdout, redirect_stderr

import requests

# Immediately redirect all output to stderr except for our final JSON.
original_stdout = sys.stdout
sys.stdout = sys.stderr

# These must be set before PaddleOCR is imported.
os.environ['PADDLEOCR_LOG_LEVEL'] = 'ERROR'
os.environ['QT_QPA_PLATFORM'] = 'offscreen'
os.environ['DISPLAY'] = ':99'

from paddleocr import PaddleOCR  # noqa: E402  (import after env setup)

TEMP_IMAGE_PATH = 'temp_image.jpg'

# ID shapes, e.g. B450JRLR0B-RC248667 or HGUR87H38D-U47204A873.
_HYPHEN_ID_RE = re.compile(r'\b([A-Z0-9]{8,12}-[A-Z0-9]{8,12})\b')
_SPACED_ID_RE = re.compile(r'\b([A-Z0-9]{8,12})\s+([A-Z0-9]{8,12})\b')
_SOLID_ID_RE = re.compile(r'\b([A-Z0-9]{17,25})\b')
_NEXT_LINE_ID_RE = re.compile(r'^[A-Z0-9-]{15,25}$')

_MONTH_RE = re.compile(
    r'(JANUARY|FEBRUARY|MARCH|APRIL|MAY|JUNE|JULY|AUGUST|SEPTEMBER|OCTOBER'
    r'|NOVEMBER|DECEMBER|JAN|FEB|MAR|APR|JUN|JUL|AUG|SEP|OCT|NOV|DEC)'
)
_NUMERIC_DATE_RE = re.compile(r'\d{1,2}[/-]\d{1,2}[/-]\d{4}')
_DAY_MON_YEAR_RE = re.compile(r'\d{1,2}\s+[A-Z]{3}\s+\d{4}')
_NAME_LETTERS_RE = re.compile(r'[A-Za-z]{2,}')

# "ID" and "NO" are matched as whole words only: as plain substrings they
# also hit real values such as "NOVEMBER 12, 1990", "ANTONIO" or "DAVID".
_SHORT_LABEL_RE = re.compile(r'\b(?:ID|NO)\b')

_ID_LABELS = ('NBI ID NO', 'NBIIDNO', 'NBI ID NUMBER', 'NBIID NUMBER')
_ID_SKIP_LABELS = ('NAME', 'DATE', 'BIRTH', 'CLEARANCE')
_NAME_SKIP_LABELS = ('NBI', 'DATE', 'BIRTH', 'CLEARANCE', 'REPUBLIC',
                     'PHILIPPINES', 'NATIONAL')
_DATE_SKIP_LABELS = ('NBI', 'NAME', 'CLEARANCE', 'REPUBLIC', 'PHILIPPINES',
                     'NATIONAL')
_LIT_SKIP_LABELS = ('NBI', 'NAME', 'DATE', 'BIRTH', 'CLEARANCE', 'REPUBLIC',
                    'PHILIPPINES', 'NATIONAL', 'VALID', 'UNTIL')

# Substrings that mark an ID candidate as a misread address fragment.
_ADDRESS_WORDS = ('STREET', 'ST', 'AVENUE', 'AVE', 'BRGY', 'BARANGAY', 'CITY',
                  'PHASE', 'DOMINGO', 'CAINTA', 'RIZAL')
_ADDRESS_WORDS_SOLID = _ADDRESS_WORDS + ('ADDRESS', 'ATRSTORUARPHASEABRGY')


def download_image(url, output_path='temp_image.jpg'):
    """Download *url* to *output_path* and return *output_path*.

    Any existing file at *output_path* is removed first. A ``t=<unix time>``
    query parameter and no-cache headers are added to the request.

    Raises whatever ``requests.get`` / ``raise_for_status`` raise on network
    failure or an HTTP error status.
    """
    if os.path.exists(output_path):
        os.remove(output_path)

    # Cache-busting query parameter.
    separator = '&' if '?' in url else '?'
    url += f'{separator}t={int(time.time())}'

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Cache-Control': 'no-cache, no-store, must-revalidate',
        'Pragma': 'no-cache',
        'Expires': '0',
    }
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()

    with open(output_path, 'wb') as f:
        f.write(response.content)
    return output_path


def _empty_result():
    """Return the result dict used when nothing could be extracted."""
    return {
        'clearance_type': 'nbi',
        'id_number': None,
        'full_name': None,
        'birth_date': None,
        'lit': None,
        'success': False,
    }


def _has_letters_and_digits(candidate):
    """Return True if *candidate* has an uppercase letter and a digit."""
    return bool(re.search(r'[A-Z]', candidate)) and bool(re.search(r'[0-9]', candidate))


def _contains_any(text, words):
    """Return True if any of *words* occurs as a substring of *text*."""
    return any(word in text for word in words)


def _is_label_line(text_upper, substrings):
    """Return True if *text_upper* looks like a field label, not a value."""
    return _contains_any(text_upper, substrings) or bool(_SHORT_LABEL_RE.search(text_upper))


def _after_colon(line):
    """Return the stripped text after the first ':' in *line*, or None."""
    if ':' not in line:
        return None
    return line.split(':', 1)[1].strip()


def _scan_following(lines, index, window, skip_substrings, accept):
    """Return the first acceptable value on the lines after *index*.

    Looks at up to ``window - 1`` following lines, skips those that look like
    labels, and returns the first one for which ``accept(text)`` is true.
    Returns None when nothing qualifies.
    """
    for offset in range(1, min(window, len(lines) - index)):
        text = lines[index + offset].strip()
        if _is_label_line(text.upper(), skip_substrings):
            continue
        if accept(text):
            return text
    return None


def _looks_like_name(text):
    """Return True if *text* has a run of 2+ letters and more than 3 chars."""
    return bool(_NAME_LETTERS_RE.search(text)) and len(text) > 3


def _looks_like_date(text):
    """Return True if *text* contains a month name or a date pattern."""
    return bool(
        _MONTH_RE.search(text.upper())
        or _NUMERIC_DATE_RE.search(text)
        or _DAY_MON_YEAR_RE.search(text)
    )


def _standalone_hyphenated_id(line):
    """Return a hyphenated ID found on a near-standalone line, else None.

    The line may hold at most three whitespace-separated words, and the
    candidate must mix letters and digits.
    """
    match = _HYPHEN_ID_RE.search(line)
    if not match:
        return None
    candidate = match.group(1)
    if not 17 <= len(candidate) <= 25:
        return None
    if len(line.split()) > 3 or not _has_letters_and_digits(candidate):
        return None
    return candidate


def _labelled_id(lines, index):
    """Return the ID attached to an "NBI ID NO"-style label at *index*.

    Checks the text after a colon on the same line first, then the next two
    lines. Returns None if the line has no such label or no ID follows it.
    """
    line = lines[index]
    if not _contains_any(line.upper(), _ID_LABELS):
        return None

    inline = _after_colon(line)
    if inline is not None:
        inline = re.sub(r'\s+', '', inline)
        if len(inline) > 5:  # anything shorter is not a plausible ID
            print(f"DEBUG: Found NBI ID (same line): {inline}", file=sys.stderr)
            return inline

    for offset in range(1, min(3, len(lines) - index)):
        following = lines[index + offset].strip()
        if len(following) < 5 or _contains_any(following.upper(), _ID_SKIP_LABELS):
            continue
        compact = following.replace(' ', '')
        if _NEXT_LINE_ID_RE.match(compact):
            print(f"DEBUG: Found NBI ID (next line): {compact}", file=sys.stderr)
            return compact
    return None


def _unlabelled_id(line):
    """Return an ID recognised purely by shape on *line*, else None.

    Only the space-separated and the unseparated forms are tried here; the
    hyphenated form is handled by the first pass of ``extract_nbi_id``, which
    accepts a superset of what a hyphen check at this point could accept.
    """
    # Two 8-12 character halves separated by whitespace instead of a hyphen.
    match = _SPACED_ID_RE.search(line)
    if match:
        candidate = '-'.join(match.groups())
        if (17 <= len(candidate) <= 25
                and _has_letters_and_digits(candidate)
                and not _contains_any(candidate.upper(), _ADDRESS_WORDS)):
            print(f"DEBUG: Found NBI ID (space pattern): {candidate}", file=sys.stderr)
            return candidate

    # One unbroken 17-25 character token: split it near the middle.
    match = _SOLID_ID_RE.search(line)
    if match:
        candidate = match.group(1)
        if (_has_letters_and_digits(candidate)
                and not _contains_any(candidate.upper(), _ADDRESS_WORDS_SOLID)):
            mid = len(candidate) // 2
            for split_point in range(mid - 2, mid + 3):
                if not 8 <= split_point <= len(candidate) - 8:
                    continue
                head, tail = candidate[:split_point], candidate[split_point:]
                if 8 <= len(head) <= 12 and 8 <= len(tail) <= 12:
                    found = f"{head}-{tail}"
                    print(f"DEBUG: Found NBI ID (no hyphen, split): {found}", file=sys.stderr)
                    return found
    return None


def extract_nbi_id(lines):
    """Parse OCR text lines into NBI clearance fields.

    Args:
        lines: iterable of recognised text lines; each item is converted with
            ``str()`` and stripped.

    Returns:
        dict with keys ``clearance_type`` (always ``'nbi'``), ``id_number``,
        ``full_name``, ``birth_date``, ``lit`` and ``success``. Fields that
        were not found are None; ``success`` is True when an ID number or a
        full name was found.
    """
    nbi_id = None
    full_name = None
    birth_date = None
    lit = None

    cleaned_lines = [str(line).strip() for line in lines]

    # First pass: a hyphenated ID on a near-standalone line is the most
    # reliable signal, so look for it on every line before anything else.
    for line in cleaned_lines:
        nbi_id = _standalone_hyphenated_id(line)
        if nbi_id:
            print(f"DEBUG: Found NBI ID (first pass, hyphen): {nbi_id}", file=sys.stderr)
            break

    # Second pass: remaining fields, plus fallback strategies for the ID.
    for i, line in enumerate(cleaned_lines):
        line_upper = line.upper()

        if not nbi_id:
            nbi_id = _labelled_id(cleaned_lines, i) or _unlabelled_id(line)
            if nbi_id:
                # A line that yielded the ID is not examined for other fields.
                continue

        # Full name: value after a "NAME" label (but not an "NBI ... ID" line).
        if not full_name and "NAME" in line_upper and not ("NBI" in line_upper and "ID" in line_upper):
            inline = _after_colon(line)
            if inline is not None and _NAME_LETTERS_RE.search(inline) and len(inline) > 2:
                full_name = inline
                print(f"DEBUG: Found full name (same line): {full_name}", file=sys.stderr)
                continue
            full_name = _scan_following(cleaned_lines, i, 5, _NAME_SKIP_LABELS, _looks_like_name)
            if full_name:
                print(f"DEBUG: Found full name: {full_name}", file=sys.stderr)

        # Birth date: value after a label mentioning both DATE and BIRTH.
        if not birth_date and (
                "BIRTHDATE" in line_upper
                or ("DATE" in line_upper and "BIRTH" in line_upper)):
            inline = _after_colon(line)
            if inline is not None and _looks_like_date(inline):
                birth_date = inline
                print(f"DEBUG: Found birth date (same line): {birth_date}", file=sys.stderr)
                continue
            birth_date = _scan_following(cleaned_lines, i, 5, _DATE_SKIP_LABELS, _looks_like_date)
            if birth_date:
                print(f"DEBUG: Found birth date: {birth_date}", file=sys.stderr)

        # LIT: value after any line containing "LIT" (but not an NBI ID line).
        if not lit and "LIT" in line_upper and not ("ID" in line_upper and "NBI" in line_upper):
            inline = _after_colon(line)
            if inline:
                lit = inline
                print(f"DEBUG: Found LIT (same line): {lit}", file=sys.stderr)
                continue
            lit = _scan_following(cleaned_lines, i, 4, _LIT_SKIP_LABELS, bool)
            if lit:
                print(f"DEBUG: Found LIT: {lit}", file=sys.stderr)

    return {
        'clearance_type': 'nbi',
        'id_number': nbi_id,
        'full_name': full_name,
        'birth_date': birth_date,
        'lit': lit,
        'success': nbi_id is not None or full_name is not None,
    }


def _run_ocr(image_path, *, preprocess):
    """Run PaddleOCR on *image_path* and return its raw result.

    *preprocess* switches document-orientation classification, unwarping and
    text-line orientation on or off together. ``predict()`` is tried first;
    if it raises, ``ocr()`` is used when the object has it, otherwise None is
    returned.
    """
    with redirect_stdout(sys.stderr), redirect_stderr(sys.stderr):
        ocr = PaddleOCR(
            use_doc_orientation_classify=preprocess,
            use_doc_unwarping=preprocess,
            use_textline_orientation=preprocess,
            lang='en',
        )
        try:
            return ocr.predict(image_path)
        except Exception as e:
            print(f"DEBUG: predict() failed: {e}, trying ocr()", file=sys.stderr)
            if hasattr(ocr, 'ocr'):
                return ocr.ocr(image_path)
            return None


def _collect_text_lines(results):
    """Return the recognised text strings from a raw OCR result.

    Handles a list whose first item is an OCRResult-like mapping with a
    ``rec_texts`` list, and the older nested-list layout where each entry is
    ``(box, (text, ...))``. Parsing errors are logged to stderr and yield
    whatever was collected so far.
    """
    all_text = []
    try:
        if not (results and isinstance(results, list)):
            return all_text

        first_item = results[0]
        item_type_name = type(first_item).__name__
        is_ocr_result = ('OCRResult' in item_type_name
                         or 'ocr_result' in str(type(first_item)).lower())

        if is_ocr_result:
            print(f"DEBUG: Detected OCRResult object format (type: {item_type_name})", file=sys.stderr)
            try:
                if hasattr(first_item, 'keys'):
                    rec_texts = dict(first_item).get('rec_texts')
                    if isinstance(rec_texts, list):
                        all_text = [str(t) for t in rec_texts if t]
                        print(f"DEBUG: Extracted {len(all_text)} text lines from rec_texts", file=sys.stderr)
            except Exception as e:
                print(f"DEBUG: Error accessing OCRResult: {e}", file=sys.stderr)
        else:
            # Old format - list of lists.
            entries = first_item if isinstance(first_item, list) else results
            for item in entries:
                if isinstance(item, (list, tuple)) and len(item) >= 2:
                    meta = item[1]
                    if isinstance(meta, (list, tuple)) and len(meta) >= 1:
                        all_text.append(str(meta[0]))
    except Exception as e:
        print(f"DEBUG: Error processing OCR results: {str(e)}", file=sys.stderr)
        print(f"DEBUG: Traceback: {traceback.format_exc()}", file=sys.stderr)
    return all_text


def extract_ocr_lines_simple(image_path):
    """OCR *image_path* with orientation/unwarping enabled and parse it.

    Returns the dict produced by ``extract_nbi_id``, or an all-None result
    with ``success`` False when no text was recognised.
    """
    all_text = _collect_text_lines(_run_ocr(image_path, preprocess=True))
    return extract_nbi_id(all_text) if all_text else _empty_result()


def extract_ocr_lines(image_path):
    """OCR *image_path* with preprocessing disabled and parse the text.

    Also ensures an ``output`` directory exists and removes the regular files
    in it. Returns the dict produced by ``extract_nbi_id``, or an all-None
    result with ``success`` False when the file is missing or no text was
    recognised.
    """
    if not os.path.exists(image_path):
        return _empty_result()

    os.makedirs("output", exist_ok=True)
    for old_file in glob.glob("output/*"):
        # os.remove() raises on directories, so only clear regular files.
        if os.path.isfile(old_file):
            os.remove(old_file)

    all_text = _collect_text_lines(_run_ocr(image_path, preprocess=False))
    print(f"DEBUG: Extracted text lines: {all_text}", file=sys.stderr)
    return extract_nbi_id(all_text) if all_text else _empty_result()


def _remove_temp_image():
    """Best-effort removal of the temporary download."""
    try:
        if os.path.exists(TEMP_IMAGE_PATH):
            os.remove(TEMP_IMAGE_PATH)
    except OSError as e:
        print(f"DEBUG: Could not remove {TEMP_IMAGE_PATH}: {e}", file=sys.stderr)


def main(argv=None):
    """Run the extraction for the URL in ``argv[1]``; return the exit code.

    Exactly one JSON document is written to the real stdout. The exit code
    is 1 when no URL was given or an exception occurred, otherwise 0.
    """
    argv = sys.argv if argv is None else argv

    if len(argv) < 2:
        sys.stdout = original_stdout
        print(json.dumps({"success": False, "error": "No image URL provided"}))
        return 1

    image_url = argv[1]
    print(f"DEBUG: Processing NBI image URL: {image_url}", file=sys.stderr)

    try:
        image_path = download_image(image_url, TEMP_IMAGE_PATH)
        print(f"DEBUG: Image downloaded to: {image_path}", file=sys.stderr)

        # Plain OCR first; fall back to the variant with preprocessing.
        ocr_results = extract_ocr_lines(image_path)
        print(f"DEBUG: OCR results from extract_ocr_lines: {ocr_results}", file=sys.stderr)

        if not ocr_results['success']:
            print("DEBUG: Original method failed, trying simple method", file=sys.stderr)
            ocr_results = extract_ocr_lines_simple(image_path)
            print(f"DEBUG: OCR results from extract_ocr_lines_simple: {ocr_results}", file=sys.stderr)

        payload = json.dumps({
            "success": ocr_results['success'],
            "ocr_results": ocr_results,
        })
        exit_code = 0
    except Exception as e:
        payload = json.dumps({"success": False, "error": str(e)})
        exit_code = 1
    finally:
        _remove_temp_image()

    # Restore stdout and print only the JSON response.
    sys.stdout = original_stdout
    sys.stdout.write(payload)
    sys.stdout.flush()
    return exit_code


if __name__ == "__main__":
    sys.exit(main())