"""Extract PhilHealth (PHIC) ID card details from an image URL using PaddleOCR.

Usage: ``python <this_script> <image_url>``

The script downloads the image, runs OCR on it, applies card-specific
heuristics to the recognised text lines and writes exactly one JSON document
to stdout. All diagnostics go to stderr so the caller can parse stdout safely.
"""
import difflib
import glob
import json
import os
import re
import sys
import time
from contextlib import redirect_stdout, redirect_stderr
from datetime import datetime

# Quiet PaddleOCR and force headless rendering. These must be set before
# paddleocr is imported, which happens lazily inside extract_ocr_lines().
os.environ['PADDLEOCR_LOG_LEVEL'] = 'ERROR'
os.environ['QT_QPA_PLATFORM'] = 'offscreen'
os.environ['DISPLAY'] = ':99'

_TEMP_IMAGE_PATH = 'temp_phic_image.jpg'

_MONTH_NAMES = (
    'JANUARY', 'FEBRUARY', 'MARCH', 'APRIL', 'MAY', 'JUNE',
    'JULY', 'AUGUST', 'SEPTEMBER', 'OCTOBER', 'NOVEMBER', 'DECEMBER',
)

# Lines made only of capitals, whitespace and commas are name candidates.
_NAME_LINE_RE = re.compile(r'^[A-Z\s,]+$')
_HYPHENATED_ID_RE = re.compile(r'(\d{2}-\d{9}-\d)')
_BARE_ID_RE = re.compile(r'^\d{12}$')
_DATE_SEX_RE = re.compile(
    r'([A-Za-z]+)\s*(\d{1,2})[.,]\s*(\d{4})\s*[-]?\s*(MALE|FEMALE)')
_DATE_RE = re.compile(r'([A-Za-z]+)\s*(\d{1,2})[.,]\s*(\d{4})')
_MONTH_DAY_RE = re.compile(r'([A-Za-z]+)\s*(\d{1,2})[,]?$')
_YEAR_RE = re.compile(r'(?<!\d)(\d{4})(?!\d)')
# Standalone MALE/FEMALE token; letters on either side disqualify the match
# so that words which merely contain "MALE" are not read as a sex.
_SEX_RE = re.compile(r'(?<![A-Z])(FEMALE|MALE)(?![A-Z])')

_NAME_LABELS = (
    'REPUBLIC', 'PHILIPPINE', 'HEALTH', 'INSURANCE', 'CORPORATION',
    'PHILHEALTH', 'DATE', 'BIRTH', 'ADDRESS', 'MEMBERSHIP', 'CATEGORY',
)
_NAME_CONTINUATION_STOPWORDS = _NAME_LABELS + (
    'INFORMAL', 'ECONOMY', 'EMPLOYED', 'VOLUNTARY', 'SPONSORED', 'DEPENDENT',
    'AVENUE', 'AVENJE', 'STREET', 'ROAD', 'BLVD', 'BOULEVARD',
)
_ADDRESS_INDICATORS = (
    'AVENUE', 'AVENJE', 'STREET', 'ROAD', 'BLVD', 'BOULEVARD', 'CAINTA',
    'RIZAL', 'AZAL', 'SANTO', 'SANIO', 'DOMINGO', 'BONIFACIO', 'DYLABONFACIO',
)
_ADDRESS_STOP_LABELS = (
    'MEMBERSHIP', 'CATEGORY', 'INFORMAL', 'ECONOMY', 'DATE', 'BIRTH',
    'MALE', 'FEMALE',
)
_DATE_EXCLUSIONS = ('AVENUE', 'STREET', 'RIZAL')
_MEMBERSHIP_KEYWORDS = (
    'INFORMAL', 'EMPLOYED', 'SELF-EMPLOYED', 'VOLUNTARY', 'SPONSORED',
    'DEPENDENT',
)
_MEMBERSHIP_EXCLUSIONS = ('AVENUE', 'STREET', 'RIZAL', 'CAINTA', 'FERNANDEZ')

# Known OCR misreads in addresses. Order matters: longer patterns come before
# the shorter patterns they contain ("AZAL-" before "AZAL").
_ADDRESS_OCR_FIXES = (
    ('DYLABONFACIO', 'BONIFACIO'),
    ('AVENJE', 'AVENUE'),
    ('SANIODOMINGO', 'SANTO DOMINGO'),
    ('SANIO', 'SANTO'),
    ('AZAL-', 'RIZAL '),
    ('AZAL', 'RIZAL'),
)


def _contains_any(text, needles):
    """Return True if any of *needles* occurs as a substring of *text*."""
    return any(needle in text for needle in needles)


def _empty_details():
    """Return a fresh result dict with every field unset."""
    return {
        'id_type': 'PHIC',
        'id_number': None,
        'full_name': None,
        'birth_date': None,
        'sex': None,
        'address': None,
        'membership_category': None,
        'success': False,
    }


def _normalize_month(token):
    """Return the month number (1-12) for an OCR'd month token, or None.

    Accepts full names, abbreviations and common misreads such as "VULY",
    "JUY" or "APIL". Matching is case-insensitive.
    """
    word = re.sub(r'[^A-Z]', '', str(token).upper())
    if len(word) < 3:
        return None
    # OCR frequently reads the "J" of JULY as "V".
    if word.startswith('VUL'):
        word = 'J' + word[1:]
    prefix = word[:3]
    for number, name in enumerate(_MONTH_NAMES, start=1):
        if name.startswith(prefix):
            return number
    # Fall back to a fuzzy match for tokens with a dropped/substituted letter.
    close = difflib.get_close_matches(word, _MONTH_NAMES, n=1, cutoff=0.7)
    if close:
        return _MONTH_NAMES.index(close[0]) + 1
    return None


def _compose_date(month_token, day, year):
    """Build "MONTH D, YYYY" from OCR tokens; None if the month is not a month."""
    month = _normalize_month(month_token)
    if month is None:
        return None
    return f"{_MONTH_NAMES[month - 1]} {day}, {year}"


def download_image(url, output_path='temp_phic_image.jpg'):
    """Download *url* to *output_path* and return the path.

    A timestamp query parameter and no-cache headers are added so that
    intermediaries do not serve a stale image.

    Raises:
        requests.RequestException: on network failure or a non-2xx response.
        OSError: if the file cannot be removed or written.
    """
    # Imported here so a missing dependency surfaces through main()'s JSON
    # error path instead of a bare traceback at import time.
    import requests

    # Remove any existing temp file
    if os.path.exists(output_path):
        os.remove(output_path)

    # Add cache-busting parameters
    timestamp = int(time.time())
    separator = '&' if '?' in url else '?'
    url += f'{separator}t={timestamp}'

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Cache-Control': 'no-cache, no-store, must-revalidate',
        'Pragma': 'no-cache',
        'Expires': '0',
    }
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()

    with open(output_path, 'wb') as f:
        f.write(response.content)
    return output_path


def format_date(date_str):
    """Format a date from various formats to YYYY-MM-DD.

    Handles "MONTH D, YYYY" (with OCR-garbled month names and a period in
    place of the comma) as well as a few numeric formats. Returns None for
    empty input, and the cleaned-up input string when nothing parses to a
    valid calendar date.
    """
    if not date_str:
        return None
    date_str = date_str.strip()

    # Fix period instead of comma: "10.2003" -> "10, 2003"
    date_str = re.sub(r'(\d{1,2})\.(\d{4})', r'\1, \2', date_str)

    # Handle "JULY 10, 2003", "July 10, 2003" or OCR-squashed "VULY10, 2003"
    match = re.match(r'([A-Za-z]+)\s*(\d{1,2})(?:,\s*|\s+)(\d{4})', date_str)
    if match:
        month_token, day, year = match.groups()
        month = _normalize_month(month_token)
        if month is None:
            print(f"DEBUG: Date parsing error: unknown month {month_token!r}",
                  file=sys.stderr)
        else:
            try:
                # Constructing the datetime validates the day/month/year.
                datetime(int(year), month, int(day))
            except ValueError as e:
                print(f"DEBUG: Date parsing error: {e}", file=sys.stderr)
            else:
                return f"{year}-{month:02d}-{int(day):02d}"

    # Try other common formats
    for fmt in ("%B %d, %Y", "%b %d, %Y", "%Y-%m-%d", "%m/%d/%Y", "%d/%m/%Y"):
        try:
            return datetime.strptime(date_str, fmt).strftime("%Y-%m-%d")
        except ValueError:
            continue
    return date_str


def format_name(name):
    """Reorder "LASTNAME, FIRST MIDDLE" to "First Middle Lastname".

    Names without a comma keep their word order (the surname cannot be
    identified reliably) and are only re-capitalised. Returns None when
    *name* is empty or contains nothing but whitespace.
    """
    if not name:
        return None

    # Ensure comma spacing: "FERNANDEZ,CARL" -> "FERNANDEZ, CARL"
    name = re.sub(r',\s*([A-Z])', r', \1', name)
    # Split glued mixed-case parts: "CarlMatthew" -> "Carl Matthew"
    name = re.sub(r'([a-z])([A-Z])', r'\1 \2', name)
    name = re.sub(r'([A-Z]+)([A-Z][a-z])', r'\1 \2', name)

    surname, comma, given = name.partition(',')
    if comma:
        # Any further commas just separate more given-name parts.
        given = ' '.join(part.strip() for part in given.split(','))
        name = f"{given} {surname}"

    formatted = ' '.join(word.capitalize() for word in name.split())
    return formatted or None


def format_address(address_lines):
    """Join OCR'd address lines into one string and repair known misreads.

    Returns None when there are no lines or they are all blank.
    """
    if not address_lines:
        return None

    address = ' '.join(line.strip() for line in address_lines if line.strip())
    for wrong, right in _ADDRESS_OCR_FIXES:
        address = address.replace(wrong, right)

    # Fix missing spaces: "071A" -> "071 A"
    address = re.sub(r'(\d+)([A-Z])', r'\1 \2', address)
    # Split a capital glued to a capitalised word: "ACainta" -> "A Cainta"
    address = re.sub(r'([A-Z])([A-Z][a-z]+)', r'\1 \2', address)

    address = ' '.join(address.split())
    return address or None


def _is_id_line(text):
    """True if *text* holds a PhilHealth number (hyphenated or bare 12 digits)."""
    return bool(_HYPHENATED_ID_RE.search(text) or _BARE_ID_RE.match(text))


def _is_short_comma_name(text):
    """True if *text* looks like "LASTNAME, FIRST ..." of at most five words."""
    return (',' in text and bool(_NAME_LINE_RE.match(text))
            and len(text.split()) <= 5)


def _find_name(cleaned_lines, i):
    """Return the raw name starting at line *i*, or None.

    A name line is all-caps and contains a comma. If the following line is an
    all-caps, comma-free line that is not a label, address or sex marker, it
    is treated as the wrapped remainder of the given names and appended.
    """
    line = cleaned_lines[i]
    if ',' not in line or not _NAME_LINE_RE.match(line):
        return None
    if _contains_any(line.upper(), _NAME_LABELS):
        return None

    if i + 1 < len(cleaned_lines):
        following = cleaned_lines[i + 1]
        if (',' not in following
                and _NAME_LINE_RE.match(following)
                and not _contains_any(following, _NAME_CONTINUATION_STOPWORDS)
                and not _SEX_RE.search(following)):
            return f"{line} {following}"
    return line


def _scan_birth_and_sex(cleaned_lines, i, need_date):
    """Look for a birth date and/or sex on line *i*.

    Returns a ``(birth_date, sex)`` tuple; either element may be None. The
    date is returned as "MONTH D, YYYY" and only when the month token
    resolves to a real month, so address fragments are not read as dates.
    """
    upper = cleaned_lines[i].upper()

    # "JULY 10, 2003 - MALE" or OCR-squashed "VULY10.2003-MALE"
    combined = _DATE_SEX_RE.search(upper)
    if combined:
        month_token, day, year, sex = combined.groups()
        return _compose_date(month_token, day, year), sex.capitalize()

    birth_date = None
    if need_date:
        split = _MONTH_DAY_RE.search(upper)
        if split:
            # Split date: "FEBRUARY17," here and "2003-" on the next line.
            if i + 1 < len(cleaned_lines):
                year_match = _YEAR_RE.search(cleaned_lines[i + 1])
                if year_match:
                    birth_date = _compose_date(
                        split.group(1), split.group(2), year_match.group(1))
        else:
            full = _DATE_RE.search(upper)
            # Skip lines that are clearly part of the address.
            if full and not _contains_any(upper, _DATE_EXCLUSIONS):
                birth_date = _compose_date(*full.groups())

    standalone = _SEX_RE.search(upper)
    sex = standalone.group(1).capitalize() if standalone else None
    return birth_date, sex


def _collect_address(cleaned_lines, i):
    """Return the formatted address starting at line *i*, or None.

    Up to three consecutive lines are gathered, stopping at the first line
    that looks like a label, an ID number or a name.
    """
    line = cleaned_lines[i]
    starts_with_number = bool(re.match(r'^\d+', line)) and len(line) > 5
    if not (_contains_any(line.upper(), _ADDRESS_INDICATORS)
            or starts_with_number):
        return None
    if _is_id_line(line) or _is_short_comma_name(line):
        return None

    collected = []
    for candidate in cleaned_lines[i:i + 3]:
        if (_contains_any(candidate.upper(), _ADDRESS_STOP_LABELS)
                or _is_id_line(candidate)
                or _is_short_comma_name(candidate)):
            break
        if len(candidate) > 2:
            collected.append(candidate)
    return format_address(collected) if collected else None


def _find_membership(cleaned_lines, i):
    """Return the membership category indicated by line *i*, or None."""
    line = cleaned_lines[i]
    upper = line.upper()
    if "MEMBERSHIP" in upper and "CATEGORY" in upper:
        # The label line is followed by the value on the next line.
        if i + 1 < len(cleaned_lines):
            return cleaned_lines[i + 1] or None
        return None
    if "INFORMAL" in upper and "ECONOMY" in upper:
        return "INFORMAL ECONOMY"
    if (_contains_any(upper, _MEMBERSHIP_KEYWORDS)
            and not _contains_any(upper, _MEMBERSHIP_EXCLUSIONS)):
        return line
    return None


def extract_phic_details(lines):
    """Extract PhilHealth card fields from OCR text lines.

    Args:
        lines: iterable of recognised text lines, in reading order.

    Returns:
        dict with keys ``id_type``, ``id_number`` (digits only),
        ``full_name``, ``birth_date`` (YYYY-MM-DD when parseable), ``sex``,
        ``address``, ``membership_category`` and ``success``. ``success`` is
        True when an ID number or a name was found. For each field the first
        matching line wins.
    """
    details = _empty_details()

    # Clean lines - convert to strings and strip, dropping blanks
    cleaned_lines = [text for text in (str(line).strip() for line in lines)
                     if text]

    for i, line in enumerate(cleaned_lines):
        # ID number: "03-026765383-2", stored without the dashes
        if not details['id_number']:
            id_match = _HYPHENATED_ID_RE.search(line)
            if id_match:
                details['id_number'] = id_match.group(1).replace('-', '')
            elif _BARE_ID_RE.match(line):
                details['id_number'] = line

        if not details['full_name']:
            details['full_name'] = _find_name(cleaned_lines, i)

        if not details['birth_date'] or not details['sex']:
            found_date, found_sex = _scan_birth_and_sex(
                cleaned_lines, i, need_date=not details['birth_date'])
            if found_date and not details['birth_date']:
                details['birth_date'] = found_date
            if found_sex and not details['sex']:
                details['sex'] = found_sex

        if not details['address']:
            details['address'] = _collect_address(cleaned_lines, i)

        if not details['membership_category']:
            details['membership_category'] = _find_membership(cleaned_lines, i)

    # Format extracted fields
    if details['full_name']:
        details['full_name'] = format_name(details['full_name'])
    if details['birth_date']:
        details['birth_date'] = format_date(details['birth_date'])
    if details['id_number'] or details['full_name']:
        details['success'] = True
    return details


def _texts_from_results(results):
    """Flatten PaddleOCR output into a list of recognised text strings.

    Handles the dict-like OCRResult objects (texts under ``rec_texts``) and
    the older nested-list format ``[[box, (text, score)], ...]``. Only the
    first element of *results* is read in either format.
    """
    texts = []
    if not (results and isinstance(results, list)):
        return texts

    first_item = results[0]
    item_type_name = type(first_item).__name__
    is_ocr_result = ('OCRResult' in item_type_name
                     or 'ocr_result' in str(type(first_item)).lower())

    if is_ocr_result:
        print(f"DEBUG: Detected OCRResult object format (type: {item_type_name})", file=sys.stderr)
        try:
            if hasattr(first_item, 'keys'):
                rec_texts = dict(first_item).get('rec_texts')
                if isinstance(rec_texts, list):
                    texts = [str(t) for t in rec_texts if t]
                    print(f"DEBUG: Extracted {len(texts)} text lines from rec_texts", file=sys.stderr)
        except Exception as e:
            print(f"DEBUG: Error accessing OCRResult: {e}", file=sys.stderr)
        return texts

    # Old format - list of lists
    entries = first_item if isinstance(first_item, list) else results
    for item in entries:
        if isinstance(item, (list, tuple)) and len(item) >= 2:
            meta = item[1]
            if isinstance(meta, (list, tuple)) and len(meta) >= 1:
                texts.append(str(meta[0]))
    return texts


def extract_ocr_lines(image_path):
    """Run OCR on *image_path* and return the extracted PhilHealth details.

    Returns ``{'success': False, 'error': 'File not found'}`` when the file
    is missing, otherwise the dict produced by extract_phic_details() (all
    fields unset when no text was recognised).
    """
    if not os.path.exists(image_path):
        return {'success': False, 'error': 'File not found'}

    file_size = os.path.getsize(image_path)
    print(f"DEBUG: Image file size: {file_size} bytes", file=sys.stderr)

    with redirect_stdout(sys.stderr), redirect_stderr(sys.stderr):
        # Imported after the environment variables above are set, and inside
        # the redirect so import-time chatter cannot pollute stdout.
        from paddleocr import PaddleOCR

        ocr = PaddleOCR(
            use_doc_orientation_classify=False,
            use_doc_unwarping=False,
            use_textline_orientation=False,
            lang='en'
        )
        try:
            results = ocr.ocr(image_path)
        except Exception as e:
            print(f"DEBUG: ocr() failed: {e}, trying predict()", file=sys.stderr)
            results = ocr.predict(image_path) if hasattr(ocr, 'predict') else None

    print(f"DEBUG: Raw OCR results type: {type(results)}", file=sys.stderr)

    try:
        all_text = _texts_from_results(results)
    except Exception as e:
        import traceback
        all_text = []
        print(f"DEBUG: Error processing OCR results: {str(e)}", file=sys.stderr)
        print(f"DEBUG: Traceback: {traceback.format_exc()}", file=sys.stderr)

    print(f"DEBUG: Extracted text lines: {all_text}", file=sys.stderr)
    return extract_phic_details(all_text) if all_text else _empty_details()


def main(argv=None):
    """Script entry point: process the image URL in ``argv[1]``.

    Writes one JSON document to the real stdout and returns the process exit
    code (0 when the pipeline ran, 1 on a missing argument or any error).
    """
    argv = sys.argv if argv is None else argv

    # Route every stray print to stderr; only the final JSON goes to stdout.
    real_stdout = sys.stdout
    sys.stdout = sys.stderr
    try:
        if len(argv) < 2:
            real_stdout.write(json.dumps(
                {"success": False, "error": "No image URL provided"}) + "\n")
            real_stdout.flush()
            return 1

        image_url = argv[1]
        print(f"DEBUG: Processing PhilHealth ID image URL: {image_url}", file=sys.stderr)

        try:
            image_path = download_image(image_url, _TEMP_IMAGE_PATH)
            print(f"DEBUG: Image downloaded to: {image_path}", file=sys.stderr)
            ocr_results = extract_ocr_lines(image_path)
            print(f"DEBUG: OCR results: {ocr_results}", file=sys.stderr)
            payload = {"success": ocr_results['success'], "data": ocr_results}
            exit_code = 0
        except Exception as e:
            # Top-level boundary: the caller expects JSON even on failure.
            payload = {"success": False, "error": str(e)}
            exit_code = 1
        finally:
            try:
                if os.path.exists(_TEMP_IMAGE_PATH):
                    os.remove(_TEMP_IMAGE_PATH)
            except OSError as e:
                print(f"DEBUG: Could not remove temp image: {e}", file=sys.stderr)

        real_stdout.write(json.dumps(payload))
        real_stdout.flush()
        return exit_code
    finally:
        sys.stdout = real_stdout


if __name__ == "__main__":
    sys.exit(main())