"""Extract structured fields from a Postal ID image using PaddleOCR.

Usage: ``python <script> <image_url>``

The script downloads the image, runs OCR on it, applies text heuristics to
pull out the card fields, and writes exactly one JSON document to stdout.
All diagnostic output goes to stderr so that stdout stays machine-readable.
"""
import sys
import json
import os
import glob
import re
import time
import traceback
from contextlib import redirect_stdout, redirect_stderr
from datetime import datetime

# NOTE: the third-party modules (``requests`` and ``paddleocr``) are imported
# inside the functions that use them.  That keeps a missing dependency from
# crashing the script before it can emit its JSON error payload, and it keeps
# the PaddleOCR import after the environment variables below are set.

# Suppress all PaddleOCR output
os.environ['PADDLEOCR_LOG_LEVEL'] = 'ERROR'
os.environ['QT_QPA_PLATFORM'] = 'offscreen'
os.environ['DISPLAY'] = ':99'

TEMP_IMAGE_PATH = 'temp_postal_image.jpg'

# Request headers that ask intermediaries not to serve a cached image.
_DOWNLOAD_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Cache-Control': 'no-cache, no-store, must-revalidate',
    'Pragma': 'no-cache',
    'Expires': '0',
}

# "14 Aug 88" / "14 Aug88" / "14Aug1988": day, 3-letter month, 2-4 digit year.
_DATE_PATTERN = re.compile(r'(\d{1,2})\s*([A-Za-z]{3})\s*(\d{2,4})')
_FALLBACK_DATE_FORMATS = (
    "%d %b %Y", "%d %B %Y", "%Y-%m-%d", "%m/%d/%Y", "%d/%m/%Y",
    "%d%b%Y", "%d%B%Y",
)
_MONTH_ABBRS = (
    'jan', 'feb', 'mar', 'apr', 'may', 'jun',
    'jul', 'aug', 'sep', 'oct', 'nov', 'dec',
)
_MONTH_NAMES = frozenset(_MONTH_ABBRS + (
    'january', 'february', 'march', 'april', 'june', 'july',
    'august', 'september', 'october', 'november', 'december',
))

# A line made only of capitals, whitespace and commas is a name candidate.
_NAME_PATTERN = re.compile(r'^[A-Z\s,]+$')

# Keyword lists used by the field heuristics.  They are substring tests
# against the upper-cased OCR line.
_NAME_STOP_AFTER_LABEL = ('ADDRESS', 'DATE', 'BIRTH', 'NATIONALITY', 'ISSUING',
                          'VALID', 'GEN', 'TUAZON', 'BLVD', 'BRGY', '585',
                          'PASAY')
_NAME_NOISE = ('ID', 'C', 'P', 'POSTAL')
_NOT_A_NAME = ('FIRST NAME', 'MIDDLE NAME', 'SURNAME', 'ADDRESS', 'DATE',
               'BIRTH', 'NATIONALITY', 'ISSUING', 'VALID', 'POSTAL',
               'IDENTITY', 'CARD', 'PHCPOST', 'PHILIPPINE', 'PREMIUM')
_NAME_CONTINUATION_STOP = ('ADDRESS', 'DATE', 'BIRTH', 'GEN', 'TUAZON', 'BLVD',
                           'BRGY', '585', 'PASAY', 'ID', 'POSTAL', 'PREMIUM')
_NAME_LABEL_HINTS = ('FIRST NAME', 'FINT NAME', 'SUMAME', 'SURNAME')
_ADDRESS_START_HINTS = ('GEN', 'TUAZON', 'BLVD', 'BRGY', 'PASAY', 'CITY')
_ADDRESS_STOP_LABELS = ('DATE', 'BIRTH', 'NATIONALITY', 'FILIPINO', 'ISSUING',
                        'VALID', 'PAN', 'NOCON')
_ADDRESS_CONTENT_HINTS = ('GEN', 'TUAZON', 'BLVD', 'BRGY', 'PASAY', 'CITY',
                          'STREET', 'AVE', 'BOULEVARD')
_ADDRESS_NOISE = ('o00', 'o0', '00')
# "ISSUNG..." covers the OCR misread "IssungPostOmce".
_ISSUING_LABELS = ('ISSUING POST', 'ISSUINGPOST', 'ISSUNG POST', 'ISSUNGPOST')
_VALID_UNTIL_LABELS = ('VALID UNTIL', 'VALIDUNTIL', 'VALD URT', 'VALDURT')


def _contains_any(text, needles):
    """Return True if any of *needles* is a substring of *text*."""
    return any(needle in text for needle in needles)


def _empty_details():
    """Return a fresh result dict with every field unset and success=False."""
    return {
        'id_type': 'Postal ID',
        'prn': None,
        'full_name': None,
        'address': None,
        'birth_date': None,
        'nationality': None,
        'issuing_post_office': None,
        'valid_until': None,
        'success': False,
    }


def _fix_ocr_digits(text):
    """Rewrite letter pairs that OCR produces for digits ("Ol" -> "01")."""
    for wrong, right in (('Ol', '01'), ('O1', '01'), ('O0', '00'), ('OO', '00')):
        text = text.replace(wrong, right)
    return text


def _restore_misread_ones(text):
    """Turn a lowercase "l" misread for "1" back into "1".

    Real month names are left alone: a blanket ``replace('l', '1')`` would
    turn "Jul" into "Ju1" and "April" into "Apri1" and make them unparseable.
    """
    def fix_word(match):
        word = match.group(0)
        lowered = word.lower()
        if lowered in _MONTH_NAMES:
            return word
        if lowered[:3] in _MONTH_ABBRS:
            # Month glued to a misread digit, e.g. "Augl7" -> "Aug17".
            return word[:3] + word[3:].replace('l', '1')
        return word.replace('l', '1')

    return re.sub(r'[A-Za-z]+', fix_word, text)


def download_image(url, output_path='temp_postal_image.jpg'):
    """Download *url* to *output_path* and return *output_path*.

    A ``t=<unix time>`` query parameter and no-cache headers are added to the
    request.  Any file already at *output_path* is removed first.

    Raises:
        requests.RequestException: on network failure, timeout (30 s), or a
            non-success HTTP status (via ``raise_for_status``).
        OSError: if the old file cannot be removed or the new one written.
    """
    import requests  # deferred: see the note under the module imports

    # Remove any existing temp file
    try:
        os.remove(output_path)
    except FileNotFoundError:
        pass

    # Add cache-busting parameter
    separator = '&' if '?' in url else '?'
    url += f'{separator}t={int(time.time())}'

    response = requests.get(url, headers=_DOWNLOAD_HEADERS, timeout=30)
    response.raise_for_status()

    with open(output_path, 'wb') as f:
        f.write(response.content)
    return output_path


def format_date(date_str):
    """Format a date from various OCR'd formats to YYYY-MM-DD.

    Handles "14 Aug 88", "14 Aug88" and "OlDec17" style input, plus the
    layouts in ``_FALLBACK_DATE_FORMATS``.  Two-digit years above 50 are
    taken as 19xx, the rest as 20xx.

    Returns:
        The ISO date string, ``None`` for empty input, or the OCR-cleaned
        input text when no format matches.
    """
    if not date_str:
        return None

    date_str = _restore_misread_ones(_fix_ocr_digits(date_str.strip()))

    match = _DATE_PATTERN.match(date_str)
    if match:
        day, month_str, year = match.groups()
        try:
            if len(year) == 2:
                year = f"19{year}" if int(year) > 50 else f"20{year}"
            month = datetime.strptime(month_str, '%b').month
            return f"{year}-{month:02d}-{int(day):02d}"
        except ValueError as e:
            # Not a real month abbreviation; fall through to other formats.
            print(f"DEBUG: Date parsing error: {e}", file=sys.stderr)

    # Try other common formats
    for fmt in _FALLBACK_DATE_FORMATS:
        try:
            return datetime.strptime(date_str, fmt).strftime("%Y-%m-%d")
        except ValueError:
            continue

    return date_str


def format_name(name):
    """Collapse runs of whitespace and capitalize each word of *name*.

    Returns ``None`` for empty input.
    """
    if not name:
        return None
    return ' '.join(word.capitalize() for word in name.split())


def format_address(address_lines):
    """Join OCR address lines into one single-spaced string.

    Also restores spaces that OCR dropped: "585Gen." -> "585 Gen.",
    "RiveraPasay" -> "Rivera Pasay", "Brgy.Rivera" -> "Brgy. Rivera".

    Returns ``None`` when *address_lines* is empty.
    """
    if not address_lines:
        return None

    address = ' '.join(line.strip() for line in address_lines if line.strip())

    # Digit glued to a capital: "585Gen." -> "585 Gen."
    address = re.sub(r'(\d+)([A-Z])', r'\1 \2', address)
    # Lowercase glued to a capital: "RiveraPasay" -> "Rivera Pasay"
    address = re.sub(r'([a-z])([A-Z])', r'\1 \2', address)
    # Abbreviation glued to the next word: "Brgy.Rivera" -> "Brgy. Rivera"
    address = re.sub(r'([a-z])\.([A-Z])', r'\1. \2', address)

    # Remove extra spaces
    return ' '.join(address.split())


def _find_prn(line_upper):
    """Return the digits after "PRN" (or the OCR misread "PAN"), else None."""
    match = (re.search(r'PRN\s*(\d{10,15})', line_upper)
             or re.search(r'PAN\s*(\d{10,15})', line_upper))
    return match.group(1) if match else None


def _find_full_name(cleaned_lines, i):
    """Return the name found at or after line *i*, else None.

    Two cases are recognised: line *i* is the "First Name ... Surname" label,
    or line *i* is an all-caps line directly below such a label.
    """
    line = cleaned_lines[i]
    line_upper = line.upper()

    is_label = (("FIRST NAME" in line_upper or "FINT NAME" in line_upper)
                and ("SURNAME" in line_upper or "SUMAME" in line_upper))
    if is_label:
        # Collect name parts from the next (up to four) lines.
        name_parts = []
        for candidate in cleaned_lines[i + 1:i + 5]:
            # Stop if we hit address or other labels
            if _contains_any(candidate.upper(), _NAME_STOP_AFTER_LABEL):
                break
            if (_NAME_PATTERN.match(candidate) and len(candidate) > 1
                    and candidate not in _NAME_NOISE):
                name_parts.append(candidate)
        return ' '.join(name_parts) if name_parts else None

    if not (_NAME_PATTERN.match(line) and len(line) > 2):
        return None
    if _contains_any(line_upper, _NOT_A_NAME) or i == 0:
        return None
    if not _contains_any(cleaned_lines[i - 1].upper(), _NAME_LABEL_HINTS):
        return None

    # Collect consecutive name parts (up to three more lines).
    name_parts = [line]
    for candidate in cleaned_lines[i + 1:i + 4]:
        is_name_part = (
            _NAME_PATTERN.match(candidate)
            and len(candidate) > 2
            and not _contains_any(candidate.upper(), _NAME_CONTINUATION_STOP)
        )
        if not is_name_part:
            break
        name_parts.append(candidate)

    if len(name_parts) >= 2:
        return ' '.join(name_parts)
    if len(name_parts[0].split()) >= 2:
        return name_parts[0]
    return None


def _find_address(cleaned_lines, i):
    """Return the formatted address starting around line *i*, else None."""
    line = cleaned_lines[i]
    starts_address = (
        _contains_any(line.upper(), _ADDRESS_START_HINTS)
        or (re.match(r'^\d+', line) and len(line) > 2)
    )
    if not starts_address:
        return None

    # Start one line back in case the address began on the previous line.
    start_idx = max(0, i - 1)
    address_lines = []
    for candidate in cleaned_lines[start_idx:start_idx + 7]:
        candidate_upper = candidate.upper()
        # Stop if we hit date, nationality, or other labels
        if _contains_any(candidate_upper, _ADDRESS_STOP_LABELS):
            break
        # Skip very short non-numeric lines that are likely OCR noise
        if len(candidate) <= 2 and not re.match(r'^\d+$', candidate):
            continue
        if len(candidate) <= 1:
            continue
        # Once collection has started, every following line is kept.
        looks_like_address = (
            re.match(r'^\d+', candidate)
            or _contains_any(candidate_upper, _ADDRESS_CONTENT_HINTS)
            or address_lines
        )
        if looks_like_address and candidate.lower() not in _ADDRESS_NOISE:
            address_lines.append(candidate)

    return format_address(address_lines) if address_lines else None


def _find_birth_date(line, line_upper):
    """Return "D Mon YY" from *line* unless the line mentions validity."""
    match = _DATE_PATTERN.search(line)
    if not match:
        return None
    if "VALID" in line_upper or "UNTIL" in line_upper:
        return None
    day, month, year = match.groups()
    return f"{day} {month} {year}"


def _find_nationality(cleaned_lines, i, line_upper):
    """Return the nationality given on line *i* or below its label."""
    if line_upper == "FILIPINO":
        return "Filipino"
    if "NATIONALITY" in line_upper and i + 1 < len(cleaned_lines):
        next_line = cleaned_lines[i + 1]
        if next_line and len(next_line) < 20:
            return next_line
    return None


def _find_issuing_post_office(cleaned_lines, i, line_upper):
    """Return the short line below an "Issuing Post Office" label."""
    if not _contains_any(line_upper, _ISSUING_LABELS):
        return None
    if i + 1 >= len(cleaned_lines):
        return None
    next_line = cleaned_lines[i + 1]
    if next_line and len(next_line) < 20:
        # Fix OCR errors: MNL.QE -> MNL-QE
        return next_line.replace('.', '-')
    return None


def _find_valid_until(cleaned_lines, i, line_upper):
    """Return the date text on the line below a "Valid Until" label."""
    if not _contains_any(line_upper, _VALID_UNTIL_LABELS):
        return None
    if i + 1 >= len(cleaned_lines):
        return None
    # Fix OCR errors: OlDec17 -> 01Dec17
    next_line = _fix_ocr_digits(cleaned_lines[i + 1])
    match = _DATE_PATTERN.search(next_line)
    if match:
        day, month, year = match.groups()
        return f"{day} {month} {year}"
    return next_line or None


def extract_postal_details(lines):
    """Pull Postal ID fields out of a sequence of OCR text lines.

    Each field is taken from the first line (in order) whose heuristic
    matches; later lines never overwrite a field that is already set.

    Args:
        lines: iterable of OCR results; each item is converted with ``str``
            and stripped, and blank items are dropped.

    Returns:
        dict with keys ``id_type``, ``prn``, ``full_name``, ``address``,
        ``birth_date``, ``nationality``, ``issuing_post_office``,
        ``valid_until`` and ``success``.  ``success`` is True when a PRN or
        a full name was found.
    """
    details = _empty_details()

    # Clean lines - convert to strings and strip
    cleaned_lines = [str(line).strip() for line in lines if str(line).strip()]

    for i, line in enumerate(cleaned_lines):
        line_upper = line.upper()

        if not details['prn']:
            details['prn'] = _find_prn(line_upper)
        if not details['full_name']:
            details['full_name'] = _find_full_name(cleaned_lines, i)
        if not details['address']:
            details['address'] = _find_address(cleaned_lines, i)
        if not details['birth_date']:
            details['birth_date'] = _find_birth_date(line, line_upper)
        if not details['nationality']:
            details['nationality'] = _find_nationality(
                cleaned_lines, i, line_upper)
        if not details['issuing_post_office']:
            details['issuing_post_office'] = _find_issuing_post_office(
                cleaned_lines, i, line_upper)
        if not details['valid_until']:
            details['valid_until'] = _find_valid_until(
                cleaned_lines, i, line_upper)

    # Format extracted fields
    if details['full_name']:
        details['full_name'] = format_name(details['full_name'])
    if details['birth_date']:
        details['birth_date'] = format_date(details['birth_date'])
    if details['valid_until']:
        details['valid_until'] = format_date(details['valid_until'])

    if details['prn'] or details['full_name']:
        details['success'] = True

    return details


def _collect_text_lines(results):
    """Flatten a PaddleOCR result into a list of recognised text strings.

    Handles both shapes the code has seen: a list whose first item is an
    ``OCRResult`` mapping with a ``rec_texts`` list, and the older nested
    list form where each entry is ``[box, (text, score)]``.
    """
    if not (results and isinstance(results, list)):
        return []

    first_item = results[0]
    item_type_name = type(first_item).__name__
    is_ocr_result = ('OCRResult' in item_type_name
                     or 'ocr_result' in str(type(first_item)).lower())

    if is_ocr_result:
        print(f"DEBUG: Detected OCRResult object format (type: {item_type_name})", file=sys.stderr)
        texts = []
        try:
            if hasattr(first_item, 'keys'):
                rec_texts = dict(first_item).get('rec_texts')
                if isinstance(rec_texts, list):
                    texts = [str(t) for t in rec_texts if t]
                    print(f"DEBUG: Extracted {len(texts)} text lines from rec_texts", file=sys.stderr)
        except Exception as e:
            print(f"DEBUG: Error accessing OCRResult: {e}", file=sys.stderr)
        return texts

    # Old format - list of lists
    entries = first_item if isinstance(first_item, list) else results
    texts = []
    for item in entries:
        if isinstance(item, (list, tuple)) and len(item) >= 2:
            meta = item[1]
            if isinstance(meta, (list, tuple)) and len(meta) >= 1:
                texts.append(str(meta[0]))
    return texts


def extract_ocr_lines(image_path):
    """Run PaddleOCR on *image_path* and return the extracted card details.

    Returns:
        ``{'success': False, 'error': 'File not found'}`` if the file is
        missing; otherwise the dict produced by ``extract_postal_details``
        (or an all-empty details dict when OCR yields no text).
    """
    if not os.path.exists(image_path):
        return {'success': False, 'error': 'File not found'}

    file_size = os.path.getsize(image_path)
    print(f"DEBUG: Image file size: {file_size} bytes", file=sys.stderr)

    with redirect_stdout(sys.stderr), redirect_stderr(sys.stderr):
        # Deferred import: runs after the environment variables are set and
        # with stdout redirected, so nothing it prints reaches stdout.
        from paddleocr import PaddleOCR

        ocr = PaddleOCR(
            use_doc_orientation_classify=False,
            use_doc_unwarping=False,
            use_textline_orientation=False,
            lang='en'
        )
        try:
            results = ocr.ocr(image_path)
        except Exception as e:
            print(f"DEBUG: ocr() failed: {e}, trying predict()", file=sys.stderr)
            results = ocr.predict(image_path) if hasattr(ocr, 'predict') else None

    print(f"DEBUG: Raw OCR results type: {type(results)}", file=sys.stderr)

    try:
        all_text = _collect_text_lines(results)
    except Exception as e:
        all_text = []
        print(f"DEBUG: Error processing OCR results: {str(e)}", file=sys.stderr)
        print(f"DEBUG: Traceback: {traceback.format_exc()}", file=sys.stderr)

    print(f"DEBUG: Extracted text lines: {all_text}", file=sys.stderr)

    return extract_postal_details(all_text) if all_text else _empty_details()


def _remove_quietly(path):
    """Delete *path*; a missing file is fine, other failures are logged."""
    try:
        os.remove(path)
    except FileNotFoundError:
        pass
    except OSError as e:
        print(f"DEBUG: Could not remove {path}: {e}", file=sys.stderr)


def _write_json(stream, payload):
    """Serialize *payload* and write it to *stream* without a newline."""
    stream.write(json.dumps(payload))
    stream.flush()


def main(argv=None):
    """Run the CLI and return the process exit code.

    stdout is pointed at stderr while working so that stray library output
    cannot corrupt the single JSON document written to the real stdout.
    Returns 1 for a missing argument or any processing error, else 0.
    """
    args = sys.argv if argv is None else argv
    real_stdout = sys.stdout
    sys.stdout = sys.stderr
    try:
        if len(args) < 2:
            print(json.dumps({"success": False, "error": "No image URL provided"}), file=real_stdout)
            return 1

        image_url = args[1]
        print(f"DEBUG: Processing Postal ID image URL: {image_url}", file=sys.stderr)

        try:
            image_path = download_image(image_url, TEMP_IMAGE_PATH)
            print(f"DEBUG: Image downloaded to: {image_path}", file=sys.stderr)

            ocr_results = extract_ocr_lines(image_path)
            print(f"DEBUG: OCR results: {ocr_results}", file=sys.stderr)

            _write_json(real_stdout, {
                "success": ocr_results['success'],
                "data": ocr_results,
            })
        except Exception as e:
            # Top-level boundary: callers expect JSON even on failure.
            _write_json(real_stdout, {"success": False, "error": str(e)})
            return 1
        finally:
            _remove_quietly(TEMP_IMAGE_PATH)
        return 0
    finally:
        sys.stdout = real_stdout


if __name__ == "__main__":
    sys.exit(main())