Spaces:
Sleeping
Sleeping
| import sys, json, os, glob, requests | |
| import re | |
| import time | |
| from contextlib import redirect_stdout, redirect_stderr | |
| from datetime import datetime | |
# Immediately redirect all output to stderr except for our final JSON.
# The real stdout is kept in `original_stdout` and restored by the main
# section just before the JSON document is written.
original_stdout = sys.stdout
sys.stdout = sys.stderr

# Suppress all PaddleOCR output
# NOTE(review): presumably PaddleOCR reads PADDLEOCR_LOG_LEVEL - confirm.
os.environ['PADDLEOCR_LOG_LEVEL'] = 'ERROR'
# NOTE(review): these look like headless-display settings for Qt/X11
# dependencies pulled in by the OCR stack - confirm they are still needed.
os.environ['QT_QPA_PLATFORM'] = 'offscreen'
os.environ['DISPLAY'] = ':99'

# Import PaddleOCR after setting environment variables, so the values above
# are already in place when the package is loaded.
from paddleocr import PaddleOCR
def download_image(url, output_path='temp_postal_image.jpg'):
    """Download the image at ``url`` into ``output_path`` and return the path.

    A ``t=<unix timestamp>`` query parameter and no-cache request headers are
    sent so that intermediaries do not serve a stale copy of the image.

    Args:
        url: HTTP(S) address of the image.
        output_path: Where the downloaded bytes are written. Any existing
            file at this path is removed first.

    Returns:
        ``output_path``.

    Raises:
        requests.RequestException: On connection problems, timeouts, or a
            non-success HTTP status (via ``raise_for_status``).
        OSError: If the old file cannot be removed or the new one written.
    """
    # Remove any existing temp file (EAFP: no exists()/remove() race).
    try:
        os.remove(output_path)
    except FileNotFoundError:
        pass

    # Add headers to prevent caching
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Cache-Control': 'no-cache, no-store, must-revalidate',
        'Pragma': 'no-cache',
        'Expires': '0',
    }

    # Let requests merge the cache-busting parameter into the URL. Appending
    # "?t=..." / "&t=..." by hand puts it inside the fragment when the URL
    # contains "#...", where it is never sent to the server.
    response = requests.get(
        url,
        params={'t': int(time.time())},
        headers=headers,
        timeout=30,
    )
    response.raise_for_status()

    # Save the image
    with open(output_path, 'wb') as f:
        f.write(response.content)
    return output_path
# English month abbreviations, looked up directly so parsing does not depend
# on the process locale (strptime's %b does).
_MONTH_NUMBERS = {
    'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6,
    'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12,
}


def format_date(date_str):
    """Format a date from various OCR'd formats to ``YYYY-MM-DD``.

    Handles the card layout ``"14 Aug 88"`` / ``"14 Aug88"`` (two-digit years
    above 50 are read as 19xx, the rest as 20xx) and a handful of other common
    formats. Common OCR confusions (``O`` for ``0``, lowercase ``l`` for
    ``1``) are repaired first.

    Args:
        date_str: Raw date text, or a falsy value.

    Returns:
        The ISO date string; ``None`` for empty input; or the cleaned-up input
        text unchanged when no supported format yields a real calendar date.
    """
    if not date_str:
        return None
    date_str = date_str.strip()
    if not date_str:
        return None

    # Fix common OCR errors first
    date_str = date_str.replace('Ol', '01').replace('O1', '01').replace('O0', '00').replace('OO', '00')
    # lowercase L -> 1, but only where it cannot be part of a word: a blanket
    # replace turns "Jul" into "Ju1" (and breaks "July"/"April") so those
    # dates would never parse.
    date_str = re.sub(r'(?<![A-Za-z])l', '1', date_str)

    # Handle format like "14 Aug 88" or "14 Aug88" -> "1988-08-14"
    # Allow for missing space between month and year
    match = re.match(r'(\d{1,2})\s*([A-Za-z]{3})\s*(\d{2,4})', date_str)
    if match:
        day, month_str, year = match.groups()
        month = _MONTH_NUMBERS.get(month_str.lower())
        # Convert 2-digit year to 4-digit (assume 1900s for years > 50, 2000s for <= 50)
        if len(year) == 2:
            year = f"19{year}" if int(year) > 50 else f"20{year}"
        if month is None or len(year) != 4:
            print(f"DEBUG: Date parsing error: unrecognised month/year in {date_str!r}", file=sys.stderr)
        else:
            try:
                # Constructing a datetime rejects impossible dates (day 0,
                # 31 Feb, ...) instead of emitting them as if they were valid.
                parsed = datetime(int(year), month, int(day))
            except ValueError as e:
                print(f"DEBUG: Date parsing error: {e}", file=sys.stderr)
            else:
                return f"{parsed.year:04d}-{parsed.month:02d}-{parsed.day:02d}"

    # Try other common formats
    for fmt in ["%d %b %Y", "%d %B %Y", "%Y-%m-%d", "%m/%d/%Y", "%d/%m/%Y", "%d%b%Y", "%d%B%Y"]:
        try:
            return datetime.strptime(date_str, fmt).strftime("%Y-%m-%d")
        except ValueError:
            continue
    return date_str
def format_name(name):
    """Normalise whitespace in ``name`` and capitalise every name part.

    A "part" is any run of characters delimited by whitespace, a comma or a
    hyphen, so ``"SANTOS,JR"`` becomes ``"Santos,Jr"`` and ``"MARY-ANN"``
    becomes ``"Mary-Ann"``. Capitalising whole whitespace-separated words
    would lowercase everything after the first letter of the word.

    Args:
        name: Raw name text, or a falsy value.

    Returns:
        The formatted name, or ``None`` when ``name`` is empty or contains
        only whitespace.
    """
    if not name:
        return None

    # Remove extra spaces and normalize
    normalized = ' '.join(name.split())
    if not normalized:
        return None

    # Capitalize each part properly
    return re.sub(r'[^\s,\-]+', lambda m: m.group(0).capitalize(), normalized)
def format_address(address_lines):
    """Join OCR'd address lines into one string and repair missing spaces.

    Args:
        address_lines: Iterable of address line strings, or a falsy value.

    Returns:
        The single-line address, or ``None`` when there are no lines or every
        line is blank.
    """
    if not address_lines:
        return None

    # Join address lines and clean up
    address = ' '.join(line.strip() for line in address_lines if line.strip())
    if not address:
        return None

    # Fix missing spaces: "585Gen." -> "585 Gen."
    address = re.sub(r'(\d+)([A-Z])', r'\1 \2', address)
    # Fix missing spaces after abbreviations: "Brgy.Rivera" -> "Brgy. Rivera".
    # Restricted to lowercase-before / uppercase-after so initials such as
    # "J.P." are left alone.
    address = re.sub(r'(?<=[a-z])\.(?=[A-Z])', '. ', address)
    # Fix words run together: "PasayCity" -> "Pasay City"
    address = re.sub(r'([a-z])([A-Z])', r'\1 \2', address)

    # Remove extra spaces
    return ' '.join(address.split())
# A name line on the card: capitals, whitespace and commas only.
_NAME_LINE_RE = re.compile(r'^[A-Z\s,]+$')
# "14 Aug 88" / "14 Aug88" style date token.
_DATE_TOKEN_RE = re.compile(r'(\d{1,2})\s*([A-Za-z]{3})\s*(\d{2,4})')
# "Issuing Post Office" label, tolerant of OCR variants like "IssungPostOmce".
_ISSUING_LABEL_RE = re.compile(r'ISSU\w*\s*POST')
# "Valid Until" label, tolerant of OCR variants like "Vald Urt".
_VALID_UNTIL_LABEL_RE = re.compile(r'VALI?D\s*U(?:NTIL|RT)')


def _mentions(text_upper, substrings=(), words=()):
    """Return True if ``text_upper`` contains a substring or a stand-alone word.

    ``words`` must not be flanked by letters, so short tokens such as GEN, ID
    or AVE no longer fire inside names like EUGENE, DAVID or XAVIER. Digits
    and punctuation still count as boundaries ("585GEN." matches GEN).
    """
    if any(token in text_upper for token in substrings):
        return True
    return any(
        re.search(r'(?<![A-Z])' + re.escape(word) + r'(?![A-Z])', text_upper)
        for word in words
    )


def _extract_prn(line_upper):
    """Return the 10-15 digit number after PRN (or its OCR misreading PAN)."""
    match = (re.search(r'PRN\s*(\d{10,15})', line_upper)
             or re.search(r'PAN\s*(\d{10,15})', line_upper))
    return match.group(1) if match else None


def _extract_full_name(cleaned_lines, i):
    """Return the raw full name found at or right after line ``i``, else None."""
    line = cleaned_lines[i]
    line_upper = line.upper()
    has_first = 'FIRST NAME' in line_upper or 'FINT NAME' in line_upper
    has_surname = 'SURNAME' in line_upper or 'SUMAME' in line_upper

    # Case 1: this line is the full label; gather name parts from the next
    # four lines, stopping at the first address/other-field line.
    if has_first and has_surname:
        name_parts = []
        for candidate in cleaned_lines[i + 1:i + 5]:
            if _mentions(
                candidate.upper(),
                substrings=('ADDRESS', 'DATE', 'BIRTH', 'NATIONALITY', 'ISSUING',
                            'VALID', 'TUAZON', 'BLVD', 'BRGY', '585', 'PASAY'),
                words=('GEN',),
            ):
                break
            # Keep lines that look like a name part and are not card furniture.
            if (_NAME_LINE_RE.match(candidate) and len(candidate) > 1
                    and candidate not in ('ID', 'C', 'P', 'POSTAL')):
                name_parts.append(candidate)
        return ' '.join(name_parts) if name_parts else None

    # Case 2: this line itself is a name part directly below a (partial) label.
    if i == 0 or len(line) <= 2 or not _NAME_LINE_RE.match(line):
        return None
    if _mentions(
        line_upper,
        substrings=('FIRST NAME', 'MIDDLE NAME', 'SURNAME', 'ADDRESS', 'DATE', 'BIRTH',
                    'NATIONALITY', 'ISSUING', 'VALID', 'POSTAL', 'IDENTITY', 'PHCPOST',
                    'PHILIPPINE', 'PREMIUM'),
        words=('CARD',),
    ):
        return None
    prev_upper = cleaned_lines[i - 1].upper()
    if not any(label in prev_upper for label in ('FIRST NAME', 'FINT NAME', 'SUMAME', 'SURNAME')):
        return None

    # Collect consecutive name parts (up to three more lines).
    name_parts = [line]
    for candidate in cleaned_lines[i + 1:i + 4]:
        is_name_part = (
            _NAME_LINE_RE.match(candidate)
            and len(candidate) > 2
            and not _mentions(
                candidate.upper(),
                substrings=('ADDRESS', 'DATE', 'BIRTH', 'TUAZON', 'BLVD', 'BRGY', '585',
                            'PASAY', 'IDENTITY', 'POSTAL', 'PREMIUM'),
                words=('GEN', 'ID'),
            )
        )
        if not is_name_part:
            break
        name_parts.append(candidate)

    # A lone line is only trusted as a full name when it has several words.
    if len(name_parts) >= 2 or len(line.split()) >= 2:
        return ' '.join(name_parts)
    return None


def _collect_address_lines(cleaned_lines, i):
    """Return the address lines around line ``i`` ([] if ``i`` is no address)."""
    line = cleaned_lines[i]
    looks_like_address = (
        _mentions(line.upper(),
                  substrings=('TUAZON', 'BLVD', 'BRGY', 'PASAY', 'CITY'),
                  words=('GEN',))
        or (re.match(r'^\d+', line) and len(line) > 2)
    )
    if not looks_like_address:
        return []

    address_lines = []
    # Check backwards a bit to see if we missed address start
    start_idx = max(0, i - 1)
    for addr_line in cleaned_lines[start_idx:start_idx + 7]:
        addr_upper = addr_line.upper()
        # Stop if we hit date, nationality, or other labels
        if _mentions(
            addr_upper,
            substrings=('DATE', 'BIRTH', 'NATIONALITY', 'FILIPINO', 'ISSUING', 'VALID', 'NOCON'),
            words=('PAN',),
        ):
            break
        # Skip very short non-numeric lines that are likely OCR noise
        if len(addr_line) <= 2 and not re.match(r'^\d+$', addr_line):
            continue
        if len(addr_line) <= 1:
            continue
        # A line qualifies if it starts with a number, carries an address
        # keyword, or follows a line that was already accepted.
        qualifies = (
            re.match(r'^\d+', addr_line)
            or _mentions(
                addr_upper,
                substrings=('TUAZON', 'BLVD', 'BRGY', 'PASAY', 'CITY', 'STREET',
                            'AVENUE', 'BOULEVARD'),
                words=('GEN', 'AVE'),
            )
            or address_lines
        )
        # Skip obvious OCR errors like "o00"
        if qualifies and addr_line.lower() not in ('o00', 'o0', '00'):
            address_lines.append(addr_line)
    return address_lines


def extract_postal_details(lines):
    """Extract Postal ID fields from OCR text lines.

    Each field is filled from the first line that yields it; later lines never
    overwrite an already-found value.

    Args:
        lines: Iterable of OCR text lines (converted with ``str`` and stripped;
            blank lines are ignored).

    Returns:
        A dict with keys ``id_type``, ``prn``, ``full_name``, ``address``,
        ``birth_date``, ``nationality``, ``issuing_post_office``,
        ``valid_until`` and ``success``. Missing fields are ``None``;
        ``success`` is True when a PRN or a full name was found.
    """
    details = {
        'id_type': 'Postal ID',
        'prn': None,
        'full_name': None,
        'address': None,
        'birth_date': None,
        'nationality': None,
        'issuing_post_office': None,
        'valid_until': None,
        'success': False
    }

    # Clean lines - convert to strings and strip
    cleaned_lines = [str(line).strip() for line in lines if str(line).strip()]

    for i, line in enumerate(cleaned_lines):
        line_upper = line.upper()
        next_line = cleaned_lines[i + 1] if i + 1 < len(cleaned_lines) else None
        prev_upper = cleaned_lines[i - 1].upper() if i > 0 else ''

        # PRN (Postal Registration Number), e.g. "PRN 100141234567 P POSTAL"
        if not details['prn']:
            details['prn'] = _extract_prn(line_upper)

        # Full name - combine separate name parts
        if not details['full_name']:
            details['full_name'] = _extract_full_name(cleaned_lines, i)

        # Address - street numbers, Gen., Blvd., Brgy., City
        if not details['address']:
            address_lines = _collect_address_lines(cleaned_lines, i)
            if address_lines:
                details['address'] = format_address(address_lines)

        # Date of birth: first date token that does not belong to the
        # "Valid Until" field. The expiry value sits on the line *after* its
        # label, so the previous line has to be checked as well.
        if not details['birth_date']:
            date_match = _DATE_TOKEN_RE.search(line)
            belongs_to_validity = (
                'VALID' in line_upper
                or 'UNTIL' in line_upper
                or _VALID_UNTIL_LABEL_RE.search(line_upper)
                or _VALID_UNTIL_LABEL_RE.search(prev_upper)
            )
            if date_match and not belongs_to_validity:
                details['birth_date'] = ' '.join(date_match.groups())

        # Nationality: the bare word "FILIPINO", or the line under the label
        if not details['nationality']:
            if line_upper == 'FILIPINO':
                details['nationality'] = 'Filipino'
            elif 'NATIONALITY' in line_upper and next_line and len(next_line) < 20:
                details['nationality'] = next_line

        # Issuing post office: the line under the label
        if not details['issuing_post_office'] and _ISSUING_LABEL_RE.search(line_upper):
            if next_line and len(next_line) < 20:
                # Fix OCR errors: MNL.QE -> MNL-QE
                details['issuing_post_office'] = next_line.replace('.', '-')

        # Valid until: the line under the label
        if not details['valid_until'] and _VALID_UNTIL_LABEL_RE.search(line_upper) and next_line:
            # Fix OCR errors: OlDec17 -> 01Dec17
            fixed = next_line.replace('Ol', '01').replace('O1', '01')
            fixed = fixed.replace('O0', '00').replace('OO', '00')
            date_match = _DATE_TOKEN_RE.search(fixed)
            details['valid_until'] = ' '.join(date_match.groups()) if date_match else fixed

    # Format extracted fields
    if details['full_name']:
        details['full_name'] = format_name(details['full_name'])
    if details['birth_date']:
        details['birth_date'] = format_date(details['birth_date'])
    if details['valid_until']:
        details['valid_until'] = format_date(details['valid_until'])

    if details['prn'] or details['full_name']:
        details['success'] = True
    return details
def extract_ocr_lines(image_path):
    """Run PaddleOCR on ``image_path`` and parse the text into Postal ID fields.

    Args:
        image_path: Path of the image file to read.

    Returns:
        ``{'success': False, 'error': 'File not found'}`` when the file does
        not exist; otherwise the dict produced by ``extract_postal_details``
        (with ``success`` False and all fields ``None`` when no text was
        recognised).

    Raises:
        Exception: Whatever PaddleOCR raises while constructing the engine, or
            when both ``ocr()`` and the ``predict()`` fallback fail.
    """
    # Check if file exists
    if not os.path.exists(image_path):
        return {'success': False, 'error': 'File not found'}

    file_size = os.path.getsize(image_path)
    print(f"DEBUG: Image file size: {file_size} bytes", file=sys.stderr)

    with redirect_stdout(sys.stderr), redirect_stderr(sys.stderr):
        # Try simple configuration first
        ocr = PaddleOCR(
            use_doc_orientation_classify=False,
            use_doc_unwarping=False,
            use_textline_orientation=False,
            lang='en'
        )
        try:
            results = ocr.ocr(image_path)
        except Exception as e:
            print(f"DEBUG: ocr() failed: {e}, trying predict()", file=sys.stderr)
            results = ocr.predict(image_path) if hasattr(ocr, 'predict') else None

    # Debug: Print raw results structure
    print(f"DEBUG: Raw OCR results type: {type(results)}", file=sys.stderr)

    all_text = []
    try:
        # Handle both old format (nested lists) and new format (dict-like
        # result objects exposing a 'rec_texts' entry).
        if results and isinstance(results, list):
            first_item = results[0]
            # Duck-type on the mapping interface instead of the class name so
            # plain dict-like results are recognised too.
            if hasattr(first_item, 'keys'):
                print(f"DEBUG: Detected OCRResult object format (type: {type(first_item).__name__})", file=sys.stderr)
                try:
                    for page in results:
                        if not hasattr(page, 'keys'):
                            continue
                        rec_texts = dict(page).get('rec_texts')
                        if isinstance(rec_texts, (list, tuple)):
                            all_text.extend(str(t) for t in rec_texts if t)
                    print(f"DEBUG: Extracted {len(all_text)} text lines from rec_texts", file=sys.stderr)
                except Exception as e:
                    print(f"DEBUG: Error accessing OCRResult: {e}", file=sys.stderr)
            else:
                # Old format - list of [box, (text, confidence)] entries,
                # usually wrapped in one outer list per image.
                entries = first_item if isinstance(first_item, list) else results
                for item in entries:
                    if isinstance(item, (list, tuple)) and len(item) >= 2:
                        meta = item[1]
                        if isinstance(meta, (list, tuple)) and len(meta) >= 1:
                            all_text.append(str(meta[0]))
    except Exception as e:
        print(f"DEBUG: Error processing OCR results: {str(e)}", file=sys.stderr)
        import traceback
        print(f"DEBUG: Traceback: {traceback.format_exc()}", file=sys.stderr)

    print(f"DEBUG: Extracted text lines: {all_text}", file=sys.stderr)

    # For an empty list extract_postal_details returns the all-None,
    # success=False record, so no separate fallback dict is needed.
    return extract_postal_details(all_text)
# Main Execution
# Usage: <script> <image_url>
# Exactly one JSON document is written to the real stdout; all diagnostics go
# to stderr. The exit status is 1 on a usage error or any processing failure.
if len(sys.argv) < 2:
    sys.stdout = original_stdout
    print(json.dumps({"success": False, "error": "No image URL provided"}))
    sys.exit(1)

image_url = sys.argv[1]
print(f"DEBUG: Processing Postal ID image URL: {image_url}", file=sys.stderr)

try:
    image_path = download_image(image_url, 'temp_postal_image.jpg')
    print(f"DEBUG: Image downloaded to: {image_path}", file=sys.stderr)

    ocr_results = extract_ocr_lines(image_path)
    print(f"DEBUG: OCR results: {ocr_results}", file=sys.stderr)

    response = {
        "success": ocr_results['success'],
        "data": ocr_results
    }
    # Restore the real stdout only now, so nothing but the JSON reaches it.
    sys.stdout = original_stdout
    sys.stdout.write(json.dumps(response))
    sys.stdout.flush()
except Exception as e:
    sys.stdout = original_stdout
    sys.stdout.write(json.dumps({"success": False, "error": str(e)}))
    sys.stdout.flush()
    sys.exit(1)
finally:
    # Single clean-up point for the temp image; runs on success and failure.
    # A clean-up problem is reported on stderr but never changes the result
    # that has already been written.
    try:
        os.remove('temp_postal_image.jpg')
    except FileNotFoundError:
        pass
    except OSError as e:
        print(f"DEBUG: Could not remove temp image: {e}", file=sys.stderr)