# handyhome-ocr-api / extract_postal.py
# takomattyy's picture
# Upload 10 files
# 6916300 verified
import sys, json, os, glob, requests
import re
import time
from contextlib import redirect_stdout, redirect_stderr
from datetime import datetime
# Immediately redirect all output to stderr except for our final JSON.
# From here on, anything written via print()/sys.stdout lands on stderr;
# the real stdout is kept in `original_stdout` so the script can restore it
# right before emitting its JSON result.
original_stdout = sys.stdout
sys.stdout = sys.stderr
# Environment variables set before the paddleocr import that follows.
# NOTE(review): presumably PADDLEOCR_LOG_LEVEL quiets PaddleOCR's logging and
# QT_QPA_PLATFORM / DISPLAY let GUI-dependent libraries load on a headless
# host — confirm that these variables are actually honoured by the installed
# versions.
os.environ['PADDLEOCR_LOG_LEVEL'] = 'ERROR'
os.environ['QT_QPA_PLATFORM'] = 'offscreen'
os.environ['DISPLAY'] = ':99'
# Import PaddleOCR after setting environment variables
from paddleocr import PaddleOCR
def download_image(url, output_path='temp_postal_image.jpg'):
    """Download the image at ``url`` and write it to ``output_path``.

    A ``t=<unix timestamp>`` query parameter is appended to the URL and
    no-cache request headers are sent, so intermediaries are asked for a
    fresh copy. Any file already present at ``output_path`` is deleted first.

    Args:
        url: Address of the image to fetch.
        output_path: Where the downloaded bytes are stored.

    Returns:
        ``output_path``.

    Raises:
        requests.HTTPError: When the server answers with an error status
            (via ``raise_for_status``). Other ``requests`` / OS errors
            propagate unchanged.
    """
    # Never leave a stale image from a previous run in place.
    if os.path.exists(output_path):
        os.remove(output_path)

    # Cache-busting: extend an existing query string, or start a new one.
    separator = '&' if '?' in url else '?'
    busted_url = f'{url}{separator}t={int(time.time())}'

    no_cache_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Cache-Control': 'no-cache, no-store, must-revalidate',
        'Pragma': 'no-cache',
        'Expires': '0',
    }

    reply = requests.get(busted_url, headers=no_cache_headers, timeout=30)
    reply.raise_for_status()

    with open(output_path, 'wb') as image_file:
        image_file.write(reply.content)
    return output_path
def format_date(date_str):
    """Normalize an OCR-read date string to ``YYYY-MM-DD``.

    Handles the compact card format ``"14 Aug 88"`` / ``"14 Aug88"`` /
    ``"OlDec17"`` first, then falls back to a list of common
    ``datetime.strptime`` formats.

    Two-digit years are expanded with a pivot at 50: ``51``-``99`` become
    ``19xx``, ``00``-``50`` become ``20xx``.

    Args:
        date_str: Raw date text, or a falsy value.

    Returns:
        ``None`` for falsy input; the ISO date string when parsing succeeds;
        otherwise the OCR-cleaned input string unchanged.
    """
    if not date_str:
        return None
    date_str = date_str.strip()

    # Fix common OCR confusions in the numeric parts: letter O for zero and
    # lowercase L for one.
    date_str = date_str.replace('Ol', '01').replace('O1', '01').replace('O0', '00').replace('OO', '00')
    date_str = date_str.replace('l', '1')  # lowercase L -> 1
    # The blanket l -> 1 rewrite above also hits the genuine "l" in
    # Jul / July / April, which would make those months unparseable.
    # Put it back (this also repairs an OCR-produced "Ju1").
    date_str = re.sub(r'(?i)(ju|apri)1', r'\1l', date_str)

    # Card format: day, three-letter month, 2-4 digit year; the spaces are
    # optional because OCR frequently drops them.
    match = re.match(r'(\d{1,2})\s*([A-Za-z]{3})\s*(\d{2,4})', date_str)
    if match:
        day, month_str, year = match.groups()
        try:
            if len(year) == 2:
                year = f"19{year}" if int(year) > 50 else f"20{year}"
            month = datetime.strptime(month_str, '%b').month
            return f"{year}-{month:02d}-{int(day):02d}"
        except ValueError as e:
            # Unknown month abbreviation: fall through to the generic formats.
            print(f"DEBUG: Date parsing error: {e}", file=sys.stderr)

    # Try other common formats
    for fmt in ("%d %b %Y", "%d %B %Y", "%Y-%m-%d", "%m/%d/%Y", "%d/%m/%Y", "%d%b%Y", "%d%B%Y"):
        try:
            return datetime.strptime(date_str, fmt).strftime("%Y-%m-%d")
        except ValueError:
            continue
    return date_str
def format_name(name):
    """Return ``name`` with whitespace collapsed and each word capitalized.

    Every whitespace-separated word goes through ``str.capitalize`` (first
    character upper-cased, the rest lower-cased) and the words are re-joined
    with single spaces.

    Args:
        name: Raw name text, or a falsy value.

    Returns:
        ``None`` for falsy input, otherwise the normalized name (an empty
        string when the input is whitespace only).
    """
    if not name:
        return None
    # split() without arguments already discards leading/trailing whitespace
    # and collapses internal runs, so one pass is enough.
    return ' '.join(word.capitalize() for word in name.split())
def format_address(address_lines):
    """Merge OCR address fragments into one single-spaced string.

    Blank fragments are dropped, the rest are stripped and joined with
    spaces. A space is then inserted wherever a digit run is directly
    followed by an uppercase letter ("585Gen." -> "585 Gen.") and wherever a
    lowercase letter is directly followed by an uppercase one
    ("TuazonBlvd" -> "Tuazon Blvd"). A boundary separated by a period, such
    as "Brgy.Rivera", is left as is.

    Args:
        address_lines: Sequence of address fragments, or a falsy value.

    Returns:
        ``None`` for falsy input, otherwise the cleaned address.
    """
    if not address_lines:
        return None

    stripped_parts = (part.strip() for part in address_lines)
    address = ' '.join(part for part in stripped_parts if part)

    # Each pattern marks a spot where OCR glued two tokens together.
    for glued in (r'(\d+)([A-Z])', r'([a-z])([A-Z])'):
        address = re.sub(glued, r'\1 \2', address)

    # Collapse any whitespace runs that remain.
    return ' '.join(address.split())
def extract_postal_details(lines):
    """Pull Postal ID fields out of a list of OCR text lines.

    Every entry of ``lines`` is converted with ``str()`` and stripped; blank
    entries are dropped. The remaining lines are scanned once, top to bottom.
    Each field is guarded by ``if not details[...]``, so the first value found
    for a field wins and later lines cannot overwrite it.

    Args:
        lines: Iterable of OCR results (one entry per recognized text line).

    Returns:
        dict with the keys ``id_type``, ``prn``, ``full_name``, ``address``,
        ``birth_date``, ``nationality``, ``issuing_post_office``,
        ``valid_until`` and ``success``. Fields that were not found stay
        ``None``. ``full_name`` is passed through ``format_name`` and both
        dates through ``format_date``. ``success`` is ``True`` when a PRN or
        a full name was found.
    """
    details = {
        'id_type': 'Postal ID',
        'prn': None,
        'full_name': None,
        'address': None,
        'birth_date': None,
        'nationality': None,
        'issuing_post_office': None,
        'valid_until': None,
        'success': False
    }
    # Clean lines - convert to strings and strip
    cleaned_lines = [str(line).strip() for line in lines if str(line).strip()]
    for i, line in enumerate(cleaned_lines):
        line_upper = line.upper().strip()
        line_stripped = line.strip()
        # Extract PRN (Postal Registration Number)
        # Format: "PRN 100141234567 P POSTAL" or "PRN100141234567P" or "PAN100141234567P" (OCR might misread PRN as PAN)
        if not details['prn']:
            # Look for PRN followed by 10-15 digits (may have P POSTAL after)
            prn_match = re.search(r'PRN\s*(\d{10,15})', line_upper)
            if prn_match:
                details['prn'] = prn_match.group(1)
            # Also check for PAN (common OCR error where PRN is misread as PAN)
            # NOTE(review): the same PAN pattern is searched twice (once in the
            # elif condition, once below); the inner `if pan_match` is always true here.
            elif re.search(r'PAN\s*(\d{10,15})', line_upper):
                pan_match = re.search(r'PAN\s*(\d{10,15})', line_upper)
                if pan_match:
                    details['prn'] = pan_match.group(1)
        # Extract Full Name - combine separate name parts
        # Look for label "First Name Middle Name Surname, Suffix" or name parts
        if not details['full_name']:
            # Check if this line is the label ("FINT NAME" / "SUMAME" are OCR misreads of the label text)
            if ("FIRST NAME" in line_upper or "FINT NAME" in line_upper) and ("SURNAME" in line_upper or "SUMAME" in line_upper):
                # Collect name parts from up to the next four lines
                name_parts = []
                for j in range(1, min(5, len(cleaned_lines) - i)):
                    next_line = cleaned_lines[i+j].strip()
                    next_upper = next_line.upper()
                    # Stop if we hit address or other labels
                    # NOTE(review): 'TUAZON', '585' and 'PASAY' look like values from one
                    # sample card rather than general markers — confirm. These are substring
                    # tests, so e.g. 'GEN' also stops on a name such as "EUGENE".
                    if any(label in next_upper for label in ['ADDRESS', 'DATE', 'BIRTH', 'NATIONALITY', 'ISSUING', 'VALID', 'GEN', 'TUAZON', 'BLVD', 'BRGY', '585', 'PASAY']):
                        break
                    # Add if it looks like a name part (uppercase letters, whitespace and commas only, longer than one character)
                    if next_line and re.match(r'^[A-Z\s,]+$', next_line) and len(next_line) > 1:
                        # Skip if it's clearly not a name (like "ID", "C", etc.)
                        if next_line not in ['ID', 'C', 'P', 'POSTAL']:
                            name_parts.append(next_line)
                if name_parts:
                    details['full_name'] = ' '.join(name_parts)
            # Also check if line is a name part (all caps, not a label)
            elif re.match(r'^[A-Z\s,]+$', line_stripped) and len(line_stripped) > 2:
                # Make sure it's not a label or common words
                if not any(label in line_upper for label in ['FIRST NAME', 'MIDDLE NAME', 'SURNAME', 'ADDRESS', 'DATE', 'BIRTH', 'NATIONALITY', 'ISSUING', 'VALID', 'POSTAL', 'IDENTITY', 'CARD', 'PHCPOST', 'PHILIPPINE', 'PREMIUM']):
                    # Check if previous line is the name label
                    if i > 0:
                        prev_line = cleaned_lines[i-1].strip().upper()
                        if "FIRST NAME" in prev_line or "FINT NAME" in prev_line or "SUMAME" in prev_line or "SURNAME" in prev_line:
                            # Collect consecutive name parts (this line plus up to three following lines)
                            name_parts = [line_stripped]
                            for j in range(1, min(4, len(cleaned_lines) - i)):
                                next_line = cleaned_lines[i+j].strip()
                                # NOTE(review): 'ID' and 'GEN' are substring tests, so a name part
                                # such as "DAVID" ends the collection here — confirm this is intended.
                                if (next_line and re.match(r'^[A-Z\s,]+$', next_line) and
                                        len(next_line) > 2 and
                                        not any(label in next_line.upper() for label in ['ADDRESS', 'DATE', 'BIRTH', 'GEN', 'TUAZON', 'BLVD', 'BRGY', '585', 'PASAY', 'ID', 'POSTAL', 'PREMIUM'])):
                                    name_parts.append(next_line)
                                else:
                                    break
                            # Accept either several parts, or a single line that already holds at least two words
                            if len(name_parts) >= 2:
                                details['full_name'] = ' '.join(name_parts)
                            elif len(name_parts) == 1 and len(name_parts[0].split()) >= 2:
                                details['full_name'] = name_parts[0]
        # Extract Address - look for address parts (street numbers, Gen., Blvd., Brgy., City)
        if not details['address']:
            # Look for address indicators, or a line longer than two characters that starts with a digit
            # NOTE(review): any digit-leading line qualifies, which includes date lines
            # such as "14 Aug88" — verify against real OCR output.
            if any(indicator in line_upper for indicator in ['GEN', 'TUAZON', 'BLVD', 'BRGY', 'PASAY', 'CITY']) or (re.match(r'^\d+', line_stripped) and len(line_stripped) > 2):
                address_lines = []
                # Check backwards a bit to see if we missed address start
                start_idx = max(0, i - 1)
                # Collect address lines forward (at most seven lines, starting at start_idx)
                for j in range(0, min(7, len(cleaned_lines) - start_idx)):
                    idx = start_idx + j
                    if idx >= len(cleaned_lines):
                        break
                    addr_line = cleaned_lines[idx].strip()
                    addr_upper = addr_line.upper()
                    # Stop if we hit date, nationality, or other labels
                    if any(label in addr_upper for label in ['DATE', 'BIRTH', 'NATIONALITY', 'FILIPINO', 'ISSUING', 'VALID', 'PAN', 'NOCON']):
                        break
                    # Skip lines of one or two characters unless they are purely digits
                    if len(addr_line) <= 2 and not re.match(r'^\d+$', addr_line):
                        continue
                    # Add if it looks like address content
                    if addr_line and len(addr_line) > 1:
                        # Check if it's a number, street name, barangay, city, etc.
                        if (re.match(r'^\d+', addr_line) or
                                any(indicator in addr_upper for indicator in ['GEN', 'TUAZON', 'BLVD', 'BRGY', 'PASAY', 'CITY', 'STREET', 'AVE', 'BOULEVARD']) or
                                len(address_lines) > 0):  # Continue if we've started collecting
                            # Skip obvious OCR errors like "o00"
                            if addr_line.lower() not in ['o00', 'o0', '00']:
                                address_lines.append(addr_line)
                if address_lines:
                    details['address'] = format_address(address_lines)
        # Extract Date of Birth - handle OCR errors
        if not details['birth_date']:
            # Look for date patterns: "14 Aug88" or "14 Aug 88"
            date_match = re.search(r'(\d{1,2})\s*([A-Za-z]{3})\s*(\d{2,4})', line_stripped)
            if date_match:
                # Check if it's not the valid until date.
                # Only the current line is inspected for VALID/UNTIL; the validity date
                # itself is read from the line after its label (see below), so that
                # following line is not excluded by this test.
                if "VALID" not in line_upper and "UNTIL" not in line_upper:
                    # Fix spacing
                    day, month, year = date_match.groups()
                    details['birth_date'] = f"{day} {month} {year}"
        # Extract Nationality
        if not details['nationality']:
            if "NATIONALITY" in line_upper or line_upper == "FILIPINO":
                if line_upper == "FILIPINO":
                    details['nationality'] = "Filipino"
                elif i + 1 < len(cleaned_lines):
                    # The value is taken from the line following the label, if shorter than 20 characters
                    next_line = cleaned_lines[i+1].strip()
                    if next_line and len(next_line) < 20:
                        details['nationality'] = next_line
        # Extract Issuing Post Office - handle OCR errors like "IssungPostOmce"
        if not details['issuing_post_office']:
            if ("ISSUING POST OFFICE" in line_upper or "ISSUING POST" in line_upper or
                    "ISSUINGPOST" in line_upper or "ISSUINGPOSTOMCE" in line_upper):
                if i + 1 < len(cleaned_lines):
                    next_line = cleaned_lines[i+1].strip()
                    if next_line and len(next_line) < 20:
                        # Fix OCR errors: MNL.QE -> MNL-QE
                        next_line = next_line.replace('.', '-')
                        details['issuing_post_office'] = next_line
        # Extract Valid Until - handle OCR errors like "Vald Urt" and "OlDec17"
        if not details['valid_until']:
            if ("VALID UNTIL" in line_upper or "VALIDUNTIL" in line_upper or
                    "VALD URT" in line_upper or "VALDURT" in line_upper):
                if i + 1 < len(cleaned_lines):
                    next_line = cleaned_lines[i+1].strip()
                    # Fix OCR errors: OlDec17 -> 01 Dec 17
                    # Replace common OCR errors
                    next_line = next_line.replace('Ol', '01').replace('O1', '01')
                    next_line = next_line.replace('O0', '00').replace('OO', '00')
                    # Try to extract date pattern; otherwise keep the raw next line
                    date_match = re.search(r'(\d{1,2})\s*([A-Za-z]{3})\s*(\d{2,4})', next_line)
                    if date_match:
                        day, month, year = date_match.groups()
                        details['valid_until'] = f"{day} {month} {year}"
                    elif next_line:
                        details['valid_until'] = next_line
    # Format extracted fields
    if details['full_name']:
        details['full_name'] = format_name(details['full_name'])
    if details['birth_date']:
        details['birth_date'] = format_date(details['birth_date'])
    if details['valid_until']:
        details['valid_until'] = format_date(details['valid_until'])
    # A PRN or a name is enough to call the extraction successful
    if details['prn'] or details['full_name']:
        details['success'] = True
    return details
def extract_ocr_lines(image_path):
    """Run PaddleOCR on ``image_path`` and parse the text as a Postal ID.

    The recognizer is called through ``ocr.ocr()``; if that raises and the
    object has a ``predict`` method, ``predict()`` is tried instead. Two
    result layouts are understood:

    * a list whose first element is an OCRResult-like mapping that carries a
      ``rec_texts`` list, and
    * the nested-list layout, where each entry's second element is a
      sequence whose first item is the text.

    Args:
        image_path: Path of the image file to read.

    Returns:
        ``{'success': False, 'error': 'File not found'}`` when the file does
        not exist; otherwise the dict built by ``extract_postal_details``,
        or an all-``None`` result with ``success`` False when no text was
        recognized.
    """
    if not os.path.exists(image_path):
        return {'success': False, 'error': 'File not found'}

    file_size = os.path.getsize(image_path)
    print(f"DEBUG: Image file size: {file_size} bytes", file=sys.stderr)

    # Whatever the OCR engine prints while loading/running goes to stderr.
    with redirect_stdout(sys.stderr), redirect_stderr(sys.stderr):
        ocr = PaddleOCR(
            use_doc_orientation_classify=False,
            use_doc_unwarping=False,
            use_textline_orientation=False,
            lang='en',
        )
        try:
            results = ocr.ocr(image_path)
        except Exception as e:
            print(f"DEBUG: ocr() failed: {e}, trying predict()", file=sys.stderr)
            results = ocr.predict(image_path) if hasattr(ocr, 'predict') else None

    print(f"DEBUG: Raw OCR results type: {type(results)}", file=sys.stderr)

    all_text = []
    try:
        if results and isinstance(results, list):
            head = results[0]
            item_type_name = type(head).__name__
            looks_like_ocr_result = (
                'OCRResult' in item_type_name
                or 'ocr_result' in str(type(head)).lower()
            )
            if looks_like_ocr_result:
                print(f"DEBUG: Detected OCRResult object format (type: {item_type_name})", file=sys.stderr)
                # Mapping-style result: the texts live under 'rec_texts'.
                try:
                    if hasattr(head, 'keys'):
                        rec_texts = dict(head).get('rec_texts')
                        if isinstance(rec_texts, list):
                            all_text = [str(t) for t in rec_texts if t]
                            print(f"DEBUG: Extracted {len(all_text)} text lines from rec_texts", file=sys.stderr)
                except Exception as e:
                    print(f"DEBUG: Error accessing OCRResult: {e}", file=sys.stderr)
            else:
                # Nested-list result: entries sit one level down when the
                # first element is itself a list.
                entries = head if isinstance(head, list) else results
                for entry in entries:
                    if not (isinstance(entry, (list, tuple)) and len(entry) >= 2):
                        continue
                    meta = entry[1]
                    if isinstance(meta, (list, tuple)) and len(meta) >= 1:
                        all_text.append(str(meta[0]))
    except Exception as e:
        print(f"DEBUG: Error processing OCR results: {str(e)}", file=sys.stderr)
        import traceback
        print(f"DEBUG: Traceback: {traceback.format_exc()}", file=sys.stderr)

    print(f"DEBUG: Extracted text lines: {all_text}", file=sys.stderr)

    if all_text:
        return extract_postal_details(all_text)
    return {
        'id_type': 'Postal ID',
        'prn': None,
        'full_name': None,
        'address': None,
        'birth_date': None,
        'nationality': None,
        'issuing_post_office': None,
        'valid_until': None,
        'success': False,
    }
# Main Execution
# Usage: <script> <image_url>
# Exactly one JSON document is written to the real stdout; all diagnostics go
# to stderr. The exit status is 1 on a missing argument or any failure.
if len(sys.argv) < 2:
    sys.stdout = original_stdout
    print(json.dumps({"success": False, "error": "No image URL provided"}))
    sys.exit(1)

image_url = sys.argv[1]
print(f"DEBUG: Processing Postal ID image URL: {image_url}", file=sys.stderr)

try:
    image_path = download_image(image_url, 'temp_postal_image.jpg')
    print(f"DEBUG: Image downloaded to: {image_path}", file=sys.stderr)
    ocr_results = extract_ocr_lines(image_path)
    print(f"DEBUG: OCR results: {ocr_results}", file=sys.stderr)
    # The temporary image is removed in the `finally` clause only. Deleting it
    # here as well meant that a failed delete was reported as an OCR failure
    # even though extraction had already succeeded.
    response = {
        "success": ocr_results['success'],
        "data": ocr_results
    }
    sys.stdout = original_stdout
    sys.stdout.write(json.dumps(response))
    sys.stdout.flush()
except Exception as e:
    # Top-level boundary: report any failure as JSON on the real stdout.
    sys.stdout = original_stdout
    sys.stdout.write(json.dumps({"success": False, "error": str(e)}))
    sys.stdout.flush()
    sys.exit(1)
finally:
    # Best-effort cleanup. Only filesystem errors are ignored, so SystemExit
    # and KeyboardInterrupt are never swallowed here.
    try:
        if os.path.exists('temp_postal_image.jpg'):
            os.remove('temp_postal_image.jpg')
    except OSError:
        pass