Spaces:

takomattyy
/

handyhome-ocr-api

Sleeping

App Files Files Community

handyhome-ocr-api / extract_postal.py

takomattyy

Upload 10 files

6916300 verified 21 days ago

raw

history blame contribute delete

19.2 kB

	import sys, json, os, glob, requests
	import re
	import time
	from contextlib import redirect_stdout, redirect_stderr
	from datetime import datetime

	# Keep a handle on the real stdout, then point sys.stdout at stderr so that
	# any stray print (ours or a library's) lands on stderr. The script restores
	# original_stdout only when it writes its final JSON document.
	original_stdout = sys.stdout
	sys.stdout = sys.stderr

	# Environment variables set before PaddleOCR is imported (see the import below).
	# NOTE(review): presumably these quiet PaddleOCR logging and let Qt/X11-dependent
	# code run without a display - confirm against the libraries that read them.
	os.environ['PADDLEOCR_LOG_LEVEL'] = 'ERROR'
	os.environ['QT_QPA_PLATFORM'] = 'offscreen'
	os.environ['DISPLAY'] = ':99'

	# Import PaddleOCR after setting environment variables; this ordering is
	# deliberate, so do not hoist the import to the top of the file.
	from paddleocr import PaddleOCR

	def download_image(url, output_path='temp_postal_image.jpg'):
	    """Download ``url`` and write the response body to ``output_path``.

	    A ``t=<unix time>`` query parameter and no-cache request headers are
	    added so the fetch is not served from a cache. Any file already present
	    at ``output_path`` is deleted before the request is made.

	    Args:
	        url: Address of the image to fetch.
	        output_path: Where the downloaded bytes are stored.

	    Returns:
	        ``output_path``, unchanged.

	    Raises:
	        requests.HTTPError: via ``raise_for_status()`` on an error status.
	    """
	    # Start from a clean slate so a stale image is never mistaken for a new one.
	    if os.path.exists(output_path):
	        os.remove(output_path)

	    # Cache-busting parameter: join with '&' when a query string already exists.
	    joiner = '&' if '?' in url else '?'
	    url = url + f'{joiner}t={int(time.time())}'

	    # Browser-like user agent plus headers asking intermediaries not to cache.
	    request_headers = {
	        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
	        'Cache-Control': 'no-cache, no-store, must-revalidate',
	        'Pragma': 'no-cache',
	        'Expires': '0',
	    }

	    reply = requests.get(url, headers=request_headers, timeout=30)
	    reply.raise_for_status()
	    payload = reply.content

	    # Persist the raw bytes exactly as received.
	    with open(output_path, 'wb') as image_file:
	        image_file.write(payload)

	    return output_path

	def format_date(date_str):
	    """Normalise an OCR'd date string to ``YYYY-MM-DD``.

	    The primary form is day, month name, year ("14 Aug 88"), with or
	    without whitespace between the parts ("14 Aug88", "01Dec17") and with
	    either an abbreviated or a spelled-out month. A handful of other
	    ``strptime`` layouts are tried afterwards.

	    Args:
	        date_str: Raw date text, possibly containing OCR misreads.

	    Returns:
	        ``None`` for empty input; the ISO date when a layout is recognised;
	        otherwise the OCR-corrected input string, returned as-is.
	    """
	    if not date_str:
	        return None

	    date_str = date_str.strip()

	    # Fix common OCR errors first: capital O read in place of zero.
	    date_str = date_str.replace('Ol', '01').replace('O1', '01').replace('O0', '00').replace('OO', '00')
	    # Lowercase L -> 1, but leave the genuine "l" of Jul/July/April alone;
	    # a blanket replace would turn "Jul" into "Ju1" and the date would
	    # never parse.
	    date_str = re.sub(r'(?<![Jj][Uu])(?<![Aa][Pp][Rr][Ii])l', '1', date_str)

	    # Fixed English month table: independent of the process locale, unlike
	    # strptime('%b').
	    month_numbers = {
	        'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6,
	        'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12,
	    }

	    # "14 Aug 88", "14 Aug88", "01Dec17", "14 August 1988" -> ISO date.
	    # Whitespace is optional on both sides of the month name.
	    match = re.match(r'(\d{1,2})\s*([A-Za-z]{3,9})\s*(\d{2,4})', date_str)
	    if match:
	        day, month_str, year = match.groups()
	        # Only the first three letters identify the month.
	        month = month_numbers.get(month_str[:3].lower())
	        if month is None:
	            print(f"DEBUG: Date parsing error: unknown month {month_str!r}", file=sys.stderr)
	        else:
	            # Two-digit years: above 50 means 19xx, otherwise 20xx.
	            if len(year) == 2:
	                year = f"19{year}" if int(year) > 50 else f"20{year}"
	            return f"{year}-{month:02d}-{int(day):02d}"

	    # Try other common formats
	    for fmt in ("%d %b %Y", "%d %B %Y", "%Y-%m-%d", "%m/%d/%Y", "%d/%m/%Y", "%d%b%Y", "%d%B%Y"):
	        try:
	            return datetime.strptime(date_str, fmt).strftime("%Y-%m-%d")
	        except ValueError:
	            continue

	    # Nothing matched: hand back the (OCR-corrected) text unchanged.
	    return date_str

	def format_name(name):
	    """Normalise spacing and capitalisation of a personal name.

	    Runs of whitespace collapse to single spaces and each word is
	    capitalised. Two cases that ``str.capitalize`` alone gets wrong are
	    handled explicitly:

	    * generational suffixes II / III / IV stay upper case
	      ("JUAN CRUZ III" -> "Juan Cruz III", not "... Iii");
	    * every part of a hyphenated word is capitalised
	      ("SANTOS-REYES" -> "Santos-Reyes").

	    Args:
	        name: Raw name text.

	    Returns:
	        The formatted name, or ``None`` when ``name`` is empty or only
	        whitespace.
	    """
	    if not name:
	        return None

	    roman_suffixes = {'II', 'III', 'IV'}

	    def _format_word(word):
	        # Compare without trailing punctuation so "III," is still a suffix.
	        core = word.rstrip(',.')
	        if core.upper() in roman_suffixes:
	            return core.upper() + word[len(core):]
	        return '-'.join(part.capitalize() for part in word.split('-'))

	    # split() with no argument both trims and collapses whitespace.
	    formatted = ' '.join(_format_word(word) for word in name.split())
	    return formatted or None

	def format_address(address_lines):
	    """Join OCR'd address fragments into one cleaned-up line.

	    Besides joining and collapsing whitespace, three kinds of missing
	    space are restored:

	    * digit followed by a capital letter: "585Gen." -> "585 Gen."
	    * abbreviation period followed by a letter: "Brgy.Rivera" -> "Brgy. Rivera"
	    * lowercase followed by a capital letter: "PasayCity" -> "Pasay City"

	    Args:
	        address_lines: Sequence of address text fragments (strings).

	    Returns:
	        The formatted address, or ``None`` when there is nothing to format.
	    """
	    if not address_lines:
	        return None

	    # Join address lines, dropping fragments that are only whitespace.
	    address = ' '.join(line.strip() for line in address_lines if line.strip())

	    # Fix missing spaces: "585Gen." -> "585 Gen."
	    address = re.sub(r'(\d+)([A-Z])', r'\1 \2', address)

	    # Fix missing space after an abbreviation: "Brgy.Rivera" -> "Brgy. Rivera".
	    # At least two letters must precede the period so initialisms such as
	    # "P.O.Box" are left alone.
	    address = re.sub(r'(?<=[A-Za-z]{2})\.(?=[A-Za-z])', '. ', address)

	    # Fix words run together: "PasayCity" -> "Pasay City"
	    address = re.sub(r'([a-z])([A-Z])', r'\1 \2', address)

	    # Remove extra spaces
	    address = ' '.join(address.split())

	    return address or None

	# Day / three-letter month / year with optional whitespace between the parts,
	# so OCR output such as "14 Aug88" or "01Dec17" is still recognised.
	_DATE_PATTERN = re.compile(r'(\d{1,2})\s*([A-Za-z]{3})\s*(\d{2,4})')
	# A candidate name line: capital letters, whitespace and commas only.
	_NAME_LINE_PATTERN = re.compile(r'^[A-Z\s,]+$')

	# Label spellings, including misreads the OCR is known to produce.
	_NAME_LABEL_FIRST = ('FIRST NAME', 'FINT NAME')
	_NAME_LABEL_LAST = ('SURNAME', 'SUMAME')
	_VALID_UNTIL_LABELS = ('VALID UNTIL', 'VALIDUNTIL', 'VALD URT', 'VALDURT')
	_ISSUING_OFFICE_LABELS = ('ISSUING POST', 'ISSUINGPOST', 'ISSUNG POST', 'ISSUNGPOST')

	# Keyword lists driving the heuristics below.
	# NOTE(review): 'TUAZON', '585' and 'PASAY' look like values from one sample
	# card rather than general labels - kept for compatibility, confirm intent.
	_NAME_BLOCK_STOP = ('ADDRESS', 'DATE', 'BIRTH', 'NATIONALITY', 'ISSUING', 'VALID',
	                    'GEN', 'TUAZON', 'BLVD', 'BRGY', '585', 'PASAY')
	_NOT_A_NAME = ('FIRST NAME', 'MIDDLE NAME', 'SURNAME', 'ADDRESS', 'DATE', 'BIRTH',
	               'NATIONALITY', 'ISSUING', 'VALID', 'POSTAL', 'IDENTITY', 'CARD',
	               'PHCPOST', 'PHILIPPINE', 'PREMIUM')
	_NAME_PART_STOP = ('ADDRESS', 'DATE', 'BIRTH', 'GEN', 'TUAZON', 'BLVD', 'BRGY',
	                   '585', 'PASAY', 'ID', 'POSTAL', 'PREMIUM')
	_NAME_NOISE = ('ID', 'C', 'P', 'POSTAL')
	_ADDRESS_TRIGGERS = ('GEN', 'TUAZON', 'BLVD', 'BRGY', 'PASAY', 'CITY')
	_ADDRESS_HINTS = _ADDRESS_TRIGGERS + ('STREET', 'AVE', 'BOULEVARD')
	_ADDRESS_STOP = ('DATE', 'BIRTH', 'NATIONALITY', 'FILIPINO', 'ISSUING', 'VALID', 'PAN', 'NOCON')
	_ADDRESS_NOISE = ('o00', 'o0', '00')


	def _has_keyword(text_upper, keywords):
	    """Return True when any of ``keywords`` occurs in upper-cased ``text_upper``.

	    Keywords longer than four characters match as substrings, so labels the
	    OCR squashed together ("DATEOFBIRTH") are still caught. Shorter keywords
	    must stand alone - not flanked by further letters (or, for numeric
	    keywords, further digits) - so 'ID' no longer rejects "DAVID", 'CARD'
	    "RICARDO", 'GEN' "EUGENE" or 'PAN' "PANGASINAN".
	    """
	    for keyword in keywords:
	        if len(keyword) > 4:
	            if keyword in text_upper:
	                return True
	            continue
	        neighbour = r'\d' if keyword.isdigit() else '[A-Z]'
	        if re.search(f'(?<!{neighbour}){re.escape(keyword)}(?!{neighbour})', text_upper):
	            return True
	    return False


	def _is_valid_until_label(text_upper):
	    """Return True when ``text_upper`` is a (possibly misread) "Valid Until" label."""
	    return any(label in text_upper for label in _VALID_UNTIL_LABELS)


	def _find_prn(cleaned_lines, index):
	    """Return the 10-15 digit PRN on this line, or None ("PRN" may be misread as "PAN")."""
	    line_upper = cleaned_lines[index].upper()
	    for prefix in ('PRN', 'PAN'):
	        found = re.search(prefix + r'\s*(\d{10,15})', line_upper)
	        if found:
	            return found.group(1)
	    return None


	def _find_full_name(cleaned_lines, index):
	    """Return the raw full name anchored at this line, or None."""
	    line = cleaned_lines[index]
	    line_upper = line.upper()

	    # Case 1: this line is the "First Name Middle Name Surname, Suffix" label;
	    # gather name-shaped lines from the (up to four) lines that follow.
	    if (any(label in line_upper for label in _NAME_LABEL_FIRST)
	            and any(label in line_upper for label in _NAME_LABEL_LAST)):
	        parts = []
	        for candidate in cleaned_lines[index + 1:index + 5]:
	            # Stop if we hit address or other labels
	            if _has_keyword(candidate.upper(), _NAME_BLOCK_STOP):
	                break
	            if (_NAME_LINE_PATTERN.match(candidate) and len(candidate) > 1
	                    and candidate not in _NAME_NOISE):
	                parts.append(candidate)
	        return ' '.join(parts) if parts else None

	    # Case 2: this line is itself name-shaped and directly follows the label.
	    if not (_NAME_LINE_PATTERN.match(line) and len(line) > 2):
	        return None
	    if _has_keyword(line_upper, _NOT_A_NAME) or index == 0:
	        return None
	    previous_upper = cleaned_lines[index - 1].upper()
	    if not any(label in previous_upper for label in _NAME_LABEL_FIRST + _NAME_LABEL_LAST):
	        return None

	    # Collect consecutive name parts (at most three further lines).
	    parts = [line]
	    for candidate in cleaned_lines[index + 1:index + 4]:
	        if (_NAME_LINE_PATTERN.match(candidate) and len(candidate) > 2
	                and not _has_keyword(candidate.upper(), _NAME_PART_STOP)):
	            parts.append(candidate)
	        else:
	            break
	    if len(parts) >= 2:
	        return ' '.join(parts)
	    # A lone line only counts as a full name if it holds at least two words.
	    if len(parts[0].split()) >= 2:
	        return parts[0]
	    return None


	def _find_address(cleaned_lines, index):
	    """Return the formatted address starting around this line, or None."""
	    line = cleaned_lines[index]
	    starts_with_number = bool(re.match(r'^\d+', line)) and len(line) > 2
	    if not (_has_keyword(line.upper(), _ADDRESS_TRIGGERS) or starts_with_number):
	        return None

	    # Look one line back in case the address started just before the trigger,
	    # then scan at most seven lines forward.
	    start = max(0, index - 1)
	    collected = []
	    for candidate in cleaned_lines[start:start + 7]:
	        candidate_upper = candidate.upper()
	        # Stop if we hit date, nationality, or other labels
	        if _has_keyword(candidate_upper, _ADDRESS_STOP):
	            break
	        # One- or two-character non-numeric fragments are OCR noise; so is
	        # any single character.
	        if len(candidate) <= 2 and not re.match(r'^\d+$', candidate):
	            continue
	        if len(candidate) <= 1:
	            continue
	        # Once collecting has begun, every following line is taken too.
	        looks_like_address = (re.match(r'^\d+', candidate)
	                              or _has_keyword(candidate_upper, _ADDRESS_HINTS)
	                              or collected)
	        if looks_like_address and candidate.lower() not in _ADDRESS_NOISE:
	            collected.append(candidate)

	    return format_address(collected) if collected else None


	def _find_birth_date(cleaned_lines, index):
	    """Return "D Mon YY" for a date on this line, skipping the expiry date."""
	    line = cleaned_lines[index]
	    line_upper = line.upper()
	    if 'VALID' in line_upper or 'UNTIL' in line_upper:
	        return None
	    # The expiry date sits on the line after its label, so the label has to
	    # be looked for on the previous line; checking only the date line itself
	    # would let the expiry date through as a birth date.
	    if index > 0 and _is_valid_until_label(cleaned_lines[index - 1].upper()):
	        return None
	    found = _DATE_PATTERN.search(line)
	    if not found:
	        return None
	    day, month, year = found.groups()
	    return f"{day} {month} {year}"


	def _find_nationality(cleaned_lines, index):
	    """Return the nationality given on or right after this line, or None."""
	    line_upper = cleaned_lines[index].upper()
	    if line_upper == 'FILIPINO':
	        return 'Filipino'
	    if 'NATIONALITY' in line_upper and index + 1 < len(cleaned_lines):
	        value = cleaned_lines[index + 1]
	        if len(value) < 20:
	            return value
	    return None


	def _find_issuing_post_office(cleaned_lines, index):
	    """Return the post-office code on the line after the label, or None."""
	    line_upper = cleaned_lines[index].upper()
	    if not any(label in line_upper for label in _ISSUING_OFFICE_LABELS):
	        return None
	    if index + 1 >= len(cleaned_lines):
	        return None
	    value = cleaned_lines[index + 1]
	    if len(value) >= 20:
	        return None
	    # Fix OCR errors: MNL.QE -> MNL-QE
	    return value.replace('.', '-')


	def _find_valid_until(cleaned_lines, index):
	    """Return the expiry-date text on the line after the label, or None."""
	    if not _is_valid_until_label(cleaned_lines[index].upper()):
	        return None
	    if index + 1 >= len(cleaned_lines):
	        return None
	    # Fix OCR errors: OlDec17 -> 01Dec17 (capital O read for zero).
	    value = cleaned_lines[index + 1]
	    value = value.replace('Ol', '01').replace('O1', '01')
	    value = value.replace('O0', '00').replace('OO', '00')
	    found = _DATE_PATTERN.search(value)
	    if found:
	        day, month, year = found.groups()
	        return f"{day} {month} {year}"
	    # No recognisable date: pass the corrected text through for format_date.
	    return value


	def extract_postal_details(lines):
	    """Extract Postal ID fields from OCR text lines.

	    Every line is offered, in order, to one finder per field; a field keeps
	    the first value found for it. Name and both dates are then passed
	    through ``format_name`` / ``format_date``.

	    Args:
	        lines: Iterable of OCR text fragments; each item is converted with
	            ``str()`` and blank items are dropped. ``None`` is treated as
	            no lines.

	    Returns:
	        A dict with keys ``id_type``, ``prn``, ``full_name``, ``address``,
	        ``birth_date``, ``nationality``, ``issuing_post_office``,
	        ``valid_until`` and ``success``. ``success`` is True when a PRN or
	        a full name was found.
	    """
	    details = {
	        'id_type': 'Postal ID',
	        'prn': None,
	        'full_name': None,
	        'address': None,
	        'birth_date': None,
	        'nationality': None,
	        'issuing_post_office': None,
	        'valid_until': None,
	        'success': False
	    }

	    # Clean lines - convert to strings, strip, and drop the empty ones.
	    cleaned_lines = [text for text in (str(line).strip() for line in (lines or [])) if text]

	    finders = (
	        ('prn', _find_prn),
	        ('full_name', _find_full_name),
	        ('address', _find_address),
	        ('birth_date', _find_birth_date),
	        ('nationality', _find_nationality),
	        ('issuing_post_office', _find_issuing_post_office),
	        ('valid_until', _find_valid_until),
	    )
	    for index in range(len(cleaned_lines)):
	        for field, finder in finders:
	            if not details[field]:
	                details[field] = finder(cleaned_lines, index)

	    # Format extracted fields
	    if details['full_name']:
	        details['full_name'] = format_name(details['full_name'])
	    if details['birth_date']:
	        details['birth_date'] = format_date(details['birth_date'])
	    if details['valid_until']:
	        details['valid_until'] = format_date(details['valid_until'])

	    if details['prn'] or details['full_name']:
	        details['success'] = True

	    return details

	def extract_ocr_lines(image_path):
	    """OCR the image at ``image_path`` and parse the text as a Postal ID.

	    PaddleOCR is run with stdout/stderr redirected to stderr. ``ocr()`` is
	    tried first; if it raises, ``predict()`` is used when the engine has it.
	    Two result layouts are understood: a list whose first item is an
	    ``OCRResult``-like mapping carrying ``rec_texts``, and a list of
	    ``[box, (text, ...)]`` rows (optionally nested one level).

	    Args:
	        image_path: Path of the image file to read.

	    Returns:
	        ``{'success': False, 'error': 'File not found'}`` when the file is
	        missing; otherwise the dict from ``extract_postal_details``, or an
	        all-``None`` details dict with ``success`` False when no text was
	        recognised.
	    """
	    # Check if file exists
	    if not os.path.exists(image_path):
	        return {'success': False, 'error': 'File not found'}

	    file_size = os.path.getsize(image_path)
	    print(f"DEBUG: Image file size: {file_size} bytes", file=sys.stderr)

	    with redirect_stdout(sys.stderr), redirect_stderr(sys.stderr):
	        # Try simple configuration first
	        engine = PaddleOCR(
	            use_doc_orientation_classify=False,
	            use_doc_unwarping=False,
	            use_textline_orientation=False,
	            lang='en'
	        )
	        try:
	            results = engine.ocr(image_path)
	        except Exception as e:
	            print(f"DEBUG: ocr() failed: {e}, trying predict()", file=sys.stderr)
	            results = engine.predict(image_path) if hasattr(engine, 'predict') else None

	    # Debug: Print raw results structure
	    print(f"DEBUG: Raw OCR results type: {type(results)}", file=sys.stderr)

	    all_text = []
	    try:
	        # Handle both old format (list) and new format (OCRResult object)
	        if results and isinstance(results, list):
	            head = results[0]
	            head_type_name = type(head).__name__
	            looks_like_ocr_result = (
	                'OCRResult' in head_type_name
	                or 'ocr_result' in str(type(head)).lower()
	            )

	            if looks_like_ocr_result:
	                print(f"DEBUG: Detected OCRResult object format (type: {head_type_name})", file=sys.stderr)
	                # Access OCRResult as dictionary and read its rec_texts list
	                try:
	                    if hasattr(head, 'keys'):
	                        recognised = dict(head).get('rec_texts')
	                        if isinstance(recognised, list):
	                            all_text = [str(text) for text in recognised if text]
	                            print(f"DEBUG: Extracted {len(all_text)} text lines from rec_texts", file=sys.stderr)
	                except Exception as e:
	                    print(f"DEBUG: Error accessing OCRResult: {e}", file=sys.stderr)
	            else:
	                # Old format - list of lists; the rows may be nested one level.
	                rows = head if isinstance(head, list) else results
	                for row in rows:
	                    if not (isinstance(row, (list, tuple)) and len(row) >= 2):
	                        continue
	                    meta = row[1]
	                    if isinstance(meta, (list, tuple)) and len(meta) >= 1:
	                        all_text.append(str(meta[0]))
	    except Exception as e:
	        print(f"DEBUG: Error processing OCR results: {str(e)}", file=sys.stderr)
	        import traceback
	        print(f"DEBUG: Traceback: {traceback.format_exc()}", file=sys.stderr)

	    print(f"DEBUG: Extracted text lines: {all_text}", file=sys.stderr)

	    if all_text:
	        return extract_postal_details(all_text)

	    # Nothing recognised: same shape as a parsed result, every field empty.
	    blank = {'id_type': 'Postal ID'}
	    blank.update(dict.fromkeys(
	        ('prn', 'full_name', 'address', 'birth_date', 'nationality',
	         'issuing_post_office', 'valid_until'),
	        None,
	    ))
	    blank['success'] = False
	    return blank

	# Main Execution
	#
	# Usage: the first command-line argument is the image URL. Exactly one JSON
	# document is written to the real stdout; all diagnostics go to stderr.
	# Exit status is 1 when no URL is given or when processing raises.
	if len(sys.argv) < 2:
	    sys.stdout = original_stdout
	    print(json.dumps({"success": False, "error": "No image URL provided"}))
	    sys.exit(1)

	image_url = sys.argv[1]
	print(f"DEBUG: Processing Postal ID image URL: {image_url}", file=sys.stderr)

	# Local import: only this script section needs it.
	import tempfile

	temp_image_path = None
	try:
	    # Use a unique scratch file per run. A fixed name in the working
	    # directory would let two concurrent runs overwrite or delete each
	    # other's image.
	    temp_fd, temp_image_path = tempfile.mkstemp(prefix='postal_', suffix='.jpg')
	    os.close(temp_fd)

	    image_path = download_image(image_url, temp_image_path)
	    print(f"DEBUG: Image downloaded to: {image_path}", file=sys.stderr)

	    ocr_results = extract_ocr_lines(image_path)
	    print(f"DEBUG: OCR results: {ocr_results}", file=sys.stderr)

	    response = {
	        "success": ocr_results['success'],
	        "data": ocr_results
	    }

	    # Restore the real stdout only now, for the single JSON payload.
	    sys.stdout = original_stdout
	    sys.stdout.write(json.dumps(response))
	    sys.stdout.flush()

	except Exception as e:
	    sys.stdout = original_stdout
	    sys.stdout.write(json.dumps({"success": False, "error": str(e)}))
	    sys.stdout.flush()
	    sys.exit(1)
	finally:
	    # Best-effort clean-up; a failure here must not change the result
	    # already written, so it is only reported on stderr.
	    if temp_image_path is not None:
	        try:
	            os.remove(temp_image_path)
	        except FileNotFoundError:
	            pass
	        except OSError as cleanup_error:
	            print(f"DEBUG: Could not remove temp file: {cleanup_error}", file=sys.stderr)