Spaces:
Sleeping
Sleeping
File size: 19,247 Bytes
db10255 7908d00 db10255 7908d00 db10255 6916300 db10255 6916300 db10255 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 |
import sys, json, os, glob, requests
import re
import time
from contextlib import redirect_stdout, redirect_stderr
from datetime import datetime
# Point sys.stdout at stderr for the rest of the run so that anything printed
# with a plain print() (by this script or by imported libraries) lands on
# stderr. The real stdout is kept in original_stdout and restored by the main
# section just before the final JSON document is written.
original_stdout = sys.stdout
sys.stdout = sys.stderr
# Environment variables set before PaddleOCR is imported.
# NOTE(review): presumably PADDLEOCR_LOG_LEVEL quiets PaddleOCR logging and the
# QT_QPA_PLATFORM / DISPLAY values let GUI-capable dependencies load on a
# headless machine -- confirm against the installed PaddleOCR/OpenCV versions.
os.environ['PADDLEOCR_LOG_LEVEL'] = 'ERROR'
os.environ['QT_QPA_PLATFORM'] = 'offscreen'
os.environ['DISPLAY'] = ':99'
# Import PaddleOCR only after the environment variables above are in place.
from paddleocr import PaddleOCR
def download_image(url, output_path='temp_postal_image.jpg'):
    """Fetch *url* and store the response body at *output_path*.

    Any file already present at *output_path* is deleted first. A ``t=<epoch>``
    query parameter and no-cache request headers are added so intermediaries
    are asked not to serve a cached copy.

    Args:
        url: Address of the image to download.
        output_path: Destination file name for the downloaded bytes.

    Returns:
        ``output_path``, once the file has been written.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``raise_for_status``).
    """
    # Start from a clean slate so a stale image is never mistaken for a new one.
    if os.path.exists(output_path):
        os.remove(output_path)

    # Cache-busting query parameter; pick the separator based on whether the
    # URL already carries a query string.
    timestamp = int(time.time())
    url += f'&t={timestamp}' if '?' in url else f'?t={timestamp}'

    # Browser-like user agent plus headers asking for an uncached response.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Cache-Control': 'no-cache, no-store, must-revalidate',
        'Pragma': 'no-cache',
        'Expires': '0'
    }

    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()

    with open(output_path, 'wb') as image_file:
        image_file.write(response.content)
    return output_path
def format_date(date_str):
    """Normalise an OCR-read date string to ``YYYY-MM-DD``.

    The primary form handled is ``<day> <3-letter month> <year>`` with optional
    spaces (``"14 Aug 88"``, ``"14 Aug88"``, ``"OlDec17"``). Common OCR
    confusions are repaired first: capital ``O`` next to ``l``/``1``/``0``/``O``
    becomes a zero, and a lowercase ``l`` becomes ``1`` -- except inside the
    month names ``Jul``/``July``/``April``, which legitimately contain an "l".

    Two-digit years greater than 50 are read as 19xx, otherwise as 20xx.

    Args:
        date_str: Raw date text, or a falsy value.

    Returns:
        ``None`` for falsy input; the ISO date when the text can be parsed;
        otherwise the cleaned-up input string, so the caller still sees what
        was read.
    """
    if not date_str:
        return None
    date_str = date_str.strip()

    # Fix common OCR errors first
    date_str = date_str.replace('Ol', '01').replace('O1', '01').replace('O0', '00').replace('OO', '00')
    # lowercase L -> 1, but leave the "l" of Jul / July / April alone; a blanket
    # replace would turn "Jul" into "Ju1" and make the date unparseable.
    date_str = re.sub(
        r'((?i:july?|april))|l',
        lambda m: m.group(1) or '1',
        date_str,
    )

    # Handle format like "14 Aug 88" or "14 Aug88" -> "1988-08-14"
    # Allow for missing space between month and year
    match = re.match(r'(\d{1,2})\s*([A-Za-z]{3})\s*(\d{2,4})', date_str)
    if match:
        day, month_str, year = match.groups()
        try:
            # Convert 2-digit year to 4-digit (assume 1900s for years > 50, 2000s for <= 50)
            if len(year) == 2:
                year = f"19{year}" if int(year) > 50 else f"20{year}"
            month = datetime.strptime(month_str, '%b').month
            # Constructing the date rejects impossible days such as "32 Aug".
            datetime(int(year), month, int(day))
            return f"{year}-{month:02d}-{int(day):02d}"
        except ValueError as e:
            # Not a usable day/month/year triple; fall through to the other formats.
            print(f"DEBUG: Date parsing error: {e}", file=sys.stderr)

    # Try other common formats
    for fmt in ["%d %b %Y", "%d %B %Y", "%Y-%m-%d", "%m/%d/%Y", "%d/%m/%Y", "%d%b%Y", "%d%B%Y"]:
        try:
            return datetime.strptime(date_str, fmt).strftime("%Y-%m-%d")
        except ValueError:
            continue
    return date_str
def format_name(name):
    """Return *name* with runs of whitespace collapsed and every word capitalised.

    Each whitespace-separated word is passed through ``str.capitalize`` (first
    character upper-cased, the rest lower-cased). Falsy input yields ``None``.
    """
    if not name:
        return None
    # split() with no argument already discards leading, trailing and repeated
    # whitespace, so one pass both normalises spacing and yields the words.
    capitalised_words = [word.capitalize() for word in name.split()]
    return ' '.join(capitalised_words).strip()
def format_address(address_lines):
    """Join OCR address fragments into one line and repair glued-together words.

    Args:
        address_lines: Sequence of text fragments, in reading order.

    Returns:
        ``None`` when *address_lines* is empty or falsy; otherwise a single
        string with blank fragments dropped, missing spaces restored and
        whitespace collapsed.
    """
    if not address_lines:
        return None

    # Join address lines and clean up
    address = ' '.join(line.strip() for line in address_lines if line.strip())

    # Fix missing spaces: "585Gen." -> "585 Gen."
    address = re.sub(r'(\d+)([A-Z])', r'\1 \2', address)
    # Fix missing spaces after abbreviations: "Brgy.Rivera" -> "Brgy. Rivera".
    # At least two letters are required before the period so dotted initials
    # such as "U.S.A" are left untouched.
    address = re.sub(r'([A-Za-z]{2,})\.(?=[A-Za-z])', r'\1. ', address)
    # Fix missing spaces at a lower-to-upper case boundary: "TuazonBlvd" -> "Tuazon Blvd"
    address = re.sub(r'([a-z])([A-Z])', r'\1 \2', address)

    # Remove extra spaces
    return ' '.join(address.split())
def extract_postal_details(lines):
    """Pull Postal ID fields out of a list of OCR text lines.

    Every non-blank line is examined once, in order. For each field that is
    still empty the line (and, where needed, its neighbours) is tested against
    label text and patterns; the first hit for a field wins.

    Args:
        lines: OCR output, one item per recognised text line. Items are
            converted with ``str()`` and stripped; blank ones are discarded.

    Returns:
        dict with the keys ``id_type``, ``prn``, ``full_name``, ``address``,
        ``birth_date``, ``nationality``, ``issuing_post_office``,
        ``valid_until`` and ``success``. Fields that were not found stay
        ``None``. ``success`` is True when a PRN or a full name was found.
        Name and dates are post-processed with ``format_name`` / ``format_date``.
    """
    details = {
        'id_type': 'Postal ID',
        'prn': None,
        'full_name': None,
        'address': None,
        'birth_date': None,
        'nationality': None,
        'issuing_post_office': None,
        'valid_until': None,
        'success': False
    }
    # Label spellings (including observed OCR misreads) that introduce the expiry date.
    valid_until_labels = ("VALID UNTIL", "VALIDUNTIL", "VALD URT", "VALDURT")

    # Clean lines - convert to strings and strip
    cleaned_lines = [str(line).strip() for line in lines if str(line).strip()]

    for i, line in enumerate(cleaned_lines):
        line_upper = line.upper().strip()
        line_stripped = line.strip()

        # Extract PRN (Postal Registration Number)
        # Format: "PRN 100141234567 P POSTAL" or "PRN100141234567P" or "PAN100141234567P" (OCR might misread PRN as PAN)
        if not details['prn']:
            # PRN is tried first; PAN is the fallback for the common misread.
            prn_match = (re.search(r'PRN\s*(\d{10,15})', line_upper)
                         or re.search(r'PAN\s*(\d{10,15})', line_upper))
            if prn_match:
                details['prn'] = prn_match.group(1)

        # Extract Full Name - combine separate name parts
        # Look for label "First Name Middle Name Surname, Suffix" or name parts
        # NOTE(review): the stop lists below use substring tests with short and
        # sample-specific tokens ('GEN', 'ID', '585', 'TUAZON', 'PASAY'); names
        # that merely contain them (e.g. "EUGENE", "DAVID") look like they would
        # end or block name collection -- confirm against real cards.
        if not details['full_name']:
            # Check if this line is the label
            if ("FIRST NAME" in line_upper or "FINT NAME" in line_upper) and ("SURNAME" in line_upper or "SUMAME" in line_upper):
                # Collect name parts from next few lines
                name_parts = []
                for j in range(1, min(5, len(cleaned_lines) - i)):
                    next_line = cleaned_lines[i+j].strip()
                    next_upper = next_line.upper()
                    # Stop if we hit address or other labels
                    if any(label in next_upper for label in ['ADDRESS', 'DATE', 'BIRTH', 'NATIONALITY', 'ISSUING', 'VALID', 'GEN', 'TUAZON', 'BLVD', 'BRGY', '585', 'PASAY']):
                        break
                    # Add if it looks like a name part (all caps, letters and spaces only, not too short)
                    if next_line and re.match(r'^[A-Z\s,]+$', next_line) and len(next_line) > 1:
                        # Skip if it's clearly not a name (like "ID", "C", etc.)
                        if next_line not in ['ID', 'C', 'P', 'POSTAL']:
                            name_parts.append(next_line)
                if name_parts:
                    details['full_name'] = ' '.join(name_parts)
            # Also check if line is a name part (all caps, not a label)
            elif re.match(r'^[A-Z\s,]+$', line_stripped) and len(line_stripped) > 2:
                # Make sure it's not a label or common words
                if not any(label in line_upper for label in ['FIRST NAME', 'MIDDLE NAME', 'SURNAME', 'ADDRESS', 'DATE', 'BIRTH', 'NATIONALITY', 'ISSUING', 'VALID', 'POSTAL', 'IDENTITY', 'CARD', 'PHCPOST', 'PHILIPPINE', 'PREMIUM']):
                    # Check if previous line is the name label
                    if i > 0:
                        prev_line = cleaned_lines[i-1].strip().upper()
                        if "FIRST NAME" in prev_line or "FINT NAME" in prev_line or "SUMAME" in prev_line or "SURNAME" in prev_line:
                            # Collect consecutive name parts
                            name_parts = [line_stripped]
                            for j in range(1, min(4, len(cleaned_lines) - i)):
                                next_line = cleaned_lines[i+j].strip()
                                if (next_line and re.match(r'^[A-Z\s,]+$', next_line) and
                                        len(next_line) > 2 and
                                        not any(label in next_line.upper() for label in ['ADDRESS', 'DATE', 'BIRTH', 'GEN', 'TUAZON', 'BLVD', 'BRGY', '585', 'PASAY', 'ID', 'POSTAL', 'PREMIUM'])):
                                    name_parts.append(next_line)
                                else:
                                    break
                            # Accept several parts, or a single line that already holds two or more words.
                            if len(name_parts) >= 2:
                                details['full_name'] = ' '.join(name_parts)
                            elif len(name_parts) == 1 and len(name_parts[0].split()) >= 2:
                                details['full_name'] = name_parts[0]

        # Extract Address - look for address parts (street numbers, Gen., Blvd., Brgy., City)
        # NOTE(review): 'PAN' in the stop list below is a substring test, so a
        # place name containing it (e.g. "PAMPANGA") would end collection early
        # -- verify whether that is intended.
        if not details['address']:
            # Look for address indicators
            if any(indicator in line_upper for indicator in ['GEN', 'TUAZON', 'BLVD', 'BRGY', 'PASAY', 'CITY']) or (re.match(r'^\d+', line_stripped) and len(line_stripped) > 2):
                address_lines = []
                # Check backwards a bit to see if we missed address start
                start_idx = max(0, i - 1)
                # Collect address lines forward
                for j in range(0, min(7, len(cleaned_lines) - start_idx)):
                    idx = start_idx + j
                    if idx >= len(cleaned_lines):
                        break
                    addr_line = cleaned_lines[idx].strip()
                    addr_upper = addr_line.upper()
                    # Stop if we hit date, nationality, or other labels
                    if any(label in addr_upper for label in ['DATE', 'BIRTH', 'NATIONALITY', 'FILIPINO', 'ISSUING', 'VALID', 'PAN', 'NOCON']):
                        break
                    # Skip very short non-numeric lines that are likely OCR noise
                    if len(addr_line) <= 2 and not re.match(r'^\d+$', addr_line):
                        continue
                    # Add if it looks like address content
                    if addr_line and len(addr_line) > 1:
                        # Check if it's a number, street name, barangay, city, etc.
                        if (re.match(r'^\d+', addr_line) or
                                any(indicator in addr_upper for indicator in ['GEN', 'TUAZON', 'BLVD', 'BRGY', 'PASAY', 'CITY', 'STREET', 'AVE', 'BOULEVARD']) or
                                len(address_lines) > 0):  # Continue if we've started collecting
                            # Skip obvious OCR errors like "o00"
                            if addr_line.lower() not in ['o00', 'o0', '00']:
                                address_lines.append(addr_line)
                if address_lines:
                    details['address'] = format_address(address_lines)

        # Extract Date of Birth - handle OCR errors
        if not details['birth_date']:
            # Look for date patterns: "14 Aug88" or "14 Aug 88"
            date_match = re.search(r'(\d{1,2})\s*([A-Za-z]{3})\s*(\d{2,4})', line_stripped)
            if date_match:
                # The expiry date normally sits on the line *after* its label, so
                # checking only the current line for "VALID"/"UNTIL" would let the
                # expiry date be taken as the birth date. Also reject a line that
                # directly follows a valid-until label.
                prev_upper = cleaned_lines[i - 1].upper() if i > 0 else ''
                follows_expiry_label = any(label in prev_upper for label in valid_until_labels)
                if "VALID" not in line_upper and "UNTIL" not in line_upper and not follows_expiry_label:
                    # Fix spacing
                    day, month, year = date_match.groups()
                    details['birth_date'] = f"{day} {month} {year}"

        # Extract Nationality
        if not details['nationality']:
            if "NATIONALITY" in line_upper or line_upper == "FILIPINO":
                if line_upper == "FILIPINO":
                    details['nationality'] = "Filipino"
                elif i + 1 < len(cleaned_lines):
                    # Label line: the value is expected on the following line.
                    next_line = cleaned_lines[i+1].strip()
                    if next_line and len(next_line) < 20:
                        details['nationality'] = next_line

        # Extract Issuing Post Office - handle OCR errors like "IssungPostOmce"
        if not details['issuing_post_office']:
            if ("ISSUING POST OFFICE" in line_upper or "ISSUING POST" in line_upper or
                    "ISSUINGPOST" in line_upper or "ISSUINGPOSTOMCE" in line_upper):
                if i + 1 < len(cleaned_lines):
                    next_line = cleaned_lines[i+1].strip()
                    if next_line and len(next_line) < 20:
                        # Fix OCR errors: MNL.QE -> MNL-QE
                        next_line = next_line.replace('.', '-')
                        details['issuing_post_office'] = next_line

        # Extract Valid Until - handle OCR errors like "Vald Urt" and "OlDec17"
        if not details['valid_until']:
            if any(label in line_upper for label in valid_until_labels):
                if i + 1 < len(cleaned_lines):
                    next_line = cleaned_lines[i+1].strip()
                    # Fix OCR errors: OlDec17 -> 01 Dec 17
                    # Replace common OCR errors
                    next_line = next_line.replace('Ol', '01').replace('O1', '01')
                    next_line = next_line.replace('O0', '00').replace('OO', '00')
                    # Try to extract date pattern
                    date_match = re.search(r'(\d{1,2})\s*([A-Za-z]{3})\s*(\d{2,4})', next_line)
                    if date_match:
                        day, month, year = date_match.groups()
                        details['valid_until'] = f"{day} {month} {year}"
                    elif next_line:
                        # No recognisable date: keep the raw text so it is not lost.
                        details['valid_until'] = next_line

    # Format extracted fields
    if details['full_name']:
        details['full_name'] = format_name(details['full_name'])
    if details['birth_date']:
        details['birth_date'] = format_date(details['birth_date'])
    if details['valid_until']:
        details['valid_until'] = format_date(details['valid_until'])

    if details['prn'] or details['full_name']:
        details['success'] = True
    return details
def extract_ocr_lines(image_path):
    """Run PaddleOCR over *image_path* and parse the recognised text as a Postal ID.

    Args:
        image_path: Path of the image file to read.

    Returns:
        ``{'success': False, 'error': 'File not found'}`` when the file does
        not exist. Otherwise the dict produced by ``extract_postal_details``
        for the recognised lines, or the same dict shape with every field
        ``None`` and ``success`` False when no text could be extracted.
    """
    if not os.path.exists(image_path):
        return {'success': False, 'error': 'File not found'}

    file_size = os.path.getsize(image_path)
    print(f"DEBUG: Image file size: {file_size} bytes", file=sys.stderr)

    # Anything the OCR engine prints while loading or running goes to stderr.
    with redirect_stdout(sys.stderr), redirect_stderr(sys.stderr):
        ocr = PaddleOCR(
            use_doc_orientation_classify=False,
            use_doc_unwarping=False,
            use_textline_orientation=False,
            lang='en'
        )
        try:
            results = ocr.ocr(image_path)
        except Exception as e:
            # Fall back to predict() when the engine object offers it.
            print(f"DEBUG: ocr() failed: {e}, trying predict()", file=sys.stderr)
            results = ocr.predict(image_path) if hasattr(ocr, 'predict') else None

    print(f"DEBUG: Raw OCR results type: {type(results)}", file=sys.stderr)

    all_text = []
    try:
        # Two result shapes are handled: a list whose first item is an
        # OCRResult-like mapping, and a (possibly nested) list of
        # [box, (text, score)] entries.
        if results and isinstance(results, list):
            first_item = results[0]
            item_type_name = type(first_item).__name__
            looks_like_ocr_result = (
                'OCRResult' in item_type_name
                or 'ocr_result' in str(type(first_item)).lower()
            )
            if looks_like_ocr_result:
                print(f"DEBUG: Detected OCRResult object format (type: {item_type_name})", file=sys.stderr)
                try:
                    if hasattr(first_item, 'keys'):
                        # Mapping-style access: recognised strings live under 'rec_texts'.
                        rec_texts = dict(first_item).get('rec_texts')
                        if isinstance(rec_texts, list):
                            all_text = [str(t) for t in rec_texts if t]
                            print(f"DEBUG: Extracted {len(all_text)} text lines from rec_texts", file=sys.stderr)
                except Exception as e:
                    print(f"DEBUG: Error accessing OCRResult: {e}", file=sys.stderr)
            else:
                # List-of-lists shape: unwrap one level when the first item is itself a list.
                entries = results[0] if results and isinstance(results[0], list) else results
                for entry in entries:
                    if not (isinstance(entry, (list, tuple)) and len(entry) >= 2):
                        continue
                    text_info = entry[1]
                    if isinstance(text_info, (list, tuple)) and len(text_info) >= 1:
                        all_text.append(str(text_info[0]))
    except Exception as e:
        print(f"DEBUG: Error processing OCR results: {str(e)}", file=sys.stderr)
        import traceback
        print(f"DEBUG: Traceback: {traceback.format_exc()}", file=sys.stderr)

    print(f"DEBUG: Extracted text lines: {all_text}", file=sys.stderr)

    if all_text:
        return extract_postal_details(all_text)
    # Nothing recognised: same keys as a parsed result, all empty.
    return {
        'id_type': 'Postal ID',
        'prn': None,
        'full_name': None,
        'address': None,
        'birth_date': None,
        'nationality': None,
        'issuing_post_office': None,
        'valid_until': None,
        'success': False
    }
# Main Execution
# Usage: the first command-line argument is the image URL. Exactly one JSON
# document is written to the real stdout; all diagnostics go to stderr.
# Exit status is 1 when no URL was given or when processing raised.
if len(sys.argv) < 2:
    sys.stdout = original_stdout
    print(json.dumps({"success": False, "error": "No image URL provided"}))
    sys.exit(1)

image_url = sys.argv[1]
print(f"DEBUG: Processing Postal ID image URL: {image_url}", file=sys.stderr)

try:
    image_path = download_image(image_url, 'temp_postal_image.jpg')
    print(f"DEBUG: Image downloaded to: {image_path}", file=sys.stderr)
    ocr_results = extract_ocr_lines(image_path)
    print(f"DEBUG: OCR results: {ocr_results}", file=sys.stderr)
    response = {
        "success": ocr_results['success'],
        "data": ocr_results
    }
    # Restore the real stdout only for the final JSON payload.
    sys.stdout = original_stdout
    sys.stdout.write(json.dumps(response))
    sys.stdout.flush()
except Exception as e:
    # Top-level boundary: report any failure as a JSON error document.
    sys.stdout = original_stdout
    sys.stdout.write(json.dumps({"success": False, "error": str(e)}))
    sys.stdout.flush()
    sys.exit(1)
finally:
    # Temp-file cleanup lives only here so that a failed delete can never turn
    # a successful OCR run into an error response; it is best-effort.
    try:
        if os.path.exists('temp_postal_image.jpg'):
            os.remove('temp_postal_image.jpg')
    except OSError as cleanup_error:
        print(f"DEBUG: Could not remove temp image: {cleanup_error}", file=sys.stderr)
|