Spaces:

takomattyy
/

handyhome-ocr-api

Sleeping

App Files Files Community

handyhome-ocr-api / extract_police_ocr.py

takomattyy

Upload 2 files

2b089f9 verified 18 days ago

raw

history blame contribute delete

48.4 kB

	import sys, json, os, glob, requests
	import re
	import time
	from contextlib import redirect_stdout, redirect_stderr

	# Keep a handle on the real stdout so the final JSON payload can be written
	# to it later; every other print() in this script is routed to stderr.
	original_stdout, sys.stdout = sys.stdout, sys.stderr

	# Environment for the OCR stack. These are set before paddleocr is imported
	# (see the import that follows this block).
	os.environ.update({
	    'PADDLEOCR_LOG_LEVEL': 'ERROR',
	    'QT_QPA_PLATFORM': 'offscreen',
	    'DISPLAY': ':99',
	})

	# Import PaddleOCR after setting environment variables
	from paddleocr import PaddleOCR

	def download_image(url, output_path='temp_police_image.jpg'):
	    """Fetch *url* and store the response body at *output_path*.

	    Any file already present at *output_path* is deleted first. A ``t=<unix
	    time>`` query parameter and no-cache request headers are added to the
	    request. Raises whatever ``requests`` raises for connection problems,
	    timeouts (30 s) or non-2xx responses.

	    Returns:
	        The *output_path* that was written.
	    """
	    # Never leave a stale image from a previous run in place.
	    if os.path.exists(output_path):
	        os.remove(output_path)

	    # Cache-busting query parameter.
	    joiner = '&' if '?' in url else '?'
	    url = f'{url}{joiner}t={int(time.time())}'

	    no_cache_headers = {
	        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
	        'Cache-Control': 'no-cache, no-store, must-revalidate',
	        'Pragma': 'no-cache',
	        'Expires': '0'
	    }

	    reply = requests.get(url, headers=no_cache_headers, timeout=30)
	    reply.raise_for_status()
	    payload = reply.content

	    with open(output_path, 'wb') as image_file:
	        image_file.write(payload)

	    return output_path

	def format_name(name):
	    """Normalise whitespace and comma spacing in an OCR'd personal name.

	    Runs of whitespace collapse to one space and a comma followed by a
	    capital letter gets exactly one space after it. Words that were glued
	    together are separated only where the letter case reveals the boundary
	    (e.g. "JuanCarlo" -> "Juan Carlo", "JAVAAlbert" -> "JAVA Albert"); a run
	    of capitals such as "JAVAALBERT" carries no such hint and is returned
	    unchanged.

	    Returns None for an empty or missing name.
	    """
	    if not name:
	        return None

	    cleaned = re.sub(r',\s*([A-Z])', r', \1', ' '.join(name.split()))

	    if ',' not in cleaned:
	        # Single segment: split lower->Upper boundaries, then a run of
	        # capitals that is directly followed by a capitalised word.
	        for boundary in (r'([a-z])([A-Z])',
	                         r'([A-Z]{2,})([A-Z][a-z])',
	                         r'([A-Z]+)([A-Z][a-z]+)'):
	            cleaned = re.sub(boundary, r'\1 \2', cleaned)
	    else:
	        # "SURNAME, GIVEN NAMES": treat every comma-separated segment alone.
	        segments = []
	        for segment in cleaned.split(','):
	            segment = re.sub(r'([A-Z]+)([A-Z][a-z])', r'\1 \2', segment.strip())
	            segment = re.sub(r'([A-Z][a-z]+)([A-Z][a-z]+)', r'\1 \2', segment)
	            segments.append(segment)
	        cleaned = ', '.join(segments)

	    return ' '.join(cleaned.split())

	def format_address(address):
	    """Tidy the spacing of an OCR'd address string.

	    Applied in order: collapse whitespace; put a space between the letters
	    and the digits of a "#"-prefixed token ("#BLK11" -> "#BLK 11"); put a
	    space between a capital letter and a directly following capitalised
	    word ("CABANATUANCity" -> "CABANATUAN City"); put one space after a
	    comma that precedes a capital letter. All-caps text without separators
	    is left as it is.

	    Returns None for an empty or missing address.
	    """
	    if not address:
	        return None

	    spacing_rules = (
	        (r'#([A-Z]+)(\d+)', r'#\1 \2'),
	        (r'([A-Z])([A-Z][a-z]+)', r'\1 \2'),
	        (r',\s*([A-Z])', r', \1'),
	    )

	    tidy = ' '.join(address.split())
	    for pattern, replacement in spacing_rules:
	        tidy = re.sub(pattern, replacement, tidy)

	    return ' '.join(tidy.split())

	def format_birth_place(place):
	    """Tidy the spacing of an OCR'd place-of-birth string.

	    Whitespace is collapsed, a comma followed by a capital letter gets
	    exactly one space after it ("DILASAG,AURORA" -> "DILASAG, AURORA"), and
	    a capital letter glued to a following capitalised word is separated
	    from it ("QUEZONCity" -> "QUEZON City").

	    Returns None for an empty or missing value.
	    """
	    if not place:
	        return None

	    collapsed = ' '.join(place.split())
	    with_commas = re.sub(r',\s*([A-Z])', r', \1', collapsed)
	    separated = re.sub(r'([A-Z])([A-Z][a-z]+)', r'\1 \2', with_commas)

	    return ' '.join(separated.split())

	def format_birth_date(date):
	    """Repair common OCR mistakes in a "Month DD, YYYY" birth date.

	    Three repairs are applied, in order:

	    1. Misread or truncated month names ("Juy", "Marc", "Jun", "Augu", ...)
	       are replaced by the full month name. Only a standalone misread is
	       touched: the token must not be preceded by a letter nor followed by
	       a lowercase letter, so correct names such as "March", "June" or
	       "August" are never altered.
	    2. A day that was read as "19DD" ("July 1905, 1991") is restored to
	       "DD" when DD is at most 31.
	    3. Spacing around the comma is normalised to "DD, YYYY".

	    Args:
	        date: Raw date text, or None.

	    Returns:
	        The cleaned date string, or None when *date* is empty or None.
	    """
	    if not date:
	        return None

	    month_misreads = {
	        'Juy': 'July',
	        'Januay': 'January',
	        'Februay': 'February',
	        'Marc': 'March',
	        'Apil': 'April',
	        'Jun': 'June',
	        'Augu': 'August',
	        'Septemb': 'September',
	        'Octob': 'October',
	        'Novemb': 'November',
	        'Decemb': 'December',
	    }
	    # The look-arounds keep the replacement away from text that is already
	    # correct: a plain substring replace would turn "March" into "Marchh".
	    misread_re = re.compile(
	        r'(?<![A-Za-z])(' + '|'.join(month_misreads) + r')(?![a-z])'
	    )
	    date = misread_re.sub(lambda m: month_misreads[m.group(1)], date)

	    def _restore_day(match):
	        # "Month 19DD, YYYY": 19DD cannot be a year here, so it is a day
	        # that picked up a spurious "19" prefix.
	        month, day, year = match.groups()
	        if int(day) <= 31:
	            return f'{month} {day}, {year}'
	        return match.group(0)

	    date = re.sub(r'(\w+)\s+19(\d{2}),\s*(\d{4})', _restore_day, date)

	    # Canonical comma spacing: "July 05 ,1991" -> "July 05, 1991".
	    date = re.sub(r'(\w+)\s+(\d{1,2})\s*,\s*(\d{4})', r'\1 \2, \3', date)

	    return ' '.join(date.split())

	# Words that mark a line as certificate boilerplate rather than a name.
	_NAME_NOISE_WORDS = (
	    'THUMBMARK', 'APPEARING', 'HEREIN', 'HASUNDERGONE', 'RECORD',
	    'VERIFICATION', 'THROUGH', 'CRIME',
	)
	# Lines following a NAME label that contain any of these are skipped.
	# NOTE(review): 'ID' is matched as a substring, so an un-prefixed all-caps
	# value such as "DAVID CRUZ" is skipped too; a ':'-prefixed value is still
	# picked up by the colon-pattern check when its own line is visited.
	_NAME_SKIP_WORDS = (
	    'ADDRESS', 'BIRTH', 'CITIZEN', 'GENDER', 'ID', 'THUMBMARK', 'APPEARING',
	    'HEREIN', 'HASUNDERGONE', 'RECORD', 'VERIFICATION', 'THROUGH', 'CRIME',
	    'DATABASES', 'RESULT', 'CERTIFY', 'PERSON', 'WHOSE', 'PHOTO', 'SIGNATURE',
	)
	_GENDER_SKIP_LABELS = (
	    'NAME', 'ADDRESS', 'BIRTH', 'CITIZEN', 'DATE', 'PLACE', 'PICTURE',
	    'SIGNATURE', 'THUMBMARK',
	)
	_GENDER_VALUES = ('MALE', 'FEMALE', 'M', 'F')
	# "Month DD, YYYY" anywhere in a line; month names in any letter case.
	_MONTH_DATE_RE = re.compile(
	    r'\b(?:January|February|March|April|May|June|July|August|September|'
	    r'October|November|December|Juy|Jan|Feb|Mar|Apr|Jun|Jul|Aug|Sep|Oct|'
	    r'Nov|Dec)\s+\d{1,2}[,\s]+\d{4}',
	    re.IGNORECASE,
	)


	def _contains_any(text, words):
	    """Return True when any of *words* occurs as a substring of *text*."""
	    return any(word in text for word in words)


	def _after_colon(text):
	    """Return the stripped text after the first ':' ('' when there is none)."""
	    return text.partition(':')[2].strip()


	def _neighbour_text(lines, index):
	    """Return lines[index] stripped, or '' when that entry is not a string."""
	    value = lines[index]
	    return value.strip() if isinstance(value, str) else ''


	def _gender_label(value):
	    """Format an upper-cased gender token: 'MALE' -> 'Male', 'M' -> 'M'."""
	    return value.capitalize() if len(value) > 1 else value


	def _clean_labelled_date(raw, expand_short_year=False):
	    """Apply the OCR fixes used for a date found next to a BIRTH DATE label."""
	    cleaned = raw.replace('Juy', 'July')
	    cleaned = re.sub(r'\b1001\b', '1991', cleaned)
	    if expand_short_year and not re.search(r'\d{4}', cleaned):
	        # Expand only a trailing two-digit year ("July 05, 45" ->
	        # "July 05, 1945"). Rewriting every two-digit token would also
	        # turn the day-of-month into "19DD".
	        cleaned = re.sub(
	            r'(\d\D+)(\d{2})$',
	            lambda m: m.group(1) + ('19' + m.group(2) if int(m.group(2)) < 50 else m.group(2)),
	            cleaned,
	        )
	    return cleaned


	def extract_police_details(lines):
	    """Pull the personal details out of the OCR text of a police clearance.

	    Each field is located by its label ("NAME", "ADDRESS", "BIRTH DATE",
	    "BIRTH PLACE", "CITIZENSHIP", "GENDER", "ID NO"); the value is taken
	    from the text after a colon on the label line or from the line(s) that
	    follow. Once a field has a value, later labels for it are ignored,
	    except that a later non-empty "ID NO" value replaces an earlier one.
	    A "Month DD, YYYY" pattern found elsewhere is used as the birth date
	    only when no labelled birth date was found.

	    Args:
	        lines: Sequence of OCR text lines; entries that are not strings are
	            ignored.

	    Returns:
	        dict with keys 'clearance_type', 'id_number', 'full_name',
	        'address', 'birth_date', 'birth_place', 'citizenship', 'gender',
	        'status' and 'success'. 'success' is True when a name or an ID
	        number was found. Name, address, birth place and birth date are
	        passed through the corresponding format_* function.
	    """
	    details = {
	        'clearance_type': 'police',
	        'id_number': None,
	        'full_name': None,
	        'address': None,
	        'birth_date': None,
	        'birth_place': None,
	        'citizenship': None,
	        'gender': None,
	        'status': None,
	        'success': False
	    }
	    total = len(lines)
	    fallback_date = None  # first unlabelled date seen; used only as a last resort

	    for i, line in enumerate(lines):
	        if not isinstance(line, str):
	            continue

	        line_stripped = line.strip()
	        line_upper = line_stripped.upper()

	        # ---- Name: value after the colon, or on one of the next 4 lines ----
	        if "NAME" in line_upper and not details['full_name']:
	            same_line = _after_colon(line)
	            if (len(same_line) > 2 and
	                    not _contains_any(same_line.upper(), _NAME_NOISE_WORDS + ('DATABASES', 'RESULT'))):
	                details['full_name'] = same_line
	                print(f"DEBUG: Found full name (same line): {details['full_name']}", file=sys.stderr)
	                continue

	            for offset in range(1, min(5, total - i)):
	                candidate = _neighbour_text(lines, i + offset)
	                if _contains_any(candidate.upper(), _NAME_SKIP_WORDS):
	                    continue

	                # Most reliable layout: the value line starts with the colon.
	                if candidate.startswith(':') and len(candidate) > 1:
	                    value = candidate[1:].strip()
	                    if len(value) > 3 and re.search(r'[A-Za-z]{2,}', value):
	                        details['full_name'] = value
	                        print(f"DEBUG: Found full name (colon line): {details['full_name']}", file=sys.stderr)
	                        break
	                # Otherwise accept a short all-caps line of at least two words.
	                elif (re.match(r'^[A-Z\s,]+$', candidate) and
	                      len(candidate.split()) >= 2 and
	                      5 < len(candidate) < 50):
	                    details['full_name'] = candidate
	                    print(f"DEBUG: Found full name (all caps line): {details['full_name']}", file=sys.stderr)
	                    break

	        # ---- Name: a ':'-prefixed line not preceded by another field label ----
	        if not details['full_name'] and line_stripped.startswith(':') and len(line_stripped) > 5:
	            candidate = line_stripped[1:].strip()
	            if (re.search(r'[A-Za-z]{2,}', candidate) and
	                    3 < len(candidate) < 50 and
	                    not _contains_any(candidate.upper(), _NAME_NOISE_WORDS + ('ADDRESS', 'BIRTH'))):
	                if i > 0:
	                    previous = _neighbour_text(lines, i - 1).upper()
	                    if not _contains_any(previous, ('ADDRESS', 'BIRTH', 'CITIZEN')):
	                        details['full_name'] = candidate
	                        print(f"DEBUG: Found full name (colon pattern): {details['full_name']}", file=sys.stderr)

	        # ---- Address: same line, or up to 3 following lines joined ----
	        if "ADDRESS" in line_upper and not details['address']:
	            if ":" in line:
	                # Split on the first colon only so an address containing a
	                # colon is not truncated.
	                value = _after_colon(line)
	                if value:
	                    details['address'] = value
	            else:
	                collected = []
	                for offset in range(1, min(4, total - i)):
	                    following = _neighbour_text(lines, i + offset)
	                    following_upper = following.upper()
	                    if following.startswith(':') and len(following) > 1:
	                        collected.append(following[1:].strip())
	                    elif "BIRTH" not in following_upper and "CITIZEN" not in following_upper:
	                        if ":" in following:
	                            collected.append(_after_colon(following))
	                        elif len(following) > 2:
	                            collected.append(following)
	                    else:
	                        # Reached the next field label.
	                        break
	                if collected:
	                    details['address'] = ' '.join(collected).strip()

	        # ---- Birth date (labelled) ----
	        if ("BIRTH DATE" in line_upper or "BIRTHDATE" in line_upper) and not details['birth_date']:
	            if ":" in line:
	                value = _clean_labelled_date(_after_colon(line), expand_short_year=True)
	                if value:
	                    details['birth_date'] = value
	            elif i + 1 < total:
	                following = _neighbour_text(lines, i + 1)
	                if ":" in following:
	                    value = _clean_labelled_date(_after_colon(following))
	                    if value:
	                        details['birth_date'] = value

	        # ---- Birth date (unlabelled candidate, applied after the loop) ----
	        # Lines that look like issue/validity dates are not considered.
	        if (fallback_date is None and not details['birth_date'] and
	                not _contains_any(line_upper, ('ISSUE', 'VALID', 'EXPIR'))):
	            date_match = _MONTH_DATE_RE.search(line_stripped)
	            if date_match:
	                fallback_date = date_match.group()

	        # ---- Birth place ----
	        if ("BIRTH PLACE" in line_upper or "BIRTHPLACE" in line_upper) and not details['birth_place']:
	            if ":" in line:
	                details['birth_place'] = _after_colon(line)
	            elif i + 1 < total:
	                following = _neighbour_text(lines, i + 1)
	                if following.startswith(':') and len(following) > 1:
	                    details['birth_place'] = following[1:].strip()
	                elif ":" in following and "CITIZEN" not in following.upper():
	                    details['birth_place'] = _after_colon(following)

	        # ---- Citizenship ----
	        if "CITIZENSHIP" in line_upper and not details['citizenship']:
	            if ":" in line:
	                details['citizenship'] = _after_colon(line)
	            elif i + 1 < total:
	                following = _neighbour_text(lines, i + 1)
	                if following.startswith(':') and len(following) > 1:
	                    details['citizenship'] = following[1:].strip()
	                elif ":" in following:
	                    details['citizenship'] = _after_colon(following)

	        # ---- Gender: same line, or one of the next 3 lines ----
	        if "GENDER" in line_upper and not details['gender']:
	            same_line = _after_colon(line).upper()
	            if same_line in _GENDER_VALUES:
	                details['gender'] = _gender_label(same_line)
	                print(f"DEBUG: Found gender (same line): {details['gender']}", file=sys.stderr)
	                continue

	            for offset in range(1, min(4, total - i)):
	                following = _neighbour_text(lines, i + offset)
	                following_upper = following.upper()

	                if _contains_any(following_upper, _GENDER_SKIP_LABELS):
	                    continue

	                if following.startswith(':') and len(following) > 1:
	                    value = following[1:].strip().upper()
	                    if value in _GENDER_VALUES:
	                        details['gender'] = _gender_label(value)
	                        print(f"DEBUG: Found gender (colon line): {details['gender']}", file=sys.stderr)
	                        break
	                elif following_upper in _GENDER_VALUES:
	                    # Use the upper-cased token so "f" and "F" give the same result.
	                    details['gender'] = _gender_label(following_upper)
	                    print(f"DEBUG: Found gender (direct): {details['gender']}", file=sys.stderr)
	                    break
	                elif ":" in following:
	                    value = _after_colon(following).upper()
	                    if value in _GENDER_VALUES:
	                        details['gender'] = _gender_label(value)
	                        print(f"DEBUG: Found gender (colon in line): {details['gender']}", file=sys.stderr)
	                        break

	        # ---- ID number: labelled, else a 4-5 letter + 10-15 digit token ----
	        if "ID NO" in line_upper:
	            parts = line.split(':')
	            if len(parts) > 1:
	                value = parts[1].strip()
	                # An empty value must not wipe out an ID that was already found.
	                if value or not details['id_number']:
	                    details['id_number'] = value

	        if not details['id_number']:
	            id_match = re.search(r'\b[A-Z]{4,5}\d{10,15}\b', line_upper)
	            if id_match:
	                details['id_number'] = id_match.group()

	        # ---- Status ----
	        if "NO RECORD ON FILE" in line_upper:
	            details['status'] = "NO RECORD ON FILE"
	        elif "HAS A RECORD" in line_upper or "WITH RECORD" in line_upper:
	            details['status'] = "HAS RECORD"

	    # A labelled birth date always wins; the unlabelled one is a last resort.
	    if not details['birth_date'] and fallback_date:
	        repaired = re.sub(r'(?i)\bjuy\b', 'July', fallback_date)
	        details['birth_date'] = re.sub(r'\b1001\b', '1991', repaired)

	    if details['full_name'] or details['id_number']:
	        details['success'] = True

	    # Format the extracted fields
	    if details['full_name']:
	        details['full_name'] = format_name(details['full_name'])
	    if details['address']:
	        details['address'] = format_address(details['address'])
	    if details['birth_place']:
	        details['birth_place'] = format_birth_place(details['birth_place'])
	    if details['birth_date']:
	        details['birth_date'] = format_birth_date(details['birth_date'])

	    return details

	# Keys of an OCR result mapping that may hold recognised text, in lookup
	# order. Coordinate keys such as 'dt_polys' are deliberately not listed:
	# they hold geometry, not text.
	_OCR_TEXT_KEYS = (
	    'rec_texts', 'rec_text', 'ocr_text', 'text', 'texts',
	    'result', 'results', 'ocr_result',
	)


	def _empty_police_result():
	    """Return the all-empty, unsuccessful police-clearance result dict."""
	    return {'clearance_type': 'police', 'id_number': None, 'full_name': None, 'address': None, 'birth_date': None, 'birth_place': None, 'citizenship': None, 'gender': None, 'status': None, 'success': False}


	def _texts_from_value(key, val):
	    """Return the text strings contained in *val*, the value stored at *key*."""
	    if isinstance(val, str):
	        return [val] if val else []
	    if not isinstance(val, list):
	        return []

	    texts = []
	    for item in val:
	        if isinstance(item, str):
	            # Blank strings carry no information for the field parser.
	            if item.strip():
	                texts.append(item.strip())
	        elif key == 'rec_texts':
	            if item:
	                texts.append(str(item))
	        elif isinstance(item, (list, tuple)) and len(item) >= 2:
	            # Older layout: [[coords], (text, confidence)]
	            text_part = item[1]
	            if isinstance(text_part, (list, tuple)) and len(text_part) >= 1:
	                texts.append(str(text_part[0]))
	    return texts


	def _texts_from_mapping(mapping, tag):
	    """Return the texts under the first key of _OCR_TEXT_KEYS that yields any."""
	    for key in _OCR_TEXT_KEYS:
	        if key not in mapping:
	            continue
	        val = mapping[key]
	        print(f"{tag}: Found key '{key}': {type(val)}, length: {len(val) if isinstance(val, list) else 'N/A'}", file=sys.stderr)
	        texts = _texts_from_value(key, val)
	        if texts:
	            return texts
	    return []


	def _texts_from_ocr_result(ocr_result, tag):
	    """Extract text from one result object, trying dict access, `json`, then attributes."""
	    texts = []

	    # 1) The result object behaves like a dict.
	    try:
	        if hasattr(ocr_result, 'keys'):
	            ocr_dict = dict(ocr_result)
	            print(f"{tag}: OCRResult as dict keys: {list(ocr_dict.keys())}", file=sys.stderr)
	            texts = _texts_from_mapping(ocr_dict, tag)
	    except Exception as e:
	        print(f"{tag}: Error accessing OCRResult as dict: {e}", file=sys.stderr)

	    # 2) A `json` member. NOTE(review): depending on the installed version
	    # this may be a method or a property, and the payload may be nested
	    # under a 'res' key -- both shapes are accepted; confirm against the
	    # PaddleOCR version in use.
	    if not texts:
	        try:
	            json_member = getattr(ocr_result, 'json', None)
	            if json_member is not None:
	                json_data = json_member() if callable(json_member) else json_member
	                print(f"{tag}: OCRResult.json() type: {type(json_data)}", file=sys.stderr)
	                if isinstance(json_data, dict):
	                    print(f"{tag}: OCRResult.json() keys: {list(json_data.keys())}", file=sys.stderr)
	                    nested = json_data.get('res')
	                    texts = _texts_from_mapping(nested if isinstance(nested, dict) else json_data, tag)
	        except Exception as e:
	            print(f"{tag}: Error calling json(): {e}", file=sys.stderr)

	    # 3) Plain attributes.
	    for attr in ('rec_text', 'text'):
	        if texts:
	            break
	        if hasattr(ocr_result, attr):
	            value = getattr(ocr_result, attr)
	            print(f"{tag}: Found {attr} attribute: {type(value)}", file=sys.stderr)
	            if isinstance(value, list):
	                texts = [str(t) for t in value if t]
	            elif value:
	                texts = [str(value)]

	    # Nothing worked: dump what the object looks like to help debugging.
	    if not texts:
	        print(f"{tag}: Could not find text in OCRResult, trying to inspect structure", file=sys.stderr)
	        try:
	            print(f"{tag}: OCRResult repr: {repr(ocr_result)[:500]}", file=sys.stderr)
	            if hasattr(ocr_result, 'keys'):
	                all_keys = list(ocr_result.keys())
	                print(f"{tag}: All OCRResult keys: {all_keys}", file=sys.stderr)
	                for key in all_keys:
	                    try:
	                        val = ocr_result[key]
	                        print(f"{tag}: Key '{key}' type: {type(val)}, value preview: {str(val)[:100]}", file=sys.stderr)
	                    except Exception:
	                        # Diagnostics only; a key that cannot be read is skipped.
	                        continue
	        except Exception as e:
	            print(f"{tag}: Error inspecting structure: {e}", file=sys.stderr)

	    return texts


	def _log_raw_results(results, tag):
	    """Print a short description of the raw OCR return value to stderr."""
	    print(f"{tag}: Raw OCR results type: {type(results)}", file=sys.stderr)
	    print(f"{tag}: Raw OCR results is None: {results is None}", file=sys.stderr)
	    if results is None:
	        return
	    print(f"{tag}: Raw OCR results length: {len(results) if isinstance(results, list) else 'N/A'}", file=sys.stderr)
	    if isinstance(results, list) and len(results) > 0:
	        first = results[0]
	        print(f"{tag}: First level item type: {type(first)}", file=sys.stderr)
	        print(f"{tag}: First level item: {str(first)[:200] if first else 'None'}", file=sys.stderr)
	        if isinstance(first, list) and len(first) > 0:
	            print(f"{tag}: Second level first item: {str(first[0])[:200] if first[0] else 'None'}", file=sys.stderr)


	def _collect_ocr_text(results, tag):
	    """Flatten raw OCR output (result objects or nested lists) into text lines.

	    Errors while walking the structure are logged to stderr; whatever text
	    was gathered up to that point is returned.
	    """
	    all_text = []
	    try:
	        if results and isinstance(results, list):
	            first_item = results[0]
	            item_type_name = type(first_item).__name__
	            is_ocr_result = (
	                'OCRResult' in item_type_name
	                or 'ocr_result' in str(type(first_item)).lower()
	                or isinstance(first_item, dict)
	            )

	            if is_ocr_result:
	                print(f"{tag}: Detected OCRResult object format (type: {item_type_name})", file=sys.stderr)
	                print(f"{tag}: OCRResult attributes: {[a for a in dir(first_item) if not a.startswith('_')]}", file=sys.stderr)
	                for ocr_result in results:
	                    all_text.extend(_texts_from_ocr_result(ocr_result, tag))
	            else:
	                # Old format: [[ [coords], (text, confidence) ], ...], optionally
	                # wrapped in one more list per image.
	                rows = first_item if isinstance(first_item, list) else results
	                print(f"{tag}: Processing lines (old format), count: {len(rows) if isinstance(rows, list) else 'N/A'}", file=sys.stderr)
	                for item in rows:
	                    if isinstance(item, (list, tuple)) and len(item) >= 2:
	                        meta = item[1]
	                        if isinstance(meta, (list, tuple)) and len(meta) >= 1:
	                            all_text.append(str(meta[0]))
	    except Exception as e:
	        import traceback
	        print(f"{tag}: Error processing OCR results: {str(e)}", file=sys.stderr)
	        print(f"{tag}: Traceback: {traceback.format_exc()}", file=sys.stderr)
	        if results and isinstance(results, list) and len(results) > 0:
	            first_item = results[0]
	            print(f"{tag}: First item attributes: {dir(first_item)}", file=sys.stderr)
	            if hasattr(first_item, '__dict__'):
	                print(f"{tag}: First item dict: {first_item.__dict__}", file=sys.stderr)
	    return all_text


	def extract_ocr_lines(image_path):
	    """Run OCR with the plain configuration and parse the clearance fields.

	    Document orientation classification, unwarping and text-line
	    orientation are all switched off. If ``ocr.ocr()`` raises, ``predict()``
	    is tried when the OCR object has it.

	    Args:
	        image_path: Path of the image file to read.

	    Returns:
	        ``{'success': False, 'error': 'File not found'}`` when the file does
	        not exist; otherwise the dict produced by extract_police_details(),
	        or the same dict shape with every field None and success False when
	        no text was recognised.
	    """
	    if not os.path.exists(image_path):
	        return {'success': False, 'error': 'File not found'}

	    file_size = os.path.getsize(image_path)
	    print(f"DEBUG: Image file size: {file_size} bytes", file=sys.stderr)

	    # Keep anything the OCR library prints away from the JSON on stdout.
	    with redirect_stdout(sys.stderr), redirect_stderr(sys.stderr):
	        ocr = PaddleOCR(
	            use_doc_orientation_classify=False,
	            use_doc_unwarping=False,
	            use_textline_orientation=False,
	            lang='en'
	        )
	        try:
	            results = ocr.ocr(image_path)
	        except Exception as e:
	            print(f"DEBUG: ocr() failed: {e}, trying predict()", file=sys.stderr)
	            results = ocr.predict(image_path) if hasattr(ocr, 'predict') else None

	    _log_raw_results(results, 'DEBUG')
	    all_text = _collect_ocr_text(results, 'DEBUG')
	    print(f"DEBUG: Extracted text lines: {all_text}", file=sys.stderr)

	    return extract_police_details(all_text) if all_text else _empty_police_result()


	def extract_ocr_lines_simple(image_path):
	    """Run OCR with orientation classification and unwarping enabled.

	    Intended as the second attempt after extract_ocr_lines(). Unlike that
	    function it neither checks that the file exists nor falls back to
	    ``predict()``; an exception from the OCR call propagates to the caller.

	    Returns:
	        The dict produced by extract_police_details(), or the same dict
	        shape with every field None and success False when no text was
	        recognised.
	    """
	    with redirect_stdout(sys.stderr), redirect_stderr(sys.stderr):
	        ocr = PaddleOCR(
	            use_doc_orientation_classify=True,
	            use_doc_unwarping=True,
	            use_textline_orientation=True,
	            lang='en'
	        )
	        results = ocr.ocr(image_path)

	    _log_raw_results(results, 'DEBUG (fallback)')
	    all_text = _collect_ocr_text(results, 'DEBUG (fallback)')
	    print(f"DEBUG (fallback): Extracted text lines: {all_text}", file=sys.stderr)

	    return extract_police_details(all_text) if all_text else _empty_police_result()

	# Main Execution
	#
	# Usage: <script> <image_url>
	# Exactly one JSON document is written to the real stdout; all diagnostics
	# go to stderr. The exit status is 1 when no URL was given or when an
	# exception was raised, otherwise 0.
	_TEMP_IMAGE_PATH = 'temp_police_image.jpg'

	if len(sys.argv) < 2:
	    sys.stdout = original_stdout
	    print(json.dumps({"success": False, "error": "No image URL provided"}))
	    sys.exit(1)

	image_url = sys.argv[1]
	print(f"DEBUG: Processing Police Clearance image URL: {image_url}", file=sys.stderr)

	try:
	    image_path = download_image(image_url, _TEMP_IMAGE_PATH)
	    print(f"DEBUG: Image downloaded to: {image_path}", file=sys.stderr)

	    # Try the original OCR method first
	    ocr_results = extract_ocr_lines(image_path)
	    print(f"DEBUG: OCR results from extract_ocr_lines: {ocr_results}", file=sys.stderr)

	    # If original method fails, try simple method with advanced features
	    if not ocr_results['success']:
	        print("DEBUG: Original method failed, trying simple method with advanced features", file=sys.stderr)
	        ocr_results = extract_ocr_lines_simple(image_path)
	        print(f"DEBUG: OCR results from extract_ocr_lines_simple: {ocr_results}", file=sys.stderr)

	    # The temporary image is removed in the `finally` clause only, so a
	    # failing delete can no longer turn a successful OCR run into an error
	    # response.
	    response = {
	        "success": ocr_results['success'],
	        "data": ocr_results
	    }

	    sys.stdout = original_stdout
	    sys.stdout.write(json.dumps(response))
	    sys.stdout.flush()

	except Exception as e:
	    sys.stdout = original_stdout
	    sys.stdout.write(json.dumps({"success": False, "error": str(e)}))
	    sys.stdout.flush()
	    sys.exit(1)
	finally:
	    # Best-effort cleanup: report, but never fail, on a file-system error.
	    try:
	        if os.path.exists(_TEMP_IMAGE_PATH):
	            os.remove(_TEMP_IMAGE_PATH)
	    except OSError as cleanup_error:
	        print(f"DEBUG: Could not remove temporary image: {cleanup_error}", file=sys.stderr)