Spaces:

takomattyy
/

handyhome-ocr-api

Sleeping

App Files Files Community

handyhome-ocr-api / extract_nbi_ocr.py

takomattyy

Upload 2 files

2b089f9 verified 20 days ago

raw

history blame contribute delete

23.6 kB

	import sys, json, os, glob, requests
	import re
	import time
	import shutil
	from contextlib import redirect_stdout, redirect_stderr

	# Keep a handle on the real stdout and point sys.stdout at stderr, so that
	# anything printed before the final JSON response ends up on stderr.
	original_stdout, sys.stdout = sys.stdout, sys.stderr

	# Environment tweaks applied before PaddleOCR is imported: request
	# error-level logging only, and select an offscreen Qt platform / dummy X
	# display (presumably so GUI-dependent imports work on a headless host).
	os.environ.update({
	    'PADDLEOCR_LOG_LEVEL': 'ERROR',
	    'QT_QPA_PLATFORM': 'offscreen',
	    'DISPLAY': ':99',
	})

	# Import PaddleOCR after setting environment variables
	from paddleocr import PaddleOCR

	def download_image(url, output_path='temp_image.jpg'):
	    """Download the image at *url* and save it to *output_path*.

	    Any existing file at *output_path* is removed first, so a failed
	    download can never leave a stale image behind. A ``t=<timestamp>``
	    query parameter and no-cache headers are added to the request to
	    discourage cached responses.

	    Args:
	        url: Address of the image to fetch.
	        output_path: Destination file path for the downloaded bytes.

	    Returns:
	        *output_path*, once the response body has been written to it.

	    Raises:
	        requests.RequestException: On connection errors, a timeout (30 s),
	            or a non-2xx status (via ``raise_for_status``).
	        OSError: If the destination file cannot be removed or written.
	    """
	    # Remove any existing temp file
	    if os.path.exists(output_path):
	        os.remove(output_path)

	    # Add the cache-busting parameter to the query string. The fragment (if
	    # any) is split off first: appending after '#' would put the parameter
	    # inside the fragment, which is not part of the request.
	    timestamp = int(time.time())
	    base, hash_sep, fragment = url.partition('#')
	    separator = '&' if '?' in base else '?'
	    url = f'{base}{separator}t={timestamp}{hash_sep}{fragment}'

	    # Add headers to prevent caching
	    headers = {
	        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
	        'Cache-Control': 'no-cache, no-store, must-revalidate',
	        'Pragma': 'no-cache',
	        'Expires': '0'
	    }

	    response = requests.get(url, headers=headers, timeout=30)
	    response.raise_for_status()

	    with open(output_path, 'wb') as f:
	        f.write(response.content)

	    return output_path

	# OCR text parsing: extract NBI ID NO, Name, Birth Date, and LIT.

	# Candidate ID shapes: two 8-12 character alphanumeric halves joined by a
	# hyphen (e.g. B450JRLR0B-RC248667), separated by whitespace, or run together.
	_NBI_HYPHEN_ID_RE = re.compile(r'\b([A-Z0-9]{8,12}-[A-Z0-9]{8,12})\b')
	_NBI_SPACED_ID_RE = re.compile(r'\b([A-Z0-9]{8,12})\s+([A-Z0-9]{8,12})\b')
	_NBI_SOLID_ID_RE = re.compile(r'\b([A-Z0-9]{17,25})\b')
	_NBI_NEXT_LINE_ID_RE = re.compile(r'^[A-Z0-9-]{15,25}$')

	# Month names/abbreviations. The alternation uses real `|` operators (an
	# escaped `\|` would only match a literal pipe character) and must not be
	# glued to other letters, so "MARIA" is not read as "MAR".
	_NBI_MONTH_RE = re.compile(
	    r'(?<![A-Z])(?:JANUARY|FEBRUARY|MARCH|APRIL|MAY|JUNE|JULY|AUGUST|SEPTEMBER|'
	    r'OCTOBER|NOVEMBER|DECEMBER|JAN|FEB|MAR|APR|JUN|JUL|AUG|SEP|OCT|NOV|DEC)'
	    r'(?![A-Z])'
	)
	_NBI_NUMERIC_DATE_RE = re.compile(r'\d{1,2}[/-]\d{1,2}[/-]\d{4}')
	_NBI_DAY_MON_YEAR_RE = re.compile(r'\d{1,2}\s+[A-Z]{3}\s+\d{4}')
	_NBI_LETTERS_RE = re.compile(r'[A-Za-z]{2,}')

	# "ID" / "NO" only count as labels when they are not embedded in a longer
	# word; otherwise values such as "DAVID", "NOEL" or "NOVEMBER 15, 1990" would
	# be skipped as if they were labels. The glued form "IDNO" is still a label.
	_NBI_SHORT_LABEL_RE = re.compile(r'(?<![A-Z])(?:ID|NO)(?![A-Z])|IDNO')

	_NBI_ID_LABELS = ('NBIIDNO', 'NBI ID NO', 'NBI ID NUMBER', 'NBIID NUMBER')
	_NBI_ID_NEXT_LINE_SKIP = ('NAME', 'DATE', 'BIRTH', 'CLEARANCE')
	_NBI_ADDRESS_WORDS = ('STREET', 'ST', 'AVENUE', 'AVE', 'BRGY', 'BARANGAY', 'CITY', 'PHASE', 'DOMINGO', 'CAINTA', 'RIZAL')
	_NBI_ADDRESS_WORDS_SOLID = _NBI_ADDRESS_WORDS + ('ADDRESS', 'ATRSTORUARPHASEABRGY')
	_NBI_NAME_SKIP = ('NBI', 'DATE', 'BIRTH', 'CLEARANCE', 'REPUBLIC', 'PHILIPPINES', 'NATIONAL')
	_NBI_DATE_SKIP = ('NBI', 'NAME', 'CLEARANCE', 'REPUBLIC', 'PHILIPPINES', 'NATIONAL')
	_NBI_LIT_SKIP = ('NBI', 'NAME', 'DATE', 'BIRTH', 'CLEARANCE', 'REPUBLIC', 'PHILIPPINES', 'NATIONAL', 'VALID', 'UNTIL')


	def _nbi_after_colon(line):
	    """Return the stripped text after the first colon, or None if there is no colon."""
	    _, colon, rest = line.partition(':')
	    return rest.strip() if colon else None


	def _nbi_has_letters_and_digits(candidate):
	    """Return True if *candidate* has at least one A-Z letter and one digit."""
	    return bool(re.search(r'[A-Z]', candidate)) and bool(re.search(r'[0-9]', candidate))


	def _nbi_is_label(upper_text, long_labels):
	    """Return True if *upper_text* looks like a field label rather than a value."""
	    if any(label in upper_text for label in long_labels):
	        return True
	    return _NBI_SHORT_LABEL_RE.search(upper_text) is not None


	def _nbi_looks_like_date(text):
	    """Return True if *text* has a month name, a d/m/yyyy date, or 'dd MON yyyy'."""
	    return bool(
	        _NBI_MONTH_RE.search(text.upper())
	        or _NBI_NUMERIC_DATE_RE.search(text)
	        or _NBI_DAY_MON_YEAR_RE.search(text)
	    )


	def _nbi_valid_inline_name(text):
	    """Return True if text after a 'NAME:' label is plausible as a name."""
	    return bool(_NBI_LETTERS_RE.search(text)) and len(text) > 2


	def _nbi_valid_name_line(text):
	    """Return True if a line following a NAME label is plausible as a name."""
	    return bool(_NBI_LETTERS_RE.search(text)) and len(text) > 3


	def _nbi_standalone_hyphen_id(line):
	    """Return a hyphenated ID found on a line of at most three words, else None."""
	    match = _NBI_HYPHEN_ID_RE.search(line)
	    if match is None:
	        return None
	    candidate = match.group(1)  # the pattern itself bounds this to 17-25 chars
	    # IDs are normally on their own line; long lines are likely addresses.
	    if len(line.split()) > 3:
	        return None
	    return candidate if _nbi_has_letters_and_digits(candidate) else None


	def _nbi_id_from_label(lines, index):
	    """Return (id, how) for an ID on or just after an 'NBI ID NO' label line, else None."""
	    line = lines[index]
	    upper = line.upper()
	    if not any(label in upper for label in _NBI_ID_LABELS):
	        return None

	    inline = _nbi_after_colon(line)
	    if inline is not None:
	        compact = re.sub(r'\s+', '', inline)
	        if len(compact) > 5:
	            return compact, 'same line'

	    # The value may have been recognised as a separate line: look two ahead.
	    for following in lines[index + 1:index + 3]:
	        if len(following) < 5 or any(label in following.upper() for label in _NBI_ID_NEXT_LINE_SKIP):
	            continue
	        compact = following.replace(' ', '')
	        if _NBI_NEXT_LINE_ID_RE.match(compact):
	            return compact, 'next line'
	    return None


	def _nbi_split_solid(candidate):
	    """Return a hyphen-joined split of a run-together ID into two 8-12 char halves, or None."""
	    mid = len(candidate) // 2
	    for cut in range(mid - 2, mid + 3):
	        head, tail = candidate[:cut], candidate[cut:]
	        if 8 <= len(head) <= 12 and 8 <= len(tail) <= 12:
	            return f"{head}-{tail}"
	    return None


	def _nbi_id_from_pattern(line):
	    """Return (id, how) for an unlabelled ID lacking its hyphen, else None.

	    The hyphenated shape is not retried here: every line was already
	    tested for it, under less strict conditions, in the first pass.
	    """
	    match = _NBI_SPACED_ID_RE.search(line)
	    if match:
	        candidate = '-'.join(match.groups())
	        if (_nbi_has_letters_and_digits(candidate)
	                and not any(word in candidate for word in _NBI_ADDRESS_WORDS)):
	            return candidate, 'space pattern'

	    match = _NBI_SOLID_ID_RE.search(line)
	    if match:
	        candidate = match.group(1)
	        if (_nbi_has_letters_and_digits(candidate)
	                and not any(word in candidate for word in _NBI_ADDRESS_WORDS_SOLID)):
	            joined = _nbi_split_solid(candidate)
	            if joined:
	                return joined, 'no hyphen, split'
	    return None


	def _nbi_value_near_label(lines, index, valid_inline, valid_line, skip_labels, lookahead):
	    """Return (value, same_line) for the value belonging to the label at *index*.

	    The text after a colon on the label line is tried first; otherwise up to
	    *lookahead* following lines are scanned, ignoring lines that look like
	    labels. Returns None when nothing acceptable is found.
	    """
	    inline = _nbi_after_colon(lines[index])
	    if inline is not None and valid_inline(inline):
	        return inline, True
	    for following in lines[index + 1:index + 1 + lookahead]:
	        if _nbi_is_label(following.upper(), skip_labels):
	            continue
	        if valid_line(following):
	            return following, False
	    return None


	def _nbi_debug_found(what, value, same_line):
	    """Write a 'found field' debug line to stderr."""
	    suffix = ' (same line)' if same_line else ''
	    print(f"DEBUG: Found {what}{suffix}: {value}", file=sys.stderr)


	def extract_nbi_id(lines):
	    """Extract the ID number, full name, birth date and LIT from OCR text lines.

	    Args:
	        lines: Iterable of recognised text lines (each is converted with
	            ``str`` and stripped). ``None`` is treated as no lines.

	    Returns:
	        dict with keys ``clearance_type`` (always ``'nbi'``), ``id_number``,
	        ``full_name``, ``birth_date``, ``lit`` (each a string or ``None``)
	        and ``success`` (True when an ID number or a full name was found).
	    """
	    nbi_id = None
	    full_name = None
	    birth_date = None
	    lit = None  # LIT field (meaning not established here; taken verbatim from the document)

	    cleaned_lines = [str(line).strip() for line in (lines or [])]

	    # First pass: a standalone hyphenated ID anywhere in the text is the most
	    # reliable signal, and also catches IDs that sit on a line without a label.
	    for line in cleaned_lines:
	        nbi_id = _nbi_standalone_hyphen_id(line)
	        if nbi_id:
	            print(f"DEBUG: Found NBI ID (first pass, hyphen): {nbi_id}", file=sys.stderr)
	            break

	    # Second pass: remaining fields, plus label/pattern based ID fallbacks.
	    for index, line in enumerate(cleaned_lines):
	        upper = line.upper()

	        if not nbi_id:
	            found = _nbi_id_from_label(cleaned_lines, index) or _nbi_id_from_pattern(line)
	            if found:
	                nbi_id, how = found
	                print(f"DEBUG: Found NBI ID ({how}): {nbi_id}", file=sys.stderr)
	                continue  # a line that yielded the ID is not searched for other fields

	        # "NAME" label, excluding lines that are really the NBI ID label.
	        if not full_name and 'NAME' in upper and not ('NBI' in upper and 'ID' in upper):
	            found = _nbi_value_near_label(
	                cleaned_lines, index, _nbi_valid_inline_name, _nbi_valid_name_line,
	                _NBI_NAME_SKIP, 4)
	            if found:
	                full_name, same_line = found
	                _nbi_debug_found('full name', full_name, same_line)
	                if same_line:
	                    continue

	        # "DATE OF BIRTH" / "BIRTH DATE" / "BIRTHDATE": all contain both words.
	        if not birth_date and 'DATE' in upper and 'BIRTH' in upper:
	            found = _nbi_value_near_label(
	                cleaned_lines, index, _nbi_looks_like_date, _nbi_looks_like_date,
	                _NBI_DATE_SKIP, 4)
	            if found:
	                birth_date, same_line = found
	                _nbi_debug_found('birth date', birth_date, same_line)
	                if same_line:
	                    continue

	        # "LIT" label; any non-empty value is accepted.
	        if not lit and 'LIT' in upper and not ('ID' in upper and 'NBI' in upper):
	            found = _nbi_value_near_label(
	                cleaned_lines, index, bool, bool, _NBI_LIT_SKIP, 3)
	            if found:
	                lit, same_line = found
	                _nbi_debug_found('LIT', lit, same_line)

	    return {
	        'clearance_type': 'nbi',
	        'id_number': nbi_id,
	        'full_name': full_name,
	        'birth_date': birth_date,
	        'lit': lit,
	        'success': nbi_id is not None or full_name is not None
	    }

	def extract_ocr_lines_simple(image_path):
	    """Run PaddleOCR with orientation/unwarping enabled and parse the NBI fields.

	    The recognised text lines are collected from whichever result layout
	    PaddleOCR returned and handed to ``extract_nbi_id``. When no text is
	    collected, an all-``None`` result with ``success`` False is returned.
	    """
	    ocr_options = {
	        'use_doc_orientation_classify': True,  # document orientation detection
	        'use_doc_unwarping': True,             # document unwarping
	        'use_textline_orientation': True,      # text line orientation
	        'lang': 'en',
	    }

	    with redirect_stdout(sys.stderr), redirect_stderr(sys.stderr):
	        engine = PaddleOCR(**ocr_options)
	        try:
	            results = engine.predict(image_path)
	        except Exception as e:
	            # Fall back to the ocr() entry point when predict() fails.
	            print(f"DEBUG: predict() failed: {e}, trying ocr()", file=sys.stderr)
	            results = engine.ocr(image_path) if hasattr(engine, 'ocr') else None

	    all_text = []
	    try:
	        if results and isinstance(results, list):
	            head = results[0]
	            head_type = type(head).__name__

	            if 'OCRResult' in head_type or 'ocr_result' in str(type(head)).lower():
	                # Result-object layout: a mapping that carries 'rec_texts'.
	                print(f"DEBUG: Detected OCRResult object format (type: {head_type})", file=sys.stderr)
	                try:
	                    if hasattr(head, 'keys'):
	                        rec_texts = dict(head).get('rec_texts')
	                        if isinstance(rec_texts, list):
	                            all_text = [str(text) for text in rec_texts if text]
	                            print(f"DEBUG: Extracted {len(all_text)} text lines from rec_texts", file=sys.stderr)
	                except Exception as e:
	                    print(f"DEBUG: Error accessing OCRResult: {e}", file=sys.stderr)
	            else:
	                # List layout: each entry is (box, (text, ...)).
	                entries = head if isinstance(head, list) else results
	                for entry in entries:
	                    if not isinstance(entry, (list, tuple)) or len(entry) < 2:
	                        continue
	                    meta = entry[1]
	                    if isinstance(meta, (list, tuple)) and len(meta) >= 1:
	                        all_text.append(str(meta[0]))
	    except Exception as e:
	        print(f"DEBUG: Error processing OCR results: {str(e)}", file=sys.stderr)

	    if not all_text:
	        return {'clearance_type': 'nbi', 'id_number': None, 'full_name': None, 'birth_date': None, 'lit': None, 'success': False}
	    return extract_nbi_id(all_text)

	def extract_ocr_lines(image_path):
	    """Run PaddleOCR with default (no orientation/unwarping) settings and parse the NBI fields.

	    Args:
	        image_path: Path of the image file to recognise.

	    Returns:
	        The dict produced by ``extract_nbi_id`` for the recognised text, or
	        an all-``None`` result with ``success`` False when the file does not
	        exist or no text was recognised. Every return path carries the same
	        keys, including ``lit``.
	    """
	    empty_result = {'clearance_type': 'nbi', 'id_number': None, 'full_name': None, 'birth_date': None, 'lit': None, 'success': False}

	    # Check that the file exists before doing any work
	    if not os.path.exists(image_path):
	        return empty_result

	    # Ensure output directory exists
	    os.makedirs("output", exist_ok=True)

	    # Clear previous output files. This is best-effort housekeeping: real
	    # sub-directories are skipped (os.remove cannot delete them) and a file
	    # that cannot be removed is reported instead of aborting the OCR run.
	    for old_file in glob.glob("output/*"):
	        if os.path.isdir(old_file) and not os.path.islink(old_file):
	            continue
	        try:
	            os.remove(old_file)
	        except OSError as e:
	            print(f"DEBUG: Could not remove old output file {old_file}: {e}", file=sys.stderr)

	    with redirect_stdout(sys.stderr), redirect_stderr(sys.stderr):
	        ocr = PaddleOCR(
	            use_doc_orientation_classify=False,
	            use_doc_unwarping=False,
	            use_textline_orientation=False,
	            lang='en'
	        )
	        try:
	            results = ocr.predict(image_path)
	        except Exception as e:
	            # Fall back to the ocr() entry point when predict() fails.
	            print(f"DEBUG: predict() failed: {e}, trying ocr()", file=sys.stderr)
	            if hasattr(ocr, 'ocr'):
	                results = ocr.ocr(image_path)
	            else:
	                results = None

	    # Process OCR results - handle both old format (list) and new format (OCRResult object)
	    all_text = []
	    try:
	        if results and isinstance(results, list) and len(results) > 0:
	            first_item = results[0]
	            item_type_name = type(first_item).__name__
	            is_ocr_result = 'OCRResult' in item_type_name or 'ocr_result' in str(type(first_item)).lower()

	            if is_ocr_result:
	                print(f"DEBUG: Detected OCRResult object format (type: {item_type_name})", file=sys.stderr)
	                # Access OCRResult as dictionary and read its 'rec_texts' list
	                try:
	                    if hasattr(first_item, 'keys'):
	                        ocr_dict = dict(first_item)
	                        if 'rec_texts' in ocr_dict:
	                            rec_texts = ocr_dict['rec_texts']
	                            if isinstance(rec_texts, list):
	                                all_text = [str(t) for t in rec_texts if t]
	                                print(f"DEBUG: Extracted {len(all_text)} text lines from rec_texts", file=sys.stderr)
	                except Exception as e:
	                    print(f"DEBUG: Error accessing OCRResult: {e}", file=sys.stderr)
	            else:
	                # Old format - list of lists; each entry is (box, (text, ...))
	                lines = results[0] if results and isinstance(results[0], list) else results
	                for item in lines:
	                    if isinstance(item, (list, tuple)) and len(item) >= 2:
	                        meta = item[1]
	                        if isinstance(meta, (list, tuple)) and len(meta) >= 1:
	                            all_text.append(str(meta[0]))
	    except Exception as e:
	        print(f"DEBUG: Error processing OCR results: {str(e)}", file=sys.stderr)
	        import traceback
	        print(f"DEBUG: Traceback: {traceback.format_exc()}", file=sys.stderr)

	    print(f"DEBUG: Extracted text lines: {all_text}", file=sys.stderr)
	    return extract_nbi_id(all_text) if all_text else empty_result

	# Main: download the image named on the command line, run OCR on it and
	# write exactly one JSON document to the real stdout. All diagnostics go to
	# stderr. Exit status is 1 when no URL was given or an exception occurred.
	if len(sys.argv) < 2:
	    sys.stdout = original_stdout
	    print(json.dumps({"success": False, "error": "No image URL provided"}))
	    sys.exit(1)

	image_url = sys.argv[1]
	print(f"DEBUG: Processing NBI image URL: {image_url}", file=sys.stderr)

	try:
	    image_path = download_image(image_url, 'temp_image.jpg')
	    print(f"DEBUG: Image downloaded to: {image_path}", file=sys.stderr)

	    # Try the original OCR method first
	    ocr_results = extract_ocr_lines(image_path)
	    print(f"DEBUG: OCR results from extract_ocr_lines: {ocr_results}", file=sys.stderr)

	    # If original method fails, try simple method
	    if not ocr_results['success']:
	        print("DEBUG: Original method failed, trying simple method", file=sys.stderr)
	        ocr_results = extract_ocr_lines_simple(image_path)
	        print(f"DEBUG: OCR results from extract_ocr_lines_simple: {ocr_results}", file=sys.stderr)

	    # Clean up the temporary file
	    if os.path.exists(image_path):
	        os.remove(image_path)

	    # Create the response object
	    response = {
	        "success": ocr_results['success'],
	        "ocr_results": ocr_results
	    }

	    # Restore stdout and print only the JSON response
	    sys.stdout = original_stdout
	    sys.stdout.write(json.dumps(response))
	    sys.stdout.flush()

	except Exception as e:
	    # Restore stdout for error JSON
	    sys.stdout = original_stdout
	    sys.stdout.write(json.dumps({"success": False, "error": str(e)}))
	    sys.stdout.flush()
	    sys.exit(1)
	finally:
	    # Best-effort removal of the temp image on every exit path. Only
	    # filesystem errors are ignored; a bare `except:` here would also
	    # swallow KeyboardInterrupt raised during the cleanup.
	    try:
	        if os.path.exists('temp_image.jpg'):
	            os.remove('temp_image.jpg')
	    except OSError:
	        pass