handyhome-ocr-api / extract_nbi_ocr.py
takomattyy's picture
Upload 2 files
2b089f9 verified
import sys, json, os, glob, requests
import re
import time
import shutil
from contextlib import redirect_stdout, redirect_stderr
# Keep a handle on the real stdout, then point sys.stdout at stderr so that
# anything printed while the script runs (debug lines, library output) goes to
# stderr. The real stdout is restored at the end of the script and receives
# only the final JSON document.
original_stdout = sys.stdout
sys.stdout = sys.stderr
# These environment variables are set before paddleocr is imported below.
# PADDLEOCR_LOG_LEVEL is intended to quiet PaddleOCR's logging.
# NOTE(review): QT_QPA_PLATFORM/DISPLAY presumably let GUI-linked dependencies
# load on a headless host — confirm they are still needed.
os.environ['PADDLEOCR_LOG_LEVEL'] = 'ERROR'
os.environ['QT_QPA_PLATFORM'] = 'offscreen'
os.environ['DISPLAY'] = ':99'
# Import PaddleOCR after setting environment variables
from paddleocr import PaddleOCR
def download_image(url, output_path='temp_image.jpg'):
    """Download the image at *url* and save it to *output_path*.

    A ``t=<unix timestamp>`` query parameter and no-cache request headers are
    sent so that intermediaries do not serve a stale copy of the image.

    Args:
        url: HTTP(S) address of the image.
        output_path: Destination file; an existing file at this path is
            removed before the download starts.

    Returns:
        ``output_path``, once the file has been written.

    Raises:
        requests.RequestException: on connection errors, timeouts or a
            non-2xx response (via ``raise_for_status``).
        ValueError: if the server answered successfully but with an empty body.
    """
    # Remove any existing temp file so a failed download can never leave a
    # previous image behind to be OCR'd by mistake.
    if os.path.exists(output_path):
        os.remove(output_path)

    # Headers to discourage caches from answering on the origin's behalf.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Cache-Control': 'no-cache, no-store, must-revalidate',
        'Pragma': 'no-cache',
        'Expires': '0',
    }

    # Let requests merge the cache-busting parameter into the query string:
    # it handles URLs with or without an existing query and keeps a trailing
    # "#fragment" out of the way, which plain string concatenation does not.
    response = requests.get(
        url,
        params={'t': int(time.time())},
        headers=headers,
        timeout=30,
    )
    response.raise_for_status()

    image_data = response.content
    if not image_data:
        # An empty file would only fail later, inside OCR, with a less
        # helpful error.
        raise ValueError("Downloaded image is empty")

    with open(output_path, 'wb') as f:
        f.write(image_data)
    return output_path
# --- NBI clearance field extraction -----------------------------------------

# Hyphenated ID such as B450JRLR0B-RC248667: two 8-12 character alphanumeric
# halves, so a match is always 17-25 characters long.
_HYPHEN_ID_RE = re.compile(r'\b([A-Z0-9]{8,12}-[A-Z0-9]{8,12})\b')
# Same two halves separated by whitespace instead of a hyphen.
_SPACED_ID_RE = re.compile(r'\b([A-Z0-9]{8,12})\s+([A-Z0-9]{8,12})\b')
# Both halves run together without any separator.
_SOLID_ID_RE = re.compile(r'\b([A-Z0-9]{17,25})\b')
# Shape accepted for an ID found on the line(s) after the ID label.
_NEXT_LINE_ID_RE = re.compile(r'^[A-Z0-9-]{15,25}$')

# Spellings of the ID label (compared against the upper-cased line).
_ID_LABELS = ('NBIIDNO', 'NBI ID NO', 'NBI ID NUMBER', 'NBIID NUMBER')

# Substrings that disqualify an ID candidate because it is probably address text.
_ADDRESS_WORDS = ('STREET', 'ST', 'AVENUE', 'AVE', 'BRGY', 'BARANGAY', 'CITY',
                  'PHASE', 'DOMINGO', 'CAINTA', 'RIZAL')
_ADDRESS_WORDS_SOLID = _ADDRESS_WORDS + ('ADDRESS', 'ATRSTORUARPHASEABRGY')

# "ID" and "NO" are label words, but as plain substrings they also occur in
# real values (DAVID, NOEL, NOVEMBER), so they are matched as whole words only.
_SHORT_LABEL_RE = re.compile(r'\b(?:ID\s*NO|ID|NO)\b')
# Longer label keywords, matched as substrings of the upper-cased line.
_NAME_SKIP_LABELS = ('NBI', 'DATE', 'BIRTH', 'CLEARANCE', 'REPUBLIC',
                     'PHILIPPINES', 'NATIONAL')
_DATE_SKIP_LABELS = ('NBI', 'NAME', 'CLEARANCE', 'REPUBLIC', 'PHILIPPINES',
                     'NATIONAL')
_LIT_SKIP_LABELS = ('NBI', 'NAME', 'DATE', 'BIRTH', 'CLEARANCE', 'REPUBLIC',
                    'PHILIPPINES', 'NATIONAL', 'VALID', 'UNTIL')

_MONTH_RE = re.compile(
    r'(JANUARY|FEBRUARY|MARCH|APRIL|MAY|JUNE|JULY|AUGUST|SEPTEMBER|OCTOBER|'
    r'NOVEMBER|DECEMBER|JAN|FEB|MAR|APR|JUN|JUL|AUG|SEP|OCT|NOV|DEC)'
)
_NUMERIC_DATE_RE = re.compile(r'\d{1,2}[/-]\d{1,2}[/-]\d{4}')
_DAY_MON_YEAR_RE = re.compile(r'\d{1,2}\s+[A-Z]{3}\s+\d{4}')
_NAME_LETTERS_RE = re.compile(r'[A-Za-z]{2,}')


def _has_letters_and_digits(text):
    """Return True when *text* has at least one A-Z letter and one digit."""
    return bool(re.search(r'[A-Z]', text)) and bool(re.search(r'[0-9]', text))


def _is_label_line(text_upper, keywords):
    """Return True when an upper-cased line looks like a label, not a value."""
    if _SHORT_LABEL_RE.search(text_upper):
        return True
    return any(keyword in text_upper for keyword in keywords)


def _looks_like_date(text):
    """Return True when *text* contains something shaped like a date."""
    # A month name only counts together with a digit; otherwise ordinary words
    # that merely contain a month abbreviation (MARIA, DECLARED) would pass.
    if _MONTH_RE.search(text.upper()) and re.search(r'\d', text):
        return True
    return bool(_NUMERIC_DATE_RE.search(text) or _DAY_MON_YEAR_RE.search(text))


def _find_nbi_id(cleaned_lines, index):
    """Try every second-pass ID strategy on line *index*; return the ID or None."""
    line = cleaned_lines[index]
    line_upper = line.upper()

    if any(label in line_upper for label in _ID_LABELS):
        # Value on the label line, after the colon.
        if ':' in line:
            id_candidate = re.sub(r'\s+', '', line.split(':', 1)[1])
            if len(id_candidate) > 5:  # anything shorter cannot be an ID
                print(f"DEBUG: Found NBI ID (same line): {id_candidate}", file=sys.stderr)
                return id_candidate

        # Value on the label line but OCR dropped the colon. The first pass
        # skipped this line because label + ID exceed its 3-word limit.
        match = _HYPHEN_ID_RE.search(line)
        if match and _has_letters_and_digits(match.group(1)):
            print(f"DEBUG: Found NBI ID (hyphen pattern): {match.group(1)}", file=sys.stderr)
            return match.group(1)

        # Value on one of the next two lines.
        for next_line in cleaned_lines[index + 1:index + 3]:
            if len(next_line) < 5 or any(
                    label in next_line.upper()
                    for label in ('NAME', 'DATE', 'BIRTH', 'CLEARANCE')):
                continue
            compact = next_line.replace(' ', '')
            # Require a digit: a name with its spaces removed is also 15-25
            # upper-case characters and must not be mistaken for the ID.
            if _NEXT_LINE_ID_RE.match(compact) and re.search(r'[0-9]', compact):
                print(f"DEBUG: Found NBI ID (next line): {compact}", file=sys.stderr)
                return compact

    # Standalone hyphenated IDs were already handled by the first pass, so
    # only the space-separated and unseparated layouts remain.
    match = _SPACED_ID_RE.search(line)
    if match:
        candidate = '-'.join(match.groups())
        if (_has_letters_and_digits(candidate)
                and not any(word in candidate for word in _ADDRESS_WORDS)):
            print(f"DEBUG: Found NBI ID (space pattern): {candidate}", file=sys.stderr)
            return candidate

    match = _SOLID_ID_RE.search(line)
    if match:
        candidate = match.group(1)
        if (_has_letters_and_digits(candidate)
                and not any(word in candidate for word in _ADDRESS_WORDS_SOLID)):
            # Re-insert the hyphen near the middle; the first split that gives
            # two 8-12 character halves wins.
            mid = len(candidate) // 2
            for split_point in range(mid - 2, mid + 3):
                if not 8 <= split_point <= len(candidate) - 8:
                    continue
                part1, part2 = candidate[:split_point], candidate[split_point:]
                if 8 <= len(part1) <= 12 and 8 <= len(part2) <= 12:
                    found = f"{part1}-{part2}"
                    print(f"DEBUG: Found NBI ID (no hyphen, split): {found}", file=sys.stderr)
                    return found
    return None


def extract_nbi_id(lines):
    """Extract the NBI clearance fields from OCR text lines.

    The ID number is searched first as a standalone hyphenated token on a
    short line, then (second pass) via the "NBI ID NO" label and the
    space-separated / unseparated layouts. Name, birth date and the "LIT"
    field are each taken from the text after the colon on their label line,
    or else from the first plausible value on the following lines.

    Args:
        lines: Iterable of OCR text items; each one is converted with
            ``str()`` and stripped.

    Returns:
        dict with keys ``clearance_type`` (always ``'nbi'``), ``id_number``,
        ``full_name``, ``birth_date``, ``lit`` (each a string or ``None``) and
        ``success`` (True when an ID number or a full name was found).
    """
    cleaned_lines = [str(line).strip() for line in lines]

    nbi_id = None
    full_name = None
    birth_date = None
    lit = None  # value that follows a "LIT" label; the meaning of the field is not defined here

    # First pass: a hyphenated ID on a line of at most three words is the most
    # reliable signal, so it takes priority over everything else.
    for line in cleaned_lines:
        match = _HYPHEN_ID_RE.search(line)
        if match is None:
            continue
        candidate = match.group(1)
        if len(line.split()) <= 3 and _has_letters_and_digits(candidate):
            nbi_id = candidate
            print(f"DEBUG: Found NBI ID (first pass, hyphen): {nbi_id}", file=sys.stderr)
            break

    # Second pass: remaining ID strategies plus the other fields. A line that
    # yields a same-line value is not examined for further fields (continue).
    for i, line in enumerate(cleaned_lines):
        line_upper = line.upper()

        if not nbi_id:
            nbi_id = _find_nbi_id(cleaned_lines, i)
            if nbi_id:
                continue

        # Full name: after a "NAME" label that is not itself the ID label.
        if (not full_name and "NAME" in line_upper
                and ("NBI" not in line_upper or "ID" not in line_upper)):
            if ':' in line:
                name_part = line.split(':', 1)[1].strip()
                if len(name_part) > 2 and _NAME_LETTERS_RE.search(name_part):
                    full_name = name_part
                    print(f"DEBUG: Found full name (same line): {full_name}", file=sys.stderr)
                    continue
            for next_line in cleaned_lines[i + 1:i + 5]:
                if _is_label_line(next_line.upper(), _NAME_SKIP_LABELS):
                    continue
                if len(next_line) > 3 and _NAME_LETTERS_RE.search(next_line):
                    full_name = next_line
                    print(f"DEBUG: Found full name: {full_name}", file=sys.stderr)
                    break

        # Birth date: after a label containing both "DATE" and "BIRTH".
        if not birth_date and "DATE" in line_upper and "BIRTH" in line_upper:
            if ':' in line:
                date_part = line.split(':', 1)[1].strip()
                if _looks_like_date(date_part):
                    birth_date = date_part
                    print(f"DEBUG: Found birth date (same line): {birth_date}", file=sys.stderr)
                    continue
            for next_line in cleaned_lines[i + 1:i + 5]:
                if _is_label_line(next_line.upper(), _DATE_SKIP_LABELS):
                    continue
                if _looks_like_date(next_line):
                    birth_date = next_line
                    print(f"DEBUG: Found birth date: {birth_date}", file=sys.stderr)
                    break

        # LIT: after a line containing "LIT" that is not the ID label.
        if (not lit and "LIT" in line_upper
                and ("ID" not in line_upper or "NBI" not in line_upper)):
            if ':' in line:
                lit_part = line.split(':', 1)[1].strip()
                if lit_part:
                    lit = lit_part
                    print(f"DEBUG: Found LIT (same line): {lit}", file=sys.stderr)
                    continue
            for next_line in cleaned_lines[i + 1:i + 4]:
                if _is_label_line(next_line.upper(), _LIT_SKIP_LABELS):
                    continue
                if next_line:
                    lit = next_line
                    print(f"DEBUG: Found LIT: {lit}", file=sys.stderr)
                    break

    return {
        'clearance_type': 'nbi',
        'id_number': nbi_id,
        'full_name': full_name,
        'birth_date': birth_date,
        'lit': lit,
        'success': nbi_id is not None or full_name is not None,
    }
def _empty_nbi_result():
    """Return the failure result, with the same keys as a successful extraction."""
    return {
        'clearance_type': 'nbi',
        'id_number': None,
        'full_name': None,
        'birth_date': None,
        'lit': None,
        'success': False,
    }


def _run_paddle_ocr(image_path, **pipeline_flags):
    """Run PaddleOCR (English) on *image_path* and return its raw results.

    ``predict()`` is tried first; if it raises, ``ocr()`` is used when the
    object has it, otherwise ``None`` is returned. Anything the library
    prints is redirected to stderr.
    """
    with redirect_stdout(sys.stderr), redirect_stderr(sys.stderr):
        ocr = PaddleOCR(lang='en', **pipeline_flags)
        try:
            return ocr.predict(image_path)
        except Exception as e:
            print(f"DEBUG: predict() failed: {e}, trying ocr()", file=sys.stderr)
            if hasattr(ocr, 'ocr'):
                return ocr.ocr(image_path)
            return None


def _collect_text_lines(results):
    """Flatten raw OCR results into a list of recognised text strings.

    Two result shapes are understood: a list whose first item is an
    OCRResult-like mapping carrying ``rec_texts``, and the older nested-list
    shape where each entry is ``[box, (text, score)]``. Only the first item of
    *results* is read in the OCRResult case. Parsing problems are logged to
    stderr and produce an empty (or partial) list instead of raising.
    """
    all_text = []
    try:
        if not (results and isinstance(results, list)):
            return all_text

        first_item = results[0]
        item_type_name = type(first_item).__name__
        is_ocr_result = ('OCRResult' in item_type_name
                         or 'ocr_result' in str(type(first_item)).lower())

        if is_ocr_result:
            print(f"DEBUG: Detected OCRResult object format (type: {item_type_name})", file=sys.stderr)
            try:
                # OCRResult is read through its mapping interface.
                if hasattr(first_item, 'keys'):
                    rec_texts = dict(first_item).get('rec_texts')
                    if isinstance(rec_texts, list):
                        all_text = [str(t) for t in rec_texts if t]
                        print(f"DEBUG: Extracted {len(all_text)} text lines from rec_texts", file=sys.stderr)
            except Exception as e:
                print(f"DEBUG: Error accessing OCRResult: {e}", file=sys.stderr)
        else:
            # Old format - list of lists, one inner list per image.
            entries = first_item if isinstance(first_item, list) else results
            for item in entries:
                if isinstance(item, (list, tuple)) and len(item) >= 2:
                    meta = item[1]
                    if isinstance(meta, (list, tuple)) and len(meta) >= 1:
                        all_text.append(str(meta[0]))
    except Exception as e:
        print(f"DEBUG: Error processing OCR results: {str(e)}", file=sys.stderr)
        import traceback
        print(f"DEBUG: Traceback: {traceback.format_exc()}", file=sys.stderr)
    return all_text


def extract_ocr_lines_simple(image_path):
    """OCR *image_path* with orientation/unwarping enabled and extract NBI fields.

    Used as the fallback when ``extract_ocr_lines`` finds nothing.

    Returns:
        The dict produced by ``extract_nbi_id``, or the all-``None`` failure
        dict when OCR yields no text.
    """
    results = _run_paddle_ocr(
        image_path,
        use_doc_orientation_classify=True,  # Enable orientation detection
        use_doc_unwarping=True,             # Enable document unwarping
        use_textline_orientation=True,      # Enable text line orientation
    )
    all_text = _collect_text_lines(results)
    return extract_nbi_id(all_text) if all_text else _empty_nbi_result()


def extract_ocr_lines(image_path):
    """OCR *image_path* with document preprocessing disabled and extract NBI fields.

    Also makes sure the ``output`` directory exists and removes the regular
    files left in it.

    Returns:
        The dict produced by ``extract_nbi_id``; the all-``None`` failure dict
        when the image file does not exist or OCR yields no text.
    """
    if not os.path.exists(image_path):
        return _empty_nbi_result()

    # Ensure output directory exists and clear previous output files.
    os.makedirs("output", exist_ok=True)
    for old_file in glob.glob("output/*"):
        # os.remove() raises on directories, so only regular files are cleared.
        if os.path.isfile(old_file):
            os.remove(old_file)

    results = _run_paddle_ocr(
        image_path,
        use_doc_orientation_classify=False,
        use_doc_unwarping=False,
        use_textline_orientation=False,
    )
    all_text = _collect_text_lines(results)
    print(f"DEBUG: Extracted text lines: {all_text}", file=sys.stderr)
    return extract_nbi_id(all_text) if all_text else _empty_nbi_result()
# Main: python <script> <image_url>
# Writes exactly one JSON document to the real stdout. The exit status is 1
# when no URL is given or when any step raises.
if len(sys.argv) < 2:
    sys.stdout = original_stdout
    print(json.dumps({"success": False, "error": "No image URL provided"}))
    sys.exit(1)

image_url = sys.argv[1]
print(f"DEBUG: Processing NBI image URL: {image_url}", file=sys.stderr)
try:
    image_path = download_image(image_url, 'temp_image.jpg')
    print(f"DEBUG: Image downloaded to: {image_path}", file=sys.stderr)

    # Try the original OCR method first
    ocr_results = extract_ocr_lines(image_path)
    print(f"DEBUG: OCR results from extract_ocr_lines: {ocr_results}", file=sys.stderr)

    # If original method fails, try simple method
    if not ocr_results['success']:
        print("DEBUG: Original method failed, trying simple method", file=sys.stderr)
        ocr_results = extract_ocr_lines_simple(image_path)
        print(f"DEBUG: OCR results from extract_ocr_lines_simple: {ocr_results}", file=sys.stderr)

    # Create the response object
    response = {
        "success": ocr_results['success'],
        "ocr_results": ocr_results
    }

    # Restore stdout and print only the JSON response
    sys.stdout = original_stdout
    sys.stdout.write(json.dumps(response))
    sys.stdout.flush()
except Exception as e:
    # Restore stdout for error JSON
    sys.stdout = original_stdout
    sys.stdout.write(json.dumps({"success": False, "error": str(e)}))
    sys.stdout.flush()
    sys.exit(1)
finally:
    # Remove the temporary image on every path (success, error, sys.exit).
    # Cleanup is best-effort: a file that cannot be deleted must not replace
    # the JSON response that was already produced, so only OSError is ignored.
    try:
        if os.path.exists('temp_image.jpg'):
            os.remove('temp_image.jpg')
    except OSError:
        pass