Spaces:
Sleeping
Sleeping
File size: 23,558 Bytes
db10255 7908d00 db10255 7908d00 db10255 2b089f9 db10255 2b089f9 db10255 2b089f9 db10255 2b089f9 db10255 6916300 db10255 2b089f9 db10255 7908d00 db10255 2b089f9 db10255 2b089f9 db10255 2b089f9 db10255 7908d00 db10255 2b089f9 db10255 2b089f9 db10255 2b089f9 db10255 2b089f9 db10255 2b089f9 db10255 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 |
import sys, json, os, glob, requests
import re
import time
import shutil
from contextlib import redirect_stdout, redirect_stderr
# Immediately redirect all output to stderr except for our final JSON.
# The real stdout handle is kept in `original_stdout` so the main section can
# restore it right before writing the JSON response.
original_stdout = sys.stdout
sys.stdout = sys.stderr
# Suppress all PaddleOCR output
# NOTE(review): presumably PADDLEOCR_LOG_LEVEL is honoured by the installed
# paddleocr version -- confirm.
os.environ['PADDLEOCR_LOG_LEVEL'] = 'ERROR'
# NOTE(review): presumably these keep Qt/X11-backed dependencies from needing
# a real display in a headless environment -- confirm which import needs them.
os.environ['QT_QPA_PLATFORM'] = 'offscreen'
os.environ['DISPLAY'] = ':99'
# Import PaddleOCR after setting environment variables (the import must stay
# below the os.environ assignments above).
from paddleocr import PaddleOCR
def download_image(url, output_path='temp_image.jpg'):
    """Download the image at *url* to *output_path* and return that path.

    A ``t=<unix timestamp>`` query parameter and no-cache request headers are
    added so that intermediaries are less likely to serve a stale copy.

    Args:
        url: HTTP(S) address of the image.
        output_path: File the response body is written to. Any existing file
            at this path is removed first.

    Returns:
        ``output_path``.

    Raises:
        requests.RequestException: on connection errors, on timeout (30 s),
            or when the server answers with an error status
            (``raise_for_status``).
        OSError: when the output file cannot be removed or written.
    """
    # Remove any existing temp file so a failed download can never leave a
    # previous image behind under the same name.
    try:
        os.remove(output_path)
    except FileNotFoundError:
        pass

    # Add the cache-busting parameter to the query string. The fragment (if
    # any) is split off first: text after '#' is not part of the query, so
    # appending there would have no effect on what is requested.
    timestamp = int(time.time())
    base, hash_sep, fragment = url.partition('#')
    joiner = '&' if '?' in base else '?'
    url = f'{base}{joiner}t={timestamp}{hash_sep}{fragment}'

    # Add headers to prevent caching
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Cache-Control': 'no-cache, no-store, must-revalidate',
        'Pragma': 'no-cache',
        'Expires': '0',
    }
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()

    # Save the image bytes exactly as received.
    with open(output_path, 'wb') as f:
        f.write(response.content)
    return output_path
# OCR function to extract NBI ID NO, Name, Birth Date, and LIT

# ID shapes: two upper-case alphanumeric groups of 8-12 characters joined by a
# hyphen (e.g. B450JRLR0B-RC248667), by whitespace, or run together.
_HYPHEN_ID_RE = re.compile(r'\b([A-Z0-9]{8,12}-[A-Z0-9]{8,12})\b')
_SPACED_ID_RE = re.compile(r'\b([A-Z0-9]{8,12})\s+([A-Z0-9]{8,12})\b')
_SOLID_ID_RE = re.compile(r'\b([A-Z0-9]{17,25})\b')
_LABELLED_ID_VALUE_RE = re.compile(r'^[A-Z0-9-]{15,25}$')

_ID_LABELS = ("NBI ID NO:", "NBIIDNO", "NBI ID NO", "NBI ID NUMBER", "NBIID NUMBER")

# Substrings that mark an ID candidate as address text.
# NOTE(review): these are substring checks, so a genuine ID that happens to
# contain e.g. "ST" is rejected by the space / no-hyphen fallbacks.
_ADDRESS_WORDS = ('STREET', 'ST', 'AVENUE', 'AVE', 'BRGY', 'BARANGAY', 'CITY',
                  'PHASE', 'DOMINGO', 'CAINTA', 'RIZAL')
_ADDRESS_WORDS_SOLID = _ADDRESS_WORDS + ('ADDRESS', 'ATRSTORUARPHASEABRGY')

# The short labels "ID" / "NO" only count when they are not embedded in a
# longer word; a plain substring test also hits values such as "DAVID",
# "MARIANO" or "NOVEMBER". "IDNO" covers the two labels fused by OCR.
_SHORT_LABEL_RE = re.compile(r'(?<![A-Z])(?:ID|NO|IDNO)(?![A-Z])')
_NAME_SKIP_LABELS = ('NBI', 'DATE', 'BIRTH', 'CLEARANCE', 'REPUBLIC',
                     'PHILIPPINES', 'NATIONAL')
_DATE_SKIP_LABELS = ('NBI', 'NAME', 'CLEARANCE', 'REPUBLIC', 'PHILIPPINES',
                     'NATIONAL')
_LIT_SKIP_LABELS = ('NBI', 'NAME', 'DATE', 'BIRTH', 'CLEARANCE', 'REPUBLIC',
                    'PHILIPPINES', 'NATIONAL', 'VALID', 'UNTIL')

_MONTH_RE = re.compile(
    r'(JANUARY|FEBRUARY|MARCH|APRIL|MAY|JUNE|JULY|AUGUST|SEPTEMBER|OCTOBER'
    r'|NOVEMBER|DECEMBER|JAN|FEB|MAR|APR|JUN|JUL|AUG|SEP|OCT|NOV|DEC)')
_NUMERIC_DATE_RE = re.compile(r'\d{1,2}[/-]\d{1,2}[/-]\d{4}')
_DAY_MON_YEAR_RE = re.compile(r'\d{1,2}\s+[A-Z]{3}\s+\d{4}')


def _has_letters_and_digits(candidate):
    """Return True when *candidate* mixes upper-case letters and digits."""
    return bool(re.search(r'[A-Z]', candidate) and re.search(r'[0-9]', candidate))


def _after_colon(line):
    """Return the stripped text after the first ':' in *line*, or None."""
    _, colon, value = line.partition(':')
    return value.strip() if colon else None


def _is_label_line(text_upper, long_labels):
    """Return True when *text_upper* looks like a field label, not a value."""
    if _SHORT_LABEL_RE.search(text_upper):
        return True
    return any(label in text_upper for label in long_labels)


def _looks_like_date(text):
    """Return True when *text* contains something shaped like a date."""
    # A month name alone is not enough: "MARIA" contains "MAR". Requiring a
    # digit keeps names from being mistaken for dates.
    if _MONTH_RE.search(text.upper()) and re.search(r'\d', text):
        return True
    return bool(_NUMERIC_DATE_RE.search(text) or _DAY_MON_YEAR_RE.search(text))


def _standalone_hyphenated_id(line):
    """Return a hyphenated ID found on a line of at most three words, or None."""
    match = _HYPHEN_ID_RE.search(line)
    if not match:
        return None
    candidate = match.group(1)
    if not 17 <= len(candidate) <= 25:
        return None
    # NBI IDs are usually standalone; longer lines are more likely addresses.
    if len(line.split()) > 3:
        return None
    return candidate if _has_letters_and_digits(candidate) else None


def _id_near_label(lines, index):
    """Return the ID attached to an "NBI ID NO" label at *index*, or None."""
    line = lines[index]
    line_upper = line.upper()
    if not any(label in line_upper for label in _ID_LABELS):
        return None

    # Value on the same line, after the colon (whitespace removed).
    value = _after_colon(line)
    if value is not None:
        candidate = re.sub(r'\s+', '', value)
        if len(candidate) > 5:  # Valid ID should be longer
            print(f"DEBUG: Found NBI ID (same line): {candidate}", file=sys.stderr)
            return candidate

    # Otherwise the value may sit on one of the next two lines.
    for next_line in lines[index + 1:index + 3]:
        if len(next_line) < 5 or any(
                label in next_line.upper()
                for label in ('NAME', 'DATE', 'BIRTH', 'CLEARANCE')):
            continue
        compact = next_line.replace(' ', '')
        if _LABELLED_ID_VALUE_RE.match(compact):
            print(f"DEBUG: Found NBI ID (next line): {compact}", file=sys.stderr)
            return compact
    return None


def _id_from_loose_patterns(line):
    """Return an ID written with a space or no separator in *line*, or None."""
    # Pattern with a space instead of the hyphen.
    match = _SPACED_ID_RE.search(line)
    if match:
        candidate = "-".join(match.groups())
        if (_has_letters_and_digits(candidate)
                and not any(word in candidate for word in _ADDRESS_WORDS)):
            print(f"DEBUG: Found NBI ID (space pattern): {candidate}", file=sys.stderr)
            return candidate

    # Pattern with no separator at all: guess the hyphen position near the
    # middle, taking the first split that leaves 8-12 characters per side.
    match = _SOLID_ID_RE.search(line)
    if match:
        candidate = match.group(1)
        if (_has_letters_and_digits(candidate)
                and not any(word in candidate for word in _ADDRESS_WORDS_SOLID)):
            mid = len(candidate) // 2
            for split_point in range(mid - 2, mid + 3):
                if not 8 <= split_point <= len(candidate) - 8:
                    continue
                part1, part2 = candidate[:split_point], candidate[split_point:]
                if 8 <= len(part1) <= 12 and 8 <= len(part2) <= 12:
                    found = f"{part1}-{part2}"
                    print(f"DEBUG: Found NBI ID (no hyphen, split): {found}", file=sys.stderr)
                    return found
    return None


def extract_nbi_id(lines):
    """Extract NBI clearance fields from OCR text lines.

    Args:
        lines: Iterable of recognised text lines in reading order. Entries
            are converted with ``str`` and stripped; ``None`` entries are
            treated as empty lines.

    Returns:
        dict with the keys ``clearance_type`` (always ``'nbi'``),
        ``id_number``, ``full_name``, ``birth_date``, ``lit`` (each a string
        or ``None``) and ``success`` (True when an ID or a name was found).
        NOTE(review): the meaning of the "LIT" field is not established by
        this file -- confirm with the document owner.
    """
    nbi_id = None
    full_name = None
    birth_date = None
    lit = None

    cleaned_lines = ['' if line is None else str(line).strip() for line in lines]

    # First pass: a standalone hyphenated ID anywhere in the text is the most
    # reliable signal, so it wins over label-based extraction. (Because this
    # pass already accepts every hyphenated candidate, the second pass does
    # not need to repeat the hyphen check.)
    for line in cleaned_lines:
        nbi_id = _standalone_hyphenated_id(line)
        if nbi_id:
            print(f"DEBUG: Found NBI ID (first pass, hyphen): {nbi_id}", file=sys.stderr)
            break

    # Second pass: label-driven extraction of every field, plus looser ID
    # patterns when the first pass found nothing.
    for i, line in enumerate(cleaned_lines):
        line_upper = line.upper()

        if not nbi_id:
            nbi_id = _id_near_label(cleaned_lines, i) or _id_from_loose_patterns(line)
            if nbi_id:
                continue

        # Full name: value after a "NAME" label (but not the "NBI ID" label).
        if (not full_name and "NAME" in line_upper
                and ("NBI" not in line_upper or "ID" not in line_upper)):
            value = _after_colon(line)
            if value is not None and re.search(r'[A-Za-z]{2,}', value) and len(value) > 2:
                full_name = value
                print(f"DEBUG: Found full name (same line): {full_name}", file=sys.stderr)
                continue
            for next_line in cleaned_lines[i + 1:i + 5]:
                if _is_label_line(next_line.upper(), _NAME_SKIP_LABELS):
                    continue
                if re.search(r'[A-Za-z]{2,}', next_line) and len(next_line) > 3:
                    full_name = next_line
                    print(f"DEBUG: Found full name: {full_name}", file=sys.stderr)
                    break

        # Birth date: value after a label containing both DATE and BIRTH
        # ("DATE OF BIRTH", "BIRTH DATE", "BIRTHDATE", ...).
        if not birth_date and "DATE" in line_upper and "BIRTH" in line_upper:
            value = _after_colon(line)
            if value is not None and _looks_like_date(value):
                birth_date = value
                print(f"DEBUG: Found birth date (same line): {birth_date}", file=sys.stderr)
                continue
            for next_line in cleaned_lines[i + 1:i + 5]:
                if _is_label_line(next_line.upper(), _DATE_SKIP_LABELS):
                    continue
                if _looks_like_date(next_line):
                    birth_date = next_line
                    print(f"DEBUG: Found birth date: {birth_date}", file=sys.stderr)
                    break

        # LIT: "LIT" may be a label of its own or part of another label, so a
        # substring match is used on purpose.
        if (not lit and "LIT" in line_upper
                and ("ID" not in line_upper or "NBI" not in line_upper)):
            value = _after_colon(line)
            if value:
                lit = value
                print(f"DEBUG: Found LIT (same line): {lit}", file=sys.stderr)
                continue
            for next_line in cleaned_lines[i + 1:i + 4]:
                if _is_label_line(next_line.upper(), _LIT_SKIP_LABELS):
                    continue
                if next_line:
                    lit = next_line
                    print(f"DEBUG: Found LIT: {lit}", file=sys.stderr)
                    break

    return {
        'clearance_type': 'nbi',
        'id_number': nbi_id,
        'full_name': full_name,
        'birth_date': birth_date,
        'lit': lit,
        'success': nbi_id is not None or full_name is not None
    }
def _empty_nbi_result():
    """Return the result dict used when no text could be extracted."""
    return {'clearance_type': 'nbi', 'id_number': None, 'full_name': None,
            'birth_date': None, 'lit': None, 'success': False}


def _run_paddle_ocr(image_path, *, preprocess):
    """Run PaddleOCR on *image_path* and return its raw result (or None).

    *preprocess* switches document-orientation classification, unwarping and
    text-line orientation on or off together. ``predict()`` is tried first;
    if it raises, ``ocr()`` is used when the object provides it.
    """
    with redirect_stdout(sys.stderr), redirect_stderr(sys.stderr):
        ocr = PaddleOCR(
            use_doc_orientation_classify=preprocess,
            use_doc_unwarping=preprocess,
            use_textline_orientation=preprocess,
            lang='en'
        )
        try:
            return ocr.predict(image_path)
        except Exception as e:
            print(f"DEBUG: predict() failed: {e}, trying ocr()", file=sys.stderr)
            if hasattr(ocr, 'ocr'):
                return ocr.ocr(image_path)
            return None


def _collect_text_lines(results):
    """Return the recognised text strings contained in a raw OCR result.

    Two shapes are handled: a list whose first item is an OCRResult-like
    mapping with a ``rec_texts`` list, and the older nested-list shape in
    which each entry is ``[box, (text, score)]``.
    """
    if not (results and isinstance(results, list)):
        return []

    first_item = results[0]
    item_type_name = type(first_item).__name__
    is_ocr_result = ('OCRResult' in item_type_name
                     or 'ocr_result' in str(type(first_item)).lower())

    if is_ocr_result:
        print(f"DEBUG: Detected OCRResult object format (type: {item_type_name})", file=sys.stderr)
        try:
            if hasattr(first_item, 'keys'):
                rec_texts = dict(first_item).get('rec_texts')
                if isinstance(rec_texts, list):
                    texts = [str(t) for t in rec_texts if t]
                    print(f"DEBUG: Extracted {len(texts)} text lines from rec_texts", file=sys.stderr)
                    return texts
        except Exception as e:
            print(f"DEBUG: Error accessing OCRResult: {e}", file=sys.stderr)
        return []

    # Old format - list of lists
    entries = first_item if isinstance(first_item, list) else results
    texts = []
    for item in entries:
        if isinstance(item, (list, tuple)) and len(item) >= 2:
            meta = item[1]
            if isinstance(meta, (list, tuple)) and len(meta) >= 1:
                texts.append(str(meta[0]))
    return texts


def extract_ocr_lines_simple(image_path):
    """OCR *image_path* with document preprocessing enabled and parse it.

    Returns the dict produced by ``extract_nbi_id`` or, when no text was
    recognised, an all-``None`` result with ``success`` set to False.
    """
    results = _run_paddle_ocr(image_path, preprocess=True)
    try:
        all_text = _collect_text_lines(results)
    except Exception as e:
        print(f"DEBUG: Error processing OCR results: {str(e)}", file=sys.stderr)
        all_text = []
    return extract_nbi_id(all_text) if all_text else _empty_nbi_result()


def extract_ocr_lines(image_path):
    """OCR *image_path* with document preprocessing disabled and parse it.

    Returns the dict produced by ``extract_nbi_id`` or, when the file is
    missing or empty or no text was recognised, an all-``None`` result with
    ``success`` set to False.
    """
    # Check if file exists and has content
    try:
        has_content = os.path.getsize(image_path) > 0
    except OSError:
        has_content = False
    if not has_content:
        return _empty_nbi_result()

    # Ensure output directory exists
    os.makedirs("output", exist_ok=True)
    # Clear previous output files (regular files only: os.remove raises on a
    # directory and would abort the whole run).
    for old_file in glob.glob("output/*"):
        if os.path.isfile(old_file):
            os.remove(old_file)

    results = _run_paddle_ocr(image_path, preprocess=False)
    try:
        all_text = _collect_text_lines(results)
    except Exception as e:
        print(f"DEBUG: Error processing OCR results: {str(e)}", file=sys.stderr)
        import traceback
        print(f"DEBUG: Traceback: {traceback.format_exc()}", file=sys.stderr)
        all_text = []
    print(f"DEBUG: Extracted text lines: {all_text}", file=sys.stderr)
    return extract_nbi_id(all_text) if all_text else _empty_nbi_result()
# Main
def main(argv=None):
    """Download the image named on the command line, OCR it, print JSON.

    Args:
        argv: Argument vector; defaults to ``sys.argv``. ``argv[1]`` is the
            image URL.

    Returns:
        Process exit status: 0 when a JSON response was written, 1 when no
        URL was given or an exception occurred (an error JSON is written in
        both of those cases).

    Exactly one JSON document is written to the real stdout; all debug output
    goes to stderr.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        sys.stdout = original_stdout
        print(json.dumps({"success": False, "error": "No image URL provided"}))
        return 1

    image_url = argv[1]
    temp_path = 'temp_image.jpg'
    print(f"DEBUG: Processing NBI image URL: {image_url}", file=sys.stderr)
    try:
        image_path = download_image(image_url, temp_path)
        print(f"DEBUG: Image downloaded to: {image_path}", file=sys.stderr)

        # Try the original OCR method first
        ocr_results = extract_ocr_lines(image_path)
        print(f"DEBUG: OCR results from extract_ocr_lines: {ocr_results}", file=sys.stderr)

        # If original method fails, try simple method
        if not ocr_results['success']:
            print("DEBUG: Original method failed, trying simple method", file=sys.stderr)
            ocr_results = extract_ocr_lines_simple(image_path)
            print(f"DEBUG: OCR results from extract_ocr_lines_simple: {ocr_results}", file=sys.stderr)

        # Create the response object
        response = {
            "success": ocr_results['success'],
            "ocr_results": ocr_results
        }

        # Restore stdout and print only the JSON response
        sys.stdout = original_stdout
        sys.stdout.write(json.dumps(response))
        sys.stdout.flush()
        return 0
    except Exception as e:
        # Top-level boundary: report any failure as JSON on the real stdout.
        sys.stdout = original_stdout
        sys.stdout.write(json.dumps({"success": False, "error": str(e)}))
        sys.stdout.flush()
        return 1
    finally:
        # Clean up the temporary file on every path; failing to delete it
        # must not change the reported result.
        try:
            os.remove(temp_path)
        except OSError:
            pass


if __name__ == "__main__":
    sys.exit(main())