Spaces:

takomattyy
/

handyhome-ocr-api

Sleeping

App Files Files Community

handyhome-ocr-api / extract_passport.py

takomattyy

Upload 20 files

db10255 verified 25 days ago

raw

history blame

16 kB

	#!/usr/bin/env python3
	"""
	Philippine Passport Information Extraction Script

	Purpose:
	Extracts structured information from Philippine passport images using OCR.
	Handles complex passport layouts with multiple information fields.

	Why this script exists:
	- Passports have complex layouts with multiple information fields
	- Need to extract international-standard passport information
	- Handles bilingual labels (English/Filipino)
	- Required for passport verification workflows

	Key Features:
	- Extracts passport number (format: X0000000A)
	- Handles complex name structures (surname, given names, middle name)
	- Processes multiple date fields (birth, issue, expiration)
	- Extracts nationality and place of birth
	- Handles OCR digit correction

	Dependencies:
	- PaddleOCR: High-accuracy OCR engine (https://github.com/PaddlePaddle/PaddleOCR)
	- Pillow (PIL): Image processing (https://pillow.readthedocs.io/)
	- requests: HTTP library (https://docs.python-requests.org/)

	Usage:
	python extract_passport.py "https://example.com/passport.jpg"

	Output:
	JSON with extracted information: passport_number, full_name, birth_date, valid_until, etc.
	"""

	import sys, json, os, glob, re, requests
	from PIL import Image
	from io import BytesIO
	from datetime import datetime
	from contextlib import redirect_stdout, redirect_stderr

	# Keep a handle on the real stdout, then point sys.stdout at stderr so that
	# stray prints (ours or a library's) cannot pollute the JSON the script emits.
	_ORIG_STDOUT = sys.stdout
	sys.stdout = sys.stderr

	# Environment tweaks applied before PaddleOCR is imported: lower its log
	# level and select an offscreen Qt platform / fixed X display.
	os.environ.update({
	    'PADDLEOCR_LOG_LEVEL': 'ERROR',
	    'QT_QPA_PLATFORM': 'offscreen',
	    'DISPLAY': ':99',
	})

	# Import PaddleOCR after setting environment variables
	from paddleocr import PaddleOCR

	def dprint(msg, obj=None):
	    """
	    Write a best-effort debug line to stderr.

	    Args:
	    msg: Message text, emitted after a "DEBUG: " prefix
	    obj: Optional value appended as ": <obj>" when it is not None

	    Any exception raised while formatting or printing is deliberately
	    ignored so that debug output can never break the extraction itself.
	    """
	    try:
	        text = f"DEBUG: {msg}"
	        if obj is not None:
	            text += f": {obj}"
	        print(text, file=sys.stderr)
	    except Exception:
	        pass

	def clean_cache():
	    """
	    Delete leftovers from a previous run in the current working directory.

	    Removes the fixed set of temp_image* files if they exist, and the
	    "output" directory (recursively) if it exists. Each removal is logged
	    through dprint.
	    """
	    import shutil

	    stale_files = (
	        'temp_image.jpg',
	        'temp_image_ocr_res_img.jpg',
	        'temp_image_preprocessed_img.jpg',
	        'temp_image_res.json',
	    )
	    for name in stale_files:
	        if not os.path.exists(name):
	            continue
	        os.remove(name)
	        dprint("Removed cache file", name)

	    if os.path.exists("output"):
	        shutil.rmtree("output")
	        dprint("Removed output directory")

	def download_image(url, output_path='temp_image.jpg', timeout=30):
	    """
	    Download an image and store it locally as an RGB JPEG.

	    Args:
	    url (str): Address of the image to fetch
	    output_path (str): Where the converted JPEG is written
	    timeout (float | tuple): Seconds passed to requests.get as its timeout,
	    so an unresponsive server cannot hang the script indefinitely

	    Returns:
	    str: output_path

	    Raises:
	    requests.exceptions.RequestException: on connection errors, timeouts,
	    or a non-2xx status (via raise_for_status)
	    OSError: if Pillow cannot decode or save the image

	    Notes:
	    - clean_cache() is called first, so files from a previous run are removed
	    - JPEG has no alpha channel; any image carrying transparency (RGBA, LA,
	      or a palette image with a transparency entry) is composited onto a
	      white background instead of being flattened directly
	    """
	    dprint("Starting download", url)
	    clean_cache()

	    r = requests.get(url, timeout=timeout)
	    dprint("HTTP status", r.status_code)
	    r.raise_for_status()

	    img = Image.open(BytesIO(r.content))
	    has_alpha = img.mode in ('RGBA', 'LA') or (
	        img.mode == 'P' and 'transparency' in img.info
	    )
	    if has_alpha:
	        rgba = img.convert('RGBA')
	        bg = Image.new('RGB', rgba.size, (255, 255, 255))
	        # The last band of an RGBA image is its alpha channel; use it as mask.
	        bg.paste(rgba, mask=rgba.split()[-1])
	        img = bg
	    elif img.mode != 'RGB':
	        img = img.convert('RGB')

	    img.save(output_path, 'JPEG', quality=95)
	    dprint("Saved image", output_path)
	    return output_path

	def cap_words(s):
	    """
	    Capitalize every whitespace-separated word of s.

	    Returns None when s is empty or None; otherwise the words are
	    capitalized individually and re-joined with single spaces.
	    """
	    if not s:
	        return None
	    words = s.split()
	    return ' '.join(word.capitalize() for word in words)

	# Letters OCR commonly produces in place of digits, and the digit each stands for.
	_OCR_DIGIT_FIXES = str.maketrans('OoIlSB', '001158')

	# A day / month-abbreviation / year date such as "16MAR1980" or "16 MAR 1980".
	# The day and year tokens also accept the digit look-alikes so that they can be
	# corrected WITHOUT touching the month letters in between.
	_MONTH_NAME_DATE_RE = re.compile(
	    r'(?<![A-Za-z0-9])([0-9OoIlSB]{1,2})\s*([A-Za-z]{3})\s*([0-9OoIlSB]{4})(?![A-Za-z0-9])'
	)

	# One letter, seven digits (possibly misread as look-alike letters), then a
	# trailing letter or digit, e.g. "P1234567A".
	_PASSPORT_NUMBER_RE = re.compile(r'\b([A-Z])([0-9OoIlSB]{7})([A-Z0-9])\b')


	def normalize_digits(s):
	    """
	    Fix common OCR digit confusions.

	    Args:
	    s (str): Text string that may contain OCR errors

	    Returns:
	    str: Text with corrected digits

	    Why this is needed:
	    - OCR often misreads similar-looking characters
	    - Common errors: O→0, o→0, I/l→1, S→5, B→8

	    Caution:
	    - The substitution is unconditional, so it must only be applied to text
	      that is expected to be numeric; applied to words it corrupts them
	      (e.g. "OCT" becomes "0CT").
	    """
	    return str(s).translate(_OCR_DIGIT_FIXES)


	def normalize_full_name(surname, given_names, middle_name=None):
	    """
	    Compose a display name as "<first given> [<second given>] <surname>".

	    Args:
	    surname (str | None): Family name
	    given_names (str | None): Whitespace-separated given names
	    middle_name (str | None): Accepted for interface compatibility; it is
	    not included in the composed name

	    Returns:
	    str | None: Word-capitalized full name, or None when both surname and
	    given_names are empty. At most the first two given names are kept.
	    """
	    if not surname and not given_names:
	        return None

	    surname = surname.strip() if surname else ""
	    given_parts = given_names.split() if given_names else []

	    # Keep at most the first two given names, followed by the surname.
	    name_parts = given_parts[:2] + [surname]
	    return cap_words(' '.join(name_parts))


	def format_date(s):
	    """
	    Convert an OCR'd date into ISO "YYYY-MM-DD" form where possible.

	    Args:
	    s: Date text such as "16MAR1980", "16 MAR 1980", "1980-03-16",
	    "1980/03/16" or "03/16/1980" (month first)

	    Returns:
	    str | None: ISO date when the text could be interpreted; None for
	    empty input; otherwise the stripped input text unchanged

	    Why the month is handled separately:
	    - OCR digit correction maps O, I, l, S and B to digits, which would turn
	      month abbreviations such as OCT, SEP and FEB into unparseable text.
	      Only the day and year tokens are therefore corrected.
	    """
	    if not s:
	        return None
	    raw = str(s).strip()

	    # Month-name dates, with or without spaces, possibly embedded in other text.
	    for match in _MONTH_NAME_DATE_RE.finditer(raw):
	        day, month, year = match.groups()
	        candidate = f"{normalize_digits(day)} {month} {normalize_digits(year)}"
	        try:
	            return datetime.strptime(candidate, "%d %b %Y").strftime("%Y-%m-%d")
	        except ValueError:
	            # Not a real date (unknown month, day out of range); keep looking.
	            continue

	    # Purely numeric formats: digit correction is safe on the whole string.
	    t = normalize_digits(raw).replace(' ', '').replace('\\', '/').replace('.', '/')
	    if re.match(r'^\d{4}[-/]\d{2}[-/]\d{2}$', t):
	        return t.replace('/', '-')
	    if re.match(r'^\d{2}/\d{2}/\d{4}$', t):
	        m, d, y = t.split('/')
	        return f"{y}-{int(m):02d}-{int(d):02d}"

	    # Unrecognized: hand back the text as read rather than a digit-mangled copy.
	    return raw


	def extract_passport_number(text):
	    """
	    Find a passport number of the form letter + 7 digits + letter/digit.

	    Args:
	    text: Text to search (converted with str())

	    Returns:
	    str | None: The first match, with OCR look-alike letters inside the
	    seven-digit run corrected to digits; None when nothing matches

	    Only the digit run is corrected. The leading letter and the trailing
	    character are returned as read, so a number starting with S, O, I or B
	    still matches and a trailing letter such as B is not rewritten to 8.
	    """
	    match = _PASSPORT_NUMBER_RE.search(str(text))
	    if not match:
	        return None
	    prefix, digits, suffix = match.groups()
	    return prefix + normalize_digits(digits) + suffix

	def take_within(lines, i, k=5):
	    """
	    Collect the non-empty lines that follow position i.

	    Args:
	    lines (list): Sequence of OCR lines
	    i (int): Index of the reference line
	    k (int): How many following positions to inspect

	    Returns:
	    list: Stripped string forms of lines[i+1] .. lines[i+k] that exist and
	    are not blank, in order
	    """
	    following = (
	        str(lines[i + offset]).strip()
	        for offset in range(1, k + 1)
	        if i + offset < len(lines)
	    )
	    return [text for text in following if text]

	# Two or more consecutive capitals: how printed field values are told apart
	# from mixed-case field labels.
	_PI_VALUE_RE = re.compile(r'[A-Z]{2,}')
	# Day / month-abbreviation / year token, with or without spaces. The
	# look-arounds stop it from matching inside longer alphanumeric runs such as
	# the machine-readable zone.
	_PI_DATE_TOKEN_RE = re.compile(r'(?<![A-Z0-9])\d{1,2}\s*[A-Z]{3}\s*\d{4}(?![A-Z0-9])')
	_PI_ISO_DATE_RE = re.compile(r'^\d{4}-\d{2}-\d{2}$')


	def _pi_value_ahead(lines, index, span=3):
	    """Return the first following line that looks like an uppercase, digit-free value, else None."""
	    for candidate in take_within(lines, index, span):
	        if _PI_VALUE_RE.search(candidate) and not re.search(r'[0-9]', candidate):
	            return candidate
	    return None


	def _pi_date_ahead(lines, index, span=3):
	    """Return format_date() of the first date token found in the following lines, else None."""
	    for candidate in take_within(lines, index, span):
	        match = _PI_DATE_TOKEN_RE.search(candidate)
	        if match:
	            # Pass only the date token so surrounding text cannot break parsing.
	            return format_date(match.group(0))
	    return None


	def extract_passport_info(lines):
	    """
	    Extract passport information from OCR text lines.

	    Args:
	    lines (list): List of text lines from OCR processing

	    Returns:
	    dict: Extracted passport information; every key is always present and
	    is None when the field was not found

	    How it works:
	    - Each line is checked for a field label (English or Filipino); when a
	      label is found, the next few lines are searched for a matching value
	    - The passport number is taken from the first line matching its pattern
	    - Dates that no label claimed are assigned afterwards, in chronological
	      order, to whichever of birth date, issue date and valid-until are
	      still empty

	    Label disambiguation:
	    - "Panggitnang apelyido / Middle name" contains "apelyido", so middle-name
	      label lines are excluded from surname detection
	    - The place-of-birth label contains "birth"/"kapanganakan", so it is
	      excluded from date-of-birth detection
	    """
	    dprint("Lines to extract", lines)

	    # Initialize variables for extracted information
	    surname = None
	    given_names = None
	    middle_name = None
	    passport_number = None
	    birth_date = None
	    sex = None
	    nationality = None
	    place_of_birth = None
	    date_of_issue = None
	    valid_until = None
	    issuing_authority = None

	    L = [str(x or '').strip() for x in lines]
	    seen_dates = []  # every parseable date in the document, ISO form, first-seen order

	    for i, line in enumerate(L):
	        low = line.lower()
	        dprint("Line", {"i": i, "text": line})

	        is_middle_label = 'middle' in low or 'panggitnang' in low
	        is_place_label = ('place' in low and 'birth' in low) or 'lugar' in low

	        # Extract passport number using pattern matching (first match wins)
	        if not passport_number:
	            passport_num = extract_passport_number(line)
	            if passport_num:
	                passport_number = passport_num
	                dprint("Found passport number", passport_number)

	        # Remember every date on this line for the unlabeled-date fallback
	        for match in _PI_DATE_TOKEN_RE.finditer(line):
	            iso = format_date(match.group(0))
	            if iso and _PI_ISO_DATE_RE.match(iso) and iso not in seen_dates:
	                seen_dates.append(iso)

	        # Extract Surname (not on the middle-name label, which also says "apelyido")
	        if ('surname' in low or 'apelyido' in low) and not is_middle_label:
	            found = _pi_value_ahead(L, i)
	            if found is not None:
	                surname = found
	                dprint("Found surname", surname)
	        # NOTE(review): specimen-specific fallback kept from the original
	        if not surname and 'dela' in low and 'cruz' in low:
	            surname = line
	            dprint("Found surname (direct)", surname)

	        # Extract Given Names
	        if ('given' in low and 'name' in low) or 'pangalan' in low:
	            found = _pi_value_ahead(L, i)
	            if found is not None:
	                given_names = found
	                dprint("Found given names", given_names)
	        # NOTE(review): specimen-specific fallback kept from the original
	        if not given_names and line == 'MARIA':
	            given_names = line
	            dprint("Found given names (direct)", given_names)

	        # Extract Middle Name
	        if is_middle_label:
	            found = _pi_value_ahead(L, i)
	            if found is not None:
	                middle_name = found
	                dprint("Found middle name", middle_name)
	        # NOTE(review): specimen-specific fallback kept from the original
	        if not middle_name and line == 'SANTOS':
	            middle_name = line
	            dprint("Found middle name (direct)", middle_name)

	        # Extract Date of Birth (not on the place-of-birth label)
	        if ('birth' in low or 'kapanganakan' in low) and not is_place_label:
	            found = _pi_date_ahead(L, i)
	            if found is not None:
	                birth_date = found
	                dprint("Found birth date", birth_date)

	        # Extract Sex
	        if 'sex' in low or 'kasarian' in low:
	            for t in take_within(L, i, 2):
	                if t.upper() in ('M', 'F', 'MALE', 'FEMALE'):
	                    sex = 'M' if t.upper().startswith('M') else 'F'
	                    dprint("Found sex", sex)
	                    break
	        # A bare "M" or "F" line is taken as the sex when no label matched
	        if not sex and line in ('M', 'F'):
	            sex = line
	            dprint("Found sex (direct)", sex)

	        # Extract Nationality
	        if 'nationality' in low or 'nasyonalidad' in low:
	            found = _pi_value_ahead(L, i)
	            if found is not None:
	                nationality = found
	                dprint("Found nationality", nationality)
	        if not nationality and line == 'FILIPINO':
	            nationality = line
	            dprint("Found nationality (direct)", nationality)

	        # Extract Place of Birth
	        if is_place_label:
	            found = _pi_value_ahead(L, i)
	            if found is not None:
	                place_of_birth = found
	                dprint("Found place of birth", place_of_birth)
	        # NOTE(review): specimen-specific fallback kept from the original
	        if not place_of_birth and line == 'MANILA':
	            place_of_birth = line
	            dprint("Found place of birth (direct)", place_of_birth)

	        # Extract Date of Issue
	        if 'issue' in low or 'pagkakaloob' in low:
	            found = _pi_date_ahead(L, i)
	            if found is not None:
	                date_of_issue = found
	                dprint("Found date of issue", date_of_issue)

	        # Extract Valid Until Date
	        if 'valid' in low or 'pagkawalang' in low:
	            found = _pi_date_ahead(L, i)
	            if found is not None:
	                valid_until = found
	                dprint("Found valid until", valid_until)

	        # Extract Issuing Authority
	        if 'authority' in low or 'maykapangyarihang' in low:
	            for t in take_within(L, i, 3):
	                if _PI_VALUE_RE.search(t) and 'DFA' in t:
	                    issuing_authority = t
	                    dprint("Found issuing authority", issuing_authority)
	                    break
	        if not issuing_authority and 'DFA' in line:
	            issuing_authority = line
	            dprint("Found issuing authority (direct)", issuing_authority)

	    # Fallback for dates no label claimed. Birth precedes issue, which precedes
	    # expiry, so the remaining dates are handed out in chronological order and
	    # one date can never fill two fields.
	    claimed = {birth_date, date_of_issue, valid_until}
	    spare = sorted(d for d in seen_dates if d not in claimed)
	    if not birth_date and spare:
	        birth_date = spare.pop(0)
	        dprint("Found birth date (direct)", birth_date)
	    if not date_of_issue and spare:
	        date_of_issue = spare.pop(0)
	        dprint("Found date of issue (direct)", date_of_issue)
	    if not valid_until and spare:
	        valid_until = spare.pop(0)
	        dprint("Found valid until (direct)", valid_until)

	    # Compose full name from separate fields
	    full_name = normalize_full_name(surname, given_names, middle_name)
	    dprint("Composed full name", {"surname": surname, "given": given_names, "middle": middle_name, "full": full_name})

	    # Return structured result
	    result = {
	        "id_type": "passport",
	        "passport_number": passport_number,
	        "id_number": passport_number,
	        "full_name": full_name,
	        "surname": surname,
	        "given_names": given_names,
	        "middle_name": middle_name,
	        "birth_date": birth_date,
	        "sex": sex,
	        "nationality": nationality,
	        "place_of_birth": place_of_birth,
	        "date_of_issue": date_of_issue,
	        "valid_until": valid_until,
	        "issuing_authority": issuing_authority,
	    }
	    dprint("Final result", result)
	    return result

	def extract_ocr_lines(image_path):
	    """
	    Run PaddleOCR on an image and extract passport fields from its text.

	    Args:
	    image_path (str): Path of the image file to recognize

	    Returns:
	    dict: The result of extract_passport_info() for the recognized text
	    lines. When OCR yields no text the same function is called with an empty
	    list, so the returned dict always has the same keys (all None except
	    "id_type").

	    Notes:
	    - The result parsing expects each entry to be a (box, (text, score))
	      pair, optionally wrapped in one outer per-image list; entries of any
	      other shape are skipped
	    - NOTE(review): the constructor mixes keyword arguments from different
	      PaddleOCR generations (show_log / cls alongside use_doc_* options);
	      confirm against the PaddleOCR version actually installed
	    """
	    os.makedirs("output", exist_ok=True)
	    dprint("Initializing PaddleOCR")

	    # Ensure any internal downloader/progress writes go to stderr, not stdout
	    with redirect_stdout(sys.stderr), redirect_stderr(sys.stderr):
	        ocr = PaddleOCR(
	            use_doc_orientation_classify=False,
	            use_doc_unwarping=False,
	            use_textline_orientation=False,
	            lang='en',
	            show_log=False
	        )
	        dprint("OCR initialized")
	        dprint("Running OCR ocr", image_path)
	        results = ocr.ocr(image_path, cls=False)

	    # Unwrap the per-image list when present; tolerate a None/empty result.
	    page = results[0] if results and isinstance(results[0], list) else results
	    try:
	        count = len(page)
	    except TypeError:
	        count = 0
	    dprint("OCR ocr done, results_count", count)

	    # Pull the recognized text out of each (box, (text, score)) entry
	    all_text = []
	    try:
	        for item in page or []:
	            if isinstance(item, (list, tuple)) and len(item) >= 2:
	                meta = item[1]
	                if isinstance(meta, (list, tuple)) and len(meta) >= 1:
	                    all_text.append(str(meta[0]))
	    except Exception as e:
	        dprint("Error processing OCR results", str(e))

	    dprint("All direct texts", all_text)
	    return extract_passport_info(all_text)

	def _emit_json(payload):
	    """Write payload as one JSON line to the real stdout saved in _ORIG_STDOUT."""
	    # sys.stdout is rebound to stderr at import time, so a plain print() would
	    # send the JSON to stderr where the caller reading stdout never sees it.
	    print(json.dumps(payload), file=_ORIG_STDOUT)
	    _ORIG_STDOUT.flush()


	def main():
	    """
	    Command-line entry point.

	    Reads the image URL from sys.argv[1], downloads and OCRs the image, and
	    writes exactly one JSON document to stdout:
	    - success: {"success": true, "ocr_results": {...}}
	    - failure: {"success": false, "error": "...", ...}

	    Exits with status 1 when no URL is given or when processing fails.
	    """
	    if len(sys.argv) < 2:
	        _emit_json({"error": "No image URL provided", "success": False})
	        sys.exit(1)

	    image_url = sys.argv[1]
	    dprint("Processing image URL", image_url)
	    try:
	        image_path = download_image(image_url)
	        dprint("Image downloaded to", image_path)
	        ocr_results = extract_ocr_lines(image_path)
	        dprint("OCR results ready")
	        # Ensure only the final JSON goes to stdout
	        _emit_json({"success": True, "ocr_results": ocr_results})
	    except Exception as e:
	        import traceback
	        error_msg = str(e)
	        traceback_msg = traceback.format_exc()
	        dprint("Exception", error_msg)
	        dprint("Traceback", traceback_msg)
	        _emit_json({
	            "error": error_msg,
	            "traceback": traceback_msg,
	            "success": False
	        })
	        sys.exit(1)


	if __name__ == "__main__":
	    main()