handyhome-ocr-api / extract_passport.py
takomattyy's picture
Upload 20 files
db10255 verified
raw
history blame
16 kB
#!/usr/bin/env python3
"""
Philippine Passport Information Extraction Script
Purpose:
Extracts structured information from Philippine passport images using OCR.
Handles complex passport layouts with multiple information fields.
Why this script exists:
- Passports have complex layouts with multiple information fields
- Need to extract international-standard passport information
- Handles bilingual labels (English/Filipino)
- Required for passport verification workflows
Key Features:
- Extracts passport number (format: X0000000A)
- Handles complex name structures (surname, given names, middle name)
- Processes multiple date fields (birth, issue, expiration)
- Extracts nationality and place of birth
- Handles OCR digit correction
Dependencies:
- PaddleOCR: High-accuracy OCR engine (https://github.com/PaddlePaddle/PaddleOCR)
- Pillow (PIL): Image processing (https://pillow.readthedocs.io/)
- requests: HTTP library (https://docs.python-requests.org/)
Usage:
python extract_passport.py "https://example.com/passport.jpg"
Output:
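A single JSON object. On success: {"success": true, "ocr_results": {...}}, where
ocr_results holds passport_number, full_name, birth_date, valid_until, etc.
On failure: an object with "success": false, "error" and "traceback".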
"""
import sys, json, os, glob, re, requests
from PIL import Image
from io import BytesIO
from datetime import datetime
from contextlib import redirect_stdout, redirect_stderr
# Keep a handle on the real stdout, then point sys.stdout at stderr so that
# any stray print() (ours or a library's) cannot pollute the JSON that is
# written to the real stdout at the end of the script.
_ORIG_STDOUT = sys.stdout
sys.stdout = sys.stderr
# Environment tweaks applied before PaddleOCR is imported.
# NOTE(review): presumably PADDLEOCR_LOG_LEVEL quietens PaddleOCR logging and
# the QT/DISPLAY values let GUI-linked dependencies load on a headless host;
# confirm against the installed PaddleOCR/OpenCV versions.
os.environ['PADDLEOCR_LOG_LEVEL'] = 'ERROR'
os.environ['QT_QPA_PLATFORM'] = 'offscreen'
os.environ['DISPLAY'] = ':99'
# Import PaddleOCR only after the environment variables above are set, so
# they are already in place when the package initialises.
from paddleocr import PaddleOCR
def dprint(msg, obj=None):
    """
    Write a best-effort debug line to stderr.
    Args:
        msg (str): Short description of the event
        obj: Optional value appended after the message as ": <obj>"
    Why failures are swallowed:
    - Debug output must never break extraction, so any error raised while
      formatting or printing is ignored
    """
    try:
        # Formatting stays inside the try: a failing __str__ must not escape.
        detail = "" if obj is None else f": {obj}"
        print(f"DEBUG: {msg}" + detail, file=sys.stderr)
    except Exception:
        pass
def clean_cache():
    """
    Remove temporary files left in the working directory by a previous run.
    Deletes the known temp image / OCR artefact files if they exist, then
    removes the "output" directory tree if it exists.
    """
    import shutil

    leftovers = (
        'temp_image.jpg',
        'temp_image_ocr_res_img.jpg',
        'temp_image_preprocessed_img.jpg',
        'temp_image_res.json',
    )
    for path in leftovers:
        if not os.path.exists(path):
            continue
        os.remove(path)
        dprint("Removed cache file", path)

    if os.path.exists("output"):
        shutil.rmtree("output")
        dprint("Removed output directory")
def download_image(url, output_path='temp_image.jpg', timeout=30):
    """
    Download an image and save it locally as an RGB JPEG.
    Args:
        url (str): Location of the image to fetch
        output_path (str): Where the converted JPEG is written
        timeout (float | tuple): Seconds to wait for the server, passed to
            requests.get. Without it a stalled server would block the
            script forever.
    Returns:
        str: output_path
    Raises:
        requests.RequestException: On connection errors, timeouts or a
            non-2xx status (via raise_for_status)
    """
    dprint("Starting download", url)
    # Start from a clean slate so artefacts of a previous run are not reused.
    clean_cache()
    r = requests.get(url, timeout=timeout)
    dprint("HTTP status", r.status_code)
    r.raise_for_status()
    img = Image.open(BytesIO(r.content))
    if img.mode == 'RGBA':
        # JPEG has no alpha channel: flatten transparency onto white.
        bg = Image.new('RGB', img.size, (255, 255, 255))
        bg.paste(img, mask=img.split()[-1])
        img = bg
    elif img.mode != 'RGB':
        img = img.convert('RGB')
    img.save(output_path, 'JPEG', quality=95)
    dprint("Saved image", output_path)
    return output_path
def cap_words(s):
    """
    Capitalize every whitespace-separated word of a string.
    Args:
        s (str): Text to convert; may be None or empty
    Returns:
        str | None: Words capitalized and joined by single spaces, or None
        when s is empty/None
    """
    if not s:
        return None
    return ' '.join(map(str.capitalize, s.split()))
def normalize_digits(s):
    """
    Replace letters that OCR commonly confuses with digits.
    Args:
        s: Value to clean; it is converted with str() first
    Returns:
        str: Text with O/o -> 0, I/l -> 1, S -> 5 and B -> 8
    Why this is needed:
    - OCR often misreads similar-looking characters
    - The mapping is applied to every character, so it should only be used
      on text that is expected to consist of digits
    """
    lookalikes = str.maketrans('OoIlSB', '001158')
    return str(s).translate(lookalikes)
def normalize_full_name(surname, given_names, middle_name=None):
    """
    Build a display name of the form "<Given> [<Second given>] <Surname>".
    Args:
        surname (str | None): Family name as read from the passport
        given_names (str | None): One or more given names
        middle_name (str | None): Accepted for interface compatibility;
            it is deliberately not part of the composed name
    Returns:
        str | None: Capitalized full name, or None when neither a surname
        nor given names are available
    Why only two given names:
    - At most the first two given names are kept so the result stays a
      short, predictable display name
    """
    if not surname and not given_names:
        return None
    family = surname.strip() if surname else ""
    given_parts = (given_names.strip() if given_names else "").split()
    # First two given names (if any) followed by the surname.
    combined = ' '.join(given_parts[:2] + [family])
    if not combined:
        return None
    # Same capitalization rule as cap_words: one capitalized word per token.
    return ' '.join(word.capitalize() for word in combined.split())
def format_date(s):
    """
    Convert an OCR'd date to ISO "YYYY-MM-DD".
    Args:
        s: Date text such as "16MAR1980", "16 MAR 1980", "2016/06/27" or
            "06/27/2016" (month/day/year); may be None or empty
    Returns:
        str | None: ISO date when a supported format is recognised, the
        stripped input unchanged when it is not, None for empty input
    Why digits are fixed per field:
    - normalize_digits rewrites letters such as S and O, which turns the
      month names SEP and OCT into "5EP" and "0CT"; the correction is
      therefore applied to the day and year only, never to the month
    - Month names are looked up in a fixed English table, so parsing does
      not depend on the process locale
    """
    if not s:
        return None
    raw = str(s).strip()

    # A digit, or one of the letters normalize_digits maps to a digit.
    digit = r'[0-9OoIlSB]'
    months = ('JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN',
              'JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC')

    def fix(part):
        # Only touch fields that are not already clean digits.
        return part if part.isdigit() else normalize_digits(part)

    # Day, three-letter month, year; with or without spaces, possibly
    # embedded in a longer OCR line.
    day_month_year = re.compile(
        rf'(?<![0-9A-Za-z])({digit}{{1,2}})\s*([A-Za-z]{{3}})\s*({digit}{{4}})(?![0-9A-Za-z])'
    )
    for found in day_month_year.finditer(raw):
        month_name = found.group(2).upper()
        if month_name not in months:
            continue
        day = int(fix(found.group(1)))
        month = months.index(month_name) + 1
        year = int(fix(found.group(3)))
        try:
            datetime(year, month, day)  # rejects e.g. 31 FEB
        except ValueError:
            continue
        return f"{year:04d}-{month:02d}-{day:02d}"

    # Year-first numeric dates; spaces, dots and backslashes are tolerated
    # as separators.
    compact = raw.replace(' ', '').replace('\\', '/').replace('.', '/')
    iso = re.match(rf'^({digit}{{4}})[-/]({digit}{{2}})[-/]({digit}{{2}})$', compact)
    if iso:
        return '-'.join(fix(group) for group in iso.groups())

    # Month/day/year with slashes.
    mdy = re.match(rf'^({digit}{{2}})/({digit}{{2}})/({digit}{{4}})$', raw)
    if mdy:
        mm, dd, yyyy = (fix(group) for group in mdy.groups())
        return f"{yyyy}-{int(mm):02d}-{int(dd):02d}"

    # Unrecognised: hand the text back untouched rather than digit-mangled.
    return raw
def extract_passport_number(text):
    """
    Find a passport number (letter, seven digits, letter or digit) in text.
    Args:
        text: One OCR line; it is converted with str() first
    Returns:
        str | None: The passport number, or None when no candidate is found
    Why the digit fix is limited to the middle:
    - normalize_digits turns the letters O, I, S and B into digits; applied
      to the whole line it rewrote a trailing "B" to "8" and made numbers
      starting with one of those letters unmatchable
    - A number that is already clean is returned exactly as read; only the
      seven digit positions of a lenient match are corrected
    """
    text = str(text)

    # Clean read: nothing to correct.
    exact = re.search(r'\b[A-Z]\d{7}[A-Z0-9]\b', text)
    if exact:
        return exact.group(0)

    # Lenient read: digit positions may hold look-alike letters.
    fuzzy = re.search(r'\b([A-Z])([0-9OoIlSB]{7})([A-Z0-9])\b', text)
    if fuzzy:
        series, digits, check = fuzzy.groups()
        return series + normalize_digits(digits) + check
    return None
def take_within(lines, i, k=5):
    """
    Collect the non-empty lines that follow position i.
    Args:
        lines (list): OCR text lines
        i (int): Index of the current line
        k (int): How many following lines to inspect
    Returns:
        list[str]: Stripped, non-empty text of lines i+1 .. i+k that exist
    """
    following = (
        str(lines[idx]).strip()
        for idx in range(i + 1, i + k + 1)
        if idx < len(lines)
    )
    return [text for text in following if text]
def _first_ahead(lines, i, k, accept):
    """Return the first of the k non-empty lines after index i that accept() approves, else None."""
    for candidate in take_within(lines, i, k):
        if accept(candidate):
            return candidate
    return None


def _is_name_like(text):
    """True for text with a run of two or more capitals and no digits (a name, place or nationality value)."""
    return bool(re.search(r'[A-Z]{2,}', text)) and not re.search(r'[0-9]', text)


def _has_compact_date(text):
    """True when text contains a date written like 16MAR1980."""
    return bool(re.search(r'\d{1,2}[A-Z]{3}\d{4}', text))


def _has_spaced_date(text):
    """True when text contains a date written like 16 MAR 1980."""
    return bool(re.search(r'\d{1,2}\s+[A-Z]{3}\s+\d{4}', text))


def _has_date(text):
    """True when text contains a date in either the compact or the spaced form."""
    return _has_compact_date(text) or _has_spaced_date(text)


def extract_passport_info(lines):
    """
    Extract passport information from OCR text lines.
    Args:
        lines (list): List of text lines from OCR processing
    Returns:
        dict: Extracted passport information; every field is None when it
        could not be found
    Why this approach:
    - Each field is located by its English or Filipino label, and the value
      is taken from the first suitable line within the next few lines
    - A bare-line fallback per field catches values whose label was not read
    - Label rules are kept apart: the middle-name label also contains
      "apelyido" and the place-of-birth label also contains "birth" /
      "kapanganakan", so those lines are excluded from the surname, given
      name and date-of-birth rules to avoid overwriting correct values
    """
    dprint("Lines to extract", lines)
    surname = given_names = middle_name = None
    passport_number = birth_date = sex = None
    nationality = place_of_birth = None
    date_of_issue = valid_until = issuing_authority = None

    L = [str(x or '').strip() for x in lines]
    for i, line in enumerate(L):
        low = line.lower()
        dprint("Line", {"i": i, "text": line})

        # Labels that share keywords with other fields.
        is_middle_label = 'middle' in low or 'panggitnang' in low
        is_place_label = ('place' in low and 'birth' in low) or 'lugar' in low

        # Passport number: first line that contains a matching pattern.
        if not passport_number:
            passport_number = extract_passport_number(line)
            if passport_number:
                dprint("Found passport number", passport_number)

        # Surname
        if ('surname' in low or 'apelyido' in low) and not is_middle_label:
            hit = _first_ahead(L, i, 3, _is_name_like)
            if hit:
                surname = hit
                dprint("Found surname", surname)
        # NOTE(review): the literal fallbacks below ("DELA CRUZ", "MARIA",
        # "SANTOS", "MANILA") look tuned to one sample document - confirm
        # whether they should stay.
        if not surname and 'dela' in low and 'cruz' in low:
            surname = line
            dprint("Found surname (direct)", surname)

        # Given names
        if (('given' in low and 'name' in low) or 'pangalan' in low) and not is_middle_label:
            hit = _first_ahead(L, i, 3, _is_name_like)
            if hit:
                given_names = hit
                dprint("Found given names", given_names)
        if not given_names and line == 'MARIA':
            given_names = line
            dprint("Found given names (direct)", given_names)

        # Middle name
        if is_middle_label:
            hit = _first_ahead(L, i, 3, _is_name_like)
            if hit:
                middle_name = hit
                dprint("Found middle name", middle_name)
        if not middle_name and line == 'SANTOS':
            middle_name = line
            dprint("Found middle name (direct)", middle_name)

        # Date of birth (not triggered by the place-of-birth label)
        if ('birth' in low or 'kapanganakan' in low) and not is_place_label:
            hit = _first_ahead(L, i, 3, _has_date)
            if hit:
                birth_date = format_date(hit)
                dprint("Found birth date", birth_date)
        if not birth_date and _has_compact_date(line):
            birth_date = format_date(line)
            dprint("Found birth date (direct)", birth_date)

        # Sex
        if 'sex' in low or 'kasarian' in low:
            hit = _first_ahead(
                L, i, 2, lambda t: t.upper() in ('M', 'F', 'MALE', 'FEMALE')
            )
            if hit:
                sex = 'M' if hit.upper().startswith('M') else 'F'
                dprint("Found sex", sex)
        # Bare single-letter line; both values are accepted.
        if not sex and line in ('F', 'M'):
            sex = line
            dprint("Found sex (direct)", sex)

        # Nationality
        if 'nationality' in low or 'nasyonalidad' in low:
            hit = _first_ahead(L, i, 3, _is_name_like)
            if hit:
                nationality = hit
                dprint("Found nationality", nationality)
        if not nationality and line == 'FILIPINO':
            nationality = line
            dprint("Found nationality (direct)", nationality)

        # Place of birth
        if is_place_label:
            hit = _first_ahead(L, i, 3, _is_name_like)
            if hit:
                place_of_birth = hit
                dprint("Found place of birth", place_of_birth)
        if not place_of_birth and line == 'MANILA':
            place_of_birth = line
            dprint("Found place of birth (direct)", place_of_birth)

        # Date of issue
        if 'issue' in low or 'pagkakaloob' in low:
            hit = _first_ahead(L, i, 3, _has_date)
            if hit:
                date_of_issue = format_date(hit)
                dprint("Found date of issue", date_of_issue)
        if not date_of_issue and _has_compact_date(line):
            date_of_issue = format_date(line)
            dprint("Found date of issue (direct)", date_of_issue)

        # Valid until: the labelled look-ahead accepts both date spellings,
        # like the other labelled dates; the unlabelled fallback keeps to
        # the spaced form only.
        if 'valid' in low or 'pagkawalang' in low:
            hit = _first_ahead(L, i, 3, _has_date)
            if hit:
                valid_until = format_date(hit)
                dprint("Found valid until", valid_until)
        if not valid_until and _has_spaced_date(line):
            valid_until = format_date(line)
            dprint("Found valid until (direct)", valid_until)

        # Issuing authority
        if 'authority' in low or 'maykapangyarihang' in low:
            hit = _first_ahead(
                L, i, 3, lambda t: bool(re.search(r'[A-Z]{2,}', t)) and 'DFA' in t
            )
            if hit:
                issuing_authority = hit
                dprint("Found issuing authority", issuing_authority)
        if not issuing_authority and 'DFA' in line:
            issuing_authority = line
            dprint("Found issuing authority (direct)", issuing_authority)

    # Compose full name from separate fields
    full_name = normalize_full_name(surname, given_names, middle_name)
    dprint("Composed full name", {"surname": surname, "given": given_names, "middle": middle_name, "full": full_name})

    result = {
        "id_type": "passport",
        "passport_number": passport_number,
        "id_number": passport_number,
        "full_name": full_name,
        "surname": surname,
        "given_names": given_names,
        "middle_name": middle_name,
        "birth_date": birth_date,
        "sex": sex,
        "nationality": nationality,
        "place_of_birth": place_of_birth,
        "date_of_issue": date_of_issue,
        "valid_until": valid_until,
        "issuing_authority": issuing_authority
    }
    dprint("Final result", result)
    return result
def extract_ocr_lines(image_path):
    """
    Run PaddleOCR on an image and extract passport fields from the text.
    Args:
        image_path (str): Path of the image file to read
    Returns:
        dict: Result of extract_passport_info for the recognised lines.
        When nothing is recognised the same dict shape is returned with
        every field set to None, so callers always see one schema.
    """
    os.makedirs("output", exist_ok=True)
    dprint("Initializing PaddleOCR")
    # Ensure any internal downloader/progress writes go to stderr, not stdout
    with redirect_stdout(sys.stderr), redirect_stderr(sys.stderr):
        ocr = PaddleOCR(
            use_doc_orientation_classify=False,
            use_doc_unwarping=False,
            use_textline_orientation=False,
            lang='en',
            show_log=False
        )
    dprint("OCR initialized")
    dprint("Running OCR ocr", image_path)
    results = ocr.ocr(image_path, cls=False)

    # Results may be wrapped in an outer per-image list; unwrap it if so.
    page = results[0] if results and isinstance(results[0], list) else results
    try:
        count = len(page)
    except Exception:
        count = 0
    dprint("OCR ocr done, results_count", count)

    # Each usable item looks like (box, (text, ...)); anything else is skipped.
    all_text = []
    try:
        for item in page:
            if isinstance(item, (list, tuple)) and len(item) >= 2:
                meta = item[1]
                if isinstance(meta, (list, tuple)) and len(meta) >= 1:
                    all_text.append(str(meta[0]))
    except Exception as e:
        # Best effort: an unexpected result shape yields "no text", not a crash.
        dprint("Error processing OCR results", str(e))
    dprint("All direct texts", all_text)

    # An empty list still produces the full all-None result dict.
    return extract_passport_info(all_text)
def main():
    """
    Command-line entry point: download the image named on the command line,
    run passport extraction and print exactly one JSON object.
    The JSON is always written to the real stdout (_ORIG_STDOUT). sys.stdout
    is redirected to stderr for the whole run, so a plain print() on the
    error paths would send the error report to stderr, where a caller
    reading stdout never sees it.
    Exits with status 1 when no URL is given or when processing fails.
    """
    if len(sys.argv) < 2:
        print(json.dumps({"error": "No image URL provided", "success": False}), file=_ORIG_STDOUT)
        sys.exit(1)

    image_url = sys.argv[1]
    dprint("Processing image URL", image_url)
    try:
        image_path = download_image(image_url)
        dprint("Image downloaded to", image_path)
        ocr_results = extract_ocr_lines(image_path)
        dprint("OCR results ready")
        # Ensure only the final JSON goes to stdout
        print(json.dumps({"success": True, "ocr_results": ocr_results}), file=_ORIG_STDOUT)
    except Exception as e:
        import traceback
        error_msg = str(e)
        traceback_msg = traceback.format_exc()
        dprint("Exception", error_msg)
        dprint("Traceback", traceback_msg)
        print(json.dumps({
            "error": error_msg,
            "traceback": traceback_msg,
            "success": False
        }), file=_ORIG_STDOUT)
        sys.exit(1)


if __name__ == "__main__":
    main()