handyhome-ocr-api / extract_national_id.py
takomattyy's picture
Upload 10 files
6916300 verified
#!/usr/bin/env python3
"""
Philippine National ID Information Extraction Script
Purpose:
Extracts structured information from Philippine National ID card images using OCR.
This script is designed to work as a microservice for document verification systems.
Why this script exists:
- Manual verification of National IDs is time-consuming and error-prone
- Need standardized data extraction from bilingual (English/Filipino) ID cards
- Required for automated document verification workflows
- Supports API integration for web applications
Key Features:
- Extracts the 16-digit National ID number (19 characters including hyphens; format: XXXX-XXXX-XXXX-XXXX)
- Handles bilingual labels (English/Filipino)
- Processes name composition from separate fields
- Formats dates consistently (ISO format: YYYY-MM-DD)
- Preserves EXIF metadata when possible
Dependencies:
- PaddleOCR: High-accuracy OCR engine (https://github.com/PaddlePaddle/PaddleOCR)
- Pillow (PIL): Image processing (https://pillow.readthedocs.io/)
- requests: HTTP library for image downloads (https://docs.python-requests.org/)
Usage:
python extract_national_id.py "https://example.com/national_id.jpg"
Output:
JSON on stdout: {"success": true, "ocr_results": {id_type, id_number, full_name, birth_date}}; on failure: {"error": "<message>"}
"""
import sys, json, os, glob, re, requests
from PIL import Image
from io import BytesIO
from datetime import datetime
from contextlib import redirect_stdout, redirect_stderr
# Keep a handle on the real stdout, then point sys.stdout at stderr.
# From here on every plain print() (ours or a library's) lands on stderr, so
# the real stdout carries nothing except the final JSON document that the
# __main__ section writes through original_stdout.
original_stdout = sys.stdout
sys.stdout = sys.stderr
# Environment variables set before paddleocr is imported.
# NOTE(review): presumably PADDLEOCR_LOG_LEVEL is honoured by PaddleOCR to
# reduce log noise - confirm against the installed PaddleOCR version.
os.environ['PADDLEOCR_LOG_LEVEL'] = 'ERROR'
# NOTE(review): these look intended for headless servers (Qt "offscreen"
# platform plugin, X display ":99") - confirm they are still required.
os.environ['QT_QPA_PLATFORM'] = 'offscreen'
os.environ['DISPLAY'] = ':99'
# Import PaddleOCR after setting environment variables
from paddleocr import PaddleOCR
def clean_cache():
    """
    Delete cached images and OCR output files left over from earlier runs.

    Removes a fixed list of temporary files and the ``output`` directory,
    all resolved relative to the current working directory.

    Why this is needed:
    - OCR processing creates temporary files that can accumulate
    - Prevents disk space exhaustion in production environments
    - Ensures clean state for each new document processing

    Entries that do not exist are skipped silently. Removal is attempted
    directly instead of checking for existence first, so a file that
    disappears between a check and the delete (for example because another
    run cleaned it up) cannot make this function raise.
    """
    import shutil

    # Temporary files created by PaddleOCR and by our own processing
    cache_files = (
        'temp_image.jpg',                   # Downloaded image
        'temp_image_ocr_res_img.jpg',       # OCR result visualization
        'temp_image_preprocessed_img.jpg',  # Preprocessed image
        'temp_image_res.json',              # OCR result JSON
    )

    for path in cache_files:
        try:
            os.remove(path)
        except FileNotFoundError:
            # Nothing to clean up for this entry
            continue
        print(f"DEBUG: Removed cached file: {path}", file=sys.stderr)

    # Remove the OCR output directory (and everything in it) from previous runs
    try:
        shutil.rmtree("output")
    except FileNotFoundError:
        return
    print("DEBUG: Removed output directory", file=sys.stderr)
def download_image(url, output_path='temp_image.jpg'):
    """
    Download an image from a URL and save it as an RGB JPEG for OCR.

    Args:
        url (str): URL of the National ID image to process
        output_path (str): Local path to save the downloaded image

    Returns:
        str: Path to the downloaded and processed image

    Raises:
        requests.RequestException: If the download fails or the server
            answers with an HTTP error status.
        OSError: If the payload cannot be decoded as an image or the file
            cannot be written.

    Why this approach:
    - Downloads images from URLs (common in web applications)
    - Converts RGBA to RGB (JPEG doesn't support alpha channels)
    - Preserves EXIF data when possible (important for document authenticity)
    - Uses high quality (95%) to maintain OCR accuracy
    """
    # Clean cache before downloading new image to prevent conflicts
    clean_cache()

    # Download the image with error handling and User-Agent header
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()  # Raise exception for HTTP errors
    image_data = response.content

    image = Image.open(BytesIO(image_data))
    # Force decoding now so that metadata stored after the pixel data is
    # available in image.info before we read it.
    image.load()

    # Keep the EXIF payload as raw bytes. The JPEG writer's ``exif`` option
    # takes bytes (or an Image.Exif object); the decoded tag dictionary from
    # the private ``_getexif()`` helper is not a valid value for it.
    exif_bytes = image.info.get('exif')
    if not isinstance(exif_bytes, (bytes, bytearray)) or not exif_bytes:
        exif_bytes = None

    # Convert to RGB if necessary (JPEG doesn't support alpha channel)
    if image.mode == 'RGBA':
        # Flatten transparency onto a white background, using the alpha
        # channel as the paste mask
        background = Image.new('RGB', image.size, (255, 255, 255))
        background.paste(image, mask=image.split()[-1])
        image = background
    elif image.mode != 'RGB':
        # Convert other modes (like L for grayscale) to RGB
        image = image.convert('RGB')

    # Save as JPEG; high quality (95) keeps text edges sharp for OCR
    if exif_bytes:
        try:
            image.save(output_path, 'JPEG', quality=95, exif=exif_bytes)
        except (ValueError, TypeError, OSError) as exc:
            # EXIF preservation is best effort: an oversized or malformed
            # block must not prevent OCR from running on the pixels.
            print(f"DEBUG: Could not preserve EXIF data ({exc}); saving without it", file=sys.stderr)
            image.save(output_path, 'JPEG', quality=95)
    else:
        image.save(output_path, 'JPEG', quality=95)

    print(f"DEBUG: Downloaded and saved image to: {output_path}", file=sys.stderr)
    return output_path
# Formatting Helpers
def format_birth_date(date_str):
    """
    Format birth date string to ISO format (YYYY-MM-DD).

    Args:
        date_str (str): Raw date string from OCR

    Returns:
        str: Formatted date in YYYY-MM-DD format, the unmodified input string
        if it cannot be parsed as a real calendar date, or None for empty
        input.

    Why this complexity:
    - OCR often produces inconsistent date formats
    - Need standardized output for database storage
    - Handles common OCR errors like missing spaces or a dropped comma
      ('JULY 10, 2003', 'JULY10,2003' and 'JULY102003' all parse)
    """
    if not date_str:
        return None

    # English month prefixes are looked up in a table rather than through
    # strptime('%b'), whose month names follow the process locale.
    month_numbers = {
        'JAN': 1, 'FEB': 2, 'MAR': 3, 'APR': 4, 'MAY': 5, 'JUN': 6,
        'JUL': 7, 'AUG': 8, 'SEP': 9, 'OCT': 10, 'NOV': 11, 'DEC': 12,
    }

    # Work on a whitespace-free copy; the caller's string is kept untouched
    # so it can be returned verbatim when nothing matches.
    compact = ''.join(date_str.split())

    match = re.match(r'([A-Za-z]+)(\d{1,2}),?(\d{4})', compact)
    if match:
        month_str, day, year = match.groups()
        month = month_numbers.get(month_str[:3].upper())
        if month is not None:
            try:
                # Constructing the datetime rejects impossible dates such as
                # February 30 instead of emitting them as if they were valid.
                datetime(int(year), month, int(day))
            except ValueError:
                pass
            else:
                return f"{year}-{month:02d}-{int(day):02d}"

    # Other layouts, all tried against the whitespace-free text
    for fmt in ("%B%d,%Y", "%b%d,%Y", "%Y-%m-%d"):
        try:
            return datetime.strptime(compact, fmt).strftime("%Y-%m-%d")
        except ValueError:
            continue

    # Fallback to the original input if parsing fails
    return date_str
# Given names that may appear glued together in OCR output
_KNOWN_GIVEN_NAMES = frozenset({
    'CARL', 'MATTHEW', 'JOHN', 'MARK', 'LUKE', 'PAUL', 'MARIA', 'JOSE',
    'JUAN', 'PEDRO', 'MIGUEL', 'ANGEL', 'LUIS', 'CARLOS', 'MARCO', 'ANDRE',
    'ALBERT', 'JOY',
})


def _split_known_names(word):
    """Return `word` cut into known given names, or None if it cannot be fully covered."""
    if not word:
        return []
    # Try the longest prefix first so that a word that is itself a known
    # name ("CARLOS") is kept whole rather than cut at a shorter one ("CARL").
    for end in range(len(word), 2, -1):
        head = word[:end]
        if head in _KNOWN_GIVEN_NAMES:
            rest = _split_known_names(word[end:])
            if rest is not None:
                return [head] + rest
    return None


def capitalize_name(name):
    """
    Properly capitalize name string and split concatenated names.

    Args:
        name (str): Raw name string from OCR

    Returns:
        str: Capitalized name with single spaces between name parts. Empty
        or None input is returned unchanged.

    Why this is needed:
    - OCR often produces inconsistent capitalization
    - OCR may concatenate multiple given names without spaces
    - Need standardized name format for database storage
    - Handles multiple spaces and OCR artifacts

    An all-caps token longer than five letters is split only when the whole
    token is a concatenation of known given names, e.g.
    "CARLMATTHEW" -> "Carl Matthew". Tokens that merely start with a known
    name ("MARCOS", "JOHNSON", "MARKUS") are left intact, because cutting
    them would corrupt a legitimate name.
    """
    if not name:
        return name

    processed_words = []
    for word in name.split():
        parts = None
        if word.isupper() and len(word) > 5:
            parts = _split_known_names(word)

        if parts and len(parts) > 1:
            processed_words.extend(part.capitalize() for part in parts)
        else:
            processed_words.append(word.capitalize())

    return ' '.join(processed_words)
# OCR Function
# XXXX-XXXX-XXXX-XXXX: four groups of four digits
_ID_NUMBER_RE = re.compile(r'\d{4}-\d{4}-\d{4}-\d{4}')
# English month name or abbreviation at the start of a word, in any letter case
_MONTH_NAME_RE = re.compile(
    r'(?<![A-Za-z])(?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)',
    re.IGNORECASE,
)
# "<day><separator><4-digit year>", e.g. "10, 2003"
_DAY_YEAR_RE = re.compile(r'\d{1,2}[,.\s]+\d{4}')
# A line containing any of these is treated as a label, not as a date value
_NON_DATE_KEYWORDS = ('DIGITAL', 'NUMBER', 'ADDRESS', 'TIRAHAN', 'ID')


def extract_id_info(lines):
    """
    Extract structured information from OCR text lines.

    Args:
        lines (list): List of text lines from OCR processing

    Returns:
        dict: Keys ``id_type``, ``id_number``, ``full_name`` and
        ``birth_date``. A field that could not be found is None.

    Why this approach:
    - The ID number is matched as XXXX-XXXX-XXXX-XXXX (16 digits, 19
      characters with hyphens); whitespace inside the line is ignored
    - Uses bilingual labels (English/Filipino) for field identification
    - A name value is read from the line that follows its label
    - The birth date is searched in the three lines after its label, because
      other labels may sit between the label and the value
    """
    print("DEBUG: Processing lines:", lines, file=sys.stderr)

    id_number = None
    last_name = None
    given_names = None
    birth_date = None

    total = len(lines)
    for i, line in enumerate(lines):
        print(f"DEBUG: Processing line {i}: '{line}'", file=sys.stderr)
        if not isinstance(line, str):
            # Only text can carry a label or a value
            continue

        compact = ''.join(line.split())
        line_upper = compact.upper()
        following = lines[i + 1] if i + 1 < total else None

        # National ID number; stray spaces from OCR are tolerated
        if _ID_NUMBER_RE.fullmatch(compact):
            id_number = compact
            print(f"DEBUG: Found ID number: {id_number}", file=sys.stderr)

        # Bilingual "Last Name" label. Both halves are required so that a
        # line containing only one of the words does not match.
        if 'APELYIDO' in line_upper and 'LASTNAME' in line_upper and isinstance(following, str):
            last_name = following
            print(f"DEBUG: Found last name: {last_name}", file=sys.stderr)

        # Bilingual "Given Names" label
        if 'PANGALAN' in line_upper and 'GIVENNAMES' in line_upper and isinstance(following, str):
            given_names = following
            print(f"DEBUG: Found given names: {given_names}", file=sys.stderr)

        # Bilingual "Date of Birth" label
        if 'KAPANGANAKAN' in line_upper or ('DATEOF' in line_upper and 'BIRTH' in line_upper):
            for candidate in lines[i + 1:i + 4]:
                if not isinstance(candidate, str):
                    continue
                candidate_upper = ''.join(candidate.split()).upper()
                # Skip if it's another label
                if any(keyword in candidate_upper for keyword in _NON_DATE_KEYWORDS):
                    continue
                # Month names are matched case-insensitively so upper-case
                # dates such as "JULY102003" are recognised as well.
                if _MONTH_NAME_RE.search(candidate) or _DAY_YEAR_RE.search(candidate):
                    birth_date = candidate
                    print(f"DEBUG: Found birth date: {birth_date}", file=sys.stderr)
                    break

    # Compose full name as Given Names + Last Name; stays None when neither
    # part was found, matching how the other missing fields are reported.
    name_parts = [part for part in (given_names, last_name) if part]
    full_name = capitalize_name(' '.join(name_parts)) or None

    # Format birth date to ISO standard
    formatted_birth_date = format_birth_date(birth_date) if birth_date else None

    result = {
        'id_type': 'National ID',
        'id_number': id_number,
        'full_name': full_name,
        'birth_date': formatted_birth_date
    }
    print("DEBUG: Final result:", json.dumps(result, indent=2), file=sys.stderr)
    return result
def extract_ocr_lines(image_path):
    """
    Run OCR on an image and return the ID fields parsed from its text.

    Args:
        image_path (str): Path to the image file

    Returns:
        dict: Result of extract_id_info() for the recognised text lines, or
        a dict with id_number, full_name and birth_date set to None when no
        text could be extracted.

    Notes:
    - Orientation classification, unwarping and text-line orientation are
      switched off for better performance
    - The OCR language is English
    - Anything written to stdout/stderr during OCR is sent to stderr
    - Any error while reading the OCR result is logged and leaves the text
      collected so far in place
    """
    # Ensure output directory exists for OCR results
    os.makedirs("output", exist_ok=True)

    with redirect_stdout(sys.stderr), redirect_stderr(sys.stderr):
        engine = PaddleOCR(
            use_doc_orientation_classify=False,
            use_doc_unwarping=False,
            use_textline_orientation=False,
            lang='en',
        )
        results = engine.ocr(image_path)

    all_text = []
    try:
        if results and isinstance(results, list) and len(results) > 0:
            head = results[0]
            type_name = type(head).__name__

            if 'OCRResult' in type_name or 'ocr_result' in str(type(head)).lower():
                # Newer result shape: a mapping-like object holding 'rec_texts'
                print(f"DEBUG: Detected OCRResult object format (type: {type_name})", file=sys.stderr)
                try:
                    if hasattr(head, 'keys'):
                        recognised = dict(head).get('rec_texts')
                        if isinstance(recognised, list):
                            all_text = [str(t) for t in recognised if t]
                            print(f"DEBUG: Extracted {len(all_text)} text lines from rec_texts", file=sys.stderr)
                except Exception as e:
                    print(f"DEBUG: Error accessing OCRResult: {e}", file=sys.stderr)
            else:
                # Older result shape: nested lists whose second element
                # starts with the recognised text
                entries = head if isinstance(head, list) else results
                for entry in entries:
                    if not (isinstance(entry, (list, tuple)) and len(entry) >= 2):
                        continue
                    meta = entry[1]
                    if isinstance(meta, (list, tuple)) and len(meta) >= 1:
                        all_text.append(str(meta[0]))
    except Exception as e:
        print(f"DEBUG: Error processing OCR results: {str(e)}", file=sys.stderr)

    print(f"DEBUG: Extracted text lines: {all_text}", file=sys.stderr)

    if not all_text:
        return {'id_type': 'National ID', 'id_number': None, 'full_name': None, 'birth_date': None}
    return extract_id_info(all_text)
# Main execution
if __name__ == "__main__":
    # Command-line entry point: takes the image URL as the first argument and
    # writes exactly one JSON document to the real stdout. Exit status is 1
    # on any failure.
    if len(sys.argv) < 2:
        # sys.stdout points at stderr at this stage, so a plain print() would
        # never reach the caller; write to the saved real stdout instead.
        original_stdout.write(json.dumps({"error": "No image URL provided"}))
        original_stdout.flush()
        sys.exit(1)

    image_url = sys.argv[1]
    print(f"DEBUG: Processing image URL: {image_url}", file=sys.stderr)

    try:
        # Download and process the image
        image_path = download_image(image_url)
        # Perform OCR and extract information
        ocr_results = extract_ocr_lines(image_path)
        # Restore stdout and print only the JSON response
        sys.stdout = original_stdout
        sys.stdout.write(json.dumps({
            "success": True,
            "ocr_results": ocr_results
        }))
        sys.stdout.flush()
    except Exception as e:
        # Restore stdout for error JSON
        sys.stdout = original_stdout
        sys.stdout.write(json.dumps({"error": str(e)}))
        sys.stdout.flush()
        sys.exit(1)
    finally:
        # Cleanup is best effort and must not change the outcome reported
        # above, but a failure is still logged rather than hidden.
        try:
            clean_cache()
        except Exception as cleanup_error:
            print(f"DEBUG: Cache cleanup failed: {cleanup_error}", file=sys.stderr)