Spaces:

takomattyy
/

handyhome-ocr-api

Sleeping

App Files Files Community

handyhome-ocr-api / extract_national_id.py

takomattyy

Upload 10 files

6916300 verified 21 days ago

raw

history blame contribute delete

18.3 kB

	#!/usr/bin/env python3
	"""
	Philippine National ID Information Extraction Script

	Purpose:
	Extracts structured information from Philippine National ID card images using OCR.
	This script is designed to work as a microservice for document verification systems.

	Why this script exists:
	- Manual verification of National IDs is time-consuming and error-prone
	- Need standardized data extraction from bilingual (English/Filipino) ID cards
	- Required for automated document verification workflows
	- Supports API integration for web applications

	Key Features:
	- Extracts the 16-digit National ID number (19 characters including hyphens, format: XXXX-XXXX-XXXX-XXXX)
	- Handles bilingual labels (English/Filipino)
	- Processes name composition from separate fields
	- Formats dates consistently (ISO format: YYYY-MM-DD)
	- Preserves EXIF metadata when possible

	Dependencies:
	- PaddleOCR: High-accuracy OCR engine (https://github.com/PaddlePaddle/PaddleOCR)
	- Pillow (PIL): Image processing (https://pillow.readthedocs.io/)
	- requests: HTTP library for image downloads (https://docs.python-requests.org/)

	Usage:
	python extract_national_id.py "https://example.com/national_id.jpg"

	Output:
	JSON on stdout: {"success": true, "ocr_results": {...}} where ocr_results holds id_type, id_number, full_name and birth_date; on failure, {"error": "<message>"}
	"""

	import sys, json, os, glob, re, requests
	from PIL import Image
	from io import BytesIO
	from datetime import datetime
	from contextlib import redirect_stdout, redirect_stderr

	# Alias sys.stdout to stderr for the rest of the run, so that any bare print()
	# (ours or a library's) lands on stderr. The real stdout is kept in
	# original_stdout so the script can write its single JSON response there.
	original_stdout = sys.stdout
	sys.stdout = sys.stderr

	# Environment settings applied before paddleocr is imported further down.
	# NOTE(review): presumably PADDLEOCR_LOG_LEVEL lowers PaddleOCR's log verbosity -
	# confirm that the installed PaddleOCR version actually reads this variable.
	os.environ['PADDLEOCR_LOG_LEVEL'] = 'ERROR'
	# NOTE(review): these look intended for headless servers (Qt offscreen platform,
	# X display :99) - confirm they are still needed; DISPLAY is overwritten unconditionally.
	os.environ['QT_QPA_PLATFORM'] = 'offscreen'
	os.environ['DISPLAY'] = ':99'

	# Import PaddleOCR after setting environment variables
	from paddleocr import PaddleOCR

	def clean_cache():
	    """
	    Remove the temporary files and the output directory left by earlier runs.

	    Each path that exists in the current working directory is deleted and a
	    DEBUG line is written to stderr; paths that do not exist are skipped.

	    Why this is needed:
	    - Temporary images and OCR artefacts would otherwise pile up on disk
	    - Every document should be processed from a clean state
	    """
	    import shutil

	    # Files written by the download step and (per the original comments) by PaddleOCR
	    stale_paths = (
	        'temp_image.jpg',                   # Downloaded image
	        'temp_image_ocr_res_img.jpg',       # OCR result visualization
	        'temp_image_preprocessed_img.jpg',  # Preprocessed image
	        'temp_image_res.json',              # OCR result JSON
	    )

	    for stale_path in stale_paths:
	        if not os.path.exists(stale_path):
	            continue
	        os.remove(stale_path)
	        print(f"DEBUG: Removed cached file: {stale_path}", file=sys.stderr)

	    # Drop the OCR output directory from previous runs together with its contents
	    output_dir = "output"
	    if not os.path.exists(output_dir):
	        return
	    shutil.rmtree(output_dir)
	    print("DEBUG: Removed output directory", file=sys.stderr)

	def download_image(url, output_path='temp_image.jpg'):
	    """
	    Download an image from a URL and save it locally as an RGB JPEG for OCR.

	    Args:
	        url (str): URL of the National ID image to process.
	        output_path (str): Local path where the converted JPEG is written.

	    Returns:
	        str: ``output_path``, once the JPEG has been written.

	    Raises:
	        requests.RequestException: If the request fails or times out, or if
	            ``raise_for_status()`` rejects the HTTP status code.
	        Exception: Whatever Pillow raises when the payload cannot be decoded
	            as an image or the JPEG cannot be written.

	    Why this approach:
	    - Images with an alpha channel are flattened onto white, because JPEG
	      cannot store transparency
	    - EXIF is carried over as the raw bytes Pillow exposes in ``image.info``;
	      ``Image.save(exif=...)`` expects bytes (or an ``Image.Exif``), not the
	      decoded dict that ``_getexif()`` returns
	    - EXIF preservation is best-effort: if the JPEG cannot be written with it,
	      the image is saved without it instead of failing the whole request
	    - Quality 95 keeps compression artefacts low for OCR
	    """
	    # Clean cache before downloading a new image to prevent conflicts
	    clean_cache()

	    # NOTE(security): `url` is caller-supplied and fetched as-is. If this runs
	    # behind a public API, restrict schemes/hosts upstream to avoid SSRF.
	    headers = {
	        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
	    }
	    response = requests.get(url, headers=headers, timeout=30)
	    response.raise_for_status()  # Raise exception for HTTP errors

	    # Decode once; the same object provides both the EXIF bytes and the pixels
	    image = Image.open(BytesIO(response.content))

	    # Raw EXIF payload, if the source format carried one
	    exif_bytes = image.info.get('exif')
	    if not isinstance(exif_bytes, (bytes, bytearray)) or not exif_bytes:
	        exif_bytes = None

	    # Flatten anything with transparency onto a white background; a plain
	    # convert('RGB') would simply drop the alpha channel.
	    has_alpha = image.mode in ('RGBA', 'LA') or (
	        image.mode == 'P' and 'transparency' in image.info
	    )
	    if has_alpha:
	        rgba = image.convert('RGBA')
	        background = Image.new('RGB', rgba.size, (255, 255, 255))
	        background.paste(rgba, mask=rgba.split()[-1])  # Alpha channel as mask
	        image = background
	    elif image.mode != 'RGB':
	        # Other modes (e.g. L for grayscale, CMYK) are converted directly
	        image = image.convert('RGB')

	    saved_with_exif = False
	    if exif_bytes is not None:
	        try:
	            image.save(output_path, 'JPEG', quality=95, exif=exif_bytes)
	            saved_with_exif = True
	        except (ValueError, TypeError, OSError) as exif_error:
	            # e.g. an EXIF block too large for a JPEG marker segment
	            print(f"DEBUG: Could not preserve EXIF data ({exif_error}); saving without it", file=sys.stderr)
	    if not saved_with_exif:
	        image.save(output_path, 'JPEG', quality=95)

	    print(f"DEBUG: Downloaded and saved image to: {output_path}", file=sys.stderr)
	    return output_path

	# Formatting Helpers
	def format_birth_date(date_str):
	    """
	    Format a birth date string from OCR as ISO ``YYYY-MM-DD``.

	    Args:
	        date_str (str): Raw date string from OCR, e.g. ``'JULY 10, 2003'``,
	            ``'JULY10,2003'`` or an already ISO-formatted ``'2003-07-10'``.

	    Returns:
	        str | None: The date as ``YYYY-MM-DD``; the original string unchanged
	        when it cannot be parsed as a real calendar date; ``None`` for empty
	        input.

	    Why this complexity:
	    - OCR often drops or adds spaces, so spaces are ignored while parsing
	    - OCR may read the comma after the day as a period, so both are accepted
	    - The month is recognised from its first three letters with a fixed
	      English table, so the result does not depend on the process locale
	    - Day/month/year are validated together so impossible dates such as
	      February 30 are not emitted as if they were valid ISO dates
	    """
	    if not date_str:
	        return None

	    month_numbers = {
	        'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6,
	        'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12,
	    }

	    # Parse a space-free copy; keep date_str itself for the fallback return
	    compact = date_str.replace(' ', '')

	    # '<month name><day>[,|.]<year>'; the lookahead rejects a 5th year digit
	    # instead of silently truncating it.
	    match = re.match(r'([A-Za-z]+)(\d{1,2})[,.]?(\d{4})(?!\d)', compact)
	    if match:
	        month_name, day_text, year_text = match.groups()
	        month = month_numbers.get(month_name[:3].lower())
	        if month is not None:
	            try:
	                parsed = datetime(int(year_text), month, int(day_text))
	            except ValueError:
	                # Not a real calendar date (e.g. day 31 in a 30-day month)
	                parsed = None
	            if parsed is not None:
	                return parsed.date().isoformat()

	    # Accept input that is already in ISO form (also normalises zero padding)
	    try:
	        return datetime.strptime(compact, '%Y-%m-%d').date().isoformat()
	    except ValueError:
	        pass

	    # Fallback: hand back exactly what was received
	    return date_str

	def capitalize_name(name):
	    """
	    Capitalize a name from OCR and split given names that were run together.

	    Args:
	        name (str): Raw name string from OCR.

	    Returns:
	        str: The name with single spaces between words and each word (and each
	        hyphen-separated part of a word) capitalized. Falsy input (``None`` or
	        ``''``) is returned unchanged.

	    Why this is needed:
	    - OCR output is usually all caps and may contain repeated spaces
	    - OCR may concatenate given names ("CARLMATTHEW" -> "Carl Matthew")

	    A word is only split when it is all caps, longer than five characters and
	    consists *entirely* of names from the known-name list. Splitting on a
	    known prefix alone would corrupt ordinary names ("CARLOS" -> "Carl Os",
	    "JOHNSON" -> "John Son"), so a word that cannot be fully covered is left
	    as one word.
	    """
	    if not name:
	        return name

	    formatted_words = []
	    for word in name.split():
	        parts = None
	        if word.isupper() and len(word) > 5:
	            parts = _split_concatenated_names(word)
	        for part in (parts or [word]):
	            # Capitalize around hyphens so "SANTOS-REYES" -> "Santos-Reyes"
	            formatted_words.append(
	                '-'.join(piece.capitalize() for piece in part.split('-'))
	            )

	    return ' '.join(formatted_words)


	# Given names recognised when un-gluing concatenated names
	_KNOWN_GIVEN_NAMES = frozenset((
	    'CARL', 'MATTHEW', 'JOHN', 'MARK', 'LUKE', 'PAUL', 'MARIA', 'JOSE',
	    'JUAN', 'PEDRO', 'MIGUEL', 'ANGEL', 'LUIS', 'CARLOS', 'MARCO', 'ANDRE',
	    'ALBERT', 'JOY',
	))


	def _split_concatenated_names(word):
	    """Return the known names that exactly tile ``word`` (two or more), else None."""
	    # segmentations[k] holds one way to write word[:k] as a sequence of known names
	    segmentations = {0: []}
	    for end in range(1, len(word) + 1):
	        for start in sorted(segmentations):
	            candidate = word[start:end]
	            if candidate in _KNOWN_GIVEN_NAMES:
	                segmentations[end] = segmentations[start] + [candidate]
	                break
	    parts = segmentations.get(len(word))
	    if parts and len(parts) > 1:
	        return parts
	    return None

	# OCR Function
	def extract_id_info(lines):
	    """
	    Extract structured information from OCR text lines.

	    Args:
	        lines (list): Text lines from OCR processing, in reading order.
	            Entries that are not strings are ignored.

	    Returns:
	        dict: ``id_type`` (always ``'National ID'``), ``id_number``,
	        ``full_name`` and ``birth_date``. A field that was not found is
	        ``None``.

	    How fields are located:
	    - ID number: a line that, ignoring whitespace, is exactly four groups of
	      four digits joined by hyphens (16 digits, 19 characters)
	    - Last name / given names: the line directly after a label line that
	      contains both the Filipino and the English label text
	    - Birth date: the first date-looking line within three lines after the
	      date-of-birth label, skipping lines that look like other labels

	    NOTE: the DEBUG output below writes the extracted personal data to stderr.
	    """
	    print("DEBUG: Processing lines:", lines, file=sys.stderr)

	    id_number_pattern = re.compile(r'\d{4}-\d{4}-\d{4}-\d{4}')
	    # Month name or abbreviation directly followed by a digit; applied to the
	    # upper-cased, space-free line so "JULY 10, 2003" and "July10,2003" both match.
	    month_date_pattern = re.compile(
	        r'(?:JAN(?:UARY)?|FEB(?:RUARY)?|MAR(?:CH)?|APR(?:IL)?|MAY|JUNE?|JULY?'
	        r'|AUG(?:UST)?|SEP(?:T(?:EMBER)?)?|OCT(?:OBER)?|NOV(?:EMBER)?|DEC(?:EMBER)?)'
	        r'[.,]?\d'
	    )
	    # "<day>, <year>" - catches dates whose month name was misread by OCR
	    numeric_date_pattern = re.compile(r'\d{1,2}[,.\s]+\d{4}')
	    other_label_keywords = ('DIGITAL', 'NUMBER', 'ADDRESS', 'TIRAHAN', 'ID')

	    id_number = None
	    last_name = None
	    given_names = None
	    birth_date = None

	    for i, line in enumerate(lines):
	        print(f"DEBUG: Processing line {i}: '{line}'", file=sys.stderr)
	        if not isinstance(line, str):
	            continue
	        line_upper = line.upper().replace(' ', '')

	        # National ID number: XXXX-XXXX-XXXX-XXXX (stray OCR whitespace tolerated)
	        compact_line = re.sub(r'\s+', '', line)
	        if id_number_pattern.fullmatch(compact_line):
	            id_number = compact_line
	            print(f"DEBUG: Found ID number: {id_number}", file=sys.stderr)

	        has_next_line = i + 1 < len(lines)

	        # Bilingual "Apelyido/Last Name" label. Both halves are required:
	        # 'APELYIDO' alone also occurs in the middle-name label.
	        if 'APELYIDO' in line_upper and 'LASTNAME' in line_upper and has_next_line:
	            last_name = lines[i + 1]
	            print(f"DEBUG: Found last name: {last_name}", file=sys.stderr)

	        # Bilingual "Mga Pangalan/Given Names" label
	        if 'PANGALAN' in line_upper and 'GIVENNAMES' in line_upper and has_next_line:
	            given_names = lines[i + 1]
	            print(f"DEBUG: Found given names: {given_names}", file=sys.stderr)

	        # Bilingual "Date of Birth" label
	        if 'KAPANGANAKAN' in line_upper or ('DATEOF' in line_upper and 'BIRTH' in line_upper):
	            # The value may be separated from its label by other labels
	            for candidate in lines[i + 1:i + 4]:
	                if not isinstance(candidate, str):
	                    continue
	                candidate_upper = candidate.upper().replace(' ', '')
	                if any(keyword in candidate_upper for keyword in other_label_keywords):
	                    continue
	                if month_date_pattern.search(candidate_upper) or numeric_date_pattern.search(candidate):
	                    birth_date = candidate
	                    print(f"DEBUG: Found birth date: {birth_date}", file=sys.stderr)
	                    break

	    # Compose the full name as: Given Names + Last Name
	    name_parts = [
	        part for part in (given_names, last_name)
	        if isinstance(part, str) and part.strip()
	    ]
	    # None (not '') when no name was found, matching the other missing fields
	    full_name = capitalize_name(' '.join(name_parts)) or None

	    # Format birth date to ISO standard
	    formatted_birth_date = format_birth_date(birth_date) if birth_date else None

	    result = {
	        'id_type': 'National ID',
	        'id_number': id_number,
	        'full_name': full_name,
	        'birth_date': formatted_birth_date
	    }

	    print("DEBUG: Final result:", json.dumps(result, indent=2), file=sys.stderr)
	    return result

	def extract_ocr_lines(image_path):
	    """
	    Run PaddleOCR on an image and turn the recognised text into ID fields.

	    Args:
	        image_path (str): Path to the image file.

	    Returns:
	        dict: The result of ``extract_id_info`` for the recognised lines, or a
	        dict with ``id_type`` set and every other field ``None`` when no text
	        was collected.

	    Notes:
	    - The engine is created with document-orientation classification,
	      unwarping and text-line orientation switched off, and ``lang='en'``
	    - Anything the engine prints while it is built and run goes to stderr
	    - Two result shapes are handled: an object whose type name contains
	      ``OCRResult`` (text read from its ``rec_texts`` entry) and a list of
	      ``[box, (text, ...)]`` entries
	    - Errors while reading the results are logged to stderr and leave the
	      text collected so far in place
	    """
	    # Make sure the directory for OCR output exists
	    os.makedirs("output", exist_ok=True)

	    with redirect_stdout(sys.stderr), redirect_stderr(sys.stderr):
	        engine = PaddleOCR(
	            use_doc_orientation_classify=False,  # Disable for better performance
	            use_doc_unwarping=False,  # Disable for better performance
	            use_textline_orientation=False,  # Disable for better performance
	            lang='en'  # English language
	        )
	        results = engine.ocr(image_path)

	    all_text = []
	    try:
	        has_results = results and isinstance(results, list) and len(results) > 0
	        if has_results:
	            head = results[0]
	            head_type = type(head)
	            item_type_name = head_type.__name__

	            if 'OCRResult' in item_type_name or 'ocr_result' in str(head_type).lower():
	                print(f"DEBUG: Detected OCRResult object format (type: {item_type_name})", file=sys.stderr)
	                # Mapping-style result: the recognised strings live under 'rec_texts'
	                try:
	                    rec_texts = dict(head).get('rec_texts') if hasattr(head, 'keys') else None
	                    if isinstance(rec_texts, list):
	                        all_text = [str(text) for text in rec_texts if text]
	                        print(f"DEBUG: Extracted {len(all_text)} text lines from rec_texts", file=sys.stderr)
	                except Exception as e:
	                    print(f"DEBUG: Error accessing OCRResult: {e}", file=sys.stderr)
	            else:
	                # List-style result: one level of nesting per page is unwrapped
	                entries = head if isinstance(head, list) else results
	                for entry in entries:
	                    if not isinstance(entry, (list, tuple)) or len(entry) < 2:
	                        continue
	                    meta = entry[1]
	                    if isinstance(meta, (list, tuple)) and len(meta) >= 1:
	                        all_text.append(str(meta[0]))
	    except Exception as e:
	        print(f"DEBUG: Error processing OCR results: {str(e)}", file=sys.stderr)

	    print(f"DEBUG: Extracted text lines: {all_text}", file=sys.stderr)
	    if not all_text:
	        return {'id_type': 'National ID', 'id_number': None, 'full_name': None, 'birth_date': None}
	    return extract_id_info(all_text)

	# Main execution
	if __name__ == "__main__":
	    # Command-line entry point: takes the image URL as the first argument and
	    # writes exactly one JSON document to the real stdout. All diagnostics go
	    # to stderr. Exit status is 0 on success and 1 on any error.

	    # Validate command line arguments
	    if len(sys.argv) < 2:
	        # sys.stdout is aliased to stderr at this point, so a bare print()
	        # would never reach the caller; write to the saved real stdout.
	        original_stdout.write(json.dumps({"error": "No image URL provided"}))
	        original_stdout.flush()
	        sys.exit(1)

	    image_url = sys.argv[1]
	    print(f"DEBUG: Processing image URL: {image_url}", file=sys.stderr)

	    try:
	        # Download and process the image
	        image_path = download_image(image_url)

	        # Perform OCR and extract information
	        ocr_results = extract_ocr_lines(image_path)

	        # Restore stdout and print only the JSON response
	        sys.stdout = original_stdout
	        sys.stdout.write(json.dumps({
	            "success": True,
	            "ocr_results": ocr_results
	        }))
	        sys.stdout.flush()

	    except Exception as e:
	        # Restore stdout for error JSON
	        sys.stdout = original_stdout
	        sys.stdout.write(json.dumps({"error": str(e)}))
	        sys.stdout.flush()
	        sys.exit(1)
	    finally:
	        # Best-effort cleanup: a failure here must not change the response or
	        # the exit status, but it is reported instead of silently dropped.
	        try:
	            clean_cache()
	        except Exception as cleanup_error:
	            print(f"DEBUG: Cache cleanup failed: {cleanup_error}", file=sys.stderr)