#!/usr/bin/env python3
"""
Philippine National ID Information Extraction Script

Purpose:
    Extracts structured information from Philippine National ID card images using OCR.
    This script is designed to work as a microservice for document verification systems.

Why this script exists:
    - Manual verification of National IDs is time-consuming and error-prone
    - Need standardized data extraction from bilingual (English/Filipino) ID cards
    - Required for automated document verification workflows
    - Supports API integration for web applications

Key Features:
    - Extracts the 16-digit National ID number (19 characters with hyphens, format: XXXX-XXXX-XXXX-XXXX)
    - Handles bilingual labels (English/Filipino)
    - Composes the full name from separate fields
    - Formats dates consistently (ISO format: YYYY-MM-DD)
    - Preserves EXIF metadata when possible

Dependencies:
    - PaddleOCR: High-accuracy OCR engine (https://github.com/PaddlePaddle/PaddleOCR)
    - Pillow (PIL): Image processing (https://pillow.readthedocs.io/)
    - requests: HTTP library for image downloads (https://docs.python-requests.org/)

Usage:
    python extract_national_id.py "https://example.com/national_id.jpg"

Output:
    JSON with extracted information: id_number, full_name, birth_date
"""
import sys, json, os, re, requests
from PIL import Image
from io import BytesIO
from datetime import datetime
from contextlib import redirect_stdout, redirect_stderr

# Immediately redirect all output to stderr; only the final JSON goes to stdout
original_stdout = sys.stdout
sys.stdout = sys.stderr

# Suppress PaddleOCR output and force headless rendering
os.environ['PADDLEOCR_LOG_LEVEL'] = 'ERROR'
os.environ['QT_QPA_PLATFORM'] = 'offscreen'
os.environ['DISPLAY'] = ':99'

# Import PaddleOCR after setting environment variables
from paddleocr import PaddleOCR
def clean_cache():
    """
    Delete all cached images and output files to prevent disk space issues.

    Why this is needed:
    - OCR processing creates temporary files that can accumulate
    - Prevents disk space exhaustion in production environments
    - Ensures a clean state for each new document processed
    """
    # Temporary files created by PaddleOCR and our processing
    cache_files = [
        'temp_image.jpg',                   # Downloaded image
        'temp_image_ocr_res_img.jpg',       # OCR result visualization
        'temp_image_preprocessed_img.jpg',  # Preprocessed image
        'temp_image_res.json'               # OCR result JSON
    ]

    # Delete individual cache files
    for file in cache_files:
        if os.path.exists(file):
            os.remove(file)
            print(f"DEBUG: Removed cached file: {file}", file=sys.stderr)

    # Delete the output directory and its contents
    # (removes any OCR output files from previous runs)
    if os.path.exists("output"):
        import shutil
        shutil.rmtree("output")
        print("DEBUG: Removed output directory", file=sys.stderr)
def download_image(url, output_path='temp_image.jpg'):
    """
    Download image from URL and process it for OCR.

    Args:
        url (str): URL of the National ID image to process
        output_path (str): Local path to save the downloaded image

    Returns:
        str: Path to the downloaded and processed image

    Why this approach:
    - Downloads images from URLs (common in web applications)
    - Converts RGBA to RGB (JPEG doesn't support alpha channels)
    - Preserves EXIF data when possible (important for document authenticity)
    - Uses high JPEG quality (95) to maintain OCR accuracy
    """
    # Clean cache before downloading a new image to prevent conflicts
    clean_cache()

    # Download the image with error handling and a browser-like User-Agent header
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()  # Raise an exception for HTTP errors
    image_data = response.content

    # First, try to extract the raw EXIF block from the original image.
    # EXIF data is important for document authenticity verification.
    # Note: Image.save() expects EXIF as bytes (or a PIL Exif object), so we read
    # the raw bytes from image.info rather than the dict returned by _getexif().
    original_exif = None
    try:
        original_image = Image.open(BytesIO(image_data))
        # Some formats don't carry EXIF; info.get() simply returns None then
        original_exif = original_image.info.get('exif')
    except Exception:
        # If EXIF extraction fails, continue without it
        pass

    # Now process the image for OCR
    image = Image.open(BytesIO(image_data))

    # Convert to RGB if necessary (JPEG doesn't support an alpha channel).
    # This matters for OCR accuracy, since PaddleOCR expects RGB images.
    if image.mode == 'RGBA':
        # Create a white background for transparent images
        background = Image.new('RGB', image.size, (255, 255, 255))
        # Paste the image onto the background using the alpha channel as mask
        background.paste(image, mask=image.split()[-1])
        image = background
    elif image.mode != 'RGB':
        # Convert other modes (e.g. 'L' for grayscale) to RGB
        image = image.convert('RGB')

    # Save as JPEG, preserving EXIF data if it existed.
    # Quality 95 maintains OCR accuracy while keeping file size reasonable.
    if original_exif:
        image.save(output_path, 'JPEG', quality=95, exif=original_exif)
    else:
        image.save(output_path, 'JPEG', quality=95)

    print(f"DEBUG: Downloaded and saved image to: {output_path}", file=sys.stderr)
    return output_path
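
# Illustrative check of the RGBA -> RGB flattening used in download_image, exercised
# on an in-memory test image so no network access is needed. Not called by the pipeline;
# kept only as a runnable reference for the conversion step above.
def _demo_rgba_flatten():
    rgba = Image.new('RGBA', (60, 30), (255, 0, 0, 128))         # semi-transparent red
    background = Image.new('RGB', rgba.size, (255, 255, 255))    # white backdrop
    background.paste(rgba, mask=rgba.split()[-1])                 # alpha channel as mask
    buffer = BytesIO()
    background.save(buffer, 'JPEG', quality=95)                   # JPEG only accepts RGB
    assert Image.open(BytesIO(buffer.getvalue())).mode == 'RGB'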
# Formatting Helpers

def format_birth_date(date_str):
    """
    Format a birth date string to ISO format (YYYY-MM-DD).

    Args:
        date_str (str): Raw date string from OCR

    Returns:
        str: Formatted date in YYYY-MM-DD format, or the original string if parsing fails

    Why this complexity:
    - OCR often produces inconsistent date formats
    - Philippine IDs use various date formats
    - Need standardized output for database storage
    - Handles common OCR errors like missing spaces
    """
    if not date_str:
        return None

    # Remove spaces first to normalize OCR inconsistencies, then try to match
    # formats like 'JULY10,2003' (originally 'JULY 10, 2003')
    date_str = date_str.replace(' ', '')
    match = re.match(r'([A-Za-z]+)(\d{1,2}),?(\d{4})', date_str)
    if match:
        month_str, day, year = match.groups()
        try:
            # Convert the month name to a number using its first 3 characters
            month = datetime.strptime(month_str[:3], '%b').month
            return f"{year}-{int(month):02d}-{int(day):02d}"
        except Exception:
            pass

    # Try datetime parsing for other possible formats.
    # Spaces were already stripped above, so only space-free patterns are needed.
    for fmt in ("%B%d,%Y", "%b%d,%Y", "%Y-%m-%d"):
        try:
            dt = datetime.strptime(date_str, fmt)
            return dt.strftime("%Y-%m-%d")
        except Exception:
            continue

    # Fall back to the original (space-stripped) string if parsing fails
    return date_str
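
# Illustrative reference for the parsing rules above (not called by the pipeline).
# The sample strings below are hypothetical OCR outputs, not data from a real ID.
def _demo_format_birth_date():
    assert format_birth_date("JULY 10, 2003") == "2003-07-10"   # spaced month name
    assert format_birth_date("JULY10,2003") == "2003-07-10"     # OCR dropped the spaces
    assert format_birth_date("2003-07-10") == "2003-07-10"      # already ISO
    assert format_birth_date("UNREADABLE") == "UNREADABLE"      # unparseable -> returned as-is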
def capitalize_name(name):
    """
    Properly capitalize a name string and split concatenated names.

    Args:
        name (str): Raw name string from OCR

    Returns:
        str: Properly capitalized name with spaces between name parts

    Why this is needed:
    - OCR often produces inconsistent capitalization
    - OCR may concatenate multiple given names without spaces
    - Need standardized name format for database storage
    - Handles multiple spaces and OCR artifacts
    """
    if not name:
        return name

    # Split by existing spaces first
    words = name.split()
    processed_words = []

    for word in words:
        # An all-caps word longer than 5 letters might be multiple names concatenated,
        # e.g. "CARLMATTHEW" -> "Carl Matthew"
        if word.isupper() and len(word) > 5:
            # Walk through the word and split whenever the accumulated prefix
            # (at least 3 letters, and not the final character) matches a known
            # given name. Common Filipino/Western given names are 3-7 letters.
            parts = []
            current = []
            for i, char in enumerate(word):
                current.append(char)
                # Only consider splitting if more characters remain
                # and the accumulated part is at least 3 letters long
                if i < len(word) - 1 and len(current) >= 3:
                    current_str = ''.join(current)
                    # Split if the accumulated part is a known given name
                    if current_str.upper() in ['CARL', 'MATTHEW', 'JOHN', 'MARK', 'LUKE', 'PAUL', 'MARIA', 'JOSE', 'JUAN', 'PEDRO', 'MIGUEL', 'ANGEL', 'LUIS', 'CARLOS', 'MARCO', 'ANDRE', 'ALBERT', 'JOY']:
                        parts.append(current_str)
                        current = []
            # Add whatever is left after the last split
            if current:
                parts.append(''.join(current))
            # If we found multiple parts, use them; otherwise keep the original word
            if len(parts) > 1:
                processed_words.extend([p.capitalize() for p in parts])
            else:
                processed_words.append(word.capitalize())
        else:
            processed_words.append(word.capitalize())

    return ' '.join(processed_words)
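
# Illustrative reference for the splitting heuristic above (not called by the pipeline).
# The sample names are hypothetical and only exercise entries in the known-name list.
def _demo_capitalize_name():
    assert capitalize_name("CARLMATTHEW") == "Carl Matthew"       # known names split apart
    assert capitalize_name("JUAN DELA CRUZ") == "Juan Dela Cruz"  # already-spaced parts kept
    assert capitalize_name("DELACRUZ") == "Delacruz"              # unknown compound left whole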
# OCR Function

def extract_id_info(lines):
    """
    Extract structured information from OCR text lines.

    Args:
        lines (list): List of text lines from OCR processing

    Returns:
        dict: Extracted information with keys: id_number, full_name, birth_date

    Why this approach:
    - Philippine National IDs have specific format requirements
    - The ID number is always 19 characters (16 digits plus 3 hyphens)
    - Uses bilingual labels (English/Filipino) for field identification
    - Handles cases where labels and values are on separate lines
    """
    print("DEBUG: Processing lines:", lines, file=sys.stderr)

    # Initialize variables for extracted information
    id_number = None
    last_name = None
    given_names = None
    birth_date = None

    # Process each line to find relevant information
    for i in range(len(lines)):
        line = lines[i]
        line_upper = line.upper().replace(' ', '') if isinstance(line, str) else ''
        print(f"DEBUG: Processing line {i}: '{line}'", file=sys.stderr)

        # Check for the National ID number format: XXXX-XXXX-XXXX-XXXX
        # (the standard layout printed on Philippine National IDs)
        if isinstance(line, str) and len(line) == 19 and line.count('-') == 3 and all(part.isdigit() for part in line.split('-')):
            id_number = line
            print(f"DEBUG: Found ID number: {id_number}", file=sys.stderr)

        # Look for the bilingual "Last Name" label (flexible matching).
        # Philippine IDs carry both English and Filipino labels.
        if ('APELYIDO' in line_upper and 'LASTNAME' in line_upper) and i + 1 < len(lines):
            last_name = lines[i + 1]
            print(f"DEBUG: Found last name: {last_name}", file=sys.stderr)

        # Look for the bilingual "Given Names" label (flexible matching)
        if ('PANGALAN' in line_upper and 'GIVENNAMES' in line_upper) and i + 1 < len(lines):
            given_names = lines[i + 1]
            print(f"DEBUG: Found given names: {given_names}", file=sys.stderr)

        # Look for the bilingual "Date of Birth" label (flexible matching)
        if 'KAPANGANAKAN' in line_upper or ('DATEOF' in line_upper and 'BIRTH' in line_upper):
            # Look ahead for the actual date value (skip any labels)
            for j in range(i + 1, min(i + 4, len(lines))):
                next_line = lines[j]
                next_upper = next_line.upper().replace(' ', '') if isinstance(next_line, str) else ''
                # Skip if it's another label
                if any(keyword in next_upper for keyword in ['DIGITAL', 'NUMBER', 'ADDRESS', 'TIRAHAN', 'ID']):
                    continue
                # Accept the line if it looks like a date: it contains a month name
                # (matched case-insensitively, since OCR output is often all caps)
                # or a day/year digit pattern
                months = ['JANUARY', 'FEBRUARY', 'MARCH', 'APRIL', 'MAY', 'JUNE', 'JULY', 'AUGUST', 'SEPTEMBER', 'OCTOBER', 'NOVEMBER', 'DECEMBER', 'JAN', 'FEB', 'MAR', 'APR', 'JUN', 'JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC']
                if any(month in next_upper for month in months) or re.search(r'\d{1,2}[,.\s]+\d{4}', next_line):
                    birth_date = next_line
                    print(f"DEBUG: Found birth date: {birth_date}", file=sys.stderr)
                    break

    # Compose the full name from separate fields.
    # Philippine names typically follow: Given Names + Last Name.
    name_parts = [given_names, last_name]
    full_name = ' '.join([part for part in name_parts if part])
    full_name = capitalize_name(full_name)

    # Format the birth date to the ISO standard
    formatted_birth_date = format_birth_date(birth_date) if birth_date else None

    # Return the structured result
    result = {
        'id_type': 'National ID',
        'id_number': id_number,
        'full_name': full_name,
        'birth_date': formatted_birth_date
    }
    print("DEBUG: Final result:", json.dumps(result, indent=2), file=sys.stderr)
    return result
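
# Illustrative reference for the line-matching logic above (not called by the pipeline).
# The lines below are a hypothetical OCR transcript; the ID number is a placeholder
# in the XXXX-XXXX-XXXX-XXXX layout, not a real PhilSys number.
def _demo_extract_id_info():
    sample_lines = [
        "Republika ng Pilipinas",
        "Apelyido/Last Name",
        "DELA CRUZ",
        "Mga Pangalan/Given Names",
        "JUAN",
        "Petsa ng Kapanganakan/Date of Birth",
        "JULY 10, 2003",
        "1234-5678-9012-3456",
    ]
    assert extract_id_info(sample_lines) == {
        'id_type': 'National ID',
        'id_number': '1234-5678-9012-3456',
        'full_name': 'Juan Dela Cruz',
        'birth_date': '2003-07-10',
    }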
def extract_ocr_lines(image_path):
    """
    Perform OCR on an image and extract text lines.

    Args:
        image_path (str): Path to the image file

    Returns:
        dict: Extracted information from the image

    Why these OCR settings:
    - Disabled expensive features for better performance in production
    - English language setting (Philippine IDs primarily use English)
    - Suppressed logs to keep output clean for API responses
    - Offscreen rendering for server environments
    """
    # Ensure the output directory exists for OCR results
    os.makedirs("output", exist_ok=True)

    # Initialize PaddleOCR with settings that balance accuracy and performance
    with redirect_stdout(sys.stderr), redirect_stderr(sys.stderr):
        ocr = PaddleOCR(
            use_doc_orientation_classify=False,  # Disable for better performance
            use_doc_unwarping=False,             # Disable for better performance
            use_textline_orientation=False,      # Disable for better performance
            lang='en'                            # English language
        )
        results = ocr.ocr(image_path)

    # Process OCR results directly
    all_text = []
    try:
        # Handle both the old format (list of lists) and the new format (OCRResult object)
        if results and isinstance(results, list) and len(results) > 0:
            first_item = results[0]
            item_type_name = type(first_item).__name__
            is_ocr_result = 'OCRResult' in item_type_name or 'ocr_result' in str(type(first_item)).lower()

            if is_ocr_result:
                print(f"DEBUG: Detected OCRResult object format (type: {item_type_name})", file=sys.stderr)
                # Access the OCRResult like a dictionary
                try:
                    if hasattr(first_item, 'keys'):
                        ocr_dict = dict(first_item)
                        # Look for the rec_texts key
                        if 'rec_texts' in ocr_dict:
                            rec_texts = ocr_dict['rec_texts']
                            if isinstance(rec_texts, list):
                                all_text = [str(t) for t in rec_texts if t]
                                print(f"DEBUG: Extracted {len(all_text)} text lines from rec_texts", file=sys.stderr)
                except Exception as e:
                    print(f"DEBUG: Error accessing OCRResult: {e}", file=sys.stderr)
            else:
                # Old format - list of [box, (text, confidence)] entries
                lines = results[0] if results and isinstance(results[0], list) else results
                for item in lines:
                    if isinstance(item, (list, tuple)) and len(item) >= 2:
                        meta = item[1]
                        if isinstance(meta, (list, tuple)) and len(meta) >= 1:
                            all_text.append(str(meta[0]))
    except Exception as e:
        print(f"DEBUG: Error processing OCR results: {str(e)}", file=sys.stderr)

    print(f"DEBUG: Extracted text lines: {all_text}", file=sys.stderr)
    return extract_id_info(all_text) if all_text else {'id_type': 'National ID', 'id_number': None, 'full_name': None, 'birth_date': None}
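
# Illustrative sketch of how a caller (e.g. a web backend) might invoke this script
# as a subprocess and parse the single JSON document it writes to stdout. The file
# name "extract_national_id.py" mirrors the usage string in the module docstring;
# adjust it to the actual deployment path. Not called anywhere in this script.
def _demo_cli_integration(image_url):
    import subprocess
    proc = subprocess.run(
        [sys.executable, "extract_national_id.py", image_url],
        capture_output=True, text=True, timeout=120
    )
    # stdout carries only the final JSON payload; DEBUG logging goes to stderr
    return json.loads(proc.stdout)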
# Main execution
if __name__ == "__main__":
    # Validate command line arguments
    if len(sys.argv) < 2:
        # Restore stdout so the error JSON reaches the caller (stdout was redirected above)
        sys.stdout = original_stdout
        print(json.dumps({"error": "No image URL provided"}))
        sys.exit(1)

    image_url = sys.argv[1]
    print(f"DEBUG: Processing image URL: {image_url}", file=sys.stderr)

    try:
        # Download and process the image
        image_path = download_image(image_url)

        # Perform OCR and extract information
        ocr_results = extract_ocr_lines(image_path)

        # Restore stdout and print only the JSON response
        sys.stdout = original_stdout
        sys.stdout.write(json.dumps({
            "success": True,
            "ocr_results": ocr_results
        }))
        sys.stdout.flush()
    except Exception as e:
        # Restore stdout for the error JSON
        sys.stdout = original_stdout
        sys.stdout.write(json.dumps({"error": str(e)}))
        sys.stdout.flush()
        sys.exit(1)
    finally:
        # Clean up temporary files; never let cleanup errors mask the result
        try:
            clean_cache()
        except Exception:
            pass