#!/usr/bin/env python3
"""
Philippine PRC (Professional Regulation Commission) License Information Extraction Script
Purpose:
Extracts structured information from PRC license images using OCR.
Handles various PRC license formats including UMID-style cards.
Why this script exists:
- PRC licenses have complex layouts with many information fields
- Profession-specific details must be extracted
- Both traditional PRC licenses and UMID-style PRC cards must be handled
- Needed for professional verification workflows
Key Features:
- Extracts CRN (Common Reference Number) - 12-digit format
- Processes registration numbers and dates
- Extracts profession information
- Handles GSIS/SSS number extraction
- Supports validity date tracking
Dependencies:
- PaddleOCR: High-accuracy OCR engine (https://github.com/PaddlePaddle/PaddleOCR)
- Pillow (PIL): Image processing (https://pillow.readthedocs.io/)
- requests: HTTP library (https://docs.python-requests.org/)
Usage:
python extract_prc.py "https://example.com/prc_license.jpg"
Output:
JSON with extracted information: crn, registration_number, profession, valid_until, etc.
"""
import sys, json, os, re, requests
from PIL import Image
from io import BytesIO
from datetime import datetime
from contextlib import redirect_stdout, redirect_stderr
# Immediately redirect all output to stderr except for our final JSON
original_stdout = sys.stdout
sys.stdout = sys.stderr
# Suppress PaddleOCR logging and force headless (offscreen) rendering
os.environ['PADDLEOCR_LOG_LEVEL'] = 'ERROR'
os.environ['QT_QPA_PLATFORM'] = 'offscreen'
os.environ['DISPLAY'] = ':99'
# Import PaddleOCR after setting environment variables
from paddleocr import PaddleOCR
def dprint(msg, obj=None):
"""
Debug print function that safely handles object serialization.
Args:
msg (str): Debug message
obj (any): Object to print (optional)
Why this approach:
- Centralized debug logging
- Safe object serialization
- Consistent debug output format
"""
try:
print(f"DEBUG: {msg}" + (f": {obj}" if obj is not None else ""), file=sys.stderr)
except Exception:
pass
def clean_cache():
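    """
    Remove temporary files left over from a previous run: the downloaded
    image, PaddleOCR's intermediate artifacts, and the output/ directory.
    Keeps stale results from leaking into the current request.
    """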
cache_files = ['temp_image.jpg', 'temp_image_ocr_res_img.jpg', 'temp_image_preprocessed_img.jpg', 'temp_image_res.json']
for f in cache_files:
if os.path.exists(f):
os.remove(f)
dprint("Removed cache file", f)
if os.path.exists("output"):
import shutil
shutil.rmtree("output")
dprint("Removed output directory")
def download_image(url, output_path='temp_image.jpg'):
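    """
    Download the image at `url` and save it locally as a JPEG.

    RGBA images are composited onto a white background and other modes are
    converted to RGB so the OCR engine always receives a 3-channel image.
    Returns the local file path (default: temp_image.jpg).
    """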
dprint("Starting download", url)
clean_cache()
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
r = requests.get(url, headers=headers, timeout=30)
dprint("HTTP status", r.status_code)
r.raise_for_status()
img = Image.open(BytesIO(r.content))
if img.mode == 'RGBA':
bg = Image.new('RGB', img.size, (255,255,255))
bg.paste(img, mask=img.split()[-1])
img = bg
elif img.mode != 'RGB':
img = img.convert('RGB')
img.save(output_path, 'JPEG', quality=95)
dprint("Saved image", output_path)
return output_path
def format_date(s):
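    """
    Normalize an OCR-read date string to YYYY-MM-DD.

    Handled inputs (illustrative): "2025-07-31", "2025/07/31", "07/31/2025",
    "July 31, 2025", "Jul 31, 2025". Strings that match no known pattern are
    returned unchanged so callers can still surface the raw value.
    """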
if not s: return None
raw = s.strip()
t = raw.replace(' ', '').replace('\\','/').replace('.','/')
if re.match(r'^\d{4}[-/]\d{2}[-/]\d{2}$', t):
return t.replace('/', '-')
# Accept mm/dd/yyyy style
if re.match(r'^\d{2}/\d{2}/\d{4}$', raw):
m, d, y = raw.split('/')
return f"{y}-{int(m):02d}-{int(d):02d}"
    # Month-name variants, e.g. "January 5, 2025" or "Jan 5, 2025"
    m = re.match(r'([A-Za-z]+)\s*\d{1,2},\s*\d{4}', raw)
    if m:
        normalized = re.sub(r'\s+', ' ', raw)  # collapse irregular OCR spacing
        for fmt in ("%B %d, %Y", "%b %d, %Y"):
            try:
                return datetime.strptime(normalized, fmt).strftime("%Y-%m-%d")
            except Exception:
                pass
return raw
def cap_words(name):
return None if not name else ' '.join(w.capitalize() for w in name.split())
def normalize_name_from_parts(last, first_block):
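    """
    Compose "given name(s) + last name" from the separately OCR'd name parts,
    keeping at most two given names. Illustrative example (made-up names):
    normalize_name_from_parts("DELA CRUZ", "JUAN MIGUEL CARLOS")
    returns "Juan Miguel Dela Cruz".
    """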
last = (last or '').strip()
tokens = [t for t in (first_block or '').strip().split(' ') if t]
given_kept = tokens[:2] # keep up to two given names
composed = ' '.join(given_kept + [last]).strip()
return cap_words(composed) if composed else None
def normalize_full_name_from_three(first, middle, last):
# keep first + optional second from "first" block; ignore middle completely
tokens = [t for t in (first or '').strip().split(' ') if t]
given_kept = tokens[:2]
composed = ' '.join(given_kept + [last or '']).strip()
return cap_words(composed) if composed else None
def take_within(lines, i, k=5):
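    """
    Return the non-empty lines among the next k lines after index i. This is
    the small lookahead window used to find the value that follows a label.
    """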
out = []
for j in range(1, k+1):
if i+j < len(lines):
t = str(lines[i+j]).strip()
if t:
out.append(t)
return out
def is_numeric_id(t):
return bool(re.match(r'^\d{5,}$', str(t).replace(' ', '')))
def is_crn(t):
# UMID CRN commonly 12 digits
return bool(re.match(r'^\d{12}$', t.replace(' ', '')))
def is_date(t):
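    # Accepts yyyy-mm-dd / yyyy/mm/dd, mm/dd/yyyy, and "Month D, YYYY" forms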
t1 = t.replace(' ', '').replace('\\','/').replace('.','/')
return bool(re.match(r'^\d{4}[-/]\d{2}[-/]\d{2}$', t1)) or bool(re.match(r'^\d{2}/\d{2}/\d{4}$', t)) or bool(re.match(r'^[A-Za-z]+\s*\d{1,2},\s*\d{4}$', t))
def extract_prc_info(lines):
"""
Extract PRC license information from OCR text lines.
Args:
lines (list): List of text lines from OCR processing
Returns:
dict: Extracted PRC information with keys: crn, registration_number, profession, etc.
Why this approach:
- PRC licenses have complex layouts with multiple fields
- Need to handle various license formats (traditional and UMID-style)
- Extracts profession-specific information
- Handles both traditional PRC licenses and UMID-style PRC cards
- Uses lookahead pattern matching for field extraction
"""
dprint("Lines to extract", lines)
# Initialize variables for extracted information
crn = None
full_name = None
birth_date = None
gsis_number = None
sss_number = None
registration_number = None
registration_date = None
valid_until = None
profession = None
# Collect name parts separately for composition
last_name_txt = None
first_name_txt = None
L = [str(x or '').strip() for x in lines]
i = 0
while i < len(L):
line = L[i]
low = line.lower()
dprint("Line", {"i": i, "text": line})
# Extract CRN (UMID format) - 12 digits
if crn is None and is_crn(line):
crn = line.replace(' ', '')
dprint("Found CRN", crn)
# Extract Last Name using lookahead pattern
if 'last name' in low:
ahead = take_within(L, i, 3)
for t in ahead:
tl = t.lower()
if not any(k in tl for k in ['first', 'middle', 'registration', 'valid', 'date', 'no']):
last_name_txt = t
break
# Extract First Name
if 'firstname' in low or 'first name' in low:
if i+1 < len(L):
first_name_txt = L[i+1]
# Extract Date of Birth
if ('date of birth' in low) or ('birth' in low and 'date' in low):
ahead = take_within(L, i, 4)
for t in ahead:
if is_date(t):
birth_date = format_date(t)
break
# Extract Registration Number - handles split labels
if low == 'registration' and i+1 < len(L) and L[i+1].lower() in ('no', 'no.', 'number'):
ahead = take_within(L, i+1, 4)
for t in ahead:
if is_numeric_id(t):
registration_number = t.replace(' ', '')
break
# Also handle fused label forms
if ('registration no' in low) or ('registration number' in low):
ahead = take_within(L, i, 4)
for t in ahead:
if is_numeric_id(t):
registration_number = t.replace(' ', '')
break
# Extract Registration Date
if low == 'registration' and i+1 < len(L) and L[i+1].lower() == 'date':
ahead = take_within(L, i+1, 4)
for t in ahead:
if is_date(t):
registration_date = format_date(t)
break
if 'registration date' in low:
ahead = take_within(L, i, 3)
for t in ahead:
if is_date(t):
registration_date = format_date(t)
break
# Extract Valid Until Date
if 'valid until' in low or 'validity' in low:
ahead = take_within(L, i, 3)
for t in ahead:
if is_date(t):
valid_until = format_date(t)
break
        # Extract profession from lines containing known profession keywords
if any(k in low for k in ['occupational','technician','engineer','teacher','nurse']):
if len(line.split()) >= 2:
profession = cap_words(line)
dprint("Found profession", profession)
# Extract SSS Number
if sss_number is None and ('sss' in low or 'social security' in low):
ahead = take_within(L, i, 3)
for t in ahead:
if is_numeric_id(t):
sss_number = t.replace(' ', '')
dprint("Found sss_number", sss_number)
break
# Extract GSIS Number
if gsis_number is None and ('gsis' in low):
ahead = take_within(L, i, 3)
for t in ahead:
if is_numeric_id(t):
gsis_number = t.replace(' ', '')
dprint("Found gsis_number", gsis_number)
break
i += 1
# Compose full name from parts
if full_name is None:
full_name = normalize_name_from_parts(last_name_txt, first_name_txt)
# Return structured result
result = {
"id_type": "PRC ID",
"crn": crn,
"id_number": registration_number or crn, # Frontend expects id_number
"registration_number": registration_number,
"registration_date": registration_date,
"valid_until": valid_until,
"full_name": full_name,
"birth_date": birth_date,
"sss_number": sss_number,
"gsis_number": gsis_number,
"profession": profession
}
dprint("Final result", result)
return result
def extract_ocr_lines(image_path):
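    """
    Run PaddleOCR on the image at image_path and pass the recognized text
    lines to extract_prc_info(). Returns a minimal PRC payload with null
    fields when no text is recognized.
    """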
os.makedirs("output", exist_ok=True)
dprint("Initializing PaddleOCR")
with redirect_stdout(sys.stderr), redirect_stderr(sys.stderr):
ocr = PaddleOCR(
use_doc_orientation_classify=False,
use_doc_unwarping=False,
use_textline_orientation=False,
lang='en'
)
dprint("OCR initialized")
dprint("Running OCR predict", image_path)
results = ocr.predict(image_path)
dprint("OCR predict done, results_count", len(results))
    # Process OCR results. Newer PaddleOCR (3.x) pipelines return result
    # objects that expose the recognized strings under "rec_texts"; older
    # versions return (box, (text, score)) tuples. Handle both.
    all_text = []
    try:
        for res in (results or []):
            rec_texts = getattr(res, 'rec_texts', None)
            if rec_texts is None and isinstance(res, dict):
                rec_texts = res.get('rec_texts')
            if rec_texts:
                all_text.extend(str(t) for t in rec_texts)
                continue
            items = res if isinstance(res, list) else [res]
            for item in items:
                if isinstance(item, (list, tuple)) and len(item) >= 2:
                    meta = item[1]
                    if isinstance(meta, (list, tuple)) and len(meta) >= 1:
                        all_text.append(str(meta[0]))
    except Exception as e:
        dprint("Error processing OCR results", str(e))
    dprint("All direct texts", all_text)
return extract_prc_info(all_text) if all_text else {
"id_type": "PRC ID",
"crn": None,
"full_name": None,
"birth_date": None
}
if len(sys.argv) < 2:
sys.stdout = original_stdout
print(json.dumps({"error": "No image URL provided"}))
sys.exit(1)
image_url = sys.argv[1]
dprint("Processing image URL", image_url)
try:
image_path = download_image(image_url)
dprint("Image downloaded to", image_path)
ocr_results = extract_ocr_lines(image_path)
dprint("OCR results ready")
# Restore stdout and print only the JSON response
sys.stdout = original_stdout
sys.stdout.write(json.dumps({"success": True, "ocr_results": ocr_results}))
sys.stdout.flush()
except Exception as e:
dprint("Exception", str(e))
# Restore stdout for error JSON
sys.stdout = original_stdout
sys.stdout.write(json.dumps({"error": str(e)}))
sys.stdout.flush()
sys.exit(1)
finally:
# Clean up
try:
clean_cache()
    except Exception:
pass