Spaces:
Sleeping
Sleeping
File size: 12,930 Bytes
db10255 7908d00 db10255 6916300 db10255 7908d00 db10255 7908d00 db10255 6916300 db10255 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 |
#!/usr/bin/env python3
"""
Philippine PRC (Professional Regulation Commission) License Information Extraction Script
Purpose:
Extracts structured information from PRC license images using OCR.
Handles various PRC license formats including UMID-style cards.
Why this script exists:
- PRC licenses have complex layouts with multiple information fields
- Need to extract profession-specific information
- Handles both traditional PRC licenses and UMID-style PRC cards
- Required for professional verification workflows
Key Features:
- Extracts CRN (Common Reference Number) - 12-digit format
- Processes registration numbers and dates
- Extracts profession information
- Handles GSIS/SSS number extraction
- Supports validity date tracking
Dependencies:
- PaddleOCR: High-accuracy OCR engine (https://github.com/PaddlePaddle/PaddleOCR)
- Pillow (PIL): Image processing (https://pillow.readthedocs.io/)
- requests: HTTP library (https://docs.python-requests.org/)
Usage:
python extract_prc.py "https://example.com/prc_license.jpg"
Output:
JSON with extracted information: crn, registration_number, profession, valid_until, etc.
"""
import sys, json, os, glob, re, requests
from PIL import Image
from io import BytesIO
from datetime import datetime
from contextlib import redirect_stdout, redirect_stderr
# Immediately redirect all output to stderr except for our final JSON.
# The caller parses stdout as JSON, so any library chatter on stdout would
# corrupt the response; original_stdout is restored just before emitting JSON.
original_stdout = sys.stdout
sys.stdout = sys.stderr
# Suppress all PaddleOCR output and force headless operation.
os.environ['PADDLEOCR_LOG_LEVEL'] = 'ERROR'
os.environ['QT_QPA_PLATFORM'] = 'offscreen'  # no GUI backend available server-side
os.environ['DISPLAY'] = ':99'  # dummy display for libraries that require one
# Import PaddleOCR after setting environment variables so the settings
# above take effect during its import-time initialization.
from paddleocr import PaddleOCR
def dprint(msg, obj=None):
    """Emit a debug line on stderr; swallows all errors so logging
    can never break the extraction pipeline.

    Args:
        msg (str): Debug message.
        obj (any): Optional payload appended after the message.
    """
    try:
        suffix = "" if obj is None else f": {obj}"
        print(f"DEBUG: {msg}{suffix}", file=sys.stderr)
    except Exception:
        # Logging is best-effort only; never propagate.
        pass
def clean_cache():
    """Remove temp images/JSON and the OCR output directory left by a prior run."""
    leftovers = (
        'temp_image.jpg',
        'temp_image_ocr_res_img.jpg',
        'temp_image_preprocessed_img.jpg',
        'temp_image_res.json',
    )
    for path in leftovers:
        if os.path.exists(path):
            os.remove(path)
            dprint("Removed cache file", path)
    if os.path.exists("output"):
        import shutil
        shutil.rmtree("output")
        dprint("Removed output directory")
def download_image(url, output_path='temp_image.jpg'):
    """Download an image, flatten any transparency onto white, save as JPEG.

    Args:
        url (str): Image URL to fetch.
        output_path (str): Destination JPEG path (default 'temp_image.jpg').

    Returns:
        str: Path of the saved JPEG.

    Raises:
        requests.HTTPError: When the server responds with an error status.
    """
    dprint("Starting download", url)
    clean_cache()
    # A browser-like User-Agent avoids trivial bot blocking on some hosts.
    ua = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
          '(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36')
    response = requests.get(url, headers={'User-Agent': ua}, timeout=30)
    dprint("HTTP status", response.status_code)
    response.raise_for_status()
    image = Image.open(BytesIO(response.content))
    if image.mode == 'RGBA':
        # Composite the alpha channel over a white background so JPEG
        # (which has no alpha) renders cleanly.
        flattened = Image.new('RGB', image.size, (255, 255, 255))
        flattened.paste(image, mask=image.split()[-1])
        image = flattened
    elif image.mode != 'RGB':
        image = image.convert('RGB')
    image.save(output_path, 'JPEG', quality=95)
    dprint("Saved image", output_path)
    return output_path
def format_date(s):
    """Normalize a date string to ISO ``YYYY-MM-DD``.

    Args:
        s (str | None): Raw date text from OCR. May use '/', '-', '.', or
            '\\' separators and contain stray/duplicate spaces.

    Returns:
        str | None: ISO date when the input matches a known layout,
        ``None`` for empty input, otherwise the whitespace-normalized
        input unchanged so callers can still surface the raw value.
    """
    if not s:
        return None
    # Collapse runs of whitespace — OCR often injects doubled spaces.
    # (The original code did raw.replace(' ', ' '), a no-op, which made
    # month-name dates with double spaces fail strptime.)
    raw = ' '.join(s.split())
    # Unify separators so one regex covers '\', '.' and '/' variants.
    t = raw.replace(' ', '').replace('\\', '/').replace('.', '/')
    # yyyy-mm-dd or yyyy/mm/dd
    if re.match(r'^\d{4}[-/]\d{2}[-/]\d{2}$', t):
        return t.replace('/', '-')
    # mm/dd/yyyy — matched against the normalized string so dotted or
    # backslashed variants (e.g. "01.02.2024") are accepted too.
    if re.match(r'^\d{2}/\d{2}/\d{4}$', t):
        m, d, y = t.split('/')
        return f"{y}-{int(m):02d}-{int(d):02d}"
    # Month-name forms, e.g. "January 5, 2024" / "Jan 5, 2024"
    if re.match(r'^[A-Za-z]+\s*\d{1,2},\s*\d{4}$', raw):
        for fmt in ("%B %d, %Y", "%b %d, %Y"):
            try:
                return datetime.strptime(raw, fmt).strftime("%Y-%m-%d")
            except ValueError:
                pass
    # Unknown layout: hand back the cleaned text rather than guessing.
    return raw
def cap_words(name):
    """Title-case each whitespace-separated word; falsy input yields None."""
    if not name:
        return None
    return ' '.join(word.capitalize() for word in name.split())
def normalize_name_from_parts(last, first_block):
    """Compose "Given [Given2] Last" from OCR'd name fields.

    Keeps at most the first two tokens of the given-names block (middle
    names beyond that are dropped), appends the surname, then title-cases
    the result. Returns None when both inputs are empty.
    """
    surname = (last or '').strip()
    given = [tok for tok in (first_block or '').strip().split(' ') if tok]
    combined = ' '.join(given[:2] + [surname]).strip()
    return cap_words(combined) if combined else None
def normalize_full_name_from_three(first, middle, last):
    """Compose a display name from first/middle/last fields.

    The middle name is deliberately ignored; at most two tokens from the
    first-name block are kept, matching normalize_name_from_parts.
    """
    given = [tok for tok in (first or '').strip().split(' ') if tok]
    combined = ' '.join(given[:2] + [last or '']).strip()
    return cap_words(combined) if combined else None
def take_within(lines, i, k=5):
    """Return up to k non-empty, stripped lines that follow index i.

    Used as a lookahead when an OCR label ("Registration No", etc.) and its
    value land on separate lines.
    """
    upper = min(i + k + 1, len(lines))
    collected = []
    for idx in range(i + 1, upper):
        text = str(lines[idx]).strip()
        if text:
            collected.append(text)
    return collected
def is_numeric_id(t):
    """True when t (spaces removed) is a run of at least five digits."""
    digits = str(t).replace(' ', '')
    return bool(re.match(r'^\d{5,}$', digits))
def is_crn(t):
    """Check whether t looks like a UMID CRN (exactly 12 digits).

    Spaces are stripped first so grouped OCR output such as
    "1234 5678 9012" still matches. The input is coerced to str for
    consistency with is_numeric_id, so a non-string OCR token no longer
    raises AttributeError.
    """
    return bool(re.match(r'^\d{12}$', str(t).replace(' ', '')))
def is_date(t):
    """True when t matches any date layout this script knows how to parse."""
    normalized = t.replace(' ', '').replace('\\', '/').replace('.', '/')
    hit = (
        re.match(r'^\d{4}[-/]\d{2}[-/]\d{2}$', normalized)
        or re.match(r'^\d{2}/\d{2}/\d{4}$', t)
        or re.match(r'^[A-Za-z]+\s*\d{1,2},\s*\d{4}$', t)
    )
    return bool(hit)
def extract_prc_info(lines):
    """
    Extract PRC license information from OCR text lines.

    Args:
        lines (list): List of text lines from OCR processing, in reading
            order. Items are coerced to str; None entries become ''.

    Returns:
        dict: Extracted PRC information with keys: id_type, crn, id_number,
        registration_number, registration_date, valid_until, full_name,
        birth_date, sss_number, gsis_number, profession. Fields not found
        remain None.

    Why this approach:
    - PRC licenses have complex layouts with multiple fields
    - Need to handle various license formats (traditional and UMID-style)
    - Labels and values often land on separate OCR lines, so each label
      match does a bounded lookahead (take_within) for a plausible value
    - First match wins only for CRN/SSS/GSIS (guarded by `is None`);
      other fields keep the LAST match encountered while scanning
    """
    dprint("Lines to extract", lines)
    # Initialize variables for extracted information
    crn = None
    full_name = None
    birth_date = None
    gsis_number = None
    sss_number = None
    registration_number = None
    registration_date = None
    valid_until = None
    profession = None
    # Collect name parts separately; composed into full_name after the scan
    last_name_txt = None
    first_name_txt = None
    # Normalize once: stringify and strip every line (None -> '')
    L = [str(x or '').strip() for x in lines]
    i = 0
    while i < len(L):
        line = L[i]
        low = line.lower()
        dprint("Line", {"i": i, "text": line})
        # Extract CRN (UMID format) - a bare 12-digit line; first one wins
        if crn is None and is_crn(line):
            crn = line.replace(' ', '')
            dprint("Found CRN", crn)
        # Extract Last Name: take the first lookahead line that is not
        # itself another field label
        if 'last name' in low:
            ahead = take_within(L, i, 3)
            for t in ahead:
                tl = t.lower()
                if not any(k in tl for k in ['first', 'middle', 'registration', 'valid', 'date', 'no']):
                    last_name_txt = t
                    break
        # Extract First Name: assumes the value is the very next line
        if 'firstname' in low or 'first name' in low:
            if i+1 < len(L):
                first_name_txt = L[i+1]
        # Extract Date of Birth: first date-shaped value within 4 lines
        if ('date of birth' in low) or ('birth' in low and 'date' in low):
            ahead = take_within(L, i, 4)
            for t in ahead:
                if is_date(t):
                    birth_date = format_date(t)
                    break
        # Extract Registration Number - handles the label split across two
        # lines ("Registration" / "No")
        if low == 'registration' and i+1 < len(L) and L[i+1].lower() in ('no', 'no.', 'number'):
            ahead = take_within(L, i+1, 4)
            for t in ahead:
                if is_numeric_id(t):
                    registration_number = t.replace(' ', '')
                    break
        # Also handle fused label forms ("Registration No" on one line)
        if ('registration no' in low) or ('registration number' in low):
            ahead = take_within(L, i, 4)
            for t in ahead:
                if is_numeric_id(t):
                    registration_number = t.replace(' ', '')
                    break
        # Extract Registration Date - split label ("Registration" / "Date")
        if low == 'registration' and i+1 < len(L) and L[i+1].lower() == 'date':
            ahead = take_within(L, i+1, 4)
            for t in ahead:
                if is_date(t):
                    registration_date = format_date(t)
                    break
        # Fused form ("Registration Date")
        if 'registration date' in low:
            ahead = take_within(L, i, 3)
            for t in ahead:
                if is_date(t):
                    registration_date = format_date(t)
                    break
        # Extract Valid Until Date
        if 'valid until' in low or 'validity' in low:
            ahead = take_within(L, i, 3)
            for t in ahead:
                if is_date(t):
                    valid_until = format_date(t)
                    break
        # Extract Profession: keyword match on the line itself; requires at
        # least two words so a lone keyword fragment is not taken as a title
        if any(k in low for k in ['occupational','technician','engineer','teacher','nurse']):
            if len(line.split()) >= 2:
                profession = cap_words(line)
                dprint("Found profession", profession)
        # Extract SSS Number (first match wins)
        if sss_number is None and ('sss' in low or 'social security' in low):
            ahead = take_within(L, i, 3)
            for t in ahead:
                if is_numeric_id(t):
                    sss_number = t.replace(' ', '')
                    dprint("Found sss_number", sss_number)
                    break
        # Extract GSIS Number (first match wins)
        if gsis_number is None and ('gsis' in low):
            ahead = take_within(L, i, 3)
            for t in ahead:
                if is_numeric_id(t):
                    gsis_number = t.replace(' ', '')
                    dprint("Found gsis_number", gsis_number)
                    break
        i += 1
    # Compose full name from the collected parts
    if full_name is None:
        full_name = normalize_name_from_parts(last_name_txt, first_name_txt)
    # Return structured result
    result = {
        "id_type": "PRC ID",
        "crn": crn,
        "id_number": registration_number or crn,  # Frontend expects id_number
        "registration_number": registration_number,
        "registration_date": registration_date,
        "valid_until": valid_until,
        "full_name": full_name,
        "birth_date": birth_date,
        "sss_number": sss_number,
        "gsis_number": gsis_number,
        "profession": profession
    }
    dprint("Final result", result)
    return result
def extract_ocr_lines(image_path):
    """Run PaddleOCR on an image and extract PRC fields from the text.

    Args:
        image_path (str): Path to the JPEG produced by download_image.

    Returns:
        dict: Result of extract_prc_info, or a minimal all-None dict when
        no text could be recovered.
    """
    os.makedirs("output", exist_ok=True)
    dprint("Initializing PaddleOCR")
    # Redirect both streams: PaddleOCR prints progress to stdout, which
    # would otherwise pollute the JSON-only stdout contract of this script.
    with redirect_stdout(sys.stderr), redirect_stderr(sys.stderr):
        ocr = PaddleOCR(
            use_doc_orientation_classify=False,
            use_doc_unwarping=False,
            use_textline_orientation=False,
            lang='en'
        )
        dprint("OCR initialized")
        dprint("Running OCR predict", image_path)
        results = ocr.predict(image_path)
        dprint("OCR predict done, results_count", len(results))
    # Process OCR results directly
    all_text = []
    try:
        # NOTE(review): this assumes the classic result shape — a list of
        # [box, (text, score)] items, possibly nested one level. Newer
        # PaddleOCR pipeline versions return dicts (e.g. 'rec_texts');
        # confirm against the installed PaddleOCR version.
        lines = results[0] if results and isinstance(results[0], list) else results
        for item in lines:
            if isinstance(item, (list, tuple)) and len(item) >= 2:
                meta = item[1]
                if isinstance(meta, (list, tuple)) and len(meta) >= 1:
                    all_text.append(str(meta[0]))
    except Exception as e:
        # Best-effort: an unexpected result shape degrades to "no text".
        dprint("Error processing OCR results", str(e))
    dprint("All direct texts", all_text)
    return extract_prc_info(all_text) if all_text else {
        "id_type": "PRC ID",
        "crn": None,
        "full_name": None,
        "birth_date": None
    }
# --- Script entry point ---------------------------------------------------
# Contract: exactly one JSON object is written to the real stdout; all other
# output goes to stderr. Exit code 0 on success, 1 on any failure.
if len(sys.argv) < 2:
    # Restore stdout so the error JSON is the only thing the caller reads.
    sys.stdout = original_stdout
    print(json.dumps({"error": "No image URL provided"}))
    sys.exit(1)
image_url = sys.argv[1]
dprint("Processing image URL", image_url)
try:
    image_path = download_image(image_url)
    dprint("Image downloaded to", image_path)
    ocr_results = extract_ocr_lines(image_path)
    dprint("OCR results ready")
    # Restore stdout and print only the JSON response
    sys.stdout = original_stdout
    sys.stdout.write(json.dumps({"success": True, "ocr_results": ocr_results}))
    sys.stdout.flush()
except Exception as e:
    dprint("Exception", str(e))
    # Restore stdout for error JSON
    sys.stdout = original_stdout
    sys.stdout.write(json.dumps({"error": str(e)}))
    sys.stdout.flush()
    sys.exit(1)
finally:
    # Clean up temp files regardless of outcome; best-effort only so a
    # cleanup failure cannot mask the real result/error already written.
    try:
        clean_cache()
    except:
        pass
|