File size: 12,930 Bytes
db10255
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7908d00
 
 
 
db10255
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6916300
db10255
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7908d00
db10255
 
 
7908d00
db10255
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6916300
db10255
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
#!/usr/bin/env python3
"""
Philippine PRC (Professional Regulation Commission) License Information Extraction Script

Purpose:
    Extracts structured information from PRC license images using OCR.
    Handles various PRC license formats including UMID-style cards.

Why this script exists:
    - PRC licenses have complex layouts with multiple information fields
    - Need to extract profession-specific information
    - Handles both traditional PRC licenses and UMID-style PRC cards
    - Required for professional verification workflows

Key Features:
    - Extracts CRN (Common Reference Number) - 12-digit format
    - Processes registration numbers and dates
    - Extracts profession information
    - Handles GSIS/SSS number extraction
    - Supports validity date tracking

Dependencies:
    - PaddleOCR: High-accuracy OCR engine (https://github.com/PaddlePaddle/PaddleOCR)
    - Pillow (PIL): Image processing (https://pillow.readthedocs.io/)
    - requests: HTTP library (https://docs.python-requests.org/)

Usage:
    python extract_prc.py "https://example.com/prc_license.jpg"

Output:
    JSON with extracted information: crn, registration_number, profession, valid_until, etc.
"""

import sys, json, os, glob, re, requests
from PIL import Image
from io import BytesIO
from datetime import datetime
from contextlib import redirect_stdout, redirect_stderr

# Point sys.stdout at stderr for the rest of the run so that anything printed
# along the way lands on stderr. The real stdout is kept in original_stdout
# and is restored only when the final JSON payload is written.
original_stdout = sys.stdout
sys.stdout = sys.stderr

# Environment variables set before paddleocr is imported (order matters: the
# import below must come after these assignments).
# NOTE(review): PADDLEOCR_LOG_LEVEL is presumably meant to quiet PaddleOCR logging — confirm it is honoured.
os.environ['PADDLEOCR_LOG_LEVEL'] = 'ERROR'
# NOTE(review): QT_QPA_PLATFORM / DISPLAY look like headless-display settings
# for GUI-capable dependencies — confirm they are still required.
os.environ['QT_QPA_PLATFORM'] = 'offscreen'
os.environ['DISPLAY'] = ':99'

# Import PaddleOCR after setting environment variables
from paddleocr import PaddleOCR

def dprint(msg, obj=None):
    """
    Write a single "DEBUG: ..." diagnostic line to stderr.

    Args:
        msg (str): Text placed right after the "DEBUG: " prefix.
        obj (any): Optional value; when it is not None it is appended to the
            line as ": <obj>".

    Notes:
    - Output always goes to sys.stderr.
    - Any exception raised while formatting or printing is swallowed, so a
      value with a broken string conversion cannot abort the caller.
    """
    try:
        text = f"DEBUG: {msg}"
        if obj is not None:
            text = f"{text}: {obj}"
        print(text, file=sys.stderr)
    except Exception:
        pass

def clean_cache():
    """
    Delete temporary artefacts from the current working directory.

    Removes the fixed set of temp image / JSON files if they exist, then
    removes the "output" directory tree if it exists. Each removal is
    reported through dprint. Filesystem errors are not caught here.
    """
    for leftover in (
        'temp_image.jpg',
        'temp_image_ocr_res_img.jpg',
        'temp_image_preprocessed_img.jpg',
        'temp_image_res.json',
    ):
        if not os.path.exists(leftover):
            continue
        os.remove(leftover)
        dprint("Removed cache file", leftover)

    if not os.path.exists("output"):
        return
    import shutil
    shutil.rmtree("output")
    dprint("Removed output directory")

def download_image(url, output_path='temp_image.jpg'):
    """
    Download an image over HTTP and store it locally as an RGB JPEG.

    Args:
        url (str): Location of the image to fetch.
        output_path (str): Destination file path for the converted JPEG.

    Returns:
        str: The path the image was written to (same as output_path).

    Raises:
        Whatever requests raises for network/HTTP failures (raise_for_status
        is called), and whatever Pillow raises for undecodable image data.

    Notes:
    - clean_cache() runs before the request, so earlier temp files are removed.
    - RGBA images are flattened onto a white background using their alpha
      channel; every other non-RGB mode is converted with convert('RGB').
    """
    dprint("Starting download", url)
    clean_cache()

    response = requests.get(
        url,
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        },
        timeout=30,
    )
    dprint("HTTP status", response.status_code)
    response.raise_for_status()

    picture = Image.open(BytesIO(response.content))
    if picture.mode == 'RGBA':
        # JPEG has no alpha channel: composite onto white using the alpha band as mask.
        canvas = Image.new('RGB', picture.size, (255,255,255))
        canvas.paste(picture, mask=picture.split()[-1])
        picture = canvas
    elif picture.mode != 'RGB':
        picture = picture.convert('RGB')

    picture.save(output_path, 'JPEG', quality=95)
    dprint("Saved image", output_path)
    return output_path

def format_date(s):
    """
    Normalise a date string read from OCR into ISO "YYYY-MM-DD" form.

    Accepted inputs (spaces inside numeric dates are ignored, and "." or a
    backslash is treated as "/"):
    - YYYY-MM-DD / YYYY/MM/DD
    - MM/DD/YYYY
    - "<Month name or 3-letter abbreviation> D, YYYY", with any amount of
      whitespace (including none) between the parts

    Args:
        s (str): Raw date text; may be None or empty.

    Returns:
        str | None: The ISO-formatted date when the input is recognised, the
        stripped input unchanged when it is not, or None for empty input.
    """
    if not s:
        return None
    raw = s.strip()
    # One normalised form is used for BOTH numeric layouts so that OCR variants
    # such as "01.15.2020" or "01 / 15 / 2020" are handled like "01/15/2020".
    compact = raw.replace(' ', '').replace('\\', '/').replace('.', '/')

    if re.match(r'^\d{4}[-/]\d{2}[-/]\d{2}$', compact):
        return compact.replace('/', '-')

    # Accept mm/dd/yyyy style
    if re.match(r'^\d{2}/\d{2}/\d{4}$', compact):
        month, day, year = compact.split('/')
        return f"{year}-{int(month):02d}-{int(day):02d}"

    # Month name variants. The pieces are captured and re-joined with canonical
    # spacing because strptime requires whitespace wherever the format has it,
    # while OCR output frequently drops it ("January15,2020").
    named = re.match(r'^([A-Za-z]+)\s*(\d{1,2}),\s*(\d{4})$', raw)
    if named:
        candidate = f"{named.group(1)} {named.group(2)}, {named.group(3)}"
        for fmt in ("%B %d, %Y", "%b %d, %Y"):
            try:
                return datetime.strptime(candidate, fmt).strftime("%Y-%m-%d")
            except ValueError:
                continue
    return raw

def cap_words(name):
    """
    Capitalise every whitespace-separated word of a name.

    Returns None for a falsy input; otherwise the words joined by single
    spaces, each with its first letter upper-cased and the rest lower-cased.
    """
    if not name:
        return None
    words = name.split()
    return ' '.join(word.capitalize() for word in words)

def normalize_name_from_parts(last, first_block):
    """
    Build "Given [Second] Last" from a surname and a given-names block.

    At most the first two space-separated tokens of first_block are kept,
    the stripped surname is appended, and the result is passed through
    cap_words. Returns None when nothing remains after trimming.
    """
    surname = (last or '').strip()
    # keep up to two given names
    given = [piece for piece in (first_block or '').strip().split(' ') if piece][:2]
    full = ' '.join([*given, surname]).strip()
    if not full:
        return None
    return cap_words(full)

def normalize_full_name_from_three(first, middle, last):
    """
    Build "Given [Second] Last" from separate first / middle / last fields.

    Only the first two space-separated tokens of `first` and the `last`
    value are used; `middle` is accepted but ignored. The composed string is
    passed through cap_words. Returns None when the composition is empty.
    """
    given = [piece for piece in (first or '').strip().split(' ') if piece][:2]
    given.append(last or '')
    full = ' '.join(given).strip()
    return None if not full else cap_words(full)

def take_within(lines, i, k=5):
    """
    Collect the non-empty entries among the k positions following index i.

    Each entry is converted with str() and stripped; entries that are empty
    after stripping are dropped. Positions past the end of `lines` are
    skipped.
    """
    following = (
        str(lines[pos]).strip()
        for pos in range(i + 1, i + k + 1)
        if pos < len(lines)
    )
    return [text for text in following if text]

def is_numeric_id(t):
    """Return True when t, with spaces removed, consists of five or more digits."""
    compact = str(t).replace(' ', '')
    return re.match(r'^\d{5,}$', compact) is not None

def is_crn(t):
    """Return True when t, with spaces removed, is exactly twelve digits (UMID CRN commonly has 12)."""
    digits = t.replace(' ', '')
    return re.match(r'^\d{12}$', digits) is not None

def is_date(t):
    """
    Tell whether a text line looks like one of the supported date layouts.

    Recognised layouts (numeric ones ignore embedded spaces and treat "." or
    a backslash as "/"):
    - YYYY-MM-DD / YYYY/MM/DD
    - MM/DD/YYYY
    - "<letters> D, YYYY" (month-name style)

    Args:
        t: Candidate text; converted with str() and stripped before checking.

    Returns:
        bool: True when the text matches one of the layouts above. Only the
        shape is checked; the calendar validity of the values is not.
    """
    raw = str(t).strip()
    # Both numeric layouts are checked against the same normalised text so that
    # "01.15.2020" is recognised just like "2020.01.15" is.
    compact = raw.replace(' ', '').replace('\\', '/').replace('.', '/')
    if re.match(r'^\d{4}[-/]\d{2}[-/]\d{2}$', compact):
        return True
    if re.match(r'^\d{2}/\d{2}/\d{4}$', compact):
        return True
    return bool(re.match(r'^[A-Za-z]+\s*\d{1,2},\s*\d{4}$', raw))

def _first_match(lines, start, span, predicate):
    """Return the first non-empty line among the `span` lines after `start` that satisfies `predicate`, else None."""
    for candidate in take_within(lines, start, span):
        if predicate(candidate):
            return candidate
    return None


def _is_label_text(text):
    """Return True when `text` looks like a field label rather than a surname value."""
    lowered = text.lower()
    if any(k in lowered for k in ('first', 'middle', 'registration', 'valid', 'date')):
        return True
    # "no" only counts as a label when it is not embedded in a longer word;
    # a plain substring test would reject surnames such as "Moreno" or "Nolasco".
    return re.search(r'(?<![a-z])no(?![a-z])', lowered) is not None


def extract_prc_info(lines):
    """
    Extract PRC license information from OCR text lines.

    The lines are scanned once, in order. For most fields a label line is
    detected and the next few non-empty lines are searched for the first
    value of the expected shape (date, numeric id, non-label text).

    Args:
        lines (list): Text lines from OCR processing. Entries are converted
            to stripped strings; None (or an empty list) yields a result in
            which every extracted field is None.

    Returns:
        dict: Keys id_type, crn, id_number, registration_number,
        registration_date, valid_until, full_name, birth_date, sss_number,
        gsis_number, profession. id_number is the registration number when
        one was found, otherwise the CRN.

    Field rules:
    - crn: first line that is exactly 12 digits (spaces ignored).
    - last name: first of the 3 lines after a "last name"/"lastname" label
      that does not itself look like a label.
    - first name: the line immediately after a "first name"/"firstname" label.
    - birth_date / registration_date / valid_until: first date-shaped line
      after the corresponding label, normalised with format_date.
    - registration_number: first numeric id after the label, which may be
      split over two lines ("Registration" then "No").
    - profession: any line of two or more words containing one of the known
      profession keywords; a later matching line replaces an earlier one.
    - sss_number / gsis_number: first numeric id within 3 lines of the label;
      only the first hit is kept.
    - full_name: composed from at most two given names plus the surname.
    """
    dprint("Lines to extract", lines)

    # Extracted values
    crn = None
    birth_date = None
    gsis_number = None
    sss_number = None
    registration_number = None
    registration_date = None
    valid_until = None
    profession = None

    # Name parts are collected separately and composed at the end
    last_name_txt = None
    first_name_txt = None

    texts = [str(x or '').strip() for x in (lines or [])]
    for i, line in enumerate(texts):
        low = line.lower()
        has_next = i + 1 < len(texts)
        next_low = texts[i + 1].lower() if has_next else None
        dprint("Line", {"i": i, "text": line})

        # CRN (UMID format) - 12 digits; first hit wins
        if crn is None and is_crn(line):
            crn = line.replace(' ', '')
            dprint("Found CRN", crn)

        # Last name: look ahead past any intervening label lines.
        # The fused "lastname" spelling is accepted the same way "firstname" is.
        if 'last name' in low or 'lastname' in low:
            hit = _first_match(texts, i, 3, lambda t: not _is_label_text(t))
            if hit is not None:
                last_name_txt = hit

        # First name: value is taken from the line right after the label
        if 'firstname' in low or 'first name' in low:
            if has_next:
                first_name_txt = texts[i + 1]

        # Date of birth
        if ('date of birth' in low) or ('birth' in low and 'date' in low):
            hit = _first_match(texts, i, 4, is_date)
            if hit is not None:
                birth_date = format_date(hit)

        # Registration number - label split over two lines
        if low == 'registration' and next_low in ('no', 'no.', 'number'):
            hit = _first_match(texts, i + 1, 4, is_numeric_id)
            if hit is not None:
                registration_number = hit.replace(' ', '')

        # Registration number - label on a single line
        if ('registration no' in low) or ('registration number' in low):
            hit = _first_match(texts, i, 4, is_numeric_id)
            if hit is not None:
                registration_number = hit.replace(' ', '')

        # Registration date - label split over two lines
        if low == 'registration' and next_low == 'date':
            hit = _first_match(texts, i + 1, 4, is_date)
            if hit is not None:
                registration_date = format_date(hit)

        # Registration date - label on a single line
        if 'registration date' in low:
            hit = _first_match(texts, i, 3, is_date)
            if hit is not None:
                registration_date = format_date(hit)

        # Valid-until date
        if 'valid until' in low or 'validity' in low:
            hit = _first_match(texts, i, 3, is_date)
            if hit is not None:
                valid_until = format_date(hit)

        # Profession: keyword match on lines with at least two words
        if any(k in low for k in ['occupational','technician','engineer','teacher','nurse']):
            if len(line.split()) >= 2:
                profession = cap_words(line)
                dprint("Found profession", profession)

        # SSS number; first hit wins
        if sss_number is None and ('sss' in low or 'social security' in low):
            hit = _first_match(texts, i, 3, is_numeric_id)
            if hit is not None:
                sss_number = hit.replace(' ', '')
                dprint("Found sss_number", sss_number)

        # GSIS number; first hit wins
        if gsis_number is None and ('gsis' in low):
            hit = _first_match(texts, i, 3, is_numeric_id)
            if hit is not None:
                gsis_number = hit.replace(' ', '')
                dprint("Found gsis_number", gsis_number)

    # Compose full name from parts
    full_name = normalize_name_from_parts(last_name_txt, first_name_txt)

    # Return structured result
    result = {
        "id_type": "PRC ID",
        "crn": crn,
        "id_number": registration_number or crn,  # Frontend expects id_number
        "registration_number": registration_number,
        "registration_date": registration_date,
        "valid_until": valid_until,
        "full_name": full_name,
        "birth_date": birth_date,
        "sss_number": sss_number,
        "gsis_number": gsis_number,
        "profession": profession
    }
    dprint("Final result", result)
    return result

def _texts_from_ocr_results(results):
    """Return the recognised text strings contained in a PaddleOCR result, in order."""
    texts = []
    if not results:
        return texts

    # Result objects produced by predict(): read the recognised strings from
    # the 'rec_texts' field, either directly or nested under a 'res' key.
    # NOTE(review): assumes each result is dict-like (or exposes a dict via
    # `.json`) with a 'rec_texts' list — confirm against the installed PaddleOCR.
    for page in results:
        payload = page if isinstance(page, dict) else getattr(page, 'json', None)
        if not isinstance(payload, dict):
            continue
        if isinstance(payload.get('res'), dict):
            payload = payload['res']
        texts.extend(str(t) for t in (payload.get('rec_texts') or []))
    if texts:
        return texts

    # Legacy nested-list shape: entries of the form [box, (text, score)],
    # optionally wrapped in one outer per-image list.
    entries = results[0] if isinstance(results[0], list) else results
    for item in entries:
        if isinstance(item, (list, tuple)) and len(item) >= 2:
            meta = item[1]
            if isinstance(meta, (list, tuple)) and len(meta) >= 1:
                texts.append(str(meta[0]))
    return texts


def extract_ocr_lines(image_path):
    """
    Run PaddleOCR on an image file and extract PRC fields from the text.

    Args:
        image_path (str): Path of the image to recognise.

    Returns:
        dict: The structure returned by extract_prc_info. When no text is
        recognised the same keys are returned with every extracted field set
        to None, so consumers always see one schema (including id_number).

    Notes:
    - An "output" directory is created before OCR runs.
    - stdout/stderr are redirected to stderr while the OCR engine is created
      and run, keeping the process's real stdout free for the final JSON.
    - Errors while reading the OCR result structure are logged and treated
      as "no text found"; errors raised by PaddleOCR itself propagate.
    """
    os.makedirs("output", exist_ok=True)
    dprint("Initializing PaddleOCR")

    with redirect_stdout(sys.stderr), redirect_stderr(sys.stderr):
        ocr = PaddleOCR(
            use_doc_orientation_classify=False,
            use_doc_unwarping=False,
            use_textline_orientation=False,
            lang='en'
        )
        dprint("OCR initialized")
        dprint("Running OCR predict", image_path)
        results = ocr.predict(image_path)

    # Materialise once so len() and indexing are safe even if predict()
    # returned None or a lazy iterable.
    results = list(results) if results is not None else []
    dprint("OCR predict done, results_count", len(results))

    try:
        all_text = _texts_from_ocr_results(results)
    except Exception as e:
        dprint("Error processing OCR results", str(e))
        all_text = []

    dprint("All direct texts", all_text)
    # With an empty list extract_prc_info still returns the full key set.
    return extract_prc_info(all_text)

def main():
    """
    Command-line entry point: download the image at argv[1], OCR it, print JSON.

    Exactly one JSON document is written to the real stdout:
    - {"success": true, "ocr_results": {...}} on success
    - {"error": "<message>"} on failure, after which the process exits with
      status 1 (also used when no URL argument is given)

    Temporary files are removed afterwards in every case; a failure during
    that cleanup is logged to stderr and does not change the exit status.
    """
    if len(sys.argv) < 2:
        sys.stdout = original_stdout
        print(json.dumps({"error": "No image URL provided"}))
        sys.exit(1)

    image_url = sys.argv[1]
    dprint("Processing image URL", image_url)
    try:
        image_path = download_image(image_url)
        dprint("Image downloaded to", image_path)
        ocr_results = extract_ocr_lines(image_path)
        dprint("OCR results ready")

        # Restore stdout and print only the JSON response
        sys.stdout = original_stdout
        sys.stdout.write(json.dumps({"success": True, "ocr_results": ocr_results}))
        sys.stdout.flush()

    except Exception as e:
        dprint("Exception", str(e))
        # Restore stdout for error JSON
        sys.stdout = original_stdout
        sys.stdout.write(json.dumps({"error": str(e)}))
        sys.stdout.flush()
        sys.exit(1)
    finally:
        # Best-effort cleanup: report problems instead of hiding them, and do
        # not intercept KeyboardInterrupt/SystemExit the way a bare except would.
        try:
            clean_cache()
        except Exception as cleanup_error:
            dprint("Cleanup failed", str(cleanup_error))


if __name__ == "__main__":
    main()