# File size: 43,774 Bytes
# db10255
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
import sys, json, os, glob, requests
import re
import time
from contextlib import redirect_stdout, redirect_stderr

# Keep a handle on the real stdout, then point sys.stdout at stderr so that
# everything printed from here on lands on stderr. The stated intent is that
# only the final JSON goes to the real stdout -- presumably written through
# original_stdout by code outside this section (TODO confirm).
original_stdout = sys.stdout
sys.stdout = sys.stderr

# Environment variables set before PaddleOCR is imported (the import follows
# this section), so they are already in place when the library loads.
# NOTE(review): assumes PaddleOCR honours PADDLEOCR_LOG_LEVEL -- confirm.
os.environ['PADDLEOCR_LOG_LEVEL'] = 'ERROR'
# Presumably lets Qt-based dependencies run without a screen -- TODO confirm.
os.environ['QT_QPA_PLATFORM'] = 'offscreen'
# NOTE(review): assumes an X display is reachable at :99 -- confirm.
os.environ['DISPLAY'] = ':99'

# Import PaddleOCR after setting environment variables
from paddleocr import PaddleOCR

def download_image(url, output_path='temp_police_image.jpg', timeout=30):
    """Download the image at ``url`` and write it to ``output_path``.

    Args:
        url: HTTP(S) address of the image. A ``t=<unix time>`` query
            parameter is appended so intermediaries do not serve a cached copy.
        output_path: Destination file. An existing file at this path is
            removed before the download starts.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.
            Without it a stalled server would block the script indefinitely.

    Returns:
        ``output_path``, once the response body has been written.

    Raises:
        requests.RequestException: On connection problems, timeouts, or (via
            ``raise_for_status``) a 4xx/5xx response.
        OSError: If the old file cannot be removed or the new one written.
    """
    # Remove any existing temp file so a failed download cannot leave a
    # previous image behind under the same name.
    if os.path.exists(output_path):
        os.remove(output_path)

    # Cache-busting query parameter: extend an existing query string with
    # '&', otherwise start one with '?'.
    separator = '&' if '?' in url else '?'
    url = f'{url}{separator}t={int(time.time())}'

    # Request headers asking caches along the way not to answer from storage.
    headers = {
        'Cache-Control': 'no-cache, no-store, must-revalidate',
        'Pragma': 'no-cache',
        'Expires': '0'
    }

    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()

    # Save the image
    with open(output_path, 'wb') as f:
        f.write(response.content)

    return output_path

def format_name(name):
    """Normalise spacing in an OCR-extracted name.

    Whitespace runs are collapsed, a comma directly before a capital letter
    gets exactly one space after it, and a space is inserted at upper/lower
    case transitions that suggest two words were fused (for example
    ``"JuanPablo"`` -> ``"Juan Pablo"``, ``"JOHNSmith"`` -> ``"JOHN Smith"``).
    Text that is entirely upper-case has no such transition, so fused
    all-caps words are returned unsplit.

    Returns ``None`` when ``name`` is falsy, otherwise the cleaned string.
    """
    if not name:
        return None

    cleaned = ' '.join(name.split())

    # Comma spacing: "JAVA,ALBERT" -> "JAVA, ALBERT"
    for comma_pattern in (r',([A-Z])', r',\s*([A-Z])'):
        cleaned = re.sub(comma_pattern, r', \1', cleaned)

    if ',' not in cleaned:
        # Single-segment name: split lower->UPPER joins first, then a run of
        # capitals that is followed by a capitalised word.
        for word_pattern in (
            r'([a-z])([A-Z])',
            r'([A-Z]{2,})([A-Z][a-z])',
            r'([A-Z]+)([A-Z][a-z]+)',
        ):
            cleaned = re.sub(word_pattern, r'\1 \2', cleaned)
    else:
        # Comma-separated name: tidy each segment on its own, then rejoin.
        segments = []
        for segment in cleaned.split(','):
            segment = segment.strip()
            # "ALBERTJoy" -> "ALBERT Joy"
            segment = re.sub(r'([A-Z]+)([A-Z][a-z])', r'\1 \2', segment)
            # "JuanPablo" -> "Juan Pablo"
            segment = re.sub(r'([A-Z][a-z]+)([A-Z][a-z]+)', r'\1 \2', segment)
            segments.append(segment)
        cleaned = ', '.join(segments)

    # Collapse any double spaces introduced above.
    return ' '.join(cleaned.split()).strip()

def format_address(address):
    """Normalise spacing in an OCR-extracted address.

    Applied in order: collapse whitespace; separate a ``#``-prefixed run of
    capitals from the digits that follow (``"#BLK11"`` -> ``"#BLK 11"``);
    insert a space between a capital and a following capitalised word
    (``"CABANATUANCity"`` -> ``"CABANATUAN City"``); put one space after a
    comma that precedes a capital. All-caps text without a lower-case
    letter is not split.

    Returns ``None`` when ``address`` is falsy, otherwise the cleaned string.
    """
    if not address:
        return None

    text = ' '.join(address.split())

    rewrites = (
        (r'#([A-Z]+)(\d+)', r'#\1 \2'),          # "#BLK11" -> "#BLK 11"
        (r'([A-Z])([A-Z][a-z]+)', r'\1 \2'),     # "...NCity" -> "...N City"
        (r',([A-Z])', r', \1'),                  # "CITY,NUEVA" -> "CITY, NUEVA"
        (r',\s*([A-Z])', r', \1'),
    )
    for pattern, replacement in rewrites:
        text = re.sub(pattern, replacement, text)

    # Collapse any double spaces introduced above.
    return ' '.join(text.split()).strip()

def format_birth_place(place):
    """Normalise spacing in an OCR-extracted place of birth.

    Collapses whitespace, puts one space after a comma that precedes a
    capital (``"DILASAG,AURORA"`` -> ``"DILASAG, AURORA"``) and inserts a
    space between a capital and a following capitalised word
    (``"DILASAGAurora"`` -> ``"DILASAG Aurora"``). All-caps text without a
    lower-case letter is not split.

    Returns ``None`` when ``place`` is falsy, otherwise the cleaned string.
    """
    if not place:
        return None

    text = ' '.join(place.split())

    # Comma spacing first, then the capital/capitalised-word boundary.
    for pattern, replacement in (
        (r',([A-Z])', r', \1'),
        (r',\s*([A-Z])', r', \1'),
        (r'([A-Z])([A-Z][a-z]+)', r'\1 \2'),
    ):
        text = re.sub(pattern, replacement, text)

    # Collapse any double spaces introduced above.
    return ' '.join(text.split()).strip()

def format_birth_date(date):
    """Repair common OCR damage in a birth-date string.

    Three passes are applied:

    1. Truncated or misread month names are completed (``"Juy"`` ->
       ``"July"``, ``"Marc"`` -> ``"March"``, ``"Jun"`` -> ``"June"`` ...).
       Each fragment is replaced only when it stands alone as a word, so a
       month that is already spelled correctly is left untouched.
    2. ``"<Month> 19DD, YYYY"`` is rewritten to ``"<Month> DD, YYYY"`` when
       ``DD`` is at most 31, i.e. when the four-digit number is really a day
       that picked up a spurious ``19`` prefix.
    3. Spacing around the comma is normalised to ``"<Month> DD, YYYY"``.

    Args:
        date: Raw date text, or a falsy value.

    Returns:
        ``None`` when ``date`` is falsy, otherwise the cleaned string.
    """
    if not date:
        return None

    # (fragment, full month name). Matching is case-sensitive.
    month_fixes = (
        ('Juy', 'July'),
        ('Januay', 'January'),
        ('Februay', 'February'),
        ('Marc', 'March'),
        ('Apil', 'April'),
        ('Jun', 'June'),
        ('Augu', 'August'),
        ('Septemb', 'September'),
        ('Octob', 'October'),
        ('Novemb', 'November'),
        ('Decemb', 'December'),
    )
    for fragment, month in month_fixes:
        # The look-arounds require that no letter touches the fragment.
        # A plain substring replace would turn "March" into "Marchh" and
        # "June" into "Junee"; digits may still be adjacent ("Jun05").
        date = re.sub(rf'(?<![A-Za-z]){fragment}(?![A-Za-z])', month, date)

    def _restore_day(match):
        """Drop the spurious '19' when the remaining two digits are a day."""
        word, day, year = match.groups()
        if int(day) <= 31:
            return f'{word} {day}, {year}'
        return match.group(0)

    # "July 1905, 1991" -> "July 05, 1991"
    date = re.sub(r'(\w+)\s+19(\d{2}),\s*(\d{4})', _restore_day, date)

    # Ensure proper date format: "July 05, 1991"
    date = re.sub(r'(\w+)\s+(\d{1,2})\s*,\s*(\d{4})', r'\1 \2, \3', date)

    # Clean up multiple spaces
    return ' '.join(date.split()).strip()

# Month name (full or abbreviated, plus the OCR misread "Juy"), a day and a
# four-digit year. Compiled case-insensitively so it matches however the OCR
# engine cased the text.
_BIRTH_DATE_FALLBACK_RE = re.compile(
    r'(January|February|March|April|May|June|July|August|September|October|November|December|Juy|Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+\d{1,2}[,\s]+\d{4}',
    re.IGNORECASE,
)


def _text_after_colon(text):
    """Return the stripped text following the first ':' in ``text``.

    The caller must already have checked that ``text`` contains a colon.
    """
    return text.split(':', 1)[1].strip()


def _clean_birth_date_text(text):
    """Apply the OCR corrections shared by every birth-date code path."""
    text = text.replace('Juy', 'July')
    # "1001" is treated as a misread of the year "1991".
    return re.sub(r'\b1001\b', '1991', text)


def extract_police_details(lines):
    """Pull the labelled fields of a police clearance out of OCR text lines.

    Each line is scanned for the labels NAME, ADDRESS, BIRTH DATE, BIRTH
    PLACE, CITIZENSHIP, GENDER and ID NO. A value is taken from the text
    after the first colon on the label line or, when the label line has no
    colon, from the line(s) that follow. The first value found for a field
    wins, except the labelled ID number, which is overwritten by later
    matches.

    Args:
        lines: Iterable of OCR text lines. Entries that are not ``str`` are
            ignored; ``None`` is treated as an empty list.

    Returns:
        dict with the keys ``id_number``, ``full_name``, ``address``,
        ``birth_date``, ``birth_place``, ``citizenship``, ``gender`` and
        ``status`` (each ``None`` when not found) plus ``success``, which is
        ``True`` when a name or an ID number was extracted. Name, address,
        birth place and birth date are passed through the matching
        ``format_*`` function before being returned.
    """
    details = {
        'id_number': None,
        'full_name': None,
        'address': None,
        'birth_date': None,
        'birth_place': None,
        'citizenship': None,
        'gender': None,
        'status': None,
        'success': False
    }

    # Drop non-string entries once, up front, so the look-ahead and
    # look-behind accesses below can never call .strip() on a non-string.
    lines = [entry for entry in (lines or []) if isinstance(entry, str)]

    # First date-looking text seen anywhere; used only if no labelled birth
    # date is found.
    fallback_birth_date = None

    for i, line in enumerate(lines):
        line_upper = line.upper().strip()
        line_stripped = line.strip()

        # Extract Name - handle cases where NAME and value are on separate lines
        if "NAME" in line_upper and not details['full_name']:
            if ":" in line:
                name_part = _text_after_colon(line)
                if len(name_part) > 2:
                    details['full_name'] = name_part
            else:
                # The value may be on either of the next two lines.
                for following in lines[i + 1:i + 3]:
                    next_line = following.strip()
                    if next_line.startswith(':') and len(next_line) > 1:
                        name_part = next_line[1:].strip()
                        if len(name_part) > 2 and "ADDRESS" not in name_part.upper():
                            details['full_name'] = name_part
                            break
                    elif (not next_line.startswith(('ADDRESS', 'BIRTH', 'CITIZEN', 'GENDER', 'ID'))
                          and len(next_line) > 2):
                        # Accept a line without a colon, or with one only
                        # near the start (a stray leading separator).
                        if ":" not in next_line or next_line.index(':') < 3:
                            name_part = next_line.replace(':', '').strip()
                            if len(name_part) > 2:
                                details['full_name'] = name_part
                                break

        # Also check for name patterns that start with a colon (OCR sometimes
        # separates the NAME label from its value).
        if not details['full_name'] and line_stripped.startswith(':') and len(line_stripped) > 5:
            name_candidate = line_stripped[1:].strip()
            # Looks like a name: has a comma, or is several upper-case words.
            looks_like_name = ',' in name_candidate or (
                len(name_candidate.split()) >= 2 and name_candidate.isupper()
            )
            if looks_like_name and i > 0:
                # Make sure the previous line was not another field's label.
                prev_line = lines[i - 1].strip().upper()
                if "ADDRESS" not in prev_line and "BIRTH" not in prev_line:
                    details['full_name'] = name_candidate

        # Extract Address
        if "ADDRESS" in line_upper and not details['address']:
            if ":" in line:
                # Split on the first colon only, so a colon inside the
                # address does not truncate it.
                addr_part = _text_after_colon(line)
                if addr_part:
                    details['address'] = addr_part
            else:
                # Gather up to three following lines until the next label.
                addr_parts = []
                for following in lines[i + 1:i + 4]:
                    next_line = following.strip()
                    if next_line.startswith(':') and len(next_line) > 1:
                        addr_parts.append(next_line[1:].strip())
                    elif "BIRTH" not in next_line.upper() and "CITIZEN" not in next_line.upper():
                        if ":" in next_line:
                            addr_parts.append(_text_after_colon(next_line))
                        elif len(next_line) > 2:
                            addr_parts.append(next_line)
                    else:
                        break
                if addr_parts:
                    details['address'] = ' '.join(addr_parts).strip()

        # Extract Birth Date. Both paths use the same cleanup; numbers in
        # the value are otherwise left as read (no century is prepended, as
        # that would also rewrite the day of the month).
        if ("BIRTH DATE" in line_upper or "BIRTHDATE" in line_upper) and not details['birth_date']:
            if ":" in line:
                date_part = _clean_birth_date_text(_text_after_colon(line))
                if date_part:
                    details['birth_date'] = date_part
            elif i + 1 < len(lines):
                next_line = lines[i + 1].strip()
                if ":" in next_line:
                    date_part = _clean_birth_date_text(_text_after_colon(next_line))
                    if date_part:
                        details['birth_date'] = date_part

        # Remember the first date-shaped text in case the BIRTH DATE label
        # itself was misread, e.g. "Juy 05, 1001" or "July 03, 1991".
        # NOTE(review): this can pick up an unrelated date (such as an issue
        # date) when no labelled birth date exists -- confirm that is wanted.
        if fallback_birth_date is None:
            date_match = _BIRTH_DATE_FALLBACK_RE.search(line_stripped)
            if date_match:
                fallback_birth_date = _clean_birth_date_text(date_match.group())

        # Extract Birth Place
        if "BIRTH PLACE" in line_upper and not details['birth_place']:
            if ":" in line:
                details['birth_place'] = _text_after_colon(line)
            elif i + 1 < len(lines):
                next_line = lines[i + 1].strip()
                if next_line.startswith(':') and len(next_line) > 1:
                    details['birth_place'] = next_line[1:].strip()
                elif ":" in next_line and "CITIZEN" not in next_line.upper():
                    details['birth_place'] = _text_after_colon(next_line)

        # Extract Citizenship
        if "CITIZENSHIP" in line_upper and not details['citizenship']:
            if ":" in line:
                details['citizenship'] = _text_after_colon(line)
            elif i + 1 < len(lines):
                next_line = lines[i + 1].strip()
                if next_line.startswith(':') and len(next_line) > 1:
                    details['citizenship'] = next_line[1:].strip()
                elif ":" in next_line:
                    details['citizenship'] = _text_after_colon(next_line)

        # Extract Gender - handle cases where GENDER and value are on separate lines
        if "GENDER" in line_upper and not details['gender']:
            if ":" in line:
                details['gender'] = _text_after_colon(line)
            elif i + 1 < len(lines):
                next_line = lines[i + 1].strip()
                gender_part = None
                if next_line.startswith(':') and len(next_line) > 1:
                    gender_part = next_line[1:].strip()
                elif ":" in next_line:
                    gender_part = _text_after_colon(next_line)
                # A value from a neighbouring line is accepted only when it
                # is one of the expected tokens.
                if gender_part in ['MALE', 'FEMALE', 'M', 'F']:
                    details['gender'] = gender_part

        # Extract ID Number (usually "ID No.:")
        if "ID NO" in line_upper:
            parts = line.split(':', 1)
            if len(parts) > 1:
                details['id_number'] = parts[1].strip()

        # Fallback ID extraction: 4-5 letters followed by 10-15 digits.
        if not details['id_number']:
            id_match = re.search(r'\b[A-Z]{4,5}\d{10,15}\b', line_upper)
            if id_match:
                details['id_number'] = id_match.group()

        # Extract Status (e.g., "NO RECORD ON FILE")
        if "NO RECORD ON FILE" in line_upper:
            details['status'] = "NO RECORD ON FILE"
        elif "HAS A RECORD" in line_upper or "WITH RECORD" in line_upper:
            details['status'] = "HAS RECORD"

    # A labelled birth date always takes precedence over the pattern match.
    if not details['birth_date'] and fallback_birth_date:
        details['birth_date'] = fallback_birth_date

    if details['full_name'] or details['id_number']:
        details['success'] = True

    # Format the extracted fields
    if details['full_name']:
        details['full_name'] = format_name(details['full_name'])
    if details['address']:
        details['address'] = format_address(details['address'])
    if details['birth_place']:
        details['birth_place'] = format_birth_place(details['birth_place'])
    if details['birth_date']:
        details['birth_date'] = format_birth_date(details['birth_date'])

    return details

# Keys probed, in priority order, when an OCR result is read as a mapping.
_OCR_DICT_TEXT_KEYS = ('rec_texts', 'rec_text', 'dt_polys', 'ocr_text', 'text', 'texts', 'result', 'results', 'ocr_result', 'dt_boxes')
# Keys probed, in priority order, in the dict returned by ``json()``.
_OCR_JSON_TEXT_KEYS = ('rec_texts', 'rec_text', 'dt_polys', 'ocr_text', 'text', 'texts', 'result', 'results')


def _empty_police_details():
    """Return a fresh result dict with every field unset and success False."""
    return {'id_number': None, 'full_name': None, 'address': None, 'birth_date': None, 'birth_place': None, 'citizenship': None, 'gender': None, 'status': None, 'success': False}


def _collect_texts(mapping, keys, all_text):
    """Append text from the first of ``keys`` in ``mapping`` that yields any.

    Returns True only when at least one string was appended to ``all_text``;
    a key that is present but holds no text does not stop the search.
    """
    for key in keys:
        if key not in mapping:
            continue
        val = mapping[key]
        print(f"DEBUG: Found key '{key}': {type(val)}, length: {len(val) if isinstance(val, list) else 'N/A'}", file=sys.stderr)
        count_before = len(all_text)
        if isinstance(val, list):
            if key == 'rec_texts':
                # rec_texts is a list of strings directly
                for text_item in val:
                    if isinstance(text_item, str):
                        stripped = text_item.strip()
                        if stripped:
                            all_text.append(stripped)
                    elif text_item:
                        all_text.append(str(text_item))
            else:
                for item in val:
                    if isinstance(item, (list, tuple)) and len(item) >= 2:
                        # Format: [[coords], (text, confidence)]. Only a
                        # string in the text slot is accepted, so purely
                        # numeric structures are not mistaken for text.
                        text_part = item[1]
                        if (isinstance(text_part, (list, tuple)) and len(text_part) >= 1
                                and isinstance(text_part[0], str)):
                            all_text.append(text_part[0])
                    elif isinstance(item, str):
                        all_text.append(item)
        elif isinstance(val, str) and val:
            all_text.append(val)
        if len(all_text) > count_before:
            return True
    return False


def _texts_from_ocr_result(ocr_result, all_text):
    """Try each known access route on one result object, in order.

    Routes: mapping access, ``json()``, the ``rec_text`` attribute, the
    ``text`` attribute. Returns True as soon as one route appended text.
    """
    # Route 1: the result behaves like a dict.
    try:
        if hasattr(ocr_result, 'keys'):
            ocr_dict = dict(ocr_result)
            print(f"DEBUG: OCRResult as dict keys: {list(ocr_dict.keys())}", file=sys.stderr)
            if _collect_texts(ocr_dict, _OCR_DICT_TEXT_KEYS, all_text):
                return True
    except Exception as e:
        print(f"DEBUG: Error accessing OCRResult as dict: {e}", file=sys.stderr)

    # Route 2: the result offers json().
    try:
        if hasattr(ocr_result, 'json'):
            json_data = ocr_result.json()
            print(f"DEBUG: OCRResult.json() type: {type(json_data)}", file=sys.stderr)
            if isinstance(json_data, dict):
                print(f"DEBUG: OCRResult.json() keys: {list(json_data.keys())}", file=sys.stderr)
                if _collect_texts(json_data, _OCR_JSON_TEXT_KEYS, all_text):
                    return True
    except Exception as e:
        print(f"DEBUG: Error calling json(): {e}", file=sys.stderr)

    # Routes 3 and 4: plain attributes.
    for attr_name in ('rec_text', 'text'):
        if not hasattr(ocr_result, attr_name):
            continue
        value = getattr(ocr_result, attr_name)
        print(f"DEBUG: Found {attr_name} attribute: {type(value)}", file=sys.stderr)
        if isinstance(value, list):
            texts = [str(t) for t in value if t]
        elif value:
            texts = [str(value)]
        else:
            texts = []
        if texts:
            all_text.extend(texts)
            return True

    return False


def _debug_dump_ocr_result(ocr_result):
    """Print the structure of a result that yielded no text (stderr only)."""
    print(f"DEBUG: Could not find text in OCRResult, trying to inspect structure", file=sys.stderr)
    try:
        print(f"DEBUG: OCRResult repr: {repr(ocr_result)[:500]}", file=sys.stderr)
        if hasattr(ocr_result, 'keys'):
            all_keys = list(ocr_result.keys())
            print(f"DEBUG: All OCRResult keys: {all_keys}", file=sys.stderr)
            for key in all_keys:
                try:
                    val = ocr_result[key]
                    print(f"DEBUG: Key '{key}' type: {type(val)}, value preview: {str(val)[:100]}", file=sys.stderr)
                except Exception as e:
                    print(f"DEBUG: Could not read key '{key}': {e}", file=sys.stderr)
    except Exception as e:
        print(f"DEBUG: Error inspecting structure: {e}", file=sys.stderr)


def extract_ocr_lines(image_path):
    """Run PaddleOCR on ``image_path`` and parse the recognised text.

    The OCR engine is created with document orientation, unwarping and
    text-line orientation turned off. ``ocr.ocr()`` is tried first; if it
    raises, ``ocr.predict()`` is used when available. Recognised strings are
    collected from either result layout (objects whose type name contains
    "OCRResult", or the older nested-list layout) and handed to
    ``extract_police_details``.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The dict produced by ``extract_police_details``. When no text was
        recognised, the same keys with every field ``None`` and ``success``
        False. When the file does not exist, that empty dict plus
        ``'error': 'File not found'``.
    """
    # Check if file exists
    if not os.path.exists(image_path):
        return {**_empty_police_details(), 'error': 'File not found'}

    file_size = os.path.getsize(image_path)
    print(f"DEBUG: Image file size: {file_size} bytes", file=sys.stderr)

    # Keep everything the OCR library prints away from stdout.
    with redirect_stdout(sys.stderr), redirect_stderr(sys.stderr):
        ocr = PaddleOCR(
            use_doc_orientation_classify=False,
            use_doc_unwarping=False,
            use_textline_orientation=False,
            lang='en'
        )
        try:
            results = ocr.ocr(image_path)
        except Exception as e:
            print(f"DEBUG: ocr() failed: {e}, trying predict()", file=sys.stderr)
            if hasattr(ocr, 'predict'):
                results = ocr.predict(image_path)
            else:
                results = None

    # Debug: Print raw results structure
    print(f"DEBUG: Raw OCR results type: {type(results)}", file=sys.stderr)
    print(f"DEBUG: Raw OCR results is None: {results is None}", file=sys.stderr)
    if results is not None:
        print(f"DEBUG: Raw OCR results length: {len(results) if isinstance(results, list) else 'N/A'}", file=sys.stderr)
        if isinstance(results, list) and len(results) > 0:
            print(f"DEBUG: First level item type: {type(results[0])}", file=sys.stderr)
            print(f"DEBUG: First level item: {str(results[0])[:200] if results[0] else 'None'}", file=sys.stderr)
            if isinstance(results[0], list) and len(results[0]) > 0:
                print(f"DEBUG: Second level first item: {str(results[0][0])[:200] if results[0][0] else 'None'}", file=sys.stderr)

    # Process OCR results - handle both the nested-list layout and
    # OCRResult-style objects.
    all_text = []
    try:
        if results and isinstance(results, list) and len(results) > 0:
            first_item = results[0]
            # Detect the object layout by type name.
            item_type_name = type(first_item).__name__
            is_ocr_result = 'OCRResult' in item_type_name or 'ocr_result' in str(type(first_item)).lower()

            if is_ocr_result:
                print(f"DEBUG: Detected OCRResult object format (type: {item_type_name})", file=sys.stderr)
                attrs = dir(first_item)
                print(f"DEBUG: OCRResult attributes: {[a for a in attrs if not a.startswith('_')]}", file=sys.stderr)

                for ocr_result in results:
                    if not _texts_from_ocr_result(ocr_result, all_text):
                        _debug_dump_ocr_result(ocr_result)
            else:
                # Old format - list of lists: each entry is [coords, (text, confidence)]
                lines = results[0] if isinstance(results[0], list) else results
                print(f"DEBUG: Processing lines (old format), count: {len(lines) if isinstance(lines, list) else 'N/A'}", file=sys.stderr)
                for item in lines:
                    if isinstance(item, (list, tuple)) and len(item) >= 2:
                        meta = item[1]
                        if isinstance(meta, (list, tuple)) and len(meta) >= 1:
                            all_text.append(str(meta[0]))
    except Exception as e:
        # Best effort: report the problem and continue with whatever text
        # was collected before the failure.
        print(f"DEBUG: Error processing OCR results: {str(e)}", file=sys.stderr)
        import traceback
        print(f"DEBUG: Traceback: {traceback.format_exc()}", file=sys.stderr)
        if results and isinstance(results, list) and len(results) > 0:
            first_item = results[0]
            print(f"DEBUG: First item attributes: {dir(first_item)}", file=sys.stderr)
            if hasattr(first_item, '__dict__'):
                print(f"DEBUG: First item dict: {first_item.__dict__}", file=sys.stderr)

    print(f"DEBUG: Extracted text lines: {all_text}", file=sys.stderr)

    return extract_police_details(all_text) if all_text else _empty_police_details()

# Field names probed, in priority order, on a dict-style OCR result.
# Geometry fields (dt_polys / dt_boxes) stay in the list for diagnostics only;
# they can never contribute text because extracted entries must be strings.
_SIMPLE_DICT_TEXT_KEYS = (
    'rec_texts', 'rec_text', 'dt_polys', 'ocr_text', 'text', 'texts',
    'result', 'results', 'ocr_result', 'dt_boxes',
)

# Field names probed, in priority order, on the JSON view of an OCR result.
_SIMPLE_JSON_TEXT_KEYS = (
    'rec_texts', 'rec_text', 'dt_polys', 'ocr_text', 'text', 'texts',
    'result', 'results',
)


def _simple_texts_from_field(key, val):
    """Return the text strings carried by one candidate result field.

    Args:
        key: Name of the field being inspected.
        val: Value stored under that field.

    Returns:
        A list of strings; empty when the field holds no usable text.
    """
    if isinstance(val, str):
        return [val] if val else []
    if not isinstance(val, list):
        return []

    texts = []
    if key == 'rec_texts':
        # rec_texts is a flat list of recognised strings.
        for text_item in val:
            if isinstance(text_item, str):
                stripped = text_item.strip()
                # Whitespace-only entries carry no information; drop them.
                if stripped:
                    texts.append(stripped)
            elif text_item:
                texts.append(str(text_item))
        return texts

    for item in val:
        if isinstance(item, str):
            texts.append(item)
        elif isinstance(item, (list, tuple)) and len(item) >= 2:
            # Legacy entry shape: [[coords], (text, confidence)]
            text_part = item[1]
            # Require a real string so polygon coordinates are never
            # mistaken for recognised text.
            if (isinstance(text_part, (list, tuple)) and len(text_part) >= 1
                    and isinstance(text_part[0], str)):
                texts.append(text_part[0])
    return texts


def _simple_texts_from_mapping(mapping, keys):
    """Return text from the first field in *keys* that actually yields any."""
    for key in keys:
        if key not in mapping:
            continue
        val = mapping[key]
        print(f"DEBUG (fallback): Found key '{key}': {type(val)}, length: {len(val) if isinstance(val, list) else 'N/A'}", file=sys.stderr)
        texts = _simple_texts_from_field(key, val)
        if texts:
            return texts
    return []


def _simple_texts_from_ocr_result(ocr_result):
    """Return the recognised text lines found in one OCRResult-style object.

    Strategies are tried in order until one produces text: mapping access,
    the ``json`` view, then the ``rec_text`` and ``text`` attributes.  When
    none succeeds, the object's structure is dumped to stderr and an empty
    list is returned.
    """
    # Strategy 1: treat the result as a mapping.
    try:
        if hasattr(ocr_result, 'keys'):
            ocr_dict = dict(ocr_result)
            print(f"DEBUG (fallback): OCRResult as dict keys: {list(ocr_dict.keys())}", file=sys.stderr)
            texts = _simple_texts_from_mapping(ocr_dict, _SIMPLE_DICT_TEXT_KEYS)
            if texts:
                return texts
    except Exception as e:
        print(f"DEBUG (fallback): Error accessing OCRResult as dict: {e}", file=sys.stderr)

    # Strategy 2: the JSON view.
    try:
        json_attr = getattr(ocr_result, 'json', None)
        if json_attr is not None:
            # NOTE(review): PaddleX result objects appear to expose `json` as
            # a property (not a method) and to nest the payload under "res";
            # both shapes are accepted here - confirm against the installed
            # PaddleX version.
            json_data = json_attr() if callable(json_attr) else json_attr
            print(f"DEBUG (fallback): OCRResult.json() type: {type(json_data)}", file=sys.stderr)
            if isinstance(json_data, dict):
                print(f"DEBUG (fallback): OCRResult.json() keys: {list(json_data.keys())}", file=sys.stderr)
                candidates = [json_data]
                nested = json_data.get('res')
                if isinstance(nested, dict):
                    candidates.append(nested)
                for candidate in candidates:
                    texts = _simple_texts_from_mapping(candidate, _SIMPLE_JSON_TEXT_KEYS)
                    if texts:
                        return texts
    except Exception as e:
        print(f"DEBUG (fallback): Error calling json(): {e}", file=sys.stderr)

    # Strategy 3: plain attributes.
    for attr_name in ('rec_text', 'text'):
        if not hasattr(ocr_result, attr_name):
            continue
        attr_val = getattr(ocr_result, attr_name)
        print(f"DEBUG (fallback): Found {attr_name} attribute: {type(attr_val)}", file=sys.stderr)
        if isinstance(attr_val, list):
            texts = [str(t) for t in attr_val if t]
        elif attr_val:
            texts = [str(attr_val)]
        else:
            texts = []
        if texts:
            return texts

    # Nothing worked: dump the structure to help diagnose new result formats.
    print(f"DEBUG (fallback): Could not find text in OCRResult, trying to inspect structure", file=sys.stderr)
    try:
        print(f"DEBUG (fallback): OCRResult repr: {repr(ocr_result)[:500]}", file=sys.stderr)
        if hasattr(ocr_result, 'keys'):
            all_keys = list(ocr_result.keys())
            print(f"DEBUG (fallback): All OCRResult keys: {all_keys}", file=sys.stderr)
            for key in all_keys:
                try:
                    val = ocr_result[key]
                    print(f"DEBUG (fallback): Key '{key}' type: {type(val)}, value preview: {str(val)[:100]}", file=sys.stderr)
                except Exception as e:
                    print(f"DEBUG (fallback): Could not read key '{key}': {e}", file=sys.stderr)
    except Exception as e:
        print(f"DEBUG (fallback): Error inspecting structure: {e}", file=sys.stderr)
    return []


def extract_ocr_lines_simple(image_path):
    """Run the fallback OCR pass on an image and extract police-clearance details.

    Fallback method with advanced features (matching NBI script fallback):
    PaddleOCR is created with document-orientation classification, document
    unwarping and text-line orientation enabled.  Anything the OCR engine
    prints is redirected to stderr while it runs.

    Args:
        image_path: Path of the image passed to ``PaddleOCR.ocr``.

    Returns:
        The result of ``extract_police_details`` on the extracted text lines,
        or, when no text could be extracted, a dict with every detail field
        set to ``None`` and ``'success': False``.
    """
    with redirect_stdout(sys.stderr), redirect_stderr(sys.stderr):
        ocr = PaddleOCR(
            use_doc_orientation_classify=True,
            use_doc_unwarping=True,
            use_textline_orientation=True,
            lang='en'
        )
        results = ocr.ocr(image_path)

    # Debug: Print raw results structure for fallback method
    print(f"DEBUG (fallback): Raw OCR results type: {type(results)}", file=sys.stderr)
    print(f"DEBUG (fallback): Raw OCR results is None: {results is None}", file=sys.stderr)
    if results is not None:
        print(f"DEBUG (fallback): Raw OCR results length: {len(results) if isinstance(results, list) else 'N/A'}", file=sys.stderr)
        if isinstance(results, list) and len(results) > 0:
            print(f"DEBUG (fallback): First level item type: {type(results[0])}", file=sys.stderr)
            if isinstance(results[0], list) and len(results[0]) > 0:
                print(f"DEBUG (fallback): Second level first item: {str(results[0][0])[:200] if results[0][0] else 'None'}", file=sys.stderr)

    all_text = []
    try:
        if results and isinstance(results, list):
            first_item = results[0]
            # Detect the newer object-based result format by its type name.
            item_type_name = type(first_item).__name__
            is_ocr_result = 'OCRResult' in item_type_name or 'ocr_result' in str(type(first_item)).lower()

            if is_ocr_result:
                print(f"DEBUG (fallback): Detected OCRResult object format (type: {item_type_name})", file=sys.stderr)
                attrs = dir(first_item)
                print(f"DEBUG (fallback): OCRResult attributes: {[a for a in attrs if not a.startswith('_')]}", file=sys.stderr)
                for ocr_result in results:
                    all_text.extend(_simple_texts_from_ocr_result(ocr_result))
            else:
                # Old format - list of lists: [[coords], (text, confidence)]
                lines = first_item if isinstance(first_item, list) else results
                print(f"DEBUG (fallback): Processing lines (old format), count: {len(lines) if isinstance(lines, list) else 'N/A'}", file=sys.stderr)
                for item in lines:
                    if isinstance(item, (list, tuple)) and len(item) >= 2:
                        meta = item[1]
                        if isinstance(meta, (list, tuple)) and len(meta) >= 1:
                            all_text.append(str(meta[0]))
    except Exception as e:
        # Best effort: a parsing failure is reported on stderr and whatever
        # text was collected so far is still used below.
        print(f"DEBUG (fallback): Error processing OCR results: {str(e)}", file=sys.stderr)
        import traceback
        print(f"DEBUG (fallback): Traceback: {traceback.format_exc()}", file=sys.stderr)
        # Try to inspect the object attributes
        if results and isinstance(results, list) and len(results) > 0:
            first_item = results[0]
            print(f"DEBUG (fallback): First item attributes: {dir(first_item)}", file=sys.stderr)
            if hasattr(first_item, '__dict__'):
                print(f"DEBUG (fallback): First item dict: {first_item.__dict__}", file=sys.stderr)

    print(f"DEBUG (fallback): Extracted text lines: {all_text}", file=sys.stderr)

    if all_text:
        return extract_police_details(all_text)
    return {'id_number': None, 'full_name': None, 'address': None, 'birth_date': None, 'birth_place': None, 'citizenship': None, 'gender': None, 'status': None, 'success': False}

# Main Execution
# Usage: <script> <image_url>
# Writes exactly one JSON document to `original_stdout` (presumably the real
# stdout saved earlier in this file - confirm) and sends all diagnostics to
# stderr.  Exits with status 1 on a missing argument or on any failure.
if len(sys.argv) < 2:
    sys.stdout = original_stdout
    print(json.dumps({"success": False, "error": "No image URL provided"}))
    sys.exit(1)

image_url = sys.argv[1]
print(f"DEBUG: Processing Police Clearance image URL: {image_url}", file=sys.stderr)

# Initialised before the try so the cleanup below can tell whether a download
# ever produced a file.
image_path = None
try:
    image_path = download_image(image_url, 'temp_police_image.jpg')
    print(f"DEBUG: Image downloaded to: {image_path}", file=sys.stderr)

    # Try the original OCR method first
    ocr_results = extract_ocr_lines(image_path)
    print(f"DEBUG: OCR results from extract_ocr_lines: {ocr_results}", file=sys.stderr)

    # If original method fails, try simple method with advanced features
    if not ocr_results['success']:
        print("DEBUG: Original method failed, trying simple method with advanced features", file=sys.stderr)
        ocr_results = extract_ocr_lines_simple(image_path)
        print(f"DEBUG: OCR results from extract_ocr_lines_simple: {ocr_results}", file=sys.stderr)

    response = {
        "success": ocr_results['success'],
        "data": ocr_results
    }

    sys.stdout = original_stdout
    sys.stdout.write(json.dumps(response))
    sys.stdout.flush()

except Exception as e:
    # Top-level boundary: report any failure as JSON so the caller always
    # receives a parseable response.
    sys.stdout = original_stdout
    sys.stdout.write(json.dumps({"success": False, "error": str(e)}))
    sys.stdout.flush()
    sys.exit(1)
finally:
    # Remove the downloaded image on every exit path.  Both the path returned
    # by download_image and the requested file name are checked, in case they
    # differ.  Cleanup is best effort: a failure is logged, never raised.
    for leftover in (image_path, 'temp_police_image.jpg'):
        if not leftover:
            continue
        try:
            if os.path.exists(leftover):
                os.remove(leftover)
        except OSError as cleanup_error:
            print(f"DEBUG: Could not remove temporary file {leftover}: {cleanup_error}", file=sys.stderr)