takomattyy committed on
Commit
2b089f9
·
verified ·
1 Parent(s): 4848007

Upload 2 files

Browse files

nbi and police ocr

Files changed (2) hide show
  1. extract_nbi_ocr.py +305 -50
  2. extract_police_ocr.py +80 -25
extract_nbi_ocr.py CHANGED
@@ -48,43 +48,238 @@ def download_image(url, output_path='temp_image.jpg'):
48
 
49
  return output_path
50
 
51
- # OCR Function to extract NBI ID NO
52
  def extract_nbi_id(lines):
53
  nbi_id = None
 
 
 
54
 
55
- for i, line in enumerate(lines):
56
- if isinstance(line, str):
57
- # Look for "NBI ID NO:" pattern
58
- if "NBI ID NO:" in line.upper() or "NBIIDNO" in line.upper():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
  # Extract the ID after the colon
60
- parts = line.split(':')
61
- if len(parts) > 1:
62
- nbi_id = parts[1].strip()
63
- break
64
- # Also check if the next line contains the ID (in case it's on a separate line)
65
- elif i < len(lines) - 1 and ("NBI ID NO:" in line.upper() or "NBI ID NO" in line.upper()):
66
- next_line = lines[i + 1]
67
- if isinstance(next_line, str) and len(next_line.strip()) > 5:
68
- nbi_id = next_line.strip()
69
- break
70
-
71
- # If not found with "NBI ID NO:" pattern, look for the specific format
72
- if not nbi_id:
73
- for line in lines:
74
- if isinstance(line, str):
75
- # Look for pattern like HGUR87H38D-U47204A873 (alphanumeric with one hyphen)
76
- pattern = r'[A-Z0-9]{10,12}-[A-Z0-9]{10,12}'
77
- match = re.search(pattern, line)
78
- if match:
79
- nbi_id = match.group()
80
- break
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
 
82
  return {
83
  'clearance_type': 'nbi',
84
  'id_number': nbi_id,
85
- 'full_name': None,
86
- 'birth_date': None,
87
- 'success': nbi_id is not None
 
88
  }
89
 
90
  def extract_ocr_lines_simple(image_path):
@@ -97,25 +292,54 @@ def extract_ocr_lines_simple(image_path):
97
  use_textline_orientation=True, # Enable text line orientation
98
  lang='en' # Set language to English
99
  )
100
- results = ocr.predict(image_path)
 
 
 
 
 
 
 
101
 
102
  all_text = []
103
  try:
104
- lines = results[0] if results and isinstance(results[0], list) else results
105
- for item in lines:
106
- if isinstance(item, (list, tuple)) and len(item) >= 2:
107
- meta = item[1]
108
- if isinstance(meta, (list, tuple)) and len(meta) >= 1:
109
- all_text.append(str(meta[0]))
110
- except Exception:
111
- pass
112
-
113
- return extract_nbi_id(all_text) if all_text else {'clearance_type': 'nbi', 'id_number': None, 'full_name': None, 'birth_date': None, 'success': False}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
114
 
115
  def extract_ocr_lines(image_path):
116
  # Check if file exists and has content
117
  if not os.path.exists(image_path):
118
- return {'id_number': None, 'success': False}
119
 
120
  # Ensure output directory exists
121
  os.makedirs("output", exist_ok=True)
@@ -131,22 +355,53 @@ def extract_ocr_lines(image_path):
131
  use_textline_orientation=False,
132
  lang='en'
133
  )
134
- results = ocr.predict(image_path)
 
 
 
 
 
 
 
135
 
136
- # Process OCR results directly
137
  all_text = []
138
  try:
139
- lines = results[0] if results and isinstance(results[0], list) else results
140
- for item in lines:
141
- if isinstance(item, (list, tuple)) and len(item) >= 2:
142
- meta = item[1]
143
- if isinstance(meta, (list, tuple)) and len(meta) >= 1:
144
- all_text.append(str(meta[0]))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
145
  except Exception as e:
146
  print(f"DEBUG: Error processing OCR results: {str(e)}", file=sys.stderr)
 
 
147
 
148
  print(f"DEBUG: Extracted text lines: {all_text}", file=sys.stderr)
149
- return extract_nbi_id(all_text) if all_text else {'clearance_type': 'nbi', 'id_number': None, 'full_name': None, 'birth_date': None, 'success': False}
150
 
151
  # Main
152
  if len(sys.argv) < 2:
 
48
 
49
  return output_path
50
 
51
+ # OCR Function to extract NBI ID NO, Name, Birth Date, and LIT
52
# Candidate shapes for an NBI ID number, e.g. B450JRLR0B-RC248667.
_NBI_HYPHEN_ID_RE = re.compile(r'\b([A-Z0-9]{8,12}-[A-Z0-9]{8,12})\b')
_NBI_SPACED_ID_RE = re.compile(r'\b([A-Z0-9]{8,12})\s+([A-Z0-9]{8,12})\b')
_NBI_SOLID_ID_RE = re.compile(r'\b([A-Z0-9]{17,25})\b')
_NBI_BARE_ID_RE = re.compile(r'^[A-Z0-9-]{15,25}$')

# "ID" / "NO" only count as labels when they stand alone; matching them as
# substrings would discard values such as "DAVID" or "NOVEMBER 12, 1990".
_NBI_SHORT_LABEL_RE = re.compile(r'\b(?:ID|NO|IDNO|IDNUMBER)\b')
# Whole word only, so "NATIONALITY" or "MILITARY" is not taken for the label.
_NBI_LIT_LABEL_RE = re.compile(r'\bLIT\b')

# A month name must not be embedded in a longer alphabetic word ("MARIA",
# "JANE"), but may touch digits ("MARCH15") because OCR often drops spaces.
_NBI_MONTH_RE = re.compile(
    r'(?<![A-Z])(?:JANUARY|FEBRUARY|MARCH|APRIL|MAY|JUNE|JULY|AUGUST|SEPTEMBER'
    r'|OCTOBER|NOVEMBER|DECEMBER|JAN|FEB|MAR|APR|JUN|JUL|AUG|SEPT|SEP|OCT|NOV'
    r'|DEC)(?![A-Z])'
)
_NBI_NUMERIC_DATE_RE = re.compile(r'\d{1,2}[/-]\d{1,2}[/-]\d{4}')
_NBI_DAY_MON_YEAR_RE = re.compile(r'\d{1,2}\s+[A-Z]{3}\s+\d{4}')

_NBI_ID_LABELS = ('NBI ID NO', 'NBIIDNO', 'NBI ID NUMBER', 'NBIID NUMBER')
_NBI_ADDRESS_WORDS = ('STREET', 'ST', 'AVENUE', 'AVE', 'BRGY', 'BARANGAY',
                      'CITY', 'PHASE', 'DOMINGO', 'CAINTA', 'RIZAL')
_NBI_SOLID_ADDRESS_WORDS = _NBI_ADDRESS_WORDS + ('ADDRESS', 'ATRSTORUARPHASEABRGY')

_NBI_NAME_SKIP_LABELS = ('NBI', 'DATE', 'BIRTH', 'CLEARANCE', 'REPUBLIC',
                         'PHILIPPINES', 'NATIONAL')
_NBI_DATE_SKIP_LABELS = ('NBI', 'NAME', 'CLEARANCE', 'REPUBLIC',
                         'PHILIPPINES', 'NATIONAL')
_NBI_LIT_SKIP_LABELS = ('NBI', 'NAME', 'DATE', 'BIRTH', 'CLEARANCE', 'REPUBLIC',
                        'PHILIPPINES', 'NATIONAL', 'VALID', 'UNTIL')


def _nbi_log(message):
    """Write a DEBUG trace line to stderr."""
    print(f"DEBUG: {message}", file=sys.stderr)


def _nbi_has_letters_and_digits(candidate):
    """Return True when candidate holds at least one A-Z and one 0-9."""
    return bool(re.search(r'[A-Z]', candidate)) and bool(re.search(r'[0-9]', candidate))


def _nbi_is_label_line(text_upper, labels):
    """Return True when an upper-cased line looks like a field label.

    `labels` are matched as substrings; the short tokens ID / NO are matched
    as whole words only (see _NBI_SHORT_LABEL_RE).
    """
    if _NBI_SHORT_LABEL_RE.search(text_upper):
        return True
    return any(label in text_upper for label in labels)


def _nbi_looks_like_date(text):
    """Return True when text plausibly holds a calendar date."""
    if _NBI_NUMERIC_DATE_RE.search(text) or _NBI_DAY_MON_YEAR_RE.search(text):
        return True
    upper = text.upper()
    # A month name alone is not a date ("MAY SANTOS"); require a digit too.
    return bool(_NBI_MONTH_RE.search(upper)) and any(ch.isdigit() for ch in upper)


def _nbi_find_hyphenated_id(cleaned_lines):
    """Return the first hyphenated ID on a line of at most three words, else None."""
    for line in cleaned_lines:
        match = _NBI_HYPHEN_ID_RE.search(line)
        if not match:
            continue
        candidate = match.group(1)
        # IDs normally stand alone; longer lines are more likely addresses.
        if len(line.split()) <= 3 and _nbi_has_letters_and_digits(candidate):
            _nbi_log(f"Found NBI ID (first pass, hyphen): {candidate}")
            return candidate
    return None


def _nbi_id_from_loose_patterns(line):
    """Recover an ID whose hyphen was read as a space or dropped, else None."""
    match = _NBI_SPACED_ID_RE.search(line)
    if match:
        candidate = "-".join(match.groups())
        if (_nbi_has_letters_and_digits(candidate) and
                not any(word in candidate for word in _NBI_ADDRESS_WORDS)):
            _nbi_log(f"Found NBI ID (space pattern): {candidate}")
            return candidate

    match = _NBI_SOLID_ID_RE.search(line)
    if match:
        candidate = match.group(1)
        if (_nbi_has_letters_and_digits(candidate) and
                not any(word in candidate for word in _NBI_SOLID_ADDRESS_WORDS)):
            # The hyphen position is unknown: take the first split near the
            # middle that leaves both halves 8-12 characters long.
            mid = len(candidate) // 2
            for split_point in range(mid - 2, mid + 3):
                part1, part2 = candidate[:split_point], candidate[split_point:]
                if 8 <= len(part1) <= 12 and 8 <= len(part2) <= 12:
                    rebuilt = f"{part1}-{part2}"
                    _nbi_log(f"Found NBI ID (no hyphen, split): {rebuilt}")
                    return rebuilt
    return None


# OCR Function to extract NBI ID NO, Name, Birth Date, and LIT
def extract_nbi_id(lines):
    """Extract NBI clearance fields from OCR text lines.

    Args:
        lines: iterable of OCR text lines; non-string items are converted
            with str(), None items are treated as empty lines.

    Returns:
        dict with keys 'clearance_type' (always 'nbi'), 'id_number',
        'full_name', 'birth_date', 'lit' (each a str or None) and 'success'
        (True when an ID number or a full name was found).
    """
    nbi_id = None
    full_name = None
    birth_date = None
    lit = None  # value following a "LIT" label; meaning not shown here -- TODO confirm

    # Keep one entry per input line so "next line" lookups stay aligned.
    cleaned_lines = ["" if line is None else str(line).strip() for line in (lines or [])]

    # First pass: a standalone hyphenated ID is the most reliable signal.
    nbi_id = _nbi_find_hyphenated_id(cleaned_lines)

    # Second pass: labelled fields, plus looser ID patterns if still missing.
    for i, line in enumerate(cleaned_lines):
        line_upper = line.upper()
        following = cleaned_lines[i + 1:]

        if not nbi_id:
            if any(label in line_upper for label in _NBI_ID_LABELS):
                # Value on the same line, after the colon.
                if ":" in line:
                    id_candidate = re.sub(r'\s+', '', line.split(':', 1)[1])
                    if len(id_candidate) > 5:
                        nbi_id = id_candidate
                        _nbi_log(f"Found NBI ID (same line): {nbi_id}")
                        continue

                # Value on one of the next two lines.
                for next_line in following[:2]:
                    if len(next_line) < 5 or any(
                            label in next_line.upper()
                            for label in ('NAME', 'DATE', 'BIRTH', 'CLEARANCE')):
                        continue
                    compact = next_line.replace(' ', '')
                    if _NBI_BARE_ID_RE.match(compact):
                        nbi_id = compact
                        _nbi_log(f"Found NBI ID (next line): {nbi_id}")
                        break
                if nbi_id:
                    continue

            # No hyphen-pattern retry here: the first pass already applied a
            # looser version of that check to every line.
            nbi_id = _nbi_id_from_loose_patterns(line)
            if nbi_id:
                continue

        # Full name: value after a "NAME" label (same line or next four lines).
        if (not full_name and "NAME" in line_upper and
                ("NBI" not in line_upper or "ID" not in line_upper)):
            if ":" in line:
                name_part = line.split(':', 1)[1].strip()
                if re.search(r'[A-Za-z]{2,}', name_part) and len(name_part) > 2:
                    full_name = name_part
                    _nbi_log(f"Found full name (same line): {full_name}")
                    continue

            for next_line in following[:4]:
                if _nbi_is_label_line(next_line.upper(), _NBI_NAME_SKIP_LABELS):
                    continue
                if re.search(r'[A-Za-z]{2,}', next_line) and len(next_line) > 3:
                    full_name = next_line
                    _nbi_log(f"Found full name: {full_name}")
                    break

        # Birth date: value after a label holding both "DATE" and "BIRTH".
        if not birth_date and "DATE" in line_upper and "BIRTH" in line_upper:
            if ":" in line:
                date_part = line.split(':', 1)[1].strip()
                if _nbi_looks_like_date(date_part):
                    birth_date = date_part
                    _nbi_log(f"Found birth date (same line): {birth_date}")
                    continue

            for next_line in following[:4]:
                if _nbi_is_label_line(next_line.upper(), _NBI_DATE_SKIP_LABELS):
                    continue
                if _nbi_looks_like_date(next_line):
                    birth_date = next_line
                    _nbi_log(f"Found birth date: {birth_date}")
                    break

        # LIT: value after a standalone "LIT" label (same line or next three).
        if (not lit and _NBI_LIT_LABEL_RE.search(line_upper) and
                ("ID" not in line_upper or "NBI" not in line_upper)):
            if ":" in line:
                lit_part = line.split(':', 1)[1].strip()
                if lit_part:
                    lit = lit_part
                    _nbi_log(f"Found LIT (same line): {lit}")
                    continue

            for next_line in following[:3]:
                if _nbi_is_label_line(next_line.upper(), _NBI_LIT_SKIP_LABELS):
                    continue
                if next_line:
                    lit = next_line
                    _nbi_log(f"Found LIT: {lit}")
                    break

    return {
        'clearance_type': 'nbi',
        'id_number': nbi_id,
        'full_name': full_name,
        'birth_date': birth_date,
        'lit': lit,
        'success': nbi_id is not None or full_name is not None
    }
284
 
285
  def extract_ocr_lines_simple(image_path):
 
292
  use_textline_orientation=True, # Enable text line orientation
293
  lang='en' # Set language to English
294
  )
295
+ try:
296
+ results = ocr.predict(image_path)
297
+ except Exception as e:
298
+ print(f"DEBUG: predict() failed: {e}, trying ocr()", file=sys.stderr)
299
+ if hasattr(ocr, 'ocr'):
300
+ results = ocr.ocr(image_path)
301
+ else:
302
+ results = None
303
 
304
  all_text = []
305
  try:
306
+ # Handle both old format (list) and new format (OCRResult object)
307
+ if results and isinstance(results, list) and len(results) > 0:
308
+ first_item = results[0]
309
+ item_type_name = type(first_item).__name__
310
+ is_ocr_result = 'OCRResult' in item_type_name or 'ocr_result' in str(type(first_item)).lower()
311
+
312
+ if is_ocr_result:
313
+ print(f"DEBUG: Detected OCRResult object format (type: {item_type_name})", file=sys.stderr)
314
+ # Access OCRResult as dictionary
315
+ try:
316
+ if hasattr(first_item, 'keys'):
317
+ ocr_dict = dict(first_item)
318
+ # Look for rec_texts key
319
+ if 'rec_texts' in ocr_dict:
320
+ rec_texts = ocr_dict['rec_texts']
321
+ if isinstance(rec_texts, list):
322
+ all_text = [str(t) for t in rec_texts if t]
323
+ print(f"DEBUG: Extracted {len(all_text)} text lines from rec_texts", file=sys.stderr)
324
+ except Exception as e:
325
+ print(f"DEBUG: Error accessing OCRResult: {e}", file=sys.stderr)
326
+ else:
327
+ # Old format - list of lists
328
+ lines = results[0] if results and isinstance(results[0], list) else results
329
+ for item in lines:
330
+ if isinstance(item, (list, tuple)) and len(item) >= 2:
331
+ meta = item[1]
332
+ if isinstance(meta, (list, tuple)) and len(meta) >= 1:
333
+ all_text.append(str(meta[0]))
334
+ except Exception as e:
335
+ print(f"DEBUG: Error processing OCR results: {str(e)}", file=sys.stderr)
336
+
337
+ return extract_nbi_id(all_text) if all_text else {'clearance_type': 'nbi', 'id_number': None, 'full_name': None, 'birth_date': None, 'lit': None, 'success': False}
338
 
339
  def extract_ocr_lines(image_path):
340
  # Check if file exists and has content
341
  if not os.path.exists(image_path):
342
+ return {'clearance_type': 'nbi', 'id_number': None, 'full_name': None, 'birth_date': None, 'success': False}
343
 
344
  # Ensure output directory exists
345
  os.makedirs("output", exist_ok=True)
 
355
  use_textline_orientation=False,
356
  lang='en'
357
  )
358
+ try:
359
+ results = ocr.predict(image_path)
360
+ except Exception as e:
361
+ print(f"DEBUG: predict() failed: {e}, trying ocr()", file=sys.stderr)
362
+ if hasattr(ocr, 'ocr'):
363
+ results = ocr.ocr(image_path)
364
+ else:
365
+ results = None
366
 
367
+ # Process OCR results - handle both old format (list) and new format (OCRResult object)
368
  all_text = []
369
  try:
370
+ # Handle both old format (list) and new format (OCRResult object)
371
+ if results and isinstance(results, list) and len(results) > 0:
372
+ first_item = results[0]
373
+ item_type_name = type(first_item).__name__
374
+ is_ocr_result = 'OCRResult' in item_type_name or 'ocr_result' in str(type(first_item)).lower()
375
+
376
+ if is_ocr_result:
377
+ print(f"DEBUG: Detected OCRResult object format (type: {item_type_name})", file=sys.stderr)
378
+ # Access OCRResult as dictionary
379
+ try:
380
+ if hasattr(first_item, 'keys'):
381
+ ocr_dict = dict(first_item)
382
+ # Look for rec_texts key
383
+ if 'rec_texts' in ocr_dict:
384
+ rec_texts = ocr_dict['rec_texts']
385
+ if isinstance(rec_texts, list):
386
+ all_text = [str(t) for t in rec_texts if t]
387
+ print(f"DEBUG: Extracted {len(all_text)} text lines from rec_texts", file=sys.stderr)
388
+ except Exception as e:
389
+ print(f"DEBUG: Error accessing OCRResult: {e}", file=sys.stderr)
390
+ else:
391
+ # Old format - list of lists
392
+ lines = results[0] if results and isinstance(results[0], list) else results
393
+ for item in lines:
394
+ if isinstance(item, (list, tuple)) and len(item) >= 2:
395
+ meta = item[1]
396
+ if isinstance(meta, (list, tuple)) and len(meta) >= 1:
397
+ all_text.append(str(meta[0]))
398
  except Exception as e:
399
  print(f"DEBUG: Error processing OCR results: {str(e)}", file=sys.stderr)
400
+ import traceback
401
+ print(f"DEBUG: Traceback: {traceback.format_exc()}", file=sys.stderr)
402
 
403
  print(f"DEBUG: Extracted text lines: {all_text}", file=sys.stderr)
404
+ return extract_nbi_id(all_text) if all_text else {'clearance_type': 'nbi', 'id_number': None, 'full_name': None, 'birth_date': None, 'lit': None, 'success': False}
405
 
406
  # Main
407
  if len(sys.argv) < 2:
extract_police_ocr.py CHANGED
@@ -197,39 +197,66 @@ def extract_police_details(lines):
197
  line_stripped = line.strip()
198
 
199
  # Extract Name - handle cases where NAME and value are on separate lines
 
200
  if "NAME" in line_upper and not details['full_name']:
 
201
  if ":" in line:
202
  parts = line.split(':', 1)
203
  if len(parts) > 1:
204
  name_part = parts[1].strip()
205
- if name_part and len(name_part) > 2:
 
206
  details['full_name'] = name_part
207
- elif i + 1 < len(lines):
208
- # Check next few lines for name value
209
- for j in range(1, min(3, len(lines) - i)):
 
 
 
210
  next_line = lines[i+j].strip()
 
 
 
 
 
 
 
211
  if next_line.startswith(':') and len(next_line) > 1:
212
  name_part = next_line[1:].strip()
213
- if name_part and len(name_part) > 2 and "ADDRESS" not in name_part.upper():
 
 
 
214
  details['full_name'] = name_part
 
 
 
 
 
 
 
 
 
 
 
 
215
  break
216
- elif not next_line.startswith(('ADDRESS', 'BIRTH', 'CITIZEN', 'GENDER', 'ID')) and len(next_line) > 2:
217
- if ":" not in next_line or (":" in next_line and next_line.index(':') < 3):
218
- name_part = next_line.replace(':', '').strip()
219
- if name_part and len(name_part) > 2:
220
- details['full_name'] = name_part
221
- break
222
 
223
  # Also check for name patterns that start with colon (OCR sometimes splits NAME label)
 
224
  if not details['full_name'] and line_stripped.startswith(':') and len(line_stripped) > 5:
225
  name_candidate = line_stripped[1:].strip()
226
- # Check if it looks like a name (has commas, multiple words, etc.)
227
- if ',' in name_candidate or (len(name_candidate.split()) >= 2 and name_candidate.isupper()):
 
 
 
228
  # Make sure previous line wasn't ADDRESS or other label
229
  if i > 0:
230
  prev_line = lines[i-1].strip().upper()
231
- if "ADDRESS" not in prev_line and "BIRTH" not in prev_line:
232
  details['full_name'] = name_candidate
 
233
 
234
  # Extract Address
235
  if "ADDRESS" in line_upper and not details['address']:
@@ -323,23 +350,51 @@ def extract_police_details(lines):
323
  details['citizenship'] = parts[1].strip()
324
 
325
  # Extract Gender - handle cases where GENDER and value are on separate lines
 
326
  if "GENDER" in line_upper and not details['gender']:
 
327
  if ":" in line:
328
  parts = line.split(':', 1)
329
  if len(parts) > 1:
330
- details['gender'] = parts[1].strip()
331
- elif i + 1 < len(lines):
332
- next_line = lines[i+1].strip()
333
- if next_line.startswith(':') and len(next_line) > 1:
334
- gender_part = next_line[1:].strip()
335
  if gender_part in ['MALE', 'FEMALE', 'M', 'F']:
336
- details['gender'] = gender_part
337
- elif ":" in next_line:
338
- parts = next_line.split(':', 1)
339
- if len(parts) > 1:
340
- gender_part = parts[1].strip()
 
 
 
 
 
 
 
 
 
 
 
 
341
  if gender_part in ['MALE', 'FEMALE', 'M', 'F']:
342
- details['gender'] = gender_part
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
343
 
344
  # Extract ID Number (Usually "ID No.:" or near QR code)
345
  if "ID NO" in line_upper or "ID NO." in line_upper:
 
197
  line_stripped = line.strip()
198
 
199
  # Extract Name - handle cases where NAME and value are on separate lines
200
+ # Format: 'NAME' on one line, ':IRENE TIMBAL VILLAFUERTE' on next line
201
  if "NAME" in line_upper and not details['full_name']:
202
+ # First, check if name is on the same line after colon
203
  if ":" in line:
204
  parts = line.split(':', 1)
205
  if len(parts) > 1:
206
  name_part = parts[1].strip()
207
+ # Validate it's actually a name (not descriptive text)
208
+ if name_part and len(name_part) > 2 and not any(word in name_part.upper() for word in ['THUMBMARK', 'APPEARING', 'HEREIN', 'HASUNDERGONE', 'RECORD', 'VERIFICATION', 'THROUGH', 'CRIME', 'DATABASES', 'RESULT']):
209
  details['full_name'] = name_part
210
+ print(f"DEBUG: Found full name (same line): {details['full_name']}", file=sys.stderr)
211
+ continue
212
+
213
+ # Check next few lines for name value (prioritize lines starting with colon)
214
+ if i + 1 < len(lines):
215
+ for j in range(1, min(5, len(lines) - i)):
216
  next_line = lines[i+j].strip()
217
+ next_upper = next_line.upper()
218
+
219
+ # Skip if it's clearly a label or descriptive text
220
+ if any(word in next_upper for word in ['ADDRESS', 'BIRTH', 'CITIZEN', 'GENDER', 'ID', 'THUMBMARK', 'APPEARING', 'HEREIN', 'HASUNDERGONE', 'RECORD', 'VERIFICATION', 'THROUGH', 'CRIME', 'DATABASES', 'RESULT', 'CERTIFY', 'PERSON', 'WHOSE', 'PHOTO', 'SIGNATURE']):
221
+ continue
222
+
223
+ # Priority: Line starting with colon (most reliable format)
224
  if next_line.startswith(':') and len(next_line) > 1:
225
  name_part = next_line[1:].strip()
226
+ # Validate it looks like a name (has letters, reasonable length, not descriptive text)
227
+ if (name_part and len(name_part) > 3 and
228
+ re.search(r'[A-Za-z]{2,}', name_part) and
229
+ not any(word in name_part.upper() for word in ['THUMBMARK', 'APPEARING', 'HEREIN', 'HASUNDERGONE', 'RECORD', 'VERIFICATION'])):
230
  details['full_name'] = name_part
231
+ print(f"DEBUG: Found full name (colon line): {details['full_name']}", file=sys.stderr)
232
+ break
233
+
234
+ # Fallback: Line that looks like a name (all caps, multiple words, reasonable length)
235
+ elif (re.match(r'^[A-Z\s,]+$', next_line) and
236
+ len(next_line.split()) >= 2 and
237
+ len(next_line) > 5 and
238
+ len(next_line) < 50): # Names are usually not too long
239
+ # Make sure it's not descriptive text
240
+ if not any(word in next_upper for word in ['THUMBMARK', 'APPEARING', 'HEREIN', 'HASUNDERGONE', 'RECORD', 'VERIFICATION', 'THROUGH', 'CRIME']):
241
+ details['full_name'] = next_line
242
+ print(f"DEBUG: Found full name (all caps line): {details['full_name']}", file=sys.stderr)
243
  break
 
 
 
 
 
 
244
 
245
  # Also check for name patterns that start with colon (OCR sometimes splits NAME label)
246
+ # But only if we haven't found a name yet
247
  if not details['full_name'] and line_stripped.startswith(':') and len(line_stripped) > 5:
248
  name_candidate = line_stripped[1:].strip()
249
+ # Check if it looks like a name (has letters, reasonable length, not descriptive text)
250
+ if (re.search(r'[A-Za-z]{2,}', name_candidate) and
251
+ len(name_candidate) > 3 and
252
+ len(name_candidate) < 50 and
253
+ not any(word in name_candidate.upper() for word in ['THUMBMARK', 'APPEARING', 'HEREIN', 'HASUNDERGONE', 'RECORD', 'VERIFICATION', 'THROUGH', 'CRIME', 'ADDRESS', 'BIRTH'])):
254
  # Make sure previous line wasn't ADDRESS or other label
255
  if i > 0:
256
  prev_line = lines[i-1].strip().upper()
257
+ if "ADDRESS" not in prev_line and "BIRTH" not in prev_line and "CITIZEN" not in prev_line:
258
  details['full_name'] = name_candidate
259
+ print(f"DEBUG: Found full name (colon pattern): {details['full_name']}", file=sys.stderr)
260
 
261
  # Extract Address
262
  if "ADDRESS" in line_upper and not details['address']:
 
350
  details['citizenship'] = parts[1].strip()
351
 
352
  # Extract Gender - handle cases where GENDER and value are on separate lines
353
+ # Format: 'GENDER' on one line, 'FEMALE' or 'MALE' on next line
354
  if "GENDER" in line_upper and not details['gender']:
355
+ # First, check if gender is on the same line after colon
356
  if ":" in line:
357
  parts = line.split(':', 1)
358
  if len(parts) > 1:
359
+ gender_part = parts[1].strip().upper()
 
 
 
 
360
  if gender_part in ['MALE', 'FEMALE', 'M', 'F']:
361
+ details['gender'] = gender_part.capitalize() if len(gender_part) > 1 else gender_part
362
+ print(f"DEBUG: Found gender (same line): {details['gender']}", file=sys.stderr)
363
+ continue
364
+
365
+ # Check next few lines for gender value
366
+ if i + 1 < len(lines):
367
+ for j in range(1, min(4, len(lines) - i)):
368
+ next_line = lines[i+j].strip()
369
+ next_upper = next_line.upper()
370
+
371
+ # Skip if it's clearly a label
372
+ if any(label in next_upper for label in ['NAME', 'ADDRESS', 'BIRTH', 'CITIZEN', 'DATE', 'PLACE', 'PICTURE', 'SIGNATURE', 'THUMBMARK']):
373
+ continue
374
+
375
+ # Check if line starts with colon
376
+ if next_line.startswith(':') and len(next_line) > 1:
377
+ gender_part = next_line[1:].strip().upper()
378
  if gender_part in ['MALE', 'FEMALE', 'M', 'F']:
379
+ details['gender'] = gender_part.capitalize() if len(gender_part) > 1 else gender_part
380
+ print(f"DEBUG: Found gender (colon line): {details['gender']}", file=sys.stderr)
381
+ break
382
+
383
+ # Check if the line itself is the gender value
384
+ elif next_upper in ['MALE', 'FEMALE', 'M', 'F']:
385
+ details['gender'] = next_line.capitalize() if len(next_line) > 1 else next_line
386
+ print(f"DEBUG: Found gender (direct): {details['gender']}", file=sys.stderr)
387
+ break
388
+
389
+ # Check if line contains colon with gender value
390
+ elif ":" in next_line:
391
+ parts = next_line.split(':', 1)
392
+ if len(parts) > 1:
393
+ gender_part = parts[1].strip().upper()
394
+ if gender_part in ['MALE', 'FEMALE', 'M', 'F']:
395
+ details['gender'] = gender_part.capitalize() if len(gender_part) > 1 else gender_part
396
+ print(f"DEBUG: Found gender (colon in line): {details['gender']}", file=sys.stderr)
397
+ break
398
 
399
  # Extract ID Number (Usually "ID No.:" or near QR code)
400
  if "ID NO" in line_upper or "ID NO." in line_upper: