takomattyy committed on
Commit
6916300
·
verified ·
1 Parent(s): 7908d00

Upload 10 files

Browse files
extract_drivers_license.py CHANGED
@@ -271,7 +271,7 @@ def extract_drivers_license_info(lines):
271
  i += 1
272
 
273
  result = {
274
- 'id_type': 'drivers_license',
275
  'license_number': license_number,
276
  'id_number': license_number, # for frontend compatibility
277
  'full_name': full_name,
 
271
  i += 1
272
 
273
  result = {
274
+ 'id_type': "Driver's License",
275
  'license_number': license_number,
276
  'id_number': license_number, # for frontend compatibility
277
  'full_name': full_name,
extract_national_id.py CHANGED
@@ -192,24 +192,62 @@ def format_birth_date(date_str):
192
 
193
  def capitalize_name(name):
194
  """
195
- Properly capitalize name string.
196
 
197
  Args:
198
  name (str): Raw name string from OCR
199
 
200
  Returns:
201
- str: Properly capitalized name
202
 
203
  Why this is needed:
204
  - OCR often produces inconsistent capitalization
 
205
  - Need standardized name format for database storage
206
  - Handles multiple spaces and OCR artifacts
207
  """
208
  if not name:
209
  return name
210
 
211
- # Capitalize each word, handling possible multiple spaces or OCR errors
212
- return ' '.join([w.capitalize() for w in name.split()])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
213
 
214
  # OCR Function
215
  def extract_id_info(lines):
@@ -239,6 +277,7 @@ def extract_id_info(lines):
239
  # Process each line to find relevant information
240
  for i in range(len(lines)):
241
  line = lines[i]
 
242
  print(f"DEBUG: Processing line {i}: '{line}'", file=sys.stderr)
243
 
244
  # Check for National ID number format: XXXX-XXXX-XXXX-XXXX
@@ -247,21 +286,31 @@ def extract_id_info(lines):
247
  id_number = line
248
  print(f"DEBUG: Found ID number: {id_number}", file=sys.stderr)
249
 
250
- # Look for bilingual "Last Name" label
251
  # Philippine IDs often have both English and Filipino labels
252
- if line == "Apelyido/Last Name" and i+1 < len(lines):
253
  last_name = lines[i+1]
254
  print(f"DEBUG: Found last name: {last_name}", file=sys.stderr)
255
 
256
- # Look for bilingual "Given Names" label
257
- if line == "Mga Pangalan/Given Names" and i+1 < len(lines):
258
  given_names = lines[i+1]
259
  print(f"DEBUG: Found given names: {given_names}", file=sys.stderr)
260
 
261
- # Look for bilingual "Date of Birth" label
262
- if line == "Petsa ng Kapanganakan/Date of Birth" and i+1 < len(lines):
263
- birth_date = lines[i+1]
264
- print(f"DEBUG: Found birth date: {birth_date}", file=sys.stderr)
 
 
 
 
 
 
 
 
 
 
265
 
266
  # Compose full name from separate fields
267
  # Philippine names typically follow: Given Names + Last Name
@@ -274,6 +323,7 @@ def extract_id_info(lines):
274
 
275
  # Return structured result
276
  result = {
 
277
  'id_number': id_number,
278
  'full_name': full_name,
279
  'birth_date': formatted_birth_date
@@ -310,22 +360,44 @@ def extract_ocr_lines(image_path):
310
  use_textline_orientation=False, # Disable for better performance
311
  lang='en' # English language
312
  )
313
- results = ocr.predict(image_path)
314
 
315
  # Process OCR results directly
316
  all_text = []
317
  try:
318
- lines = results[0] if results and isinstance(results[0], list) else results
319
- for item in lines:
320
- if isinstance(item, (list, tuple)) and len(item) >= 2:
321
- meta = item[1]
322
- if isinstance(meta, (list, tuple)) and len(meta) >= 1:
323
- all_text.append(str(meta[0]))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
324
  except Exception as e:
325
  print(f"DEBUG: Error processing OCR results: {str(e)}", file=sys.stderr)
326
 
327
  print(f"DEBUG: Extracted text lines: {all_text}", file=sys.stderr)
328
- return extract_id_info(all_text) if all_text else {'id_number': None, 'full_name': None, 'birth_date': None}
329
 
330
  # Main execution
331
  if __name__ == "__main__":
 
192
 
193
# Given names that OCR is known to fuse together (e.g. "CARLMATTHEW").
# Kept as a frozenset so membership tests are O(1) and the collection is
# built once instead of on every character of every word.
_KNOWN_GIVEN_NAMES = frozenset({
    'CARL', 'MATTHEW', 'JOHN', 'MARK', 'LUKE', 'PAUL', 'MARIA', 'JOSE',
    'JUAN', 'PEDRO', 'MIGUEL', 'ANGEL', 'LUIS', 'CARLOS', 'MARCO', 'ANDRE',
    'ALBERT', 'JOY',
})

# Shortest entry in _KNOWN_GIVEN_NAMES; no candidate part can be shorter.
_MIN_NAME_PART_LEN = 3


def _split_known_names(word):
    """
    Decompose an upper-case word into a sequence of known given names.

    Args:
        word (str): Upper-case token with no spaces

    Returns:
        list[str] | None: The known names that concatenate exactly to
        ``word`` (an empty list for an empty string), or None when no such
        decomposition exists.
    """
    if not word:
        return []

    # Try the longest candidate first so "CARLOS" wins over "CARL" + "OS",
    # and backtrack when a choice leaves a tail that cannot be decomposed.
    for end in range(len(word), _MIN_NAME_PART_LEN - 1, -1):
        head = word[:end]
        if head in _KNOWN_GIVEN_NAMES:
            tail = _split_known_names(word[end:])
            if tail is not None:
                return [head] + tail
    return None


def capitalize_name(name):
    """
    Properly capitalize name string and split concatenated names.

    Args:
        name (str): Raw name string from OCR

    Returns:
        str: Properly capitalized name with single spaces between name
        parts. Falsy input (None or "") is returned unchanged.

    Why this is needed:
    - OCR often produces inconsistent capitalization
    - OCR may concatenate multiple given names without spaces
    - Need standardized name format for database storage
    - Handles multiple spaces and OCR artifacts

    Splitting rule:
    - Only all-caps words longer than 5 characters are considered.
    - A word is split only when it can be decomposed *entirely* into known
      given names ("CARLMATTHEW" -> "Carl Matthew"). Anything else is left
      as one word, so "CARLOS", "ANDREA" or "JOHNSON" are not corrupted
      into "Carl Os", "Andre A" or "John Son".
    """
    if not name:
        return name

    processed_words = []

    # split() with no argument also collapses runs of whitespace.
    for word in name.split():
        parts = None
        if word.isupper() and len(word) > 5:
            parts = _split_known_names(word)

        # A single-element result means the word itself is one known name,
        # so there is nothing to split.
        if parts and len(parts) > 1:
            processed_words.extend(part.capitalize() for part in parts)
        else:
            processed_words.append(word.capitalize())

    return ' '.join(processed_words)
251
 
252
  # OCR Function
253
  def extract_id_info(lines):
 
277
  # Process each line to find relevant information
278
  for i in range(len(lines)):
279
  line = lines[i]
280
+ line_upper = line.upper().replace(' ', '') if isinstance(line, str) else ''
281
  print(f"DEBUG: Processing line {i}: '{line}'", file=sys.stderr)
282
 
283
  # Check for National ID number format: XXXX-XXXX-XXXX-XXXX
 
286
  id_number = line
287
  print(f"DEBUG: Found ID number: {id_number}", file=sys.stderr)
288
 
289
+ # Look for bilingual "Last Name" label (flexible matching)
290
  # Philippine IDs often have both English and Filipino labels
291
+ if ('APELYIDO' in line_upper and 'LASTNAME' in line_upper) and i+1 < len(lines):
292
  last_name = lines[i+1]
293
  print(f"DEBUG: Found last name: {last_name}", file=sys.stderr)
294
 
295
+ # Look for bilingual "Given Names" label (flexible matching)
296
+ if ('PANGALAN' in line_upper and 'GIVENNAMES' in line_upper) and i+1 < len(lines):
297
  given_names = lines[i+1]
298
  print(f"DEBUG: Found given names: {given_names}", file=sys.stderr)
299
 
300
+ # Look for bilingual "Date of Birth" label (flexible matching)
301
+ if ('KAPANGANAKAN' in line_upper or ('DATEOF' in line_upper and 'BIRTH' in line_upper)):
302
+ # Look ahead for the actual date value (skip any labels)
303
+ for j in range(i+1, min(i+4, len(lines))):
304
+ next_line = lines[j]
305
+ next_upper = next_line.upper().replace(' ', '') if isinstance(next_line, str) else ''
306
+ # Skip if it's another label
307
+ if any(keyword in next_upper for keyword in ['DIGITAL', 'NUMBER', 'ADDRESS', 'TIRAHAN', 'ID']):
308
+ continue
309
+ # Check if it looks like a date (contains month name or digits)
310
+ if any(month in next_line for month in ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December', 'Jan', 'Feb', 'Mar', 'Apr', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']) or re.search(r'\d{1,2}[,.\s]+\d{4}', next_line):
311
+ birth_date = next_line
312
+ print(f"DEBUG: Found birth date: {birth_date}", file=sys.stderr)
313
+ break
314
 
315
  # Compose full name from separate fields
316
  # Philippine names typically follow: Given Names + Last Name
 
323
 
324
  # Return structured result
325
  result = {
326
+ 'id_type': 'National ID',
327
  'id_number': id_number,
328
  'full_name': full_name,
329
  'birth_date': formatted_birth_date
 
360
  use_textline_orientation=False, # Disable for better performance
361
  lang='en' # English language
362
  )
363
+ results = ocr.ocr(image_path)
364
 
365
  # Process OCR results directly
366
  all_text = []
367
  try:
368
+ # Handle both old format (list) and new format (OCRResult object)
369
+ if results and isinstance(results, list) and len(results) > 0:
370
+ first_item = results[0]
371
+ item_type_name = type(first_item).__name__
372
+ is_ocr_result = 'OCRResult' in item_type_name or 'ocr_result' in str(type(first_item)).lower()
373
+
374
+ if is_ocr_result:
375
+ print(f"DEBUG: Detected OCRResult object format (type: {item_type_name})", file=sys.stderr)
376
+ # Access OCRResult as dictionary
377
+ try:
378
+ if hasattr(first_item, 'keys'):
379
+ ocr_dict = dict(first_item)
380
+ # Look for rec_texts key
381
+ if 'rec_texts' in ocr_dict:
382
+ rec_texts = ocr_dict['rec_texts']
383
+ if isinstance(rec_texts, list):
384
+ all_text = [str(t) for t in rec_texts if t]
385
+ print(f"DEBUG: Extracted {len(all_text)} text lines from rec_texts", file=sys.stderr)
386
+ except Exception as e:
387
+ print(f"DEBUG: Error accessing OCRResult: {e}", file=sys.stderr)
388
+ else:
389
+ # Old format - list of lists
390
+ lines = results[0] if results and isinstance(results[0], list) else results
391
+ for item in lines:
392
+ if isinstance(item, (list, tuple)) and len(item) >= 2:
393
+ meta = item[1]
394
+ if isinstance(meta, (list, tuple)) and len(meta) >= 1:
395
+ all_text.append(str(meta[0]))
396
  except Exception as e:
397
  print(f"DEBUG: Error processing OCR results: {str(e)}", file=sys.stderr)
398
 
399
  print(f"DEBUG: Extracted text lines: {all_text}", file=sys.stderr)
400
+ return extract_id_info(all_text) if all_text else {'id_type': 'National ID', 'id_number': None, 'full_name': None, 'birth_date': None}
401
 
402
  # Main execution
403
  if __name__ == "__main__":
extract_nbi_ocr.py CHANGED
@@ -80,6 +80,7 @@ def extract_nbi_id(lines):
80
  break
81
 
82
  return {
 
83
  'id_number': nbi_id,
84
  'full_name': None,
85
  'birth_date': None,
@@ -109,7 +110,7 @@ def extract_ocr_lines_simple(image_path):
109
  except Exception:
110
  pass
111
 
112
- return extract_nbi_id(all_text) if all_text else {'id_number': None, 'full_name': None, 'birth_date': None, 'success': False}
113
 
114
  def extract_ocr_lines(image_path):
115
  # Check if file exists and has content
@@ -145,7 +146,7 @@ def extract_ocr_lines(image_path):
145
  print(f"DEBUG: Error processing OCR results: {str(e)}", file=sys.stderr)
146
 
147
  print(f"DEBUG: Extracted text lines: {all_text}", file=sys.stderr)
148
- return extract_nbi_id(all_text) if all_text else {'id_number': None, 'full_name': None, 'birth_date': None, 'success': False}
149
 
150
  # Main
151
  if len(sys.argv) < 2:
 
80
  break
81
 
82
  return {
83
+ 'clearance_type': 'nbi',
84
  'id_number': nbi_id,
85
  'full_name': None,
86
  'birth_date': None,
 
110
  except Exception:
111
  pass
112
 
113
+ return extract_nbi_id(all_text) if all_text else {'clearance_type': 'nbi', 'id_number': None, 'full_name': None, 'birth_date': None, 'success': False}
114
 
115
  def extract_ocr_lines(image_path):
116
  # Check if file exists and has content
 
146
  print(f"DEBUG: Error processing OCR results: {str(e)}", file=sys.stderr)
147
 
148
  print(f"DEBUG: Extracted text lines: {all_text}", file=sys.stderr)
149
+ return extract_nbi_id(all_text) if all_text else {'clearance_type': 'nbi', 'id_number': None, 'full_name': None, 'birth_date': None, 'success': False}
150
 
151
  # Main
152
  if len(sys.argv) < 2:
extract_passport.py CHANGED
@@ -373,7 +373,7 @@ def extract_passport_info(lines):
373
 
374
  # Return structured result
375
  result = {
376
- "id_type": "passport",
377
  "passport_number": passport_number,
378
  "id_number": passport_number,
379
  "full_name": full_name,
@@ -426,7 +426,7 @@ def extract_ocr_lines(image_path):
426
 
427
  dprint("All direct texts", all_text)
428
  return extract_passport_info(all_text) if all_text else {
429
- "id_type": "passport",
430
  "passport_number": None,
431
  "id_number": None,
432
  "full_name": None,
 
373
 
374
  # Return structured result
375
  result = {
376
+ "id_type": "Passport",
377
  "passport_number": passport_number,
378
  "id_number": passport_number,
379
  "full_name": full_name,
 
426
 
427
  dprint("All direct texts", all_text)
428
  return extract_passport_info(all_text) if all_text else {
429
+ "id_type": "Passport",
430
  "passport_number": None,
431
  "id_number": None,
432
  "full_name": None,
extract_phic.py CHANGED
@@ -144,6 +144,7 @@ def format_address(address_lines):
144
 
145
  def extract_phic_details(lines):
146
  details = {
 
147
  'id_number': None,
148
  'full_name': None,
149
  'birth_date': None,
@@ -354,6 +355,7 @@ def extract_ocr_lines(image_path):
354
  print(f"DEBUG: Extracted text lines: {all_text}", file=sys.stderr)
355
 
356
  return extract_phic_details(all_text) if all_text else {
 
357
  'id_number': None,
358
  'full_name': None,
359
  'birth_date': None,
 
144
 
145
  def extract_phic_details(lines):
146
  details = {
147
+ 'id_type': 'PHIC',
148
  'id_number': None,
149
  'full_name': None,
150
  'birth_date': None,
 
355
  print(f"DEBUG: Extracted text lines: {all_text}", file=sys.stderr)
356
 
357
  return extract_phic_details(all_text) if all_text else {
358
+ 'id_type': 'PHIC',
359
  'id_number': None,
360
  'full_name': None,
361
  'birth_date': None,
extract_police_ocr.py CHANGED
@@ -177,6 +177,7 @@ def format_birth_date(date):
177
 
178
  def extract_police_details(lines):
179
  details = {
 
180
  'id_number': None,
181
  'full_name': None,
182
  'address': None,
@@ -575,7 +576,7 @@ def extract_ocr_lines(image_path):
575
 
576
  print(f"DEBUG: Extracted text lines: {all_text}", file=sys.stderr)
577
 
578
- return extract_police_details(all_text) if all_text else {'id_number': None, 'full_name': None, 'address': None, 'birth_date': None, 'birth_place': None, 'citizenship': None, 'gender': None, 'status': None, 'success': False}
579
 
580
  def extract_ocr_lines_simple(image_path):
581
  # Fallback method with advanced features (matching NBI script fallback)
@@ -762,7 +763,7 @@ def extract_ocr_lines_simple(image_path):
762
 
763
  print(f"DEBUG (fallback): Extracted text lines: {all_text}", file=sys.stderr)
764
 
765
- return extract_police_details(all_text) if all_text else {'id_number': None, 'full_name': None, 'address': None, 'birth_date': None, 'birth_place': None, 'citizenship': None, 'gender': None, 'status': None, 'success': False}
766
 
767
  # Main Execution
768
  if len(sys.argv) < 2:
 
177
 
178
  def extract_police_details(lines):
179
  details = {
180
+ 'clearance_type': 'police',
181
  'id_number': None,
182
  'full_name': None,
183
  'address': None,
 
576
 
577
  print(f"DEBUG: Extracted text lines: {all_text}", file=sys.stderr)
578
 
579
+ return extract_police_details(all_text) if all_text else {'clearance_type': 'police', 'id_number': None, 'full_name': None, 'address': None, 'birth_date': None, 'birth_place': None, 'citizenship': None, 'gender': None, 'status': None, 'success': False}
580
 
581
  def extract_ocr_lines_simple(image_path):
582
  # Fallback method with advanced features (matching NBI script fallback)
 
763
 
764
  print(f"DEBUG (fallback): Extracted text lines: {all_text}", file=sys.stderr)
765
 
766
+ return extract_police_details(all_text) if all_text else {'clearance_type': 'police', 'id_number': None, 'full_name': None, 'address': None, 'birth_date': None, 'birth_place': None, 'citizenship': None, 'gender': None, 'status': None, 'success': False}
767
 
768
  # Main Execution
769
  if len(sys.argv) < 2:
extract_postal.py CHANGED
@@ -128,6 +128,7 @@ def format_address(address_lines):
128
 
129
  def extract_postal_details(lines):
130
  details = {
 
131
  'prn': None,
132
  'full_name': None,
133
  'address': None,
@@ -367,6 +368,7 @@ def extract_ocr_lines(image_path):
367
  print(f"DEBUG: Extracted text lines: {all_text}", file=sys.stderr)
368
 
369
  return extract_postal_details(all_text) if all_text else {
 
370
  'prn': None,
371
  'full_name': None,
372
  'address': None,
 
128
 
129
  def extract_postal_details(lines):
130
  details = {
131
+ 'id_type': 'Postal ID',
132
  'prn': None,
133
  'full_name': None,
134
  'address': None,
 
368
  print(f"DEBUG: Extracted text lines: {all_text}", file=sys.stderr)
369
 
370
  return extract_postal_details(all_text) if all_text else {
371
+ 'id_type': 'Postal ID',
372
  'prn': None,
373
  'full_name': None,
374
  'address': None,
extract_prc.py CHANGED
@@ -295,7 +295,7 @@ def extract_prc_info(lines):
295
 
296
  # Return structured result
297
  result = {
298
- "id_type": "prc",
299
  "crn": crn,
300
  "id_number": registration_number or crn, # Frontend expects id_number
301
  "registration_number": registration_number,
@@ -340,7 +340,7 @@ def extract_ocr_lines(image_path):
340
 
341
  dprint("All direct texts", all_text)
342
  return extract_prc_info(all_text) if all_text else {
343
- "id_type": "prc",
344
  "crn": None,
345
  "full_name": None,
346
  "birth_date": None
 
295
 
296
  # Return structured result
297
  result = {
298
+ "id_type": "PRC ID",
299
  "crn": crn,
300
  "id_number": registration_number or crn, # Frontend expects id_number
301
  "registration_number": registration_number,
 
340
 
341
  dprint("All direct texts", all_text)
342
  return extract_prc_info(all_text) if all_text else {
343
+ "id_type": "PRC ID",
344
  "crn": None,
345
  "full_name": None,
346
  "birth_date": None
extract_sss.py CHANGED
@@ -136,7 +136,7 @@ def extract_sss_info(lines):
136
  dprint("Composed name from all parts", {"parts": name_parts, "result": full_name})
137
 
138
  result = {
139
- "id_type": "sss",
140
  "sss_number": sss_number,
141
  "id_number": sss_id_number,
142
  "full_name": full_name,
@@ -178,7 +178,7 @@ def extract_ocr_lines(image_path):
178
 
179
  dprint("All direct texts", all_text)
180
  return extract_sss_info(all_text) if all_text else {
181
- "id_type": "sss",
182
  "sss_number": None,
183
  "id_number": None,
184
  "full_name": None,
 
136
  dprint("Composed name from all parts", {"parts": name_parts, "result": full_name})
137
 
138
  result = {
139
+ "id_type": "SSS ID",
140
  "sss_number": sss_number,
141
  "id_number": sss_id_number,
142
  "full_name": full_name,
 
178
 
179
  dprint("All direct texts", all_text)
180
  return extract_sss_info(all_text) if all_text else {
181
+ "id_type": "SSS ID",
182
  "sss_number": None,
183
  "id_number": None,
184
  "full_name": None,
extract_umid.py CHANGED
@@ -214,7 +214,7 @@ def extract_umid_info(lines):
214
  dprint("Composed full_name", {"last": last_name_txt, "given": given_name_txt, "full": full_name})
215
 
216
  result = {
217
- "id_type": "umid",
218
  "crn": crn,
219
  "id_number": crn, # frontend expects this
220
  "full_name": full_name,
@@ -255,7 +255,7 @@ def extract_ocr_lines(image_path):
255
 
256
  dprint("All direct texts", all_text)
257
  return extract_umid_info(all_text) if all_text else {
258
- "id_type": "umid",
259
  "crn": None,
260
  "id_number": None,
261
  "full_name": None,
 
214
  dprint("Composed full_name", {"last": last_name_txt, "given": given_name_txt, "full": full_name})
215
 
216
  result = {
217
+ "id_type": "UMID",
218
  "crn": crn,
219
  "id_number": crn, # frontend expects this
220
  "full_name": full_name,
 
255
 
256
  dprint("All direct texts", all_text)
257
  return extract_umid_info(all_text) if all_text else {
258
+ "id_type": "UMID",
259
  "crn": None,
260
  "id_number": None,
261
  "full_name": None,