Spaces:

takomattyy
/

handyhome-ocr-api

Sleeping

App Files Community

takomattyy committed 21 days ago

Commit

6916300

verified ·

1 Parent(s): 7908d00

Upload 10 files

Browse files

Files changed (10) hide show

extract_drivers_license.py +1 -1
extract_national_id.py +92 -20
extract_nbi_ocr.py +3 -2
extract_passport.py +2 -2
extract_phic.py +2 -0
extract_police_ocr.py +3 -2
extract_postal.py +2 -0
extract_prc.py +2 -2
extract_sss.py +2 -2
extract_umid.py +2 -2

extract_drivers_license.py CHANGED Viewed

@@ -271,7 +271,7 @@ def extract_drivers_license_info(lines):
         i += 1
     result = {
-        'id_type': 'drivers_license',
         'license_number': license_number,
         'id_number': license_number,  # for frontend compatibility
         'full_name': full_name,

         i += 1
     result = {
+        'id_type': "Driver's License",
         'license_number': license_number,
         'id_number': license_number,  # for frontend compatibility
         'full_name': full_name,

extract_national_id.py CHANGED Viewed

@@ -192,24 +192,62 @@ def format_birth_date(date_str):
 def capitalize_name(name):
     """
-    Properly capitalize name string.
     Args:
         name (str): Raw name string from OCR
     Returns:
-        str: Properly capitalized name
     Why this is needed:
     - OCR often produces inconsistent capitalization
     - Need standardized name format for database storage
     - Handles multiple spaces and OCR artifacts
     """
     if not name:
         return name
-    # Capitalize each word, handling possible multiple spaces or OCR errors
-    return ' '.join([w.capitalize() for w in name.split()])
 # OCR Function
 def extract_id_info(lines):
@@ -239,6 +277,7 @@ def extract_id_info(lines):
     # Process each line to find relevant information
     for i in range(len(lines)):
         line = lines[i]
         print(f"DEBUG: Processing line {i}: '{line}'", file=sys.stderr)
         # Check for National ID number format: XXXX-XXXX-XXXX-XXXX
@@ -247,21 +286,31 @@ def extract_id_info(lines):
             id_number = line
             print(f"DEBUG: Found ID number: {id_number}", file=sys.stderr)
-        # Look for bilingual "Last Name" label
         # Philippine IDs often have both English and Filipino labels
-        if line == "Apelyido/Last Name" and i+1 < len(lines):
             last_name = lines[i+1]
             print(f"DEBUG: Found last name: {last_name}", file=sys.stderr)
-        # Look for bilingual "Given Names" label
-        if line == "Mga Pangalan/Given Names" and i+1 < len(lines):
             given_names = lines[i+1]
             print(f"DEBUG: Found given names: {given_names}", file=sys.stderr)
-        # Look for bilingual "Date of Birth" label
-        if line == "Petsa ng Kapanganakan/Date of Birth" and i+1 < len(lines):
-            birth_date = lines[i+1]
-            print(f"DEBUG: Found birth date: {birth_date}", file=sys.stderr)
     # Compose full name from separate fields
     # Philippine names typically follow: Given Names + Last Name
@@ -274,6 +323,7 @@ def extract_id_info(lines):
     # Return structured result
     result = {
         'id_number': id_number,
         'full_name': full_name,
         'birth_date': formatted_birth_date
@@ -310,22 +360,44 @@ def extract_ocr_lines(image_path):
             use_textline_orientation=False,     # Disable for better performance
             lang='en'                           # English language
         )
-        results = ocr.predict(image_path)
     # Process OCR results directly
     all_text = []
     try:
-        lines = results[0] if results and isinstance(results[0], list) else results
-        for item in lines:
-            if isinstance(item, (list, tuple)) and len(item) >= 2:
-                meta = item[1]
-                if isinstance(meta, (list, tuple)) and len(meta) >= 1:
-                    all_text.append(str(meta[0]))
     except Exception as e:
         print(f"DEBUG: Error processing OCR results: {str(e)}", file=sys.stderr)
     print(f"DEBUG: Extracted text lines: {all_text}", file=sys.stderr)
-    return extract_id_info(all_text) if all_text else {'id_number': None, 'full_name': None, 'birth_date': None}
 # Main execution
 if __name__ == "__main__":

 def capitalize_name(name):
     """
     Properly capitalize name string and split concatenated names.

     Args:
         name (str): Raw name string from OCR

     Returns:
         str: Properly capitalized name with single spaces between name
         parts. Falsy input (None, "") is returned unchanged.

     Why this is needed:
     - OCR often produces inconsistent capitalization
     - OCR may concatenate multiple given names without spaces
     - Need standardized name format for database storage
     - Handles multiple spaces and OCR artifacts

     Splitting rule:
     - Only all-uppercase words longer than 5 characters are considered.
     - A word is split only when it is made up ENTIRELY of known given
       names (e.g. "CARLMATTHEW" -> "Carl Matthew"). A word that merely
       starts with a known name is left intact, so "CARLOS", "JOSEPH" or
       "JOYCELYN" are not cut into "Carl Os", "Jose Ph" or "Joy Celyn".
     - Trade-off: a known name followed by an unlisted one
       (e.g. "CARLJOSHUA") is not split; extend known_names to cover it.
     """
     if not name:
         return name

     # Given names that may appear glued together in OCR output.
     known_names = frozenset((
         'CARL', 'MATTHEW', 'JOHN', 'MARK', 'LUKE', 'PAUL', 'MARIA',
         'JOSE', 'JUAN', 'PEDRO', 'MIGUEL', 'ANGEL', 'LUIS', 'CARLOS',
         'MARCO', 'ANDRE', 'ALBERT', 'JOY',
     ))
     shortest = min(len(n) for n in known_names)
     longest = max(len(n) for n in known_names)

     def split_known(word):
         """Return the known names that exactly tile word, or None."""
         # tiling[k] is a list of known names covering word[:k], if any.
         tiling = [None] * (len(word) + 1)
         tiling[0] = []
         for end in range(shortest, len(word) + 1):
             first_start = max(0, end - longest)
             for start in range(first_start, end - shortest + 1):
                 piece = word[start:end]
                 if tiling[start] is not None and piece in known_names:
                     tiling[end] = tiling[start] + [piece]
                     break
         return tiling[len(word)]

     processed_words = []
     # split() with no argument also collapses repeated whitespace.
     for word in name.split():
         parts = None
         # A word that is itself a known name must never be cut up.
         if word.isupper() and len(word) > 5 and word not in known_names:
             parts = split_known(word)
         if parts and len(parts) > 1:
             processed_words.extend(p.capitalize() for p in parts)
         else:
             processed_words.append(word.capitalize())
     return ' '.join(processed_words)
 # OCR Function
 def extract_id_info(lines):
     # Process each line to find relevant information
     for i in range(len(lines)):
         line = lines[i]
+        line_upper = line.upper().replace(' ', '') if isinstance(line, str) else ''
         print(f"DEBUG: Processing line {i}: '{line}'", file=sys.stderr)
         # Check for National ID number format: XXXX-XXXX-XXXX-XXXX
             id_number = line
             print(f"DEBUG: Found ID number: {id_number}", file=sys.stderr)
+        # Look for bilingual "Last Name" label (flexible matching)
         # Philippine IDs often have both English and Filipino labels
+        if ('APELYIDO' in line_upper and 'LASTNAME' in line_upper) and i+1 < len(lines):
             last_name = lines[i+1]
             print(f"DEBUG: Found last name: {last_name}", file=sys.stderr)
+        # Look for bilingual "Given Names" label (flexible matching)
+        if ('PANGALAN' in line_upper and 'GIVENNAMES' in line_upper) and i+1 < len(lines):
             given_names = lines[i+1]
             print(f"DEBUG: Found given names: {given_names}", file=sys.stderr)
+        # Look for bilingual "Date of Birth" label (flexible matching)
+        if ('KAPANGANAKAN' in line_upper or ('DATEOF' in line_upper and 'BIRTH' in line_upper)):
+            # Look ahead for the actual date value (skip any labels)
+            for j in range(i+1, min(i+4, len(lines))):
+                next_line = lines[j]
+                next_upper = next_line.upper().replace(' ', '') if isinstance(next_line, str) else ''
+                # Skip if it's another label
+                if any(keyword in next_upper for keyword in ['DIGITAL', 'NUMBER', 'ADDRESS', 'TIRAHAN', 'ID']):
+                    continue
+                # Check if it looks like a date (contains month name or digits)
+                if any(month in next_line for month in ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December', 'Jan', 'Feb', 'Mar', 'Apr', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']) or re.search(r'\d{1,2}[,.\s]+\d{4}', next_line):
+                    birth_date = next_line
+                    print(f"DEBUG: Found birth date: {birth_date}", file=sys.stderr)
+                    break
     # Compose full name from separate fields
     # Philippine names typically follow: Given Names + Last Name
     # Return structured result
     result = {
+        'id_type': 'National ID',
         'id_number': id_number,
         'full_name': full_name,
         'birth_date': formatted_birth_date
             use_textline_orientation=False,     # Disable for better performance
             lang='en'                           # English language
         )
+        results = ocr.ocr(image_path)
     # Process OCR results directly
     all_text = []
     try:
+        # Handle both old format (list) and new format (OCRResult object)
+        if results and isinstance(results, list) and len(results) > 0:
+            first_item = results[0]
+            item_type_name = type(first_item).__name__
+            is_ocr_result = 'OCRResult' in item_type_name or 'ocr_result' in str(type(first_item)).lower()
+            if is_ocr_result:
+                print(f"DEBUG: Detected OCRResult object format (type: {item_type_name})", file=sys.stderr)
+                # Access OCRResult as dictionary
+                try:
+                    if hasattr(first_item, 'keys'):
+                        ocr_dict = dict(first_item)
+                        # Look for rec_texts key
+                        if 'rec_texts' in ocr_dict:
+                            rec_texts = ocr_dict['rec_texts']
+                            if isinstance(rec_texts, list):
+                                all_text = [str(t) for t in rec_texts if t]
+                                print(f"DEBUG: Extracted {len(all_text)} text lines from rec_texts", file=sys.stderr)
+                except Exception as e:
+                    print(f"DEBUG: Error accessing OCRResult: {e}", file=sys.stderr)
+            else:
+                # Old format - list of lists
+                lines = results[0] if results and isinstance(results[0], list) else results
+                for item in lines:
+                    if isinstance(item, (list, tuple)) and len(item) >= 2:
+                        meta = item[1]
+                        if isinstance(meta, (list, tuple)) and len(meta) >= 1:
+                            all_text.append(str(meta[0]))
     except Exception as e:
         print(f"DEBUG: Error processing OCR results: {str(e)}", file=sys.stderr)
     print(f"DEBUG: Extracted text lines: {all_text}", file=sys.stderr)
+    return extract_id_info(all_text) if all_text else {'id_type': 'National ID', 'id_number': None, 'full_name': None, 'birth_date': None}
 # Main execution
 if __name__ == "__main__":

extract_nbi_ocr.py CHANGED Viewed

@@ -80,6 +80,7 @@ def extract_nbi_id(lines):
                     break
     return {
         'id_number': nbi_id,
         'full_name': None,
         'birth_date': None,
@@ -109,7 +110,7 @@ def extract_ocr_lines_simple(image_path):
     except Exception:
         pass
-    return extract_nbi_id(all_text) if all_text else {'id_number': None, 'full_name': None, 'birth_date': None, 'success': False}
 def extract_ocr_lines(image_path):
     # Check if file exists and has content
@@ -145,7 +146,7 @@ def extract_ocr_lines(image_path):
         print(f"DEBUG: Error processing OCR results: {str(e)}", file=sys.stderr)
     print(f"DEBUG: Extracted text lines: {all_text}", file=sys.stderr)
-    return extract_nbi_id(all_text) if all_text else {'id_number': None, 'full_name': None, 'birth_date': None, 'success': False}
 # Main
 if len(sys.argv) < 2:

                     break
     return {
+        'clearance_type': 'nbi',
         'id_number': nbi_id,
         'full_name': None,
         'birth_date': None,
     except Exception:
         pass
+    return extract_nbi_id(all_text) if all_text else {'clearance_type': 'nbi', 'id_number': None, 'full_name': None, 'birth_date': None, 'success': False}
 def extract_ocr_lines(image_path):
     # Check if file exists and has content
         print(f"DEBUG: Error processing OCR results: {str(e)}", file=sys.stderr)
     print(f"DEBUG: Extracted text lines: {all_text}", file=sys.stderr)
+    return extract_nbi_id(all_text) if all_text else {'clearance_type': 'nbi', 'id_number': None, 'full_name': None, 'birth_date': None, 'success': False}
 # Main
 if len(sys.argv) < 2:

extract_passport.py CHANGED Viewed

@@ -373,7 +373,7 @@ def extract_passport_info(lines):
     # Return structured result
     result = {
-        "id_type": "passport",
         "passport_number": passport_number,
         "id_number": passport_number,
         "full_name": full_name,
@@ -426,7 +426,7 @@ def extract_ocr_lines(image_path):
     dprint("All direct texts", all_text)
     return extract_passport_info(all_text) if all_text else {
-        "id_type": "passport",
         "passport_number": None,
         "id_number": None,
         "full_name": None,

     # Return structured result
     result = {
+        "id_type": "Passport",
         "passport_number": passport_number,
         "id_number": passport_number,
         "full_name": full_name,
     dprint("All direct texts", all_text)
     return extract_passport_info(all_text) if all_text else {
+        "id_type": "Passport",
         "passport_number": None,
         "id_number": None,
         "full_name": None,

extract_phic.py CHANGED Viewed

@@ -144,6 +144,7 @@ def format_address(address_lines):
 def extract_phic_details(lines):
     details = {
         'id_number': None,
         'full_name': None,
         'birth_date': None,
@@ -354,6 +355,7 @@ def extract_ocr_lines(image_path):
     print(f"DEBUG: Extracted text lines: {all_text}", file=sys.stderr)
     return extract_phic_details(all_text) if all_text else {
         'id_number': None,
         'full_name': None,
         'birth_date': None,

 def extract_phic_details(lines):
     details = {
+        'id_type': 'PHIC',
         'id_number': None,
         'full_name': None,
         'birth_date': None,
     print(f"DEBUG: Extracted text lines: {all_text}", file=sys.stderr)
     return extract_phic_details(all_text) if all_text else {
+        'id_type': 'PHIC',
         'id_number': None,
         'full_name': None,
         'birth_date': None,

extract_police_ocr.py CHANGED Viewed

@@ -177,6 +177,7 @@ def format_birth_date(date):
 def extract_police_details(lines):
     details = {
         'id_number': None,
         'full_name': None,
         'address': None,
@@ -575,7 +576,7 @@ def extract_ocr_lines(image_path):
     print(f"DEBUG: Extracted text lines: {all_text}", file=sys.stderr)
-    return extract_police_details(all_text) if all_text else {'id_number': None, 'full_name': None, 'address': None, 'birth_date': None, 'birth_place': None, 'citizenship': None, 'gender': None, 'status': None, 'success': False}
 def extract_ocr_lines_simple(image_path):
     # Fallback method with advanced features (matching NBI script fallback)
@@ -762,7 +763,7 @@ def extract_ocr_lines_simple(image_path):
     print(f"DEBUG (fallback): Extracted text lines: {all_text}", file=sys.stderr)
-    return extract_police_details(all_text) if all_text else {'id_number': None, 'full_name': None, 'address': None, 'birth_date': None, 'birth_place': None, 'citizenship': None, 'gender': None, 'status': None, 'success': False}
 # Main Execution
 if len(sys.argv) < 2:

 def extract_police_details(lines):
     details = {
+        'clearance_type': 'police',
         'id_number': None,
         'full_name': None,
         'address': None,
     print(f"DEBUG: Extracted text lines: {all_text}", file=sys.stderr)
+    return extract_police_details(all_text) if all_text else {'clearance_type': 'police', 'id_number': None, 'full_name': None, 'address': None, 'birth_date': None, 'birth_place': None, 'citizenship': None, 'gender': None, 'status': None, 'success': False}
 def extract_ocr_lines_simple(image_path):
     # Fallback method with advanced features (matching NBI script fallback)
     print(f"DEBUG (fallback): Extracted text lines: {all_text}", file=sys.stderr)
+    return extract_police_details(all_text) if all_text else {'clearance_type': 'police', 'id_number': None, 'full_name': None, 'address': None, 'birth_date': None, 'birth_place': None, 'citizenship': None, 'gender': None, 'status': None, 'success': False}
 # Main Execution
 if len(sys.argv) < 2:

extract_postal.py CHANGED Viewed

@@ -128,6 +128,7 @@ def format_address(address_lines):
 def extract_postal_details(lines):
     details = {
         'prn': None,
         'full_name': None,
         'address': None,
@@ -367,6 +368,7 @@ def extract_ocr_lines(image_path):
     print(f"DEBUG: Extracted text lines: {all_text}", file=sys.stderr)
     return extract_postal_details(all_text) if all_text else {
         'prn': None,
         'full_name': None,
         'address': None,

 def extract_postal_details(lines):
     details = {
+        'id_type': 'Postal ID',
         'prn': None,
         'full_name': None,
         'address': None,
     print(f"DEBUG: Extracted text lines: {all_text}", file=sys.stderr)
     return extract_postal_details(all_text) if all_text else {
+        'id_type': 'Postal ID',
         'prn': None,
         'full_name': None,
         'address': None,

extract_prc.py CHANGED Viewed

@@ -295,7 +295,7 @@ def extract_prc_info(lines):
     # Return structured result
     result = {
-        "id_type": "prc",
         "crn": crn,
         "id_number": registration_number or crn,  # Frontend expects id_number
         "registration_number": registration_number,
@@ -340,7 +340,7 @@ def extract_ocr_lines(image_path):
     dprint("All direct texts", all_text)
     return extract_prc_info(all_text) if all_text else {
-        "id_type": "prc",
         "crn": None,
         "full_name": None,
         "birth_date": None

     # Return structured result
     result = {
+        "id_type": "PRC ID",
         "crn": crn,
         "id_number": registration_number or crn,  # Frontend expects id_number
         "registration_number": registration_number,
     dprint("All direct texts", all_text)
     return extract_prc_info(all_text) if all_text else {
+        "id_type": "PRC ID",
         "crn": None,
         "full_name": None,
         "birth_date": None

extract_sss.py CHANGED Viewed

@@ -136,7 +136,7 @@ def extract_sss_info(lines):
         dprint("Composed name from all parts", {"parts": name_parts, "result": full_name})
     result = {
-        "id_type": "sss",
         "sss_number": sss_number,
         "id_number": sss_id_number,
         "full_name": full_name,
@@ -178,7 +178,7 @@ def extract_ocr_lines(image_path):
     dprint("All direct texts", all_text)
     return extract_sss_info(all_text) if all_text else {
-        "id_type": "sss",
         "sss_number": None,
         "id_number": None,
         "full_name": None,

         dprint("Composed name from all parts", {"parts": name_parts, "result": full_name})
     result = {
+        "id_type": "SSS ID",
         "sss_number": sss_number,
         "id_number": sss_id_number,
         "full_name": full_name,
     dprint("All direct texts", all_text)
     return extract_sss_info(all_text) if all_text else {
+        "id_type": "SSS ID",
         "sss_number": None,
         "id_number": None,
         "full_name": None,

extract_umid.py CHANGED Viewed

@@ -214,7 +214,7 @@ def extract_umid_info(lines):
         dprint("Composed full_name", {"last": last_name_txt, "given": given_name_txt, "full": full_name})
     result = {
-        "id_type": "umid",
         "crn": crn,
         "id_number": crn,   # frontend expects this
         "full_name": full_name,
@@ -255,7 +255,7 @@ def extract_ocr_lines(image_path):
     dprint("All direct texts", all_text)
     return extract_umid_info(all_text) if all_text else {
-        "id_type": "umid",
         "crn": None,
         "id_number": None,
         "full_name": None,

         dprint("Composed full_name", {"last": last_name_txt, "given": given_name_txt, "full": full_name})
     result = {
+        "id_type": "UMID",
         "crn": crn,
         "id_number": crn,   # frontend expects this
         "full_name": full_name,
     dprint("All direct texts", all_text)
     return extract_umid_info(all_text) if all_text else {
+        "id_type": "UMID",
         "crn": None,
         "id_number": None,
         "full_name": None,