Spaces:

takomattyy
/

handyhome-ocr-api

Sleeping

App Files Files Community

takomattyy committed 24 days ago

Commit

2b089f9

verified ·

1 Parent(s): 4848007

Upload 2 files

Browse files

nbi and police ocr

Files changed (2) hide show

extract_nbi_ocr.py +305 -50
extract_police_ocr.py +80 -25

extract_nbi_ocr.py CHANGED Viewed

@@ -48,43 +48,238 @@ def download_image(url, output_path='temp_image.jpg'):
     return output_path
-# OCR Function to extract NBI ID NO
 def extract_nbi_id(lines):
     nbi_id = None
-    for i, line in enumerate(lines):
-        if isinstance(line, str):
-            # Look for "NBI ID NO:" pattern
-            if "NBI ID NO:" in line.upper() or "NBIIDNO" in line.upper():
                 # Extract the ID after the colon
-                parts = line.split(':')
-                if len(parts) > 1:
-                    nbi_id = parts[1].strip()
-                    break
-            # Also check if the next line contains the ID (in case it's on a separate line)
-            elif i < len(lines) - 1 and ("NBI ID NO:" in line.upper() or "NBI ID NO" in line.upper()):
-                next_line = lines[i + 1]
-                if isinstance(next_line, str) and len(next_line.strip()) > 5:
-                    nbi_id = next_line.strip()
-                    break
-    # If not found with "NBI ID NO:" pattern, look for the specific format
-    if not nbi_id:
-        for line in lines:
-            if isinstance(line, str):
-                # Look for pattern like HGUR87H38D-U47204A873 (alphanumeric with one hyphen)
-                pattern = r'[A-Z0-9]{10,12}-[A-Z0-9]{10,12}'
-                match = re.search(pattern, line)
-                if match:
-                    nbi_id = match.group()
-                    break
     return {
         'clearance_type': 'nbi',
         'id_number': nbi_id,
-        'full_name': None,
-        'birth_date': None,
-        'success': nbi_id is not None
     }
 def extract_ocr_lines_simple(image_path):
@@ -97,25 +292,54 @@ def extract_ocr_lines_simple(image_path):
             use_textline_orientation=True,     # Enable text line orientation
             lang='en'                          # Set language to English
         )
-        results = ocr.predict(image_path)
     all_text = []
     try:
-        lines = results[0] if results and isinstance(results[0], list) else results
-        for item in lines:
-            if isinstance(item, (list, tuple)) and len(item) >= 2:
-                meta = item[1]
-                if isinstance(meta, (list, tuple)) and len(meta) >= 1:
-                    all_text.append(str(meta[0]))
-    except Exception:
-        pass
-    return extract_nbi_id(all_text) if all_text else {'clearance_type': 'nbi', 'id_number': None, 'full_name': None, 'birth_date': None, 'success': False}
 def extract_ocr_lines(image_path):
     # Check if file exists and has content
     if not os.path.exists(image_path):
-        return {'id_number': None, 'success': False}
     # Ensure output directory exists
     os.makedirs("output", exist_ok=True)
@@ -131,22 +355,53 @@ def extract_ocr_lines(image_path):
             use_textline_orientation=False,
             lang='en'
         )
-        results = ocr.predict(image_path)
-    # Process OCR results directly
     all_text = []
     try:
-        lines = results[0] if results and isinstance(results[0], list) else results
-        for item in lines:
-            if isinstance(item, (list, tuple)) and len(item) >= 2:
-                meta = item[1]
-                if isinstance(meta, (list, tuple)) and len(meta) >= 1:
-                    all_text.append(str(meta[0]))
     except Exception as e:
         print(f"DEBUG: Error processing OCR results: {str(e)}", file=sys.stderr)
     print(f"DEBUG: Extracted text lines: {all_text}", file=sys.stderr)
-    return extract_nbi_id(all_text) if all_text else {'clearance_type': 'nbi', 'id_number': None, 'full_name': None, 'birth_date': None, 'success': False}
 # Main
 if len(sys.argv) < 2:

     return output_path
+# OCR Function to extract NBI ID NO, Name, Birth Date, and LIT
 def extract_nbi_id(lines):
     """Extract the NBI ID number, full name, birth date and LIT value from OCR text.

     The ID is searched first as a hyphenated token anywhere in the text, then
     (only if that fails) via an "NBI ID NO" label or looser OCR-degraded
     patterns. Name, birth date and LIT are read from the text after a colon on
     the label line or, failing that, from the lines that follow the label.

     Args:
         lines: Iterable of OCR text lines. Every item is converted with
             ``str()`` and stripped before matching.

     Returns:
         dict with keys ``clearance_type`` (always ``'nbi'``), ``id_number``,
         ``full_name``, ``birth_date``, ``lit`` (each ``None`` when not found)
         and ``success`` (True when an ID number or a full name was found).
     """
     cleaned_lines = [str(line).strip() for line in lines]
     full_name = None
     birth_date = None
     lit = None  # LIT field; its exact meaning is not established by this code

     # First pass: a hyphenated ID on a nearly standalone line is the most
     # reliable signal, so look for it everywhere before trusting any label.
     nbi_id = _find_hyphenated_id(cleaned_lines)

     # Second pass: label-driven extraction of the remaining fields.
     for i, line in enumerate(cleaned_lines):
         line_upper = line.upper()

         if not nbi_id:
             nbi_id = _find_id_fallback(cleaned_lines, i)
             if nbi_id:
                 # A line that yielded the ID is not inspected for other fields.
                 continue

         # Full name: a "NAME" label that is not part of an NBI ID label.
         if (not full_name and "NAME" in line_upper
                 and not ("NBI" in line_upper and "ID" in line_upper)):
             value = _after_colon(line)
             if value is not None and len(value) > 2 and _LETTERS_RE.search(value):
                 full_name = value
                 print(f"DEBUG: Found full name (same line): {full_name}", file=sys.stderr)
                 continue
             for next_line in cleaned_lines[i + 1:i + 5]:
                 if _has_label(next_line.upper(), _NAME_SKIP_LABELS):
                     continue
                 if len(next_line) > 3 and _LETTERS_RE.search(next_line):
                     full_name = next_line
                     print(f"DEBUG: Found full name: {full_name}", file=sys.stderr)
                     break

         # Birth date: any label mentioning both DATE and BIRTH
         # ("DATE OF BIRTH", "BIRTH DATE", "BIRTHDATE", ...).
         if not birth_date and "DATE" in line_upper and "BIRTH" in line_upper:
             value = _after_colon(line)
             if value is not None and _looks_like_date(value):
                 birth_date = value
                 print(f"DEBUG: Found birth date (same line): {birth_date}", file=sys.stderr)
                 continue
             for next_line in cleaned_lines[i + 1:i + 5]:
                 if _has_label(next_line.upper(), _DATE_SKIP_LABELS):
                     continue
                 if _looks_like_date(next_line):
                     birth_date = next_line
                     print(f"DEBUG: Found birth date: {birth_date}", file=sys.stderr)
                     break

         # LIT: NOTE(review) "LIT" is matched as a substring, so labels such as
         # "NATIONALITY" also trigger this branch - confirm that is intended.
         if (not lit and "LIT" in line_upper
                 and not ("ID" in line_upper and "NBI" in line_upper)):
             value = _after_colon(line)
             if value:
                 lit = value
                 print(f"DEBUG: Found LIT (same line): {lit}", file=sys.stderr)
                 continue
             for next_line in cleaned_lines[i + 1:i + 4]:
                 if _has_label(next_line.upper(), _LIT_SKIP_LABELS):
                     continue
                 if next_line:
                     lit = next_line
                     print(f"DEBUG: Found LIT: {lit}", file=sys.stderr)
                     break

     return {
         'clearance_type': 'nbi',
         'id_number': nbi_id,
         'full_name': full_name,
         'birth_date': birth_date,
         'lit': lit,
         'success': nbi_id is not None or full_name is not None
     }


 # Format examples: B450JRLR0B-RC248667, HGUR87H38D-U47204A873.
 # Two runs of 8-12 characters, so a match is always 17-25 characters long.
 _HYPHEN_ID_RE = re.compile(r'\b([A-Z0-9]{8,12}-[A-Z0-9]{8,12})\b')
 _SPACED_ID_RE = re.compile(r'\b([A-Z0-9]{8,12})\s+([A-Z0-9]{8,12})\b')
 _SOLID_ID_RE = re.compile(r'\b([A-Z0-9]{17,25})\b')
 _LETTERS_RE = re.compile(r'[A-Za-z]{2,}')
 _MONTH_RE = re.compile(
     r'(JANUARY|FEBRUARY|MARCH|APRIL|MAY|JUNE|JULY|AUGUST|SEPTEMBER|OCTOBER|'
     r'NOVEMBER|DECEMBER|JAN|FEB|MAR|APR|JUN|JUL|AUG|SEP|OCT|NOV|DEC)')
 _NUMERIC_DATE_RE = re.compile(r'\d{1,2}[/-]\d{1,2}[/-]\d{4}')
 _DAY_MON_YEAR_RE = re.compile(r'\d{1,2}\s+[A-Z]{3}\s+\d{4}')
 # "ID", "NO" and "ID NO" count as labels only when they are not embedded in a
 # longer alphabetic word; a plain substring test rejected names such as
 # "DAVID" / "MARIANO" and every date in "NOVEMBER".
 _SHORT_LABEL_RE = re.compile(r'(?<![A-Z])(?:ID\s*NO|ID|NO)(?![A-Z])')

 _ID_LABELS = ('NBI ID NO', 'NBIIDNO', 'NBI ID NUMBER', 'NBIID NUMBER')
 _ADDRESS_WORDS = ('STREET', 'ST', 'AVENUE', 'AVE', 'BRGY', 'BARANGAY', 'CITY',
                   'PHASE', 'DOMINGO', 'CAINTA', 'RIZAL')
 _SOLID_ID_EXCLUDED = _ADDRESS_WORDS + ('ADDRESS', 'ATRSTORUARPHASEABRGY')
 _NAME_SKIP_LABELS = ('NBI', 'DATE', 'BIRTH', 'CLEARANCE', 'REPUBLIC',
                      'PHILIPPINES', 'NATIONAL')
 _DATE_SKIP_LABELS = ('NBI', 'NAME', 'CLEARANCE', 'REPUBLIC', 'PHILIPPINES',
                      'NATIONAL')
 _LIT_SKIP_LABELS = ('NBI', 'NAME', 'DATE', 'BIRTH', 'CLEARANCE', 'REPUBLIC',
                     'PHILIPPINES', 'NATIONAL', 'VALID', 'UNTIL')


 def _after_colon(line):
     """Return the stripped text after the first colon, or None if there is no colon."""
     _, colon, tail = line.partition(':')
     return tail.strip() if colon else None


 def _has_label(text_upper, long_labels):
     """Tell whether upper-cased text contains a long label or a standalone ID/NO token."""
     if any(label in text_upper for label in long_labels):
         return True
     return _SHORT_LABEL_RE.search(text_upper) is not None


 def _mixes_letters_and_digits(candidate):
     """Tell whether the candidate contains at least one A-Z letter and one digit."""
     return (re.search(r'[A-Z]', candidate) is not None
             and re.search(r'[0-9]', candidate) is not None)


 def _looks_like_date(text):
     """Tell whether text contains a month name or a numeric / day-month-year date."""
     return bool(_MONTH_RE.search(text.upper())
                 or _NUMERIC_DATE_RE.search(text)
                 or _DAY_MON_YEAR_RE.search(text))


 def _find_hyphenated_id(cleaned_lines):
     """Return the first hyphenated ID found on a line of at most three words, else None."""
     for line in cleaned_lines:
         match = _HYPHEN_ID_RE.search(line)
         if not match:
             continue
         candidate = match.group(1)
         # Lines with many words are more likely addresses than a standalone ID.
         if len(line.split()) <= 3 and _mixes_letters_and_digits(candidate):
             print(f"DEBUG: Found NBI ID (first pass, hyphen): {candidate}", file=sys.stderr)
             return candidate
     return None


 def _find_id_fallback(cleaned_lines, i):
     """Try the label-based and OCR-degraded ID patterns on line ``i``; return the ID or None.

     The hyphenated pattern is not retried here: this helper only runs after
     the first pass failed on every line, and the same check cannot succeed now.
     """
     line = cleaned_lines[i]
     line_upper = line.upper()

     if any(label in line_upper for label in _ID_LABELS):
         value = _after_colon(line)
         if value is not None:
             compact = re.sub(r'\s+', '', value)
             if len(compact) > 5:  # anything shorter is not a plausible ID
                 print(f"DEBUG: Found NBI ID (same line): {compact}", file=sys.stderr)
                 return compact
         # The value may sit on one of the next two lines instead.
         for next_line in cleaned_lines[i + 1:i + 3]:
             if len(next_line) < 5 or any(
                     label in next_line.upper()
                     for label in ('NAME', 'DATE', 'BIRTH', 'CLEARANCE')):
                 continue
             compact = next_line.replace(' ', '')
             if re.match(r'^[A-Z0-9-]{15,25}$', compact):
                 print(f"DEBUG: Found NBI ID (next line): {compact}", file=sys.stderr)
                 return compact

     # OCR read the hyphen as whitespace.
     match = _SPACED_ID_RE.search(line)
     if match:
         candidate = "-".join(match.groups())
         if (_mixes_letters_and_digits(candidate)
                 and not any(word in candidate for word in _ADDRESS_WORDS)):
             print(f"DEBUG: Found NBI ID (space pattern): {candidate}", file=sys.stderr)
             return candidate

     # OCR dropped the separator entirely: split near the middle.
     match = _SOLID_ID_RE.search(line)
     if match:
         candidate = match.group(1)
         if (_mixes_letters_and_digits(candidate)
                 and not any(word in candidate for word in _SOLID_ID_EXCLUDED)):
             mid = len(candidate) // 2
             for split_point in range(mid - 2, mid + 3):
                 if (8 <= split_point <= 12
                         and 8 <= len(candidate) - split_point <= 12):
                     split_id = f"{candidate[:split_point]}-{candidate[split_point:]}"
                     print(f"DEBUG: Found NBI ID (no hyphen, split): {split_id}", file=sys.stderr)
                     return split_id
     return None
 def extract_ocr_lines_simple(image_path):
             use_textline_orientation=True,     # Enable text line orientation
             lang='en'                          # Set language to English
         )
+        try:
+            results = ocr.predict(image_path)
+        except Exception as e:
+            print(f"DEBUG: predict() failed: {e}, trying ocr()", file=sys.stderr)
+            if hasattr(ocr, 'ocr'):
+                results = ocr.ocr(image_path)
+            else:
+                results = None
     all_text = []
     try:
+        # Handle both old format (list) and new format (OCRResult object)
+        if results and isinstance(results, list) and len(results) > 0:
+            first_item = results[0]
+            item_type_name = type(first_item).__name__
+            is_ocr_result = 'OCRResult' in item_type_name or 'ocr_result' in str(type(first_item)).lower()
+            if is_ocr_result:
+                print(f"DEBUG: Detected OCRResult object format (type: {item_type_name})", file=sys.stderr)
+                # Access OCRResult as dictionary
+                try:
+                    if hasattr(first_item, 'keys'):
+                        ocr_dict = dict(first_item)
+                        # Look for rec_texts key
+                        if 'rec_texts' in ocr_dict:
+                            rec_texts = ocr_dict['rec_texts']
+                            if isinstance(rec_texts, list):
+                                all_text = [str(t) for t in rec_texts if t]
+                                print(f"DEBUG: Extracted {len(all_text)} text lines from rec_texts", file=sys.stderr)
+                except Exception as e:
+                    print(f"DEBUG: Error accessing OCRResult: {e}", file=sys.stderr)
+            else:
+                # Old format - list of lists
+                lines = results[0] if results and isinstance(results[0], list) else results
+                for item in lines:
+                    if isinstance(item, (list, tuple)) and len(item) >= 2:
+                        meta = item[1]
+                        if isinstance(meta, (list, tuple)) and len(meta) >= 1:
+                            all_text.append(str(meta[0]))
+    except Exception as e:
+        print(f"DEBUG: Error processing OCR results: {str(e)}", file=sys.stderr)
+    return extract_nbi_id(all_text) if all_text else {'clearance_type': 'nbi', 'id_number': None, 'full_name': None, 'birth_date': None, 'lit': None, 'success': False}
 def extract_ocr_lines(image_path):
     # Check if file exists and has content
     if not os.path.exists(image_path):
+        return {'clearance_type': 'nbi', 'id_number': None, 'full_name': None, 'birth_date': None, 'success': False}
     # Ensure output directory exists
     os.makedirs("output", exist_ok=True)
             use_textline_orientation=False,
             lang='en'
         )
+        try:
+            results = ocr.predict(image_path)
+        except Exception as e:
+            print(f"DEBUG: predict() failed: {e}, trying ocr()", file=sys.stderr)
+            if hasattr(ocr, 'ocr'):
+                results = ocr.ocr(image_path)
+            else:
+                results = None
+    # Process OCR results - handle both old format (list) and new format (OCRResult object)
     all_text = []
     try:
+        # Handle both old format (list) and new format (OCRResult object)
+        if results and isinstance(results, list) and len(results) > 0:
+            first_item = results[0]
+            item_type_name = type(first_item).__name__
+            is_ocr_result = 'OCRResult' in item_type_name or 'ocr_result' in str(type(first_item)).lower()
+            if is_ocr_result:
+                print(f"DEBUG: Detected OCRResult object format (type: {item_type_name})", file=sys.stderr)
+                # Access OCRResult as dictionary
+                try:
+                    if hasattr(first_item, 'keys'):
+                        ocr_dict = dict(first_item)
+                        # Look for rec_texts key
+                        if 'rec_texts' in ocr_dict:
+                            rec_texts = ocr_dict['rec_texts']
+                            if isinstance(rec_texts, list):
+                                all_text = [str(t) for t in rec_texts if t]
+                                print(f"DEBUG: Extracted {len(all_text)} text lines from rec_texts", file=sys.stderr)
+                except Exception as e:
+                    print(f"DEBUG: Error accessing OCRResult: {e}", file=sys.stderr)
+            else:
+                # Old format - list of lists
+                lines = results[0] if results and isinstance(results[0], list) else results
+                for item in lines:
+                    if isinstance(item, (list, tuple)) and len(item) >= 2:
+                        meta = item[1]
+                        if isinstance(meta, (list, tuple)) and len(meta) >= 1:
+                            all_text.append(str(meta[0]))
     except Exception as e:
         print(f"DEBUG: Error processing OCR results: {str(e)}", file=sys.stderr)
+        import traceback
+        print(f"DEBUG: Traceback: {traceback.format_exc()}", file=sys.stderr)
     print(f"DEBUG: Extracted text lines: {all_text}", file=sys.stderr)
+    return extract_nbi_id(all_text) if all_text else {'clearance_type': 'nbi', 'id_number': None, 'full_name': None, 'birth_date': None, 'lit': None, 'success': False}
 # Main
 if len(sys.argv) < 2:

extract_police_ocr.py CHANGED Viewed

@@ -197,39 +197,66 @@ def extract_police_details(lines):
         line_stripped = line.strip()
         # Extract Name - handle cases where NAME and value are on separate lines
         if "NAME" in line_upper and not details['full_name']:
             if ":" in line:
                 parts = line.split(':', 1)
                 if len(parts) > 1:
                     name_part = parts[1].strip()
-                    if name_part and len(name_part) > 2:
                         details['full_name'] = name_part
-            elif i + 1 < len(lines):
-                # Check next few lines for name value
-                for j in range(1, min(3, len(lines) - i)):
                     next_line = lines[i+j].strip()
                     if next_line.startswith(':') and len(next_line) > 1:
                         name_part = next_line[1:].strip()
-                        if name_part and len(name_part) > 2 and "ADDRESS" not in name_part.upper():
                             details['full_name'] = name_part
                             break
-                    elif not next_line.startswith(('ADDRESS', 'BIRTH', 'CITIZEN', 'GENDER', 'ID')) and len(next_line) > 2:
-                        if ":" not in next_line or (":" in next_line and next_line.index(':') < 3):
-                            name_part = next_line.replace(':', '').strip()
-                            if name_part and len(name_part) > 2:
-                                details['full_name'] = name_part
-                                break
         # Also check for name patterns that start with colon (OCR sometimes splits NAME label)
         if not details['full_name'] and line_stripped.startswith(':') and len(line_stripped) > 5:
             name_candidate = line_stripped[1:].strip()
-            # Check if it looks like a name (has commas, multiple words, etc.)
-            if ',' in name_candidate or (len(name_candidate.split()) >= 2 and name_candidate.isupper()):
                 # Make sure previous line wasn't ADDRESS or other label
                 if i > 0:
                     prev_line = lines[i-1].strip().upper()
-                    if "ADDRESS" not in prev_line and "BIRTH" not in prev_line:
                         details['full_name'] = name_candidate
         # Extract Address
         if "ADDRESS" in line_upper and not details['address']:
@@ -323,23 +350,51 @@ def extract_police_details(lines):
                         details['citizenship'] = parts[1].strip()
         # Extract Gender - handle cases where GENDER and value are on separate lines
         if "GENDER" in line_upper and not details['gender']:
             if ":" in line:
                 parts = line.split(':', 1)
                 if len(parts) > 1:
-                    details['gender'] = parts[1].strip()
-            elif i + 1 < len(lines):
-                next_line = lines[i+1].strip()
-                if next_line.startswith(':') and len(next_line) > 1:
-                    gender_part = next_line[1:].strip()
                     if gender_part in ['MALE', 'FEMALE', 'M', 'F']:
-                        details['gender'] = gender_part
-                elif ":" in next_line:
-                    parts = next_line.split(':', 1)
-                    if len(parts) > 1:
-                        gender_part = parts[1].strip()
                         if gender_part in ['MALE', 'FEMALE', 'M', 'F']:
-                            details['gender'] = gender_part
         # Extract ID Number (Usually "ID No.:" or near QR code)
         if "ID NO" in line_upper or "ID NO." in line_upper:

         line_stripped = line.strip()
         # Extract Name - handle cases where NAME and value are on separate lines
+        # Format: 'NAME' on one line, ':IRENE TIMBAL VILLAFUERTE' on next line
         if "NAME" in line_upper and not details['full_name']:
+            # First, check if name is on the same line after colon
             if ":" in line:
                 parts = line.split(':', 1)
                 if len(parts) > 1:
                     name_part = parts[1].strip()
+                    # Validate it's actually a name (not descriptive text)
+                    if name_part and len(name_part) > 2 and not any(word in name_part.upper() for word in ['THUMBMARK', 'APPEARING', 'HEREIN', 'HASUNDERGONE', 'RECORD', 'VERIFICATION', 'THROUGH', 'CRIME', 'DATABASES', 'RESULT']):
                         details['full_name'] = name_part
+                        print(f"DEBUG: Found full name (same line): {details['full_name']}", file=sys.stderr)
+                        continue
+            # Check next few lines for name value (prioritize lines starting with colon)
+            if i + 1 < len(lines):
+                for j in range(1, min(5, len(lines) - i)):
                     next_line = lines[i+j].strip()
+                    next_upper = next_line.upper()
+                    # Skip if it's clearly a label or descriptive text
+                    if any(word in next_upper for word in ['ADDRESS', 'BIRTH', 'CITIZEN', 'GENDER', 'ID', 'THUMBMARK', 'APPEARING', 'HEREIN', 'HASUNDERGONE', 'RECORD', 'VERIFICATION', 'THROUGH', 'CRIME', 'DATABASES', 'RESULT', 'CERTIFY', 'PERSON', 'WHOSE', 'PHOTO', 'SIGNATURE']):
+                        continue
+                    # Priority: Line starting with colon (most reliable format)
                     if next_line.startswith(':') and len(next_line) > 1:
                         name_part = next_line[1:].strip()
+                        # Validate it looks like a name (has letters, reasonable length, not descriptive text)
+                        if (name_part and len(name_part) > 3 and
+                            re.search(r'[A-Za-z]{2,}', name_part) and
+                            not any(word in name_part.upper() for word in ['THUMBMARK', 'APPEARING', 'HEREIN', 'HASUNDERGONE', 'RECORD', 'VERIFICATION'])):
                             details['full_name'] = name_part
+                            print(f"DEBUG: Found full name (colon line): {details['full_name']}", file=sys.stderr)
+                            break
+                    # Fallback: Line that looks like a name (all caps, multiple words, reasonable length)
+                    elif (re.match(r'^[A-Z\s,]+$', next_line) and
+                          len(next_line.split()) >= 2 and
+                          len(next_line) > 5 and
+                          len(next_line) < 50):  # Names are usually not too long
+                        # Make sure it's not descriptive text
+                        if not any(word in next_upper for word in ['THUMBMARK', 'APPEARING', 'HEREIN', 'HASUNDERGONE', 'RECORD', 'VERIFICATION', 'THROUGH', 'CRIME']):
+                            details['full_name'] = next_line
+                            print(f"DEBUG: Found full name (all caps line): {details['full_name']}", file=sys.stderr)
                             break
         # Also check for name patterns that start with colon (OCR sometimes splits NAME label)
+        # But only if we haven't found a name yet
         if not details['full_name'] and line_stripped.startswith(':') and len(line_stripped) > 5:
             name_candidate = line_stripped[1:].strip()
+            # Check if it looks like a name (has letters, reasonable length, not descriptive text)
+            if (re.search(r'[A-Za-z]{2,}', name_candidate) and
+                len(name_candidate) > 3 and
+                len(name_candidate) < 50 and
+                not any(word in name_candidate.upper() for word in ['THUMBMARK', 'APPEARING', 'HEREIN', 'HASUNDERGONE', 'RECORD', 'VERIFICATION', 'THROUGH', 'CRIME', 'ADDRESS', 'BIRTH'])):
                 # Make sure previous line wasn't ADDRESS or other label
                 if i > 0:
                     prev_line = lines[i-1].strip().upper()
+                    if "ADDRESS" not in prev_line and "BIRTH" not in prev_line and "CITIZEN" not in prev_line:
                         details['full_name'] = name_candidate
+                        print(f"DEBUG: Found full name (colon pattern): {details['full_name']}", file=sys.stderr)
         # Extract Address
         if "ADDRESS" in line_upper and not details['address']:
                         details['citizenship'] = parts[1].strip()
         # Extract Gender - handle cases where GENDER and value are on separate lines
+        # Format: 'GENDER' on one line, 'FEMALE' or 'MALE' on next line
         if "GENDER" in line_upper and not details['gender']:
+            # First, check if gender is on the same line after colon
             if ":" in line:
                 parts = line.split(':', 1)
                 if len(parts) > 1:
+                    gender_part = parts[1].strip().upper()
                     if gender_part in ['MALE', 'FEMALE', 'M', 'F']:
+                        details['gender'] = gender_part.capitalize() if len(gender_part) > 1 else gender_part
+                        print(f"DEBUG: Found gender (same line): {details['gender']}", file=sys.stderr)
+                        continue
+            # Check next few lines for gender value
+            if i + 1 < len(lines):
+                for j in range(1, min(4, len(lines) - i)):
+                    next_line = lines[i+j].strip()
+                    next_upper = next_line.upper()
+                    # Skip if it's clearly a label
+                    if any(label in next_upper for label in ['NAME', 'ADDRESS', 'BIRTH', 'CITIZEN', 'DATE', 'PLACE', 'PICTURE', 'SIGNATURE', 'THUMBMARK']):
+                        continue
+                    # Check if line starts with colon
+                    if next_line.startswith(':') and len(next_line) > 1:
+                        gender_part = next_line[1:].strip().upper()
                         if gender_part in ['MALE', 'FEMALE', 'M', 'F']:
+                            details['gender'] = gender_part.capitalize() if len(gender_part) > 1 else gender_part
+                            print(f"DEBUG: Found gender (colon line): {details['gender']}", file=sys.stderr)
+                            break
+                    # Check if the line itself is the gender value
+                    elif next_upper in ['MALE', 'FEMALE', 'M', 'F']:
+                        details['gender'] = next_line.capitalize() if len(next_line) > 1 else next_line
+                        print(f"DEBUG: Found gender (direct): {details['gender']}", file=sys.stderr)
+                        break
+                    # Check if line contains colon with gender value
+                    elif ":" in next_line:
+                        parts = next_line.split(':', 1)
+                        if len(parts) > 1:
+                            gender_part = parts[1].strip().upper()
+                            if gender_part in ['MALE', 'FEMALE', 'M', 'F']:
+                                details['gender'] = gender_part.capitalize() if len(gender_part) > 1 else gender_part
+                                print(f"DEBUG: Found gender (colon in line): {details['gender']}", file=sys.stderr)
+                                break
         # Extract ID Number (Usually "ID No.:" or near QR code)
         if "ID NO" in line_upper or "ID NO." in line_upper: