Spaces:
Sleeping
Sleeping
Upload 2 files
Browse filesnbi and police ocr
- extract_nbi_ocr.py +305 -50
- extract_police_ocr.py +80 -25
extract_nbi_ocr.py
CHANGED
|
@@ -48,43 +48,238 @@ def download_image(url, output_path='temp_image.jpg'):
|
|
| 48 |
|
| 49 |
return output_path
|
| 50 |
|
| 51 |
-
# OCR Function to extract NBI ID NO
|
| 52 |
def extract_nbi_id(lines):
|
| 53 |
nbi_id = None
|
|
|
|
|
|
|
|
|
|
| 54 |
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 59 |
# Extract the ID after the colon
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 81 |
|
| 82 |
return {
|
| 83 |
'clearance_type': 'nbi',
|
| 84 |
'id_number': nbi_id,
|
| 85 |
-
'full_name':
|
| 86 |
-
'birth_date':
|
| 87 |
-
'
|
|
|
|
| 88 |
}
|
| 89 |
|
| 90 |
def extract_ocr_lines_simple(image_path):
|
|
@@ -97,25 +292,54 @@ def extract_ocr_lines_simple(image_path):
|
|
| 97 |
use_textline_orientation=True, # Enable text line orientation
|
| 98 |
lang='en' # Set language to English
|
| 99 |
)
|
| 100 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
|
| 102 |
all_text = []
|
| 103 |
try:
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 114 |
|
| 115 |
def extract_ocr_lines(image_path):
|
| 116 |
# Check if file exists and has content
|
| 117 |
if not os.path.exists(image_path):
|
| 118 |
-
return {'id_number': None, 'success': False}
|
| 119 |
|
| 120 |
# Ensure output directory exists
|
| 121 |
os.makedirs("output", exist_ok=True)
|
|
@@ -131,22 +355,53 @@ def extract_ocr_lines(image_path):
|
|
| 131 |
use_textline_orientation=False,
|
| 132 |
lang='en'
|
| 133 |
)
|
| 134 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 135 |
|
| 136 |
-
# Process OCR results
|
| 137 |
all_text = []
|
| 138 |
try:
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 145 |
except Exception as e:
|
| 146 |
print(f"DEBUG: Error processing OCR results: {str(e)}", file=sys.stderr)
|
|
|
|
|
|
|
| 147 |
|
| 148 |
print(f"DEBUG: Extracted text lines: {all_text}", file=sys.stderr)
|
| 149 |
-
return extract_nbi_id(all_text) if all_text else {'clearance_type': 'nbi', 'id_number': None, 'full_name': None, 'birth_date': None, 'success': False}
|
| 150 |
|
| 151 |
# Main
|
| 152 |
if len(sys.argv) < 2:
|
|
|
|
| 48 |
|
| 49 |
return output_path
|
| 50 |
|
| 51 |
+
# OCR Function to extract NBI ID NO, Name, Birth Date, and LIT
|
| 52 |
# Labels that introduce the NBI ID number. Matched as substrings so that
# OCR-fused variants (e.g. "NBIIDNO") are still recognised.
_NBI_ID_LABELS = ('NBI ID NO', 'NBIIDNO', 'NBI ID NUMBER', 'NBIID NUMBER')

# Substrings that mark an ID candidate as address text rather than an ID.
_NBI_ADDRESS_WORDS = (
    'STREET', 'ST', 'AVENUE', 'AVE', 'BRGY', 'BARANGAY', 'CITY', 'PHASE',
    'DOMINGO', 'CAINTA', 'RIZAL',
)
_NBI_ADDRESS_WORDS_UNSPLIT = _NBI_ADDRESS_WORDS + ('ADDRESS', 'ATRSTORUARPHASEABRGY')

# Long label fragments: substring matching is safe for these.
_NBI_NAME_SKIP_LABELS = (
    'NBI', 'DATE', 'BIRTH', 'CLEARANCE', 'REPUBLIC', 'PHILIPPINES', 'NATIONAL',
)
_NBI_DATE_SKIP_LABELS = (
    'NBI', 'NAME', 'CLEARANCE', 'REPUBLIC', 'PHILIPPINES', 'NATIONAL',
)
_NBI_LIT_SKIP_LABELS = (
    'NBI', 'NAME', 'DATE', 'BIRTH', 'CLEARANCE', 'REPUBLIC', 'PHILIPPINES',
    'NATIONAL', 'VALID', 'UNTIL',
)

# Short label tokens must match as whole words: as substrings, "ID" and "NO"
# also hit ordinary values such as "DAVID", "NOEL" or "NOVEMBER".
_NBI_SHORT_LABEL_RE = re.compile(r'\b(?:ID|NO|IDNO)\b')

# ID shapes, e.g. B450JRLR0B-RC248667 or HGUR87H38D-U47204A873.
_NBI_HYPHEN_ID_RE = re.compile(r'\b([A-Z0-9]{8,12}-[A-Z0-9]{8,12})\b')
_NBI_SPACED_ID_RE = re.compile(r'\b([A-Z0-9]{8,12})\s+([A-Z0-9]{8,12})\b')
_NBI_UNSPLIT_ID_RE = re.compile(r'\b([A-Z0-9]{17,25})\b')
_NBI_BARE_ID_LINE_RE = re.compile(r'^[A-Z0-9-]{15,25}$')

# Month names must not be embedded in a longer word ("MARIKINA", "AUGUSTO"),
# but may touch digits because OCR often drops spaces ("JANUARY15").
_NBI_MONTH_RE = re.compile(
    r'(?<![A-Z])(?:JANUARY|FEBRUARY|MARCH|APRIL|MAY|JUNE|JULY|AUGUST|SEPTEMBER|'
    r'OCTOBER|NOVEMBER|DECEMBER|JAN|FEB|MAR|APR|JUN|JUL|AUG|SEPT|SEP|OCT|NOV|DEC)'
    r'(?![A-Z])'
)
_NBI_NUMERIC_DATE_RE = re.compile(r'\d{1,2}[/-]\d{1,2}[/-]\d{4}')
_NBI_DAY_MON_YEAR_RE = re.compile(r'\d{1,2}\s+[A-Z]{3}\s+\d{4}')
_NBI_WORDLIKE_RE = re.compile(r'[A-Za-z]{2,}')


def _nbi_value_after_colon(line):
    """Return the stripped text after the first ':' in *line*, or None if there is no colon."""
    _, colon, tail = line.partition(':')
    return tail.strip() if colon else None


def _nbi_is_label_line(text_upper, long_labels):
    """Return True if the upper-cased text looks like a field label rather than a value."""
    if _NBI_SHORT_LABEL_RE.search(text_upper):
        return True
    return any(label in text_upper for label in long_labels)


def _nbi_looks_like_date(text):
    """Return True if *text* contains a month name or a numeric/day-month-year date."""
    return bool(
        _NBI_MONTH_RE.search(text.upper())
        or _NBI_NUMERIC_DATE_RE.search(text)
        or _NBI_DAY_MON_YEAR_RE.search(text)
    )


def _nbi_is_plausible_id(candidate, banned_words):
    """Return True if *candidate* mixes A-Z letters and digits and has no banned substring."""
    if not (re.search(r'[A-Z]', candidate) and re.search(r'[0-9]', candidate)):
        return False
    candidate_upper = candidate.upper()
    return not any(word in candidate_upper for word in banned_words)


def _nbi_first_standalone_hyphen_id(cleaned_lines):
    """Return the first hyphenated ID that sits on a line of at most three words, else None."""
    for line in cleaned_lines:
        match = _NBI_HYPHEN_ID_RE.search(line)
        if not match:
            continue
        candidate = match.group(1)
        # The pattern already bounds the length to 17-25 characters.
        if len(line.split()) <= 3 and _nbi_is_plausible_id(candidate, ()):
            print(f"DEBUG: Found NBI ID (first pass, hyphen): {candidate}", file=sys.stderr)
            return candidate
    return None


def _nbi_id_from_line(cleaned_lines, i):
    """Try to read an NBI ID from line *i* (or just after an ID label there); None if absent."""
    line = cleaned_lines[i]
    line_upper = line.upper()

    if any(label in line_upper for label in _NBI_ID_LABELS):
        # Value on the same line, after the colon.
        value = _nbi_value_after_colon(line)
        if value is not None:
            compact = re.sub(r'\s+', '', value)
            if len(compact) > 5:
                print(f"DEBUG: Found NBI ID (same line): {compact}", file=sys.stderr)
                return compact

        # Value on one of the next two lines.
        for next_line in cleaned_lines[i + 1:i + 3]:
            if len(next_line) < 5 or any(
                label in next_line.upper() for label in ('NAME', 'DATE', 'BIRTH', 'CLEARANCE')
            ):
                continue
            compact = next_line.replace(' ', '')
            if _NBI_BARE_ID_LINE_RE.match(compact):
                print(f"DEBUG: Found NBI ID (next line): {compact}", file=sys.stderr)
                return compact

    # The hyphenated form is not retried here: the first pass over all lines
    # already accepts every line this check would accept.

    # Two halves separated by whitespace instead of a hyphen.
    match = _NBI_SPACED_ID_RE.search(line)
    if match:
        candidate = '-'.join(match.groups())
        if _nbi_is_plausible_id(candidate, _NBI_ADDRESS_WORDS):
            print(f"DEBUG: Found NBI ID (space pattern): {candidate}", file=sys.stderr)
            return candidate

    # One unbroken run: split it near the middle into two 8-12 character halves.
    match = _NBI_UNSPLIT_ID_RE.search(line)
    if match:
        candidate = match.group(1)
        if _nbi_is_plausible_id(candidate, _NBI_ADDRESS_WORDS_UNSPLIT):
            mid = len(candidate) // 2
            for split_point in range(mid - 2, mid + 3):
                if not 8 <= split_point <= len(candidate) - 8:
                    continue
                part1, part2 = candidate[:split_point], candidate[split_point:]
                if 8 <= len(part1) <= 12 and 8 <= len(part2) <= 12:
                    found = f"{part1}-{part2}"
                    print(f"DEBUG: Found NBI ID (no hyphen, split): {found}", file=sys.stderr)
                    return found
    return None


def extract_nbi_id(lines):
    """Extract the NBI ID, full name, birth date and LIT field from OCR text lines.

    Args:
        lines: Iterable of OCR text lines (each item is converted with ``str``).
            ``None`` is treated as an empty list.

    Returns:
        dict with keys ``clearance_type`` (always ``'nbi'``), ``id_number``,
        ``full_name``, ``birth_date``, ``lit`` (each a string or ``None``) and
        ``success`` (True when an ID number or a full name was found).

    Debug traces are written to stderr.
    """
    cleaned_lines = [str(line).strip() for line in (lines or [])]

    # First pass: a standalone hyphenated ID anywhere in the text wins.
    nbi_id = _nbi_first_standalone_hyphen_id(cleaned_lines)
    full_name = None
    birth_date = None
    lit = None  # NOTE(review): meaning of "LIT" is not established here - confirm

    # Second pass: labelled fields, plus fallback ID shapes if still missing.
    for i, line in enumerate(cleaned_lines):
        line_upper = line.upper()

        if not nbi_id:
            nbi_id = _nbi_id_from_line(cleaned_lines, i)
            if nbi_id:
                continue

        # Full name: value after "NAME", on the same line or within four lines.
        if (not full_name and "NAME" in line_upper
                and ("NBI" not in line_upper or "ID" not in line_upper)):
            name_part = _nbi_value_after_colon(line)
            if (name_part is not None and len(name_part) > 2
                    and _NBI_WORDLIKE_RE.search(name_part)):
                full_name = name_part
                print(f"DEBUG: Found full name (same line): {full_name}", file=sys.stderr)
                continue

            for next_line in cleaned_lines[i + 1:i + 5]:
                if _nbi_is_label_line(next_line.upper(), _NBI_NAME_SKIP_LABELS):
                    continue
                if len(next_line) > 3 and _NBI_WORDLIKE_RE.search(next_line):
                    full_name = next_line
                    print(f"DEBUG: Found full name: {full_name}", file=sys.stderr)
                    break

        # Birth date: value after a birth-date label, same line or within four lines.
        if not birth_date and (
                "DATE OF BIRTH" in line_upper or "BIRTH DATE" in line_upper
                or "BIRTHDATE" in line_upper
                or ("DATE" in line_upper and "BIRTH" in line_upper)):
            date_part = _nbi_value_after_colon(line)
            if date_part is not None and _nbi_looks_like_date(date_part):
                birth_date = date_part
                print(f"DEBUG: Found birth date (same line): {birth_date}", file=sys.stderr)
                continue

            for next_line in cleaned_lines[i + 1:i + 5]:
                if _nbi_is_label_line(next_line.upper(), _NBI_DATE_SKIP_LABELS):
                    continue
                if _nbi_looks_like_date(next_line):
                    birth_date = next_line
                    print(f"DEBUG: Found birth date: {birth_date}", file=sys.stderr)
                    break

        # LIT: "LIT" is deliberately a substring test (it may be part of a
        # longer label); value on the same line or within three lines.
        if (not lit and "LIT" in line_upper
                and ("ID" not in line_upper or "NBI" not in line_upper)):
            lit_part = _nbi_value_after_colon(line)
            if lit_part:
                lit = lit_part
                print(f"DEBUG: Found LIT (same line): {lit}", file=sys.stderr)
                continue

            for next_line in cleaned_lines[i + 1:i + 4]:
                if _nbi_is_label_line(next_line.upper(), _NBI_LIT_SKIP_LABELS):
                    continue
                if next_line:
                    lit = next_line
                    print(f"DEBUG: Found LIT: {lit}", file=sys.stderr)
                    break

    return {
        'clearance_type': 'nbi',
        'id_number': nbi_id,
        'full_name': full_name,
        'birth_date': birth_date,
        'lit': lit,
        'success': nbi_id is not None or full_name is not None
    }
|
| 284 |
|
| 285 |
def extract_ocr_lines_simple(image_path):
|
|
|
|
| 292 |
use_textline_orientation=True, # Enable text line orientation
|
| 293 |
lang='en' # Set language to English
|
| 294 |
)
|
| 295 |
+
try:
|
| 296 |
+
results = ocr.predict(image_path)
|
| 297 |
+
except Exception as e:
|
| 298 |
+
print(f"DEBUG: predict() failed: {e}, trying ocr()", file=sys.stderr)
|
| 299 |
+
if hasattr(ocr, 'ocr'):
|
| 300 |
+
results = ocr.ocr(image_path)
|
| 301 |
+
else:
|
| 302 |
+
results = None
|
| 303 |
|
| 304 |
all_text = []
|
| 305 |
try:
|
| 306 |
+
# Handle both old format (list) and new format (OCRResult object)
|
| 307 |
+
if results and isinstance(results, list) and len(results) > 0:
|
| 308 |
+
first_item = results[0]
|
| 309 |
+
item_type_name = type(first_item).__name__
|
| 310 |
+
is_ocr_result = 'OCRResult' in item_type_name or 'ocr_result' in str(type(first_item)).lower()
|
| 311 |
+
|
| 312 |
+
if is_ocr_result:
|
| 313 |
+
print(f"DEBUG: Detected OCRResult object format (type: {item_type_name})", file=sys.stderr)
|
| 314 |
+
# Access OCRResult as dictionary
|
| 315 |
+
try:
|
| 316 |
+
if hasattr(first_item, 'keys'):
|
| 317 |
+
ocr_dict = dict(first_item)
|
| 318 |
+
# Look for rec_texts key
|
| 319 |
+
if 'rec_texts' in ocr_dict:
|
| 320 |
+
rec_texts = ocr_dict['rec_texts']
|
| 321 |
+
if isinstance(rec_texts, list):
|
| 322 |
+
all_text = [str(t) for t in rec_texts if t]
|
| 323 |
+
print(f"DEBUG: Extracted {len(all_text)} text lines from rec_texts", file=sys.stderr)
|
| 324 |
+
except Exception as e:
|
| 325 |
+
print(f"DEBUG: Error accessing OCRResult: {e}", file=sys.stderr)
|
| 326 |
+
else:
|
| 327 |
+
# Old format - list of lists
|
| 328 |
+
lines = results[0] if results and isinstance(results[0], list) else results
|
| 329 |
+
for item in lines:
|
| 330 |
+
if isinstance(item, (list, tuple)) and len(item) >= 2:
|
| 331 |
+
meta = item[1]
|
| 332 |
+
if isinstance(meta, (list, tuple)) and len(meta) >= 1:
|
| 333 |
+
all_text.append(str(meta[0]))
|
| 334 |
+
except Exception as e:
|
| 335 |
+
print(f"DEBUG: Error processing OCR results: {str(e)}", file=sys.stderr)
|
| 336 |
+
|
| 337 |
+
return extract_nbi_id(all_text) if all_text else {'clearance_type': 'nbi', 'id_number': None, 'full_name': None, 'birth_date': None, 'lit': None, 'success': False}
|
| 338 |
|
| 339 |
def extract_ocr_lines(image_path):
|
| 340 |
# Check if file exists and has content
|
| 341 |
if not os.path.exists(image_path):
|
| 342 |
+
return {'clearance_type': 'nbi', 'id_number': None, 'full_name': None, 'birth_date': None, 'success': False}
|
| 343 |
|
| 344 |
# Ensure output directory exists
|
| 345 |
os.makedirs("output", exist_ok=True)
|
|
|
|
| 355 |
use_textline_orientation=False,
|
| 356 |
lang='en'
|
| 357 |
)
|
| 358 |
+
try:
|
| 359 |
+
results = ocr.predict(image_path)
|
| 360 |
+
except Exception as e:
|
| 361 |
+
print(f"DEBUG: predict() failed: {e}, trying ocr()", file=sys.stderr)
|
| 362 |
+
if hasattr(ocr, 'ocr'):
|
| 363 |
+
results = ocr.ocr(image_path)
|
| 364 |
+
else:
|
| 365 |
+
results = None
|
| 366 |
|
| 367 |
+
# Process OCR results - handle both old format (list) and new format (OCRResult object)
|
| 368 |
all_text = []
|
| 369 |
try:
|
| 370 |
+
# Handle both old format (list) and new format (OCRResult object)
|
| 371 |
+
if results and isinstance(results, list) and len(results) > 0:
|
| 372 |
+
first_item = results[0]
|
| 373 |
+
item_type_name = type(first_item).__name__
|
| 374 |
+
is_ocr_result = 'OCRResult' in item_type_name or 'ocr_result' in str(type(first_item)).lower()
|
| 375 |
+
|
| 376 |
+
if is_ocr_result:
|
| 377 |
+
print(f"DEBUG: Detected OCRResult object format (type: {item_type_name})", file=sys.stderr)
|
| 378 |
+
# Access OCRResult as dictionary
|
| 379 |
+
try:
|
| 380 |
+
if hasattr(first_item, 'keys'):
|
| 381 |
+
ocr_dict = dict(first_item)
|
| 382 |
+
# Look for rec_texts key
|
| 383 |
+
if 'rec_texts' in ocr_dict:
|
| 384 |
+
rec_texts = ocr_dict['rec_texts']
|
| 385 |
+
if isinstance(rec_texts, list):
|
| 386 |
+
all_text = [str(t) for t in rec_texts if t]
|
| 387 |
+
print(f"DEBUG: Extracted {len(all_text)} text lines from rec_texts", file=sys.stderr)
|
| 388 |
+
except Exception as e:
|
| 389 |
+
print(f"DEBUG: Error accessing OCRResult: {e}", file=sys.stderr)
|
| 390 |
+
else:
|
| 391 |
+
# Old format - list of lists
|
| 392 |
+
lines = results[0] if results and isinstance(results[0], list) else results
|
| 393 |
+
for item in lines:
|
| 394 |
+
if isinstance(item, (list, tuple)) and len(item) >= 2:
|
| 395 |
+
meta = item[1]
|
| 396 |
+
if isinstance(meta, (list, tuple)) and len(meta) >= 1:
|
| 397 |
+
all_text.append(str(meta[0]))
|
| 398 |
except Exception as e:
|
| 399 |
print(f"DEBUG: Error processing OCR results: {str(e)}", file=sys.stderr)
|
| 400 |
+
import traceback
|
| 401 |
+
print(f"DEBUG: Traceback: {traceback.format_exc()}", file=sys.stderr)
|
| 402 |
|
| 403 |
print(f"DEBUG: Extracted text lines: {all_text}", file=sys.stderr)
|
| 404 |
+
return extract_nbi_id(all_text) if all_text else {'clearance_type': 'nbi', 'id_number': None, 'full_name': None, 'birth_date': None, 'lit': None, 'success': False}
|
| 405 |
|
| 406 |
# Main
|
| 407 |
if len(sys.argv) < 2:
|
extract_police_ocr.py
CHANGED
|
@@ -197,39 +197,66 @@ def extract_police_details(lines):
|
|
| 197 |
line_stripped = line.strip()
|
| 198 |
|
| 199 |
# Extract Name - handle cases where NAME and value are on separate lines
|
|
|
|
| 200 |
if "NAME" in line_upper and not details['full_name']:
|
|
|
|
| 201 |
if ":" in line:
|
| 202 |
parts = line.split(':', 1)
|
| 203 |
if len(parts) > 1:
|
| 204 |
name_part = parts[1].strip()
|
| 205 |
-
|
|
|
|
| 206 |
details['full_name'] = name_part
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
|
|
|
|
|
|
|
|
|
| 210 |
next_line = lines[i+j].strip()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 211 |
if next_line.startswith(':') and len(next_line) > 1:
|
| 212 |
name_part = next_line[1:].strip()
|
| 213 |
-
|
|
|
|
|
|
|
|
|
|
| 214 |
details['full_name'] = name_part
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 215 |
break
|
| 216 |
-
elif not next_line.startswith(('ADDRESS', 'BIRTH', 'CITIZEN', 'GENDER', 'ID')) and len(next_line) > 2:
|
| 217 |
-
if ":" not in next_line or (":" in next_line and next_line.index(':') < 3):
|
| 218 |
-
name_part = next_line.replace(':', '').strip()
|
| 219 |
-
if name_part and len(name_part) > 2:
|
| 220 |
-
details['full_name'] = name_part
|
| 221 |
-
break
|
| 222 |
|
| 223 |
# Also check for name patterns that start with colon (OCR sometimes splits NAME label)
|
|
|
|
| 224 |
if not details['full_name'] and line_stripped.startswith(':') and len(line_stripped) > 5:
|
| 225 |
name_candidate = line_stripped[1:].strip()
|
| 226 |
-
# Check if it looks like a name (has
|
| 227 |
-
if ','
|
|
|
|
|
|
|
|
|
|
| 228 |
# Make sure previous line wasn't ADDRESS or other label
|
| 229 |
if i > 0:
|
| 230 |
prev_line = lines[i-1].strip().upper()
|
| 231 |
-
if "ADDRESS" not in prev_line and "BIRTH" not in prev_line:
|
| 232 |
details['full_name'] = name_candidate
|
|
|
|
| 233 |
|
| 234 |
# Extract Address
|
| 235 |
if "ADDRESS" in line_upper and not details['address']:
|
|
@@ -323,23 +350,51 @@ def extract_police_details(lines):
|
|
| 323 |
details['citizenship'] = parts[1].strip()
|
| 324 |
|
| 325 |
# Extract Gender - handle cases where GENDER and value are on separate lines
|
|
|
|
| 326 |
if "GENDER" in line_upper and not details['gender']:
|
|
|
|
| 327 |
if ":" in line:
|
| 328 |
parts = line.split(':', 1)
|
| 329 |
if len(parts) > 1:
|
| 330 |
-
|
| 331 |
-
elif i + 1 < len(lines):
|
| 332 |
-
next_line = lines[i+1].strip()
|
| 333 |
-
if next_line.startswith(':') and len(next_line) > 1:
|
| 334 |
-
gender_part = next_line[1:].strip()
|
| 335 |
if gender_part in ['MALE', 'FEMALE', 'M', 'F']:
|
| 336 |
-
details['gender'] = gender_part
|
| 337 |
-
|
| 338 |
-
|
| 339 |
-
|
| 340 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 341 |
if gender_part in ['MALE', 'FEMALE', 'M', 'F']:
|
| 342 |
-
details['gender'] = gender_part
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 343 |
|
| 344 |
# Extract ID Number (Usually "ID No.:" or near QR code)
|
| 345 |
if "ID NO" in line_upper or "ID NO." in line_upper:
|
|
|
|
| 197 |
line_stripped = line.strip()
|
| 198 |
|
| 199 |
# Extract Name - handle cases where NAME and value are on separate lines
|
| 200 |
+
# Format: 'NAME' on one line, ':IRENE TIMBAL VILLAFUERTE' on next line
|
| 201 |
if "NAME" in line_upper and not details['full_name']:
|
| 202 |
+
# First, check if name is on the same line after colon
|
| 203 |
if ":" in line:
|
| 204 |
parts = line.split(':', 1)
|
| 205 |
if len(parts) > 1:
|
| 206 |
name_part = parts[1].strip()
|
| 207 |
+
# Validate it's actually a name (not descriptive text)
|
| 208 |
+
if name_part and len(name_part) > 2 and not any(word in name_part.upper() for word in ['THUMBMARK', 'APPEARING', 'HEREIN', 'HASUNDERGONE', 'RECORD', 'VERIFICATION', 'THROUGH', 'CRIME', 'DATABASES', 'RESULT']):
|
| 209 |
details['full_name'] = name_part
|
| 210 |
+
print(f"DEBUG: Found full name (same line): {details['full_name']}", file=sys.stderr)
|
| 211 |
+
continue
|
| 212 |
+
|
| 213 |
+
# Check next few lines for name value (prioritize lines starting with colon)
|
| 214 |
+
if i + 1 < len(lines):
|
| 215 |
+
for j in range(1, min(5, len(lines) - i)):
|
| 216 |
next_line = lines[i+j].strip()
|
| 217 |
+
next_upper = next_line.upper()
|
| 218 |
+
|
| 219 |
+
# Skip if it's clearly a label or descriptive text
|
| 220 |
+
if any(word in next_upper for word in ['ADDRESS', 'BIRTH', 'CITIZEN', 'GENDER', 'ID', 'THUMBMARK', 'APPEARING', 'HEREIN', 'HASUNDERGONE', 'RECORD', 'VERIFICATION', 'THROUGH', 'CRIME', 'DATABASES', 'RESULT', 'CERTIFY', 'PERSON', 'WHOSE', 'PHOTO', 'SIGNATURE']):
|
| 221 |
+
continue
|
| 222 |
+
|
| 223 |
+
# Priority: Line starting with colon (most reliable format)
|
| 224 |
if next_line.startswith(':') and len(next_line) > 1:
|
| 225 |
name_part = next_line[1:].strip()
|
| 226 |
+
# Validate it looks like a name (has letters, reasonable length, not descriptive text)
|
| 227 |
+
if (name_part and len(name_part) > 3 and
|
| 228 |
+
re.search(r'[A-Za-z]{2,}', name_part) and
|
| 229 |
+
not any(word in name_part.upper() for word in ['THUMBMARK', 'APPEARING', 'HEREIN', 'HASUNDERGONE', 'RECORD', 'VERIFICATION'])):
|
| 230 |
details['full_name'] = name_part
|
| 231 |
+
print(f"DEBUG: Found full name (colon line): {details['full_name']}", file=sys.stderr)
|
| 232 |
+
break
|
| 233 |
+
|
| 234 |
+
# Fallback: Line that looks like a name (all caps, multiple words, reasonable length)
|
| 235 |
+
elif (re.match(r'^[A-Z\s,]+$', next_line) and
|
| 236 |
+
len(next_line.split()) >= 2 and
|
| 237 |
+
len(next_line) > 5 and
|
| 238 |
+
len(next_line) < 50): # Names are usually not too long
|
| 239 |
+
# Make sure it's not descriptive text
|
| 240 |
+
if not any(word in next_upper for word in ['THUMBMARK', 'APPEARING', 'HEREIN', 'HASUNDERGONE', 'RECORD', 'VERIFICATION', 'THROUGH', 'CRIME']):
|
| 241 |
+
details['full_name'] = next_line
|
| 242 |
+
print(f"DEBUG: Found full name (all caps line): {details['full_name']}", file=sys.stderr)
|
| 243 |
break
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 244 |
|
| 245 |
# Also check for name patterns that start with colon (OCR sometimes splits NAME label)
|
| 246 |
+
# But only if we haven't found a name yet
|
| 247 |
if not details['full_name'] and line_stripped.startswith(':') and len(line_stripped) > 5:
|
| 248 |
name_candidate = line_stripped[1:].strip()
|
| 249 |
+
# Check if it looks like a name (has letters, reasonable length, not descriptive text)
|
| 250 |
+
if (re.search(r'[A-Za-z]{2,}', name_candidate) and
|
| 251 |
+
len(name_candidate) > 3 and
|
| 252 |
+
len(name_candidate) < 50 and
|
| 253 |
+
not any(word in name_candidate.upper() for word in ['THUMBMARK', 'APPEARING', 'HEREIN', 'HASUNDERGONE', 'RECORD', 'VERIFICATION', 'THROUGH', 'CRIME', 'ADDRESS', 'BIRTH'])):
|
| 254 |
# Make sure previous line wasn't ADDRESS or other label
|
| 255 |
if i > 0:
|
| 256 |
prev_line = lines[i-1].strip().upper()
|
| 257 |
+
if "ADDRESS" not in prev_line and "BIRTH" not in prev_line and "CITIZEN" not in prev_line:
|
| 258 |
details['full_name'] = name_candidate
|
| 259 |
+
print(f"DEBUG: Found full name (colon pattern): {details['full_name']}", file=sys.stderr)
|
| 260 |
|
| 261 |
# Extract Address
|
| 262 |
if "ADDRESS" in line_upper and not details['address']:
|
|
|
|
| 350 |
details['citizenship'] = parts[1].strip()
|
| 351 |
|
| 352 |
# Extract Gender - handle cases where GENDER and value are on separate lines
|
| 353 |
+
# Format: 'GENDER' on one line, 'FEMALE' or 'MALE' on next line
|
| 354 |
if "GENDER" in line_upper and not details['gender']:
|
| 355 |
+
# First, check if gender is on the same line after colon
|
| 356 |
if ":" in line:
|
| 357 |
parts = line.split(':', 1)
|
| 358 |
if len(parts) > 1:
|
| 359 |
+
gender_part = parts[1].strip().upper()
|
|
|
|
|
|
|
|
|
|
|
|
|
| 360 |
if gender_part in ['MALE', 'FEMALE', 'M', 'F']:
|
| 361 |
+
details['gender'] = gender_part.capitalize() if len(gender_part) > 1 else gender_part
|
| 362 |
+
print(f"DEBUG: Found gender (same line): {details['gender']}", file=sys.stderr)
|
| 363 |
+
continue
|
| 364 |
+
|
| 365 |
+
# Check next few lines for gender value
|
| 366 |
+
if i + 1 < len(lines):
|
| 367 |
+
for j in range(1, min(4, len(lines) - i)):
|
| 368 |
+
next_line = lines[i+j].strip()
|
| 369 |
+
next_upper = next_line.upper()
|
| 370 |
+
|
| 371 |
+
# Skip if it's clearly a label
|
| 372 |
+
if any(label in next_upper for label in ['NAME', 'ADDRESS', 'BIRTH', 'CITIZEN', 'DATE', 'PLACE', 'PICTURE', 'SIGNATURE', 'THUMBMARK']):
|
| 373 |
+
continue
|
| 374 |
+
|
| 375 |
+
# Check if line starts with colon
|
| 376 |
+
if next_line.startswith(':') and len(next_line) > 1:
|
| 377 |
+
gender_part = next_line[1:].strip().upper()
|
| 378 |
if gender_part in ['MALE', 'FEMALE', 'M', 'F']:
|
| 379 |
+
details['gender'] = gender_part.capitalize() if len(gender_part) > 1 else gender_part
|
| 380 |
+
print(f"DEBUG: Found gender (colon line): {details['gender']}", file=sys.stderr)
|
| 381 |
+
break
|
| 382 |
+
|
| 383 |
+
# Check if the line itself is the gender value
|
| 384 |
+
elif next_upper in ['MALE', 'FEMALE', 'M', 'F']:
|
| 385 |
+
details['gender'] = next_line.capitalize() if len(next_line) > 1 else next_line
|
| 386 |
+
print(f"DEBUG: Found gender (direct): {details['gender']}", file=sys.stderr)
|
| 387 |
+
break
|
| 388 |
+
|
| 389 |
+
# Check if line contains colon with gender value
|
| 390 |
+
elif ":" in next_line:
|
| 391 |
+
parts = next_line.split(':', 1)
|
| 392 |
+
if len(parts) > 1:
|
| 393 |
+
gender_part = parts[1].strip().upper()
|
| 394 |
+
if gender_part in ['MALE', 'FEMALE', 'M', 'F']:
|
| 395 |
+
details['gender'] = gender_part.capitalize() if len(gender_part) > 1 else gender_part
|
| 396 |
+
print(f"DEBUG: Found gender (colon in line): {details['gender']}", file=sys.stderr)
|
| 397 |
+
break
|
| 398 |
|
| 399 |
# Extract ID Number (Usually "ID No.:" or near QR code)
|
| 400 |
if "ID NO" in line_upper or "ID NO." in line_upper:
|