import sys, json, os, glob, requests
import re
import time
from contextlib import redirect_stdout, redirect_stderr
from datetime import datetime

# Point sys.stdout at stderr from here on, so anything printed by this script
# or by imported libraries stays out of the real stdout. The real stream is
# kept in original_stdout and is swapped back in by the main section right
# before the final JSON document is written.
original_stdout = sys.stdout
sys.stdout = sys.stderr

# Environment variables set before PaddleOCR is imported below.
# NOTE(review): presumably PADDLEOCR_LOG_LEVEL quiets PaddleOCR's own logging,
# and QT_QPA_PLATFORM / DISPLAY let GUI-backed dependencies load on a headless
# host — confirm against the PaddleOCR/OpenCV versions actually deployed.
os.environ['PADDLEOCR_LOG_LEVEL'] = 'ERROR'
os.environ['QT_QPA_PLATFORM'] = 'offscreen'
os.environ['DISPLAY'] = ':99'

# Import PaddleOCR after setting environment variables
from paddleocr import PaddleOCR

def download_image(url, output_path='temp_phic_image.jpg'):
    """Download the image at ``url`` and save it to ``output_path``.

    A ``t=<unix time>`` query parameter and no-cache request headers are
    added to the request. Any file already present at ``output_path`` is
    deleted before the download starts.

    Args:
        url: HTTP(S) address of the image.
        output_path: Where the downloaded bytes are written.

    Returns:
        ``output_path``.

    Raises:
        requests.HTTPError: The server answered with an error status.
        requests.RequestException: Any other request failure, including the
            30 second timeout.
        OSError: The old file could not be removed or the new one written.
    """
    # Drop whatever an earlier run left at the target path.
    if os.path.exists(output_path):
        os.remove(output_path)

    # Cache buster: extend the existing query string, or start one.
    joiner = '&' if '?' in url else '?'
    request_url = f'{url}{joiner}t={int(time.time())}'

    # Browser-like user agent plus headers asking for an uncached copy.
    no_cache_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Cache-Control': 'no-cache, no-store, must-revalidate',
        'Pragma': 'no-cache',
        'Expires': '0'
    }

    reply = requests.get(request_url, headers=no_cache_headers, timeout=30)
    reply.raise_for_status()

    with open(output_path, 'wb') as image_file:
        image_file.write(reply.content)

    return output_path

# Month number keyed by the first three letters of the English month name.
# A static table keeps parsing independent of the process locale.
_MONTH_NUMBERS = {
    'JAN': 1, 'FEB': 2, 'MAR': 3, 'APR': 4, 'MAY': 5, 'JUN': 6,
    'JUL': 7, 'AUG': 8, 'SEP': 9, 'OCT': 10, 'NOV': 11, 'DEC': 12,
}

# Known OCR misreads / truncations of month names, applied to the whole string.
# The negative lookaheads stop a truncation fix from firing on a month that is
# already complete (otherwise "March" would become "Marchh").
_DATE_OCR_FIXES = (
    (re.compile(r'VUL'), 'JUL'),
    (re.compile(r'Juy'), 'July'),
    (re.compile(r'Januay'), 'January'),
    (re.compile(r'Februay'), 'February'),
    (re.compile(r'Marc(?!h)'), 'March'),
    (re.compile(r'Apil'), 'April'),
    (re.compile(r'Augu(?!st)'), 'August'),
    (re.compile(r'Septemb(?!er)'), 'September'),
    (re.compile(r'Octob(?!er)'), 'October'),
    (re.compile(r'Novem(?!ber)'), 'November'),
    (re.compile(r'Decemb(?!er)'), 'December'),
)


def _month_number(month_token):
    """Return 1-12 for a (possibly OCR-damaged) month token, or None."""
    key = month_token.upper()
    # Upper-case variants of the misreads that change the first three letters.
    key = key.replace('VUL', 'JUL').replace('JUY', 'JUL').replace('APIL', 'APRIL')
    return _MONTH_NUMBERS.get(key[:3])


def format_date(date_str):
    """Format a date from various formats to YYYY-MM-DD.

    Handles "<Month> <day>, <year>" (any letter case, full or abbreviated
    month, optional comma, optional spaces, '.' misread for ','), as well as
    "YYYY-MM-DD", "MM/DD/YYYY" and "DD/MM/YYYY".

    Args:
        date_str: Raw date text, typically taken from OCR output.

    Returns:
        The date as "YYYY-MM-DD"; the cleaned-up input text when it cannot be
        parsed as a real calendar date; ``None`` for empty input.
    """
    if not date_str:
        return None

    date_str = date_str.strip()

    # Fix common OCR errors first
    for pattern, replacement in _DATE_OCR_FIXES:
        date_str = pattern.sub(replacement, date_str)

    # Fix period instead of comma: "10.2003" -> "10, 2003"
    date_str = re.sub(r'(\d{1,2})\.(\d{4})', r'\1, \2', date_str)

    # "JULY 10, 2003", "July 10 2003", and glued OCR output like "JULY10, 2003"
    match = re.match(r'([A-Za-z]+)\s*(\d{1,2})\s*,?\s*(\d{4})', date_str)
    if match:
        month_str, day, year = match.groups()
        month = _month_number(month_str)
        try:
            if month is None:
                raise ValueError(f"unrecognised month {month_str!r}")
            # Building a real datetime rejects impossible dates (e.g. day 45).
            return datetime(int(year), month, int(day)).date().isoformat()
        except ValueError as e:
            print(f"DEBUG: Date parsing error: {e}", file=sys.stderr)

    # Try other common formats
    for fmt in ("%B %d, %Y", "%b %d, %Y", "%Y-%m-%d", "%m/%d/%Y", "%d/%m/%Y"):
        try:
            return datetime.strptime(date_str, fmt).strftime("%Y-%m-%d")
        except ValueError:
            continue

    # Nothing parsed: hand back the cleaned text rather than losing it.
    return date_str

def format_name(name):
    """Turn an OCR'd name into "Given Names Lastname" with capitalised words.

    When the text contains commas, the part before the first comma is taken
    as the last name and moved to the end; the remaining parts are joined
    with spaces. Without a comma the word order is kept as-is.

    Args:
        name: Raw name text, e.g. "FERNANDEZ,CARL MATTHEW".

    Returns:
        The reordered, capitalised name, or ``None`` for empty input.
    """
    if not name:
        return None

    # Exactly one space between a comma and the capital that follows it:
    # "FERNANDEZ,CARL" -> "FERNANDEZ, CARL"
    text = re.sub(r',\s*([A-Z])', r', \1', name)
    # Separate words glued at a lower-to-upper transition: "JohnSmith" -> "John Smith"
    text = re.sub(r'([a-z])([A-Z])', r'\1 \2', text)
    # Separate a run of capitals from a following Capitalised word: "ABCdef" -> "AB Cdef"
    text = re.sub(r'([A-Z]+)([A-Z][a-z])', r'\1 \2', text)

    # Collapse whitespace runs
    text = ' '.join(text.split())

    # "LASTNAME, FIRST MIDDLE" -> "FIRST MIDDLE LASTNAME"
    if ',' in text:
        surname, *given = (piece.strip() for piece in text.split(','))
        text = f"{' '.join(given)} {surname}"

    # Capitalise every word; splitting again also drops stray spaces
    return ' '.join(word.capitalize() for word in text.split()).strip()

def format_address(address_lines):
    """Join OCR'd address lines into one string and repair known OCR errors.

    Args:
        address_lines: Iterable of address text lines, in reading order.

    Returns:
        The cleaned single-line address, or ``None`` when there is no input
        or every line is blank.
    """
    if not address_lines:
        return None

    # Join the non-blank lines with single spaces
    stripped_lines = (line.strip() for line in address_lines)
    address = ' '.join(line for line in stripped_lines if line)
    if not address:
        return None

    # Fix OCR errors in address
    address = address.replace('DYLABONFACIO', 'BONIFACIO')
    address = address.replace('AVENJE', 'AVENUE')
    address = address.replace('SANIODOMINGO', 'SANTO DOMINGO')
    address = address.replace('SANIO', 'SANTO')
    # The hyphenated form must be handled before the bare one; otherwise the
    # bare replacement consumes "AZAL" first and the hyphen is left in place.
    address = address.replace('AZAL-', 'RIZAL ')
    address = address.replace('AZAL', 'RIZAL')

    # Fix missing space between a number and a following capital: "071A" -> "071 A"
    address = re.sub(r'(\d+)([A-Z])', r'\1 \2', address)

    # Fix missing space between a capital and a following Capitalised word:
    # "ABCdef" -> "AB Cdef"
    address = re.sub(r'([A-Z])([A-Z][a-z]+)', r'\1 \2', address)

    # Remove extra spaces
    return ' '.join(address.split())

# --- Patterns and keyword lists used by extract_phic_details -----------------

# Hyphenated ID as printed on the card: XX-XXXXXXXXX-X
_PHIC_ID_RE = re.compile(r'\d{2}-\d{9}-\d')
# Same ID when the hyphens are missing: exactly twelve digits on the line
_PHIC_BARE_ID_RE = re.compile(r'^\d{12}$')
# A line made only of capitals, whitespace and commas (name-like text)
_PHIC_NAME_RE = re.compile(r'^[A-Z\s,]+$')
# "JULY 10, 2003 - MALE", also glued / mis-punctuated: "VULY10.2003-MALE"
_PHIC_DATE_SEX_RE = re.compile(
    r'([A-Za-z]+)\s*(\d{1,2})[.,]\s*(\d{4})\s*[-]?\s*(MALE|FEMALE)')
# First half of a date split over two lines: "FEBRUARY17," / "FEBRUARY 17,"
_PHIC_SPLIT_DATE_RE = re.compile(r'([A-Za-z]+)\s*(\d{1,2})[,]?$')
# Month, day and year on one line without the sex
_PHIC_DATE_RE = re.compile(r'([A-Za-z]+)\s*(\d{1,2})[.,]\s*(\d{4})')

_PHIC_MONTH_NAMES = (
    'JANUARY', 'FEBRUARY', 'MARCH', 'APRIL', 'MAY', 'JUNE',
    'JULY', 'AUGUST', 'SEPTEMBER', 'OCTOBER', 'NOVEMBER', 'DECEMBER',
)

# Substrings that mark a comma-bearing line as card boilerplate, not a name
_PHIC_NAME_LABELS = (
    'REPUBLIC', 'PHILIPPINE', 'HEALTH', 'INSURANCE', 'CORPORATION',
    'PHILHEALTH', 'DATE', 'BIRTH', 'ADDRESS', 'MEMBERSHIP', 'CATEGORY',
)
# Same, for a comma-less line considered as part of a name
_PHIC_NAME_PART_LABELS = _PHIC_NAME_LABELS + ('INFORMAL', 'ECONOMY')
# A line following the name is NOT treated as its continuation if it contains
# any of these ('MALE' also covers 'FEMALE').
_PHIC_NAME_CONTINUATION_STOPS = _PHIC_NAME_PART_LABELS + (
    'MALE', 'EMPLOYED', 'VOLUNTARY', 'SPONSORED', 'DEPENDENT',
    'AVENUE', 'AVENJE', 'STREET', 'ROAD', 'BLVD', 'BOULEVARD',
)

# Substrings (including known OCR misreads) that start address collection
_PHIC_ADDRESS_INDICATORS = (
    'AVENUE', 'AVENJE', 'STREET', 'ROAD', 'BLVD', 'BOULEVARD', 'CAINTA',
    'RIZAL', 'AZAL', 'SANTO', 'SANIO', 'DOMINGO', 'BONIFACIO', 'DYLABONFACIO',
)
# Substrings that end address collection
_PHIC_ADDRESS_STOPS = (
    'MEMBERSHIP', 'CATEGORY', 'INFORMAL', 'ECONOMY', 'DATE', 'BIRTH',
    'MALE', 'FEMALE',
)

_PHIC_CATEGORY_KEYWORDS = (
    'INFORMAL', 'EMPLOYED', 'SELF-EMPLOYED', 'VOLUNTARY', 'SPONSORED',
    'DEPENDENT',
)
# A category keyword is ignored on lines that contain any of these.
# NOTE(review): 'FERNANDEZ' is a specific surname carried over unchanged from
# the previous implementation — confirm it is still wanted.
_PHIC_CATEGORY_EXCLUSIONS = ('AVENUE', 'STREET', 'RIZAL', 'CAINTA', 'FERNANDEZ')


def _phic_contains_any(text, needles):
    """Return True when any of ``needles`` is a substring of ``text``."""
    return any(needle in text for needle in needles)


def _phic_repair_month(token):
    """Expand an upper-case OCR month token to the full month name.

    Fixes the 'VUL'->'JUL' and 'APIL'->'APRIL' misreads, then completes a
    truncated month ("FEBRUA" -> "FEBRUARY"). A token that is not a prefix
    (three letters or more) of a month name is returned with only the
    misread fixes applied, so a month that is already complete is never
    given a second suffix.
    """
    token = token.replace('VUL', 'JUL').replace('APIL', 'APRIL')
    if len(token) >= 3:
        for month_name in _PHIC_MONTH_NAMES:
            if month_name.startswith(token):
                return month_name
    return token


def _phic_is_id_line(line):
    """True when the line carries an ID number (hyphenated, or 12 bare digits)."""
    return bool(_PHIC_ID_RE.search(line) or _PHIC_BARE_ID_RE.match(line))


def _phic_is_short_comma_name(line):
    """True for an all-caps "LASTNAME, FIRST ..." line of at most five words."""
    return (',' in line
            and bool(_PHIC_NAME_RE.match(line))
            and len(line.split()) <= 5)


def _phic_match_id_number(line):
    """Return the ID number found on ``line`` without hyphens, or None."""
    hyphenated = _PHIC_ID_RE.search(line)
    if hyphenated:
        return hyphenated.group(0).replace('-', '')
    if _PHIC_BARE_ID_RE.match(line):
        return line
    return None


def _phic_is_name_continuation(line):
    """True when ``line`` looks like the wrapped second half of a name."""
    return (',' not in line
            and bool(_PHIC_NAME_RE.match(line))
            and not _phic_contains_any(line.upper(), _PHIC_NAME_CONTINUATION_STOPS))


def _phic_match_full_name(cleaned_lines, i):
    """Return the raw (unformatted) name starting or ending at line ``i``, or None."""
    line = cleaned_lines[i]
    if not _PHIC_NAME_RE.match(line):
        return None
    upper = line.upper()

    if ',' in line:
        # "LASTNAME, FIRST MIDDLE" — reject card boilerplate
        if _phic_contains_any(upper, _PHIC_NAME_LABELS):
            return None
        # Long names wrap: "AGUILAR,JEDD" then "EISHEN BAYATAN". The wrapped
        # part has to be picked up here, because once the name is stored no
        # later line is examined for it.
        if i + 1 < len(cleaned_lines) and _phic_is_name_continuation(cleaned_lines[i + 1]):
            return f"{line} {cleaned_lines[i + 1]}"
        return line

    # Comma-less line: usable only as the tail of a comma-bearing line just
    # above it that was not accepted as a name on its own.
    if _phic_contains_any(upper, _PHIC_NAME_PART_LABELS):
        return None
    if i > 0:
        prev_line = cleaned_lines[i - 1]
        if ',' in prev_line and _PHIC_NAME_RE.match(prev_line):
            return f"{prev_line} {line}"
    return None


def _phic_match_birth_date(cleaned_lines, i):
    """Return "<MONTH> <day>, <year>" found at line ``i``, or None."""
    upper = cleaned_lines[i].upper()

    # Date followed by the sex on the same line
    with_sex = _PHIC_DATE_SEX_RE.search(upper)
    if with_sex:
        month, day, year = with_sex.group(1, 2, 3)
        return f"{_phic_repair_month(month)} {day}, {year}"

    # Month + day on this line, year on the next ("FEBRUARY17," / "2003-").
    # Only accepted when the word really is a month; otherwise tokens such as
    # "BLK5" followed by any four digits would be stored as a birth date.
    split_date = _PHIC_SPLIT_DATE_RE.search(upper)
    if split_date:
        month = _phic_repair_month(split_date.group(1))
        if month in _PHIC_MONTH_NAMES:
            if i + 1 < len(cleaned_lines):
                year_match = re.search(r'(\d{4})', cleaned_lines[i + 1])
                if year_match:
                    return f"{month} {split_date.group(2)}, {year_match.group(1)}"
            return None

    # Full date on one line, skipping lines that look like an address
    same_line = _PHIC_DATE_RE.search(upper)
    if same_line and not _phic_contains_any(upper, ('AVENUE', 'STREET', 'RIZAL')):
        month, day, year = same_line.group(1, 2, 3)
        return f"{_phic_repair_month(month)} {day}, {year}"
    return None


def _phic_match_sex(upper):
    """Return "Male"/"Female" when the upper-cased line mentions it, else None."""
    with_date = _PHIC_DATE_SEX_RE.search(upper)
    if with_date:
        return with_date.group(4).capitalize()
    # "FEMALE" contains "MALE", so it has to be tested first
    if "FEMALE" in upper:
        return "Female"
    if "MALE" in upper:
        return "Male"
    return None


def _phic_collect_address_lines(cleaned_lines, i):
    """Return up to three address lines starting at line ``i`` (may be empty)."""
    line = cleaned_lines[i]
    starts_address = (
        _phic_contains_any(line.upper(), _PHIC_ADDRESS_INDICATORS)
        or (re.match(r'^\d+', line) and len(line) > 5)
    )
    # An ID number (with or without hyphens) also starts with digits, and a
    # short comma name may contain an indicator word; neither is an address.
    if not starts_address or _phic_is_id_line(line) or _phic_is_short_comma_name(line):
        return []

    collected = []
    for candidate in cleaned_lines[i:i + 3]:
        # Stop at membership category, date/sex, ID number or name lines
        if (_phic_contains_any(candidate.upper(), _PHIC_ADDRESS_STOPS)
                or _phic_is_id_line(candidate)
                or _phic_is_short_comma_name(candidate)):
            break
        if len(candidate) > 2:
            collected.append(candidate)
    return collected


def _phic_match_membership_category(cleaned_lines, i):
    """Return the membership category found at line ``i``, or None."""
    line = cleaned_lines[i]
    upper = line.upper()

    if "MEMBERSHIP" in upper and "CATEGORY" in upper:
        # The label line itself: the value is taken from the next line
        if i + 1 < len(cleaned_lines) and cleaned_lines[i + 1]:
            return cleaned_lines[i + 1]
        return None
    if "INFORMAL" in upper and "ECONOMY" in upper:
        return "INFORMAL ECONOMY"
    if (_phic_contains_any(upper, _PHIC_CATEGORY_KEYWORDS)
            and not _phic_contains_any(upper, _PHIC_CATEGORY_EXCLUSIONS)):
        return line
    return None


def extract_phic_details(lines):
    """Pull the PhilHealth ID fields out of OCR text lines.

    Lines are scanned in order and, for every field, the first line that
    yields a value wins. The raw name, birth date and address are then passed
    through ``format_name``, ``format_date`` and ``format_address``.

    Args:
        lines: OCR text lines in reading order. Items are converted with
            ``str`` and blank ones are dropped. ``None`` is treated as empty.

    Returns:
        dict with keys ``id_type`` (always 'PHIC'), ``id_number`` (digits
        only), ``full_name``, ``birth_date``, ``sex``, ``address``,
        ``membership_category`` (each ``None`` when not found) and
        ``success`` (True when an ID number or a name was found).
    """
    details = {
        'id_type': 'PHIC',
        'id_number': None,
        'full_name': None,
        'birth_date': None,
        'sex': None,
        'address': None,
        'membership_category': None,
        'success': False
    }

    # Clean lines - convert to strings and strip
    cleaned_lines = [str(line).strip() for line in (lines or []) if str(line).strip()]

    for i, line in enumerate(cleaned_lines):
        if not details['id_number']:
            details['id_number'] = _phic_match_id_number(line)

        if not details['full_name']:
            details['full_name'] = _phic_match_full_name(cleaned_lines, i)

        if not details['birth_date']:
            details['birth_date'] = _phic_match_birth_date(cleaned_lines, i)

        if not details['sex']:
            details['sex'] = _phic_match_sex(line.upper())

        if not details['address']:
            address_lines = _phic_collect_address_lines(cleaned_lines, i)
            if address_lines:
                details['address'] = format_address(address_lines)

        if not details['membership_category']:
            details['membership_category'] = _phic_match_membership_category(cleaned_lines, i)

    # Format extracted fields
    if details['full_name']:
        details['full_name'] = format_name(details['full_name'])
    if details['birth_date']:
        details['birth_date'] = format_date(details['birth_date'])

    if details['id_number'] or details['full_name']:
        details['success'] = True

    return details

def extract_ocr_lines(image_path):
    """Run PaddleOCR on an image file and parse the recognised text.

    Two result layouts are understood: a list whose first item is an
    OCRResult-style mapping holding a ``rec_texts`` list, and the older
    nested-list layout where each entry's second element starts with the
    recognised text. Errors while reading the results are logged to stderr
    and leave the text list as it was.

    Args:
        image_path: Path of the image to read.

    Returns:
        ``{'success': False, 'error': 'File not found'}`` when the file does
        not exist; the dict produced by ``extract_phic_details`` when text
        was recognised; otherwise the same dict shape with every field
        ``None`` and ``success`` False.
    """
    if not os.path.exists(image_path):
        return {'success': False, 'error': 'File not found'}

    file_size = os.path.getsize(image_path)
    print(f"DEBUG: Image file size: {file_size} bytes", file=sys.stderr)

    # Whatever PaddleOCR prints while loading or recognising goes to stderr
    with redirect_stdout(sys.stderr), redirect_stderr(sys.stderr):
        engine = PaddleOCR(
            use_doc_orientation_classify=False,
            use_doc_unwarping=False,
            use_textline_orientation=False,
            lang='en'
        )
        try:
            results = engine.ocr(image_path)
        except Exception as e:
            print(f"DEBUG: ocr() failed: {e}, trying predict()", file=sys.stderr)
            # Fall back to predict() when this PaddleOCR object offers it
            results = engine.predict(image_path) if hasattr(engine, 'predict') else None

    print(f"DEBUG: Raw OCR results type: {type(results)}", file=sys.stderr)

    recognised = []
    try:
        if results and isinstance(results, list):
            head = results[0]
            head_type_name = type(head).__name__

            if 'OCRResult' in head_type_name or 'ocr_result' in str(type(head)).lower():
                # Newer layout: mapping-like result object
                print(f"DEBUG: Detected OCRResult object format (type: {head_type_name})", file=sys.stderr)
                try:
                    if hasattr(head, 'keys'):
                        texts = dict(head).get('rec_texts')
                        if isinstance(texts, list):
                            recognised = [str(text) for text in texts if text]
                            print(f"DEBUG: Extracted {len(recognised)} text lines from rec_texts", file=sys.stderr)
                except Exception as e:
                    print(f"DEBUG: Error accessing OCRResult: {e}", file=sys.stderr)
            else:
                # Older layout: entries whose second element is (text, ...)
                rows = head if isinstance(head, list) else results
                for row in rows:
                    if not (isinstance(row, (list, tuple)) and len(row) >= 2):
                        continue
                    text_info = row[1]
                    if isinstance(text_info, (list, tuple)) and len(text_info) >= 1:
                        recognised.append(str(text_info[0]))
    except Exception as e:
        print(f"DEBUG: Error processing OCR results: {str(e)}", file=sys.stderr)
        import traceback
        print(f"DEBUG: Traceback: {traceback.format_exc()}", file=sys.stderr)

    print(f"DEBUG: Extracted text lines: {recognised}", file=sys.stderr)

    if not recognised:
        # Nothing recognised: same shape as a parsed result, all fields empty
        return {
            'id_type': 'PHIC',
            'id_number': None,
            'full_name': None,
            'birth_date': None,
            'sex': None,
            'address': None,
            'membership_category': None,
            'success': False
        }
    return extract_phic_details(recognised)

# Main Execution
#
# Usage: <script> <image_url>
# Exactly one JSON document is written to the real stdout; all diagnostics go
# to stderr. The exit status is 1 when no URL was given or processing failed.
import tempfile

if len(sys.argv) < 2:
    sys.stdout = original_stdout
    print(json.dumps({"success": False, "error": "No image URL provided"}))
    sys.exit(1)

image_url = sys.argv[1]
print(f"DEBUG: Processing PhilHealth ID image URL: {image_url}", file=sys.stderr)

# A unique temp file per run: a fixed file name in the working directory would
# let two concurrent runs overwrite or delete each other's image.
temp_image_path = None
try:
    temp_fd, temp_image_path = tempfile.mkstemp(prefix='phic_', suffix='.jpg')
    os.close(temp_fd)

    image_path = download_image(image_url, temp_image_path)
    print(f"DEBUG: Image downloaded to: {image_path}", file=sys.stderr)

    ocr_results = extract_ocr_lines(image_path)
    print(f"DEBUG: OCR results: {ocr_results}", file=sys.stderr)

    response = {
        "success": ocr_results['success'],
        "data": ocr_results
    }
    # Serialise before touching stdout so a failure cannot leave partial output
    payload = json.dumps(response)

    sys.stdout = original_stdout
    sys.stdout.write(payload)
    sys.stdout.flush()

except Exception as e:
    sys.stdout = original_stdout
    sys.stdout.write(json.dumps({"success": False, "error": str(e)}))
    sys.stdout.flush()
    sys.exit(1)
finally:
    # Best-effort cleanup; runs on success, on failure and on sys.exit()
    if temp_image_path is not None:
        try:
            os.remove(temp_image_path)
        except FileNotFoundError:
            pass
        except OSError as cleanup_error:
            print(f"DEBUG: Could not remove temp image {temp_image_path}: {cleanup_error}", file=sys.stderr)