import sys, json, os, glob, re, requests
from PIL import Image
from io import BytesIO
from datetime import datetime
from contextlib import redirect_stdout, redirect_stderr

# Keep a handle on the real stdout, then point sys.stdout at stderr so that
# anything written through sys.stdout while the script runs lands on stderr.
# The script swaps original_stdout back in only when it emits its JSON payload.
original_stdout = sys.stdout
sys.stdout = sys.stderr

# Environment tweaks applied before PaddleOCR is imported.
# NOTE(review): presumably PADDLEOCR_LOG_LEVEL quiets PaddleOCR logging and the
# QT_QPA_PLATFORM / DISPLAY values let GUI-linked dependencies load without a
# real display -- confirm the installed versions actually honor these.
os.environ['PADDLEOCR_LOG_LEVEL'] = 'ERROR'
os.environ['QT_QPA_PLATFORM'] = 'offscreen'
os.environ['DISPLAY'] = ':99'

# Import PaddleOCR only now, so the stdout redirect and the environment
# variables above are already in place when the package is loaded.
from paddleocr import PaddleOCR

def dprint(msg, obj=None):
    """Write a ``DEBUG: <msg>[: <obj>]`` line to stderr.

    Args:
        msg: Message text placed after the ``DEBUG: `` prefix.
        obj: Optional value appended after ``": "``; omitted when ``None``.

    Any exception raised while formatting or printing is swallowed so that
    debug logging can never break the caller.
    """
    try:
        text = f"DEBUG: {msg}"
        if obj is not None:
            text += f": {obj}"
        print(text, file=sys.stderr)
    except Exception:
        # Deliberate best-effort: diagnostics must not abort the OCR run.
        pass

def clean_cache():
    """Remove leftover temporary files and the ``output`` directory.

    Deletes each of the fixed temp-image artifacts from the current working
    directory when it exists, then removes the ``output`` directory tree when
    it exists. Every removal is reported through ``dprint``.
    """
    import shutil

    leftovers = (
        'temp_image.jpg',
        'temp_image_ocr_res_img.jpg',
        'temp_image_preprocessed_img.jpg',
        'temp_image_res.json',
    )
    for leftover in leftovers:
        if not os.path.exists(leftover):
            continue
        os.remove(leftover)
        dprint("Removed cache file", leftover)

    if not os.path.exists("output"):
        return
    shutil.rmtree("output")
    dprint("Removed output directory")

def download_image(url, output_path='temp_image.jpg'):
    """Fetch an image over HTTP and store it locally as an RGB JPEG.

    Args:
        url: Address of the image to download.
        output_path: Destination file for the converted JPEG.

    Returns:
        ``output_path``, once the file has been written.

    The cache is cleared first via ``clean_cache``. Errors from the HTTP
    request (including ``raise_for_status``) or from image decoding/saving
    are not caught here and propagate to the caller.
    """
    dprint("Starting download", url)
    clean_cache()

    browser_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    response = requests.get(url, headers={'User-Agent': browser_agent}, timeout=30)
    dprint("HTTP status", response.status_code)
    response.raise_for_status()

    picture = Image.open(BytesIO(response.content))
    if picture.mode == 'RGBA':
        # JPEG has no alpha channel: composite onto a white canvas, using the
        # last band (alpha) as the paste mask.
        canvas = Image.new('RGB', picture.size, (255,255,255))
        canvas.paste(picture, mask=picture.split()[-1])
        picture = canvas
    elif picture.mode != 'RGB':
        picture = picture.convert('RGB')

    picture.save(output_path, 'JPEG', quality=95)
    dprint("Saved image", output_path)
    return output_path

def format_date(s):
    """Normalize an OCR-read date string to ``YYYY-MM-DD``.

    Accepted shapes (spaces are ignored; ``\\`` and ``.`` count as ``/``):
      * numeric ``YYYY/MM/DD`` or ``YYYY-MM-DD``
      * month name or abbreviation followed by day and year,
        e.g. ``"January 5, 2020"`` or ``"JAN5 2020"``

    Args:
        s: Date text to normalize.

    Returns:
        ``None`` for empty input, the ``YYYY-MM-DD`` string when the text
        parses to a real calendar date, otherwise the input unchanged.
    """
    if not s:
        return None
    raw = s
    compact = s.strip().replace(' ', '').replace('\\', '/').replace('.', '/')

    numeric = re.match(r'^(\d{4})[-/](\d{2})[-/](\d{2})$', compact)
    if numeric:
        year, month, day = (int(part) for part in numeric.groups())
        try:
            # Constructing the datetime rejects impossible dates such as
            # month 13 or February 30 instead of emitting them as normalized.
            datetime(year, month, day)
        except ValueError:
            return raw
        return f"{year:04d}-{month:02d}-{day:02d}"

    named = re.match(r'([A-Za-z]+)(\d{1,2}),?(\d{4})', compact)
    if named:
        month_name, day_text, year_text = named.groups()
        try:
            # Only the first three letters are parsed, so full names and
            # longer abbreviations ("Sept") resolve through '%b'.
            month = datetime.strptime(month_name[:3], '%b').month
            datetime(int(year_text), month, int(day_text))
        except ValueError:
            pass
        else:
            return f"{int(year_text):04d}-{month:02d}-{int(day_text):02d}"

    # Last resort: let strptime try the untouched text against known layouts.
    for fmt in ("%B%d,%Y", "%b%d,%Y", "%B %d, %Y", "%b %d, %Y"):
        try:
            return datetime.strptime(raw, fmt).strftime("%Y-%m-%d")
        except ValueError:
            continue
    return raw

def cap_words(name):
    """Capitalize each whitespace-separated word of ``name``.

    Returns ``None`` for empty/``None`` input; otherwise the words joined by
    single spaces, each passed through ``str.capitalize``.
    """
    if not name:
        return None
    return ' '.join(word.capitalize() for word in name.split())


def normalize_full_name(s):
    """Turn a raw license name into ``"Given [Given2] Last"`` in title case.

    Two layouts are recognized:
      * ``"LAST, FIRST [SECOND ...] [MIDDLE ...]"`` -- everything before the
        first comma is the surname; at most two given-name tokens are kept.
      * ``"FIRST [SECOND ...] LAST"`` -- the final token is the surname; at
        most two of the preceding tokens are kept as given names.

    Args:
        s: Raw name text (may contain irregular whitespace).

    Returns:
        The normalized name, or ``None`` when ``s`` is empty or blank.
    """
    if not s:
        return None
    raw = ' '.join(s.split())

    # Format: "LAST, FIRST [SECOND ...] [MIDDLE...]"
    if ',' in raw:
        parts = [p.strip() for p in raw.split(',')]
        last = parts[0] if parts else ''
        given_block = ','.join(parts[1:]).strip() if len(parts) > 1 else ''
        tokens = [t for t in given_block.split(' ') if t]
        # keep up to two given names, drop the rest (assumed middle)
        given_kept = tokens[:2]
        return cap_words(' '.join(given_kept + [last]).strip())

    # Fallback: "FIRST [SECOND ...] LAST"
    tokens = [t for t in raw.split(' ') if t]
    if len(tokens) >= 2:
        last = tokens[-1]
        # Take given names from the tokens *before* the surname; slicing the
        # full list would repeat the surname for two-token names.
        given_kept = tokens[:-1][:2]
        return cap_words(' '.join(given_kept + [last]).strip())
    return cap_words(raw)

def take_within(lines, start_idx, max_ahead=5):
    """Collect the non-blank entries that follow ``start_idx``.

    Args:
        lines: Indexable sequence of OCR entries.
        start_idx: Index of the anchor entry; it is not included itself.
        max_ahead: How many positions after the anchor to inspect.

    Returns:
        The stripped string form of each inspected entry that is non-empty,
        in original order. Positions past the end of ``lines`` are ignored.
    """
    stripped = (
        str(lines[pos]).strip()
        for pos in range(start_idx + 1, start_idx + max_ahead + 1)
        if pos < len(lines)
    )
    return [text for text in stripped if text]

def find_match(ahead, predicate):
    """Return the first item of ``ahead`` accepted by ``predicate``.

    Args:
        ahead: Iterable of candidate values.
        predicate: Callable taking one candidate; a truthy result selects it.

    Returns:
        The first accepted candidate, or ``None`` when none is accepted.
    """
    return next((candidate for candidate in ahead if predicate(candidate)), None)

def is_date_text(t):
    """Report whether ``t`` looks like a date in one of the known layouts.

    Recognizes numeric ``YYYY/MM/DD`` / ``YYYY-MM-DD`` (after dropping spaces
    and treating ``\\`` and ``.`` as ``/``) and ``<month word> <day>[,] <year>``.
    """
    normalized = t.replace(' ', '').replace('\\','/').replace('.','/')
    if re.match(r'^\d{4}[-/]\d{2}[-/]\d{2}$', normalized):
        return True
    # The month-word layout is tested against the untouched text.
    return re.match(r'^[A-Za-z]+\s*\d{1,2},?\s*\d{4}$', t) is not None

def is_weight(t):
    """Return True when ``t`` is a 2-3 digit number with optional decimals."""
    return re.match(r'^\d{2,3}(\.\d+)?$', t) is not None

def is_height(t):
    """Return True when ``t`` is a single digit followed by a decimal part.

    Two patterns are tried; every string accepted by the first (one or two
    decimals) is also accepted by the second (any number of decimals).
    """
    patterns = (r'^\d(\.\d{1,2})$', r'^\d\.\d+$')
    return any(re.match(pattern, t) is not None for pattern in patterns)

def is_sex(t):
    """Return True when ``t`` is M, F, MALE or FEMALE, ignoring case."""
    accepted = {'M', 'F', 'MALE', 'FEMALE'}
    return t.upper() in accepted

def join_name_lines(line1, line2):
    """Merge a name that OCR split over two lines into one string.

    Example: ``"FURAGGANAN,REIN"`` and ``"ANDRE DUCUSIN"`` become
    ``"FURAGGANAN,REIN ANDRE DUCUSIN"``.

    Returns ``None`` when ``line1`` is empty, ``line1`` alone when ``line2``
    is empty, otherwise both joined by a single space.
    """
    if not line1:
        return None
    return f"{line1} {line2}" if line2 else line1

def extract_drivers_license_info(lines):
    """Parse OCR text lines from a driver's license into a field dictionary.

    The lines are scanned top to bottom. Most fields are located by finding a
    label line (e.g. one containing ``weight``) and then taking the first of
    the next few non-empty lines whose text has the expected shape. The
    license number and, as a fallback, the birth date and a bare ``PHL``
    nationality code are recognized from the value line alone.

    Args:
        lines: Iterable of OCR text entries; falsy entries are treated as
            empty strings and ``None`` is treated as no lines at all.

    Returns:
        dict with the keys ``id_type``, ``license_number``, ``id_number``
        (same value as ``license_number``), ``full_name``, ``birth_date``,
        ``nationality``, ``sex``, ``weight``, ``height``, ``address``,
        ``blood_type``, ``eyes_color``, ``dl_codes``, ``conditions``,
        ``agency_code`` and ``expiration_date``. Fields that were not found
        are ``None``.
    """
    dprint("Lines to extract", lines)

    license_number = None
    full_name = None
    birth_date = None
    nationality = None
    sex = None
    weight = None
    height = None
    address = None
    expiration_date = None
    agency_code = None
    blood_type = None
    eyes_color = None
    dl_codes = None
    conditions = None

    def is_nationality(t):
        return t.upper() in ('PHL', 'FILIPINO', 'PHILIPPINES')

    # Lines containing any of these are label text, not part of the address.
    address_stop_labels = ('license', 'expiration', 'agency code', 'blood', 'eyes', 'dl codes', 'conditions')

    i = 0
    L = [str(x or '').strip() for x in (lines or [])]
    while i < len(L):
        line = L[i]
        low = line.lower()
        dprint("Line", {"i": i, "text": line})

        if not license_number and re.match(r'^[A-Z]\d{2}-\d{2}-\d{6}$', line):
            license_number = line
            dprint("Found license_number", license_number)

        if ('last name' in low and 'first name' in low) or 'last name.first name.middle' in low:
            name_l1 = L[i+1] if i+1 < len(L) else ''
            name_l2 = L[i+2] if i+2 < len(L) else ''
            consumed = 3  # the label line plus two name lines
            # Skip a stray "Name" line
            if name_l1.lower() == 'name':
                name_l1, name_l2 = name_l2, (L[i+3] if i+3 < len(L) else '')
                # One extra line was used, so step over it as well; otherwise
                # the second name line would be re-scanned as a label.
                consumed = 4
            # NOTE(review): the second line is always appended, even when the
            # whole name fit on the first line -- confirm against real scans.
            combined = join_name_lines(name_l1, name_l2)
            full_name = normalize_full_name(combined)
            dprint("Found full_name", full_name)
            i += consumed
            continue

        if re.search(r'nation.?l.?ity', low):
            ahead = take_within(L, i, 5)
            cand = find_match(ahead, is_nationality)
            if cand:
                nationality = cand
                dprint("Found nationality", nationality)
        elif nationality is None and low == 'phl':
            # Bare country code with no label seen: the line is the value.
            # Compared case-insensitively, since OCR output is normally upper case.
            nationality = line
            dprint("Found nationality", nationality)

        if 'sex' in low:
            ahead = take_within(L, i, 5)
            cand = find_match(ahead, is_sex)
            if cand:
                sex = 'M' if cand.upper().startswith('M') else 'F'
                dprint("Found sex", sex)

        if 'date of birth' in low:
            ahead = take_within(L, i, 5)
            cand = find_match(ahead, is_date_text)
            if cand:
                birth_date = format_date(cand)
                dprint("Found birth_date", birth_date)

        # Fallback: with no labelled birth date so far, the first date-looking
        # line is taken as the birth date.
        if birth_date is None and is_date_text(line):
            birth_date = format_date(line)
            dprint("Found standalone birth_date", birth_date)

        if 'weight' in low:
            ahead = take_within(L, i, 5)
            cand = find_match(ahead, is_weight)
            if cand:
                weight = cand
                dprint("Found weight", weight)

        if 'height' in low:
            ahead = take_within(L, i, 5)
            cand = find_match(ahead, is_height)
            if cand:
                height = cand
                dprint("Found height", height)

        if 'address' in low:
            # Up to three following lines; label lines are skipped, not
            # treated as a terminator.
            parts = [
                t for t in L[i+1:i+4]
                if t and not any(lbl in t.lower() for lbl in address_stop_labels)
            ]
            if parts:
                address = ', '.join(parts)
                dprint("Found address", address)

        if 'expiration date' in low:
            ahead = take_within(L, i, 6)
            cand = find_match(ahead, is_date_text)
            if cand:
                expiration_date = format_date(cand)
                dprint("Found expiration_date", expiration_date)

        if 'agency code' in low:
            ahead = take_within(L, i, 5)
            cand = find_match(ahead, lambda t: re.match(r'^[A-Z]\d{2}$', t) is not None)
            if cand:
                agency_code = cand
                dprint("Found agency_code", agency_code)

        if 'blood' in low and 'type' in low:
            ahead = take_within(L, i, 3)
            cand = find_match(ahead, lambda t: re.match(r'^[ABO][+-]?$', t.upper()) is not None)
            if cand:
                blood_type = cand.upper()
                dprint("Found blood_type", blood_type)

        if 'eyes' in low and 'color' in low:
            ahead = take_within(L, i, 5)
            cand = find_match(ahead, lambda t: t.isalpha())
            if cand:
                eyes_color = cap_words(cand)
                dprint("Found eyes_color", eyes_color)

        if 'dl codes' in low:
            ahead = take_within(L, i, 3)
            cand = find_match(ahead, lambda t: True)  # take first non-empty
            if cand:
                dl_codes = cand
                dprint("Found dl_codes", dl_codes)

        if 'conditions' in low:
            ahead = take_within(L, i, 3)
            cand = find_match(ahead, lambda t: True)  # take first non-empty
            if cand:
                conditions = cand
                dprint("Found conditions", conditions)

        i += 1

    result = {
        'id_type': "Driver's License",
        'license_number': license_number,
        'id_number': license_number,  # for frontend compatibility
        'full_name': full_name,
        'birth_date': birth_date,
        'nationality': nationality,
        'sex': sex,
        'weight': weight,
        'height': height,
        'address': address,
        'blood_type': blood_type,
        'eyes_color': eyes_color,
        'dl_codes': dl_codes,
        'conditions': conditions,
        'agency_code': agency_code,
        'expiration_date': expiration_date
    }
    dprint("Final result", result)
    return result

def extract_ocr_lines(image_path):
    """Run PaddleOCR on an image and parse the recognized text as a license.

    Args:
        image_path: Path of the image file handed to PaddleOCR.

    Returns:
        The dictionary produced by ``extract_drivers_license_info``. When no
        text could be recognized the same keys are returned with every
        extracted field set to ``None``.

    ``predict()`` is tried first; if it raises, ``ocr()`` is used when the
    object provides it. Errors while unpacking the OCR result are logged and
    leave the text list empty rather than propagating.
    """
    os.makedirs("output", exist_ok=True)
    dprint("Initializing PaddleOCR")

    # Route anything the OCR stack prints to stderr so stdout stays clean.
    with redirect_stdout(sys.stderr), redirect_stderr(sys.stderr):
        ocr = PaddleOCR(
            use_doc_orientation_classify=False,
            use_doc_unwarping=False,
            use_textline_orientation=False,
            lang='en'
        )
        dprint("OCR initialized")
        dprint("Running OCR predict", image_path)
        try:
            results = ocr.predict(image_path)
        except Exception as e:
            dprint("predict() failed, trying ocr()", str(e))
            results = ocr.ocr(image_path) if hasattr(ocr, 'ocr') else None
    # results is None when neither call produced anything; len(None) would
    # raise, so only measure objects that actually have a length.
    results_count = len(results) if hasattr(results, '__len__') else 0
    dprint("OCR predict done, results_count", results_count)

    # Process OCR results - handle both old format (list) and new format (OCRResult object)
    all_text = []
    try:
        if results and isinstance(results, list) and len(results) > 0:
            first_item = results[0]
            item_type_name = type(first_item).__name__
            is_ocr_result = 'OCRResult' in item_type_name or 'ocr_result' in str(type(first_item)).lower()

            if is_ocr_result:
                dprint("Detected OCRResult object format", f"type: {item_type_name}")
                # The result object is read as a mapping; the recognized
                # strings are expected under the 'rec_texts' key.
                try:
                    if hasattr(first_item, 'keys'):
                        ocr_dict = dict(first_item)
                        if 'rec_texts' in ocr_dict:
                            rec_texts = ocr_dict['rec_texts']
                            if isinstance(rec_texts, list):
                                all_text = [str(t) for t in rec_texts if t]
                                dprint("Extracted text lines from rec_texts", len(all_text))
                except Exception as e:
                    dprint("Error accessing OCRResult", str(e))
            else:
                # Old format - list of lists; each entry is read as
                # (something, (text, ...)) and only the text is kept.
                lines = results[0] if results and isinstance(results[0], list) else results
                for item in lines:
                    if isinstance(item, (list, tuple)) and len(item) >= 2:
                        meta = item[1]
                        if isinstance(meta, (list, tuple)) and len(meta) >= 1:
                            all_text.append(str(meta[0]))
    except Exception as e:
        dprint("Error processing OCR results", str(e))
        import traceback
        dprint("Traceback", traceback.format_exc())

    dprint("All direct texts", all_text)
    # Always go through the parser, even with no text, so callers get the
    # same set of keys in every case.
    return extract_drivers_license_info(all_text)

# Script entry: expects the image URL as the first command-line argument.
# Exactly one JSON document is written to the real stdout; all diagnostics go
# to stderr. Exit status is 1 on a missing argument or any processing error.
if len(sys.argv) < 2:
    sys.stdout = original_stdout
    print(json.dumps({"error": "No image URL provided"}))
    sys.exit(1)

image_url = sys.argv[1]
dprint("Processing image URL", image_url)
try:
    image_path = download_image(image_url)
    dprint("Image downloaded to", image_path)
    ocr_results = extract_ocr_lines(image_path)
    dprint("OCR results ready")

    # Restore stdout and print only the JSON response
    sys.stdout = original_stdout
    sys.stdout.write(json.dumps({"success": True, "ocr_results": ocr_results}))
    sys.stdout.flush()

except Exception as e:
    dprint("Exception", str(e))
    # Restore stdout for error JSON
    sys.stdout = original_stdout
    sys.stdout.write(json.dumps({"error": str(e)}))
    sys.stdout.flush()
    sys.exit(1)
finally:
    # Best-effort cleanup of temp files. A failure here must not change the
    # outcome already reported, but it is logged instead of being hidden, and
    # interrupts are no longer swallowed.
    try:
        clean_cache()
    except Exception as cleanup_error:
        dprint("Cache cleanup failed", str(cleanup_error))