import sys, json, os, glob, re, requests
from PIL import Image
from io import BytesIO
from datetime import datetime
from contextlib import redirect_stdout, redirect_stderr

# Immediately redirect all output to stderr except for our final JSON.
# From here on anything written through sys.stdout (e.g. print() calls made
# by imported libraries) lands on stderr. The real stdout is kept in
# original_stdout and is put back only right before the JSON payload is
# written, so stdout carries nothing but that JSON.
original_stdout = sys.stdout
sys.stdout = sys.stderr

# Suppress all PaddleOCR output
# NOTE(review): presumably PADDLEOCR_LOG_LEVEL quiets PaddleOCR's own logging
# and the QT_QPA_PLATFORM / DISPLAY values let GUI-capable dependencies load
# on a machine without a real display -- confirm against the deployment host.
os.environ['PADDLEOCR_LOG_LEVEL'] = 'ERROR'
os.environ['QT_QPA_PLATFORM'] = 'offscreen'
os.environ['DISPLAY'] = ':99'

# Import PaddleOCR after setting environment variables
# (this import is intentionally not at the top of the file: it must run after
# the stdout redirect and the os.environ assignments above).
from paddleocr import PaddleOCR

def dprint(msg, obj=None):
    """Write a ``DEBUG:``-prefixed diagnostic line to stderr.

    Args:
        msg: Text placed right after the ``DEBUG: `` prefix.
        obj: Optional value appended as ``": <obj>"``. It is left out only
            when it is ``None``; falsy values such as ``0`` or ``""`` are
            still printed.

    Any exception raised while formatting or printing is swallowed on
    purpose: debug output must never break the OCR run.
    """
    try:
        text = f"DEBUG: {msg}"
        if obj is not None:
            text += f": {obj}"
        print(text, file=sys.stderr)
    except Exception:
        pass

def clean_cache():
    """Delete the temporary artefacts of a previous run from the working directory.

    Removes the fixed set of ``temp_image*`` files if they exist, then
    removes the ``output`` directory tree if it exists. Each removal is
    reported through ``dprint``. Errors raised by ``os.remove`` or
    ``shutil.rmtree`` propagate to the caller.
    """
    stale_files = (
        'temp_image.jpg',
        'temp_image_ocr_res_img.jpg',
        'temp_image_preprocessed_img.jpg',
        'temp_image_res.json',
    )
    for path in stale_files:
        if not os.path.exists(path):
            continue
        os.remove(path)
        dprint("Removed cache file", path)

    if not os.path.exists("output"):
        return
    # shutil is only needed on this branch, so it is imported lazily here.
    import shutil
    shutil.rmtree("output")
    dprint("Removed output directory")

def download_image(url, output_path='temp_image.jpg'):
    """Download an image and store it locally as an RGB JPEG.

    The cache left by a previous run is cleared first (``clean_cache``), the
    URL is fetched with a browser-like User-Agent and a 30 second timeout,
    and the decoded image is flattened to RGB before being saved.

    Images that carry transparency (``RGBA``, ``LA``, or palette images with
    a ``transparency`` entry) are composited onto a white background.
    Converting them straight to RGB would simply drop the alpha channel
    instead of filling see-through regions with white.

    Args:
        url: Address of the image to fetch.
        output_path: Destination file; written as JPEG with quality 95.

    Returns:
        ``output_path``, unchanged.

    Raises:
        requests.HTTPError: If the response status is an error
            (via ``raise_for_status``).
        Exception: Anything else raised by ``requests.get``, ``Image.open``
            or ``Image.save`` propagates unchanged.
    """
    dprint("Starting download", url)
    clean_cache()
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    r = requests.get(url, headers=headers, timeout=30)
    dprint("HTTP status", r.status_code)
    r.raise_for_status()

    img = Image.open(BytesIO(r.content))
    has_transparency = img.mode in ('RGBA', 'LA') or (
        img.mode == 'P' and 'transparency' in img.info
    )
    if has_transparency:
        # Normalise every transparent variant to RGBA so the alpha band can
        # be used as the paste mask over a white canvas.
        rgba = img if img.mode == 'RGBA' else img.convert('RGBA')
        bg = Image.new('RGB', rgba.size, (255, 255, 255))
        bg.paste(rgba, mask=rgba.split()[-1])
        img = bg
    elif img.mode != 'RGB':
        img = img.convert('RGB')

    img.save(output_path, 'JPEG', quality=95)
    dprint("Saved image", output_path)
    return output_path

def cap_words(s):
    """Return *s* with each whitespace-separated word capitalised.

    Words are re-joined with single spaces, so runs of whitespace collapse.
    Returns ``None`` when *s* is empty or ``None``.
    """
    if not s:
        return None
    capitalised = []
    for word in s.split():
        capitalised.append(word.capitalize())
    return ' '.join(capitalised)

def normalize_full_name(s):
    """Reduce a raw name string to ``first [second] last`` form.

    Whitespace is collapsed first. Then:

    * With a comma (``"LAST, GIVEN NAMES"``): the text before the first
      comma is the last name; up to two space-separated tokens of the
      remainder are kept as given names and placed in front of it.
    * Without a comma: three or more tokens yield the first, the second
      and the final token; one or two tokens are returned as they are.

    Returns ``None`` when *s* is empty or ``None``.
    """
    if not s:
        return None
    collapsed = ' '.join(s.split())

    if ',' not in collapsed:
        words = [w for w in collapsed.split(' ') if w]
        if len(words) < 2:
            return collapsed
        if len(words) == 2:
            chosen = words
        else:
            chosen = [words[0], words[1], words[-1]]
        return ' '.join(chosen).strip()

    # "LAST, GIVEN ..." layout: everything after the first comma is re-joined
    # with ',' (no spaces) before being split into given-name tokens.
    surname, *after_comma = [chunk.strip() for chunk in collapsed.split(',')]
    given_names = [g for g in ','.join(after_comma).strip().split(' ') if g]
    return ' '.join(given_names[:2] + [surname]).strip()

def extract_sss_number(text):
    """Return the first ``XX-XXXXXXX-X`` number found in *text*.

    The pattern is two digits, seven digits and one digit separated by
    hyphens, delimited by word boundaries. Returns ``None`` when nothing
    matches.
    """
    found = re.search(r'\b(\d{2}-\d{7}-\d{1})\b', text)
    return found.group(1) if found else None

def extract_sss_info(lines):
    """Build the SSS ID result dict from OCR text lines.

    Every line is stringified and stripped, then scanned once:

    * the first ``XX-XXXXXXX-X`` match becomes both ``sss_number`` and
      ``id_number``;
    * lines that are a single all-caps ASCII word, and multi-word lines
      containing an all-caps run and no run of three or more digits, are
      collected as name parts unless they contain one of the card-boilerplate
      skip words.

    The collected parts are joined with spaces and passed through
    ``normalize_full_name`` to produce ``full_name``.

    Args:
        lines: Iterable of OCR text entries; ``None`` entries are treated as
            empty strings.

    Returns:
        dict with the keys ``id_type``, ``sss_number``, ``id_number``,
        ``full_name``, ``birth_date``, ``address``, ``sex`` and
        ``nationality``. ``birth_date``, ``address`` and ``sex`` are always
        ``None``; ``nationality`` is always ``"Filipino"``.
    """
    dprint("Lines to extract", lines)

    # NOTE(review): the single-word list additionally contains 'CORAZON' while
    # the multi-word list does not -- confirm whether that is intentional.
    single_word_skips = ['REPUBLIC', 'PHILIPPINES', 'SOCIAL', 'SECURITY', 'SYSTEM', 'SSS', 'PRESIDENT', 'PROUD', 'FILIPINO', 'CORAZON']
    multi_word_skips = ['REPUBLIC', 'PHILIPPINES', 'SOCIAL', 'SECURITY', 'SYSTEM', 'SSS', 'PRESIDENT', 'PROUD', 'FILIPINO']

    sss_number = None
    name_parts = []

    cleaned = [str(entry or '').strip() for entry in lines]
    for idx, text in enumerate(cleaned):
        dprint("Line", {"i": idx, "text": text})

        # Look for SSS number pattern (XX-XXXXXXX-X); only the first hit counts.
        if not sss_number:
            candidate = extract_sss_number(text)
            if candidate:
                sss_number = candidate
                dprint("Found SSS number", sss_number)

        # Single all-caps word that is not card boilerplate.
        is_single_caps_word = (
            re.search(r'^[A-Z]{2,}$', text) and not re.search(r'[0-9]', text)
        )
        if is_single_caps_word:
            if all(word not in text.upper() for word in single_word_skips):
                name_parts.append(text)
                dprint("Added name part", text)

        # Multi-word line with an all-caps run and no long digit run.
        is_caps_phrase = (
            len(text.split()) >= 2
            and re.search(r'[A-Z]{2,}', text)
            and not re.search(r'[0-9]{3,}', text)
        )
        if is_caps_phrase:
            if all(word not in text.upper() for word in multi_word_skips):
                name_parts.append(text)
                dprint("Added multi-word name part", text)

    # Compose the full name only after every line has been seen.
    full_name = None
    if name_parts:
        full_name = normalize_full_name(' '.join(name_parts))
        dprint("Composed name from all parts", {"parts": name_parts, "result": full_name})

    result = {
        "id_type": "SSS ID",
        "sss_number": sss_number,
        "id_number": sss_number,
        "full_name": full_name,
        "birth_date": None,
        "address": None,
        "sex": None,
        "nationality": "Filipino"
    }
    dprint("Final result", result)
    return result

def extract_ocr_lines(image_path):
    """Run PaddleOCR on *image_path* and return the extracted SSS ID fields.

    Steps:

    1. Ensure the ``output`` directory exists and build a ``PaddleOCR``
       instance (English, with orientation/unwarping stages disabled), with
       stdout and stderr both routed to stderr so the library cannot write
       to the JSON channel.
    2. Call ``ocr.predict``; if that raises, fall back to ``ocr.ocr`` when the
       object has such a method.
    3. Collect recognised text from either result layout the code knows
       about: an ``OCRResult``-like mapping exposing ``rec_texts``, or the
       list layout where each item's second element starts with the text.
    4. Hand the text lines to ``extract_sss_info``.

    The result always comes from ``extract_sss_info`` -- also when no text
    was recognised -- so callers get the same set of keys in every case.
    (Previously the empty case returned a shorter dict lacking ``address``,
    ``sex`` and ``nationality``.)

    Args:
        image_path: Path of the image file to analyse.

    Returns:
        The dict produced by ``extract_sss_info``.

    Raises:
        Exception: Errors from constructing ``PaddleOCR`` or from the
            ``ocr.ocr`` fallback propagate. Errors while interpreting the
            results are logged and treated as "no text".
    """
    os.makedirs("output", exist_ok=True)
    dprint("Initializing PaddleOCR")

    with redirect_stdout(sys.stderr), redirect_stderr(sys.stderr):
        ocr = PaddleOCR(
            use_doc_orientation_classify=False,
            use_doc_unwarping=False,
            use_textline_orientation=False,
            lang='en'
        )
        dprint("OCR initialized")
        dprint("Running OCR predict", image_path)
        try:
            results = ocr.predict(image_path)
        except Exception as e:
            dprint("predict() failed, trying ocr()", str(e))
            results = ocr.ocr(image_path) if hasattr(ocr, 'ocr') else None

    # Purely diagnostic: number of entries in the first page if the result is
    # a list of lists, otherwise the length of the result itself.
    try:
        if results and isinstance(results, list) and isinstance(results[0], list):
            count = len(results[0])
        else:
            count = len(results) if results else 0
    except Exception:
        count = 0
    dprint("OCR done, results_count", count)

    # Process OCR results - handle both old format (list) and new format (OCRResult object)
    all_text = []
    try:
        if results and isinstance(results, list) and len(results) > 0:
            first_item = results[0]
            item_type_name = type(first_item).__name__
            is_ocr_result = 'OCRResult' in item_type_name or 'ocr_result' in str(type(first_item)).lower()

            if is_ocr_result:
                dprint("Detected OCRResult object format", f"type: {item_type_name}")
                # Access OCRResult as dictionary; only the first result is read.
                try:
                    if hasattr(first_item, 'keys'):
                        ocr_dict = dict(first_item)
                        rec_texts = ocr_dict.get('rec_texts')
                        if isinstance(rec_texts, list):
                            all_text = [str(t) for t in rec_texts if t]
                            dprint("Extracted text lines from rec_texts", len(all_text))
                except Exception as e:
                    dprint("Error accessing OCRResult", str(e))
            else:
                # Old format - list of lists; each usable item is a sequence
                # whose second element is itself a sequence starting with the text.
                lines = first_item if isinstance(first_item, list) else results
                for item in lines:
                    if isinstance(item, (list, tuple)) and len(item) >= 2:
                        meta = item[1]
                        if isinstance(meta, (list, tuple)) and len(meta) >= 1:
                            all_text.append(str(meta[0]))
    except Exception as e:
        dprint("Error processing OCR results", str(e))
        import traceback
        dprint("Traceback", traceback.format_exc())

    dprint("All direct texts", all_text)
    # Delegate unconditionally so the empty case shares the full result schema.
    return extract_sss_info(all_text)

# ---------------------------------------------------------------------------
# Script entry point.
#
# Usage: <script> IMAGE_URL
#
# Exactly one JSON document is written to the real stdout:
#   success -> {"success": true, "ocr_results": {...}}          (exit code 0)
#   failure -> {"error": "<message>"}                           (exit code 1)
# All diagnostics go to stderr via dprint().
# ---------------------------------------------------------------------------
if len(sys.argv) < 2:
    sys.stdout = original_stdout
    print(json.dumps({"error": "No image URL provided"}))
    sys.exit(1)

image_url = sys.argv[1]
dprint("Processing image URL", image_url)
try:
    image_path = download_image(image_url)
    dprint("Image downloaded to", image_path)
    ocr_results = extract_ocr_lines(image_path)
    dprint("OCR results ready")

    # Restore stdout and print only the JSON response
    sys.stdout = original_stdout
    sys.stdout.write(json.dumps({"success": True, "ocr_results": ocr_results}))
    sys.stdout.flush()

except Exception as e:
    dprint("Exception", str(e))
    # Restore stdout for error JSON
    sys.stdout = original_stdout
    sys.stdout.write(json.dumps({"error": str(e)}))
    sys.stdout.flush()
    sys.exit(1)
finally:
    # Best-effort cleanup of temporary files; runs on success, on failure and
    # while sys.exit(1) is propagating. Only ordinary errors are suppressed
    # (and logged to stderr) so SystemExit / KeyboardInterrupt are never
    # swallowed here and a failed cleanup is still visible in the debug log.
    try:
        clean_cache()
    except Exception as cleanup_error:
        dprint("Cache cleanup failed", str(cleanup_error))