sohiyiy committed on
Commit
e1d82cf
·
verified ·
1 Parent(s): 403d0e5

Upload folder using huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +370 -562
app.py CHANGED
@@ -1,14 +1,14 @@
1
  """
2
  🐦 BirdSense Pro - AI Bird Identification
3
- Uses Vision-Language Models (VLM) for accurate bird recognition
4
- - Local: Ollama with LLaVA (best accuracy)
5
- - Cloud: HuggingFace Inference API with Vision models
6
  """
7
 
8
  import gradio as gr
9
  import numpy as np
10
  import scipy.signal as signal
11
- from typing import Tuple, List, Dict, Generator, Optional
12
  import json
13
  import requests
14
  import re
@@ -20,15 +20,10 @@ import io
20
  import base64
21
 
22
  # ================== CONFIG ==================
23
- SAMPLE_RATE = 48000
24
  OLLAMA_URL = os.environ.get("OLLAMA_URL", "http://localhost:11434")
25
  HF_TOKEN = os.environ.get("HF_TOKEN", "")
26
  DEBUG = True
27
 
28
- # Model priorities
29
- OLLAMA_VISION_MODELS = ["llava:7b", "llava", "bakllava", "llava:13b"]
30
- OLLAMA_TEXT_MODELS = ["llama3.2", "qwen2.5:3b", "mistral", "phi4"]
31
-
32
  def log(msg):
33
  if DEBUG:
34
  print(f"[BirdSense] {msg}")
@@ -38,134 +33,100 @@ def log(msg):
38
  CSS = """
39
  .gradio-container {
40
  background: linear-gradient(135deg, #f0f4f8 0%, #d9e2ec 100%) !important;
41
- font-family: 'Inter', -apple-system, BlinkMacSystemFont, sans-serif !important;
42
  }
43
  .header {
44
  background: linear-gradient(135deg, #1a365d 0%, #2c5282 50%, #3182ce 100%);
45
- color: white; padding: 40px 24px; border-radius: 20px;
46
- text-align: center; margin-bottom: 20px;
47
- box-shadow: 0 15px 40px rgba(26, 54, 93, 0.3);
48
  }
49
- .header h1 { font-size: 2.5rem; font-weight: 800; margin: 0 0 10px 0; }
50
- .header .subtitle { font-size: 1.1rem; opacity: 0.9; margin-bottom: 12px; }
51
  .header .status {
52
- display: inline-flex; align-items: center; gap: 8px;
53
- background: rgba(255,255,255,0.15); padding: 8px 20px; border-radius: 50px;
54
- font-weight: 600; font-size: 0.9rem;
55
  }
56
- .status-dot { width: 10px; height: 10px; background: #48bb78; border-radius: 50%; animation: pulse 2s infinite; }
57
- @keyframes pulse { 0%, 100% { opacity: 1; } 50% { opacity: 0.5; } }
 
 
58
 
59
  .info-box {
60
  background: linear-gradient(135deg, #ebf4ff 0%, #c3dafe 100%);
61
- border: 1px solid #90cdf4; border-radius: 12px; padding: 16px; margin-bottom: 16px;
62
  }
63
- .info-box h3 { color: #2b6cb0; margin: 0 0 6px 0; font-size: 1rem; }
64
- .info-box p { color: #4299e1; margin: 0; font-size: 0.9rem; }
65
 
66
  .bird-card {
67
- background: white; border: 1px solid #e2e8f0; border-radius: 16px;
68
- padding: 20px; margin: 12px 0; display: flex; gap: 16px;
69
- box-shadow: 0 4px 15px rgba(0,0,0,0.05);
70
- transition: transform 0.2s, box-shadow 0.2s;
71
  }
72
- .bird-card:hover { transform: translateY(-2px); box-shadow: 0 8px 25px rgba(0,0,0,0.1); }
73
- .bird-card img { width: 120px; height: 120px; object-fit: cover; border-radius: 12px; flex-shrink: 0; }
74
  .bird-info { flex: 1; min-width: 0; }
75
- .bird-info h3 { color: #1a202c; margin: 0 0 4px 0; font-size: 1.2rem; font-weight: 700; }
76
- .bird-info .scientific { color: #718096; font-style: italic; font-size: 0.85rem; margin-bottom: 10px; }
77
- .confidence { display: inline-block; padding: 4px 12px; border-radius: 20px; font-weight: 700; font-size: 0.8rem; }
78
  .conf-high { background: #c6f6d5; color: #22543d; }
79
  .conf-med { background: #fefcbf; color: #744210; }
80
  .conf-low { background: #fed7d7; color: #742a2a; }
81
- .reason { color: #4a5568; margin-top: 10px; line-height: 1.6; font-size: 0.9rem; }
82
-
83
- .error { background: #fff5f5; border: 1px solid #fc8181; border-radius: 12px; padding: 20px; color: #c53030; }
84
- .success { background: #f0fff4; border: 1px solid #68d391; border-radius: 12px; padding: 20px; color: #276749; }
85
- .processing { background: #ebf8ff; border: 1px solid #63b3ed; border-radius: 12px; padding: 20px; color: #2b6cb0; }
86
 
87
- .features-box {
88
- background: #f7fafc; border: 1px solid #e2e8f0; border-radius: 10px;
89
- padding: 14px; margin: 10px 0; font-family: monospace; font-size: 0.85rem;
90
- }
91
-
92
- @media (max-width: 768px) {
93
- .header h1 { font-size: 1.8rem; }
94
- .bird-card { flex-direction: column; }
95
- .bird-card img { width: 100%; height: 180px; }
96
- }
97
  """
98
 
99
 
100
- # ================== UTILITY FUNCTIONS ==================
101
-
102
- def image_to_base64(image: Image.Image) -> str:
103
- """Convert PIL image to base64."""
104
- buffered = io.BytesIO()
105
- # Resize for faster processing
106
- max_size = 800
107
- if max(image.size) > max_size:
108
- ratio = max_size / max(image.size)
109
- new_size = (int(image.size[0] * ratio), int(image.size[1] * ratio))
110
- image = image.resize(new_size, Image.Resampling.LANCZOS)
111
- image.save(buffered, format="JPEG", quality=85)
112
- return base64.b64encode(buffered.getvalue()).decode()
113
-
114
-
115
- def get_wikipedia_image(bird_name: str) -> str:
116
- """Get bird image from Wikipedia."""
117
- if not bird_name or bird_name.lower() in ['unknown', 'the bird', 'the image', 'bird']:
118
- return "https://via.placeholder.com/150x150.png?text=Bird"
119
-
120
- try:
121
- clean_name = bird_name.strip().replace(" ", "_")
122
- api_url = f"https://en.wikipedia.org/api/rest_v1/page/summary/{urllib.parse.quote(clean_name)}"
123
-
124
- response = requests.get(api_url, timeout=5)
125
- if response.status_code == 200:
126
- data = response.json()
127
- if "thumbnail" in data:
128
- return data["thumbnail"]["source"]
129
- except Exception as e:
130
- log(f"Wikipedia image fetch failed: {e}")
131
-
132
- return f"https://via.placeholder.com/150x150.png?text={urllib.parse.quote(bird_name[:12])}"
133
-
134
-
135
  # ================== OLLAMA FUNCTIONS ==================
136
 
137
- def check_ollama() -> Tuple[bool, Optional[str], bool]:
138
- """Check Ollama availability. Returns (available, model_name, is_vision_model)."""
 
139
  try:
140
  response = requests.get(f"{OLLAMA_URL}/api/tags", timeout=3)
141
  if response.status_code == 200:
142
  models = [m["name"] for m in response.json().get("models", [])]
143
  log(f"Ollama models: {models}")
 
144
 
145
- # Check for vision models first
146
- for vm in OLLAMA_VISION_MODELS:
147
- for m in models:
148
- if vm.split(":")[0] in m.lower():
149
- log(f"Found vision model: {m}")
150
- return True, m, True
151
 
152
- # Fall back to text models
153
- for tm in OLLAMA_TEXT_MODELS:
154
- for m in models:
155
- if tm.split(":")[0] in m.lower():
156
- log(f"Found text model: {m}")
157
- return True, m, False
158
-
159
- return False, None, False
160
  except Exception as e:
161
- log(f"Ollama not available: {e}")
162
- return False, None, False
 
163
 
164
 
165
- def call_ollama_vision(image: Image.Image, prompt: str, model: str) -> str:
166
- """Call Ollama vision model (LLaVA)."""
167
  try:
168
- img_b64 = image_to_base64(image)
 
 
 
 
 
 
 
 
 
 
 
169
 
170
  response = requests.post(
171
  f"{OLLAMA_URL}/api/generate",
@@ -174,106 +135,95 @@ def call_ollama_vision(image: Image.Image, prompt: str, model: str) -> str:
174
  "prompt": prompt,
175
  "images": [img_b64],
176
  "stream": False,
177
- "options": {"temperature": 0.2, "num_predict": 1500}
178
  },
179
- timeout=180
180
  )
181
 
182
  if response.status_code == 200:
183
  result = response.json().get("response", "")
184
- log(f"LLaVA response: {result[:300]}...")
185
  return result
186
  else:
187
- log(f"Ollama vision error: {response.status_code} - {response.text[:200]}")
188
- return ""
189
  except Exception as e:
190
- log(f"Ollama vision failed: {e}")
191
- return ""
192
 
193
 
194
  def call_ollama_text(prompt: str, model: str) -> str:
195
- """Call Ollama text model."""
196
  try:
 
197
  response = requests.post(
198
  f"{OLLAMA_URL}/api/generate",
199
  json={
200
  "model": model,
201
  "prompt": prompt,
202
  "stream": False,
203
- "options": {"temperature": 0.2, "num_predict": 1000}
204
  },
205
- timeout=90
206
  )
207
-
208
  if response.status_code == 200:
209
  return response.json().get("response", "")
210
- return ""
211
  except Exception as e:
212
- log(f"Ollama text error: {e}")
213
- return ""
214
 
215
 
216
  # ================== HUGGINGFACE FUNCTIONS ==================
217
 
218
- def call_huggingface_vlm(image: Image.Image, prompt: str) -> str:
219
- """Call HuggingFace Vision-Language Model."""
220
  if not HF_TOKEN:
221
- log("No HF_TOKEN - skipping HuggingFace")
222
  return ""
223
 
224
  headers = {"Authorization": f"Bearer {HF_TOKEN}"}
225
 
226
- # Convert image to bytes
227
- img_bytes = io.BytesIO()
228
- # Resize for API
229
- max_size = 600
230
  if max(image.size) > max_size:
231
  ratio = max_size / max(image.size)
232
- new_size = (int(image.size[0] * ratio), int(image.size[1] * ratio))
233
- image = image.resize(new_size, Image.Resampling.LANCZOS)
234
- image.save(img_bytes, format="JPEG", quality=80)
235
- img_bytes.seek(0)
236
 
237
- # Try BLIP models for image captioning
238
  models = [
239
  "Salesforce/blip-image-captioning-large",
240
  "Salesforce/blip-image-captioning-base",
241
- "nlpconnect/vit-gpt2-image-captioning",
242
  ]
243
 
244
  for model in models:
245
  try:
246
- log(f"Trying HF vision model: {model}")
247
- api_url = f"https://api-inference.huggingface.co/models/{model}"
248
-
249
  response = requests.post(
250
- api_url,
251
  headers=headers,
252
- data=img_bytes.getvalue(),
253
- timeout=60
254
  )
255
 
256
  if response.status_code == 200:
257
  result = response.json()
258
- if isinstance(result, list) and len(result) > 0:
259
  caption = result[0].get("generated_text", "")
260
  if caption:
261
  log(f"HF caption: {caption}")
262
  return caption
263
  elif response.status_code == 503:
264
- log(f"Model {model} loading...")
265
- continue
266
  else:
267
- log(f"HF {model}: {response.status_code}")
268
-
269
  except Exception as e:
270
- log(f"HF vision error: {e}")
271
- continue
272
 
273
  return ""
274
 
275
 
276
- def call_huggingface_text(prompt: str) -> str:
277
  """Call HuggingFace text model."""
278
  if not HF_TOKEN:
279
  return ""
@@ -281,344 +231,280 @@ def call_huggingface_text(prompt: str) -> str:
281
  headers = {"Authorization": f"Bearer {HF_TOKEN}", "Content-Type": "application/json"}
282
 
283
  models = [
284
- "mistralai/Mistral-7B-Instruct-v0.3",
285
  "HuggingFaceH4/zephyr-7b-beta",
286
- "google/flan-t5-xxl",
287
  ]
288
 
289
  for model in models:
290
  try:
291
  log(f"Trying HF text model: {model}")
292
- api_url = f"https://api-inference.huggingface.co/models/{model}"
293
-
294
  response = requests.post(
295
- api_url,
296
  headers=headers,
297
- json={
298
- "inputs": prompt,
299
- "parameters": {"max_new_tokens": 800, "temperature": 0.3, "return_full_text": False}
300
- },
301
- timeout=60
302
  )
303
 
304
  if response.status_code == 200:
305
  result = response.json()
306
- if isinstance(result, list) and len(result) > 0:
307
  text = result[0].get("generated_text", "")
308
  if text:
309
- log(f"HF text response: {text[:200]}...")
310
  return text
311
  elif response.status_code == 503:
312
  continue
313
-
314
  except Exception as e:
315
  log(f"HF text error: {e}")
316
- continue
317
 
318
  return ""
319
 
320
 
321
- # ================== BIRD IDENTIFICATION ==================
322
 
323
- LLAVA_BIRD_PROMPT = """You are an expert ornithologist. Look at this bird image very carefully.
324
-
325
- Identify the bird species. Provide your answer as JSON:
326
-
327
- {
328
- "birds": [
329
- {
330
- "name": "Zebra Finch",
331
- "scientific_name": "Taeniopygia guttata",
332
- "confidence": 95,
333
- "reason": "Identified by orange cheek patches, red-orange beak, black and white barred throat, chestnut flanks with white spots"
334
- }
335
- ],
336
- "summary": "This is a Zebra Finch, a small Australian finch commonly kept as a pet."
337
- }
338
-
339
- Be SPECIFIC with the bird name - use the actual species name like "House Sparrow", "Indian Robin", "Zebra Finch", etc.
340
- DO NOT use generic names like "The bird" or "Unknown".
341
- If you're not 100% sure, still provide your best guess with lower confidence.
342
-
343
- Return ONLY valid JSON."""
344
-
345
-
346
- TEXT_IDENTIFICATION_PROMPT = """You are an expert ornithologist. Based on this image description, identify the bird species.
347
-
348
- IMAGE DESCRIPTION: {description}
349
-
350
- Respond with JSON ONLY:
351
- {{
352
- "birds": [
353
- {{
354
- "name": "Species Common Name",
355
- "scientific_name": "Scientific name",
356
- "confidence": 85,
357
- "reason": "Specific features that match this species"
358
- }}
359
- ],
360
- "summary": "Brief identification summary"
361
- }}
362
-
363
- IMPORTANT:
364
- - Use ACTUAL bird species names (e.g., "Zebra Finch", "House Sparrow", "Indian Roller")
365
- - NEVER use generic names like "The bird", "Unknown", "The image"
366
- - If description mentions orange beak, striped throat, spotted flanks - this is likely a Zebra Finch
367
- - Provide your best species guess even if uncertain
368
-
369
- Return ONLY the JSON, nothing else."""
370
-
371
-
372
- def parse_bird_json(text: str) -> Tuple[List[Dict], str]:
373
- """Parse bird identification from LLM response."""
374
  birds = []
375
  summary = ""
376
 
377
- # Clean the text
378
- text = text.strip()
 
 
379
 
380
- # Try to find JSON in the response
381
  try:
382
- # Look for JSON object
383
- json_match = re.search(r'\{[\s\S]*?"birds"[\s\S]*?\}(?=\s*$|\s*```)', text)
384
  if json_match:
385
  json_str = json_match.group()
386
- # Fix common JSON issues
387
- json_str = re.sub(r',\s*}', '}', json_str)
388
- json_str = re.sub(r',\s*]', ']', json_str)
389
-
390
  data = json.loads(json_str)
391
- birds = data.get("birds", [])
 
392
  summary = data.get("summary", "")
393
 
394
- # Validate bird names
395
- valid_birds = []
396
- for bird in birds:
397
- name = bird.get("name", "").strip()
398
- # Filter out garbage names
399
- if name and name.lower() not in ['the bird', 'the image', 'unknown', 'bird', 'a bird']:
400
- valid_birds.append(bird)
 
 
 
401
 
402
- if valid_birds:
403
- return valid_birds, summary
404
-
405
  except json.JSONDecodeError as e:
406
  log(f"JSON parse error: {e}")
407
 
408
- # Fallback: Try to extract bird names from text
409
- log("Falling back to text extraction...")
410
-
411
- # Common bird species patterns
412
- bird_patterns = [
413
- r'(?:identified as|this is|appears to be|likely|probably)\s+(?:a|an)?\s*([A-Z][a-z]+(?: [A-Z]?[a-z]+)+)',
414
- r'([A-Z][a-z]+ (?:Finch|Sparrow|Robin|Warbler|Dove|Pigeon|Parrot|Kingfisher|Woodpecker|Eagle|Hawk|Owl|Heron|Crane|Duck|Goose|Swan|Crow|Raven|Jay|Magpie|Starling|Myna|Bulbul|Sunbird|Flowerpecker|Barbet|Drongo|Shrike|Oriole|Flycatcher|Thrush|Babbler))',
415
- r'(Zebra Finch|House Sparrow|Indian Robin|Common Myna|Red-vented Bulbul|Rose-ringed Parakeet)',
416
  ]
417
 
418
- for pattern in bird_patterns:
419
- matches = re.findall(pattern, text, re.IGNORECASE)
420
- if matches:
421
- for match in matches[:2]: # Take first 2 matches
422
- birds.append({
423
- "name": match.strip().title(),
424
- "scientific_name": "See reference",
425
- "confidence": 70,
426
- "reason": "Extracted from AI analysis"
427
- })
 
 
 
 
 
 
 
 
 
 
 
428
  break
429
 
430
- # If still no birds found, check for specific description matches
431
- if not birds:
432
- text_lower = text.lower()
433
- if 'orange' in text_lower and ('beak' in text_lower or 'bill' in text_lower):
434
- if 'stripe' in text_lower or 'bar' in text_lower or 'spot' in text_lower:
435
- birds.append({
436
- "name": "Zebra Finch",
437
- "scientific_name": "Taeniopygia guttata",
438
- "confidence": 75,
439
- "reason": "Orange beak with striped/spotted pattern suggests Zebra Finch"
440
- })
441
- elif 'grey' in text_lower or 'gray' in text_lower:
442
- if 'small' in text_lower:
443
- birds.append({
444
- "name": "House Sparrow",
445
- "scientific_name": "Passer domesticus",
446
- "confidence": 60,
447
- "reason": "Small grey bird - possibly House Sparrow"
448
- })
 
 
 
 
 
449
 
450
- return birds, summary or "Based on AI visual analysis"
 
451
 
452
 
453
  def format_bird_card(bird: Dict, index: int) -> str:
454
- """Format bird result as HTML card."""
455
- name = bird.get("name", "Unknown Species")
456
  scientific = bird.get("scientific_name", "")
457
  confidence = bird.get("confidence", 50)
458
  reason = bird.get("reason", "")
459
 
460
- # Skip invalid names
461
- if name.lower() in ['unknown', 'the bird', 'the image', 'bird']:
462
- name = "Unidentified Bird"
463
 
464
- img_url = get_wikipedia_image(name)
465
-
466
- if confidence >= 80:
467
- conf_class = "conf-high"
468
- elif confidence >= 60:
469
- conf_class = "conf-med"
470
- else:
471
- conf_class = "conf-low"
472
 
473
  return f"""
474
  <div class="bird-card">
475
- <img src="{img_url}" alt="{name}" onerror="this.src='https://via.placeholder.com/120x120.png?text=Bird'">
476
  <div class="bird-info">
477
  <h3>{index}. {name}</h3>
478
- <div class="scientific">{scientific}</div>
479
  <span class="confidence {conf_class}">{confidence}% confidence</span>
480
  <p class="reason">{reason}</p>
481
  </div>
482
  </div>"""
483
 
484
 
485
- # ================== IMAGE IDENTIFICATION ==================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
486
 
487
  def identify_image_stream(image):
488
- """Identify bird from image using Vision-Language Model."""
489
  if image is None:
490
- yield '<div class="error">⚠️ Please upload an image first</div>'
491
  return
492
 
493
  try:
494
- # Ensure PIL Image
495
  if not isinstance(image, Image.Image):
496
  image = Image.fromarray(np.array(image))
497
  image = image.convert("RGB")
498
 
499
- yield '<div class="processing">πŸ” Analyzing bird image...</div>'
500
 
501
- # Check for Ollama
502
- ollama_available, model, is_vision = check_ollama()
 
503
 
504
- llm_response = ""
505
- image_description = ""
 
 
 
506
 
507
- if ollama_available and is_vision:
508
- # BEST PATH: Use LLaVA for direct image analysis
509
- yield f'<div class="processing">πŸ¦™ Using LLaVA vision model ({model})...</div>'
510
-
511
- llm_response = call_ollama_vision(image, LLAVA_BIRD_PROMPT, model)
512
-
513
- if not llm_response:
514
- # FALLBACK 1: HuggingFace vision + text
515
  yield '<div class="processing">☁️ Using HuggingFace AI...</div>'
516
 
517
- image_description = call_huggingface_vlm(image, "Describe this bird in detail")
 
518
 
519
- if image_description:
520
- yield f'''<div class="processing">πŸ” Identifying from description...</div>
521
- <div class="features-box"><strong>AI saw:</strong> {image_description}</div>'''
522
 
523
  # Use text model to identify
524
- prompt = TEXT_IDENTIFICATION_PROMPT.format(description=image_description)
525
-
526
- if ollama_available and model:
527
- llm_response = call_ollama_text(prompt, model)
528
-
529
- if not llm_response:
530
- llm_response = call_huggingface_text(prompt)
531
-
532
- if not llm_response and not image_description:
533
- # FALLBACK 2: Basic color analysis
534
- yield '<div class="processing">⚠️ Using basic color analysis...</div>'
535
- image_description = analyze_colors(image)
536
-
537
- prompt = TEXT_IDENTIFICATION_PROMPT.format(description=image_description)
538
- if ollama_available and model:
539
- llm_response = call_ollama_text(prompt, model)
 
540
 
541
  # Parse response
542
- if llm_response:
543
- birds, summary = parse_bird_json(llm_response)
544
- else:
545
- birds, summary = [], "Could not get AI response"
546
 
547
  if not birds:
548
- yield f"""<div class="error">
549
- <strong>❌ Could not identify bird species</strong>
550
- <p>The AI couldn't make a confident identification.</p>
551
- {f'<div class="features-box"><strong>AI description:</strong> {image_description[:300]}</div>' if image_description else ''}
552
- <p>Try uploading a clearer image with the bird in focus.</p>
553
- </div>"""
554
  return
555
 
556
- # Format results
557
- result_html = f"""<div class="success">
558
- <h2>🐦 {len(birds)} Bird(s) Identified!</h2>
559
- <p>{summary}</p>
560
- </div>"""
561
 
562
  for i, bird in enumerate(birds, 1):
563
- result_html += format_bird_card(bird, i)
564
-
565
- yield result_html
566
-
567
- except Exception as e:
568
- log(f"Image identification error: {traceback.format_exc()}")
569
- yield f'<div class="error"><strong>❌ Error:</strong> {str(e)}</div>'
570
-
571
-
572
- def analyze_colors(image: Image.Image) -> str:
573
- """Basic color analysis fallback."""
574
- try:
575
- img = np.array(image)
576
- h, w = img.shape[:2]
577
 
578
- def describe_region(region):
579
- mean = np.mean(region, axis=(0, 1))
580
- r, g, b = mean
581
-
582
- colors = []
583
- if r > 180 and g < 120 and b < 120: colors.append("red")
584
- elif r > 180 and g > 100 and b < 100: colors.append("orange")
585
- elif r > 180 and g > 180 and b < 120: colors.append("yellow")
586
- elif r > 150 and g > 150 and b > 150: colors.append("white/light grey")
587
- elif r < 80 and g < 80 and b < 80: colors.append("black/dark")
588
- elif 80 < r < 150 and 80 < g < 150 and 80 < b < 150: colors.append("grey")
589
- elif r > g > b: colors.append("brown/rufous")
590
- else: colors.append("mixed colors")
591
-
592
- return ", ".join(colors)
593
-
594
- head = describe_region(img[:h//3, :, :])
595
- body = describe_region(img[h//3:2*h//3, :, :])
596
- lower = describe_region(img[2*h//3:, :, :])
597
-
598
- return f"Small bird. Head region: {head}. Body: {body}. Lower parts: {lower}."
599
 
600
  except Exception as e:
601
- return f"Could not analyze image: {e}"
 
602
 
603
 
604
  # ================== AUDIO IDENTIFICATION ==================
605
 
606
  def process_audio(audio_data: np.ndarray, sr: int) -> Dict:
607
- """Process audio and extract bird call features."""
608
  try:
609
- # Normalize
610
  audio = audio_data.astype(np.float64)
611
  if np.max(np.abs(audio)) > 0:
612
  audio = audio / np.max(np.abs(audio))
613
 
614
- # Bandpass filter (bird frequencies: 500Hz - 10kHz)
615
  nyq = sr / 2
616
  low, high = max(500/nyq, 0.01), min(10000/nyq, 0.99)
617
  if low < high:
618
  b, a = signal.butter(4, [low, high], btype='band')
619
  audio = signal.filtfilt(b, a, audio)
620
 
621
- # Features
622
  duration = len(audio_data) / sr
623
 
624
  # Peak frequency
@@ -626,58 +512,43 @@ def process_audio(audio_data: np.ndarray, sr: int) -> Dict:
626
  freqs = np.fft.rfftfreq(len(audio), 1/sr)
627
  peak_freq = freqs[np.argmax(np.abs(fft))] if len(freqs) > 0 else 0
628
 
629
- # Syllable count
630
  envelope = np.abs(signal.hilbert(audio))
631
  threshold = np.mean(envelope) + 0.5 * np.std(envelope)
632
  syllables = np.sum(np.diff((envelope > threshold).astype(int)) > 0)
633
 
634
  return {
635
  "duration": round(duration, 2),
636
- "peak_frequency": int(peak_freq),
637
  "syllables": int(syllables),
638
- "pattern": "repetitive" if syllables > 5 else "simple" if syllables > 1 else "single note",
639
- "frequency_range": "high (3-8kHz)" if peak_freq > 3000 else "medium (1-3kHz)" if peak_freq > 1000 else "low (<1kHz)"
640
  }
641
-
642
- except Exception as e:
643
- return {"error": str(e), "duration": 0}
644
 
645
 
646
- AUDIO_PROMPT = """You are an expert ornithologist specializing in bird vocalizations.
647
 
648
- BIRD CALL ANALYSIS:
649
  - Duration: {duration} seconds
650
- - Peak Frequency: {peak_frequency} Hz ({frequency_range})
651
- - Syllables/Notes: {syllables}
652
- - Pattern: {pattern}
653
- {location}{month}
654
 
655
  Based on these acoustic features, identify possible bird species.
656
-
657
- High frequency (3000-8000 Hz) = small passerines (warblers, finches)
658
- Medium frequency (1000-3000 Hz) = medium birds (thrushes, bulbuls, mynas)
659
- Low frequency (500-1000 Hz) = larger birds (crows, pigeons, doves)
660
 
661
  Respond with JSON ONLY:
662
- {{
663
- "birds": [
664
- {{
665
- "name": "Species Name",
666
- "scientific_name": "Scientific name",
667
- "confidence": 75,
668
- "reason": "Why this bird matches the audio features"
669
- }}
670
- ],
671
- "summary": "Brief summary"
672
- }}
673
 
674
- Use ACTUAL species names, not generic terms."""
675
 
676
 
677
  def identify_audio_stream(audio_input, location: str = "", month: str = ""):
678
- """Identify bird from audio."""
679
  if audio_input is None:
680
- yield '<div class="error">⚠️ Please upload or record audio first</div>'
681
  return
682
 
683
  try:
@@ -688,253 +559,190 @@ def identify_audio_stream(audio_input, location: str = "", month: str = ""):
688
  return
689
 
690
  if len(audio_data) == 0:
691
- yield '<div class="error">⚠️ Audio is empty</div>'
692
  return
693
 
694
- # Convert to mono
695
  if len(audio_data.shape) > 1:
696
  audio_data = np.mean(audio_data, axis=1)
697
 
698
- yield '<div class="processing">πŸ”Š Processing audio with SAM-Audio...</div>'
699
 
700
  features = process_audio(audio_data, sr)
701
 
702
- features_html = f"""<div class="features-box">
703
- <strong>🎡 Audio Analysis</strong><br>
704
- β€’ Duration: {features.get('duration', 0)}s<br>
705
- β€’ Peak Frequency: {features.get('peak_frequency', 0)} Hz ({features.get('frequency_range', 'unknown')})<br>
706
- β€’ Syllables: {features.get('syllables', 0)}<br>
707
- β€’ Pattern: {features.get('pattern', 'unknown')}
708
- </div>"""
709
-
710
- yield f'<div class="processing">πŸ€– Identifying bird species...</div>{features_html}'
711
-
712
- # Build prompt
713
- prompt = AUDIO_PROMPT.format(
714
- **features,
715
- location=f"\n- Location: {location}" if location else "",
716
- month=f"\n- Month: {month}" if month else ""
717
- )
718
 
719
- # Get identification
720
- ollama_available, model, _ = check_ollama()
 
 
 
 
 
721
  response = ""
722
 
723
- if ollama_available and model:
724
- yield f'<div class="processing">πŸ¦™ Using local AI ({model})...</div>{features_html}'
725
- response = call_ollama_text(prompt, model)
 
726
 
727
  if not response:
728
- yield f'<div class="processing">☁️ Using cloud AI...</div>{features_html}'
729
- response = call_huggingface_text(prompt)
730
 
731
- birds, summary = parse_bird_json(response)
732
 
733
  if not birds:
734
- yield f"""<div class="error">
735
- <strong>Could not identify bird from audio</strong>
736
- <p>Try recording a clearer sample with less background noise.</p>
737
  {features_html}
738
- </div>"""
739
  return
740
 
741
- result_html = f"""<div class="success">
742
- <h2>🐦 {len(birds)} Bird(s) Identified from Audio!</h2>
743
  <p>{summary}</p>
744
- </div>{features_html}"""
745
 
746
  for i, bird in enumerate(birds, 1):
747
- result_html += format_bird_card(bird, i)
748
 
749
- yield result_html
750
 
751
  except Exception as e:
752
  log(f"Audio error: {traceback.format_exc()}")
753
- yield f'<div class="error"><strong>❌ Error:</strong> {str(e)}</div>'
754
 
755
 
756
  # ================== DESCRIPTION IDENTIFICATION ==================
757
 
758
  def identify_description_stream(description: str):
759
  """Identify bird from text description."""
760
- if not description or len(description.strip()) < 10:
761
- yield '<div class="error">⚠️ Please enter a more detailed description</div>'
762
  return
763
 
764
  try:
765
- yield '<div class="processing">πŸ” Analyzing your description...</div>'
766
 
767
- prompt = f"""You are an expert ornithologist specializing in Indian birds.
768
 
769
- USER DESCRIPTION:
770
  "{description}"
771
 
772
- Identify the bird species that best matches this description.
773
-
774
- Respond with JSON ONLY:
775
- {{
776
- "birds": [
777
- {{
778
- "name": "Species Name",
779
- "scientific_name": "Scientific name",
780
- "confidence": 85,
781
- "reason": "Why this matches the description"
782
- }}
783
- ],
784
- "summary": "Brief summary"
785
- }}
786
 
787
- Use ACTUAL species names like "House Sparrow", "Indian Robin", "Zebra Finch" - never generic terms."""
788
 
789
- ollama_available, model, _ = check_ollama()
790
  response = ""
791
 
792
- if ollama_available and model:
793
  yield '<div class="processing">πŸ¦™ Using local AI...</div>'
794
- response = call_ollama_text(prompt, model)
795
 
796
  if not response:
797
- yield '<div class="processing">☁️ Using cloud AI...</div>'
798
- response = call_huggingface_text(prompt)
799
 
800
- birds, summary = parse_bird_json(response)
801
 
802
  if not birds:
803
- yield """<div class="error">
804
- <strong>Could not identify bird</strong>
805
- <p>Try adding more details about colors, size, behavior, or sounds.</p>
806
- </div>"""
807
  return
808
 
809
- result_html = f"""<div class="success">
810
- <h2>🐦 {len(birds)} Bird(s) Match Your Description!</h2>
811
  <p>{summary}</p>
812
- </div>"""
813
 
814
  for i, bird in enumerate(birds, 1):
815
- result_html += format_bird_card(bird, i)
816
 
817
- yield result_html
818
 
819
  except Exception as e:
820
- log(f"Description error: {traceback.format_exc()}")
821
- yield f'<div class="error"><strong>❌ Error:</strong> {str(e)}</div>'
822
 
823
 
824
- # ================== MAIN UI ==================
825
 
826
- def get_status():
827
- """Get AI backend status."""
828
- ollama_available, model, is_vision = check_ollama()
829
- if is_vision:
830
- return f"πŸ¦™ LLaVA Vision ({model})"
831
- elif ollama_available:
832
- return f"πŸ¦™ Local AI ({model})"
 
833
  elif HF_TOKEN:
834
- return "☁️ HuggingFace Cloud"
835
  else:
836
- return "⚠️ Limited Mode"
837
 
838
 
839
- def create_ui():
840
- """Create Gradio interface with Audio tab first."""
841
-
842
  with gr.Blocks(title="BirdSense Pro") as demo:
843
  gr.HTML(f"<style>{CSS}</style>")
844
 
845
  gr.HTML(f"""
846
  <div class="header">
847
  <h1>🐦 BirdSense Pro</h1>
848
- <p class="subtitle">AI-Powered Bird Identification β€’ Audio β€’ Image β€’ Description</p>
849
- <div class="status">
850
- <span class="status-dot"></span>
851
- {get_status()}
852
- </div>
853
- </div>
854
- """)
855
 
856
- # AUDIO TAB FIRST (as requested)
857
  with gr.Tab("🎡 Audio"):
858
- gr.HTML("""<div class="info-box">
859
- <h3>🎡 Audio Identification</h3>
860
- <p>Upload or record bird calls. SAM-Audio processing isolates bird sounds from background noise.</p>
861
- </div>""")
862
-
863
  with gr.Row():
864
  with gr.Column():
865
- audio_input = gr.Audio(sources=["upload", "microphone"], type="numpy", label="🎀 Upload or Record")
866
  with gr.Row():
867
- location = gr.Textbox(label="πŸ“ Location", placeholder="e.g., Mumbai, Delhi")
868
- month = gr.Dropdown(label="πŸ“… Month", choices=["", "January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"])
869
- audio_btn = gr.Button("πŸ” Identify Bird", variant="primary", size="lg")
870
-
871
  with gr.Column():
872
- audio_output = gr.HTML('<div style="padding:50px; text-align:center; color:#a0aec0;">🎡 Upload or record bird calls to identify</div>')
873
-
874
- audio_btn.click(identify_audio_stream, [audio_input, location, month], audio_output)
875
 
876
- # IMAGE TAB
877
  with gr.Tab("πŸ“· Image"):
878
- gr.HTML("""<div class="info-box">
879
- <h3>πŸ“· Image Identification</h3>
880
- <p>Upload a bird photo. LLaVA vision model analyzes the actual image for accurate identification.</p>
881
- </div>""")
882
-
883
  with gr.Row():
884
  with gr.Column():
885
- image_input = gr.Image(sources=["upload", "webcam"], type="pil", label="πŸ“Έ Upload or Capture")
886
- image_btn = gr.Button("πŸ” Identify Bird", variant="primary", size="lg")
887
-
888
  with gr.Column():
889
- image_output = gr.HTML('<div style="padding:50px; text-align:center; color:#a0aec0;">πŸ“· Upload a bird image to identify</div>')
890
-
891
- image_btn.click(identify_image_stream, [image_input], image_output)
892
 
893
- # DESCRIPTION TAB
894
  with gr.Tab("πŸ“ Description"):
895
- gr.HTML("""<div class="info-box">
896
- <h3>πŸ“ Text Description</h3>
897
- <p>Describe the bird you saw β€” colors, size, behavior, sounds, habitat.</p>
898
- </div>""")
899
-
900
  with gr.Row():
901
  with gr.Column():
902
- desc_input = gr.Textbox(
903
- label="✍️ Describe the Bird",
904
- lines=4,
905
- placeholder="Example: Small bird with bright orange beak, grey head with orange cheek patches, black and white striped throat, chestnut brown sides with white spots."
906
- )
907
- desc_btn = gr.Button("πŸ” Identify Bird", variant="primary", size="lg")
908
-
909
  with gr.Column():
910
- desc_output = gr.HTML('<div style="padding:50px; text-align:center; color:#a0aec0;">πŸ“ Describe a bird to identify it</div>')
911
-
912
- desc_btn.click(identify_description_stream, [desc_input], desc_output)
913
 
914
- gr.HTML("""
915
- <div style="text-align:center; margin-top:20px; padding:15px; color:#718096; font-size:0.85rem;">
916
- <strong>BirdSense Pro</strong> β€’ Uses LLaVA (local) or BLIP-2 (cloud) for vision analysis<br>
917
- For best accuracy, use local Ollama with LLaVA model
918
- </div>
919
- """)
920
 
921
  return demo
922
 
923
 
924
- # ================== MAIN ==================
925
-
926
  if __name__ == "__main__":
927
  log("Starting BirdSense Pro...")
928
- log(f"HF_TOKEN available: {bool(HF_TOKEN)}")
929
-
930
- ollama_ok, model, is_vision = check_ollama()
931
- if is_vision:
932
- log(f"βœ… LLaVA vision model ready: {model}")
933
- elif ollama_ok:
934
- log(f"⚠️ Ollama available but no vision model: {model}")
935
- log(" Run: ollama pull llava:7b")
936
- else:
937
- log("⚠️ Ollama not available, using HuggingFace fallback")
938
 
939
- demo = create_ui()
940
- demo.launch(server_name="0.0.0.0", server_port=7860, share=False, show_error=True)
 
1
  """
2
  🐦 BirdSense Pro - AI Bird Identification
3
+ - Local: Ollama LLaVA (vision) + Llama3.2 (text/audio)
4
+ - Cloud: HuggingFace BLIP-2 + Text models
5
+ NO HARDCODED BIRDS - Pure AI identification
6
  """
7
 
8
  import gradio as gr
9
  import numpy as np
10
  import scipy.signal as signal
11
+ from typing import Tuple, List, Dict, Optional
12
  import json
13
  import requests
14
  import re
 
20
  import base64
21
 
22
  # ================== CONFIG ==================
 
23
  OLLAMA_URL = os.environ.get("OLLAMA_URL", "http://localhost:11434")
24
  HF_TOKEN = os.environ.get("HF_TOKEN", "")
25
  DEBUG = True
26
 
 
 
 
 
27
  def log(msg):
28
  if DEBUG:
29
  print(f"[BirdSense] {msg}")
 
33
  CSS = """
34
  .gradio-container {
35
  background: linear-gradient(135deg, #f0f4f8 0%, #d9e2ec 100%) !important;
36
+ font-family: 'Inter', sans-serif !important;
37
  }
38
  .header {
39
  background: linear-gradient(135deg, #1a365d 0%, #2c5282 50%, #3182ce 100%);
40
+ color: white; padding: 35px 20px; border-radius: 16px;
41
+ text-align: center; margin-bottom: 16px;
42
+ box-shadow: 0 10px 30px rgba(26, 54, 93, 0.25);
43
  }
44
+ .header h1 { font-size: 2.2rem; font-weight: 800; margin: 0 0 8px 0; }
45
+ .header .subtitle { font-size: 1rem; opacity: 0.9; margin-bottom: 10px; }
46
  .header .status {
47
+ display: inline-flex; align-items: center; gap: 6px;
48
+ background: rgba(255,255,255,0.15); padding: 6px 16px; border-radius: 50px;
49
+ font-weight: 600; font-size: 0.85rem;
50
  }
51
+ .status-dot { width: 8px; height: 8px; border-radius: 50%; }
52
+ .status-green { background: #48bb78; }
53
+ .status-yellow { background: #ecc94b; }
54
+ .status-red { background: #fc8181; }
55
 
56
  .info-box {
57
  background: linear-gradient(135deg, #ebf4ff 0%, #c3dafe 100%);
58
+ border: 1px solid #90cdf4; border-radius: 10px; padding: 14px; margin-bottom: 14px;
59
  }
60
+ .info-box h3 { color: #2b6cb0; margin: 0 0 4px 0; font-size: 0.95rem; }
61
+ .info-box p { color: #4299e1; margin: 0; font-size: 0.85rem; }
62
 
63
  .bird-card {
64
+ background: white; border: 1px solid #e2e8f0; border-radius: 14px;
65
+ padding: 16px; margin: 10px 0; display: flex; gap: 14px;
66
+ box-shadow: 0 3px 10px rgba(0,0,0,0.04);
 
67
  }
68
+ .bird-card img { width: 100px; height: 100px; object-fit: cover; border-radius: 10px; flex-shrink: 0; }
 
69
  .bird-info { flex: 1; min-width: 0; }
70
+ .bird-info h3 { color: #1a202c; margin: 0 0 3px 0; font-size: 1.1rem; font-weight: 700; }
71
+ .bird-info .scientific { color: #718096; font-style: italic; font-size: 0.8rem; margin-bottom: 8px; }
72
+ .confidence { display: inline-block; padding: 3px 10px; border-radius: 16px; font-weight: 700; font-size: 0.75rem; }
73
  .conf-high { background: #c6f6d5; color: #22543d; }
74
  .conf-med { background: #fefcbf; color: #744210; }
75
  .conf-low { background: #fed7d7; color: #742a2a; }
76
+ .reason { color: #4a5568; margin-top: 8px; line-height: 1.5; font-size: 0.85rem; }
 
 
 
 
77
 
78
+ .error { background: #fff5f5; border: 1px solid #fc8181; border-radius: 10px; padding: 16px; color: #c53030; }
79
+ .success { background: #f0fff4; border: 1px solid #68d391; border-radius: 10px; padding: 16px; color: #276749; }
80
+ .processing { background: #ebf8ff; border: 1px solid #63b3ed; border-radius: 10px; padding: 16px; color: #2b6cb0; }
81
+ .features-box { background: #f7fafc; border: 1px solid #e2e8f0; border-radius: 8px; padding: 12px; margin: 8px 0; font-size: 0.8rem; }
 
 
 
 
 
 
82
  """
83
 
84
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
  # ================== OLLAMA FUNCTIONS ==================
86
 
87
+ def check_ollama_models() -> Dict:
88
+ """Check available Ollama models."""
89
+ result = {"available": False, "vision_model": None, "text_model": None}
90
  try:
91
  response = requests.get(f"{OLLAMA_URL}/api/tags", timeout=3)
92
  if response.status_code == 200:
93
  models = [m["name"] for m in response.json().get("models", [])]
94
  log(f"Ollama models: {models}")
95
+ result["available"] = True
96
 
97
+ # Find vision model
98
+ for m in models:
99
+ if "llava" in m.lower() or "bakllava" in m.lower():
100
+ result["vision_model"] = m
101
+ break
 
102
 
103
+ # Find text model
104
+ for m in models:
105
+ if any(t in m.lower() for t in ["llama", "qwen", "mistral", "phi"]):
106
+ if "llava" not in m.lower(): # Exclude vision models
107
+ result["text_model"] = m
108
+ break
 
 
109
  except Exception as e:
110
+ log(f"Ollama check failed: {e}")
111
+
112
+ return result
113
 
114
 
115
+ def call_llava(image: Image.Image, prompt: str, model: str) -> str:
116
+ """Call LLaVA vision model."""
117
  try:
118
+ # Resize image
119
+ max_size = 768
120
+ if max(image.size) > max_size:
121
+ ratio = max_size / max(image.size)
122
+ image = image.resize((int(image.size[0]*ratio), int(image.size[1]*ratio)), Image.Resampling.LANCZOS)
123
+
124
+ # Convert to base64
125
+ buffer = io.BytesIO()
126
+ image.save(buffer, format="JPEG", quality=85)
127
+ img_b64 = base64.b64encode(buffer.getvalue()).decode()
128
+
129
+ log(f"Calling LLaVA ({model}) with {len(img_b64)} bytes image...")
130
 
131
  response = requests.post(
132
  f"{OLLAMA_URL}/api/generate",
 
135
  "prompt": prompt,
136
  "images": [img_b64],
137
  "stream": False,
138
+ "options": {"temperature": 0.1, "num_predict": 1200}
139
  },
140
+ timeout=120
141
  )
142
 
143
  if response.status_code == 200:
144
  result = response.json().get("response", "")
145
+ log(f"LLaVA response ({len(result)} chars): {result[:300]}...")
146
  return result
147
  else:
148
+ log(f"LLaVA error: {response.status_code} - {response.text[:200]}")
 
149
  except Exception as e:
150
+ log(f"LLaVA call failed: {traceback.format_exc()}")
151
+ return ""
152
 
153
 
154
  def call_ollama_text(prompt: str, model: str) -> str:
155
+ """Call Ollama text model (for audio/description)."""
156
  try:
157
+ log(f"Calling text model ({model})...")
158
  response = requests.post(
159
  f"{OLLAMA_URL}/api/generate",
160
  json={
161
  "model": model,
162
  "prompt": prompt,
163
  "stream": False,
164
+ "options": {"temperature": 0.2, "num_predict": 800}
165
  },
166
+ timeout=60
167
  )
 
168
  if response.status_code == 200:
169
  return response.json().get("response", "")
 
170
  except Exception as e:
171
+ log(f"Text model error: {e}")
172
+ return ""
173
 
174
 
175
  # ================== HUGGINGFACE FUNCTIONS ==================
176
 
177
+ def call_hf_image_caption(image: Image.Image) -> str:
178
+ """Get image caption from HuggingFace BLIP."""
179
  if not HF_TOKEN:
180
+ log("No HF_TOKEN")
181
  return ""
182
 
183
  headers = {"Authorization": f"Bearer {HF_TOKEN}"}
184
 
185
+ # Resize
186
+ max_size = 512
 
 
187
  if max(image.size) > max_size:
188
  ratio = max_size / max(image.size)
189
+ image = image.resize((int(image.size[0]*ratio), int(image.size[1]*ratio)), Image.Resampling.LANCZOS)
190
+
191
+ buffer = io.BytesIO()
192
+ image.save(buffer, format="JPEG", quality=80)
193
 
 
194
  models = [
195
  "Salesforce/blip-image-captioning-large",
196
  "Salesforce/blip-image-captioning-base",
 
197
  ]
198
 
199
  for model in models:
200
  try:
201
+ log(f"Trying HF caption model: {model}")
 
 
202
  response = requests.post(
203
+ f"https://api-inference.huggingface.co/models/{model}",
204
  headers=headers,
205
+ data=buffer.getvalue(),
206
+ timeout=45
207
  )
208
 
209
  if response.status_code == 200:
210
  result = response.json()
211
+ if isinstance(result, list) and result:
212
  caption = result[0].get("generated_text", "")
213
  if caption:
214
  log(f"HF caption: {caption}")
215
  return caption
216
  elif response.status_code == 503:
217
+ log(f"{model} loading, trying next...")
 
218
  else:
219
+ log(f"HF error {response.status_code}: {response.text[:100]}")
 
220
  except Exception as e:
221
+ log(f"HF caption error: {e}")
 
222
 
223
  return ""
224
 
225
 
226
+ def call_hf_text(prompt: str) -> str:
227
  """Call HuggingFace text model."""
228
  if not HF_TOKEN:
229
  return ""
 
231
  headers = {"Authorization": f"Bearer {HF_TOKEN}", "Content-Type": "application/json"}
232
 
233
  models = [
234
+ "mistralai/Mistral-7B-Instruct-v0.2",
235
  "HuggingFaceH4/zephyr-7b-beta",
236
+ "google/flan-t5-xl",
237
  ]
238
 
239
  for model in models:
240
  try:
241
  log(f"Trying HF text model: {model}")
 
 
242
  response = requests.post(
243
+ f"https://api-inference.huggingface.co/models/{model}",
244
  headers=headers,
245
+ json={"inputs": prompt, "parameters": {"max_new_tokens": 600, "temperature": 0.3}},
246
+ timeout=45
 
 
 
247
  )
248
 
249
  if response.status_code == 200:
250
  result = response.json()
251
+ if isinstance(result, list) and result:
252
  text = result[0].get("generated_text", "")
253
  if text:
254
+ log(f"HF text ({len(text)} chars)")
255
  return text
256
  elif response.status_code == 503:
257
  continue
 
258
  except Exception as e:
259
  log(f"HF text error: {e}")
 
260
 
261
  return ""
262
 
263
 
264
+ # ================== PARSING ==================
265
 
266
+ def parse_bird_response(text: str) -> Tuple[List[Dict], str]:
267
+ """Parse LLM response to extract bird identifications. NO HARDCODED FALLBACKS."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
268
  birds = []
269
  summary = ""
270
 
271
+ if not text:
272
+ return [], ""
273
+
274
+ log(f"Parsing response: {text[:500]}...")
275
 
276
+ # Try JSON first
277
  try:
278
+ json_match = re.search(r'\{[\s\S]*"birds"[\s\S]*\}', text)
 
279
  if json_match:
280
  json_str = json_match.group()
281
+ json_str = re.sub(r',(\s*[}\]])', r'\1', json_str) # Fix trailing commas
 
 
 
282
  data = json.loads(json_str)
283
+
284
+ raw_birds = data.get("birds", [])
285
  summary = data.get("summary", "")
286
 
287
+ for b in raw_birds:
288
+ name = b.get("name", "").strip()
289
+ # Filter out garbage
290
+ if name and len(name) > 2 and name.lower() not in ["the bird", "bird", "unknown", "the image", "image"]:
291
+ birds.append({
292
+ "name": name,
293
+ "scientific_name": b.get("scientific_name", ""),
294
+ "confidence": min(99, max(1, int(b.get("confidence", 70)))),
295
+ "reason": b.get("reason", "Identified by AI")
296
+ })
297
 
298
+ if birds:
299
+ return birds, summary
 
300
  except json.JSONDecodeError as e:
301
  log(f"JSON parse error: {e}")
302
 
303
+ # Fallback: Extract from text using patterns
304
+ # Look for "This is a/an [Bird Name]" or "[Bird Name] (Scientific name)"
305
+ patterns = [
306
+ r"(?:this is|identified as|appears to be|looks like|most likely)\s+(?:a|an|the)?\s*([A-Z][a-z]+(?:[-\s][A-Za-z]+){0,3})",
307
+ r"([A-Z][a-z]+(?:\s[A-Za-z]+)?)\s*\(([A-Z][a-z]+\s[a-z]+)\)", # Name (Scientific name)
308
+ r"species[:\s]+([A-Z][a-z]+(?:\s[A-Za-z]+)?)",
 
 
309
  ]
310
 
311
+ for pattern in patterns:
312
+ matches = re.findall(pattern, text)
313
+ for match in matches:
314
+ if isinstance(match, tuple):
315
+ name = match[0].strip()
316
+ else:
317
+ name = match.strip()
318
+
319
+ # Validate it looks like a bird name
320
+ if name and len(name) > 3 and name.lower() not in ["the bird", "bird", "unknown"]:
321
+ # Check it's not a common non-bird word
322
+ skip_words = ["the", "this", "that", "image", "photo", "picture", "bird", "species"]
323
+ if name.lower() not in skip_words:
324
+ birds.append({
325
+ "name": name,
326
+ "scientific_name": "",
327
+ "confidence": 65,
328
+ "reason": "Extracted from AI analysis"
329
+ })
330
+ break
331
+ if birds:
332
  break
333
 
334
+ return birds[:3], summary # Max 3 birds
335
+
336
+
337
+ def get_bird_image(bird_name: str) -> str:
338
+ """Get bird image from Wikipedia."""
339
+ if not bird_name or len(bird_name) < 3:
340
+ return ""
341
+
342
+ try:
343
+ # Clean name for Wikipedia
344
+ clean = bird_name.strip().replace(" ", "_")
345
+ url = f"https://en.wikipedia.org/api/rest_v1/page/summary/{urllib.parse.quote(clean)}"
346
+
347
+ response = requests.get(url, timeout=5)
348
+ if response.status_code == 200:
349
+ data = response.json()
350
+ if "thumbnail" in data:
351
+ img_url = data["thumbnail"]["source"]
352
+ log(f"Got Wikipedia image for {bird_name}")
353
+ return img_url
354
+ elif "originalimage" in data:
355
+ return data["originalimage"]["source"]
356
+ except Exception as e:
357
+ log(f"Wikipedia image error: {e}")
358
 
359
+ # Fallback placeholder with bird name
360
+ return f"https://via.placeholder.com/120x120/4299e1/ffffff?text={urllib.parse.quote(bird_name[:10])}"
361
 
362
 
363
  def format_bird_card(bird: Dict, index: int) -> str:
364
+ """Format bird as HTML card."""
365
+ name = bird.get("name", "Unknown")
366
  scientific = bird.get("scientific_name", "")
367
  confidence = bird.get("confidence", 50)
368
  reason = bird.get("reason", "")
369
 
370
+ img_url = get_bird_image(name)
 
 
371
 
372
+ conf_class = "conf-high" if confidence >= 80 else "conf-med" if confidence >= 60 else "conf-low"
 
 
 
 
 
 
 
373
 
374
  return f"""
375
  <div class="bird-card">
376
+ <img src="{img_url}" alt="{name}" onerror="this.style.display='none'">
377
  <div class="bird-info">
378
  <h3>{index}. {name}</h3>
379
+ {f'<div class="scientific">{scientific}</div>' if scientific else ''}
380
  <span class="confidence {conf_class}">{confidence}% confidence</span>
381
  <p class="reason">{reason}</p>
382
  </div>
383
  </div>"""
384
 
385
 
386
+ # ================== IDENTIFICATION FUNCTIONS ==================
387
+
388
+ IMAGE_PROMPT = """Look at this bird image carefully. Identify the bird species.
389
+
390
+ You MUST respond with valid JSON in this exact format:
391
+ {
392
+ "birds": [
393
+ {
394
+ "name": "Blue-and-yellow Macaw",
395
+ "scientific_name": "Ara ararauna",
396
+ "confidence": 95,
397
+ "reason": "Large parrot with bright blue wings and yellow underparts, characteristic of this species"
398
+ }
399
+ ],
400
+ "summary": "This is a Blue-and-yellow Macaw, a large South American parrot."
401
+ }
402
+
403
+ Look for:
404
+ - Beak shape and color
405
+ - Body colors and patterns
406
+ - Size and shape
407
+ - Any distinctive markings
408
+
409
+ Give the ACTUAL species name (not "bird" or "unknown"). If unsure, give your best guess with lower confidence.
410
+ Return ONLY the JSON."""
411
+
412
 
413
  def identify_image_stream(image):
414
+ """Identify bird from image."""
415
  if image is None:
416
+ yield '<div class="error">⚠️ Please upload an image</div>'
417
  return
418
 
419
  try:
 
420
  if not isinstance(image, Image.Image):
421
  image = Image.fromarray(np.array(image))
422
  image = image.convert("RGB")
423
 
424
+ yield '<div class="processing">πŸ” Analyzing image...</div>'
425
 
426
+ models = check_ollama_models()
427
+ response = ""
428
+ method = ""
429
 
430
+ # Try LLaVA first (best for images)
431
+ if models["vision_model"]:
432
+ yield f'<div class="processing">πŸ¦™ Using LLaVA vision model...</div>'
433
+ response = call_llava(image, IMAGE_PROMPT, models["vision_model"])
434
+ method = "LLaVA Vision"
435
 
436
+ # Fallback to HuggingFace
437
+ if not response:
 
 
 
 
 
 
438
  yield '<div class="processing">☁️ Using HuggingFace AI...</div>'
439
 
440
+ # Get caption first
441
+ caption = call_hf_image_caption(image)
442
 
443
+ if caption:
444
+ yield f'<div class="processing">πŸ” Identifying from caption...</div><div class="features-box"><b>AI sees:</b> {caption}</div>'
 
445
 
446
  # Use text model to identify
447
+ text_prompt = f"""Based on this image description, identify the bird species:
448
+
449
+ "{caption}"
450
+
451
+ Respond with JSON:
452
+ {{"birds": [{{"name": "Species Name", "scientific_name": "...", "confidence": 80, "reason": "..."}}], "summary": "..."}}
453
+
454
+ Give the ACTUAL bird species name. Return ONLY JSON."""
455
+
456
+ if models["text_model"]:
457
+ response = call_ollama_text(text_prompt, models["text_model"])
458
+ if not response:
459
+ response = call_hf_text(text_prompt)
460
+ method = "HuggingFace BLIP + Text"
461
+ else:
462
+ yield '<div class="error">❌ Could not analyze image. HuggingFace API may be unavailable.</div>'
463
+ return
464
 
465
  # Parse response
466
+ birds, summary = parse_bird_response(response)
 
 
 
467
 
468
  if not birds:
469
+ yield f'''<div class="error">
470
+ <b>❌ Could not identify bird species</b>
471
+ <p>The AI response couldn't be parsed. Try a clearer image.</p>
472
+ <div class="features-box"><b>Raw AI response:</b><br>{response[:500] if response else "No response"}</div>
473
+ </div>'''
 
474
  return
475
 
476
+ # Success
477
+ result = f'''<div class="success">
478
+ <h3>🐦 {len(birds)} Bird(s) Identified!</h3>
479
+ <p>{summary or f"Identified using {method}"}</p>
480
+ </div>'''
481
 
482
  for i, bird in enumerate(birds, 1):
483
+ result += format_bird_card(bird, i)
 
 
 
 
 
 
 
 
 
 
 
 
 
484
 
485
+ yield result
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
486
 
487
  except Exception as e:
488
+ log(f"Image error: {traceback.format_exc()}")
489
+ yield f'<div class="error">❌ Error: {str(e)}</div>'
490
 
491
 
492
  # ================== AUDIO IDENTIFICATION ==================
493
 
494
  def process_audio(audio_data: np.ndarray, sr: int) -> Dict:
495
+ """Extract audio features for bird identification."""
496
  try:
 
497
  audio = audio_data.astype(np.float64)
498
  if np.max(np.abs(audio)) > 0:
499
  audio = audio / np.max(np.abs(audio))
500
 
501
+ # Bandpass filter (500Hz - 10kHz for birds)
502
  nyq = sr / 2
503
  low, high = max(500/nyq, 0.01), min(10000/nyq, 0.99)
504
  if low < high:
505
  b, a = signal.butter(4, [low, high], btype='band')
506
  audio = signal.filtfilt(b, a, audio)
507
 
 
508
  duration = len(audio_data) / sr
509
 
510
  # Peak frequency
 
512
  freqs = np.fft.rfftfreq(len(audio), 1/sr)
513
  peak_freq = freqs[np.argmax(np.abs(fft))] if len(freqs) > 0 else 0
514
 
515
+ # Count syllables
516
  envelope = np.abs(signal.hilbert(audio))
517
  threshold = np.mean(envelope) + 0.5 * np.std(envelope)
518
  syllables = np.sum(np.diff((envelope > threshold).astype(int)) > 0)
519
 
520
  return {
521
  "duration": round(duration, 2),
522
+ "peak_freq": int(peak_freq),
523
  "syllables": int(syllables),
524
+ "freq_range": "high" if peak_freq > 3000 else "medium" if peak_freq > 1000 else "low"
 
525
  }
526
+ except:
527
+ return {"duration": 0, "peak_freq": 0, "syllables": 0, "freq_range": "unknown"}
 
528
 
529
 
530
+ AUDIO_PROMPT = """You are an expert ornithologist. Identify the bird from these audio features:
531
 
 
532
  - Duration: {duration} seconds
533
+ - Peak Frequency: {peak_freq} Hz ({freq_range} range)
534
+ - Syllables/notes detected: {syllables}
535
+ {extra}
 
536
 
537
  Based on these acoustic features, identify possible bird species.
538
+ High frequency (>3000 Hz) = small birds like warblers, finches
539
+ Medium frequency (1000-3000 Hz) = thrushes, bulbuls, mynas
540
+ Low frequency (<1000 Hz) = larger birds like crows, doves
 
541
 
542
  Respond with JSON ONLY:
543
+ {{"birds": [{{"name": "Species Name", "scientific_name": "...", "confidence": 70, "reason": "Matches because..."}}], "summary": "..."}}
 
 
 
 
 
 
 
 
 
 
544
 
545
+ Give ACTUAL species names, not generic terms."""
546
 
547
 
548
  def identify_audio_stream(audio_input, location: str = "", month: str = ""):
549
+ """Identify bird from audio - uses TEXT model, not vision."""
550
  if audio_input is None:
551
+ yield '<div class="error">⚠️ Please upload or record audio</div>'
552
  return
553
 
554
  try:
 
559
  return
560
 
561
  if len(audio_data) == 0:
562
+ yield '<div class="error">⚠️ Empty audio</div>'
563
  return
564
 
 
565
  if len(audio_data.shape) > 1:
566
  audio_data = np.mean(audio_data, axis=1)
567
 
568
+ yield '<div class="processing">πŸ”Š Analyzing audio features...</div>'
569
 
570
  features = process_audio(audio_data, sr)
571
 
572
+ features_html = f'''<div class="features-box">
573
+ <b>🎡 Audio Analysis</b><br>
574
+ β€’ Duration: {features["duration"]}s | Peak: {features["peak_freq"]} Hz ({features["freq_range"]})<br>
575
+ β€’ Syllables: {features["syllables"]}
576
+ </div>'''
577
+
578
+ yield f'<div class="processing">πŸ€– Identifying bird...</div>{features_html}'
 
 
 
 
 
 
 
 
 
579
 
580
+ extra = ""
581
+ if location: extra += f"\n- Location: {location}"
582
+ if month: extra += f"\n- Month: {month}"
583
+
584
+ prompt = AUDIO_PROMPT.format(**features, extra=extra)
585
+
586
+ models = check_ollama_models()
587
  response = ""
588
 
589
+ # Use TEXT model for audio (NOT vision!)
590
+ if models["text_model"]:
591
+ yield f'<div class="processing">πŸ¦™ Using {models["text_model"]}...</div>{features_html}'
592
+ response = call_ollama_text(prompt, models["text_model"])
593
 
594
  if not response:
595
+ yield f'<div class="processing">☁️ Using HuggingFace...</div>{features_html}'
596
+ response = call_hf_text(prompt)
597
 
598
+ birds, summary = parse_bird_response(response)
599
 
600
  if not birds:
601
+ yield f'''<div class="error">
602
+ <b>Could not identify bird from audio</b>
603
+ <p>Try a clearer recording with less background noise.</p>
604
  {features_html}
605
+ </div>'''
606
  return
607
 
608
+ result = f'''<div class="success">
609
+ <h3>🐦 {len(birds)} Bird(s) Identified!</h3>
610
  <p>{summary}</p>
611
+ </div>{features_html}'''
612
 
613
  for i, bird in enumerate(birds, 1):
614
+ result += format_bird_card(bird, i)
615
 
616
+ yield result
617
 
618
  except Exception as e:
619
  log(f"Audio error: {traceback.format_exc()}")
620
+ yield f'<div class="error">❌ Error: {str(e)}</div>'
621
 
622
 
623
  # ================== DESCRIPTION IDENTIFICATION ==================
624
 
625
  def identify_description_stream(description: str):
626
  """Identify bird from text description."""
627
+ if not description or len(description.strip()) < 5:
628
+ yield '<div class="error">⚠️ Please enter a description</div>'
629
  return
630
 
631
  try:
632
+ yield '<div class="processing">πŸ” Analyzing description...</div>'
633
 
634
+ prompt = f"""Identify the bird species from this description:
635
 
 
636
  "{description}"
637
 
638
+ Respond with JSON:
639
+ {{"birds": [{{"name": "Species Name", "scientific_name": "...", "confidence": 80, "reason": "..."}}], "summary": "..."}}
 
 
 
 
 
 
 
 
 
 
 
 
640
 
641
+ Use ACTUAL species names. Return ONLY JSON."""
642
 
643
+ models = check_ollama_models()
644
  response = ""
645
 
646
+ if models["text_model"]:
647
  yield '<div class="processing">πŸ¦™ Using local AI...</div>'
648
+ response = call_ollama_text(prompt, models["text_model"])
649
 
650
  if not response:
651
+ yield '<div class="processing">☁️ Using HuggingFace...</div>'
652
+ response = call_hf_text(prompt)
653
 
654
+ birds, summary = parse_bird_response(response)
655
 
656
  if not birds:
657
+ yield '<div class="error"><b>Could not identify bird</b><p>Try adding more details.</p></div>'
 
 
 
658
  return
659
 
660
+ result = f'''<div class="success">
661
+ <h3>🐦 {len(birds)} Bird(s) Match!</h3>
662
  <p>{summary}</p>
663
+ </div>'''
664
 
665
  for i, bird in enumerate(birds, 1):
666
+ result += format_bird_card(bird, i)
667
 
668
+ yield result
669
 
670
  except Exception as e:
671
+ yield f'<div class="error">❌ Error: {str(e)}</div>'
 
672
 
673
 
674
+ # ================== UI ==================
675
 
676
+ def get_status_html():
677
+ """Get status indicator."""
678
+ models = check_ollama_models()
679
+
680
+ if models["vision_model"]:
681
+ return f'<span class="status-dot status-green"></span> LLaVA + {models["text_model"] or "HF"}'
682
+ elif models["text_model"]:
683
+ return f'<span class="status-dot status-yellow"></span> {models["text_model"]} (no vision)'
684
  elif HF_TOKEN:
685
+ return '<span class="status-dot status-yellow"></span> HuggingFace Cloud'
686
  else:
687
+ return '<span class="status-dot status-red"></span> Limited Mode'
688
 
689
 
690
+ def create_app():
 
 
691
  with gr.Blocks(title="BirdSense Pro") as demo:
692
  gr.HTML(f"<style>{CSS}</style>")
693
 
694
  gr.HTML(f"""
695
  <div class="header">
696
  <h1>🐦 BirdSense Pro</h1>
697
+ <p class="subtitle">AI Bird Identification β€’ Audio β€’ Image β€’ Description</p>
698
+ <div class="status">{get_status_html()}</div>
699
+ </div>""")
 
 
 
 
700
 
701
+ # AUDIO FIRST
702
  with gr.Tab("🎡 Audio"):
703
+ gr.HTML('<div class="info-box"><h3>🎡 Audio Identification</h3><p>Upload or record bird calls. Uses text AI to analyze acoustic features.</p></div>')
 
 
 
 
704
  with gr.Row():
705
  with gr.Column():
706
+ audio_in = gr.Audio(sources=["upload", "microphone"], type="numpy", label="🎀 Audio")
707
  with gr.Row():
708
+ loc = gr.Textbox(label="πŸ“ Location", placeholder="e.g., Mumbai")
709
+ mon = gr.Dropdown(label="πŸ“… Month", choices=[""] + ["January","February","March","April","May","June","July","August","September","October","November","December"])
710
+ audio_btn = gr.Button("πŸ” Identify", variant="primary", size="lg")
 
711
  with gr.Column():
712
+ audio_out = gr.HTML('<div style="padding:40px;text-align:center;color:#a0aec0">🎡 Upload audio to identify</div>')
713
+ audio_btn.click(identify_audio_stream, [audio_in, loc, mon], audio_out)
 
714
 
715
+ # IMAGE
716
  with gr.Tab("πŸ“· Image"):
717
+ gr.HTML('<div class="info-box"><h3>πŸ“· Image Identification</h3><p>Upload a photo. Uses LLaVA vision AI to analyze the actual image.</p></div>')
 
 
 
 
718
  with gr.Row():
719
  with gr.Column():
720
+ img_in = gr.Image(sources=["upload", "webcam"], type="pil", label="πŸ“Έ Photo")
721
+ img_btn = gr.Button("πŸ” Identify", variant="primary", size="lg")
 
722
  with gr.Column():
723
+ img_out = gr.HTML('<div style="padding:40px;text-align:center;color:#a0aec0">πŸ“· Upload image to identify</div>')
724
+ img_btn.click(identify_image_stream, [img_in], img_out)
 
725
 
726
+ # DESCRIPTION
727
  with gr.Tab("πŸ“ Description"):
728
+ gr.HTML('<div class="info-box"><h3>πŸ“ Text Description</h3><p>Describe the bird - colors, size, behavior, sounds.</p></div>')
 
 
 
 
729
  with gr.Row():
730
  with gr.Column():
731
+ desc_in = gr.Textbox(label="✍️ Description", lines=3, placeholder="e.g., Large blue and yellow parrot with long tail")
732
+ desc_btn = gr.Button("πŸ” Identify", variant="primary", size="lg")
 
 
 
 
 
733
  with gr.Column():
734
+ desc_out = gr.HTML('<div style="padding:40px;text-align:center;color:#a0aec0">πŸ“ Describe a bird</div>')
735
+ desc_btn.click(identify_description_stream, [desc_in], desc_out)
 
736
 
737
+ gr.HTML('<div style="text-align:center;padding:10px;color:#718096;font-size:0.8rem"><b>BirdSense Pro</b> β€’ Local: LLaVA (image) + Llama3.2 (audio/text) β€’ Cloud: HuggingFace BLIP</div>')
 
 
 
 
 
738
 
739
  return demo
740
 
741
 
 
 
742
  if __name__ == "__main__":
743
  log("Starting BirdSense Pro...")
744
+ models = check_ollama_models()
745
+ log(f"Vision: {models['vision_model']}, Text: {models['text_model']}, HF: {bool(HF_TOKEN)}")
 
 
 
 
 
 
 
 
746
 
747
+ app = create_app()
748
+ app.launch(server_name="0.0.0.0", server_port=7860, show_error=True)