sohiyiy committed on
Commit
e1d82cf
·
verified ·
1 Parent(s): 403d0e5

Upload folder using huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +370 -562
app.py CHANGED
@@ -1,14 +1,14 @@
1
  """
2
  🐦 BirdSense Pro - AI Bird Identification
3
- Uses Vision-Language Models (VLM) for accurate bird recognition
4
- - Local: Ollama with LLaVA (best accuracy)
5
- - Cloud: HuggingFace Inference API with Vision models
6
  """
7
 
8
  import gradio as gr
9
  import numpy as np
10
  import scipy.signal as signal
11
- from typing import Tuple, List, Dict, Generator, Optional
12
  import json
13
  import requests
14
  import re
@@ -20,15 +20,10 @@ import io
20
  import base64
21
 
22
  # ================== CONFIG ==================
23
- SAMPLE_RATE = 48000
24
  OLLAMA_URL = os.environ.get("OLLAMA_URL", "http://localhost:11434")
25
  HF_TOKEN = os.environ.get("HF_TOKEN", "")
26
  DEBUG = True
27
 
28
- # Model priorities
29
- OLLAMA_VISION_MODELS = ["llava:7b", "llava", "bakllava", "llava:13b"]
30
- OLLAMA_TEXT_MODELS = ["llama3.2", "qwen2.5:3b", "mistral", "phi4"]
31
-
32
  def log(msg):
33
  if DEBUG:
34
  print(f"[BirdSense] {msg}")
@@ -38,134 +33,100 @@ def log(msg):
38
  CSS = """
39
  .gradio-container {
40
  background: linear-gradient(135deg, #f0f4f8 0%, #d9e2ec 100%) !important;
41
- font-family: 'Inter', -apple-system, BlinkMacSystemFont, sans-serif !important;
42
  }
43
  .header {
44
  background: linear-gradient(135deg, #1a365d 0%, #2c5282 50%, #3182ce 100%);
45
- color: white; padding: 40px 24px; border-radius: 20px;
46
- text-align: center; margin-bottom: 20px;
47
- box-shadow: 0 15px 40px rgba(26, 54, 93, 0.3);
48
  }
49
- .header h1 { font-size: 2.5rem; font-weight: 800; margin: 0 0 10px 0; }
50
- .header .subtitle { font-size: 1.1rem; opacity: 0.9; margin-bottom: 12px; }
51
  .header .status {
52
- display: inline-flex; align-items: center; gap: 8px;
53
- background: rgba(255,255,255,0.15); padding: 8px 20px; border-radius: 50px;
54
- font-weight: 600; font-size: 0.9rem;
55
  }
56
- .status-dot { width: 10px; height: 10px; background: #48bb78; border-radius: 50%; animation: pulse 2s infinite; }
57
- @keyframes pulse { 0%, 100% { opacity: 1; } 50% { opacity: 0.5; } }
 
 
58
 
59
  .info-box {
60
  background: linear-gradient(135deg, #ebf4ff 0%, #c3dafe 100%);
61
- border: 1px solid #90cdf4; border-radius: 12px; padding: 16px; margin-bottom: 16px;
62
  }
63
- .info-box h3 { color: #2b6cb0; margin: 0 0 6px 0; font-size: 1rem; }
64
- .info-box p { color: #4299e1; margin: 0; font-size: 0.9rem; }
65
 
66
  .bird-card {
67
- background: white; border: 1px solid #e2e8f0; border-radius: 16px;
68
- padding: 20px; margin: 12px 0; display: flex; gap: 16px;
69
- box-shadow: 0 4px 15px rgba(0,0,0,0.05);
70
- transition: transform 0.2s, box-shadow 0.2s;
71
  }
72
- .bird-card:hover { transform: translateY(-2px); box-shadow: 0 8px 25px rgba(0,0,0,0.1); }
73
- .bird-card img { width: 120px; height: 120px; object-fit: cover; border-radius: 12px; flex-shrink: 0; }
74
  .bird-info { flex: 1; min-width: 0; }
75
- .bird-info h3 { color: #1a202c; margin: 0 0 4px 0; font-size: 1.2rem; font-weight: 700; }
76
- .bird-info .scientific { color: #718096; font-style: italic; font-size: 0.85rem; margin-bottom: 10px; }
77
- .confidence { display: inline-block; padding: 4px 12px; border-radius: 20px; font-weight: 700; font-size: 0.8rem; }
78
  .conf-high { background: #c6f6d5; color: #22543d; }
79
  .conf-med { background: #fefcbf; color: #744210; }
80
  .conf-low { background: #fed7d7; color: #742a2a; }
81
- .reason { color: #4a5568; margin-top: 10px; line-height: 1.6; font-size: 0.9rem; }
82
-
83
- .error { background: #fff5f5; border: 1px solid #fc8181; border-radius: 12px; padding: 20px; color: #c53030; }
84
- .success { background: #f0fff4; border: 1px solid #68d391; border-radius: 12px; padding: 20px; color: #276749; }
85
- .processing { background: #ebf8ff; border: 1px solid #63b3ed; border-radius: 12px; padding: 20px; color: #2b6cb0; }
86
 
87
- .features-box {
88
- background: #f7fafc; border: 1px solid #e2e8f0; border-radius: 10px;
89
- padding: 14px; margin: 10px 0; font-family: monospace; font-size: 0.85rem;
90
- }
91
-
92
- @media (max-width: 768px) {
93
- .header h1 { font-size: 1.8rem; }
94
- .bird-card { flex-direction: column; }
95
- .bird-card img { width: 100%; height: 180px; }
96
- }
97
  """
98
 
99
 
100
- # ================== UTILITY FUNCTIONS ==================
101
-
102
- def image_to_base64(image: Image.Image) -> str:
103
- """Convert PIL image to base64."""
104
- buffered = io.BytesIO()
105
- # Resize for faster processing
106
- max_size = 800
107
- if max(image.size) > max_size:
108
- ratio = max_size / max(image.size)
109
- new_size = (int(image.size[0] * ratio), int(image.size[1] * ratio))
110
- image = image.resize(new_size, Image.Resampling.LANCZOS)
111
- image.save(buffered, format="JPEG", quality=85)
112
- return base64.b64encode(buffered.getvalue()).decode()
113
-
114
-
115
- def get_wikipedia_image(bird_name: str) -> str:
116
- """Get bird image from Wikipedia."""
117
- if not bird_name or bird_name.lower() in ['unknown', 'the bird', 'the image', 'bird']:
118
- return "https://via.placeholder.com/150x150.png?text=Bird"
119
-
120
- try:
121
- clean_name = bird_name.strip().replace(" ", "_")
122
- api_url = f"https://en.wikipedia.org/api/rest_v1/page/summary/{urllib.parse.quote(clean_name)}"
123
-
124
- response = requests.get(api_url, timeout=5)
125
- if response.status_code == 200:
126
- data = response.json()
127
- if "thumbnail" in data:
128
- return data["thumbnail"]["source"]
129
- except Exception as e:
130
- log(f"Wikipedia image fetch failed: {e}")
131
-
132
- return f"https://via.placeholder.com/150x150.png?text={urllib.parse.quote(bird_name[:12])}"
133
-
134
-
135
  # ================== OLLAMA FUNCTIONS ==================
136
 
137
- def check_ollama() -> Tuple[bool, Optional[str], bool]:
138
- """Check Ollama availability. Returns (available, model_name, is_vision_model)."""
 
139
  try:
140
  response = requests.get(f"{OLLAMA_URL}/api/tags", timeout=3)
141
  if response.status_code == 200:
142
  models = [m["name"] for m in response.json().get("models", [])]
143
  log(f"Ollama models: {models}")
 
144
 
145
- # Check for vision models first
146
- for vm in OLLAMA_VISION_MODELS:
147
- for m in models:
148
- if vm.split(":")[0] in m.lower():
149
- log(f"Found vision model: {m}")
150
- return True, m, True
151
 
152
- # Fall back to text models
153
- for tm in OLLAMA_TEXT_MODELS:
154
- for m in models:
155
- if tm.split(":")[0] in m.lower():
156
- log(f"Found text model: {m}")
157
- return True, m, False
158
-
159
- return False, None, False
160
  except Exception as e:
161
- log(f"Ollama not available: {e}")
162
- return False, None, False
 
163
 
164
 
165
- def call_ollama_vision(image: Image.Image, prompt: str, model: str) -> str:
166
- """Call Ollama vision model (LLaVA)."""
167
  try:
168
- img_b64 = image_to_base64(image)
 
 
 
 
 
 
 
 
 
 
 
169
 
170
  response = requests.post(
171
  f"{OLLAMA_URL}/api/generate",
@@ -174,106 +135,95 @@ def call_ollama_vision(image: Image.Image, prompt: str, model: str) -> str:
174
  "prompt": prompt,
175
  "images": [img_b64],
176
  "stream": False,
177
- "options": {"temperature": 0.2, "num_predict": 1500}
178
  },
179
- timeout=180
180
  )
181
 
182
  if response.status_code == 200:
183
  result = response.json().get("response", "")
184
- log(f"LLaVA response: {result[:300]}...")
185
  return result
186
  else:
187
- log(f"Ollama vision error: {response.status_code} - {response.text[:200]}")
188
- return ""
189
  except Exception as e:
190
- log(f"Ollama vision failed: {e}")
191
- return ""
192
 
193
 
194
  def call_ollama_text(prompt: str, model: str) -> str:
195
- """Call Ollama text model."""
196
  try:
 
197
  response = requests.post(
198
  f"{OLLAMA_URL}/api/generate",
199
  json={
200
  "model": model,
201
  "prompt": prompt,
202
  "stream": False,
203
- "options": {"temperature": 0.2, "num_predict": 1000}
204
  },
205
- timeout=90
206
  )
207
-
208
  if response.status_code == 200:
209
  return response.json().get("response", "")
210
- return ""
211
  except Exception as e:
212
- log(f"Ollama text error: {e}")
213
- return ""
214
 
215
 
216
  # ================== HUGGINGFACE FUNCTIONS ==================
217
 
218
- def call_huggingface_vlm(image: Image.Image, prompt: str) -> str:
219
- """Call HuggingFace Vision-Language Model."""
220
  if not HF_TOKEN:
221
- log("No HF_TOKEN - skipping HuggingFace")
222
  return ""
223
 
224
  headers = {"Authorization": f"Bearer {HF_TOKEN}"}
225
 
226
- # Convert image to bytes
227
- img_bytes = io.BytesIO()
228
- # Resize for API
229
- max_size = 600
230
  if max(image.size) > max_size:
231
  ratio = max_size / max(image.size)
232
- new_size = (int(image.size[0] * ratio), int(image.size[1] * ratio))
233
- image = image.resize(new_size, Image.Resampling.LANCZOS)
234
- image.save(img_bytes, format="JPEG", quality=80)
235
- img_bytes.seek(0)
236
 
237
- # Try BLIP models for image captioning
238
  models = [
239
  "Salesforce/blip-image-captioning-large",
240
  "Salesforce/blip-image-captioning-base",
241
- "nlpconnect/vit-gpt2-image-captioning",
242
  ]
243
 
244
  for model in models:
245
  try:
246
- log(f"Trying HF vision model: {model}")
247
- api_url = f"https://api-inference.huggingface.co/models/{model}"
248
-
249
  response = requests.post(
250
- api_url,
251
  headers=headers,
252
- data=img_bytes.getvalue(),
253
- timeout=60
254
  )
255
 
256
  if response.status_code == 200:
257
  result = response.json()
258
- if isinstance(result, list) and len(result) > 0:
259
  caption = result[0].get("generated_text", "")
260
  if caption:
261
  log(f"HF caption: {caption}")
262
  return caption
263
  elif response.status_code == 503:
264
- log(f"Model {model} loading...")
265
- continue
266
  else:
267
- log(f"HF {model}: {response.status_code}")
268
-
269
  except Exception as e:
270
- log(f"HF vision error: {e}")
271
- continue
272
 
273
  return ""
274
 
275
 
276
- def call_huggingface_text(prompt: str) -> str:
277
  """Call HuggingFace text model."""
278
  if not HF_TOKEN:
279
  return ""
@@ -281,344 +231,280 @@ def call_huggingface_text(prompt: str) -> str:
281
  headers = {"Authorization": f"Bearer {HF_TOKEN}", "Content-Type": "application/json"}
282
 
283
  models = [
284
- "mistralai/Mistral-7B-Instruct-v0.3",
285
  "HuggingFaceH4/zephyr-7b-beta",
286
- "google/flan-t5-xxl",
287
  ]
288
 
289
  for model in models:
290
  try:
291
  log(f"Trying HF text model: {model}")
292
- api_url = f"https://api-inference.huggingface.co/models/{model}"
293
-
294
  response = requests.post(
295
- api_url,
296
  headers=headers,
297
- json={
298
- "inputs": prompt,
299
- "parameters": {"max_new_tokens": 800, "temperature": 0.3, "return_full_text": False}
300
- },
301
- timeout=60
302
  )
303
 
304
  if response.status_code == 200:
305
  result = response.json()
306
- if isinstance(result, list) and len(result) > 0:
307
  text = result[0].get("generated_text", "")
308
  if text:
309
- log(f"HF text response: {text[:200]}...")
310
  return text
311
  elif response.status_code == 503:
312
  continue
313
-
314
  except Exception as e:
315
  log(f"HF text error: {e}")
316
- continue
317
 
318
  return ""
319
 
320
 
321
- # ================== BIRD IDENTIFICATION ==================
322
 
323
- LLAVA_BIRD_PROMPT = """You are an expert ornithologist. Look at this bird image very carefully.
324
-
325
- Identify the bird species. Provide your answer as JSON:
326
-
327
- {
328
- "birds": [
329
- {
330
- "name": "Zebra Finch",
331
- "scientific_name": "Taeniopygia guttata",
332
- "confidence": 95,
333
- "reason": "Identified by orange cheek patches, red-orange beak, black and white barred throat, chestnut flanks with white spots"
334
- }
335
- ],
336
- "summary": "This is a Zebra Finch, a small Australian finch commonly kept as a pet."
337
- }
338
-
339
- Be SPECIFIC with the bird name - use the actual species name like "House Sparrow", "Indian Robin", "Zebra Finch", etc.
340
- DO NOT use generic names like "The bird" or "Unknown".
341
- If you're not 100% sure, still provide your best guess with lower confidence.
342
-
343
- Return ONLY valid JSON."""
344
-
345
-
346
- TEXT_IDENTIFICATION_PROMPT = """You are an expert ornithologist. Based on this image description, identify the bird species.
347
-
348
- IMAGE DESCRIPTION: {description}
349
-
350
- Respond with JSON ONLY:
351
- {{
352
- "birds": [
353
- {{
354
- "name": "Species Common Name",
355
- "scientific_name": "Scientific name",
356
- "confidence": 85,
357
- "reason": "Specific features that match this species"
358
- }}
359
- ],
360
- "summary": "Brief identification summary"
361
- }}
362
-
363
- IMPORTANT:
364
- - Use ACTUAL bird species names (e.g., "Zebra Finch", "House Sparrow", "Indian Roller")
365
- - NEVER use generic names like "The bird", "Unknown", "The image"
366
- - If description mentions orange beak, striped throat, spotted flanks - this is likely a Zebra Finch
367
- - Provide your best species guess even if uncertain
368
-
369
- Return ONLY the JSON, nothing else."""
370
-
371
-
372
- def parse_bird_json(text: str) -> Tuple[List[Dict], str]:
373
- """Parse bird identification from LLM response."""
374
  birds = []
375
  summary = ""
376
 
377
- # Clean the text
378
- text = text.strip()
 
 
379
 
380
- # Try to find JSON in the response
381
  try:
382
- # Look for JSON object
383
- json_match = re.search(r'\{[\s\S]*?"birds"[\s\S]*?\}(?=\s*$|\s*```)', text)
384
  if json_match:
385
  json_str = json_match.group()
386
- # Fix common JSON issues
387
- json_str = re.sub(r',\s*}', '}', json_str)
388
- json_str = re.sub(r',\s*]', ']', json_str)
389
-
390
  data = json.loads(json_str)
391
- birds = data.get("birds", [])
 
392
  summary = data.get("summary", "")
393
 
394
- # Validate bird names
395
- valid_birds = []
396
- for bird in birds:
397
- name = bird.get("name", "").strip()
398
- # Filter out garbage names
399
- if name and name.lower() not in ['the bird', 'the image', 'unknown', 'bird', 'a bird']:
400
- valid_birds.append(bird)
 
 
 
401
 
402
- if valid_birds:
403
- return valid_birds, summary
404
-
405
  except json.JSONDecodeError as e:
406
  log(f"JSON parse error: {e}")
407
 
408
- # Fallback: Try to extract bird names from text
409
- log("Falling back to text extraction...")
410
-
411
- # Common bird species patterns
412
- bird_patterns = [
413
- r'(?:identified as|this is|appears to be|likely|probably)\s+(?:a|an)?\s*([A-Z][a-z]+(?: [A-Z]?[a-z]+)+)',
414
- r'([A-Z][a-z]+ (?:Finch|Sparrow|Robin|Warbler|Dove|Pigeon|Parrot|Kingfisher|Woodpecker|Eagle|Hawk|Owl|Heron|Crane|Duck|Goose|Swan|Crow|Raven|Jay|Magpie|Starling|Myna|Bulbul|Sunbird|Flowerpecker|Barbet|Drongo|Shrike|Oriole|Flycatcher|Thrush|Babbler))',
415
- r'(Zebra Finch|House Sparrow|Indian Robin|Common Myna|Red-vented Bulbul|Rose-ringed Parakeet)',
416
  ]
417
 
418
- for pattern in bird_patterns:
419
- matches = re.findall(pattern, text, re.IGNORECASE)
420
- if matches:
421
- for match in matches[:2]: # Take first 2 matches
422
- birds.append({
423
- "name": match.strip().title(),
424
- "scientific_name": "See reference",
425
- "confidence": 70,
426
- "reason": "Extracted from AI analysis"
427
- })
 
 
 
 
 
 
 
 
 
 
 
428
  break
429
 
430
- # If still no birds found, check for specific description matches
431
- if not birds:
432
- text_lower = text.lower()
433
- if 'orange' in text_lower and ('beak' in text_lower or 'bill' in text_lower):
434
- if 'stripe' in text_lower or 'bar' in text_lower or 'spot' in text_lower:
435
- birds.append({
436
- "name": "Zebra Finch",
437
- "scientific_name": "Taeniopygia guttata",
438
- "confidence": 75,
439
- "reason": "Orange beak with striped/spotted pattern suggests Zebra Finch"
440
- })
441
- elif 'grey' in text_lower or 'gray' in text_lower:
442
- if 'small' in text_lower:
443
- birds.append({
444
- "name": "House Sparrow",
445
- "scientific_name": "Passer domesticus",
446
- "confidence": 60,
447
- "reason": "Small grey bird - possibly House Sparrow"
448
- })
 
 
 
 
 
449
 
450
- return birds, summary or "Based on AI visual analysis"
 
451
 
452
 
453
  def format_bird_card(bird: Dict, index: int) -> str:
454
- """Format bird result as HTML card."""
455
- name = bird.get("name", "Unknown Species")
456
  scientific = bird.get("scientific_name", "")
457
  confidence = bird.get("confidence", 50)
458
  reason = bird.get("reason", "")
459
 
460
- # Skip invalid names
461
- if name.lower() in ['unknown', 'the bird', 'the image', 'bird']:
462
- name = "Unidentified Bird"
463
 
464
- img_url = get_wikipedia_image(name)
465
-
466
- if confidence >= 80:
467
- conf_class = "conf-high"
468
- elif confidence >= 60:
469
- conf_class = "conf-med"
470
- else:
471
- conf_class = "conf-low"
472
 
473
  return f"""
474
  <div class="bird-card">
475
- <img src="{img_url}" alt="{name}" onerror="this.src='https://via.placeholder.com/120x120.png?text=Bird'">
476
  <div class="bird-info">
477
  <h3>{index}. {name}</h3>
478
- <div class="scientific">{scientific}</div>
479
  <span class="confidence {conf_class}">{confidence}% confidence</span>
480
  <p class="reason">{reason}</p>
481
  </div>
482
  </div>"""
483
 
484
 
485
- # ================== IMAGE IDENTIFICATION ==================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
486
 
487
  def identify_image_stream(image):
488
- """Identify bird from image using Vision-Language Model."""
489
  if image is None:
490
- yield '<div class="error">⚠️ Please upload an image first</div>'
491
  return
492
 
493
  try:
494
- # Ensure PIL Image
495
  if not isinstance(image, Image.Image):
496
  image = Image.fromarray(np.array(image))
497
  image = image.convert("RGB")
498
 
499
- yield '<div class="processing">πŸ” Analyzing bird image...</div>'
500
 
501
- # Check for Ollama
502
- ollama_available, model, is_vision = check_ollama()
 
503
 
504
- llm_response = ""
505
- image_description = ""
 
 
 
506
 
507
- if ollama_available and is_vision:
508
- # BEST PATH: Use LLaVA for direct image analysis
509
- yield f'<div class="processing">πŸ¦™ Using LLaVA vision model ({model})...</div>'
510
-
511
- llm_response = call_ollama_vision(image, LLAVA_BIRD_PROMPT, model)
512
-
513
- if not llm_response:
514
- # FALLBACK 1: HuggingFace vision + text
515
  yield '<div class="processing">☁️ Using HuggingFace AI...</div>'
516
 
517
- image_description = call_huggingface_vlm(image, "Describe this bird in detail")
 
518
 
519
- if image_description:
520
- yield f'''<div class="processing">πŸ” Identifying from description...</div>
521
- <div class="features-box"><strong>AI saw:</strong> {image_description}</div>'''
522
 
523
  # Use text model to identify
524
- prompt = TEXT_IDENTIFICATION_PROMPT.format(description=image_description)
525
-
526
- if ollama_available and model:
527
- llm_response = call_ollama_text(prompt, model)
528
-
529
- if not llm_response:
530
- llm_response = call_huggingface_text(prompt)
531
-
532
- if not llm_response and not image_description:
533
- # FALLBACK 2: Basic color analysis
534
- yield '<div class="processing">⚠️ Using basic color analysis...</div>'
535
- image_description = analyze_colors(image)
536
-
537
- prompt = TEXT_IDENTIFICATION_PROMPT.format(description=image_description)
538
- if ollama_available and model:
539
- llm_response = call_ollama_text(prompt, model)
 
540
 
541
  # Parse response
542
- if llm_response:
543
- birds, summary = parse_bird_json(llm_response)
544
- else:
545
- birds, summary = [], "Could not get AI response"
546
 
547
  if not birds:
548
- yield f"""<div class="error">
549
- <strong>❌ Could not identify bird species</strong>
550
- <p>The AI couldn't make a confident identification.</p>
551
- {f'<div class="features-box"><strong>AI description:</strong> {image_description[:300]}</div>' if image_description else ''}
552
- <p>Try uploading a clearer image with the bird in focus.</p>
553
- </div>"""
554
  return
555
 
556
- # Format results
557
- result_html = f"""<div class="success">
558
- <h2>🐦 {len(birds)} Bird(s) Identified!</h2>
559
- <p>{summary}</p>
560
- </div>"""
561
 
562
  for i, bird in enumerate(birds, 1):
563
- result_html += format_bird_card(bird, i)
564
-
565
- yield result_html
566
-
567
- except Exception as e:
568
- log(f"Image identification error: {traceback.format_exc()}")
569
- yield f'<div class="error"><strong>❌ Error:</strong> {str(e)}</div>'
570
-
571
-
572
- def analyze_colors(image: Image.Image) -> str:
573
- """Basic color analysis fallback."""
574
- try:
575
- img = np.array(image)
576
- h, w = img.shape[:2]
577
 
578
- def describe_region(region):
579
- mean = np.mean(region, axis=(0, 1))
580
- r, g, b = mean
581
-
582
- colors = []
583
- if r > 180 and g < 120 and b < 120: colors.append("red")
584
- elif r > 180 and g > 100 and b < 100: colors.append("orange")
585
- elif r > 180 and g > 180 and b < 120: colors.append("yellow")
586
- elif r > 150 and g > 150 and b > 150: colors.append("white/light grey")
587
- elif r < 80 and g < 80 and b < 80: colors.append("black/dark")
588
- elif 80 < r < 150 and 80 < g < 150 and 80 < b < 150: colors.append("grey")
589
- elif r > g > b: colors.append("brown/rufous")
590
- else: colors.append("mixed colors")
591
-
592
- return ", ".join(colors)
593
-
594
- head = describe_region(img[:h//3, :, :])
595
- body = describe_region(img[h//3:2*h//3, :, :])
596
- lower = describe_region(img[2*h//3:, :, :])
597
-
598
- return f"Small bird. Head region: {head}. Body: {body}. Lower parts: {lower}."
599
 
600
  except Exception as e:
601
- return f"Could not analyze image: {e}"
 
602
 
603
 
604
  # ================== AUDIO IDENTIFICATION ==================
605
 
606
  def process_audio(audio_data: np.ndarray, sr: int) -> Dict:
607
- """Process audio and extract bird call features."""
608
  try:
609
- # Normalize
610
  audio = audio_data.astype(np.float64)
611
  if np.max(np.abs(audio)) > 0:
612
  audio = audio / np.max(np.abs(audio))
613
 
614
- # Bandpass filter (bird frequencies: 500Hz - 10kHz)
615
  nyq = sr / 2
616
  low, high = max(500/nyq, 0.01), min(10000/nyq, 0.99)
617
  if low < high:
618
  b, a = signal.butter(4, [low, high], btype='band')
619
  audio = signal.filtfilt(b, a, audio)
620
 
621
- # Features
622
  duration = len(audio_data) / sr
623
 
624
  # Peak frequency
@@ -626,58 +512,43 @@ def process_audio(audio_data: np.ndarray, sr: int) -> Dict:
626
  freqs = np.fft.rfftfreq(len(audio), 1/sr)
627
  peak_freq = freqs[np.argmax(np.abs(fft))] if len(freqs) > 0 else 0
628
 
629
- # Syllable count
630
  envelope = np.abs(signal.hilbert(audio))
631
  threshold = np.mean(envelope) + 0.5 * np.std(envelope)
632
  syllables = np.sum(np.diff((envelope > threshold).astype(int)) > 0)
633
 
634
  return {
635
  "duration": round(duration, 2),
636
- "peak_frequency": int(peak_freq),
637
  "syllables": int(syllables),
638
- "pattern": "repetitive" if syllables > 5 else "simple" if syllables > 1 else "single note",
639
- "frequency_range": "high (3-8kHz)" if peak_freq > 3000 else "medium (1-3kHz)" if peak_freq > 1000 else "low (<1kHz)"
640
  }
641
-
642
- except Exception as e:
643
- return {"error": str(e), "duration": 0}
644
 
645
 
646
- AUDIO_PROMPT = """You are an expert ornithologist specializing in bird vocalizations.
647
 
648
- BIRD CALL ANALYSIS:
649
  - Duration: {duration} seconds
650
- - Peak Frequency: {peak_frequency} Hz ({frequency_range})
651
- - Syllables/Notes: {syllables}
652
- - Pattern: {pattern}
653
- {location}{month}
654
 
655
  Based on these acoustic features, identify possible bird species.
656
-
657
- High frequency (3000-8000 Hz) = small passerines (warblers, finches)
658
- Medium frequency (1000-3000 Hz) = medium birds (thrushes, bulbuls, mynas)
659
- Low frequency (500-1000 Hz) = larger birds (crows, pigeons, doves)
660
 
661
  Respond with JSON ONLY:
662
- {{
663
- "birds": [
664
- {{
665
- "name": "Species Name",
666
- "scientific_name": "Scientific name",
667
- "confidence": 75,
668
- "reason": "Why this bird matches the audio features"
669
- }}
670
- ],
671
- "summary": "Brief summary"
672
- }}
673
 
674
- Use ACTUAL species names, not generic terms."""
675
 
676
 
677
  def identify_audio_stream(audio_input, location: str = "", month: str = ""):
678
- """Identify bird from audio."""
679
  if audio_input is None:
680
- yield '<div class="error">⚠️ Please upload or record audio first</div>'
681
  return
682
 
683
  try:
@@ -688,253 +559,190 @@ def identify_audio_stream(audio_input, location: str = "", month: str = ""):
688
  return
689
 
690
  if len(audio_data) == 0:
691
- yield '<div class="error">⚠️ Audio is empty</div>'
692
  return
693
 
694
- # Convert to mono
695
  if len(audio_data.shape) > 1:
696
  audio_data = np.mean(audio_data, axis=1)
697
 
698
- yield '<div class="processing">πŸ”Š Processing audio with SAM-Audio...</div>'
699
 
700
  features = process_audio(audio_data, sr)
701
 
702
- features_html = f"""<div class="features-box">
703
- <strong>🎡 Audio Analysis</strong><br>
704
- β€’ Duration: {features.get('duration', 0)}s<br>
705
- β€’ Peak Frequency: {features.get('peak_frequency', 0)} Hz ({features.get('frequency_range', 'unknown')})<br>
706
- β€’ Syllables: {features.get('syllables', 0)}<br>
707
- β€’ Pattern: {features.get('pattern', 'unknown')}
708
- </div>"""
709
-
710
- yield f'<div class="processing">πŸ€– Identifying bird species...</div>{features_html}'
711
-
712
- # Build prompt
713
- prompt = AUDIO_PROMPT.format(
714
- **features,
715
- location=f"\n- Location: {location}" if location else "",
716
- month=f"\n- Month: {month}" if month else ""
717
- )
718
 
719
- # Get identification
720
- ollama_available, model, _ = check_ollama()
 
 
 
 
 
721
  response = ""
722
 
723
- if ollama_available and model:
724
- yield f'<div class="processing">πŸ¦™ Using local AI ({model})...</div>{features_html}'
725
- response = call_ollama_text(prompt, model)
 
726
 
727
  if not response:
728
- yield f'<div class="processing">☁️ Using cloud AI...</div>{features_html}'
729
- response = call_huggingface_text(prompt)
730
 
731
- birds, summary = parse_bird_json(response)
732
 
733
  if not birds:
734
- yield f"""<div class="error">
735
- <strong>Could not identify bird from audio</strong>
736
- <p>Try recording a clearer sample with less background noise.</p>
737
  {features_html}
738
- </div>"""
739
  return
740
 
741
- result_html = f"""<div class="success">
742
- <h2>🐦 {len(birds)} Bird(s) Identified from Audio!</h2>
743
  <p>{summary}</p>
744
- </div>{features_html}"""
745
 
746
  for i, bird in enumerate(birds, 1):
747
- result_html += format_bird_card(bird, i)
748
 
749
- yield result_html
750
 
751
  except Exception as e:
752
  log(f"Audio error: {traceback.format_exc()}")
753
- yield f'<div class="error"><strong>❌ Error:</strong> {str(e)}</div>'
754
 
755
 
756
  # ================== DESCRIPTION IDENTIFICATION ==================
757
 
758
  def identify_description_stream(description: str):
759
  """Identify bird from text description."""
760
- if not description or len(description.strip()) < 10:
761
- yield '<div class="error">⚠️ Please enter a more detailed description</div>'
762
  return
763
 
764
  try:
765
- yield '<div class="processing">πŸ” Analyzing your description...</div>'
766
 
767
- prompt = f"""You are an expert ornithologist specializing in Indian birds.
768
 
769
- USER DESCRIPTION:
770
  "{description}"
771
 
772
- Identify the bird species that best matches this description.
773
-
774
- Respond with JSON ONLY:
775
- {{
776
- "birds": [
777
- {{
778
- "name": "Species Name",
779
- "scientific_name": "Scientific name",
780
- "confidence": 85,
781
- "reason": "Why this matches the description"
782
- }}
783
- ],
784
- "summary": "Brief summary"
785
- }}
786
 
787
- Use ACTUAL species names like "House Sparrow", "Indian Robin", "Zebra Finch" - never generic terms."""
788
 
789
- ollama_available, model, _ = check_ollama()
790
  response = ""
791
 
792
- if ollama_available and model:
793
  yield '<div class="processing">πŸ¦™ Using local AI...</div>'
794
- response = call_ollama_text(prompt, model)
795
 
796
  if not response:
797
- yield '<div class="processing">☁️ Using cloud AI...</div>'
798
- response = call_huggingface_text(prompt)
799
 
800
- birds, summary = parse_bird_json(response)
801
 
802
  if not birds:
803
- yield """<div class="error">
804
- <strong>Could not identify bird</strong>
805
- <p>Try adding more details about colors, size, behavior, or sounds.</p>
806
- </div>"""
807
  return
808
 
809
- result_html = f"""<div class="success">
810
- <h2>🐦 {len(birds)} Bird(s) Match Your Description!</h2>
811
  <p>{summary}</p>
812
- </div>"""
813
 
814
  for i, bird in enumerate(birds, 1):
815
- result_html += format_bird_card(bird, i)
816
 
817
- yield result_html
818
 
819
  except Exception as e:
820
- log(f"Description error: {traceback.format_exc()}")
821
- yield f'<div class="error"><strong>❌ Error:</strong> {str(e)}</div>'
822
 
823
 
824
- # ================== MAIN UI ==================
825
 
826
- def get_status():
827
- """Get AI backend status."""
828
- ollama_available, model, is_vision = check_ollama()
829
- if is_vision:
830
- return f"πŸ¦™ LLaVA Vision ({model})"
831
- elif ollama_available:
832
- return f"πŸ¦™ Local AI ({model})"
 
833
  elif HF_TOKEN:
834
- return "☁️ HuggingFace Cloud"
835
  else:
836
- return "⚠️ Limited Mode"
837
 
838
 
839
- def create_ui():
840
- """Create Gradio interface with Audio tab first."""
841
-
842
  with gr.Blocks(title="BirdSense Pro") as demo:
843
  gr.HTML(f"<style>{CSS}</style>")
844
 
845
  gr.HTML(f"""
846
  <div class="header">
847
  <h1>🐦 BirdSense Pro</h1>
848
- <p class="subtitle">AI-Powered Bird Identification β€’ Audio β€’ Image β€’ Description</p>
849
- <div class="status">
850
- <span class="status-dot"></span>
851
- {get_status()}
852
- </div>
853
- </div>
854
- """)
855
 
856
- # AUDIO TAB FIRST (as requested)
857
  with gr.Tab("🎡 Audio"):
858
- gr.HTML("""<div class="info-box">
859
- <h3>🎡 Audio Identification</h3>
860
- <p>Upload or record bird calls. SAM-Audio processing isolates bird sounds from background noise.</p>
861
- </div>""")
862
-
863
  with gr.Row():
864
  with gr.Column():
865
- audio_input = gr.Audio(sources=["upload", "microphone"], type="numpy", label="🎀 Upload or Record")
866
  with gr.Row():
867
- location = gr.Textbox(label="πŸ“ Location", placeholder="e.g., Mumbai, Delhi")
868
- month = gr.Dropdown(label="πŸ“… Month", choices=["", "January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"])
869
- audio_btn = gr.Button("πŸ” Identify Bird", variant="primary", size="lg")
870
-
871
  with gr.Column():
872
- audio_output = gr.HTML('<div style="padding:50px; text-align:center; color:#a0aec0;">🎡 Upload or record bird calls to identify</div>')
873
-
874
- audio_btn.click(identify_audio_stream, [audio_input, location, month], audio_output)
875
 
876
- # IMAGE TAB
877
  with gr.Tab("πŸ“· Image"):
878
- gr.HTML("""<div class="info-box">
879
- <h3>πŸ“· Image Identification</h3>
880
- <p>Upload a bird photo. LLaVA vision model analyzes the actual image for accurate identification.</p>
881
- </div>""")
882
-
883
  with gr.Row():
884
  with gr.Column():
885
- image_input = gr.Image(sources=["upload", "webcam"], type="pil", label="πŸ“Έ Upload or Capture")
886
- image_btn = gr.Button("πŸ” Identify Bird", variant="primary", size="lg")
887
-
888
  with gr.Column():
889
- image_output = gr.HTML('<div style="padding:50px; text-align:center; color:#a0aec0;">πŸ“· Upload a bird image to identify</div>')
890
-
891
- image_btn.click(identify_image_stream, [image_input], image_output)
892
 
893
- # DESCRIPTION TAB
894
  with gr.Tab("πŸ“ Description"):
895
- gr.HTML("""<div class="info-box">
896
- <h3>πŸ“ Text Description</h3>
897
- <p>Describe the bird you saw β€” colors, size, behavior, sounds, habitat.</p>
898
- </div>""")
899
-
900
  with gr.Row():
901
  with gr.Column():
902
- desc_input = gr.Textbox(
903
- label="✍️ Describe the Bird",
904
- lines=4,
905
- placeholder="Example: Small bird with bright orange beak, grey head with orange cheek patches, black and white striped throat, chestnut brown sides with white spots."
906
- )
907
- desc_btn = gr.Button("πŸ” Identify Bird", variant="primary", size="lg")
908
-
909
  with gr.Column():
910
- desc_output = gr.HTML('<div style="padding:50px; text-align:center; color:#a0aec0;">πŸ“ Describe a bird to identify it</div>')
911
-
912
- desc_btn.click(identify_description_stream, [desc_input], desc_output)
913
 
914
- gr.HTML("""
915
- <div style="text-align:center; margin-top:20px; padding:15px; color:#718096; font-size:0.85rem;">
916
- <strong>BirdSense Pro</strong> β€’ Uses LLaVA (local) or BLIP-2 (cloud) for vision analysis<br>
917
- For best accuracy, use local Ollama with LLaVA model
918
- </div>
919
- """)
920
 
921
  return demo
922
 
923
 
924
- # ================== MAIN ==================
925
-
926
  if __name__ == "__main__":
927
  log("Starting BirdSense Pro...")
928
- log(f"HF_TOKEN available: {bool(HF_TOKEN)}")
929
-
930
- ollama_ok, model, is_vision = check_ollama()
931
- if is_vision:
932
- log(f"βœ… LLaVA vision model ready: {model}")
933
- elif ollama_ok:
934
- log(f"⚠️ Ollama available but no vision model: {model}")
935
- log(" Run: ollama pull llava:7b")
936
- else:
937
- log("⚠️ Ollama not available, using HuggingFace fallback")
938
 
939
- demo = create_ui()
940
- demo.launch(server_name="0.0.0.0", server_port=7860, share=False, show_error=True)
 
1
  """
2
  🐦 BirdSense Pro - AI Bird Identification
3
+ - Local: Ollama LLaVA (vision) + Llama3.2 (text/audio)
4
+ - Cloud: HuggingFace BLIP-2 + Text models
5
+ NO HARDCODED BIRDS - Pure AI identification
6
  """
7
 
8
  import gradio as gr
9
  import numpy as np
10
  import scipy.signal as signal
11
+ from typing import Tuple, List, Dict, Optional
12
  import json
13
  import requests
14
  import re
 
20
  import base64
21
 
22
  # ================== CONFIG ==================
 
23
  OLLAMA_URL = os.environ.get("OLLAMA_URL", "http://localhost:11434")
24
  HF_TOKEN = os.environ.get("HF_TOKEN", "")
25
  DEBUG = True
26
 
 
 
 
 
27
  def log(msg):
28
  if DEBUG:
29
  print(f"[BirdSense] {msg}")
 
33
  CSS = """
34
  .gradio-container {
35
  background: linear-gradient(135deg, #f0f4f8 0%, #d9e2ec 100%) !important;
36
+ font-family: 'Inter', sans-serif !important;
37
  }
38
  .header {
39
  background: linear-gradient(135deg, #1a365d 0%, #2c5282 50%, #3182ce 100%);
40
+ color: white; padding: 35px 20px; border-radius: 16px;
41
+ text-align: center; margin-bottom: 16px;
42
+ box-shadow: 0 10px 30px rgba(26, 54, 93, 0.25);
43
  }
44
+ .header h1 { font-size: 2.2rem; font-weight: 800; margin: 0 0 8px 0; }
45
+ .header .subtitle { font-size: 1rem; opacity: 0.9; margin-bottom: 10px; }
46
  .header .status {
47
+ display: inline-flex; align-items: center; gap: 6px;
48
+ background: rgba(255,255,255,0.15); padding: 6px 16px; border-radius: 50px;
49
+ font-weight: 600; font-size: 0.85rem;
50
  }
51
+ .status-dot { width: 8px; height: 8px; border-radius: 50%; }
52
+ .status-green { background: #48bb78; }
53
+ .status-yellow { background: #ecc94b; }
54
+ .status-red { background: #fc8181; }
55
 
56
  .info-box {
57
  background: linear-gradient(135deg, #ebf4ff 0%, #c3dafe 100%);
58
+ border: 1px solid #90cdf4; border-radius: 10px; padding: 14px; margin-bottom: 14px;
59
  }
60
+ .info-box h3 { color: #2b6cb0; margin: 0 0 4px 0; font-size: 0.95rem; }
61
+ .info-box p { color: #4299e1; margin: 0; font-size: 0.85rem; }
62
 
63
  .bird-card {
64
+ background: white; border: 1px solid #e2e8f0; border-radius: 14px;
65
+ padding: 16px; margin: 10px 0; display: flex; gap: 14px;
66
+ box-shadow: 0 3px 10px rgba(0,0,0,0.04);
 
67
  }
68
+ .bird-card img { width: 100px; height: 100px; object-fit: cover; border-radius: 10px; flex-shrink: 0; }
 
69
  .bird-info { flex: 1; min-width: 0; }
70
+ .bird-info h3 { color: #1a202c; margin: 0 0 3px 0; font-size: 1.1rem; font-weight: 700; }
71
+ .bird-info .scientific { color: #718096; font-style: italic; font-size: 0.8rem; margin-bottom: 8px; }
72
+ .confidence { display: inline-block; padding: 3px 10px; border-radius: 16px; font-weight: 700; font-size: 0.75rem; }
73
  .conf-high { background: #c6f6d5; color: #22543d; }
74
  .conf-med { background: #fefcbf; color: #744210; }
75
  .conf-low { background: #fed7d7; color: #742a2a; }
76
+ .reason { color: #4a5568; margin-top: 8px; line-height: 1.5; font-size: 0.85rem; }
 
 
 
 
77
 
78
+ .error { background: #fff5f5; border: 1px solid #fc8181; border-radius: 10px; padding: 16px; color: #c53030; }
79
+ .success { background: #f0fff4; border: 1px solid #68d391; border-radius: 10px; padding: 16px; color: #276749; }
80
+ .processing { background: #ebf8ff; border: 1px solid #63b3ed; border-radius: 10px; padding: 16px; color: #2b6cb0; }
81
+ .features-box { background: #f7fafc; border: 1px solid #e2e8f0; border-radius: 8px; padding: 12px; margin: 8px 0; font-size: 0.8rem; }
 
 
 
 
 
 
82
  """
83
 
84
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
  # ================== OLLAMA FUNCTIONS ==================
86
 
87
+ def check_ollama_models() -> Dict:
88
+ """Check available Ollama models."""
89
+ result = {"available": False, "vision_model": None, "text_model": None}
90
  try:
91
  response = requests.get(f"{OLLAMA_URL}/api/tags", timeout=3)
92
  if response.status_code == 200:
93
  models = [m["name"] for m in response.json().get("models", [])]
94
  log(f"Ollama models: {models}")
95
+ result["available"] = True
96
 
97
+ # Find vision model
98
+ for m in models:
99
+ if "llava" in m.lower() or "bakllava" in m.lower():
100
+ result["vision_model"] = m
101
+ break
 
102
 
103
+ # Find text model
104
+ for m in models:
105
+ if any(t in m.lower() for t in ["llama", "qwen", "mistral", "phi"]):
106
+ if "llava" not in m.lower(): # Exclude vision models
107
+ result["text_model"] = m
108
+ break
 
 
109
  except Exception as e:
110
+ log(f"Ollama check failed: {e}")
111
+
112
+ return result
113
 
114
 
115
+ def call_llava(image: Image.Image, prompt: str, model: str) -> str:
116
+ """Call LLaVA vision model."""
117
  try:
118
+ # Resize image
119
+ max_size = 768
120
+ if max(image.size) > max_size:
121
+ ratio = max_size / max(image.size)
122
+ image = image.resize((int(image.size[0]*ratio), int(image.size[1]*ratio)), Image.Resampling.LANCZOS)
123
+
124
+ # Convert to base64
125
+ buffer = io.BytesIO()
126
+ image.save(buffer, format="JPEG", quality=85)
127
+ img_b64 = base64.b64encode(buffer.getvalue()).decode()
128
+
129
+ log(f"Calling LLaVA ({model}) with {len(img_b64)} bytes image...")
130
 
131
  response = requests.post(
132
  f"{OLLAMA_URL}/api/generate",
 
135
  "prompt": prompt,
136
  "images": [img_b64],
137
  "stream": False,
138
+ "options": {"temperature": 0.1, "num_predict": 1200}
139
  },
140
+ timeout=120
141
  )
142
 
143
  if response.status_code == 200:
144
  result = response.json().get("response", "")
145
+ log(f"LLaVA response ({len(result)} chars): {result[:300]}...")
146
  return result
147
  else:
148
+ log(f"LLaVA error: {response.status_code} - {response.text[:200]}")
 
149
  except Exception as e:
150
+ log(f"LLaVA call failed: {traceback.format_exc()}")
151
+ return ""
152
 
153
 
154
  def call_ollama_text(prompt: str, model: str) -> str:
155
+ """Call Ollama text model (for audio/description)."""
156
  try:
157
+ log(f"Calling text model ({model})...")
158
  response = requests.post(
159
  f"{OLLAMA_URL}/api/generate",
160
  json={
161
  "model": model,
162
  "prompt": prompt,
163
  "stream": False,
164
+ "options": {"temperature": 0.2, "num_predict": 800}
165
  },
166
+ timeout=60
167
  )
 
168
  if response.status_code == 200:
169
  return response.json().get("response", "")
 
170
  except Exception as e:
171
+ log(f"Text model error: {e}")
172
+ return ""
173
 
174
 
175
  # ================== HUGGINGFACE FUNCTIONS ==================
176
 
177
+ def call_hf_image_caption(image: Image.Image) -> str:
178
+ """Get image caption from HuggingFace BLIP."""
179
  if not HF_TOKEN:
180
+ log("No HF_TOKEN")
181
  return ""
182
 
183
  headers = {"Authorization": f"Bearer {HF_TOKEN}"}
184
 
185
+ # Resize
186
+ max_size = 512
 
 
187
  if max(image.size) > max_size:
188
  ratio = max_size / max(image.size)
189
+ image = image.resize((int(image.size[0]*ratio), int(image.size[1]*ratio)), Image.Resampling.LANCZOS)
190
+
191
+ buffer = io.BytesIO()
192
+ image.save(buffer, format="JPEG", quality=80)
193
 
 
194
  models = [
195
  "Salesforce/blip-image-captioning-large",
196
  "Salesforce/blip-image-captioning-base",
 
197
  ]
198
 
199
  for model in models:
200
  try:
201
+ log(f"Trying HF caption model: {model}")
 
 
202
  response = requests.post(
203
+ f"https://api-inference.huggingface.co/models/{model}",
204
  headers=headers,
205
+ data=buffer.getvalue(),
206
+ timeout=45
207
  )
208
 
209
  if response.status_code == 200:
210
  result = response.json()
211
+ if isinstance(result, list) and result:
212
  caption = result[0].get("generated_text", "")
213
  if caption:
214
  log(f"HF caption: {caption}")
215
  return caption
216
  elif response.status_code == 503:
217
+ log(f"{model} loading, trying next...")
 
218
  else:
219
+ log(f"HF error {response.status_code}: {response.text[:100]}")
 
220
  except Exception as e:
221
+ log(f"HF caption error: {e}")
 
222
 
223
  return ""
224
 
225
 
226
+ def call_hf_text(prompt: str) -> str:
227
  """Call HuggingFace text model."""
228
  if not HF_TOKEN:
229
  return ""
 
231
  headers = {"Authorization": f"Bearer {HF_TOKEN}", "Content-Type": "application/json"}
232
 
233
  models = [
234
+ "mistralai/Mistral-7B-Instruct-v0.2",
235
  "HuggingFaceH4/zephyr-7b-beta",
236
+ "google/flan-t5-xl",
237
  ]
238
 
239
  for model in models:
240
  try:
241
  log(f"Trying HF text model: {model}")
 
 
242
  response = requests.post(
243
+ f"https://api-inference.huggingface.co/models/{model}",
244
  headers=headers,
245
+ json={"inputs": prompt, "parameters": {"max_new_tokens": 600, "temperature": 0.3}},
246
+ timeout=45
 
 
 
247
  )
248
 
249
  if response.status_code == 200:
250
  result = response.json()
251
+ if isinstance(result, list) and result:
252
  text = result[0].get("generated_text", "")
253
  if text:
254
+ log(f"HF text ({len(text)} chars)")
255
  return text
256
  elif response.status_code == 503:
257
  continue
 
258
  except Exception as e:
259
  log(f"HF text error: {e}")
 
260
 
261
  return ""
262
 
263
 
264
+ # ================== PARSING ==================
265
 
266
+ def parse_bird_response(text: str) -> Tuple[List[Dict], str]:
267
+ """Parse LLM response to extract bird identifications. NO HARDCODED FALLBACKS."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
268
  birds = []
269
  summary = ""
270
 
271
+ if not text:
272
+ return [], ""
273
+
274
+ log(f"Parsing response: {text[:500]}...")
275
 
276
+ # Try JSON first
277
  try:
278
+ json_match = re.search(r'\{[\s\S]*"birds"[\s\S]*\}', text)
 
279
  if json_match:
280
  json_str = json_match.group()
281
+ json_str = re.sub(r',(\s*[}\]])', r'\1', json_str) # Fix trailing commas
 
 
 
282
  data = json.loads(json_str)
283
+
284
+ raw_birds = data.get("birds", [])
285
  summary = data.get("summary", "")
286
 
287
+ for b in raw_birds:
288
+ name = b.get("name", "").strip()
289
+ # Filter out garbage
290
+ if name and len(name) > 2 and name.lower() not in ["the bird", "bird", "unknown", "the image", "image"]:
291
+ birds.append({
292
+ "name": name,
293
+ "scientific_name": b.get("scientific_name", ""),
294
+ "confidence": min(99, max(1, int(b.get("confidence", 70)))),
295
+ "reason": b.get("reason", "Identified by AI")
296
+ })
297
 
298
+ if birds:
299
+ return birds, summary
 
300
  except json.JSONDecodeError as e:
301
  log(f"JSON parse error: {e}")
302
 
303
+ # Fallback: Extract from text using patterns
304
+ # Look for "This is a/an [Bird Name]" or "[Bird Name] (Scientific name)"
305
+ patterns = [
306
+ r"(?:this is|identified as|appears to be|looks like|most likely)\s+(?:a|an|the)?\s*([A-Z][a-z]+(?:[-\s][A-Za-z]+){0,3})",
307
+ r"([A-Z][a-z]+(?:\s[A-Za-z]+)?)\s*\(([A-Z][a-z]+\s[a-z]+)\)", # Name (Scientific name)
308
+ r"species[:\s]+([A-Z][a-z]+(?:\s[A-Za-z]+)?)",
 
 
309
  ]
310
 
311
+ for pattern in patterns:
312
+ matches = re.findall(pattern, text)
313
+ for match in matches:
314
+ if isinstance(match, tuple):
315
+ name = match[0].strip()
316
+ else:
317
+ name = match.strip()
318
+
319
+ # Validate it looks like a bird name
320
+ if name and len(name) > 3 and name.lower() not in ["the bird", "bird", "unknown"]:
321
+ # Check it's not a common non-bird word
322
+ skip_words = ["the", "this", "that", "image", "photo", "picture", "bird", "species"]
323
+ if name.lower() not in skip_words:
324
+ birds.append({
325
+ "name": name,
326
+ "scientific_name": "",
327
+ "confidence": 65,
328
+ "reason": "Extracted from AI analysis"
329
+ })
330
+ break
331
+ if birds:
332
  break
333
 
334
+ return birds[:3], summary # Max 3 birds
335
+
336
+
337
+ def get_bird_image(bird_name: str) -> str:
338
+ """Get bird image from Wikipedia."""
339
+ if not bird_name or len(bird_name) < 3:
340
+ return ""
341
+
342
+ try:
343
+ # Clean name for Wikipedia
344
+ clean = bird_name.strip().replace(" ", "_")
345
+ url = f"https://en.wikipedia.org/api/rest_v1/page/summary/{urllib.parse.quote(clean)}"
346
+
347
+ response = requests.get(url, timeout=5)
348
+ if response.status_code == 200:
349
+ data = response.json()
350
+ if "thumbnail" in data:
351
+ img_url = data["thumbnail"]["source"]
352
+ log(f"Got Wikipedia image for {bird_name}")
353
+ return img_url
354
+ elif "originalimage" in data:
355
+ return data["originalimage"]["source"]
356
+ except Exception as e:
357
+ log(f"Wikipedia image error: {e}")
358
 
359
+ # Fallback placeholder with bird name
360
+ return f"https://via.placeholder.com/120x120/4299e1/ffffff?text={urllib.parse.quote(bird_name[:10])}"
361
 
362
 
363
  def format_bird_card(bird: Dict, index: int) -> str:
364
+ """Format bird as HTML card."""
365
+ name = bird.get("name", "Unknown")
366
  scientific = bird.get("scientific_name", "")
367
  confidence = bird.get("confidence", 50)
368
  reason = bird.get("reason", "")
369
 
370
+ img_url = get_bird_image(name)
 
 
371
 
372
+ conf_class = "conf-high" if confidence >= 80 else "conf-med" if confidence >= 60 else "conf-low"
 
 
 
 
 
 
 
373
 
374
  return f"""
375
  <div class="bird-card">
376
+ <img src="{img_url}" alt="{name}" onerror="this.style.display='none'">
377
  <div class="bird-info">
378
  <h3>{index}. {name}</h3>
379
+ {f'<div class="scientific">{scientific}</div>' if scientific else ''}
380
  <span class="confidence {conf_class}">{confidence}% confidence</span>
381
  <p class="reason">{reason}</p>
382
  </div>
383
  </div>"""
384
 
385
 
386
+ # ================== IDENTIFICATION FUNCTIONS ==================
387
+
388
+ IMAGE_PROMPT = """Look at this bird image carefully. Identify the bird species.
389
+
390
+ You MUST respond with valid JSON in this exact format:
391
+ {
392
+ "birds": [
393
+ {
394
+ "name": "Blue-and-yellow Macaw",
395
+ "scientific_name": "Ara ararauna",
396
+ "confidence": 95,
397
+ "reason": "Large parrot with bright blue wings and yellow underparts, characteristic of this species"
398
+ }
399
+ ],
400
+ "summary": "This is a Blue-and-yellow Macaw, a large South American parrot."
401
+ }
402
+
403
+ Look for:
404
+ - Beak shape and color
405
+ - Body colors and patterns
406
+ - Size and shape
407
+ - Any distinctive markings
408
+
409
+ Give the ACTUAL species name (not "bird" or "unknown"). If unsure, give your best guess with lower confidence.
410
+ Return ONLY the JSON."""
411
+
412
 
413
  def identify_image_stream(image):
414
+ """Identify bird from image."""
415
  if image is None:
416
+ yield '<div class="error">⚠️ Please upload an image</div>'
417
  return
418
 
419
  try:
 
420
  if not isinstance(image, Image.Image):
421
  image = Image.fromarray(np.array(image))
422
  image = image.convert("RGB")
423
 
424
+ yield '<div class="processing">πŸ” Analyzing image...</div>'
425
 
426
+ models = check_ollama_models()
427
+ response = ""
428
+ method = ""
429
 
430
+ # Try LLaVA first (best for images)
431
+ if models["vision_model"]:
432
+ yield f'<div class="processing">πŸ¦™ Using LLaVA vision model...</div>'
433
+ response = call_llava(image, IMAGE_PROMPT, models["vision_model"])
434
+ method = "LLaVA Vision"
435
 
436
+ # Fallback to HuggingFace
437
+ if not response:
 
 
 
 
 
 
438
  yield '<div class="processing">☁️ Using HuggingFace AI...</div>'
439
 
440
+ # Get caption first
441
+ caption = call_hf_image_caption(image)
442
 
443
+ if caption:
444
+ yield f'<div class="processing">πŸ” Identifying from caption...</div><div class="features-box"><b>AI sees:</b> {caption}</div>'
 
445
 
446
  # Use text model to identify
447
+ text_prompt = f"""Based on this image description, identify the bird species:
448
+
449
+ "{caption}"
450
+
451
+ Respond with JSON:
452
+ {{"birds": [{{"name": "Species Name", "scientific_name": "...", "confidence": 80, "reason": "..."}}], "summary": "..."}}
453
+
454
+ Give the ACTUAL bird species name. Return ONLY JSON."""
455
+
456
+ if models["text_model"]:
457
+ response = call_ollama_text(text_prompt, models["text_model"])
458
+ if not response:
459
+ response = call_hf_text(text_prompt)
460
+ method = "HuggingFace BLIP + Text"
461
+ else:
462
+ yield '<div class="error">❌ Could not analyze image. HuggingFace API may be unavailable.</div>'
463
+ return
464
 
465
  # Parse response
466
+ birds, summary = parse_bird_response(response)
 
 
 
467
 
468
  if not birds:
469
+ yield f'''<div class="error">
470
+ <b>❌ Could not identify bird species</b>
471
+ <p>The AI response couldn't be parsed. Try a clearer image.</p>
472
+ <div class="features-box"><b>Raw AI response:</b><br>{response[:500] if response else "No response"}</div>
473
+ </div>'''
 
474
  return
475
 
476
+ # Success
477
+ result = f'''<div class="success">
478
+ <h3>🐦 {len(birds)} Bird(s) Identified!</h3>
479
+ <p>{summary or f"Identified using {method}"}</p>
480
+ </div>'''
481
 
482
  for i, bird in enumerate(birds, 1):
483
+ result += format_bird_card(bird, i)
 
 
 
 
 
 
 
 
 
 
 
 
 
484
 
485
+ yield result
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
486
 
487
  except Exception as e:
488
+ log(f"Image error: {traceback.format_exc()}")
489
+ yield f'<div class="error">❌ Error: {str(e)}</div>'
490
 
491
 
492
  # ================== AUDIO IDENTIFICATION ==================
493
 
494
  def process_audio(audio_data: np.ndarray, sr: int) -> Dict:
495
+ """Extract audio features for bird identification."""
496
  try:
 
497
  audio = audio_data.astype(np.float64)
498
  if np.max(np.abs(audio)) > 0:
499
  audio = audio / np.max(np.abs(audio))
500
 
501
+ # Bandpass filter (500Hz - 10kHz for birds)
502
  nyq = sr / 2
503
  low, high = max(500/nyq, 0.01), min(10000/nyq, 0.99)
504
  if low < high:
505
  b, a = signal.butter(4, [low, high], btype='band')
506
  audio = signal.filtfilt(b, a, audio)
507
 
 
508
  duration = len(audio_data) / sr
509
 
510
  # Peak frequency
 
512
  freqs = np.fft.rfftfreq(len(audio), 1/sr)
513
  peak_freq = freqs[np.argmax(np.abs(fft))] if len(freqs) > 0 else 0
514
 
515
+ # Count syllables
516
  envelope = np.abs(signal.hilbert(audio))
517
  threshold = np.mean(envelope) + 0.5 * np.std(envelope)
518
  syllables = np.sum(np.diff((envelope > threshold).astype(int)) > 0)
519
 
520
  return {
521
  "duration": round(duration, 2),
522
+ "peak_freq": int(peak_freq),
523
  "syllables": int(syllables),
524
+ "freq_range": "high" if peak_freq > 3000 else "medium" if peak_freq > 1000 else "low"
 
525
  }
526
+ except:
527
+ return {"duration": 0, "peak_freq": 0, "syllables": 0, "freq_range": "unknown"}
 
528
 
529
 
530
+ AUDIO_PROMPT = """You are an expert ornithologist. Identify the bird from these audio features:
531
 
 
532
  - Duration: {duration} seconds
533
+ - Peak Frequency: {peak_freq} Hz ({freq_range} range)
534
+ - Syllables/notes detected: {syllables}
535
+ {extra}
 
536
 
537
  Based on these acoustic features, identify possible bird species.
538
+ High frequency (>3000 Hz) = small birds like warblers, finches
539
+ Medium frequency (1000-3000 Hz) = thrushes, bulbuls, mynas
540
+ Low frequency (<1000 Hz) = larger birds like crows, doves
 
541
 
542
  Respond with JSON ONLY:
543
+ {{"birds": [{{"name": "Species Name", "scientific_name": "...", "confidence": 70, "reason": "Matches because..."}}], "summary": "..."}}
 
 
 
 
 
 
 
 
 
 
544
 
545
+ Give ACTUAL species names, not generic terms."""
546
 
547
 
548
  def identify_audio_stream(audio_input, location: str = "", month: str = ""):
549
+ """Identify bird from audio - uses TEXT model, not vision."""
550
  if audio_input is None:
551
+ yield '<div class="error">⚠️ Please upload or record audio</div>'
552
  return
553
 
554
  try:
 
559
  return
560
 
561
  if len(audio_data) == 0:
562
+ yield '<div class="error">⚠️ Empty audio</div>'
563
  return
564
 
 
565
  if len(audio_data.shape) > 1:
566
  audio_data = np.mean(audio_data, axis=1)
567
 
568
+ yield '<div class="processing">πŸ”Š Analyzing audio features...</div>'
569
 
570
  features = process_audio(audio_data, sr)
571
 
572
+ features_html = f'''<div class="features-box">
573
+ <b>🎡 Audio Analysis</b><br>
574
+ β€’ Duration: {features["duration"]}s | Peak: {features["peak_freq"]} Hz ({features["freq_range"]})<br>
575
+ β€’ Syllables: {features["syllables"]}
576
+ </div>'''
577
+
578
+ yield f'<div class="processing">πŸ€– Identifying bird...</div>{features_html}'
 
 
 
 
 
 
 
 
 
579
 
580
+ extra = ""
581
+ if location: extra += f"\n- Location: {location}"
582
+ if month: extra += f"\n- Month: {month}"
583
+
584
+ prompt = AUDIO_PROMPT.format(**features, extra=extra)
585
+
586
+ models = check_ollama_models()
587
  response = ""
588
 
589
+ # Use TEXT model for audio (NOT vision!)
590
+ if models["text_model"]:
591
+ yield f'<div class="processing">πŸ¦™ Using {models["text_model"]}...</div>{features_html}'
592
+ response = call_ollama_text(prompt, models["text_model"])
593
 
594
  if not response:
595
+ yield f'<div class="processing">☁️ Using HuggingFace...</div>{features_html}'
596
+ response = call_hf_text(prompt)
597
 
598
+ birds, summary = parse_bird_response(response)
599
 
600
  if not birds:
601
+ yield f'''<div class="error">
602
+ <b>Could not identify bird from audio</b>
603
+ <p>Try a clearer recording with less background noise.</p>
604
  {features_html}
605
+ </div>'''
606
  return
607
 
608
+ result = f'''<div class="success">
609
+ <h3>🐦 {len(birds)} Bird(s) Identified!</h3>
610
  <p>{summary}</p>
611
+ </div>{features_html}'''
612
 
613
  for i, bird in enumerate(birds, 1):
614
+ result += format_bird_card(bird, i)
615
 
616
+ yield result
617
 
618
  except Exception as e:
619
  log(f"Audio error: {traceback.format_exc()}")
620
+ yield f'<div class="error">❌ Error: {str(e)}</div>'
621
 
622
 
623
  # ================== DESCRIPTION IDENTIFICATION ==================
624
 
625
  def identify_description_stream(description: str):
626
  """Identify bird from text description."""
627
+ if not description or len(description.strip()) < 5:
628
+ yield '<div class="error">⚠️ Please enter a description</div>'
629
  return
630
 
631
  try:
632
+ yield '<div class="processing">πŸ” Analyzing description...</div>'
633
 
634
+ prompt = f"""Identify the bird species from this description:
635
 
 
636
  "{description}"
637
 
638
+ Respond with JSON:
639
+ {{"birds": [{{"name": "Species Name", "scientific_name": "...", "confidence": 80, "reason": "..."}}], "summary": "..."}}
 
 
 
 
 
 
 
 
 
 
 
 
640
 
641
+ Use ACTUAL species names. Return ONLY JSON."""
642
 
643
+ models = check_ollama_models()
644
  response = ""
645
 
646
+ if models["text_model"]:
647
  yield '<div class="processing">πŸ¦™ Using local AI...</div>'
648
+ response = call_ollama_text(prompt, models["text_model"])
649
 
650
  if not response:
651
+ yield '<div class="processing">☁️ Using HuggingFace...</div>'
652
+ response = call_hf_text(prompt)
653
 
654
+ birds, summary = parse_bird_response(response)
655
 
656
  if not birds:
657
+ yield '<div class="error"><b>Could not identify bird</b><p>Try adding more details.</p></div>'
 
 
 
658
  return
659
 
660
+ result = f'''<div class="success">
661
+ <h3>🐦 {len(birds)} Bird(s) Match!</h3>
662
  <p>{summary}</p>
663
+ </div>'''
664
 
665
  for i, bird in enumerate(birds, 1):
666
+ result += format_bird_card(bird, i)
667
 
668
+ yield result
669
 
670
  except Exception as e:
671
+ yield f'<div class="error">❌ Error: {str(e)}</div>'
 
672
 
673
 
674
+ # ================== UI ==================
675
 
676
+ def get_status_html():
677
+ """Get status indicator."""
678
+ models = check_ollama_models()
679
+
680
+ if models["vision_model"]:
681
+ return f'<span class="status-dot status-green"></span> LLaVA + {models["text_model"] or "HF"}'
682
+ elif models["text_model"]:
683
+ return f'<span class="status-dot status-yellow"></span> {models["text_model"]} (no vision)'
684
  elif HF_TOKEN:
685
+ return '<span class="status-dot status-yellow"></span> HuggingFace Cloud'
686
  else:
687
+ return '<span class="status-dot status-red"></span> Limited Mode'
688
 
689
 
690
+ def create_app():
 
 
691
  with gr.Blocks(title="BirdSense Pro") as demo:
692
  gr.HTML(f"<style>{CSS}</style>")
693
 
694
  gr.HTML(f"""
695
  <div class="header">
696
  <h1>🐦 BirdSense Pro</h1>
697
+ <p class="subtitle">AI Bird Identification β€’ Audio β€’ Image β€’ Description</p>
698
+ <div class="status">{get_status_html()}</div>
699
+ </div>""")
 
 
 
 
700
 
701
+ # AUDIO FIRST
702
  with gr.Tab("🎡 Audio"):
703
+ gr.HTML('<div class="info-box"><h3>🎡 Audio Identification</h3><p>Upload or record bird calls. Uses text AI to analyze acoustic features.</p></div>')
 
 
 
 
704
  with gr.Row():
705
  with gr.Column():
706
+ audio_in = gr.Audio(sources=["upload", "microphone"], type="numpy", label="🎀 Audio")
707
  with gr.Row():
708
+ loc = gr.Textbox(label="πŸ“ Location", placeholder="e.g., Mumbai")
709
+ mon = gr.Dropdown(label="πŸ“… Month", choices=[""] + ["January","February","March","April","May","June","July","August","September","October","November","December"])
710
+ audio_btn = gr.Button("πŸ” Identify", variant="primary", size="lg")
 
711
  with gr.Column():
712
+ audio_out = gr.HTML('<div style="padding:40px;text-align:center;color:#a0aec0">🎡 Upload audio to identify</div>')
713
+ audio_btn.click(identify_audio_stream, [audio_in, loc, mon], audio_out)
 
714
 
715
+ # IMAGE
716
  with gr.Tab("πŸ“· Image"):
717
+ gr.HTML('<div class="info-box"><h3>πŸ“· Image Identification</h3><p>Upload a photo. Uses LLaVA vision AI to analyze the actual image.</p></div>')
 
 
 
 
718
  with gr.Row():
719
  with gr.Column():
720
+ img_in = gr.Image(sources=["upload", "webcam"], type="pil", label="πŸ“Έ Photo")
721
+ img_btn = gr.Button("πŸ” Identify", variant="primary", size="lg")
 
722
  with gr.Column():
723
+ img_out = gr.HTML('<div style="padding:40px;text-align:center;color:#a0aec0">πŸ“· Upload image to identify</div>')
724
+ img_btn.click(identify_image_stream, [img_in], img_out)
 
725
 
726
+ # DESCRIPTION
727
  with gr.Tab("πŸ“ Description"):
728
+ gr.HTML('<div class="info-box"><h3>πŸ“ Text Description</h3><p>Describe the bird - colors, size, behavior, sounds.</p></div>')
 
 
 
 
729
  with gr.Row():
730
  with gr.Column():
731
+ desc_in = gr.Textbox(label="✍️ Description", lines=3, placeholder="e.g., Large blue and yellow parrot with long tail")
732
+ desc_btn = gr.Button("πŸ” Identify", variant="primary", size="lg")
 
 
 
 
 
733
  with gr.Column():
734
+ desc_out = gr.HTML('<div style="padding:40px;text-align:center;color:#a0aec0">πŸ“ Describe a bird</div>')
735
+ desc_btn.click(identify_description_stream, [desc_in], desc_out)
 
736
 
737
+ gr.HTML('<div style="text-align:center;padding:10px;color:#718096;font-size:0.8rem"><b>BirdSense Pro</b> β€’ Local: LLaVA (image) + Llama3.2 (audio/text) β€’ Cloud: HuggingFace BLIP</div>')
 
 
 
 
 
738
 
739
  return demo
740
 
741
 
 
 
742
  if __name__ == "__main__":
743
  log("Starting BirdSense Pro...")
744
+ models = check_ollama_models()
745
+ log(f"Vision: {models['vision_model']}, Text: {models['text_model']}, HF: {bool(HF_TOKEN)}")
 
 
 
 
 
 
 
 
746
 
747
+ app = create_app()
748
+ app.launch(server_name="0.0.0.0", server_port=7860, show_error=True)