Spaces · Sleeping

Commit: Upload folder using huggingface_hub

app.py CHANGED
Old version (the "-" side of the diff; most deleted lines are truncated in this view, so they are summarized per hunk rather than reproduced):

@@ -1,14 +1,14 @@
  Docstring bullets rewritten; "from typing import Tuple, List, Dict," gains Optional.

@@ -20,15 +20,10 @@ import io
  Removed from CONFIG: SAMPLE_RATE = 48000 and the "# Model priorities" lists
  OLLAMA_VISION_MODELS = ["llava:7b", "llava", "bakllava", "llava:13b"] and
  OLLAMA_TEXT_MODELS = ["llama3.2", "qwen2.5:3b", "mistral", "phi4"].

@@ -38,134 +33,100 @@ def log(msg):
  CSS trimmed (smaller paddings and radii; the @media (max-width: 768px) block with
  .header h1 { font-size: 1.8rem; }, .bird-card { flex-direction: column; } and
  .bird-card img { width: 100%; height: 180px; } was dropped). Removed helpers:
  image_to_base64() (resized to 800px max, saved JPEG quality 85, returned base64) and
  get_wikipedia_image() (Wikipedia REST summary thumbnail with a via.placeholder.com
  fallback). check_ollama() returned a (ok, model, is_vision) tuple and is replaced by
  check_ollama_models() below.

@@ -174,106 +135,95 @@ def call_ollama_vision(image: Image.Image, prompt: str, model: str) -> str:
  call_ollama_vision() renamed to call_llava(), with different temperature,
  num_predict, and timeout values (truncated in this view). The HF caption fallback
  list also tried "nlpconnect/vit-gpt2-image-captioning".

@@ -281,344 +231,280 @@ def call_huggingface_text(prompt: str) -> str:
  HF text models were "mistralai/Mistral-7B-Instruct-v0.?" (version truncated) and a
  "google/flan-t5-?" variant, with max_new_tokens 800, temperature 0.3,
  "return_full_text": False, timeout=60. Removed the long LLaVA prompt with its
  worked Zebra Finch example ("orange cheek patches, red-orange beak, black and white
  barred throat, chestnut flanks with white spots") and TEXT_IDENTIFICATION_PROMPT
  (same JSON schema over an {description} placeholder, with a hint that an orange
  beak, striped throat, and spotted flanks likely indicate a Zebra Finch).
  parse_bird_json() used the stricter regex
  \{[\s\S]*?"birds"[\s\S]*?\}(?=\s*$|\s*```) plus fallback patterns matching species
  suffixes (Finch|Sparrow|Robin|Warbler|Dove|Pigeon|...|Thrush|Babbler) and a short
  list of common species (Zebra Finch, House Sparrow, Indian Robin, Common Myna,
  Red-vented Bulbul, Rose-ringed Parakeet). The old format_bird_card() renamed
  generic names to "Unidentified Bird". Also removed: analyze_colors(), a last-resort
  fallback that averaged RGB over the top/middle/bottom thirds of the image and
  mapped the means to color words ("Small bird. Head region: ... Body: ... Lower
  parts: ...").

@@ -626,58 +512,43 @@ def process_audio(audio_data: np.ndarray, sr: int) -> Dict:
  Feature keys renamed; the old dict used "frequency_range": "high (3-8kHz)" /
  "medium (1-3kHz)" / "low (<1kHz)" and returned {"error": str(e), "duration": 0} on
  failure. The old AUDIO_PROMPT interpolated {location}{month} directly and noted
  "Low frequency (500-1000 Hz) = larger birds (crows, pigeons, doves)".

@@ -688,253 +559,190 @@ def identify_audio_stream(audio_input, location: str = "", month: str = ""):
  UI copy shortened (the audio info box claimed "SAM-Audio processing isolates bird
  sounds from background noise"; the footer credited "Uses LLaVA (local) or BLIP-2
  (cloud) for vision analysis" and advised local Ollama with the LLaVA model for best
  accuracy). The old __main__ called check_ollama() and logged one of:
  "✅ LLaVA vision model ready: {model}",
  "⚠️ Ollama available but no vision model ... Run: ollama pull llava:7b", or
  "⚠️ Ollama not available, using HuggingFace fallback".
"""
|
| 2 |
π¦ BirdSense Pro - AI Bird Identification
|
| 3 |
+
- Local: Ollama LLaVA (vision) + Llama3.2 (text/audio)
|
| 4 |
+
- Cloud: HuggingFace BLIP-2 + Text models
|
| 5 |
+
NO HARDCODED BIRDS - Pure AI identification
|
| 6 |
"""
|
| 7 |
|
| 8 |
import gradio as gr
|
| 9 |
import numpy as np
|
| 10 |
import scipy.signal as signal
|
| 11 |
+
from typing import Tuple, List, Dict, Optional
|
| 12 |
import json
|
| 13 |
import requests
|
| 14 |
import re
|
|
|
|
| 20 |
import base64
|
| 21 |
|
| 22 |
# ================== CONFIG ==================
|
|
|
|
| 23 |
OLLAMA_URL = os.environ.get("OLLAMA_URL", "http://localhost:11434")
|
| 24 |
HF_TOKEN = os.environ.get("HF_TOKEN", "")
|
| 25 |
DEBUG = True
|
| 26 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
def log(msg):
|
| 28 |
if DEBUG:
|
| 29 |
print(f"[BirdSense] {msg}")
|
|
|
|
| 33 |
CSS = """
|
| 34 |
.gradio-container {
|
| 35 |
background: linear-gradient(135deg, #f0f4f8 0%, #d9e2ec 100%) !important;
|
| 36 |
+
font-family: 'Inter', sans-serif !important;
|
| 37 |
}
|
| 38 |
.header {
|
| 39 |
background: linear-gradient(135deg, #1a365d 0%, #2c5282 50%, #3182ce 100%);
|
| 40 |
+
color: white; padding: 35px 20px; border-radius: 16px;
|
| 41 |
+
text-align: center; margin-bottom: 16px;
|
| 42 |
+
box-shadow: 0 10px 30px rgba(26, 54, 93, 0.25);
|
| 43 |
}
|
| 44 |
+
.header h1 { font-size: 2.2rem; font-weight: 800; margin: 0 0 8px 0; }
|
| 45 |
+
.header .subtitle { font-size: 1rem; opacity: 0.9; margin-bottom: 10px; }
|
| 46 |
.header .status {
|
| 47 |
+
display: inline-flex; align-items: center; gap: 6px;
|
| 48 |
+
background: rgba(255,255,255,0.15); padding: 6px 16px; border-radius: 50px;
|
| 49 |
+
font-weight: 600; font-size: 0.85rem;
|
| 50 |
}
|
| 51 |
+
.status-dot { width: 8px; height: 8px; border-radius: 50%; }
|
| 52 |
+
.status-green { background: #48bb78; }
|
| 53 |
+
.status-yellow { background: #ecc94b; }
|
| 54 |
+
.status-red { background: #fc8181; }
|
| 55 |
|
| 56 |
.info-box {
|
| 57 |
background: linear-gradient(135deg, #ebf4ff 0%, #c3dafe 100%);
|
| 58 |
+
border: 1px solid #90cdf4; border-radius: 10px; padding: 14px; margin-bottom: 14px;
|
| 59 |
}
|
| 60 |
+
.info-box h3 { color: #2b6cb0; margin: 0 0 4px 0; font-size: 0.95rem; }
|
| 61 |
+
.info-box p { color: #4299e1; margin: 0; font-size: 0.85rem; }
|
| 62 |
|
| 63 |
.bird-card {
|
| 64 |
+
background: white; border: 1px solid #e2e8f0; border-radius: 14px;
|
| 65 |
+
padding: 16px; margin: 10px 0; display: flex; gap: 14px;
|
| 66 |
+
box-shadow: 0 3px 10px rgba(0,0,0,0.04);
|
|
|
|
| 67 |
}
|
| 68 |
+
.bird-card img { width: 100px; height: 100px; object-fit: cover; border-radius: 10px; flex-shrink: 0; }
|
|
|
|
| 69 |
.bird-info { flex: 1; min-width: 0; }
|
| 70 |
+
.bird-info h3 { color: #1a202c; margin: 0 0 3px 0; font-size: 1.1rem; font-weight: 700; }
|
| 71 |
+
.bird-info .scientific { color: #718096; font-style: italic; font-size: 0.8rem; margin-bottom: 8px; }
|
| 72 |
+
.confidence { display: inline-block; padding: 3px 10px; border-radius: 16px; font-weight: 700; font-size: 0.75rem; }
|
| 73 |
.conf-high { background: #c6f6d5; color: #22543d; }
|
| 74 |
.conf-med { background: #fefcbf; color: #744210; }
|
| 75 |
.conf-low { background: #fed7d7; color: #742a2a; }
|
| 76 |
+
.reason { color: #4a5568; margin-top: 8px; line-height: 1.5; font-size: 0.85rem; }
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
|
| 78 |
+
.error { background: #fff5f5; border: 1px solid #fc8181; border-radius: 10px; padding: 16px; color: #c53030; }
|
| 79 |
+
.success { background: #f0fff4; border: 1px solid #68d391; border-radius: 10px; padding: 16px; color: #276749; }
|
| 80 |
+
.processing { background: #ebf8ff; border: 1px solid #63b3ed; border-radius: 10px; padding: 16px; color: #2b6cb0; }
|
| 81 |
+
.features-box { background: #f7fafc; border: 1px solid #e2e8f0; border-radius: 8px; padding: 12px; margin: 8px 0; font-size: 0.8rem; }
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 82 |
"""
|
| 83 |
|
| 84 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 85 |
# ================== OLLAMA FUNCTIONS ==================

def check_ollama_models() -> Dict:
    """Check available Ollama models."""
    result = {"available": False, "vision_model": None, "text_model": None}
    try:
        response = requests.get(f"{OLLAMA_URL}/api/tags", timeout=3)
        if response.status_code == 200:
            models = [m["name"] for m in response.json().get("models", [])]
            log(f"Ollama models: {models}")
            result["available"] = True

            # Find vision model
            for m in models:
                if "llava" in m.lower() or "bakllava" in m.lower():
                    result["vision_model"] = m
                    break

            # Find text model
            for m in models:
                if any(t in m.lower() for t in ["llama", "qwen", "mistral", "phi"]):
                    if "llava" not in m.lower():  # Exclude vision models
                        result["text_model"] = m
                        break
    except Exception as e:
        log(f"Ollama check failed: {e}")

    return result


def call_llava(image: Image.Image, prompt: str, model: str) -> str:
    """Call LLaVA vision model."""
    try:
        # Resize image
        max_size = 768
        if max(image.size) > max_size:
            ratio = max_size / max(image.size)
            image = image.resize((int(image.size[0]*ratio), int(image.size[1]*ratio)), Image.Resampling.LANCZOS)

        # Convert to base64
        buffer = io.BytesIO()
        image.save(buffer, format="JPEG", quality=85)
        img_b64 = base64.b64encode(buffer.getvalue()).decode()

        log(f"Calling LLaVA ({model}) with {len(img_b64)} bytes image...")

        response = requests.post(
            f"{OLLAMA_URL}/api/generate",
            # ... (lines 133-134 unchanged, not shown in the diff)
                "prompt": prompt,
                "images": [img_b64],
                "stream": False,
                "options": {"temperature": 0.1, "num_predict": 1200}
            },
            timeout=120
        )

        if response.status_code == 200:
            result = response.json().get("response", "")
            log(f"LLaVA response ({len(result)} chars): {result[:300]}...")
            return result
        else:
            log(f"LLaVA error: {response.status_code} - {response.text[:200]}")
    except Exception as e:
        log(f"LLaVA call failed: {traceback.format_exc()}")
    return ""


def call_ollama_text(prompt: str, model: str) -> str:
    """Call Ollama text model (for audio/description)."""
    try:
        log(f"Calling text model ({model})...")
        response = requests.post(
            f"{OLLAMA_URL}/api/generate",
            json={
                "model": model,
                "prompt": prompt,
                "stream": False,
                "options": {"temperature": 0.2, "num_predict": 800}
            },
            timeout=60
        )
        if response.status_code == 200:
            return response.json().get("response", "")
    except Exception as e:
        log(f"Text model error: {e}")
    return ""
# ================== HUGGINGFACE FUNCTIONS ==================

def call_hf_image_caption(image: Image.Image) -> str:
    """Get image caption from HuggingFace BLIP."""
    if not HF_TOKEN:
        log("No HF_TOKEN")
        return ""

    headers = {"Authorization": f"Bearer {HF_TOKEN}"}

    # Resize
    max_size = 512
    if max(image.size) > max_size:
        ratio = max_size / max(image.size)
        image = image.resize((int(image.size[0]*ratio), int(image.size[1]*ratio)), Image.Resampling.LANCZOS)

    buffer = io.BytesIO()
    image.save(buffer, format="JPEG", quality=80)

    models = [
        "Salesforce/blip-image-captioning-large",
        "Salesforce/blip-image-captioning-base",
    ]

    for model in models:
        try:
            log(f"Trying HF caption model: {model}")
            response = requests.post(
                f"https://api-inference.huggingface.co/models/{model}",
                headers=headers,
                data=buffer.getvalue(),
                timeout=45
            )

            if response.status_code == 200:
                result = response.json()
                if isinstance(result, list) and result:
                    caption = result[0].get("generated_text", "")
                    if caption:
                        log(f"HF caption: {caption}")
                        return caption
            elif response.status_code == 503:
                log(f"{model} loading, trying next...")
            else:
                log(f"HF error {response.status_code}: {response.text[:100]}")
        except Exception as e:
            log(f"HF caption error: {e}")

    return ""


def call_hf_text(prompt: str) -> str:
    """Call HuggingFace text model."""
    if not HF_TOKEN:
        return ""

    headers = {"Authorization": f"Bearer {HF_TOKEN}", "Content-Type": "application/json"}

    models = [
        "mistralai/Mistral-7B-Instruct-v0.2",
        "HuggingFaceH4/zephyr-7b-beta",
        "google/flan-t5-xl",
    ]

    for model in models:
        try:
            log(f"Trying HF text model: {model}")
            response = requests.post(
                f"https://api-inference.huggingface.co/models/{model}",
                headers=headers,
                json={"inputs": prompt, "parameters": {"max_new_tokens": 600, "temperature": 0.3}},
                timeout=45
            )

            if response.status_code == 200:
                result = response.json()
                if isinstance(result, list) and result:
                    text = result[0].get("generated_text", "")
                    if text:
                        log(f"HF text ({len(text)} chars)")
                        return text
            elif response.status_code == 503:
                continue
        except Exception as e:
            log(f"HF text error: {e}")

    return ""
# ================== PARSING ==================

def parse_bird_response(text: str) -> Tuple[List[Dict], str]:
    """Parse LLM response to extract bird identifications. NO HARDCODED FALLBACKS."""
    birds = []
    summary = ""

    if not text:
        return [], ""

    log(f"Parsing response: {text[:500]}...")

    # Try JSON first
    try:
        json_match = re.search(r'\{[\s\S]*"birds"[\s\S]*\}', text)
        if json_match:
            json_str = json_match.group()
            json_str = re.sub(r',(\s*[}\]])', r'\1', json_str)  # Fix trailing commas
            data = json.loads(json_str)

            raw_birds = data.get("birds", [])
            summary = data.get("summary", "")

            for b in raw_birds:
                name = b.get("name", "").strip()
                # Filter out garbage
                if name and len(name) > 2 and name.lower() not in ["the bird", "bird", "unknown", "the image", "image"]:
                    birds.append({
                        "name": name,
                        "scientific_name": b.get("scientific_name", ""),
                        "confidence": min(99, max(1, int(b.get("confidence", 70)))),
                        "reason": b.get("reason", "Identified by AI")
                    })

            if birds:
                return birds, summary
    except json.JSONDecodeError as e:
        log(f"JSON parse error: {e}")

    # Fallback: Extract from text using patterns
    # Look for "This is a/an [Bird Name]" or "[Bird Name] (Scientific name)"
    patterns = [
        r"(?:this is|identified as|appears to be|looks like|most likely)\s+(?:a|an|the)?\s*([A-Z][a-z]+(?:[-\s][A-Za-z]+){0,3})",
        r"([A-Z][a-z]+(?:\s[A-Za-z]+)?)\s*\(([A-Z][a-z]+\s[a-z]+)\)",  # Name (Scientific name)
        r"species[:\s]+([A-Z][a-z]+(?:\s[A-Za-z]+)?)",
    ]

    for pattern in patterns:
        matches = re.findall(pattern, text)
        for match in matches:
            if isinstance(match, tuple):
                name = match[0].strip()
            else:
                name = match.strip()

            # Validate it looks like a bird name
            if name and len(name) > 3 and name.lower() not in ["the bird", "bird", "unknown"]:
                # Check it's not a common non-bird word
                skip_words = ["the", "this", "that", "image", "photo", "picture", "bird", "species"]
                if name.lower() not in skip_words:
                    birds.append({
                        "name": name,
                        "scientific_name": "",
                        "confidence": 65,
                        "reason": "Extracted from AI analysis"
                    })
                    break
        if birds:
            break

    return birds[:3], summary  # Max 3 birds
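
(Aside: a quick exercise of the parser above; a sketch assuming app.py is importable from the Space's working directory. The sample string is illustrative, not a real model output; note the trailing comma, which the regex cleanup repairs before json.loads.)

from app import parse_bird_response

sample = '''Sure! Here is the identification:
{
  "birds": [
    {"name": "House Sparrow", "scientific_name": "Passer domesticus",
     "confidence": 88, "reason": "Stout brown bird with grey crown and black bib",}
  ],
  "summary": "A common House Sparrow."
}'''

birds, summary = parse_bird_response(sample)
print(birds[0]["name"], birds[0]["confidence"])  # House Sparrow 88
print(summary)                                   # A common House Sparrow.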

def get_bird_image(bird_name: str) -> str:
    """Get bird image from Wikipedia."""
    if not bird_name or len(bird_name) < 3:
        return ""

    try:
        # Clean name for Wikipedia
        clean = bird_name.strip().replace(" ", "_")
        url = f"https://en.wikipedia.org/api/rest_v1/page/summary/{urllib.parse.quote(clean)}"

        response = requests.get(url, timeout=5)
        if response.status_code == 200:
            data = response.json()
            if "thumbnail" in data:
                img_url = data["thumbnail"]["source"]
                log(f"Got Wikipedia image for {bird_name}")
                return img_url
            elif "originalimage" in data:
                return data["originalimage"]["source"]
    except Exception as e:
        log(f"Wikipedia image error: {e}")

    # Fallback placeholder with bird name
    return f"https://via.placeholder.com/120x120/4299e1/ffffff?text={urllib.parse.quote(bird_name[:10])}"


def format_bird_card(bird: Dict, index: int) -> str:
    """Format bird as HTML card."""
    name = bird.get("name", "Unknown")
    scientific = bird.get("scientific_name", "")
    confidence = bird.get("confidence", 50)
    reason = bird.get("reason", "")

    img_url = get_bird_image(name)

    conf_class = "conf-high" if confidence >= 80 else "conf-med" if confidence >= 60 else "conf-low"

    return f"""
    <div class="bird-card">
        <img src="{img_url}" alt="{name}" onerror="this.style.display='none'">
        <div class="bird-info">
            <h3>{index}. {name}</h3>
            {f'<div class="scientific">{scientific}</div>' if scientific else ''}
            <span class="confidence {conf_class}">{confidence}% confidence</span>
            <p class="reason">{reason}</p>
        </div>
    </div>"""
# ================== IDENTIFICATION FUNCTIONS ==================

IMAGE_PROMPT = """Look at this bird image carefully. Identify the bird species.

You MUST respond with valid JSON in this exact format:
{
  "birds": [
    {
      "name": "Blue-and-yellow Macaw",
      "scientific_name": "Ara ararauna",
      "confidence": 95,
      "reason": "Large parrot with bright blue wings and yellow underparts, characteristic of this species"
    }
  ],
  "summary": "This is a Blue-and-yellow Macaw, a large South American parrot."
}

Look for:
- Beak shape and color
- Body colors and patterns
- Size and shape
- Any distinctive markings

Give the ACTUAL species name (not "bird" or "unknown"). If unsure, give your best guess with lower confidence.
Return ONLY the JSON."""


def identify_image_stream(image):
    """Identify bird from image."""
    if image is None:
        yield '<div class="error">⚠️ Please upload an image</div>'
        return

    try:
        if not isinstance(image, Image.Image):
            image = Image.fromarray(np.array(image))
        image = image.convert("RGB")

        yield '<div class="processing">🔍 Analyzing image...</div>'

        models = check_ollama_models()
        response = ""
        method = ""

        # Try LLaVA first (best for images)
        if models["vision_model"]:
            yield f'<div class="processing">🦙 Using LLaVA vision model...</div>'
            response = call_llava(image, IMAGE_PROMPT, models["vision_model"])
            method = "LLaVA Vision"

        # Fallback to HuggingFace
        if not response:
            yield '<div class="processing">☁️ Using HuggingFace AI...</div>'

            # Get caption first
            caption = call_hf_image_caption(image)

            if caption:
                yield f'<div class="processing">🔍 Identifying from caption...</div><div class="features-box"><b>AI sees:</b> {caption}</div>'

                # Use text model to identify
                text_prompt = f"""Based on this image description, identify the bird species:

"{caption}"

Respond with JSON:
{{"birds": [{{"name": "Species Name", "scientific_name": "...", "confidence": 80, "reason": "..."}}], "summary": "..."}}

Give the ACTUAL bird species name. Return ONLY JSON."""

                if models["text_model"]:
                    response = call_ollama_text(text_prompt, models["text_model"])
                if not response:
                    response = call_hf_text(text_prompt)
                method = "HuggingFace BLIP + Text"
            else:
                yield '<div class="error">❌ Could not analyze image. HuggingFace API may be unavailable.</div>'
                return

        # Parse response
        birds, summary = parse_bird_response(response)

        if not birds:
            yield f'''<div class="error">
                <b>❌ Could not identify bird species</b>
                <p>The AI response couldn't be parsed. Try a clearer image.</p>
                <div class="features-box"><b>Raw AI response:</b><br>{response[:500] if response else "No response"}</div>
            </div>'''
            return

        # Success
        result = f'''<div class="success">
            <h3>🐦 {len(birds)} Bird(s) Identified!</h3>
            <p>{summary or f"Identified using {method}"}</p>
        </div>'''

        for i, bird in enumerate(birds, 1):
            result += format_bird_card(bird, i)

        yield result

    except Exception as e:
        log(f"Image error: {traceback.format_exc()}")
        yield f'<div class="error">❌ Error: {str(e)}</div>'
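
(Aside: the identify_*_stream functions are generators, which is how the intermediate "processing" messages reach the page; Gradio treats a generator event handler as a streaming update, replacing the output with each successive yield. A minimal self-contained illustration of the pattern, independent of app.py:)

import time
import gradio as gr

def slow_status():
    # Each yield replaces the HTML output, so the user sees progress live.
    yield '<div>step 1: working...</div>'
    time.sleep(1)
    yield '<div>done ✔</div>'

with gr.Blocks() as demo:
    btn = gr.Button("Go")
    out = gr.HTML()
    btn.click(slow_status, None, out)

if __name__ == "__main__":
    demo.launch()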

# ================== AUDIO IDENTIFICATION ==================

def process_audio(audio_data: np.ndarray, sr: int) -> Dict:
    """Extract audio features for bird identification."""
    try:
        audio = audio_data.astype(np.float64)
        if np.max(np.abs(audio)) > 0:
            audio = audio / np.max(np.abs(audio))

        # Bandpass filter (500Hz - 10kHz for birds)
        nyq = sr / 2
        low, high = max(500/nyq, 0.01), min(10000/nyq, 0.99)
        if low < high:
            b, a = signal.butter(4, [low, high], btype='band')
            audio = signal.filtfilt(b, a, audio)

        duration = len(audio_data) / sr

        # Peak frequency
        # ... (line 511 unchanged, not shown in the diff; it computes fft)
        freqs = np.fft.rfftfreq(len(audio), 1/sr)
        peak_freq = freqs[np.argmax(np.abs(fft))] if len(freqs) > 0 else 0

        # Count syllables
        envelope = np.abs(signal.hilbert(audio))
        threshold = np.mean(envelope) + 0.5 * np.std(envelope)
        syllables = np.sum(np.diff((envelope > threshold).astype(int)) > 0)

        return {
            "duration": round(duration, 2),
            "peak_freq": int(peak_freq),
            "syllables": int(syllables),
            "freq_range": "high" if peak_freq > 3000 else "medium" if peak_freq > 1000 else "low"
        }
    except:
        return {"duration": 0, "peak_freq": 0, "syllables": 0, "freq_range": "unknown"}

AUDIO_PROMPT = """You are an expert ornithologist. Identify the bird from these audio features:

- Duration: {duration} seconds
- Peak Frequency: {peak_freq} Hz ({freq_range} range)
- Syllables/notes detected: {syllables}
{extra}

Based on these acoustic features, identify possible bird species.
High frequency (>3000 Hz) = small birds like warblers, finches
Medium frequency (1000-3000 Hz) = thrushes, bulbuls, mynas
Low frequency (<1000 Hz) = larger birds like crows, doves

Respond with JSON ONLY:
{{"birds": [{{"name": "Species Name", "scientific_name": "...", "confidence": 70, "reason": "Matches because..."}}], "summary": "..."}}

Give ACTUAL species names, not generic terms."""


def identify_audio_stream(audio_input, location: str = "", month: str = ""):
    """Identify bird from audio - uses TEXT model, not vision."""
    if audio_input is None:
        yield '<div class="error">⚠️ Please upload or record audio</div>'
        return

    try:
        # ... (lines 555-558 unchanged, not shown in the diff; they unpack
        # audio_input into (sr, audio_data) and bail out on failure)
            return

        if len(audio_data) == 0:
            yield '<div class="error">⚠️ Empty audio</div>'
            return

        if len(audio_data.shape) > 1:
            audio_data = np.mean(audio_data, axis=1)

        yield '<div class="processing">🔍 Analyzing audio features...</div>'

        features = process_audio(audio_data, sr)

        features_html = f'''<div class="features-box">
            <b>🎵 Audio Analysis</b><br>
            • Duration: {features["duration"]}s | Peak: {features["peak_freq"]} Hz ({features["freq_range"]})<br>
            • Syllables: {features["syllables"]}
        </div>'''

        yield f'<div class="processing">🤖 Identifying bird...</div>{features_html}'

        extra = ""
        if location: extra += f"\n- Location: {location}"
        if month: extra += f"\n- Month: {month}"

        prompt = AUDIO_PROMPT.format(**features, extra=extra)

        models = check_ollama_models()
        response = ""

        # Use TEXT model for audio (NOT vision!)
        if models["text_model"]:
            yield f'<div class="processing">🦙 Using {models["text_model"]}...</div>{features_html}'
            response = call_ollama_text(prompt, models["text_model"])

        if not response:
            yield f'<div class="processing">☁️ Using HuggingFace...</div>{features_html}'
            response = call_hf_text(prompt)

        birds, summary = parse_bird_response(response)

        if not birds:
            yield f'''<div class="error">
                <b>Could not identify bird from audio</b>
                <p>Try a clearer recording with less background noise.</p>
                {features_html}
            </div>'''
            return

        result = f'''<div class="success">
            <h3>🐦 {len(birds)} Bird(s) Identified!</h3>
            <p>{summary}</p>
        </div>{features_html}'''

        for i, bird in enumerate(birds, 1):
            result += format_bird_card(bird, i)

        yield result

    except Exception as e:
        log(f"Audio error: {traceback.format_exc()}")
        yield f'<div class="error">❌ Error: {str(e)}</div>'
# ================== DESCRIPTION IDENTIFICATION ==================

def identify_description_stream(description: str):
    """Identify bird from text description."""
    if not description or len(description.strip()) < 5:
        yield '<div class="error">⚠️ Please enter a description</div>'
        return

    try:
        yield '<div class="processing">🔍 Analyzing description...</div>'

        prompt = f"""Identify the bird species from this description:

"{description}"

Respond with JSON:
{{"birds": [{{"name": "Species Name", "scientific_name": "...", "confidence": 80, "reason": "..."}}], "summary": "..."}}

Use ACTUAL species names. Return ONLY JSON."""

        models = check_ollama_models()
        response = ""

        if models["text_model"]:
            yield '<div class="processing">🦙 Using local AI...</div>'
            response = call_ollama_text(prompt, models["text_model"])

        if not response:
            yield '<div class="processing">☁️ Using HuggingFace...</div>'
            response = call_hf_text(prompt)

        birds, summary = parse_bird_response(response)

        if not birds:
            yield '<div class="error"><b>Could not identify bird</b><p>Try adding more details.</p></div>'
            return

        result = f'''<div class="success">
            <h3>🐦 {len(birds)} Bird(s) Match!</h3>
            <p>{summary}</p>
        </div>'''

        for i, bird in enumerate(birds, 1):
            result += format_bird_card(bird, i)

        yield result

    except Exception as e:
        yield f'<div class="error">❌ Error: {str(e)}</div>'
# ================== UI ==================

def get_status_html():
    """Get status indicator."""
    models = check_ollama_models()

    if models["vision_model"]:
        return f'<span class="status-dot status-green"></span> LLaVA + {models["text_model"] or "HF"}'
    elif models["text_model"]:
        return f'<span class="status-dot status-yellow"></span> {models["text_model"]} (no vision)'
    elif HF_TOKEN:
        return '<span class="status-dot status-yellow"></span> HuggingFace Cloud'
    else:
        return '<span class="status-dot status-red"></span> Limited Mode'


def create_app():
    with gr.Blocks(title="BirdSense Pro") as demo:
        gr.HTML(f"<style>{CSS}</style>")

        gr.HTML(f"""
        <div class="header">
            <h1>🐦 BirdSense Pro</h1>
            <p class="subtitle">AI Bird Identification • Audio • Image • Description</p>
            <div class="status">{get_status_html()}</div>
        </div>""")

        # AUDIO FIRST
        with gr.Tab("🎵 Audio"):
            gr.HTML('<div class="info-box"><h3>🎵 Audio Identification</h3><p>Upload or record bird calls. Uses text AI to analyze acoustic features.</p></div>')
            with gr.Row():
                with gr.Column():
                    audio_in = gr.Audio(sources=["upload", "microphone"], type="numpy", label="🎤 Audio")
                    with gr.Row():
                        loc = gr.Textbox(label="📍 Location", placeholder="e.g., Mumbai")
                        mon = gr.Dropdown(label="📅 Month", choices=[""] + ["January","February","March","April","May","June","July","August","September","October","November","December"])
                    audio_btn = gr.Button("🔍 Identify", variant="primary", size="lg")
                with gr.Column():
                    audio_out = gr.HTML('<div style="padding:40px;text-align:center;color:#a0aec0">🎵 Upload audio to identify</div>')
            audio_btn.click(identify_audio_stream, [audio_in, loc, mon], audio_out)

        # IMAGE
        with gr.Tab("📷 Image"):
            gr.HTML('<div class="info-box"><h3>📷 Image Identification</h3><p>Upload a photo. Uses LLaVA vision AI to analyze the actual image.</p></div>')
            with gr.Row():
                with gr.Column():
                    img_in = gr.Image(sources=["upload", "webcam"], type="pil", label="📸 Photo")
                    img_btn = gr.Button("🔍 Identify", variant="primary", size="lg")
                with gr.Column():
                    img_out = gr.HTML('<div style="padding:40px;text-align:center;color:#a0aec0">📷 Upload image to identify</div>')
            img_btn.click(identify_image_stream, [img_in], img_out)

        # DESCRIPTION
        with gr.Tab("📝 Description"):
            gr.HTML('<div class="info-box"><h3>📝 Text Description</h3><p>Describe the bird - colors, size, behavior, sounds.</p></div>')
            with gr.Row():
                with gr.Column():
                    desc_in = gr.Textbox(label="✏️ Description", lines=3, placeholder="e.g., Large blue and yellow parrot with long tail")
                    desc_btn = gr.Button("🔍 Identify", variant="primary", size="lg")
                with gr.Column():
                    desc_out = gr.HTML('<div style="padding:40px;text-align:center;color:#a0aec0">📝 Describe a bird</div>')
            desc_btn.click(identify_description_stream, [desc_in], desc_out)

        gr.HTML('<div style="text-align:center;padding:10px;color:#718096;font-size:0.8rem"><b>BirdSense Pro</b> • Local: LLaVA (image) + Llama3.2 (audio/text) • Cloud: HuggingFace BLIP</div>')

    return demo


if __name__ == "__main__":
    log("Starting BirdSense Pro...")
    models = check_ollama_models()
    log(f"Vision: {models['vision_model']}, Text: {models['text_model']}, HF: {bool(HF_TOKEN)}")

    app = create_app()
    app.launch(server_name="0.0.0.0", server_port=7860, show_error=True)
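
(Aside: to run the Space locally, a minimal sketch. OLLAMA_URL must be set before app is imported, because the module reads it at import time; the HF token is optional and only enables the cloud fallback. The token value below is a placeholder.)

import os
os.environ.setdefault("OLLAMA_URL", "http://localhost:11434")
# os.environ["HF_TOKEN"] = "hf_..."   # optional cloud fallback (placeholder token)

from app import create_app
create_app().launch(server_name="0.0.0.0", server_port=7860, show_error=True)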