kasimali committed
Commit ee405f2 · verified · 1 Parent(s): 875e456

Update app.py

Files changed (1): app.py +120 -182
app.py CHANGED
@@ -1,190 +1,128 @@
  import os
  import re
- import gradio as gr
  import fasttext
  import torch
  from transformers import AutoTokenizer
  from huggingface_hub import hf_hub_download

- # ---------------------------------------------------------------------------------
- # Configuration (thresholds and tokenizer)
- # ---------------------------------------------------------------------------------
- ROMAN_SPLIT_THRESHOLD = float(os.getenv("INDICLID_INPUT_ROMAN_THRESHOLD", "0.5"))  # >50% A–Z => romanized
- FTR_CONF_THRESHOLD = float(os.getenv("INDICLID_ROMAN_CONF_THRESHOLD", "0.6"))  # FTR prob threshold for BERT fallback
- BERT_TOKENIZER_ID = os.getenv("INDICLID_BERT_TOKENIZER", "ai4bharat/IndicBERTv2-MLM-only")
-
- # Persist Hugging Face cache if Space has persistent storage enabled
- os.environ["HF_HOME"] = os.getenv("HF_HOME", "/data/.huggingface")
-
- # Local filenames (no models/ folder)
- FTN_LOCAL = "indiclid_ftn.bin"
- FTR_LOCAL = "indiclid_ftr.bin"
- BERT_LOCAL = "indiclid_bert.pt"
-
- # Repos and filenames confirmed from upstream
- # - FTN fastText (native): model_baseline_roman.bin (as used in official paths in repo)
- # - FTR fastText (roman): model_baseline_roman.bin
- # - BERT fallback: basline_nn_simple.pt
- FTN_REPO = "ai4bharat/IndicLID-FTN"  # file exists in repo; official code references this filename
- FTN_FILENAME = "model_baseline_roman.bin"  # per upstream repo path usage
-
- FTR_REPO = "ai4bharat/IndicLID-FTR"
- FTR_FILENAME = "model_baseline_roman.bin"  # per HF commit/files
-
- BERT_REPO = "ai4bharat/IndicLID-BERT"
- BERT_FILENAME = "basline_nn_simple.pt"  # per HF file listing
-
- # ---------------------------------------------------------------------------------
- # Utilities
- # ---------------------------------------------------------------------------------
- def ensure_artifact(local_path: str, repo_id: str, filename: str):
-     if os.path.exists(local_path):
-         return local_path
-     downloaded = hf_hub_download(repo_id=repo_id, filename=filename)
-     if downloaded != local_path:
-         try:
-             os.rename(downloaded, local_path)
-         except Exception:
-             import shutil
-             shutil.copyfile(downloaded, local_path)
-     return local_path
-
- # ---------------------------------------------------------------------------------
- # Download and load models
- # ---------------------------------------------------------------------------------
- FTN_PATH = ensure_artifact(FTN_LOCAL, FTN_REPO, FTN_FILENAME)  # native-script fastText
- FTR_PATH = ensure_artifact(FTR_LOCAL, FTR_REPO, FTR_FILENAME)  # roman fastText
- BERT_PATH = ensure_artifact(BERT_LOCAL, BERT_REPO, BERT_FILENAME)  # BERT fallback
-
- DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
-
- ftn_model = fasttext.load_model(FTN_PATH)
- ftr_model = fasttext.load_model(FTR_PATH)
-
- # Note: basline_nn_simple.pt is a ready-to-call module in the official pipeline that returns logits
- bert_model = torch.load(BERT_PATH, map_location=DEVICE)
- if hasattr(bert_model, "to"):
-     bert_model = bert_model.to(DEVICE)
- bert_model.eval()
- tokenizer = AutoTokenizer.from_pretrained(BERT_TOKENIZER_ID)
-
- # If the checkpoint exposes label_map_reverse, prefer it; else use a safe placeholder.
- LABEL_MAP_REVERSE = getattr(bert_model, "label_map_reverse", None)
- if LABEL_MAP_REVERSE is None:
-     # Replace with official mapping from the IndicLID inference file for exact codes.
-     LABEL_MAP_REVERSE = {i: f"label_{i}" for i in range(60)}
-
- # ---------------------------------------------------------------------------------
- # Inference helpers
- # ---------------------------------------------------------------------------------
- def roman_char_ratio(text: str) -> float:
-     if not text:
-         return 0.0
-     roman = len(re.findall(r"[A-Za-z]", text))
-     return roman / max(len(text), 1)
-
- def predict_ftn(texts):
-     labels, scores = ftn_model.predict(texts)
-     out = []
-     for t, ls, sc in zip(texts, labels, scores):
-         out.append({
-             "text": t,
-             "label": ls[0].replace("__label__", ""),
-             "score": float(sc[0]),
-             "model": "IndicLID-FTN"
-         })
-     return out
-
- def ftr_predict_or_route(texts):
-     labels, scores = ftr_model.predict(texts)
-     kept, route = [], []
-     for idx, (t, ls, sc) in enumerate(zip(texts, labels, scores)):
-         conf = float(sc[0])
-         lbl = ls[0].replace("__label__", "")
-         if conf > FTR_CONF_THRESHOLD:
-             kept.append({"index": idx, "text": t, "label": lbl, "score": conf, "model": "IndicLID-FTR"})
-         else:
-             route.append((idx, t))
-     return kept, route
-
- @torch.no_grad()
- def bert_predict(indexed_inputs):
-     if not indexed_inputs:
-         return []
-     idxs = [i for i, _ in indexed_inputs]
-     texts = [t for _, t in indexed_inputs]
-     enc = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=512)
-     for k in enc:
-         enc[k] = enc[k].to(DEVICE)
-     outputs = bert_model(
-         enc["input_ids"],
-         token_type_ids=enc.get("token_type_ids"),
-         attention_mask=enc.get("attention_mask"),
-     )
-     logits = outputs.logits if hasattr(outputs, "logits") else outputs[0]
-     probs = torch.softmax(logits, dim=1)
-     preds = torch.argmax(probs, dim=1)
-     scores = probs.gather(1, preds.unsqueeze(1)).squeeze(1)
-     results = []
-     for i, t, p, s in zip(idxs, texts, preds, scores):
-         label_idx = int(p.item())
-         label = LABEL_MAP_REVERSE.get(label_idx, str(label_idx))
-         results.append({"index": i, "text": t, "label": label, "score": float(s.item()), "model": "IndicLID-BERT"})
-     results.sort(key=lambda x: x["index"])
-     return results
-
- def ensemble_predict(texts):
-     roman_inputs, native_inputs = [], []
-     for i, t in enumerate(texts):
-         if roman_char_ratio(t) > ROMAN_SPLIT_THRESHOLD:
-             roman_inputs.append((i, t))
-         else:
-             native_inputs.append((i, t))
-
-     outputs = {}
-
-     if native_inputs:
-         nat_texts = [t for _, t in native_inputs]
-         nat_out = predict_ftn(nat_texts)
-         for (i, _), r in zip(native_inputs, nat_out):
-             outputs[i] = r
-
-     if roman_inputs:
-         rom_texts = [t for _, t in roman_inputs]
-         ftr_kept, bert_inputs = ftr_predict_or_route(rom_texts)
-         for kept in ftr_kept:
-             i_orig = roman_inputs[kept["index"]][0]
-             outputs[i_orig] = {
-                 "text": kept["text"], "label": kept["label"], "score": kept["score"], "model": kept["model"]
-             }
-         if bert_inputs:
-             bert_out = bert_predict(bert_inputs)
-             for r in bert_out:
-                 i_orig = roman_inputs[r["index"]][0]
-                 outputs[i_orig] = {
-                     "text": r["text"], "label": r["label"], "score": r["score"], "model": r["model"]
-                 }
-
-     return [outputs[i] for i in sorted(outputs.keys())]
-
- # ---------------------------------------------------------------------------------
- # Gradio UI
- # ---------------------------------------------------------------------------------
- def detect(texts_str: str):
-     if not texts_str or not texts_str.strip():
-         return []
-     lines = [t.strip() for t in texts_str.split("\n") if t.strip()]
-     return ensemble_predict(lines)
-
- with gr.Blocks(title="IndicLID Ensemble (AI4Bharat) — Gradio Space") as demo:
-     gr.Markdown(
-         "## IndicLID Ensemble (AI4Bharat)\n"
-         "Two-stage LID for 22 Indian languages (47 classes), with native fastText (FTN), roman fastText (FTR), "
-         "and IndicBERT fallback for low-confidence romanized inputs."
-     )
-     inp = gr.Textbox(lines=8, label="Enter text(s) — one per line")
-     out = gr.JSON(label="Predictions")
-     gr.Button("Detect").click(fn=detect, inputs=inp, outputs=out)
-
  if __name__ == "__main__":
-     demo.launch()
  import os
  import re
+ import pandas as pd
  import fasttext
  import torch
+ from torch.utils.data import Dataset, DataLoader
  from transformers import AutoTokenizer
  from huggingface_hub import hf_hub_download

+ # ------------------------------
+ # Download models automatically
+ # ------------------------------
+ print("Downloading IndicLID models from Hugging Face...")
+
+ FTN_PATH = hf_hub_download("ai4bharat/IndicLID-FTN", filename="model_baseline_roman.bin")
+ FTR_PATH = hf_hub_download("ai4bharat/IndicLID-FTR", filename="model_baseline_roman.bin")
+ BERT_PATH = hf_hub_download("ai4bharat/IndicLID-BERT", filename="basline_nn_simple.pt")
+
+ print("Download complete.")
+
+ # ------------------------------
+ # Dataset class for BERT batching
+ # ------------------------------
+ class IndicBERT_Data(Dataset):
+     def __init__(self, indices, X):
+         self.x = list(X)
+         self.i = list(indices)
+     def __len__(self):
+         return len(self.x)
+     def __getitem__(self, idx):
+         return self.i[idx], self.x[idx]
+
+ # ------------------------------
+ # Full IndicLID Ensemble
+ # ------------------------------
+ class IndicLID:
+     def __init__(self, input_threshold=0.5, roman_lid_threshold=0.6):
+         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+         self.IndicLID_FTN = fasttext.load_model(FTN_PATH)
+         self.IndicLID_FTR = fasttext.load_model(FTR_PATH)
+         self.IndicLID_BERT = torch.load(BERT_PATH, map_location=self.device)
+         self.IndicLID_BERT.eval()
+         self.IndicLID_BERT_tokenizer = AutoTokenizer.from_pretrained("ai4bharat/IndicBERTv2-MLM-only")
+         self.input_threshold = input_threshold
+         self.model_threshold = roman_lid_threshold
+
+         # Official label mapping from AI4Bharat
+         self.label_map_reverse = {
+             0: 'asm_Latn', 1: 'ben_Latn', 2: 'brx_Latn', 3: 'guj_Latn', 4: 'hin_Latn',
+             5: 'kan_Latn', 6: 'kas_Latn', 7: 'kok_Latn', 8: 'mai_Latn', 9: 'mal_Latn',
+             10: 'mni_Latn', 11: 'mar_Latn', 12: 'nep_Latn', 13: 'ori_Latn', 14: 'pan_Latn',
+             15: 'san_Latn', 16: 'snd_Latn', 17: 'tam_Latn', 18: 'tel_Latn', 19: 'urd_Latn',
+             20: 'eng_Latn', 21: 'other', 22: 'asm_Beng', 23: 'ben_Beng', 24: 'brx_Deva',
+             25: 'doi_Deva', 26: 'guj_Gujr', 27: 'hin_Deva', 28: 'kan_Knda', 29: 'kas_Arab',
+             30: 'kas_Deva', 31: 'kok_Deva', 32: 'mai_Deva', 33: 'mal_Mlym', 34: 'mni_Beng',
+             35: 'mni_Meti', 36: 'mar_Deva', 37: 'nep_Deva', 38: 'ori_Orya', 39: 'pan_Guru',
+             40: 'san_Deva', 41: 'sat_Olch', 42: 'snd_Arab', 43: 'tam_Tamil', 44: 'tel_Telu',
+             45: 'urd_Arab'
+         }
+
+     def char_percent_check(self, text):
+         # Fraction of alphabetic characters that are Latin letters
+         total_chars = sum(c.isalpha() for c in text)
+         roman_chars = sum(bool(re.match(r"[A-Za-z]", c)) for c in text)
+         return roman_chars / total_chars if total_chars else 0
+
+     def native_inference(self, data, out_dict):
+         if not data: return out_dict
+         texts = [x[1] for x in data]
+         preds = self.IndicLID_FTN.predict(texts)
+         for (idx, txt), lbls, scrs in zip(data, preds[0], preds[1]):
+             out_dict[idx] = (txt, lbls[0][9:], float(scrs[0]), 'IndicLID-FTN')
+         return out_dict
+
+     def ftr_inference(self, data, out_dict, batch_size):
+         if not data: return out_dict
+         texts = [x[1] for x in data]
+         preds = self.IndicLID_FTR.predict(texts)
+         bert_inputs = []
+         for (idx, txt), lbls, scrs in zip(data, preds[0], preds[1]):
+             if float(scrs[0]) > self.model_threshold:
+                 out_dict[idx] = (txt, lbls[0][9:], float(scrs[0]), 'IndicLID-FTR')
+             else:
+                 bert_inputs.append((idx, txt))
+         return self.bert_inference(bert_inputs, out_dict, batch_size)
+
+     def bert_inference(self, data, out_dict, batch_size):
+         if not data: return out_dict
+         ds = IndicBERT_Data([x[0] for x in data], [x[1] for x in data])
+         dl = DataLoader(ds, batch_size=batch_size)
+         with torch.no_grad():
+             for idxs, texts in dl:
+                 enc = self.IndicLID_BERT_tokenizer(list(texts), return_tensors="pt", padding=True,
+                                                    truncation=True, max_length=512).to(self.device)
+                 outputs = self.IndicLID_BERT(**enc)
+                 preds = torch.argmax(outputs.logits, dim=1)
+                 probs = torch.softmax(outputs.logits, dim=1)
+                 for row, (i, t, p) in enumerate(zip(idxs, texts, preds)):
+                     label = self.label_map_reverse[p.item()]
+                     # Index the softmax output by batch row; indexing by the original
+                     # text index would read the wrong row (or go out of range).
+                     score = probs[row, p].item()
+                     out_dict[i.item()] = (t, label, score, 'IndicLID-BERT')
+         return out_dict
+
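+     # batch_predict routes each input: mostly-Latin text goes to the roman
+     # fastText model, whose low-confidence predictions fall through to
+     # IndicBERT; everything else goes to the native-script fastText model.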
+     def batch_predict(self, texts, batch_size=8):
+         native, roman = [], []
+         for i, t in enumerate(texts):
+             if self.char_percent_check(t) > self.input_threshold:
+                 roman.append((i, t))
+             else:
+                 native.append((i, t))
+         out_dict = {}
+         out_dict = self.native_inference(native, out_dict)
+         out_dict = self.ftr_inference(roman, out_dict, batch_size)
+         return [out_dict[i] for i in sorted(out_dict.keys())]
+
+ # ------------------------------
+ # Run a quick test
+ # ------------------------------
  if __name__ == "__main__":
+     detector = IndicLID()
+     samples = [
+         "यह एक हिंदी वाक्य है।",
+         "ennai pudikkuma?",
+         "ఇది ఒక తెలుగు వాక్యం",
+         "Hello, how are you?"
+     ]
+     results = detector.batch_predict(samples)
+     for text, label, score, model in results:
+         print(f"Text: {text}\nPredicted: {label} | Score: {score:.4f} | Model: {model}\n")