mahmoudsaber0 committed
Commit 080d131 · verified · 1 Parent(s): ab32be2

Update app.py

Files changed (1):
  1. app.py +93 -142
app.py CHANGED
@@ -1,145 +1,96 @@
  import os
- import re
  import torch
- from fastapi import FastAPI
- from pydantic import BaseModel
- from transformers import AutoTokenizer, AutoModelForSequenceClassification
- from tokenizers import normalizers
- from tokenizers.normalizers import Sequence, Replace, Strip
- from tokenizers import Regex
-
- # ✅ Environment cache setup for safe deployment
- os.environ["HF_HOME"] = "/tmp"
- os.environ["TRANSFORMERS_CACHE"] = "/tmp"
- os.environ["HF_DATASETS_CACHE"] = "/tmp"
- os.environ["HF_HUB_CACHE"] = "/tmp"
-
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
- # ✅ Model & Tokenizer Setup
- model1_path = "modernbert.bin"
- model2_path = "https://huggingface.co/mihalykiss/modernbert_2/resolve/main/Model_groups_3class_seed12"
- model3_path = "https://huggingface.co/mihalykiss/modernbert_2/resolve/main/Model_groups_3class_seed22"
-
- tokenizer = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base")
-
- # ✅ Load 3 models
- model_1 = AutoModelForSequenceClassification.from_pretrained("answerdotai/ModernBERT-base", num_labels=41)
- model_1.load_state_dict(torch.load(model1_path, map_location=device))
- model_1.to(device).eval()
-
- model_2 = AutoModelForSequenceClassification.from_pretrained("answerdotai/ModernBERT-base", num_labels=41)
- model_2.load_state_dict(torch.hub.load_state_dict_from_url(model2_path, map_location=device))
- model_2.to(device).eval()
-
- model_3 = AutoModelForSequenceClassification.from_pretrained("answerdotai/ModernBERT-base", num_labels=41)
- model_3.load_state_dict(torch.hub.load_state_dict_from_url(model3_path, map_location=device))
- model_3.to(device).eval()
-
- # ✅ Label mapping
- label_mapping = {
-     0: '13B', 1: '30B', 2: '65B', 3: '7B', 4: 'GLM130B', 5: 'bloom_7b',
-     6: 'bloomz', 7: 'cohere', 8: 'davinci', 9: 'dolly', 10: 'dolly-v2-12b',
-     11: 'flan_t5_base', 12: 'flan_t5_large', 13: 'flan_t5_small',
-     14: 'flan_t5_xl', 15: 'flan_t5_xxl', 16: 'gemma-7b-it', 17: 'gemma2-9b-it',
-     18: 'gpt-3.5-turbo', 19: 'gpt-35', 20: 'gpt4', 21: 'gpt4o',
-     22: 'gpt_j', 23: 'gpt_neox', 24: 'human', 25: 'llama3-70b', 26: 'llama3-8b',
-     27: 'mixtral-8x7b', 28: 'opt_1.3b', 29: 'opt_125m', 30: 'opt_13b',
-     31: 'opt_2.7b', 32: 'opt_30b', 33: 'opt_350m', 34: 'opt_6.7b',
-     35: 'opt_iml_30b', 36: 'opt_iml_max_1.3b', 37: 't0_11b', 38: 't0_3b',
-     39: 'text-davinci-002', 40: 'text-davinci-003'
- }
-
- # ✅ Text cleaning and normalization
- def clean_text(text: str) -> str:
-     text = re.sub(r'\s{2,}', ' ', text)
-     text = re.sub(r'\s+([,.;:?!])', r'\1', text)
-     return text
-
- newline_to_space = Replace(Regex(r'\s*\n\s*'), " ")
- join_hyphen_break = Replace(Regex(r'(\w+)[--]\s*\n\s*(\w+)'), r"\1\2")
-
- tokenizer.backend_tokenizer.normalizer = Sequence([
-     tokenizer.backend_tokenizer.normalizer,
-     join_hyphen_break,
-     newline_to_space,
-     Strip()
- ])
-
- # ✅ FastAPI app
- app = FastAPI(title="ModernBERT AI Text Detector")
-
- class InputText(BaseModel):
-     text: str
-
- def classify_text_ensemble(text: str):
-     """Run ensemble classification and return percentages + identified model"""
-     cleaned_text = clean_text(text)
-     if not cleaned_text.strip():
-         return None
-
-     inputs = tokenizer(cleaned_text, return_tensors="pt", truncation=True, padding=True).to(device)
-
-     with torch.no_grad():
-         logits_1 = model_1(**inputs).logits
-         logits_2 = model_2(**inputs).logits
-         logits_3 = model_3(**inputs).logits
-
-     softmax_1 = torch.softmax(logits_1, dim=1)
-     softmax_2 = torch.softmax(logits_2, dim=1)
-     softmax_3 = torch.softmax(logits_3, dim=1)
-
-     averaged_probabilities = (softmax_1 + softmax_2 + softmax_3) / 3
-     probabilities = averaged_probabilities[0]
-
-     human_prob = probabilities[24].item()
-     ai_probs_clone = probabilities.clone()
-     ai_probs_clone[24] = 0
-     ai_total_prob = ai_probs_clone.sum().item()
-
-     total = human_prob + ai_total_prob
-     human_percentage = (human_prob / total) * 100
-     ai_percentage = (ai_total_prob / total) * 100
-
-     ai_argmax_index = torch.argmax(ai_probs_clone).item()
-     ai_model_name = label_mapping[ai_argmax_index]
-
-     return {
-         "ai_percentage": round(ai_percentage, 2),
-         "human_percentage": round(human_percentage, 2),
-         "identified_model": ai_model_name,
-         "is_ai": ai_percentage > human_percentage
-     }
-
- @app.get("/")
- def root():
-     return {"message": "ModernBERT AI Text Detector API is running. Use POST /analyze"}
-
  @app.post("/analyze")
- async def analyze(data: InputText):
-     text = data.text.strip()
-     if not text:
-         return {"success": False, "code": 400, "message": "Empty text"}
-
-     result = classify_text_ensemble(text)
-     if not result:
-         return {"success": False, "code": 400, "message": "Text too short or invalid"}
-
-     feedback = (
-         f"The text is {result['human_percentage']}% likely human-written."
-         if not result["is_ai"]
-         else f"The text is {result['ai_percentage']}% likely AI-generated. Identified LLM: {result['identified_model']}."
-     )
-
-     return {
-         "success": True,
-         "code": 200,
-         "data": {
-             "input_text": text,
-             "ai_percentage": result["ai_percentage"],
-             "human_percentage": result["human_percentage"],
-             "identified_model": result["identified_model"],
-             "feedback": feedback,
-             "is_ai": result["is_ai"]
-         }
-     }
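For reference, the removed version served a single POST /analyze route returning human/AI percentages plus the most likely source model. A minimal client sketch against that old contract (the host and port are assumptions; the diff does not specify how the app is served):

# Hypothetical client for the removed ensemble endpoint; assumes the
# server runs locally on port 8000, e.g. via uvicorn app:app.
import requests

resp = requests.post(
    "http://localhost:8000/analyze",
    json={"text": "Sample passage to classify."},
    timeout=30,
)
print(resp.json())
# On success, the removed handler returned:
# {"success": True, "code": 200, "data": {"ai_percentage": ..., "human_percentage": ...,
#  "identified_model": ..., "feedback": ..., "is_ai": ...}}

The rewritten app.py below replaces the three-model ensemble and custom tokenizer normalization with a single text-classification pipeline, and adds CORS middleware and a WebSocket endpoint.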
  import os
+ from fastapi import FastAPI, WebSocket, WebSocketDisconnect, UploadFile, File
+ from fastapi.middleware.cors import CORSMiddleware
+ from fastapi.responses import JSONResponse
  import torch
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
+ import asyncio
+
+ # =====================================================
+ # ✅ Fix Hugging Face Cache Permission Errors
+ # =====================================================
+ CACHE_DIR = "/tmp/hf_cache"
+ os.environ["HF_HOME"] = CACHE_DIR
+ os.environ["TRANSFORMERS_CACHE"] = CACHE_DIR
+ os.environ["HF_DATASETS_CACHE"] = CACHE_DIR
+ os.environ["HF_HUB_CACHE"] = CACHE_DIR
+ os.makedirs(CACHE_DIR, exist_ok=True)
+
+ # =====================================================
+ # ✅ Initialize Model and Tokenizer
+ # =====================================================
+ MODEL_NAME = "answerdotai/ModernBERT-base"
+
+ print("Loading model and tokenizer...")
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+ # NOTE: ModernBERT-base is a base checkpoint, so this classification head
+ # is randomly initialized; swap in a fine-tuned checkpoint for real labels.
+ model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
+
+ classifier = pipeline(
+     "text-classification",
+     model=model,
+     tokenizer=tokenizer,
+     device=0 if torch.cuda.is_available() else -1
+ )
+
+ # =====================================================
+ # ✅ FastAPI App Setup
+ # =====================================================
+ app = FastAPI(title="ModernBERT FastAPI Server")
+
+ # Allow all origins (for testing)
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=["*"],
+     allow_credentials=True,
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+
+ # =====================================================
+ # ✅ REST Endpoint Example
+ # =====================================================
  @app.post("/analyze")
+ async def analyze_text(data: dict):
+     try:
+         text = data.get("text", "")
+         if not text.strip():
+             return JSONResponse({"error": "Empty text provided"}, status_code=400)
+
+         result = classifier(text)
+         return {"result": result}
+     except Exception as e:
+         return JSONResponse({"error": str(e)}, status_code=500)
+
+ # =====================================================
+ # ✅ WebSocket Endpoint (real-time classification)
+ # =====================================================
+ @app.websocket("/ws")
+ async def websocket_endpoint(ws: WebSocket):
+     await ws.accept()
+     idle_timeout = 60  # seconds
+     last_activity = asyncio.get_running_loop().time()
+
+     async def close_if_idle():
+         # Close the socket only after a full idle_timeout window
+         # with no incoming messages.
+         while True:
+             await asyncio.sleep(idle_timeout)
+             if asyncio.get_running_loop().time() - last_activity >= idle_timeout:
+                 await ws.close(code=1000)
+                 break
+
+     watchdog = asyncio.create_task(close_if_idle())
+
+     try:
+         while True:
+             message = await ws.receive_text()
+             last_activity = asyncio.get_running_loop().time()
+             if message.lower() in ["exit", "quit"]:
+                 await ws.close(code=1000)
+                 break
+             result = classifier(message)
+             await ws.send_json(result)
+     except WebSocketDisconnect:
+         pass  # client disconnected (or the watchdog closed the socket)
+     finally:
+         watchdog.cancel()  # stop the idle watchdog once the session ends
+
+ # =====================================================
+ # ✅ Root Endpoint
+ # =====================================================
+ @app.get("/")
+ def home():
+     return {"status": "ok", "model": MODEL_NAME, "device": "cuda" if torch.cuda.is_available() else "cpu"}
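A quick way to exercise the new API (the base URL and the third-party websockets client library are assumptions, not part of this commit):

# Hypothetical smoke test for the rewritten endpoints; assumes the server
# is reachable at localhost:8000 and that requests and websockets are installed.
import asyncio
import requests
import websockets

# REST: POST a JSON body carrying a "text" field.
print(requests.post(
    "http://localhost:8000/analyze",
    json={"text": "Hello from the client."},
    timeout=30,
).json())

async def ws_demo():
    async with websockets.connect("ws://localhost:8000/ws") as ws:
        await ws.send("Classify this sentence.")  # plain text in
        print(await ws.recv())                    # JSON result out
        await ws.send("exit")                     # tells the server to close

asyncio.run(ws_demo())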