import os
import re
import torch
from fastapi import FastAPI
from pydantic import BaseModel
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from tokenizers.normalizers import Sequence, Replace, Strip
from tokenizers import Regex
# --- Fix PyTorch Inductor UID issue ---
os.environ["TORCHINDUCTOR_DISABLE"] = "1"
os.environ["USER"] = os.environ.get("USER", "appuser")
CACHE_DIR = "/tmp/hf_cache"
os.makedirs(CACHE_DIR, exist_ok=True)
os.environ["HF_HOME"] = CACHE_DIR
os.environ["TRANSFORMERS_CACHE"] = CACHE_DIR
os.environ["HF_DATASETS_CACHE"] = CACHE_DIR
os.environ["HF_HUB_CACHE"] = CACHE_DIR
os.environ["TORCH_HOME"] = CACHE_DIR
os.environ["XDG_CACHE_HOME"] = CACHE_DIR
# =====================================================
# ✅ Model Setup
# =====================================================
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base")
# --- Paths or URLs to the fine-tuned classifier weights ---
model1_path = "modernbert.bin"
model2_path = "https://huggingface.co/mihalykiss/modernbert_2/resolve/main/Model_groups_3class_seed12"
model3_path = "https://huggingface.co/mihalykiss/modernbert_2/resolve/main/Model_groups_3class_seed22"
def load_model(base_path, url=None):
    """Load a 41-class ModernBERT classifier from a local file or a remote URL."""
    model = AutoModelForSequenceClassification.from_pretrained(
        "answerdotai/ModernBERT-base", num_labels=41
    )
    if url:
        state_dict = torch.hub.load_state_dict_from_url(url, map_location=device)
    else:
        state_dict = torch.load(base_path, map_location=device)
    model.load_state_dict(state_dict)
    model.to(device).eval()
    return model
model_1 = load_model(model1_path)
model_2 = load_model(None, model2_path)
model_3 = load_model(None, model3_path)
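# The three checkpoints above form an ensemble: analyze_text_block() averages
# their softmax outputs, which smooths out disagreements between the seeds.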
# =====================================================
# ✅ Label Mapping & Normalization
# =====================================================
label_mapping = {
    0: '13B', 1: '30B', 2: '65B', 3: '7B', 4: 'GLM130B', 5: 'bloom_7b',
    6: 'bloomz', 7: 'cohere', 8: 'davinci', 9: 'dolly', 10: 'dolly-v2-12b',
    11: 'flan_t5_base', 12: 'flan_t5_large', 13: 'flan_t5_small',
    14: 'flan_t5_xl', 15: 'flan_t5_xxl', 16: 'gemma-7b-it', 17: 'gemma2-9b-it',
    18: 'gpt-3.5-turbo', 19: 'gpt-35', 20: 'gpt4', 21: 'gpt4o',
    22: 'gpt_j', 23: 'gpt_neox', 24: 'human', 25: 'llama3-70b', 26: 'llama3-8b',
    27: 'mixtral-8x7b', 28: 'opt_1.3b', 29: 'opt_125m', 30: 'opt_13b',
    31: 'opt_2.7b', 32: 'opt_30b', 33: 'opt_350m', 34: 'opt_6.7b',
    35: 'opt_iml_30b', 36: 'opt_iml_max_1.3b', 37: 't0_11b', 38: 't0_3b',
    39: 'text-davinci-002', 40: 'text-davinci-003'
}
# Clean and normalize text before tokenization
def clean_text(text: str) -> str:
    text = re.sub(r'\s{2,}', ' ', text)
    text = re.sub(r'\s+([,.;:?!])', r'\1', text)
    return text
newline_to_space = Replace(Regex(r'\s*\n\s*'), " ")
join_hyphen_break = Replace(Regex(r'(\w+)[--]\s*\n\s*(\w+)'), r"\1\2")
tokenizer.backend_tokenizer.normalizer = Sequence([
tokenizer.backend_tokenizer.normalizer,
join_hyphen_break,
newline_to_space,
Strip()
])
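# Illustrative normalizer behavior (assumed inputs, shown for clarity):
#   "exam-\nple text"    -> "example text"       (hyphen line break joined)
#   "line one\nline two" -> "line one line two"  (newline collapsed to a space)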
# =====================================================
# ✅ Core Analysis Function
# =====================================================
def analyze_text_block(text: str):
    cleaned_text = clean_text(text)
    inputs = tokenizer(cleaned_text, return_tensors="pt", truncation=True, padding=True).to(device)
    with torch.no_grad():
        logits_1 = model_1(**inputs).logits
        logits_2 = model_2(**inputs).logits
        logits_3 = model_3(**inputs).logits
    # Ensemble: average the softmax distributions of the three models.
    softmax_1 = torch.softmax(logits_1, dim=1)
    softmax_2 = torch.softmax(logits_2, dim=1)
    softmax_3 = torch.softmax(logits_3, dim=1)
    avg_probs = (softmax_1 + softmax_2 + softmax_3) / 3
    probs = avg_probs[0]
    # Index 24 is the 'human' class; every other class counts as AI-generated.
    human_prob = probs[24].item()
    ai_probs_clone = probs.clone()
    ai_probs_clone[24] = 0
    ai_total_prob = ai_probs_clone.sum().item()
    # Renormalize so the human and AI scores sum to 100%.
    total = human_prob + ai_total_prob
    human_percentage = (human_prob / total) * 100
    ai_percentage = (ai_total_prob / total) * 100
    # Most likely source model among the non-human classes.
    ai_model_index = torch.argmax(ai_probs_clone).item()
    ai_model_label = label_mapping[ai_model_index]
    return {
        "human_written_score": round(human_percentage / 100, 4),
        "ai_generated_score": round(ai_percentage / 100, 4),
        "predicted_model": ai_model_label
    }
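# Illustrative call (output values are hypothetical, shown for shape only):
#   analyze_text_block("Some paragraph of text ...")
#   -> {'human_written_score': 0.9312, 'ai_generated_score': 0.0688,
#       'predicted_model': 'gpt4o'}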
# =====================================================
# ✅ Helper: Paragraph Split
# =====================================================
def split_into_paragraphs(text: str):
    paragraphs = re.split(r'\n\s*\n', text.strip())
    return [p.strip() for p in paragraphs if p.strip()]
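# Example: "para one\n\npara two\n \nthree" -> ["para one", "para two", "three"]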
# =====================================================
# ✅ FastAPI Setup
# =====================================================
app = FastAPI(title="ModernBERT AI Text Detector")
class InputText(BaseModel):
    text: str
@app.post("/analyze")
async def analyze(data: InputText):
text = data.text.strip()
if not text:
return {"success": False, "code": 400, "message": "Empty input text"}
total_words = len(text.split())
full_result = analyze_text_block(text)
fake_percentage = round(full_result["ai_generated_score"] * 100, 2)
ai_words = int(total_words * (fake_percentage / 100))
results = []
if fake_percentage > 50:
paragraphs = split_into_paragraphs(text)
ai_words, total_words = 0, 0
for p in paragraphs:
res = analyze_text_block(p)
wc = len(p.split())
total_words += wc
ai_words += wc * res["ai_generated_score"]
results.append({
"paragraph": p,
"ai_generated_score": res["ai_generated_score"],
"human_written_score": res["human_written_score"],
"predicted_model": res["predicted_model"]
})
fake_percentage = round((ai_words / total_words) * 100, 2)
feedback = (
"Most of Your Text is AI/GPT Generated"
if fake_percentage > 50
else "Most of Your Text Appears Human-Written"
)
return {
"success": True,
"code": 200,
"message": "analysis completed",
"data": {
"fakePercentage": fake_percentage,
"isHuman": round(100 - fake_percentage, 2),
"textWords": total_words,
"aiWords": ai_words,
"paragraphs": results,
"predicted_model": full_result["predicted_model"],
"feedback": feedback,
"input_text": text,
"detected_language": "en"
}
}
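# Minimal local-run sketch (assumption: uvicorn is installed and this file is
# named app.py; on Hugging Face Spaces the platform usually launches the app
# itself, so this guard is only for local testing).
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)

# Example request against the running server (hypothetical payload):
#   curl -X POST http://localhost:7860/analyze \
#        -H "Content-Type: application/json" \
#        -d '{"text": "Your input text here."}'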