import os

import requests
from fastapi import FastAPI, Request
from fastapi.responses import HTMLResponse, JSONResponse
from fastapi.staticfiles import StaticFiles
from fastapi.templating import Jinja2Templates
from pydantic import BaseModel

HF_API_TOKEN = os.getenv("HF_API_TOKEN")
MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.3"
API_URL = f"https://api-inference.huggingface.co/models/{MODEL_ID}"

if HF_API_TOKEN is None:
    raise RuntimeError("HF_API_TOKEN environment variable is not set.")

HEADERS = {"Authorization": f"Bearer {HF_API_TOKEN}"}

app = FastAPI()

# static + templates
app.mount("/static", StaticFiles(directory="static"), name="static")
templates = Jinja2Templates(directory="templates")


class ChatRequest(BaseModel):
    context: str
    question: str


@app.get("/", response_class=HTMLResponse)
async def home(request: Request):
    return templates.TemplateResponse("index.html", {"request": request})


@app.post("/chat")
async def chat_endpoint(payload: ChatRequest):
    context = payload.context.strip()
    question = payload.question.strip()

    if not context or not question:
        return JSONResponse({"answer": "Please provide both context and a question."})

    # Build an instruct-style prompt
    prompt = f"""[INST] You are a helpful assistant that answers questions ONLY using the given context.
If the answer cannot be found in the context, say you don't know and do not hallucinate.

Context:
{context}

Question: {question}

Answer concisely based only on the context.[/INST]"""

    try:
        resp = requests.post(
            API_URL,
            headers=HEADERS,
            json={
                "inputs": prompt,
                "parameters": {
                    "max_new_tokens": 256,
                    "temperature": 0.2,
                },
            },
            timeout=60,
        )
        resp.raise_for_status()
        data = resp.json()

        # data is usually a list with {"generated_text": "..."}
        generated = data[0].get("generated_text", "")

        # In many TGI backends, generated_text contains prompt + completion.
        # Try to cut off the prompt part if present.
        if generated.startswith(prompt):
            answer = generated[len(prompt) :].strip()
        else:
            answer = generated.strip()

        return {"answer": answer}
    except Exception as e:
        return JSONResponse(
            {"answer": f"Error calling model: {e}"},
            status_code=500,
        )
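# --- Usage sketch (assumptions, not part of the app above) ---
# Assuming this file is saved as main.py and served with uvicorn on the default
# port 8000, the /chat endpoint can be exercised like this:
#
#   export HF_API_TOKEN=hf_xxx          # your Hugging Face API token
#   uvicorn main:app --reload
#
#   curl -X POST http://localhost:8000/chat \
#     -H "Content-Type: application/json" \
#     -d '{"context": "FastAPI is a Python web framework.", "question": "What is FastAPI?"}'
#
# The module name "main" and the port are assumptions; adjust them to match
# your project layout.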