import os

import requests
from fastapi import FastAPI, Request
from fastapi.responses import HTMLResponse, JSONResponse
from fastapi.staticfiles import StaticFiles
from fastapi.templating import Jinja2Templates
from pydantic import BaseModel

HF_API_TOKEN = os.getenv("HF_API_TOKEN")
MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.3"
API_URL = f"https://api-inference.huggingface.co/models/{MODEL_ID}"

if HF_API_TOKEN is None:
    raise RuntimeError("HF_API_TOKEN environment variable is not set.")

HEADERS = {"Authorization": f"Bearer {HF_API_TOKEN}"}

app = FastAPI()

# static + templates
app.mount("/static", StaticFiles(directory="static"), name="static")
templates = Jinja2Templates(directory="templates")


class ChatRequest(BaseModel):
    context: str
    question: str


@app.get("/", response_class=HTMLResponse)
async def home(request: Request):
    return templates.TemplateResponse("index.html", {"request": request})


@app.post("/chat")
async def chat_endpoint(payload: ChatRequest):
    context = payload.context.strip()
    question = payload.question.strip()

    if not context or not question:
        return JSONResponse({"answer": "Please provide both context and a question."})

    # Build an instruct-style prompt
    prompt = f"""[INST] You are a helpful assistant that answers questions ONLY using the given context.
If the answer cannot be found in the context, say you don't know and do not hallucinate.

Context:
{context}

Question: {question}

Answer concisely based only on the context.[/INST]"""

    try:
        resp = requests.post(
            API_URL,
            headers=HEADERS,
            json={
                "inputs": prompt,
                "parameters": {
                    "max_new_tokens": 256,
                    "temperature": 0.2,
                },
            },
            timeout=60,
        )
        resp.raise_for_status()
        data = resp.json()

        # data is usually a list with {"generated_text": "..."}
        generated = data[0].get("generated_text", "")

        # In many TGI backends, generated_text contains prompt + completion.
        # Try to cut off the prompt part if present.
        if generated.startswith(prompt):
            answer = generated[len(prompt) :].strip()
        else:
            answer = generated.strip()

        return {"answer": answer}
    except Exception as e:
        return JSONResponse(
            {"answer": f"Error calling model: {e}"},
            status_code=500,
        )
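# --- Usage sketch (assumptions, not part of the app above) ---
# Assuming this file is saved as main.py and served with uvicorn on the default
# port 8000, the /chat endpoint can be exercised like this:
#
#   export HF_API_TOKEN=hf_xxx          # your Hugging Face API token
#   uvicorn main:app --reload
#
#   curl -X POST http://localhost:8000/chat \
#     -H "Content-Type: application/json" \
#     -d '{"context": "FastAPI is a Python web framework.", "question": "What is FastAPI?"}'
#
# The module name "main" and the port are assumptions; adjust them to match
# your project layout.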