# app.py
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.core.settings import Settings
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import gradio as gr
import os

# 🔧 Load a CPU-friendly model
model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float32)

# ✅ Set the global LLM and embedding model for LlamaIndex
Settings.llm = HuggingFaceLLM(
    model=model,
    tokenizer=tokenizer,
    context_window=2048,
    max_new_tokens=256,
)
Settings.embed_model = HuggingFaceEmbedding(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

# 📄 Create a sample document if one does not exist
sample_path = "sample.txt"
if not os.path.exists(sample_path):
    with open(sample_path, "w") as f:
        f.write("LlamaIndex helps build LLM apps with structured data. It's fast, flexible, and easy to use.")

# ⚙️ Load and index the document (restrict to .txt files so the script itself is not indexed)
documents = SimpleDirectoryReader(input_dir=".", required_exts=[".txt"]).load_data()
index = VectorStoreIndex.from_documents(documents)
query_engine = index.as_query_engine()

# 🎯 Gradio interface: pass the user's question to the query engine and return the answer
def query_llamaindex(user_query):
    try:
        response = query_engine.query(user_query)
        return str(response)
    except Exception as e:
        return f"Error: {str(e)}"

demo = gr.Interface(
    fn=query_llamaindex,
    inputs="text",
    outputs="text",
    title="LlamaIndex Demo",
    description="Ask about the document!",
)

if __name__ == "__main__":
    demo.launch()
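
# Run notes (assumed environment, not part of the original script): the app expects the
# relevant packages to be installed first, e.g.
#   pip install llama-index-core llama-index-llms-huggingface \
#       llama-index-embeddings-huggingface transformers torch gradio
# Exact package names can vary across llama-index versions. Start the demo with
# `python app.py` and open the local URL Gradio prints (http://127.0.0.1:7860 by default).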