# app.py
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.core.settings import Settings
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import gradio as gr
import os

# 🔧 Load a CPU-friendly model
model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float32)

# ✅ Set the global LLM and embedding model for LlamaIndex
Settings.llm = HuggingFaceLLM(
    model=model,
    tokenizer=tokenizer,
    context_window=2048,
    max_new_tokens=256,
)
Settings.embed_model = HuggingFaceEmbedding(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

# 📄 Create a sample document if one does not exist
sample_path = "sample.txt"
if not os.path.exists(sample_path):
    with open(sample_path, "w") as f:
        f.write("LlamaIndex helps build LLM apps with structured data. It's fast, flexible, and easy to use.")

# ⚙️ Load and index the document (restrict to .txt files so the script itself is not indexed)
documents = SimpleDirectoryReader(input_dir=".", required_exts=[".txt"]).load_data()
index = VectorStoreIndex.from_documents(documents)
query_engine = index.as_query_engine()

# 🎯 Gradio interface: pass the user's question to the query engine and return the answer
def query_llamaindex(user_query):
    try:
        response = query_engine.query(user_query)
        return str(response)
    except Exception as e:
        return f"Error: {str(e)}"

demo = gr.Interface(
    fn=query_llamaindex,
    inputs="text",
    outputs="text",
    title="LlamaIndex Demo",
    description="Ask about the document!",
)

if __name__ == "__main__":
    demo.launch()
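
# Run notes (assumed environment, not part of the original script): the app expects the
# relevant packages to be installed first, e.g.
#   pip install llama-index-core llama-index-llms-huggingface \
#       llama-index-embeddings-huggingface transformers torch gradio
# Exact package names can vary across llama-index versions. Start the demo with
# `python app.py` and open the local URL Gradio prints (http://127.0.0.1:7860 by default).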