Spaces:
Running
Running
| # app.py | |
| import os | |
| import tempfile | |
| import re | |
| import torch | |
| import gradio as gr | |
| from PyPDF2 import PdfReader | |
| from docx import Document as DocxDocument | |
| from pptx import Presentation | |
| from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM | |
| from langchain_community.vectorstores import FAISS | |
| from langchain_community.embeddings import HuggingFaceEmbeddings | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from langchain_core.documents import Document | |
# Load the reasoning model and wrap it in a text-generation pipeline.
model_id = "microsoft/phi-3-mini-128k-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
# do_sample=True is needed for temperature/top_p to have any effect; without
# it generation is greedy and both sampling settings are ignored.
reasoning_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
)

# Embedding model used to build and query the FAISS index.
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
# Index over the most recently processed upload; stays None until
# process_file() succeeds, which ask_question() checks for.
vectorstore = None

# Summarizer used by post_process_output() to append a recap to long answers.
summary_pipeline = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
# Boilerplate markers that identify header/footer lines. The leading/trailing
# word boundaries stop short tokens such as "UIC" from matching inside
# ordinary words ("quick", "juice") under re.IGNORECASE.
_BOILERPLATE_RE = re.compile(
    r'\b(?:Page \d+|Slide \d+|CS583\b|UIC\b|Bing Liu\b)',
    re.IGNORECASE,
)
_NON_ASCII_RE = re.compile(r'[^\x00-\x7F]+')


def clean_text(text):
    """Remove boilerplate and noise lines from extracted document text.

    Each line is stripped, then dropped if it contains a boilerplate marker
    (page/slide numbers, course code, institution, author name) or is shorter
    than three characters. Runs of non-ASCII characters in the remaining
    lines are replaced by a single space; a line that is shorter than three
    characters after that replacement is dropped as well.

    Args:
        text: Raw text with lines separated by "\\n".

    Returns:
        The surviving lines joined with "\\n" (empty string if none survive).
    """
    cleaned = []
    for raw_line in text.split("\n"):
        line = raw_line.strip()
        if len(line) < 3 or _BOILERPLATE_RE.search(line):
            continue
        # Re-strip and re-check after ASCII filtering so that lines made up
        # only of non-ASCII characters do not survive as blank entries.
        line = _NON_ASCII_RE.sub(' ', line).strip()
        if len(line) < 3:
            continue
        cleaned.append(line)
    return "\n".join(cleaned)
def extract_text(file_path, ext):
    """Extract plain text from a document on disk.

    Args:
        file_path: Path of the file to read.
        ext: File extension including the leading dot (".pdf", ".docx",
            ".txt" or ".pptx"); matched case-insensitively.

    Returns:
        The document text, with pages / paragraphs / shapes joined by "\\n".

    Raises:
        ValueError: If ``ext`` is not one of the supported extensions.
    """
    ext = ext.lower()
    if ext == ".pdf":
        reader = PdfReader(file_path)
        # extract_text() may yield nothing for a page; treat that as empty.
        return "\n".join(page.extract_text() or "" for page in reader.pages)
    if ext == ".docx":
        doc = DocxDocument(file_path)
        return "\n".join(p.text for p in doc.paragraphs)
    if ext == ".txt":
        # errors="replace" keeps a file that is not valid UTF-8 readable
        # instead of failing the whole upload with UnicodeDecodeError.
        with open(file_path, "r", encoding="utf-8", errors="replace") as f:
            return f.read()
    if ext == ".pptx":
        prs = Presentation(file_path)
        return "\n".join(
            shape.text
            for slide in prs.slides
            for shape in slide.shapes
            if hasattr(shape, "text")
        )
    raise ValueError("Unsupported file format")
def process_file(file):
    """Index an uploaded document so that questions can be asked about it.

    The file's text is extracted, cleaned, split into overlapping chunks and
    embedded into a FAISS index stored in the module-level ``vectorstore``.
    The index is only replaced when the whole pipeline succeeds.

    Args:
        file: The upload handed over by the Gradio File component.

    Returns:
        A status message: a success notice, or an error notice carrying the
        exception text if any step fails.
    """
    global vectorstore
    try:
        if file is None:
            raise ValueError("No file uploaded.")
        # NOTE(review): assumes Gradio passes either a path string or an
        # object whose .name is the on-disk path -- confirm for the installed
        # Gradio version. Reading straight from that path avoids the
        # delete=False temporary copy that was never removed.
        path = file if isinstance(file, str) else file.name
        ext = os.path.splitext(path)[1].lower()
        full_text = extract_text(path, ext)
        cleaned = clean_text(full_text)
        splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
        chunks = splitter.split_text(cleaned)
        if not chunks:
            # Do not pass an empty document list to FAISS; report it clearly.
            raise ValueError("No readable text found in the file.")
        docs = [Document(page_content=c) for c in chunks]
        vectorstore = FAISS.from_documents(docs, embedding_model)
        return "β File processed. You can now ask questions."
    except Exception as e:
        # UI boundary: surface the failure as a status message, not a crash.
        return f"β Error: {str(e)}"
def generate_prompt(context, question):
    """Assemble the tutor prompt sent to the reasoning model.

    Args:
        context: Retrieved course material to ground the answer in.
        question: The student's question.

    Returns:
        The prompt text: role statement, context, question and the answering
        instructions, one per line, with outer whitespace stripped.
    """
    prompt_lines = [
        "You are a helpful academic tutor assisting a student strictly based on course slides or textbook material.",
        "Context:",
        f"{context}",
        "Question:",
        f"{question}",
        "Instructions:",
        "- Answer ONLY using the above context. Do NOT add outside knowledge.",
        "- Think clearly and deeply before answering.",
        "- Use structured academic language based strictly on the context.",
        "- Use clean formatting with helpful headings and minimal bullet points.",
        "- Do NOT repeat the question or include prompt labels.",
        '- If the context lacks an answer, say: "The provided material does not contain sufficient information to answer this question accurately."',
        "- Output must be academically concise, well-organized, and visually clear.",
    ]
    return "\n".join(prompt_lines).strip()
def detect_question_type(q):
    """Classify a question by its phrasing.

    The question is lower-cased and stripped, then checked against the rules
    below in order; the first rule that matches wins.

    Args:
        q: The question text.

    Returns:
        One of "definition", "explanation", "comparison", "list" or
        "general" (when no rule matches).
    """
    normalized = q.lower().strip()
    rules = (
        ("definition", lambda s: s.startswith(("what is", "define", "give definition"))),
        ("explanation", lambda s: s.startswith(("how", "explain", "why"))),
        ("comparison", lambda s: "difference between" in s or "compare" in s),
        ("list", lambda s: s.startswith("list") or "types of" in s),
    )
    for category, matches in rules:
        if matches(normalized):
            return category
    return "general"
def post_process_output(answer_text, question):
    """Format a model answer with a heading and, if long, a short summary.

    A heading chosen from the question type is placed above the answer. When
    the answer itself is longer than 80 words, a summary produced by
    ``summary_pipeline`` is appended.

    Args:
        answer_text: The answer text produced by the reasoning model.
        question: The original question, used only to pick the heading.

    Returns:
        The formatted answer as Markdown text.
    """
    qtype = detect_question_type(question)
    label_map = {
        "definition": "π **Definition**",
        "explanation": "π **Explanation**",
        "comparison": "π **Comparison**",
        "list": "π **Key Points**",
        "general": "π **Insight**",
    }
    heading = label_map.get(qtype, label_map["general"])
    formatted = f"{heading}\n\n{answer_text}"
    # Measure and summarise the answer itself, not the heading-prefixed text,
    # so the heading neither counts towards the threshold nor leaks into the
    # summary.
    if len(answer_text.split()) > 80:
        summary = summary_pipeline(
            answer_text,
            max_length=60,
            min_length=25,
            do_sample=False,
            truncation=True,  # guard against inputs longer than the model accepts
        )[0]['summary_text']
        formatted += f"\n\nπ **Summary:** {summary.strip()}"
    return formatted
def ask_question(question):
    """Answer a question from the currently indexed document.

    The three most similar chunks are retrieved from ``vectorstore``, placed
    in a prompt and passed to the reasoning model; the completion is trimmed
    to its last full stop and formatted by ``post_process_output``.

    Args:
        question: The user's question.

    Returns:
        The formatted answer, or a notice when the question is blank, no
        file has been processed yet, or nothing relevant was retrieved.
    """
    global vectorstore
    if not question or not question.strip():
        return "β Please enter a question."
    if vectorstore is None:
        return "β Please upload and process a file first."
    docs = vectorstore.similarity_search(question, k=3)
    if not docs:
        return "β No relevant information found."
    context = "\n".join(doc.page_content for doc in docs)
    prompt = generate_prompt(context, question)
    # Ask for the completion only. Splitting the full text on the prompt's
    # section labels left the instruction list in the answer (it follows the
    # last label) and cut any answer that happened to contain a label word.
    result = reasoning_pipeline(prompt, return_full_text=False)[0]['generated_text']
    if result.startswith(prompt):
        # Defensive: drop the echoed prompt if it is returned anyway.
        result = result[len(prompt):]
    result = result.strip()
    if "." in result:
        # Drop a trailing sentence fragment left by the token limit.
        result = result.rsplit(".", 1)[0] + "."
    return post_process_output(result.strip(), question)
# Gradio UI
title = "π AI Study Assistant"
# NOTE(review): the source had lost its indentation, so the nesting below
# (upload widgets inside the Row, everything else directly under Blocks) is
# reconstructed -- confirm against the intended layout.
# `title` was previously assigned but unused; it is now passed to Blocks so
# the browser tab shows it.
with gr.Blocks(title=title, css="footer {display:none !important}") as demo:
    gr.Markdown("""# π AI Study Assistant
Upload your lecture notes and ask deep academic questions. Powered by Phi-3 & FAISS.""")
    with gr.Row():
        file_input = gr.File(label="Upload Course Material (PDF, DOCX, TXT, PPTX)")
        upload_btn = gr.Button("Process File")
    status = gr.Textbox(label="Status", interactive=False)
    question = gr.Textbox(label="Ask a Question", placeholder="E.g., What is demand paging?")
    ask_btn = gr.Button("Get Answer")
    answer = gr.Markdown("", elem_id="answer-box")
    # Event handlers: index the upload, then answer questions against it.
    upload_btn.click(fn=process_file, inputs=file_input, outputs=status)
    ask_btn.click(fn=ask_question, inputs=question, outputs=answer)

if __name__ == "__main__":
    demo.launch()