Spaces:

umerforsure
/

AI-Study-Assistant

Running

App Files Files Community

AI-Study-Assistant / app.py

umerforsure

🚀 Initial commit

c577877 5 months ago

raw

history blame

6.17 kB

	# app.py
	import os
	import tempfile
	import re
	import torch
	import gradio as gr
	from PyPDF2 import PdfReader
	from docx import Document as DocxDocument
	from pptx import Presentation
	from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
	from langchain_community.vectorstores import FAISS
	from langchain_community.embeddings import HuggingFaceEmbeddings
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from langchain_core.documents import Document

	# Load Reasoning Model
	# Phi-3 mini instruct is loaded once at import time and shared by every request.
	model_id = "microsoft/phi-3-mini-128k-instruct"
	tokenizer = AutoTokenizer.from_pretrained(model_id)
	model = AutoModelForCausalLM.from_pretrained(
	    model_id,
	    torch_dtype=torch.bfloat16,
	    device_map="auto",
	)
	# temperature / top_p are sampling parameters: they only take effect when
	# do_sample=True. Without it generation is greedy and both values are ignored
	# (transformers emits a warning), so sampling is enabled explicitly here.
	reasoning_pipeline = pipeline(
	    "text-generation",
	    model=model,
	    tokenizer=tokenizer,
	    max_new_tokens=512,
	    do_sample=True,
	    temperature=0.7,
	    top_p=0.9,
	)

	# Embedding Model
	# Embedding function handed to FAISS.from_documents() in process_file().
	embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
	# Module-level FAISS index. Stays None until process_file() builds one;
	# ask_question() refuses to answer while it is still None.
	vectorstore = None

	# Summarizer
	# Summarization pipeline called by post_process_output() to append a short
	# summary to answers longer than 80 words.
	summary_pipeline = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")

	# Lines carrying slide/page furniture or course-specific header text.
	# The alternatives must be separated by a bare `|`; an escaped `\|` is a literal
	# pipe character and makes the whole pattern effectively unmatchable.
	# Word boundaries stop short tokens such as "UIC" from matching inside ordinary
	# words (e.g. "quick") under IGNORECASE.
	_BOILERPLATE_RE = re.compile(
	    r'\b(?:Page \d+|Slide \d+|CS583|UIC|Bing Liu)\b', re.IGNORECASE
	)
	# Runs of non-ASCII characters, each collapsed to a single space.
	_NON_ASCII_RE = re.compile(r'[^\x00-\x7F]+')


	def clean_text(text):
	    """Strip boilerplate and noise from extracted document text.

	    Each line is trimmed, then dropped when it is shorter than three
	    characters or contains header/footer boilerplate (page or slide numbers,
	    course identifiers). Runs of non-ASCII characters in the surviving lines
	    are replaced by a single space.

	    Args:
	        text: Raw text as returned by ``extract_text``; may be empty or None.

	    Returns:
	        The kept lines joined with newlines, or ``""`` when nothing survives.
	    """
	    if not text:
	        return ""

	    cleaned = []
	    for raw_line in text.split("\n"):
	        line = raw_line.strip()
	        if len(line) < 3 or _BOILERPLATE_RE.search(line):
	            continue
	        cleaned.append(_NON_ASCII_RE.sub(' ', line))
	    return "\n".join(cleaned)

	def extract_text(file_path, ext):
	    """Return the plain text contained in a supported document.

	    Args:
	        file_path: Path of the file on disk.
	        ext: File extension including the leading dot (".pdf", ".docx",
	            ".txt" or ".pptx"); matched case-insensitively.

	    Returns:
	        The document text, with pages / paragraphs / shapes joined by newlines.

	    Raises:
	        ValueError: If ``ext`` is not one of the supported extensions.
	    """
	    ext = ext.lower()

	    if ext == ".pdf":
	        reader = PdfReader(file_path)
	        # `or ""` keeps the join working when a page yields no text.
	        return "\n".join([page.extract_text() or "" for page in reader.pages])

	    if ext == ".docx":
	        doc = DocxDocument(file_path)
	        return "\n".join([p.text for p in doc.paragraphs])

	    if ext == ".txt":
	        # errors="replace" keeps one stray non-UTF-8 byte (e.g. from a Latin-1
	        # export) from aborting the whole upload with UnicodeDecodeError.
	        with open(file_path, "r", encoding="utf-8", errors="replace") as f:
	            return f.read()

	    if ext == ".pptx":
	        prs = Presentation(file_path)
	        # Only shapes exposing a `text` attribute contribute text.
	        return "\n".join(
	            shape.text
	            for slide in prs.slides
	            for shape in slide.shapes
	            if hasattr(shape, "text")
	        )

	    raise ValueError("Unsupported file format")

	def process_file(file):
	    """Index an uploaded document so questions can be asked about it.

	    The upload is read, cleaned, split into overlapping chunks and embedded
	    into a new FAISS index stored in the module-level ``vectorstore``.

	    Args:
	        file: The value delivered by the Gradio File component. Accepted forms
	            are a filesystem path (``str`` / ``os.PathLike``) or an object
	            with a ``name`` attribute (and ``read()`` when ``name`` does not
	            point at an existing file). ``None`` means nothing was uploaded.

	    Returns:
	        A status string for the UI; failures are reported as text rather than
	        raised so the interface can display them.
	    """
	    global vectorstore

	    if file is None:
	        return "❌ Please upload a file first."

	    spooled_path = None  # temp copy we created ourselves and must delete
	    try:
	        if isinstance(file, (str, os.PathLike)):
	            source_path = os.fspath(file)
	        else:
	            source_path = file.name
	        ext = os.path.splitext(source_path)[1].lower()

	        if not os.path.isfile(source_path):
	            # No backing file on disk: spool the stream to a temp file. It is
	            # closed before being re-opened for extraction and removed in
	            # `finally`, so nothing is leaked.
	            with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as tmp:
	                spooled_path = tmp.name
	                tmp.write(file.read())
	            source_path = spooled_path

	        full_text = extract_text(source_path, ext)
	        cleaned = clean_text(full_text)
	        splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
	        chunks = splitter.split_text(cleaned)
	        if not chunks:
	            # Building an index from zero documents would fail with an
	            # unhelpful low-level error; report the real cause instead.
	            return "❌ Error: No readable text could be extracted from the file."

	        docs = [Document(page_content=c) for c in chunks]
	        vectorstore = FAISS.from_documents(docs, embedding_model)
	        return "✅ File processed. You can now ask questions."
	    except Exception as e:
	        # UI boundary: surface the problem to the user instead of crashing.
	        return f"❌ Error: {str(e)}"
	    finally:
	        if spooled_path is not None and os.path.exists(spooled_path):
	            os.remove(spooled_path)

	def generate_prompt(context, question):
	    """Render the tutor prompt for one question.

	    Args:
	        context: Retrieved course-material text the answer must rely on.
	        question: The student's question.

	    Returns:
	        The filled-in prompt with leading and trailing whitespace removed.
	    """
	    rendered = f"""
	You are a helpful academic tutor assisting a student strictly based on course slides or textbook material.

	Context:
	{context}

	Question:
	{question}

	Instructions:
	- Answer ONLY using the above context. Do NOT add outside knowledge.
	- Think clearly and deeply before answering.
	- Use structured academic language based strictly on the context.
	- Use clean formatting with helpful headings and minimal bullet points.
	- Do NOT repeat the question or include prompt labels.
	- If the context lacks an answer, say: "The provided material does not contain sufficient information to answer this question accurately."
	- Output must be academically concise, well-organized, and visually clear.
	"""
	    return rendered.strip()

	def detect_question_type(q):
	    """Classify a question by its wording.

	    The question is lower-cased and trimmed, then checked against the rules
	    below in order; the first rule that applies wins.

	    Args:
	        q: The question text.

	    Returns:
	        One of "definition", "explanation", "comparison", "list" or "general".
	    """
	    normalized = q.lower().strip()

	    definition_openers = ("what is", "define", "give definition")
	    explanation_openers = ("how", "explain", "why")
	    comparison_phrases = ("difference between", "compare")

	    if normalized.startswith(definition_openers):
	        return "definition"
	    if normalized.startswith(explanation_openers):
	        return "explanation"
	    if any(phrase in normalized for phrase in comparison_phrases):
	        return "comparison"
	    if normalized.startswith("list") or "types of" in normalized:
	        return "list"
	    return "general"

	def post_process_output(answer_text, question):
	    """Decorate a raw answer with a heading and, when long, a summary.

	    The heading is chosen from the question type reported by
	    ``detect_question_type``. If the headed answer exceeds 80
	    whitespace-separated words, it is passed to ``summary_pipeline`` and the
	    resulting summary is appended.

	    Args:
	        answer_text: The answer produced by the reasoning model.
	        question: The question that was asked; used only to pick the heading.

	    Returns:
	        The formatted answer text.
	    """
	    headings = {
	        "definition": "📘 Definition",
	        "explanation": "📘 Explanation",
	        "comparison": "📘 Comparison",
	        "list": "📘 Key Points",
	        "general": "📘 Insight",
	    }
	    heading = headings.get(detect_question_type(question))
	    labelled = f"{heading}\n\n{answer_text}"

	    # Short answers are returned as-is; only long ones get a summary.
	    if len(labelled.split()) <= 80:
	        return labelled

	    summaries = summary_pipeline(labelled, max_length=60, min_length=25, do_sample=False)
	    condensed = summaries[0]['summary_text'].strip()
	    return labelled + f"\n\n📝 Summary: {condensed}"

	def ask_question(question):
	    """Answer a question from the currently indexed course material.

	    The three chunks most similar to the question are retrieved from the
	    module-level ``vectorstore``, wrapped in the tutor prompt and sent to
	    ``reasoning_pipeline``. The completion is cleaned up and passed through
	    ``post_process_output``.

	    Args:
	        question: The student's question; blank or None input is rejected.

	    Returns:
	        The formatted answer, or a "❌ ..." message when the question is
	        blank, no file has been processed, nothing relevant was retrieved, or
	        the model produced no usable text.
	    """
	    if not question or not question.strip():
	        return "❌ Please enter a question."
	    if vectorstore is None:
	        return "❌ Please upload and process a file first."

	    docs = vectorstore.similarity_search(question, k=3)
	    if not docs:
	        return "❌ No relevant information found."

	    context = "\n".join(doc.page_content for doc in docs)
	    prompt = generate_prompt(context, question)

	    # By default a text-generation pipeline returns prompt + completion.
	    # Splitting that on "Instructions:" would keep the instruction bullets in
	    # front of the answer, so ask for the completion only and also strip the
	    # prompt defensively in case it is echoed anyway.
	    result = reasoning_pipeline(prompt, return_full_text=False)[0]['generated_text']
	    if result.startswith(prompt):
	        result = result[len(prompt):]

	    # If the model repeats any prompt label, keep only what follows the last one.
	    for marker in ("Context:", "Question:", "Instructions:"):
	        if marker in result:
	            result = result.split(marker)[-1].strip()

	    # Generation may stop mid-sentence at the token limit; trim back to the
	    # last complete sentence.
	    if "." in result:
	        result = result.rsplit(".", 1)[0] + "."

	    result = result.strip()
	    if not result:
	        return "❌ The model returned an empty answer. Please try rephrasing the question."
	    return post_process_output(result, question)

	# Gradio UI
	# NOTE(review): `title` is assigned but not referenced anywhere in the visible code.
	title = "📚 AI Study Assistant"
	# The css rule hides the page's <footer> element.
	with gr.Blocks(css="footer {display:none !important}") as demo:
	gr.Markdown("""# 📘 AI Study Assistant
	Upload your lecture notes and ask deep academic questions. Powered by Phi-3 & FAISS.""")

	# NOTE(review): the original nesting under this Row was lost when the file was
	# flattened; presumably the file input and the button share the row - confirm.
	with gr.Row():
	file_input = gr.File(label="Upload Course Material (PDF, DOCX, TXT, PPTX)")
	upload_btn = gr.Button("Process File")

	# Read-only box that displays the string returned by process_file().
	status = gr.Textbox(label="Status", interactive=False)

	question = gr.Textbox(label="Ask a Question", placeholder="E.g., What is demand paging?")
	ask_btn = gr.Button("Get Answer")
	# Markdown area that displays the string returned by ask_question().
	answer = gr.Markdown("", elem_id="answer-box")

	# Wire the buttons: each click passes the input component's value to the
	# handler and writes the handler's return value to the output component.
	upload_btn.click(fn=process_file, inputs=file_input, outputs=status)
	ask_btn.click(fn=ask_question, inputs=question, outputs=answer)

	# Start the Gradio server only when the module is run as a script.
	if __name__ == "__main__":
	demo.launch()