File size: 6,981 Bytes
c577877
 
 
 
 
 
 
 
816e864
c577877
 
 
 
 
1abf617
 
c577877
1abf617
 
 
 
 
 
 
 
 
c577877
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
816e864
c577877
 
 
 
 
 
 
 
 
 
816e864
 
 
c577877
5a107ba
 
72f13fa
 
 
5a107ba
 
 
72f13fa
77e94ac
5a107ba
c577877
 
 
 
 
 
 
 
 
 
 
 
 
 
0c32660
c577877
 
 
 
 
 
0c32660
 
 
 
c577877
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
816e864
 
 
 
 
c577877
0c32660
 
 
 
 
c577877
0c32660
 
 
c577877
0c32660
c577877
 
 
 
 
 
 
 
 
 
 
 
f582c24
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c577877
 
 
41bb926
 
 
816e864
41bb926
 
 
 
 
 
 
0c32660
41bb926
 
 
 
 
 
 
0c32660
41bb926
 
 
0c32660
41bb926
c577877
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
import os
import tempfile
import re
import torch
import gradio as gr
from PyPDF2 import PdfReader
from docx import Document as DocxDocument
from pptx import Presentation
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document

# Load Reasoning Model (lightweight + CPU friendly)
# The tokenizer and seq2seq weights are loaded once, at import time.
model_id = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

# Text-to-text generation pipeline; ask_question() feeds it the assembled prompt.
# NOTE(review): temperature/top_p are sampling parameters and do_sample is not
# set here -- presumably they have no effect under greedy decoding; confirm
# whether sampling was intended.
reasoning_pipeline = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,
    temperature=0.7,
    top_p=0.9
)

# Embedding Model
# Passed to FAISS.from_documents() in process_file() to embed document chunks.
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
# Set by process_file(); remains None until a document has been indexed.
vectorstore = None

# Summarizer
# Used by post_process_output() to append a short summary to long answers.
summary_pipeline = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")

# Header/footer boilerplate that should be dropped from extracted text.
# The word boundaries matter: without them the case-insensitive token "UIC"
# also matches inside ordinary words such as "quick" or "juice".
_BOILERPLATE_RE = re.compile(
    r'\b(?:Page \d+|Slide \d+|CS583|UIC|Bing Liu)\b', re.IGNORECASE
)
# Any run of non-ASCII characters; each run is replaced by a single space.
_NON_ASCII_RE = re.compile(r'[^\x00-\x7F]+')


def clean_text(text):
    """Return ``text`` with boilerplate and near-empty lines removed.

    Each line has runs of non-ASCII characters replaced by a space and is
    then stripped. Lines shorter than three characters after that cleanup,
    and lines mentioning a page/slide marker or course boilerplate as a
    whole word, are discarded. The surviving lines are re-joined with
    newlines.

    Args:
        text: Raw extracted document text; ``None`` or ``""`` yields ``""``.

    Returns:
        The cleaned text as a single newline-separated string.
    """
    if not text:
        return ""

    cleaned = []
    for raw_line in text.split("\n"):
        # Normalise first so the length check sees what will actually be kept
        # (a line made only of bullets/symbols would otherwise survive as " ").
        line = _NON_ASCII_RE.sub(' ', raw_line).strip()
        if len(line) < 3:
            continue
        if _BOILERPLATE_RE.search(line):
            continue
        cleaned.append(line)
    return "\n".join(cleaned)

def extract_text(file_path, ext):
    """Read the document at ``file_path`` and return its text.

    ``ext`` selects the reader: ``.txt`` is read directly as UTF-8 (with
    undecodable bytes replaced), ``.pdf`` joins the text of every page,
    ``.docx`` joins the text of every paragraph, and ``.pptx`` joins the
    text of every shape that exposes a ``text`` attribute. Pieces are
    joined with newlines.

    Raises:
        ValueError: if ``ext`` is none of the four supported extensions.
    """
    if ext == ".txt":
        with open(file_path, "r", encoding="utf-8", errors="replace") as handle:
            return handle.read()

    if ext == ".pdf":
        page_texts = []
        for page in PdfReader(file_path).pages:
            # A falsy extraction result is recorded as an empty string.
            page_texts.append(page.extract_text() or "")
        return "\n".join(page_texts)

    if ext == ".docx":
        paragraphs = DocxDocument(file_path).paragraphs
        return "\n".join(paragraph.text for paragraph in paragraphs)

    if ext == ".pptx":
        shape_texts = []
        for slide in Presentation(file_path).slides:
            for shape in slide.shapes:
                # Only shapes exposing a ``text`` attribute contribute.
                if hasattr(shape, "text"):
                    shape_texts.append(shape.text)
        return "\n".join(shape_texts)

    raise ValueError("Unsupported file format")

def process_file(file):
    """Index an uploaded document so questions can be asked about it.

    The upload is read into memory, written to a temporary file carrying the
    right extension, parsed with ``extract_text``, cleaned, split into
    overlapping chunks, and embedded into a new FAISS index stored in the
    module-level ``vectorstore``.

    Args:
        file: A file-like object with ``read()``, a path string to an
            existing file, or raw ``bytes``.

    Returns:
        A status message for the UI. Failures are reported as an
        "Error" message rather than raised.
    """
    global vectorstore
    try:
        # Prefer the object's own ``name``; a plain path string carries its
        # extension itself (otherwise every path upload would be parsed as PDF).
        filename = getattr(file, "name", None)
        if not filename and isinstance(file, str):
            filename = file
        ext = os.path.splitext(filename)[1].lower() if filename else ".pdf"

        # Resolve the payload before touching the filesystem, so unsupported
        # input does not leave a stray temporary file behind.
        if hasattr(file, "read"):
            file_bytes = file.read()
        elif isinstance(file, str) and os.path.exists(file):
            with open(file, "rb") as f:
                file_bytes = f.read()
        elif isinstance(file, bytes):
            file_bytes = file
        else:
            return "❌ Error: Could not process uploaded file."

        tmp_path = None
        try:
            with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as tmp:
                tmp_path = tmp.name
                tmp.write(file_bytes)
            # The handle is closed before parsing so the parser can reopen
            # the path on every platform.
            full_text = extract_text(tmp_path, ext)
        finally:
            # delete=False means cleanup is our responsibility.
            if tmp_path is not None:
                try:
                    os.remove(tmp_path)
                except OSError:
                    # Best effort: a leftover temp file must not fail the upload.
                    pass

        cleaned = clean_text(full_text)
        splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
        chunks = splitter.split_text(cleaned)
        if not chunks:
            # Nothing to embed (e.g. a scanned PDF with no text layer).
            return "❌ Error: No readable text found in the file."

        docs = [Document(page_content=c) for c in chunks]
        vectorstore = FAISS.from_documents(docs, embedding_model)

        return "✅ File processed. You can now ask questions."
    except Exception as e:
        # UI boundary: surface the failure as a status message.
        return f"❌ Error: {str(e)}"

def generate_prompt(context, question):
    """Build the model prompt from retrieved ``context`` and the ``question``.

    The prompt has three sections separated by blank lines: the context, the
    question, and a fixed list of answering instructions. The assembled text
    is returned with surrounding whitespace stripped.
    """
    rules = (
        "- Answer ONLY using the above context.",
        "- Use structured academic language.",
        "- Think carefully and answer concisely.",
        '- If context lacks information, say: "The material does not contain enough information to answer this accurately."',
    )
    sections = [
        f"Context:\n{context}",
        f"Question:\n{question}",
        "Instructions:\n" + "\n".join(rules),
    ]
    return "\n\n".join(sections).strip()

def detect_question_type(q):
    """Classify a question into a coarse category used for answer labelling.

    Matching is case-insensitive and ignores surrounding whitespace. The
    comparison check runs first because it is the most specific: questions
    such as "What is the difference between A and B?" or "How do A and B
    compare?" would otherwise be captured by the generic "what is" / "how"
    prefixes.

    Args:
        q: The question text; ``None`` is treated as an empty string.

    Returns:
        One of ``"comparison"``, ``"definition"``, ``"explanation"``,
        ``"list"`` or ``"general"``.
    """
    q = (q or "").lower().strip()
    if "difference between" in q or "compare" in q:
        return "comparison"
    if q.startswith(("what is", "define", "give definition")):
        return "definition"
    if q.startswith(("how", "explain", "why")):
        return "explanation"
    if q.startswith("list") or "types of" in q:
        return "list"
    return "general"

def post_process_output(answer_text, question):
    """Format a raw model answer for display.

    The answer is stripped, cleared of echoed prompt markers, given a
    heading chosen from the question type, and -- when longer than 80
    words -- extended with a short summary from ``summary_pipeline``.

    Args:
        answer_text: Text generated by the reasoning model.
        question: The user's question; only used to pick the heading.

    Returns:
        Markdown-style text: heading, blank line, answer (plus optional
        summary).
    """
    qtype = detect_question_type(question)
    label_map = {
        "definition": "📘 **Definition**",
        "explanation": "📘 **Explanation**",
        "comparison": "📘 **Comparison**",
        "list": "📘 **Key Points**",
        "general": "📘 **Insight**",
    }
    clean_answer = answer_text.strip()

    # Detect prompt leakage case-insensitively and strip the markers the same
    # way; the previous check looked for lowercase "instructions:" but only
    # removed capitalised markers, so the two could never agree.
    lowered = clean_answer.lower()
    if lowered.startswith("context:") or "instructions:" in lowered:
        clean_answer = re.sub(
            r"\b(?:context|question|instructions):",
            "",
            clean_answer,
            flags=re.IGNORECASE,
        ).strip()

    if len(clean_answer.split()) > 80:
        summary = summary_pipeline(clean_answer, max_length=60, min_length=25, do_sample=False)[0]['summary_text']
        clean_answer += f"\n\n📝 **Summary:** {summary.strip()}"

    # Fall back to the generic heading rather than rendering "None".
    label = label_map.get(qtype, label_map["general"])
    return f"{label}\n\n{clean_answer}"

def _trim_trailing_fragment(text):
    """Drop an unfinished trailing sentence, keeping text up to the last sentence end."""
    stripped = text.strip()
    if not stripped or stripped[-1] in ".!?":
        # Already ends on a complete sentence (or there is nothing to trim).
        return stripped
    # A period only counts as a sentence end when whitespace follows it, so
    # decimals such as "3.5" are not cut in half.
    match = re.match(r"(.*\.)(?=\s)", stripped, re.DOTALL)
    return match.group(1) if match else stripped


def ask_question(question):
    """Answer ``question`` from the currently indexed document.

    The three most similar chunks are retrieved from ``vectorstore``, joined
    into a context, and sent with the question to ``reasoning_pipeline``.
    The generated text is cleaned of echoed prompt fragments and any
    unfinished trailing sentence, then formatted by ``post_process_output``.

    Args:
        question: The user's question text.

    Returns:
        The formatted answer, or a user-facing message when the question is
        empty, no document has been processed, or nothing was retrieved.
    """
    if not question or not question.strip():
        return "❌ Please enter a question."
    if vectorstore is None:
        return "❌ Please upload and process a file first."

    docs = vectorstore.similarity_search(question, k=3)
    if not docs:
        return "❌ No relevant information found."

    context = "\n".join([doc.page_content for doc in docs])
    prompt = generate_prompt(context, question)
    raw_output = reasoning_pipeline(prompt)[0]['generated_text']

    # Remove prompt leakage or echoed instructions.
    for section in ["Context:", "Question:", "Instructions:", "Use structured academic language"]:
        raw_output = raw_output.replace(section, "").strip()

    # Keep only what follows the last "Answer:" marker, if the model emitted one.
    if "Answer:" in raw_output:
        raw_output = raw_output.split("Answer:")[-1].strip()

    # Trim a dangling, unfinished final sentence.
    raw_output = _trim_trailing_fragment(raw_output)

    return post_process_output(raw_output.strip(), question)


# Gradio UI
# Two tabs: "Upload & Ask" wires the upload button to process_file() and the
# ask button to ask_question(); "History" is a placeholder.
with gr.Blocks(theme=gr.themes.Monochrome()) as demo:
    gr.Markdown("""
    # 📚 AI Study Assistant
    Upload your lecture slide/text file, ask questions, and get intelligent answers powered by Flan-T5.
    """)

    with gr.Tab("Upload & Ask"):
        # Upload row: file picker + button; the status box shows the message
        # returned by process_file().
        with gr.Row():
            file_input = gr.File(label="📄 Upload File", file_types=[".pdf", ".docx", ".pptx", ".txt"])
            upload_btn = gr.Button("Upload")
        upload_output = gr.Textbox(label="Upload Status", interactive=False)
        upload_btn.click(fn=process_file, inputs=[file_input], outputs=[upload_output])

        gr.Markdown("---")

        # Question row: the answer box shows the text returned by ask_question().
        with gr.Row():
            question = gr.Textbox(label="❓ Ask a question")
            ask_btn = gr.Button("Ask")
        answer = gr.Textbox(label="💡 Answer", interactive=False)
        ask_btn.click(fn=ask_question, inputs=[question], outputs=[answer])

    with gr.Tab("History"):
        gr.Markdown("""
        **⏳ Coming Soon**: Question-answer history, summarization view, and more!
        """)

# Start the app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()