Spaces:

aach456
/

Rah

Runtime error

App Files Files Community

aach456 committed on Aug 28, 2025

Commit

fc96ea6

verified ·

1 Parent(s): 67b085a

Create app.py

Browse files

Files changed (1) hide show

app.py +270 -0

app.py ADDED Viewed

	@@ -0,0 +1,270 @@

+# app.py
+# Chat-style RAG app with Streamlit chat UI, FAISS retrieval, SentenceTransformers embeddings,
+# and an open Mistral-7B pipeline. All caches redirected to /tmp to avoid PermissionError.
+# ---------- Writable dirs BEFORE third-party imports ----------
+import os, glob, tempfile
+# Streamlit internal runtime dir -> /tmp (fixes PermissionError: '/.streamlit')
+ST_RT = os.environ.get("STREAMLIT_RUNTIME_DIR", "/tmp/.streamlit_runtime")
+try:
+    os.makedirs(ST_RT, exist_ok=True)
+except Exception:
+    ST_RT = tempfile.mkdtemp(prefix="st_runtime_")
+os.environ["STREAMLIT_RUNTIME_DIR"] = ST_RT
+# Hugging Face caches -> /tmp
+HF_HOME = os.environ.get("HF_HOME", "/tmp/hf_cache")
+try:
+    os.makedirs(HF_HOME, exist_ok=True)
+except Exception:
+    HF_HOME = tempfile.mkdtemp(prefix="hf_cache_")
+os.environ["HF_HOME"] = HF_HOME
+os.environ["TRANSFORMERS_CACHE"] = HF_HOME  # backward-compat; deprecation warning is harmless
+os.environ["SENTENCE_TRANSFORMERS_HOME"] = HF_HOME
+os.environ["HF_DATASETS_CACHE"] = HF_HOME
+os.environ["XDG_CACHE_HOME"] = HF_HOME
+os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"
+# Clean stale locks
+locks_dir = os.path.join(HF_HOME, "hub", ".locks")
+if os.path.isdir(locks_dir):
+    for p in glob.glob(os.path.join(locks_dir, "*.lock")):
+        try:
+            os.remove(p)
+        except Exception:
+            pass
+# ---------- Imports AFTER env is set ----------
+import io
+import time
+import pandas as pd
+import numpy as np
+import requests
+import streamlit as st
+from bs4 import BeautifulSoup
+from PyPDF2 import PdfReader
+from docx import Document
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from sentence_transformers import SentenceTransformer
+from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM, pipeline
+import faiss
+# ---------- Page ----------
+st.set_page_config(page_title="Chat RAG • Open Model + URLs", layout="wide")
+st.title("💬 Chat RAG with Open Model, FAISS, and Web URLs")
+# ---------- Session ----------
+for key, default in [
+    ("messages", []),
+    ("chunks", []),
+    ("embedder", None),
+    ("faiss_index", None),
+]:
+    if key not in st.session_state:
+        st.session_state[key] = default
+# ---------- Loaders ----------
+def load_txt(file):
+    raw = file.read()
+    for enc in ("utf-8", "latin-1"):
+        try:
+            return [{"source": file.name, "text": raw.decode(enc, errors="ignore")}]
+        except Exception:
+            continue
+    return [{"source": file.name, "text": raw.decode("utf-8", errors="ignore")}]
+def load_pdf(file):
+    pdf = PdfReader(file)
+    text = ""
+    for page in pdf.pages:
+        text += page.extract_text() or ""
+    return [{"source": file.name, "text": text}]
+def load_docx(file):
+    data = file.read()
+    doc = Document(io.BytesIO(data))
+    text = " ".join(p.text for p in doc.paragraphs)
+    return [{"source": file.name, "text": text}]
+def load_csv(file):
+    data = file.read()
+    df = None
+    for enc in ("utf-8", "latin-1"):
+        try:
+            df = pd.read_csv(io.BytesIO(data), encoding=enc)
+            break
+        except Exception:
+            df = None
+    if df is None:
+        try:
+            df = pd.read_csv(io.BytesIO(data), engine="python")
+        except Exception:
+            df = pd.DataFrame()
+    text = " ".join(df.astype(str).values.flatten().tolist()) if not df.empty else ""
+    return [{"source": file.name, "text": text}]
+def load_documents(files):
+    docs = []
+    for file in files or []:
+        name = file.name.lower()
+        if name.endswith(".pdf"):
+            docs += load_pdf(file)
+        elif name.endswith(".docx"):
+            docs += load_docx(file)
+        elif name.endswith(".csv"):
+            docs += load_csv(file)
+        elif name.endswith(".txt"):
+            docs += load_txt(file)
+    return docs
+# ---------- Web fetch ----------
+def fetch_web_text(url, timeout=12, retries=2, backoff=1.5):
+    for attempt in range(retries + 1):
+        try:
+            headers = {
+                "User-Agent": (
+                    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+                    "AppleWebKit/537.36 (KHTML, like Gecko) "
+                    "Chrome/124.0 Safari/537.36"
+                )
+            }
+            resp = requests.get(url, headers=headers, timeout=timeout)
+            resp.raise_for_status()
+            soup = BeautifulSoup(resp.text, "html.parser")
+            for tag in soup(["script", "style", "noscript"]):
+                tag.decompose()
+            text = " ".join(soup.get_text(separator=" ").split())
+            return [{"source": url, "text": text}]
+        except Exception:
+            if attempt < retries:
+                time.sleep(backoff ** attempt)
+            else:
+                return []
+# ---------- Chunking ----------
+def chunk_documents(docs, chunk_size=1000, chunk_overlap=120):
+    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+    chunks = []
+    for doc in docs:
+        splits = splitter.split_text(doc.get("text", "") or "")
+        for idx, chunk in enumerate(splits):
+            chunks.append({"source": doc["source"], "chunk_id": f"{doc['source']}_chunk{idx}", "content": chunk})
+    return chunks
+# ---------- Embeddings / Index ----------
+@st.cache_resource(show_spinner=False)
+def load_embedder():
+    return SentenceTransformer("all-MiniLM-L6-v2", cache_folder=os.environ.get("SENTENCE_TRANSFORMERS_HOME", HF_HOME))
+def build_embeddings_index(chunks):
+    embedder = load_embedder()
+    texts = [c["content"] for c in chunks]
+    if not texts:
+        return embedder, None
+    emb = embedder.encode(texts, show_progress_bar=True, convert_to_numpy=True)
+    emb = np.asarray(emb, dtype="float32")
+    idx = faiss.IndexFlatL2(emb.shape[14])
+    idx.add(emb)
+    return embedder, idx
+def retrieve(query, embedder, index, chunks, top_k=4):
+    if index is None or not chunks:
+        return []
+    q_emb = embedder.encode([query], convert_to_numpy=True)
+    q_emb = np.asarray(q_emb, dtype="float32")
+    distances, indices = index.search(q_emb, top_k)
+    out = []
+    for pos, i in enumerate(indices):
+        if i >= 0 and i < len(chunks):
+            out.append({"chunk": chunks[i], "score": float(distances[pos])})
+    return out
+# ---------- LLM ----------
+MODEL_ID = "MehdiHosseiniMoghadam/AVA-Mistral-7B-V2"
+@st.cache_resource(show_spinner=False)
+def load_llm():
+    cache_dir = os.environ.get("HF_HOME", HF_HOME)
+    _ = AutoConfig.from_pretrained(MODEL_ID, cache_dir=cache_dir, trust_remote_code=True)
+    tok = AutoTokenizer.from_pretrained(MODEL_ID, cache_dir=cache_dir, trust_remote_code=True)
+    model = AutoModelForCausalLM.from_pretrained(MODEL_ID, cache_dir=cache_dir, trust_remote_code=True)
+    return pipeline("text-generation", model=model, tokenizer=tok, max_length=1024, do_sample=True, temperature=0.2, trust_remote_code=True, device_map="auto")
+def answer_with_llm(context_chunks, query, llm):
+    context_text = "\n".join(f"[{c['chunk_id']}] {c['content']}" for c in context_chunks)
+    prompt = (
+        "Answer the following question using ONLY the provided context and cite the chunk ids used.\n"
+        f"Question: {query}\n"
+        "Context:\n"
+        f"{context_text}\n"
+        "Answer with citations:"
+    )
+    out = llm(prompt, max_length=512, num_return_sequences=1)
+    return out["generated_text"]
+# ---------- Sidebar sources ----------
+st.sidebar.header("Data sources")
+uploaded_files = st.sidebar.file_uploader(
+    "Upload documents (PDF, DOCX, TXT, CSV)",
+    type=["pdf", "txt", "docx", "csv"],
+    accept_multiple_files=True,
+    help="Default per-file limit ~200MB; increase via .streamlit/config.toml if needed.",
+)
+with st.sidebar.expander("Upload debug"):
+    info = {
+        "type": type(uploaded_files).__name__,
+        "num_files": (len(uploaded_files) if isinstance(uploaded_files, list) else (1 if uploaded_files else 0)),
+        "names": ([f.name for f in uploaded_files] if isinstance(uploaded_files, list) else ([uploaded_files.name] if uploaded_files else [])),
+    }
+    st.write(info)
+url_input = st.sidebar.text_area("Web URLs (one per line)", value="", height=120)
+web_docs = []
+if url_input.strip():
+    urls = [u.strip() for u in url_input.splitlines() if u.strip()]
+    with st.sidebar.spinner("Fetching web content..."):
+        for u in urls:
+            web_docs += fetch_web_text(u)
+file_docs = load_documents(uploaded_files) if uploaded_files else []
+all_docs = file_docs + web_docs
+if all_docs:
+    st.success(f"{len(all_docs)} document(s) loaded from files and URLs.")
+    with st.spinner("Chunking and embedding..."):
+        st.session_state.chunks = chunk_documents(all_docs, chunk_size=1000, chunk_overlap=120)
+        st.session_state.embedder, st.session_state.faiss_index = build_embeddings_index(st.session_state.chunks)
+    st.write(f"{len(st.session_state.chunks)} chunks created and indexed.")
+else:
+    st.info("Add documents or URLs in the sidebar to start.")
+# ---------- Chat UI ----------
+for m in st.session_state.messages:
+    with st.chat_message(m["role"]):
+        st.markdown(m["content"])
+user_input = st.chat_input("Ask about the loaded documents...")
+if user_input:
+    st.session_state.messages.append({"role": "user", "content": user_input})
+    with st.chat_message("user"):
+        st.markdown(user_input)
+    with st.chat_message("assistant"):
+        with st.spinner("Thinking..."):
+            if st.session_state.chunks:
+                llm = load_llm()
+                results = retrieve(user_input, st.session_state.embedder, st.session_state.faiss_index, st.session_state.chunks, top_k=4)
+                context_chunks = [r["chunk"] for r in results]
+                answer = answer_with_llm(context_chunks, user_input, llm)
+                st.markdown(answer)
+                sources = "\n".join(f"[{r['chunk']['chunk_id']} from {r['chunk']['source']}]" for r in results) or "No sources (no matches)."
+                with st.expander("Sources"):
+                    st.code(sources)
+            else:
+                answer = "No documents indexed yet. Add files or URLs in the sidebar and try again."
+                st.warning(answer)
+    st.session_state.messages.append({"role": "assistant", "content": answer})
+st.caption("Chat RAG • Mistral-7B (open), FAISS, SentenceTransformers, and Web URLs • Streamlit chat UI")