Spaces:
Sleeping
Sleeping
Added vdb-v3-wikisplitter metadata
Browse files
app.py
CHANGED
|
@@ -11,7 +11,7 @@ MAX_NEW_TOKENS = 700
|
|
| 11 |
SHOW_MODEL_PARAMETERS_IN_UI = os.environ.get("SHOW_MODEL_PARAMETERS_IN_UI", default="False") == "True"
|
| 12 |
import logging
|
| 13 |
|
| 14 |
-
logging.basicConfig(level=logging.INFO, format='[%(asctime)s][%(levelname)s] - %(message)s')
|
| 15 |
|
| 16 |
setup()
|
| 17 |
|
|
|
|
| 11 |
SHOW_MODEL_PARAMETERS_IN_UI = os.environ.get("SHOW_MODEL_PARAMETERS_IN_UI", default="False") == "True"
|
| 12 |
import logging
|
| 13 |
|
| 14 |
+
logging.basicConfig(level=logging.INFO, format='[%(asctime)s][%(name)s][%(levelname)s] - %(message)s')
|
| 15 |
|
| 16 |
setup()
|
| 17 |
|
rag.py
CHANGED
|
@@ -9,9 +9,10 @@ from huggingface_hub import snapshot_download, InferenceClient
|
|
| 9 |
from langchain_community.vectorstores import FAISS
|
| 10 |
from langchain_community.embeddings import HuggingFaceEmbeddings
|
| 11 |
|
|
|
|
| 12 |
|
| 13 |
-
logging.basicConfig(level=logging.INFO, format='[%(asctime)s][%(levelname)s] - %(message)s')
|
| 14 |
# logging.getLogger().setLevel(logging.INFO)
|
|
|
|
| 15 |
|
| 16 |
|
| 17 |
class RAG:
|
|
@@ -99,6 +100,15 @@ class RAG:
|
|
| 99 |
documents_retrieved = self.vectore_store.similarity_search_with_score_by_vector(embedding, k=number_of_contexts)
|
| 100 |
logging.info(f"Documents retrieved: {len(documents_retrieved)}")
|
| 101 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 102 |
|
| 103 |
# Reranking
|
| 104 |
# ==============================================================================================================
|
|
@@ -137,6 +147,7 @@ class RAG:
|
|
| 137 |
|
| 138 |
return response.json()[0]["generated_text"].split("###")[-1][8:]
|
| 139 |
|
|
|
|
| 140 |
def predict_completion(self, instruction, context, model_parameters):
|
| 141 |
|
| 142 |
client = OpenAI(
|
|
@@ -183,22 +194,39 @@ class RAG:
|
|
| 183 |
|
| 184 |
return text_context, full_context, source_context
|
| 185 |
|
|
|
|
| 186 |
def get_response(self, prompt: str, model_parameters: dict) -> str:
|
| 187 |
try:
|
| 188 |
docs = self.get_context(prompt, model_parameters["NUM_CHUNKS"])
|
| 189 |
|
| 190 |
response = ""
|
| 191 |
|
| 192 |
-
for i, (doc, score) in enumerate(docs):
|
| 193 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 194 |
response += "\n\n" + "="*100
|
| 195 |
response += f"\nDocument {i+1}"
|
| 196 |
response += "\n" + "="*100
|
| 197 |
response += f"\nScore: {score:.5f}"
|
| 198 |
response += f"\nTitle: {doc.metadata['title']}"
|
|
|
|
| 199 |
response += f"\nURL: {doc.metadata['url']}"
|
| 200 |
-
response += f"\nID: {doc.metadata['id']}"
|
| 201 |
-
response += f"\nStart index: {doc.metadata['start_index']}"
|
| 202 |
# response += f"\nSource: {doc.metadata['src']}"
|
| 203 |
# response += f"\nRedirected: {doc.metadata['redirected']}"
|
| 204 |
# url = doc.metadata['url']
|
|
|
|
| 9 |
from langchain_community.vectorstores import FAISS
|
| 10 |
from langchain_community.embeddings import HuggingFaceEmbeddings
|
| 11 |
|
| 12 |
+
from termcolor import cprint
|
| 13 |
|
|
|
|
| 14 |
# logging.getLogger().setLevel(logging.INFO)
|
| 15 |
+
logging.basicConfig(level=logging.INFO, format='[%(asctime)s][%(name)s][%(levelname)s] - %(message)s')
|
| 16 |
|
| 17 |
|
| 18 |
class RAG:
|
|
|
|
| 100 |
documents_retrieved = self.vectore_store.similarity_search_with_score_by_vector(embedding, k=number_of_contexts)
|
| 101 |
logging.info(f"Documents retrieved: {len(documents_retrieved)}")
|
| 102 |
|
| 103 |
+
for i, (doc, score) in enumerate(documents_retrieved):
|
| 104 |
+
logging.info(f"Document {i+1}:")
|
| 105 |
+
logging.info(f"Score: {score:.5f}")
|
| 106 |
+
logging.info(f"Title: {doc.metadata}")
|
| 107 |
+
# logging.info(f"Source: {doc.metadata['src']}")
|
| 108 |
+
# logging.info(f"Redirected: {doc.metadata['redirected']}")
|
| 109 |
+
# url = doc.metadata['url']
|
| 110 |
+
# logging.info(f"Revision ID: {url}")
|
| 111 |
+
# logging.info(f'URL: <a href="{url}" target="_blank">{url}</a><br>')
|
| 112 |
|
| 113 |
# Reranking
|
| 114 |
# ==============================================================================================================
|
|
|
|
| 147 |
|
| 148 |
return response.json()[0]["generated_text"].split("###")[-1][8:]
|
| 149 |
|
| 150 |
+
|
| 151 |
def predict_completion(self, instruction, context, model_parameters):
|
| 152 |
|
| 153 |
client = OpenAI(
|
|
|
|
| 194 |
|
| 195 |
return text_context, full_context, source_context
|
| 196 |
|
| 197 |
+
|
| 198 |
def get_response(self, prompt: str, model_parameters: dict) -> str:
|
| 199 |
try:
|
| 200 |
docs = self.get_context(prompt, model_parameters["NUM_CHUNKS"])
|
| 201 |
|
| 202 |
response = ""
|
| 203 |
|
| 204 |
+
for i, (doc, score) in enumerate(docs):
|
| 205 |
|
| 206 |
+
# ----------------------------------------------------------------------------
|
| 207 |
+
# vector_db__BAAI__bge-m3__cfg-v3-wikisplitter => metadata
|
| 208 |
+
# ----------------------------------------------------------------------------
|
| 209 |
+
# {
|
| 210 |
+
# 'document_id': '1535',
|
| 211 |
+
# 'title': 'Intel·ligència artificial',
|
| 212 |
+
# 'url': 'https://ca.wikipedia.org/wiki?curid=1535',
|
| 213 |
+
# 'language': 'ca',
|
| 214 |
+
# 'src': '/gpfs/projects/bsc88/apps/projects/__wiki-rag__/_data/json_extractor/cawiki-20250501/wiki_00.jsonl',
|
| 215 |
+
# 'section_title': 'Centres tecnològics a Catalunya i les seves aportacions i investigacions en la IA.',
|
| 216 |
+
# 'section_id': 32,
|
| 217 |
+
# 'section_len': 3403,
|
| 218 |
+
# 'split_level': 'section'
|
| 219 |
+
# }
|
| 220 |
+
# ----------------------------------------------------------------------------
|
| 221 |
response += "\n\n" + "="*100
|
| 222 |
response += f"\nDocument {i+1}"
|
| 223 |
response += "\n" + "="*100
|
| 224 |
response += f"\nScore: {score:.5f}"
|
| 225 |
response += f"\nTitle: {doc.metadata['title']}"
|
| 226 |
+
response += f"\nSection title: {doc.metadata['section_title']}"
|
| 227 |
response += f"\nURL: {doc.metadata['url']}"
|
| 228 |
+
response += f"\nID: {doc.metadata['document_id']}"
|
| 229 |
+
# response += f"\nStart index: {doc.metadata['start_index']}"
|
| 230 |
# response += f"\nSource: {doc.metadata['src']}"
|
| 231 |
# response += f"\nRedirected: {doc.metadata['redirected']}"
|
| 232 |
# url = doc.metadata['url']
|