Rerank documents and force summary for policy makers
climateqa/engine/chains/retrieve_documents.py
CHANGED
(The removed side of this hunk rendered only partially; unrecoverable removed lines are elided with "...".)

@@ -57,107 +57,135 @@ def query_retriever(question):
     """Just a dummy tool to simulate the retriever query"""
     return question
 
-        ...
-        remaining_questions = state["remaining_questions"][1:]
-
-        # ToolMessage(f"Retrieving documents for question: {current_question['question']}",tool_call_id = "retriever")
-
-        # # There are several options to get the final top k
-        # # Option 1 - Get 100 documents by question and rerank by question
-        # # Option 2 - Get 100/n documents by question and rerank the total
-        # if rerank_by_question:
-        #     k_by_question = divide_into_parts(k_final,len(questions))
-        if "documents" in state and state["documents"] is not None:
-            docs = state["documents"]
-        else:
-            docs = []
-        ...
-        index = current_question["index"]
-
-        await log_event({"question":question,"sources":sources,"index":index},"log_retriever",config)
-        ...
-            sources = sources,
-            min_size = 200,
-            k_summary = k_summary,
-            k_total = k_before_reranking,
-            threshold = 0.5,
-        )
-        docs_question = await retriever.ainvoke(question,config)
-        ...
-            keywords = keywords_extraction.invoke(question)["keywords"]
-            openalex_query = " AND ".join(keywords)
-        ...
-            retriever_openalex = OpenAlexRetriever(
-                min_year = state.get("min_year",1960),
-                max_year = state.get("max_year",None),
-                k = k_before_reranking
-            )
-            docs_question = await retriever_openalex.ainvoke(openalex_query,config)
-        else:
-            raise Exception(f"Index {index} not found in the routing index")
-
-        # Rerank
-        if reranker is not None:
-            with suppress_output():
-                docs_question = rerank_docs(reranker,docs_question,question)
-        else:
-            # Add a default reranking score
-            for doc in docs_question:
-                doc.metadata["reranking_score"] = doc.metadata["similarity_score"]
-
-        # If rerank by question we select the top documents for each question
-        if rerank_by_question:
-            docs_question = docs_question[:k_by_question]
-
-        # Add sources used in the metadata
-        for doc in docs_question:
-            doc.metadata["sources_used"] = sources
-            doc.metadata["question_used"] = question
-            doc.metadata["index_used"] = index
-
-        # Add to the list of docs
-        docs.extend(docs_question)
-
-        # Sorting the list in descending order by rerank_score
-        docs = sorted(docs, key=lambda x: x.metadata["reranking_score"], reverse=True)
-        new_state = {"documents":docs,"remaining_questions":remaining_questions}
-        return new_state
-
-    return retrieve_documents
+def _add_sources_used_in_metadata(docs,sources,question,index):
+    for doc in docs:
+        doc.metadata["sources_used"] = sources
+        doc.metadata["question_used"] = question
+        doc.metadata["index_used"] = index
+    return docs
+
+def _get_k_summary_by_question(n_questions):
+    if n_questions == 0:
+        return 0
+    elif n_questions == 1:
+        return 5
+    elif n_questions == 2:
+        return 3
+    elif n_questions == 3:
+        return 2
+    else:
+        return 1
+
+# The chain callback is not necessary, but it propagates the langchain callbacks to the astream_events logger to display intermediate results
+# @chain
+async def retrieve_documents(state,config, vectorstore,reranker,llm,rerank_by_question=True, k_final=15, k_before_reranking=100, k_summary=5):
+    print("---- Retrieve documents ----")
+
+    # Get the documents from the state
+    if "documents" in state and state["documents"] is not None:
+        docs = state["documents"]
+    else:
+        docs = []
+    # Get the related_content from the state
+    if "related_content" in state and state["related_content"] is not None:
+        related_content = state["related_content"]
+    else:
+        related_content = []
+
+    # Get the current question
+    current_question = state["remaining_questions"][0]
+    remaining_questions = state["remaining_questions"][1:]
+
+    k_by_question = k_final // state["n_questions"]
+    k_summary_by_question = _get_k_summary_by_question(state["n_questions"])
+
+    sources = current_question["sources"]
+    question = current_question["question"]
+    index = current_question["index"]
+
+    await log_event({"question":question,"sources":sources,"index":index},"log_retriever",config)
+
+    if index == "Vector":
+        # Search the document store using the retriever
+        # Configure high top k for further reranking step
+        retriever = ClimateQARetriever(
+            vectorstore=vectorstore,
+            sources = sources,
+            min_size = 200,
+            k_summary = k_summary_by_question,
+            k_total = k_before_reranking,
+            threshold = 0.5,
+        )
+        docs_question_dict = await retriever.ainvoke(question,config)
+
+    # elif index == "OpenAlex":
+    #     # keyword extraction
+    #     keywords_extraction = make_keywords_extraction_chain(llm)
+    #     keywords = keywords_extraction.invoke(question)["keywords"]
+    #     openalex_query = " AND ".join(keywords)
+    #     print(f"... OpenAlex query: {openalex_query}")
+    #     retriever_openalex = OpenAlexRetriever(
+    #         min_year = state.get("min_year",1960),
+    #         max_year = state.get("max_year",None),
+    #         k = k_before_reranking
+    #     )
+    #     docs_question = await retriever_openalex.ainvoke(openalex_query,config)
+
+    # else:
+    #     raise Exception(f"Index {index} not found in the routing index")
+
+    # Rerank
+    if reranker is not None:
+        with suppress_output():
+            docs_question_summary_reranked = rerank_docs(reranker,docs_question_dict["docs_summaries"],question)
+            docs_question_fulltext_reranked = rerank_docs(reranker,docs_question_dict["docs_full"],question)
+            docs_question_images_reranked = rerank_docs(reranker,docs_question_dict["docs_images"],question)
+            if rerank_by_question:
+                docs_question_summary_reranked = sorted(docs_question_summary_reranked, key=lambda x: x.metadata["reranking_score"], reverse=True)
+                docs_question_fulltext_reranked = sorted(docs_question_fulltext_reranked, key=lambda x: x.metadata["reranking_score"], reverse=True)
+                docs_question_images_reranked = sorted(docs_question_images_reranked, key=lambda x: x.metadata["reranking_score"], reverse=True)
+    else:
+        docs_question = docs_question_dict["docs_summaries"] + docs_question_dict["docs_full"]
+        # Add a default reranking score
+        for doc in docs_question:
+            doc.metadata["reranking_score"] = doc.metadata["similarity_score"]
+
+    docs_question = docs_question_summary_reranked + docs_question_fulltext_reranked
+    docs_question = docs_question[:k_by_question]
+    images_question = docs_question_images_reranked[:k_by_question]
+
+    if reranker is not None and rerank_by_question:
+        docs_question = sorted(docs_question, key=lambda x: x.metadata["reranking_score"], reverse=True)
+
+    # Add sources used in the metadata
+    docs_question = _add_sources_used_in_metadata(docs_question,sources,question,index)
+    images_question = _add_sources_used_in_metadata(images_question,sources,question,index)
+
+    # Add to the list of docs
+    docs.extend(docs_question)
+    related_content.extend(images_question)
+
+    new_state = {"documents":docs, "related_contents": related_content,"remaining_questions":remaining_questions}
+    return new_state
+
+
+def make_retriever_node(vectorstore,reranker,llm,rerank_by_question=True, k_final=15, k_before_reranking=100, k_summary=5):
+    @chain
+    async def retrieve_docs(state, config):
+        state = await retrieve_documents(state,config, vectorstore,reranker,llm,rerank_by_question, k_final, k_before_reranking, k_summary)
+        return state
+
+    return retrieve_docs
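How the new budgets play out: retrieve_documents now splits k_final evenly across questions with floor division, and _get_k_summary_by_question caps how many summary (SPM) chunks are requested per question. A quick sketch of the defaults (the helper is copied from the diff above; the loop is illustrative):

def _get_k_summary_by_question(n_questions):
    # Copied from the diff: the summary budget shrinks as questions multiply
    if n_questions == 0:
        return 0
    elif n_questions == 1:
        return 5
    elif n_questions == 2:
        return 3
    elif n_questions == 3:
        return 2
    else:
        return 1

k_final = 15  # default in retrieve_documents
for n_questions in range(1, 5):
    k_by_question = k_final // n_questions  # final docs kept per question
    print(n_questions, k_by_question, _get_k_summary_by_question(n_questions))
# 1 15 5
# 2 7 3
# 3 5 2
# 4 3 1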
climateqa/engine/chains/retriever.py
CHANGED
@@ -1,126 +1,126 @@
The whole module is commented out in this change: every line of the previous implementation now carries a leading "# ". The resulting file:

# import sys
# import os
# from contextlib import contextmanager

# from ..reranker import rerank_docs
# from ...knowledge.retriever import ClimateQARetriever


# def divide_into_parts(target, parts):
#     # Base value for each part
#     base = target // parts
#     # Remainder to distribute
#     remainder = target % parts
#     # List to hold the result
#     result = []

#     for i in range(parts):
#         if i < remainder:
#             # These parts get base value + 1
#             result.append(base + 1)
#         else:
#             # The rest get the base value
#             result.append(base)

#     return result


# @contextmanager
# def suppress_output():
#     # Open a null device
#     with open(os.devnull, 'w') as devnull:
#         # Store the original stdout and stderr
#         old_stdout = sys.stdout
#         old_stderr = sys.stderr
#         # Redirect stdout and stderr to the null device
#         sys.stdout = devnull
#         sys.stderr = devnull
#         try:
#             yield
#         finally:
#             # Restore stdout and stderr
#             sys.stdout = old_stdout
#             sys.stderr = old_stderr


# def make_retriever_node(vectorstore,reranker,rerank_by_question=True, k_final=15, k_before_reranking=100, k_summary=5):

#     def retrieve_documents(state):

#         POSSIBLE_SOURCES = ["IPCC","IPBES","IPOS"] # ,"OpenAlex"]
#         questions = state["questions"]

#         # Use sources from the user input or from the LLM detection
#         if "sources_input" not in state or state["sources_input"] is None:
#             sources_input = ["auto"]
#         else:
#             sources_input = state["sources_input"]
#         auto_mode = "auto" in sources_input

#         # There are several options to get the final top k
#         # Option 1 - Get 100 documents by question and rerank by question
#         # Option 2 - Get 100/n documents by question and rerank the total
#         if rerank_by_question:
#             k_by_question = divide_into_parts(k_final,len(questions))

#         docs = []

#         for i,q in enumerate(questions):

#             sources = q["sources"]
#             question = q["question"]

#             # If auto mode, we use the sources detected by the LLM
#             if auto_mode:
#                 sources = [x for x in sources if x in POSSIBLE_SOURCES]

#             # Otherwise, we use the config
#             else:
#                 sources = sources_input

#             # Search the document store using the retriever
#             # Configure high top k for further reranking step
#             retriever = ClimateQARetriever(
#                 vectorstore=vectorstore,
#                 sources = sources,
#                 # reports = ias_reports,
#                 min_size = 200,
#                 k_summary = k_summary,
#                 k_total = k_before_reranking,
#                 threshold = 0.5,
#             )
#             docs_question = retriever.get_relevant_documents(question)

#             # Rerank
#             if reranker is not None:
#                 with suppress_output():
#                     docs_question = rerank_docs(reranker,docs_question,question)
#             else:
#                 # Add a default reranking score
#                 for doc in docs_question:
#                     doc.metadata["reranking_score"] = doc.metadata["similarity_score"]

#             # If rerank by question we select the top documents for each question
#             if rerank_by_question:
#                 docs_question = docs_question[:k_by_question[i]]

#             # Add sources used in the metadata
#             for doc in docs_question:
#                 doc.metadata["sources_used"] = sources

#             # Add to the list of docs
#             docs.extend(docs_question)

#         # Sorting the list in descending order by rerank_score
#         # Then select the top k
#         docs = sorted(docs, key=lambda x: x.metadata["reranking_score"], reverse=True)
#         docs = docs[:k_final]

#         new_state = {"documents":docs}
#         return new_state

#     return retrieve_documents
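One behavioral change hidden in this deletion: the old divide_into_parts spread the remainder of k_final over the questions, while the new node in retrieve_documents.py uses plain floor division, so a few slots of the budget can go unused. A small illustration (the first function is a compact equivalent of the commented-out helper, not a verbatim copy):

def divide_into_parts(target, parts):
    # Compact equivalent of the now-commented helper: spread the remainder
    base, remainder = divmod(target, parts)
    return [base + 1 if i < remainder else base for i in range(parts)]

print(divide_into_parts(15, 4))  # [4, 4, 4, 3] -> sums to 15
print([15 // 4] * 4)             # [3, 3, 3, 3] -> sums to 12, three slots unused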
climateqa/engine/graph.py
CHANGED
@@ -40,6 +40,7 @@ class GraphState(TypedDict):
     min_year: int = 1960
     max_year: int = None
     documents: List[Document]
+    related_contents : Dict[str,Document]
     recommended_content : List[Document]
     # graphs_returned: Dict[str,str]
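A note for readers tracing the state: retrieve_documents reads incoming images from state["related_content"] (no trailing "s") but returns them under "related_contents", the key declared here, and it accumulates a plain list even though this field is annotated Dict[str,Document].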
climateqa/knowledge/retriever.py
CHANGED
@@ -11,6 +11,18 @@ from langchain_core.callbacks.manager import CallbackManagerForRetrieverRun
 from typing import List
 from pydantic import Field
 
+def _add_metadata_and_score(docs: List) -> Document:
+    # Add score to metadata
+    docs_with_metadata = []
+    for i,(doc,score) in enumerate(docs):
+        doc.page_content = doc.page_content.replace("\r\n"," ")
+        doc.metadata["similarity_score"] = score
+        doc.metadata["content"] = doc.page_content
+        doc.metadata["page_number"] = int(doc.metadata["page_number"]) + 1
+        # doc.page_content = f"""Doc {i+1} - {doc.metadata['short_name']}: {doc.page_content}"""
+        docs_with_metadata.append(doc)
+    return docs_with_metadata
+
 class ClimateQARetriever(BaseRetriever):
     vectorstore:VectorStore
     sources:list = ["IPCC","IPBES","IPOS"]

@@ -20,6 +32,7 @@ class ClimateQARetriever(BaseRetriever):
     k_total:int = 10
     namespace:str = "vectors",
     min_size:int = 200,
+
 
     def _get_relevant_documents(

@@ -43,6 +56,7 @@ class ClimateQARetriever(BaseRetriever):
         # Search for k_summary documents in the summaries dataset
         filters_summaries = {
             **filters,
+            "chunk_type":"text",
             "report_type": { "$in":["SPM"]},
         }

@@ -52,31 +66,36 @@ class ClimateQARetriever(BaseRetriever):
         # Search for k_total - k_summary documents in the full reports dataset
         filters_full = {
             **filters,
+            "chunk_type":"text",
             "report_type": { "$nin":["SPM"]},
         }
         k_full = self.k_total - len(docs_summaries)
         docs_full = self.vectorstore.similarity_search_with_score(query=query,filter = filters_full,k = k_full)
+
+        # Images
+        filters_image = {
+            **filters,
+            "chunk_type":"image"
+        }
+        docs_images = self.vectorstore.similarity_search_with_score(query=query,filter = filters_image,k = k_full)
 
         # Concatenate documents
-        docs = docs_summaries + docs_full
+        docs = docs_summaries + docs_full + docs_images
 
         # Filter if scores are below threshold
         docs = [x for x in docs if len(x[0].page_content) > self.min_size]
         # docs = [x for x in docs if x[1] > self.threshold]
 
-        ...
-            doc.metadata["content"] = doc.page_content
-            doc.metadata["page_number"] = int(doc.metadata["page_number"]) + 1
-            # doc.page_content = f"""Doc {i+1} - {doc.metadata['short_name']}: {doc.page_content}"""
-            results.append(doc)
-        ...
+        docs_summaries, docs_full, docs_images = _add_metadata_and_score(docs_summaries), _add_metadata_and_score(docs_full), _add_metadata_and_score(docs_images)
+
+        # Filter if length are below threshold
+        docs_summaries = [x for x in docs_summaries if len(x.page_content) > self.min_size]
+        docs_full = [x for x in docs_full if len(x.page_content) > self.min_size]
 
+        return {
+            "docs_summaries" : docs_summaries,
+            "docs_full" : docs_full,
+            "docs_images" : docs_images
+        }

(The removed side of the last hunk rendered only partially; unrecoverable removed lines are elided with "...".)
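With this change _get_relevant_documents returns three buckets instead of one flat list, and the node in retrieve_documents.py reranks each bucket separately. A minimal sketch of the no-reranker fallback path consuming that shape (the Document construction and scores are illustrative; the metadata handling mirrors the diff):

from langchain_core.documents import Document

docs_question_dict = {
    "docs_summaries": [Document(page_content="SPM chunk", metadata={"similarity_score": 0.9})],
    "docs_full": [Document(page_content="full-report chunk", metadata={"similarity_score": 0.7})],
    "docs_images": [Document(page_content="image caption", metadata={"similarity_score": 0.5})],
}

# Fallback branch from retrieve_documents: without a reranker, the similarity
# score doubles as the reranking score; image chunks stay in their own bucket.
docs_question = docs_question_dict["docs_summaries"] + docs_question_dict["docs_full"]
for doc in docs_question:
    doc.metadata["reranking_score"] = doc.metadata["similarity_score"]

k_by_question = 15 // 1  # k_final // n_questions for a single question
docs_question = docs_question[:k_by_question]
print([d.metadata["reranking_score"] for d in docs_question])  # [0.9, 0.7]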
sandbox/20241104 - CQA - StepByStep CQA.ipynb
CHANGED
The diff for this file is too large to render. See raw diff.