climate-question-answering-test

Sleeping

App Files Community

timeki committed on Dec 19, 2024

Commit

5bf8044

1 Parent(s): f7ebe84

fix figures retrieval

Browse files

Files changed (4) hide show

app.py +38 -16
climateqa/engine/chains/retrieve_documents.py +6 -3
climateqa/engine/graph.py +2 -2
front/utils.py +20 -16

app.py CHANGED Viewed

@@ -113,7 +113,7 @@ vectorstore = get_pinecone_vectorstore(embeddings_function, index_name = os.gete
 vectorstore_graphs = get_pinecone_vectorstore(embeddings_function, index_name = os.getenv("PINECONE_API_INDEX_OWID"), text_key="description")
 llm = get_llm(provider="openai",max_tokens = 1024,temperature = 0.0)
-reranker = get_reranker("nano")
 agent = make_graph_agent(llm=llm, vectorstore_ipcc=vectorstore, vectorstore_graphs=vectorstore_graphs, reranker=reranker)
@@ -142,7 +142,6 @@ async def chat(query, history, audience, sources, reports, relevant_content_sour
     docs = []
-    used_figures=[]
     related_contents = []
     docs_html = ""
     output_query = ""
@@ -165,7 +164,7 @@ async def chat(query, history, audience, sources, reports, relevant_content_sour
             if "langgraph_node" in event["metadata"]:
                 node = event["metadata"]["langgraph_node"]
-                if event["event"] == "on_chain_end" and event["name"] == "retrieve_documents" :# when documents are retrieved
                     docs, docs_html, history, used_documents, related_contents = handle_retrieved_documents(event, history, used_documents)
                 elif event["event"] == "on_chain_end" and node == "categorize_intent" and event["name"] == "_write": # when the query is transformed
@@ -321,10 +320,19 @@ with gr.Blocks(title="Climate Q&A", css_paths=os.getcwd()+ "/style.css", theme=t
                 with gr.Row(elem_id = "input-message"):
-                    textbox=gr.Textbox(placeholder="Ask me anything here!",show_label=False,scale=7,lines = 1,interactive = True,elem_id="input-textbox")
-                    config_button = gr.Button("",elem_id="config-button")
-                    # config_checkbox_button = gr.Checkbox(label = '⚙️', value="show",visible=True, interactive=True, elem_id="checkbox-config")
@@ -417,7 +425,9 @@ with gr.Blocks(title="Climate Q&A", css_paths=os.getcwd()+ "/style.css", theme=t
                         with gr.Tabs(elem_id = "group-subtabs") as tabs_recommended_content:
                             with gr.Tab("Figures",elem_id = "tab-figures",id = 3) as tab_figures:
-                                sources_raw = gr.State()
                                 with Modal(visible=False, elem_id="modal_figure_galery") as figure_modal:
                                     gallery_component = gr.Gallery(object_fit='scale-down',elem_id="gallery-component", height="80vh")
@@ -475,9 +485,9 @@ with gr.Blocks(title="Climate Q&A", css_paths=os.getcwd()+ "/style.css", theme=t
                 )
                 dropdown_external_sources = gr.CheckboxGroup(
-                    ["IPCC figures","OpenAlex", "OurWorldInData"],
                     label="Select database to search for relevant content",
-                    value=["IPCC figures"],
                     interactive=True,
                 )
@@ -633,15 +643,25 @@ with gr.Blocks(title="Climate Q&A", css_paths=os.getcwd()+ "/style.css", theme=t
         return gr.update(label = recommended_content_notif_label), gr.update(label = sources_notif_label), gr.update(label = figures_notif_label), gr.update(label = graphs_notif_label), gr.update(label = papers_notif_label)
     (textbox
-        .submit(start_chat, [textbox,chatbot, search_only], [textbox,tabs,chatbot],queue = False,api_name = "start_chat_textbox")
-        .then(chat, [textbox,chatbot,dropdown_audience, dropdown_sources,dropdown_reports, dropdown_external_sources, search_only] ,[chatbot,sources_textbox,output_query,output_language, sources_raw, current_graphs],concurrency_limit = 8,api_name = "chat_textbox")
-        .then(finish_chat, None, [textbox],api_name = "finish_chat_textbox")
-        # .then(update_sources_number_display, [sources_textbox, figures_cards, current_graphs,papers_html],[tab_sources, tab_figures, tab_graphs, tab_papers] )
     )
     (examples_hidden
         .change(start_chat, [examples_hidden,chatbot, search_only], [textbox,tabs,chatbot],queue = False,api_name = "start_chat_examples")
-        .then(chat, [examples_hidden,chatbot,dropdown_audience, dropdown_sources,dropdown_reports, dropdown_external_sources, search_only] ,[chatbot,sources_textbox,output_query,output_language, sources_raw, current_graphs],concurrency_limit = 8,api_name = "chat_textbox")
         .then(finish_chat, None, [textbox],api_name = "finish_chat_examples")
         # .then(update_sources_number_display, [sources_textbox, figures_cards, current_graphs,papers_html],[tab_sources, tab_figures, tab_graphs, tab_papers] )
     )
@@ -654,7 +674,7 @@ with gr.Blocks(title="Climate Q&A", css_paths=os.getcwd()+ "/style.css", theme=t
         return [gr.update(visible=visible_bools[i]) for i in range(len(samples))]
-    sources_raw.change(process_figures, inputs=[sources_raw], outputs=[figures_cards, gallery_component])
     # update sources numbers
     sources_textbox.change(update_sources_number_display, [sources_textbox, figures_cards, current_graphs,papers_html],[tab_recommended_content, tab_sources, tab_figures, tab_graphs, tab_papers])
@@ -674,4 +694,6 @@ with gr.Blocks(title="Climate Q&A", css_paths=os.getcwd()+ "/style.css", theme=t
     demo.queue()
 demo.launch(ssr_mode=False)

 vectorstore_graphs = get_pinecone_vectorstore(embeddings_function, index_name = os.getenv("PINECONE_API_INDEX_OWID"), text_key="description")
 llm = get_llm(provider="openai",max_tokens = 1024,temperature = 0.0)
+reranker = get_reranker("large")
 agent = make_graph_agent(llm=llm, vectorstore_ipcc=vectorstore, vectorstore_graphs=vectorstore_graphs, reranker=reranker)
     docs = []
     related_contents = []
     docs_html = ""
     output_query = ""
             if "langgraph_node" in event["metadata"]:
                 node = event["metadata"]["langgraph_node"]
+                if event["event"] == "on_chain_end" and event["name"] == "retrieve_documents" and event["data"]["output"] != None:# when documents are retrieved
                     docs, docs_html, history, used_documents, related_contents = handle_retrieved_documents(event, history, used_documents)
                 elif event["event"] == "on_chain_end" and node == "categorize_intent" and event["name"] == "_write": # when the query is transformed
                 with gr.Row(elem_id = "input-message"):
+                    textbox = gr.Textbox(
+                        placeholder="Ask me anything here!",
+                        show_label=False,
+                        scale=12,
+                        lines=1,
+                        interactive=True,
+                        elem_id="input-textbox"
+                    )
+                    config_button = gr.Button(
+                        "",
+                        elem_id="config-button"
+                    )
                         with gr.Tabs(elem_id = "group-subtabs") as tabs_recommended_content:
                             with gr.Tab("Figures",elem_id = "tab-figures",id = 3) as tab_figures:
+                                sources_raw = gr.State([])
+                                new_figures = gr.State([])
+                                used_figures = gr.State([])
                                 with Modal(visible=False, elem_id="modal_figure_galery") as figure_modal:
                                     gallery_component = gr.Gallery(object_fit='scale-down',elem_id="gallery-component", height="80vh")
                 )
                 dropdown_external_sources = gr.CheckboxGroup(
+                    ["Figures (IPCC/IPBES)","Papers (OpenAlex)", "Graphs (OurWorldInData)"],
                     label="Select database to search for relevant content",
+                    value=["Figures (IPCC/IPBES)"],
                     interactive=True,
                 )
         return gr.update(label = recommended_content_notif_label), gr.update(label = sources_notif_label), gr.update(label = figures_notif_label), gr.update(label = graphs_notif_label), gr.update(label = papers_notif_label)
     (textbox
+        .submit(start_chat, [textbox, chatbot, search_only],
+                [textbox, tabs, chatbot],
+                queue=False,
+                api_name="start_chat_textbox")
+        .then(chat, [textbox, chatbot, dropdown_audience, dropdown_sources,
+                     dropdown_reports, dropdown_external_sources, search_only],
+              [chatbot, sources_textbox, output_query, output_language,
+               new_figures, current_graphs],
+              concurrency_limit=8,
+              api_name="chat_textbox")
+        .then(finish_chat, None, [textbox],
+              api_name="finish_chat_textbox")
     )
     (examples_hidden
         .change(start_chat, [examples_hidden,chatbot, search_only], [textbox,tabs,chatbot],queue = False,api_name = "start_chat_examples")
+        .then(chat, [examples_hidden,chatbot,dropdown_audience, dropdown_sources,dropdown_reports, dropdown_external_sources, search_only] ,[chatbot,sources_textbox,output_query,output_language, new_figures, current_graphs],concurrency_limit = 8,api_name = "chat_textbox")
         .then(finish_chat, None, [textbox],api_name = "finish_chat_examples")
         # .then(update_sources_number_display, [sources_textbox, figures_cards, current_graphs,papers_html],[tab_sources, tab_figures, tab_graphs, tab_papers] )
     )
         return [gr.update(visible=visible_bools[i]) for i in range(len(samples))]
+    new_figures.change(process_figures, inputs=[sources_raw, new_figures], outputs=[sources_raw, figures_cards, gallery_component])
     # update sources numbers
     sources_textbox.change(update_sources_number_display, [sources_textbox, figures_cards, current_graphs,papers_html],[tab_recommended_content, tab_sources, tab_figures, tab_graphs, tab_papers])
     demo.queue()
 demo.launch(ssr_mode=False)

climateqa/engine/chains/retrieve_documents.py CHANGED Viewed

@@ -87,7 +87,7 @@ def _get_k_images_by_question(n_questions):
     elif n_questions == 2:
         return 5
     elif n_questions == 3:
-        return 2
     else:
         return 1
@@ -98,7 +98,10 @@ def _add_metadata_and_score(docs: List) -> Document:
         doc.page_content = doc.page_content.replace("\r\n"," ")
         doc.metadata["similarity_score"] = score
         doc.metadata["content"] = doc.page_content
-        doc.metadata["page_number"] = int(doc.metadata["page_number"]) + 1
         # doc.page_content = f"""Doc {i+1} - {doc.metadata['short_name']}: {doc.page_content}"""
         docs_with_metadata.append(doc)
     return docs_with_metadata
@@ -222,7 +225,7 @@ async def retrieve_documents(state,config, vectorstore,reranker,llm,rerank_by_qu
     else:
         related_content = []
-    search_figures = "IPCC figures" in state["relevant_content_sources"]
     search_only = state["search_only"]
     # Get the current question

     elif n_questions == 2:
         return 5
     elif n_questions == 3:
+        return 3
     else:
         return 1
         doc.page_content = doc.page_content.replace("\r\n"," ")
         doc.metadata["similarity_score"] = score
         doc.metadata["content"] = doc.page_content
+        if doc.metadata["page_number"] != "N/A":
+            doc.metadata["page_number"] = int(doc.metadata["page_number"]) + 1
+        else:
+            doc.metadata["page_number"] = 1
         # doc.page_content = f"""Doc {i+1} - {doc.metadata['short_name']}: {doc.page_content}"""
         docs_with_metadata.append(doc)
     return docs_with_metadata
     else:
         related_content = []
+    search_figures = "Figures (IPCC/IPBES)" in state["relevant_content_sources"]
     search_only = state["search_only"]
     # Get the current question

climateqa/engine/graph.py CHANGED Viewed

@@ -36,7 +36,7 @@ class GraphState(TypedDict):
     answer: str
     audience: str = "experts"
     sources_input: List[str] = ["IPCC","IPBES"]
-    relevant_content_sources: List[str] = ["IPCC figures"]
     sources_auto: bool = True
     min_year: int = 1960
     max_year: int = None
@@ -82,7 +82,7 @@ def route_based_on_relevant_docs(state,threshold_docs=0.2):
         return "answer_rag_no_docs"
 def route_retrieve_documents(state):
-    if state["search_only"] :
         return END
     elif len(state["remaining_questions"]) > 0:
         return "retrieve_documents"

     answer: str
     audience: str = "experts"
     sources_input: List[str] = ["IPCC","IPBES"]
+    relevant_content_sources: List[str] = ["Figures (IPCC/IPBES)"]
     sources_auto: bool = True
     min_year: int = 1960
     max_year: int = None
         return "answer_rag_no_docs"
 def route_retrieve_documents(state):
+    if len(state["remaining_questions"]) == 0 and state["search_only"] :
         return END
     elif len(state["remaining_questions"]) > 0:
         return "retrieve_documents"

front/utils.py CHANGED Viewed

@@ -39,25 +39,29 @@ def parse_output_llm_with_sources(output:str)->str:
     content_parts = "".join(parts)
     return content_parts
-def process_figures(docs:list)->tuple:
-    gallery=[]
-    used_figures =[]
     figures = '<div class="figures-container"><p></p> </div>'
     if docs == []:
-        return figures, gallery
     docs_figures = [d for d in docs if d.metadata["chunk_type"] == "image"]
-    for i, doc in enumerate(docs_figures):
-        if doc.metadata["chunk_type"] == "image":
-            if doc.metadata["figure_code"] != "N/A":
-                title = f"{doc.metadata['figure_code']} - {doc.metadata['short_name']}"
-            else:
-                title = f"{doc.metadata['short_name']}"
-            if title not in used_figures:
-                used_figures.append(title)
                 try:
-                    key = f"Image {i+1}"
                     image_path = doc.metadata["image_path"].split("documents/")[1]
                     img = get_image_from_azure_blob_storage(image_path)
@@ -70,12 +74,12 @@ def process_figures(docs:list)->tuple:
                     img_str = base64.b64encode(buffered.getvalue()).decode()
-                    figures = figures + make_html_figure_sources(doc, i, img_str)
                     gallery.append(img)
                 except Exception as e:
-                    print(f"Skipped adding image {i} because of {e}")
-    return figures, gallery
 def generate_html_graphs(graphs:list)->str:

     content_parts = "".join(parts)
     return content_parts
+def process_figures(docs:list, new_figures:list)->tuple:
+    docs = docs + new_figures
     figures = '<div class="figures-container"><p></p> </div>'
+    gallery = []
+    used_figures = []
     if docs == []:
+        return figures, gallery, used_figures
     docs_figures = [d for d in docs if d.metadata["chunk_type"] == "image"]
+    for i_doc, doc in enumerate(docs_figures):
+        if doc.metadata["chunk_type"] == "image":
+            path = doc.metadata["image_path"]
+            if path not in used_figures:
+                used_figures.append(path)
+                figure_number = len(used_figures)
                 try:
+                    key = f"Image {figure_number}"
                     image_path = doc.metadata["image_path"].split("documents/")[1]
                     img = get_image_from_azure_blob_storage(image_path)
                     img_str = base64.b64encode(buffered.getvalue()).decode()
+                    figures = figures + make_html_figure_sources(doc, figure_number, img_str)
                     gallery.append(img)
                 except Exception as e:
+                    print(f"Skipped adding image {figure_number} because of {e}")
+    return docs, figures, gallery
 def generate_html_graphs(graphs:list)->str: