Spaces:
Sleeping
Sleeping
Commit
·
63c9deb
1
Parent(s):
9a43597
included weeds into insects identifier as well; and removed domain dropdown
Browse files- app.py +18 -48
- app_database_prep.py +18 -8
- vector-databases-deployed/db5-agllm-data-isu-field-insects-all-species/chroma.sqlite3 +2 -2
- vector-databases-deployed/db5-agllm-data-isu-field-insects-all-species/{34b62619-98c9-4834-894c-fbbb81d34690 → e82d58e5-16f1-41a6-9289-211464329861}/data_level0.bin +2 -2
- vector-databases-deployed/db5-agllm-data-isu-field-insects-all-species/{34b62619-98c9-4834-894c-fbbb81d34690 → e82d58e5-16f1-41a6-9289-211464329861}/header.bin +1 -1
- vector-databases-deployed/db5-agllm-data-isu-field-insects-all-species/{34b62619-98c9-4834-894c-fbbb81d34690 → e82d58e5-16f1-41a6-9289-211464329861}/length.bin +2 -2
- vector-databases-deployed/db5-agllm-data-isu-field-insects-all-species/{34b62619-98c9-4834-894c-fbbb81d34690 → e82d58e5-16f1-41a6-9289-211464329861}/link_lists.bin +0 -0
- vector-databases-deployed/db5-agllm-data-isu-field-weeds-all-species/chroma.sqlite3 +2 -2
app.py
CHANGED
|
@@ -78,11 +78,9 @@ def get_species_list_from_db(db_name):
|
|
| 78 |
|
| 79 |
|
| 80 |
# default_persist_directory = './db5' # For deployement
|
| 81 |
-
|
| 82 |
-
default_persist_directory_weeds='./vector-databases-deployed/db5-agllm-data-isu-field-weeds-all-species'
|
| 83 |
|
| 84 |
-
|
| 85 |
-
species_list_weeds=get_species_list_from_db(default_persist_directory_weeds)
|
| 86 |
# default_persist_directory = 'vector-databases/db5-pre-completion' # For Development
|
| 87 |
csv_filepath1 = "./agllm-data/corrected/Corrected_supplemented-insect_data-2500-sorted.xlsx"
|
| 88 |
csv_filepath2 = "./agllm-data/corrected/Corrected_supplemented-insect_data-remaining.xlsx"
|
|
@@ -251,7 +249,7 @@ filter = {
|
|
| 251 |
# )
|
| 252 |
#
|
| 253 |
|
| 254 |
-
def initialize_qa_chain(specie_selector, application_mode, model_name, region, database_persistent_directory=
|
| 255 |
# Add helper function for India info (kept for potential future use, but removed from RAG prompt)
|
| 256 |
def read_and_format_filtered_csv_better(dataframe_given, insect_specie):
|
| 257 |
filtered_data = dataframe_given[dataframe_given['species'] == insect_specie]
|
|
@@ -325,7 +323,7 @@ def initialize_qa_chain(specie_selector, application_mode, model_name, region, d
|
|
| 325 |
availability_message = f"Information for **{specie_selector}** is available in region(s): **{', '.join(available_regions_list)}**."
|
| 326 |
else:
|
| 327 |
available_regions_list = []
|
| 328 |
-
availability_message = f"No regional information found for **{specie_selector}** in the
|
| 329 |
except Exception as e:
|
| 330 |
print(f"Error checking region availability: {e}")
|
| 331 |
available_regions_list = []
|
|
@@ -401,7 +399,7 @@ def initialize_qa_chain(specie_selector, application_mode, model_name, region, d
|
|
| 401 |
|
| 402 |
# Updated prompt template for multi-part response with region-specific contexts
|
| 403 |
general_system_template = f"""
|
| 404 |
-
You are an AI assistant specialized in providing information about
|
| 405 |
|
| 406 |
The following context has been retrieved from a database organized by region:
|
| 407 |
|
|
@@ -454,7 +452,7 @@ IMPORTANT:
|
|
| 454 |
4. Apply this language constraint: {language_constraint}
|
| 455 |
5. Keep your summaries concise and directly related to the user's question.
|
| 456 |
|
| 457 |
-
User Question about {specie_selector}
|
| 458 |
"""
|
| 459 |
|
| 460 |
class RegionFormattingLLMChain:
|
|
@@ -530,15 +528,6 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
| 530 |
with gr.Row():
|
| 531 |
with gr.Column(scale=1):
|
| 532 |
with gr.Row():
|
| 533 |
-
domain_name = gr.Dropdown(
|
| 534 |
-
list(["Insects", "Weeds"]),
|
| 535 |
-
value="Insects",
|
| 536 |
-
label="Domain",
|
| 537 |
-
info="Select Domain",
|
| 538 |
-
interactive=True,
|
| 539 |
-
scale=1,
|
| 540 |
-
visible=True
|
| 541 |
-
)
|
| 542 |
region_selector = gr.Dropdown(
|
| 543 |
list(["United States", "India", "Africa"]), # Updated regions
|
| 544 |
value="United States", # Updated default
|
|
@@ -551,8 +540,8 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
| 551 |
|
| 552 |
# Model selection
|
| 553 |
specie_selector = gr.Dropdown(
|
| 554 |
-
list(set(
|
| 555 |
-
value=
|
| 556 |
label="Species",
|
| 557 |
info="Select the Species",
|
| 558 |
interactive=True,
|
|
@@ -578,7 +567,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
| 578 |
scale=1,
|
| 579 |
visible=True
|
| 580 |
)
|
| 581 |
-
region_availability_display = gr.Markdown(value="Select species
|
| 582 |
|
| 583 |
with gr.Column(scale=2):
|
| 584 |
# User input prompt text field
|
|
@@ -599,13 +588,13 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
| 599 |
return history + [["Invalid prompts - user prompt cannot be empty", None]]
|
| 600 |
|
| 601 |
# Chatbot logic for configuration, sending the prompts, rendering the streamed back generations, etc.
|
| 602 |
-
def bot(model_name, application_mode, user_prompt_message, history, messages_history, qa_chain,
|
| 603 |
if qa_chain == None:
|
| 604 |
# Initial QA chain setup if not already done (uses default species for the selected domain)
|
| 605 |
-
initial_species =
|
| 606 |
# Need to handle the tuple returned by init_qa_chain now
|
| 607 |
# Use the currently selected region for initialization if qa_chain is None
|
| 608 |
-
qa_chain, _ = init_qa_chain(initial_species, application_mode, model_name,
|
| 609 |
|
| 610 |
history[-1][1] = "" # Placeholder for the answer
|
| 611 |
|
|
@@ -640,19 +629,12 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
| 640 |
def input_cleanup():
|
| 641 |
return ""
|
| 642 |
|
| 643 |
-
def init_qa_chain(specie_selector, application_mode, model_name,
|
| 644 |
-
print(f"--- init_qa_chain wrapper called
|
| 645 |
qa_chain_instance = None
|
| 646 |
availability_msg = "Error initializing QA chain."
|
| 647 |
try:
|
| 648 |
-
|
| 649 |
-
qa_chain_instance, availability_msg = initialize_qa_chain(specie_selector, application_mode, model_name, region, default_persist_directory_insects, domain_name) # Removed use_rag
|
| 650 |
-
elif domain_name=="Weeds":
|
| 651 |
-
qa_chain_instance, availability_msg = initialize_qa_chain(specie_selector, application_mode, model_name, region, default_persist_directory_weeds, domain_name) # Removed use_rag
|
| 652 |
-
else:
|
| 653 |
-
print("No Appropriate Chain Selected")
|
| 654 |
-
availability_msg = "Invalid domain selected."
|
| 655 |
-
# Return None for chain and the message
|
| 656 |
except Exception as e:
|
| 657 |
print(f"Error in init_qa_chain wrapper: {e}")
|
| 658 |
availability_msg = f"Error initializing: {e}"
|
|
@@ -660,7 +642,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
| 660 |
return qa_chain_instance, availability_msg # Return both chain and message
|
| 661 |
|
| 662 |
# Update QA chain AND availability message when relevant inputs change
|
| 663 |
-
inputs_for_qa_chain = [specie_selector, application_mode, model_name,
|
| 664 |
outputs_for_qa_chain = [qa_chain_state, region_availability_display]
|
| 665 |
|
| 666 |
# specie_selector.change(init_qa_chain, inputs=inputs_for_qa_chain, outputs=outputs_for_qa_chain)
|
|
@@ -672,20 +654,8 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
| 672 |
model_name.change(init_qa_chain, inputs=inputs_for_qa_chain, outputs=outputs_for_qa_chain)
|
| 673 |
region_selector.change(init_qa_chain, inputs=inputs_for_qa_chain, outputs=outputs_for_qa_chain)
|
| 674 |
application_mode.change(init_qa_chain, inputs=inputs_for_qa_chain, outputs=outputs_for_qa_chain)
|
| 675 |
-
domain_name.change(init_qa_chain, inputs=inputs_for_qa_chain, outputs=outputs_for_qa_chain)
|
| 676 |
|
| 677 |
#####
|
| 678 |
-
def update_species_list(domain):
|
| 679 |
-
if domain == "Insects":
|
| 680 |
-
return gr.Dropdown( species_list_insects, value=species_list_insects[0], label="Species", info="Select the Species", interactive=True, scale=1, visible=True )
|
| 681 |
-
elif domain == "Weeds":
|
| 682 |
-
return gr.Dropdown( species_list_weeds, value=species_list_weeds[0], label="Species", info="Select the Species", interactive=True, scale=1, visible=True )
|
| 683 |
-
|
| 684 |
-
domain_name.change(
|
| 685 |
-
update_species_list,
|
| 686 |
-
inputs=[domain_name],
|
| 687 |
-
outputs=[specie_selector]
|
| 688 |
-
)
|
| 689 |
|
| 690 |
# When the user clicks Enter and the user message is submitted
|
| 691 |
user_prompt_message.submit(
|
|
@@ -695,7 +665,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
| 695 |
queue=False
|
| 696 |
).then(
|
| 697 |
bot,
|
| 698 |
-
[model_name, application_mode, user_prompt_message, chatbot, state, qa_chain_state,
|
| 699 |
[chatbot, state]
|
| 700 |
).then(input_cleanup,
|
| 701 |
[],
|
|
@@ -711,7 +681,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
| 711 |
queue=False
|
| 712 |
).then(
|
| 713 |
bot,
|
| 714 |
-
[model_name, application_mode, user_prompt_message, chatbot, state, qa_chain_state,
|
| 715 |
[chatbot, state]
|
| 716 |
).then(
|
| 717 |
input_cleanup,
|
|
|
|
| 78 |
|
| 79 |
|
| 80 |
# default_persist_directory = './db5' # For deployement
|
| 81 |
+
default_persist_directory='./vector-databases-deployed/db5-agllm-data-isu-field-insects-all-species'
|
|
|
|
| 82 |
|
| 83 |
+
species_list=get_species_list_from_db(default_persist_directory)
|
|
|
|
| 84 |
# default_persist_directory = 'vector-databases/db5-pre-completion' # For Development
|
| 85 |
csv_filepath1 = "./agllm-data/corrected/Corrected_supplemented-insect_data-2500-sorted.xlsx"
|
| 86 |
csv_filepath2 = "./agllm-data/corrected/Corrected_supplemented-insect_data-remaining.xlsx"
|
|
|
|
| 249 |
# )
|
| 250 |
#
|
| 251 |
|
| 252 |
+
def initialize_qa_chain(specie_selector, application_mode, model_name, region, database_persistent_directory=default_persist_directory):
|
| 253 |
# Add helper function for India info (kept for potential future use, but removed from RAG prompt)
|
| 254 |
def read_and_format_filtered_csv_better(dataframe_given, insect_specie):
|
| 255 |
filtered_data = dataframe_given[dataframe_given['species'] == insect_specie]
|
|
|
|
| 323 |
availability_message = f"Information for **{specie_selector}** is available in region(s): **{', '.join(available_regions_list)}**."
|
| 324 |
else:
|
| 325 |
available_regions_list = []
|
| 326 |
+
availability_message = f"No regional information found for **{specie_selector}** in the database."
|
| 327 |
except Exception as e:
|
| 328 |
print(f"Error checking region availability: {e}")
|
| 329 |
available_regions_list = []
|
|
|
|
| 399 |
|
| 400 |
# Updated prompt template for multi-part response with region-specific contexts
|
| 401 |
general_system_template = f"""
|
| 402 |
+
You are an AI assistant specialized in providing information about agricultural pests ({specie_selector}). The user is primarily interested in the '{region}' region.
|
| 403 |
|
| 404 |
The following context has been retrieved from a database organized by region:
|
| 405 |
|
|
|
|
| 452 |
4. Apply this language constraint: {language_constraint}
|
| 453 |
5. Keep your summaries concise and directly related to the user's question.
|
| 454 |
|
| 455 |
+
User Question about {specie_selector}: {{question}}
|
| 456 |
"""
|
| 457 |
|
| 458 |
class RegionFormattingLLMChain:
|
|
|
|
| 528 |
with gr.Row():
|
| 529 |
with gr.Column(scale=1):
|
| 530 |
with gr.Row():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 531 |
region_selector = gr.Dropdown(
|
| 532 |
list(["United States", "India", "Africa"]), # Updated regions
|
| 533 |
value="United States", # Updated default
|
|
|
|
| 540 |
|
| 541 |
# Model selection
|
| 542 |
specie_selector = gr.Dropdown(
|
| 543 |
+
list(set(species_list)),
|
| 544 |
+
value=species_list[0],
|
| 545 |
label="Species",
|
| 546 |
info="Select the Species",
|
| 547 |
interactive=True,
|
|
|
|
| 567 |
scale=1,
|
| 568 |
visible=True
|
| 569 |
)
|
| 570 |
+
region_availability_display = gr.Markdown(value="Select species to see region availability.") # Added display area
|
| 571 |
|
| 572 |
with gr.Column(scale=2):
|
| 573 |
# User input prompt text field
|
|
|
|
| 588 |
return history + [["Invalid prompts - user prompt cannot be empty", None]]
|
| 589 |
|
| 590 |
# Chatbot logic for configuration, sending the prompts, rendering the streamed back generations, etc.
|
| 591 |
+
def bot(model_name, application_mode, user_prompt_message, history, messages_history, qa_chain, region): # Removed use_rag
|
| 592 |
if qa_chain == None:
|
| 593 |
# Initial QA chain setup if not already done (uses default species for the selected domain)
|
| 594 |
+
initial_species = species_list[0]
|
| 595 |
# Need to handle the tuple returned by init_qa_chain now
|
| 596 |
# Use the currently selected region for initialization if qa_chain is None
|
| 597 |
+
qa_chain, _ = init_qa_chain(initial_species, application_mode, model_name, region) # Pass region
|
| 598 |
|
| 599 |
history[-1][1] = "" # Placeholder for the answer
|
| 600 |
|
|
|
|
| 629 |
def input_cleanup():
|
| 630 |
return ""
|
| 631 |
|
| 632 |
+
def init_qa_chain(specie_selector, application_mode, model_name, region): # Removed use_rag
|
| 633 |
+
print(f"--- init_qa_chain wrapper called ---") # DIAGNOSTIC PRINT
|
| 634 |
qa_chain_instance = None
|
| 635 |
availability_msg = "Error initializing QA chain."
|
| 636 |
try:
|
| 637 |
+
qa_chain_instance, availability_msg = initialize_qa_chain(specie_selector, application_mode, model_name, region) # Removed use_rag
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 638 |
except Exception as e:
|
| 639 |
print(f"Error in init_qa_chain wrapper: {e}")
|
| 640 |
availability_msg = f"Error initializing: {e}"
|
|
|
|
| 642 |
return qa_chain_instance, availability_msg # Return both chain and message
|
| 643 |
|
| 644 |
# Update QA chain AND availability message when relevant inputs change
|
| 645 |
+
inputs_for_qa_chain = [specie_selector, application_mode, model_name, region_selector] # CORRECT ORDER
|
| 646 |
outputs_for_qa_chain = [qa_chain_state, region_availability_display]
|
| 647 |
|
| 648 |
# specie_selector.change(init_qa_chain, inputs=inputs_for_qa_chain, outputs=outputs_for_qa_chain)
|
|
|
|
| 654 |
model_name.change(init_qa_chain, inputs=inputs_for_qa_chain, outputs=outputs_for_qa_chain)
|
| 655 |
region_selector.change(init_qa_chain, inputs=inputs_for_qa_chain, outputs=outputs_for_qa_chain)
|
| 656 |
application_mode.change(init_qa_chain, inputs=inputs_for_qa_chain, outputs=outputs_for_qa_chain)
|
|
|
|
| 657 |
|
| 658 |
#####
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 659 |
|
| 660 |
# When the user clicks Enter and the user message is submitted
|
| 661 |
user_prompt_message.submit(
|
|
|
|
| 665 |
queue=False
|
| 666 |
).then(
|
| 667 |
bot,
|
| 668 |
+
[model_name, application_mode, user_prompt_message, chatbot, state, qa_chain_state, region_selector], # Removed use_rag
|
| 669 |
[chatbot, state]
|
| 670 |
).then(input_cleanup,
|
| 671 |
[],
|
|
|
|
| 681 |
queue=False
|
| 682 |
).then(
|
| 683 |
bot,
|
| 684 |
+
[model_name, application_mode, user_prompt_message, chatbot, state, qa_chain_state, region_selector], # Removed use_rag
|
| 685 |
[chatbot, state]
|
| 686 |
).then(
|
| 687 |
input_cleanup,
|
app_database_prep.py
CHANGED
|
@@ -101,14 +101,24 @@ def process_excel_sheet(
|
|
| 101 |
|
| 102 |
# --- Main Script Logic ---
|
| 103 |
|
| 104 |
-
#
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
|
| 113 |
## Load Excel File Path (Define once)
|
| 114 |
excel_file_path = "agllm-data/PestID Species.xlsx"
|
|
|
|
| 101 |
|
| 102 |
# --- Main Script Logic ---
|
| 103 |
|
| 104 |
+
# --- INSECTS DATA PROCESSING ---
|
| 105 |
+
insects_data_domain_identifier = "agllm-data-isu-field-insects-all-species"
|
| 106 |
+
persist_directory = f'vector-databases-deployed/db5-{insects_data_domain_identifier}'
|
| 107 |
+
insects_loader = DirectoryLoader(f'agllm-data/{insects_data_domain_identifier}', glob='**/*.pdf', loader_cls=PyMuPDFLoader)
|
| 108 |
+
chunk_size_input = 512
|
| 109 |
+
insects_metadata_raw = pd.read_csv(f"./agllm-data/{insects_data_domain_identifier}/matched_species_results_v2.csv")
|
| 110 |
+
insects_documents = insects_loader.load()
|
| 111 |
+
|
| 112 |
+
# --- WEEDS DATA PROCESSING ---
|
| 113 |
+
weeds_data_domain_identifier = "agllm-data-isu-field-weeds-all-species"
|
| 114 |
+
weeds_loader = DirectoryLoader(f'agllm-data/{weeds_data_domain_identifier}', glob='**/*.pdf', loader_cls=PyMuPDFLoader)
|
| 115 |
+
weeds_metadata_raw = pd.read_csv(f"./agllm-data/{weeds_data_domain_identifier}/matched_species_results_v2.csv")
|
| 116 |
+
weeds_documents = weeds_loader.load()
|
| 117 |
+
|
| 118 |
+
# Combine documents from both sources before processing
|
| 119 |
+
documents = insects_documents + weeds_documents
|
| 120 |
+
metadata_raw = pd.concat([insects_metadata_raw, weeds_metadata_raw], ignore_index=True)
|
| 121 |
+
|
| 122 |
|
| 123 |
## Load Excel File Path (Define once)
|
| 124 |
excel_file_path = "agllm-data/PestID Species.xlsx"
|
vector-databases-deployed/db5-agllm-data-isu-field-insects-all-species/chroma.sqlite3
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d7b0db4eaef0c4bc975ba1ce231b7a2bcb3134b13139fe7586a58d5dcc1f97e9
|
| 3 |
+
size 9072640
|
vector-databases-deployed/db5-agllm-data-isu-field-insects-all-species/{34b62619-98c9-4834-894c-fbbb81d34690 → e82d58e5-16f1-41a6-9289-211464329861}/data_level0.bin
RENAMED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bb76ae0f6ca830a9a048b0cf53962a78c88c5c7fcda63fc846077d3456eb3890
|
| 3 |
+
size 62840000
|
vector-databases-deployed/db5-agllm-data-isu-field-insects-all-species/{34b62619-98c9-4834-894c-fbbb81d34690 → e82d58e5-16f1-41a6-9289-211464329861}/header.bin
RENAMED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 100
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ec666c9828420c69fc6b597461d8c18487becec1527c7d1cff9b898cbb393c2d
|
| 3 |
size 100
|
vector-databases-deployed/db5-agllm-data-isu-field-insects-all-species/{34b62619-98c9-4834-894c-fbbb81d34690 → e82d58e5-16f1-41a6-9289-211464329861}/length.bin
RENAMED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e7e2dcff542de95352682dc186432e98f0188084896773f1973276b0577d5305
|
| 3 |
+
size 40000
|
vector-databases-deployed/db5-agllm-data-isu-field-insects-all-species/{34b62619-98c9-4834-894c-fbbb81d34690 → e82d58e5-16f1-41a6-9289-211464329861}/link_lists.bin
RENAMED
|
File without changes
|
vector-databases-deployed/db5-agllm-data-isu-field-weeds-all-species/chroma.sqlite3
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9bdbd2acc8e88eaf4e6b6bc8a8687b36fe5c624b986515bad74e37010f3924ba
|
| 3 |
+
size 167936
|