arbabarshad committed on
Commit
63c9deb
·
1 Parent(s): 9a43597

Included weeds in the insects identifier as well, and removed the domain dropdown

Browse files
app.py CHANGED
@@ -78,11 +78,9 @@ def get_species_list_from_db(db_name):
78
 
79
 
80
  # default_persist_directory = './db5' # For deployement
81
- default_persist_directory_insects='./vector-databases-deployed/db5-agllm-data-isu-field-insects-all-species'
82
- default_persist_directory_weeds='./vector-databases-deployed/db5-agllm-data-isu-field-weeds-all-species'
83
 
84
- species_list_insects=get_species_list_from_db(default_persist_directory_insects)
85
- species_list_weeds=get_species_list_from_db(default_persist_directory_weeds)
86
  # default_persist_directory = 'vector-databases/db5-pre-completion' # For Development
87
  csv_filepath1 = "./agllm-data/corrected/Corrected_supplemented-insect_data-2500-sorted.xlsx"
88
  csv_filepath2 = "./agllm-data/corrected/Corrected_supplemented-insect_data-remaining.xlsx"
@@ -251,7 +249,7 @@ filter = {
251
  # )
252
  #
253
 
254
- def initialize_qa_chain(specie_selector, application_mode, model_name, region, database_persistent_directory=default_persist_directory_insects, domain_name="Insects"):
255
  # Add helper function for India info (kept for potential future use, but removed from RAG prompt)
256
  def read_and_format_filtered_csv_better(dataframe_given, insect_specie):
257
  filtered_data = dataframe_given[dataframe_given['species'] == insect_specie]
@@ -325,7 +323,7 @@ def initialize_qa_chain(specie_selector, application_mode, model_name, region, d
325
  availability_message = f"Information for **{specie_selector}** is available in region(s): **{', '.join(available_regions_list)}**."
326
  else:
327
  available_regions_list = []
328
- availability_message = f"No regional information found for **{specie_selector}** in the {domain_name} database."
329
  except Exception as e:
330
  print(f"Error checking region availability: {e}")
331
  available_regions_list = []
@@ -401,7 +399,7 @@ def initialize_qa_chain(specie_selector, application_mode, model_name, region, d
401
 
402
  # Updated prompt template for multi-part response with region-specific contexts
403
  general_system_template = f"""
404
- You are an AI assistant specialized in providing information about {domain_name.lower()} ({specie_selector}). The user is primarily interested in the '{region}' region.
405
 
406
  The following context has been retrieved from a database organized by region:
407
 
@@ -454,7 +452,7 @@ IMPORTANT:
454
  4. Apply this language constraint: {language_constraint}
455
  5. Keep your summaries concise and directly related to the user's question.
456
 
457
- User Question about {specie_selector} ({domain_name}): {{question}}
458
  """
459
 
460
  class RegionFormattingLLMChain:
@@ -530,15 +528,6 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
530
  with gr.Row():
531
  with gr.Column(scale=1):
532
  with gr.Row():
533
- domain_name = gr.Dropdown(
534
- list(["Insects", "Weeds"]),
535
- value="Insects",
536
- label="Domain",
537
- info="Select Domain",
538
- interactive=True,
539
- scale=1,
540
- visible=True
541
- )
542
  region_selector = gr.Dropdown(
543
  list(["United States", "India", "Africa"]), # Updated regions
544
  value="United States", # Updated default
@@ -551,8 +540,8 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
551
 
552
  # Model selection
553
  specie_selector = gr.Dropdown(
554
- list(set(species_list_insects)),
555
- value=species_list_insects[0],
556
  label="Species",
557
  info="Select the Species",
558
  interactive=True,
@@ -578,7 +567,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
578
  scale=1,
579
  visible=True
580
  )
581
- region_availability_display = gr.Markdown(value="Select species/domain to see region availability.") # Added display area
582
 
583
  with gr.Column(scale=2):
584
  # User input prompt text field
@@ -599,13 +588,13 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
599
  return history + [["Invalid prompts - user prompt cannot be empty", None]]
600
 
601
  # Chatbot logic for configuration, sending the prompts, rendering the streamed back generations, etc.
602
- def bot(model_name, application_mode, user_prompt_message, history, messages_history, qa_chain, domain_name, region): # Removed use_rag
603
  if qa_chain == None:
604
  # Initial QA chain setup if not already done (uses default species for the selected domain)
605
- initial_species = species_list_insects[0] if domain_name == "Insects" else species_list_weeds[0]
606
  # Need to handle the tuple returned by init_qa_chain now
607
  # Use the currently selected region for initialization if qa_chain is None
608
- qa_chain, _ = init_qa_chain(initial_species, application_mode, model_name, domain_name, region) # Pass region
609
 
610
  history[-1][1] = "" # Placeholder for the answer
611
 
@@ -640,19 +629,12 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
640
  def input_cleanup():
641
  return ""
642
 
643
- def init_qa_chain(specie_selector, application_mode, model_name, domain_name, region): # Removed use_rag
644
- print(f"--- init_qa_chain wrapper called with domain: '{domain_name}' ---") # DIAGNOSTIC PRINT
645
  qa_chain_instance = None
646
  availability_msg = "Error initializing QA chain."
647
  try:
648
- if domain_name=="Insects":
649
- qa_chain_instance, availability_msg = initialize_qa_chain(specie_selector, application_mode, model_name, region, default_persist_directory_insects, domain_name) # Removed use_rag
650
- elif domain_name=="Weeds":
651
- qa_chain_instance, availability_msg = initialize_qa_chain(specie_selector, application_mode, model_name, region, default_persist_directory_weeds, domain_name) # Removed use_rag
652
- else:
653
- print("No Appropriate Chain Selected")
654
- availability_msg = "Invalid domain selected."
655
- # Return None for chain and the message
656
  except Exception as e:
657
  print(f"Error in init_qa_chain wrapper: {e}")
658
  availability_msg = f"Error initializing: {e}"
@@ -660,7 +642,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
660
  return qa_chain_instance, availability_msg # Return both chain and message
661
 
662
  # Update QA chain AND availability message when relevant inputs change
663
- inputs_for_qa_chain = [specie_selector, application_mode, model_name, domain_name, region_selector] # CORRECT ORDER
664
  outputs_for_qa_chain = [qa_chain_state, region_availability_display]
665
 
666
  # specie_selector.change(init_qa_chain, inputs=inputs_for_qa_chain, outputs=outputs_for_qa_chain)
@@ -672,20 +654,8 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
672
  model_name.change(init_qa_chain, inputs=inputs_for_qa_chain, outputs=outputs_for_qa_chain)
673
  region_selector.change(init_qa_chain, inputs=inputs_for_qa_chain, outputs=outputs_for_qa_chain)
674
  application_mode.change(init_qa_chain, inputs=inputs_for_qa_chain, outputs=outputs_for_qa_chain)
675
- domain_name.change(init_qa_chain, inputs=inputs_for_qa_chain, outputs=outputs_for_qa_chain)
676
 
677
  #####
678
- def update_species_list(domain):
679
- if domain == "Insects":
680
- return gr.Dropdown( species_list_insects, value=species_list_insects[0], label="Species", info="Select the Species", interactive=True, scale=1, visible=True )
681
- elif domain == "Weeds":
682
- return gr.Dropdown( species_list_weeds, value=species_list_weeds[0], label="Species", info="Select the Species", interactive=True, scale=1, visible=True )
683
-
684
- domain_name.change(
685
- update_species_list,
686
- inputs=[domain_name],
687
- outputs=[specie_selector]
688
- )
689
 
690
  # When the user clicks Enter and the user message is submitted
691
  user_prompt_message.submit(
@@ -695,7 +665,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
695
  queue=False
696
  ).then(
697
  bot,
698
- [model_name, application_mode, user_prompt_message, chatbot, state, qa_chain_state, domain_name, region_selector], # Removed use_rag
699
  [chatbot, state]
700
  ).then(input_cleanup,
701
  [],
@@ -711,7 +681,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
711
  queue=False
712
  ).then(
713
  bot,
714
- [model_name, application_mode, user_prompt_message, chatbot, state, qa_chain_state, domain_name, region_selector], # Removed use_rag
715
  [chatbot, state]
716
  ).then(
717
  input_cleanup,
 
78
 
79
 
80
  # default_persist_directory = './db5' # For deployement
81
+ default_persist_directory='./vector-databases-deployed/db5-agllm-data-isu-field-insects-all-species'
 
82
 
83
+ species_list=get_species_list_from_db(default_persist_directory)
 
84
  # default_persist_directory = 'vector-databases/db5-pre-completion' # For Development
85
  csv_filepath1 = "./agllm-data/corrected/Corrected_supplemented-insect_data-2500-sorted.xlsx"
86
  csv_filepath2 = "./agllm-data/corrected/Corrected_supplemented-insect_data-remaining.xlsx"
 
249
  # )
250
  #
251
 
252
+ def initialize_qa_chain(specie_selector, application_mode, model_name, region, database_persistent_directory=default_persist_directory):
253
  # Add helper function for India info (kept for potential future use, but removed from RAG prompt)
254
  def read_and_format_filtered_csv_better(dataframe_given, insect_specie):
255
  filtered_data = dataframe_given[dataframe_given['species'] == insect_specie]
 
323
  availability_message = f"Information for **{specie_selector}** is available in region(s): **{', '.join(available_regions_list)}**."
324
  else:
325
  available_regions_list = []
326
+ availability_message = f"No regional information found for **{specie_selector}** in the database."
327
  except Exception as e:
328
  print(f"Error checking region availability: {e}")
329
  available_regions_list = []
 
399
 
400
  # Updated prompt template for multi-part response with region-specific contexts
401
  general_system_template = f"""
402
+ You are an AI assistant specialized in providing information about agricultural pests ({specie_selector}). The user is primarily interested in the '{region}' region.
403
 
404
  The following context has been retrieved from a database organized by region:
405
 
 
452
  4. Apply this language constraint: {language_constraint}
453
  5. Keep your summaries concise and directly related to the user's question.
454
 
455
+ User Question about {specie_selector}: {{question}}
456
  """
457
 
458
  class RegionFormattingLLMChain:
 
528
  with gr.Row():
529
  with gr.Column(scale=1):
530
  with gr.Row():
 
 
 
 
 
 
 
 
 
531
  region_selector = gr.Dropdown(
532
  list(["United States", "India", "Africa"]), # Updated regions
533
  value="United States", # Updated default
 
540
 
541
  # Model selection
542
  specie_selector = gr.Dropdown(
543
+ list(set(species_list)),
544
+ value=species_list[0],
545
  label="Species",
546
  info="Select the Species",
547
  interactive=True,
 
567
  scale=1,
568
  visible=True
569
  )
570
+ region_availability_display = gr.Markdown(value="Select species to see region availability.") # Added display area
571
 
572
  with gr.Column(scale=2):
573
  # User input prompt text field
 
588
  return history + [["Invalid prompts - user prompt cannot be empty", None]]
589
 
590
  # Chatbot logic for configuration, sending the prompts, rendering the streamed back generations, etc.
591
+ def bot(model_name, application_mode, user_prompt_message, history, messages_history, qa_chain, region): # Removed use_rag
592
  if qa_chain == None:
593
  # Initial QA chain setup if not already done (uses default species for the selected domain)
594
+ initial_species = species_list[0]
595
  # Need to handle the tuple returned by init_qa_chain now
596
  # Use the currently selected region for initialization if qa_chain is None
597
+ qa_chain, _ = init_qa_chain(initial_species, application_mode, model_name, region) # Pass region
598
 
599
  history[-1][1] = "" # Placeholder for the answer
600
 
 
629
  def input_cleanup():
630
  return ""
631
 
632
+ def init_qa_chain(specie_selector, application_mode, model_name, region): # Removed use_rag
633
+ print(f"--- init_qa_chain wrapper called ---") # DIAGNOSTIC PRINT
634
  qa_chain_instance = None
635
  availability_msg = "Error initializing QA chain."
636
  try:
637
+ qa_chain_instance, availability_msg = initialize_qa_chain(specie_selector, application_mode, model_name, region) # Removed use_rag
 
 
 
 
 
 
 
638
  except Exception as e:
639
  print(f"Error in init_qa_chain wrapper: {e}")
640
  availability_msg = f"Error initializing: {e}"
 
642
  return qa_chain_instance, availability_msg # Return both chain and message
643
 
644
  # Update QA chain AND availability message when relevant inputs change
645
+ inputs_for_qa_chain = [specie_selector, application_mode, model_name, region_selector] # CORRECT ORDER
646
  outputs_for_qa_chain = [qa_chain_state, region_availability_display]
647
 
648
  # specie_selector.change(init_qa_chain, inputs=inputs_for_qa_chain, outputs=outputs_for_qa_chain)
 
654
  model_name.change(init_qa_chain, inputs=inputs_for_qa_chain, outputs=outputs_for_qa_chain)
655
  region_selector.change(init_qa_chain, inputs=inputs_for_qa_chain, outputs=outputs_for_qa_chain)
656
  application_mode.change(init_qa_chain, inputs=inputs_for_qa_chain, outputs=outputs_for_qa_chain)
 
657
 
658
  #####
 
 
 
 
 
 
 
 
 
 
 
659
 
660
  # When the user clicks Enter and the user message is submitted
661
  user_prompt_message.submit(
 
665
  queue=False
666
  ).then(
667
  bot,
668
+ [model_name, application_mode, user_prompt_message, chatbot, state, qa_chain_state, region_selector], # Removed use_rag
669
  [chatbot, state]
670
  ).then(input_cleanup,
671
  [],
 
681
  queue=False
682
  ).then(
683
  bot,
684
+ [model_name, application_mode, user_prompt_message, chatbot, state, qa_chain_state, region_selector], # Removed use_rag
685
  [chatbot, state]
686
  ).then(
687
  input_cleanup,
app_database_prep.py CHANGED
@@ -101,14 +101,24 @@ def process_excel_sheet(
101
 
102
  # --- Main Script Logic ---
103
 
104
- # loader = DirectoryLoader('./agllm-data/', glob="./*.pdf", loader_cls=PyMuPDFLoader)
105
- # loader = DirectoryLoader('/u/marshad/data/agllm-data/', glob='**/*.pdf', loader_cls=PyMuPDFLoader)
106
- data_domain_identifier="agllm-data-isu-field-insects-all-species"
107
- persist_directory = f'vector-databases-deployed/db5-{data_domain_identifier}' # was full
108
- loader = DirectoryLoader(f'agllm-data/{data_domain_identifier}', glob='**/*.pdf', loader_cls=PyMuPDFLoader)#,# was full, loader_kwargs={'chunk_size':512})
109
- chunk_size_input=512
110
- metadata_raw = pd.read_csv(f"./agllm-data/{data_domain_identifier}/matched_species_results_v2.csv")
111
- documents = loader.load()
 
 
 
 
 
 
 
 
 
 
112
 
113
  ## Load Excel File Path (Define once)
114
  excel_file_path = "agllm-data/PestID Species.xlsx"
 
101
 
102
  # --- Main Script Logic ---
103
 
104
+ # --- INSECTS DATA PROCESSING ---
105
+ insects_data_domain_identifier = "agllm-data-isu-field-insects-all-species"
106
+ persist_directory = f'vector-databases-deployed/db5-{insects_data_domain_identifier}'
107
+ insects_loader = DirectoryLoader(f'agllm-data/{insects_data_domain_identifier}', glob='**/*.pdf', loader_cls=PyMuPDFLoader)
108
+ chunk_size_input = 512
109
+ insects_metadata_raw = pd.read_csv(f"./agllm-data/{insects_data_domain_identifier}/matched_species_results_v2.csv")
110
+ insects_documents = insects_loader.load()
111
+
112
+ # --- WEEDS DATA PROCESSING ---
113
+ weeds_data_domain_identifier = "agllm-data-isu-field-weeds-all-species"
114
+ weeds_loader = DirectoryLoader(f'agllm-data/{weeds_data_domain_identifier}', glob='**/*.pdf', loader_cls=PyMuPDFLoader)
115
+ weeds_metadata_raw = pd.read_csv(f"./agllm-data/{weeds_data_domain_identifier}/matched_species_results_v2.csv")
116
+ weeds_documents = weeds_loader.load()
117
+
118
+ # Combine documents from both sources before processing
119
+ documents = insects_documents + weeds_documents
120
+ metadata_raw = pd.concat([insects_metadata_raw, weeds_metadata_raw], ignore_index=True)
121
+
122
 
123
  ## Load Excel File Path (Define once)
124
  excel_file_path = "agllm-data/PestID Species.xlsx"
vector-databases-deployed/db5-agllm-data-isu-field-insects-all-species/chroma.sqlite3 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6e4c853859ede8bd23b2d2fa7a6754c568b5b55ee5cb1263176179372d6e3837
3
- size 5844992
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d7b0db4eaef0c4bc975ba1ce231b7a2bcb3134b13139fe7586a58d5dcc1f97e9
3
+ size 9072640
vector-databases-deployed/db5-agllm-data-isu-field-insects-all-species/{34b62619-98c9-4834-894c-fbbb81d34690 → e82d58e5-16f1-41a6-9289-211464329861}/data_level0.bin RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f18abd8c514282db82706e52b0a33ed659cd534e925a6f149deb7af9ce34bd8e
3
- size 6284000
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bb76ae0f6ca830a9a048b0cf53962a78c88c5c7fcda63fc846077d3456eb3890
3
+ size 62840000
vector-databases-deployed/db5-agllm-data-isu-field-insects-all-species/{34b62619-98c9-4834-894c-fbbb81d34690 → e82d58e5-16f1-41a6-9289-211464329861}/header.bin RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:effaa959ce2b30070fdafc2fe82096fc46e4ee7561b75920dd3ce43d09679b21
3
  size 100
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ec666c9828420c69fc6b597461d8c18487becec1527c7d1cff9b898cbb393c2d
3
  size 100
vector-databases-deployed/db5-agllm-data-isu-field-insects-all-species/{34b62619-98c9-4834-894c-fbbb81d34690 → e82d58e5-16f1-41a6-9289-211464329861}/length.bin RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fc19b1997119425765295aeab72d76faa6927d4f83985d328c26f20468d6cc76
3
- size 4000
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e7e2dcff542de95352682dc186432e98f0188084896773f1973276b0577d5305
3
+ size 40000
vector-databases-deployed/db5-agllm-data-isu-field-insects-all-species/{34b62619-98c9-4834-894c-fbbb81d34690 → e82d58e5-16f1-41a6-9289-211464329861}/link_lists.bin RENAMED
File without changes
vector-databases-deployed/db5-agllm-data-isu-field-weeds-all-species/chroma.sqlite3 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d6c522f18861ff080a6073fa9c7fcfd1e08d5e65f1b64c8748b52c851387f0d7
3
- size 155648
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9bdbd2acc8e88eaf4e6b6bc8a8687b36fe5c624b986515bad74e37010f3924ba
3
+ size 167936