# app_database_prep.py
import os
import shutil
from typing import List

import pandas as pd
from dotenv import load_dotenv

import langchain
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain_community.document_loaders import PyMuPDFLoader

load_dotenv()  # load OPENAI_API_KEY (needed by OpenAIEmbeddings) from a local .env

langchain.debug = True  # TODO: DOUBLE CHECK -- verbose LangChain logging during the build

# TODO: double check how this plays out later; currently unused in this prep script.
system_message = {"role": "system", "content": "You are a helpful assistant."}
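# Pipeline overview:
#   1. Load insect and weed PDFs and attach matched-species metadata from the CSVs.
#   2. Load the India and Africa sheets from the PestID Excel workbook.
#   3. Split everything into chunks and embed them into a persisted Chroma database.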
# Process one sheet of the Excel file into split, metadata-tagged Documents.
def process_excel_sheet(
    excel_path: str,
    sheet_name: str,
    region: str,
    splitter: RecursiveCharacterTextSplitter
) -> List[Document]:
    """Load an Excel sheet, create a Document per row, split them, and add metadata.

    Each sheet is expected to provide "Common Name", "Species", and "IPM Info" columns.
    """
    print(f"--- Processing Excel Sheet: {sheet_name} (Region: {region}) ---")
    try:
        df = pd.read_excel(excel_path, sheet_name=sheet_name)
        print(f"Excel Data Head ({sheet_name}):\n", df.head())
    except Exception as e:
        print(f"Error loading sheet '{sheet_name}' from {excel_path}: {e}")
        return []

    initial_documents = []
    for index, row in df.iterrows():
        ipm_info = str(row['IPM Info']) if pd.notna(row['IPM Info']) else ""
        # Skip rows missing the essential columns (the accuracy check was removed).
        if pd.isna(row['Common Name']) or pd.isna(row['Species']):
            print(f"Skipping row {index+2} in sheet '{sheet_name}' due to missing essential data (Common Name or Species).")
            continue
        doc = Document(
            page_content=ipm_info,
            metadata={
                # index+2 converts the 0-based DataFrame index to the 1-based
                # spreadsheet row, accounting for the header row.
                "source": f"{excel_path}#sheet={sheet_name}#row={index+2}",
                "common_name": row['Common Name'],
                "species": row['Species'],
                # Duplicated under matched_specie_0 so Excel rows answer the same
                # species filter used for the PDF documents.
                "matched_specie_0": row['Species'],
                "region": region
            }
        )
        initial_documents.append(doc)

    if initial_documents:
        print(f"First Document from {sheet_name} (before splitting):\n", initial_documents[0])
    else:
        print(f"No documents created from sheet: {sheet_name}")
        return []

    split_documents = []
    for doc in initial_documents:
        splits = splitter.split_documents([doc])
        for i, split_doc in enumerate(splits, start=1):
            metadata = split_doc.metadata.copy()
            metadata["source"] = f"{metadata['source']}#chunk{i}"
            split_doc.metadata = metadata
            split_documents.append(split_doc)

    if split_documents:
        print(f"First Document chunk from {sheet_name}:\n", split_documents[0])
    print(f"Finished processing sheet: {sheet_name}. Found {len(split_documents)} chunks.")
    print("---------------------------------------------------")
    return split_documents
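# Chunk sources produced by process_excel_sheet follow the pattern
# "<excel_path>#sheet=<name>#row=<n>#chunk<i>", e.g.
# "agllm-data/PestID Species.xlsx#sheet=India#row=2#chunk1".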
# --- Main Script Logic ---
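# Layout note: each domain folder under agllm-data/ is expected to hold the
# species PDFs plus a matched_species_results_v2.csv mapping file names to species.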
# --- INSECTS DATA PROCESSING ---
insects_data_domain_identifier = "agllm-data-isu-field-insects-all-species"
persist_directory = f'vector-databases-deployed/db5-{insects_data_domain_identifier}'
insects_loader = DirectoryLoader(f'agllm-data/{insects_data_domain_identifier}', glob='**/*.pdf', loader_cls=PyMuPDFLoader)
chunk_size_input = 512
insects_metadata_raw = pd.read_csv(f"./agllm-data/{insects_data_domain_identifier}/matched_species_results_v2.csv")
insects_documents = insects_loader.load()
# --- WEEDS DATA PROCESSING ---
weeds_data_domain_identifier = "agllm-data-isu-field-weeds-all-species"
weeds_loader = DirectoryLoader(f'agllm-data/{weeds_data_domain_identifier}', glob='**/*.pdf', loader_cls=PyMuPDFLoader)
weeds_metadata_raw = pd.read_csv(f"./agllm-data/{weeds_data_domain_identifier}/matched_species_results_v2.csv")
weeds_documents = weeds_loader.load()
# Combine documents from both sources before processing
documents = insects_documents + weeds_documents
metadata_raw = pd.concat([insects_metadata_raw, weeds_metadata_raw], ignore_index=True)
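# The concatenated CSV metadata must provide "File Name" and "Species" columns;
# the PDF loop below matches on the lowercased file name.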
## Excel file path (defined once; used for both the India and Africa sheets)
excel_file_path = "agllm-data/PestID Species.xlsx"
## Process PDF documents and add metadata
print("--- Processing PDF Documents ---")
pdf_documents_for_splitting = []  # PDF docs with region/species metadata attached
for doc in documents:
    # All PDF documents in this corpus are tagged with the United States region.
    doc.metadata["region"] = "United States"
    # Attach every species the CSV matches to this PDF's file name.
    file_name_associated_with_this_doc = doc.metadata["source"].split('/')[-1]
    matching_species_for_this_file_name = metadata_raw[
        metadata_raw["File Name"].str.lower() == file_name_associated_with_this_doc.lower()
    ]["Species"]
    if not matching_species_for_this_file_name.empty:
        # One metadata key per match: matched_specie_0, matched_specie_1, ...
        for specie_index, specie_name in enumerate(matching_species_for_this_file_name):
            doc.metadata["matched_specie_" + str(specie_index)] = specie_name
    else:
        print(f"Warning: No matching species found in CSV for PDF: {file_name_associated_with_this_doc}")
    pdf_documents_for_splitting.append(doc)
# Initialize the text splitter (shared by the PDF and Excel pipelines).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size_input, chunk_overlap=10)

# Split PDF documents, encoding page and chunk position into each source string.
pdf_splitted_documents = []
for doc in pdf_documents_for_splitting:
    splits = text_splitter.split_documents([doc])
    for i, split_doc in enumerate(splits, start=1):
        metadata = split_doc.metadata.copy()
        source_base = metadata.get('source', 'unknown_source')
        page_num = metadata.get('page', 'unknown_page')
        metadata["source"] = f"{source_base}#page{page_num}#chunk{i}"
        # The raw page number could be dropped, since it is now in the source string:
        # metadata.pop('page', None)
        split_doc.metadata = metadata
        pdf_splitted_documents.append(split_doc)

print("First PDF Document chunk:\n", pdf_splitted_documents[0] if pdf_splitted_documents else "No PDF documents processed")
print(f"Count after PDF processing: {len(pdf_splitted_documents)}")
print("---------------------------------------------------")
# Process Excel Sheets using the function
india_splitted_documents = process_excel_sheet(
    excel_path=excel_file_path,
    sheet_name="India",
    region="India",
    splitter=text_splitter
)
africa_splitted_documents = process_excel_sheet(
    excel_path=excel_file_path,
    sheet_name="Africa",
    region="Africa",
    splitter=text_splitter
)
# Combine chunk lists from all sources
splitted_documents = pdf_splitted_documents + india_splitted_documents + africa_splitted_documents
print("=== Combined Processing Done ===")
print(f"Total documents after combining PDF, India, and Africa sources: {len(splitted_documents)}")
print("=============================")
# ONLY FOR THE FIRST TIME:
# delete any existing vector database directory to guarantee a fresh build.
if os.path.exists(persist_directory):
    print(f"Deleting existing vector database directory: {persist_directory}")
    shutil.rmtree(persist_directory)
    print("Directory deleted.")
else:
    print(f"Vector database directory not found, creating a new one: {persist_directory}")

embedding = OpenAIEmbeddings()
vectordb = Chroma.from_documents(documents=splitted_documents,
                                 embedding=embedding,
                                 persist_directory=persist_directory)
# Persist the db to disk
vectordb.persist()
vectordb = None

# Now we can load the persisted database from disk and use it as normal.
vectordb = Chroma(persist_directory=persist_directory,
                  embedding_function=embedding)
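# Sanity check: Chroma's get() dumps the stored collection (ids, documents,
# and metadatas) so the inserted chunks can be eyeballed.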
print(vectordb.get())
# Quick retrieval test: fetch chunks for one species via the matched_specie_* keys.
specie_selector = "Aphis spiraecola"
species_filter = {
    "$or": [
        {"matched_specie_0": specie_selector},
        {"matched_specie_1": specie_selector},
        {"matched_specie_2": specie_selector},
    ]
}
answer = vectordb.as_retriever(search_kwargs={'k': 10, 'filter': species_filter}).get_relevant_documents(
    "anything else.?")
print(answer)
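# Illustrative sketch (not executed): the "region" metadata attached above can be
# filtered the same way as species. "India" is one of the region values set in
# this script; the query string here is just a placeholder.
# region_docs = vectordb.as_retriever(
#     search_kwargs={'k': 5, 'filter': {'region': 'India'}}
# ).get_relevant_documents("integrated pest management")
# print(region_docs)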