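"""
Build a persisted Chroma vector database for pest-management retrieval.

Sources:
  * Field insect and weed PDFs (the agllm-data-isu-field-*-all-species directories),
    loaded with PyMuPDFLoader and tagged with matched-species metadata from
    matched_species_results_v2.csv plus region "United States".
  * "PestID Species.xlsx" sheets for India and Africa, where each row's "IPM Info"
    text becomes a Document tagged with common name, species, and region.

All documents are chunked with RecursiveCharacterTextSplitter, embedded with
OpenAIEmbeddings, and stored in a Chroma database on disk.
"""
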
import os
import shutil
from langchain.document_loaders import PyPDFDirectoryLoader
import pandas as pd
import langchain
from queue import Queue
from typing import Any, List
from langchain.llms.huggingface_text_gen_inference import HuggingFaceTextGenInference
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.schema import LLMResult
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.prompts.prompt import PromptTemplate
from anyio.from_thread import start_blocking_portal  # For model callback streaming

langchain.debug = True  # TODO: DOUBLE CHECK
system_message = {"role": "system", "content": "You are a helpful assistant."}  # TODO: double check how this plays out later.

from dotenv import load_dotenv
import streamlit as st
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.chains.question_answering import load_qa_chain
from langchain.chat_models import ChatOpenAI
from langchain.vectorstores import Chroma
import chromadb

## added information in metadata:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.document_loaders import DirectoryLoader
from langchain_community.document_loaders import PyMuPDFLoader
from langchain.schema import Document

# Function to process a sheet from the Excel file
def process_excel_sheet(
    excel_path: str,
    sheet_name: str,
    region: str,
    splitter: RecursiveCharacterTextSplitter
) -> List[Document]:
    """Loads data from an Excel sheet, creates Documents, splits them, and adds metadata."""
    print(f"--- Processing Excel Sheet: {sheet_name} (Region: {region}) ---")
    try:
        df = pd.read_excel(excel_path, sheet_name=sheet_name)
        print(f"Excel Data Head ({sheet_name}):\n", df.head())
    except Exception as e:
        print(f"Error loading sheet '{sheet_name}' from {excel_path}: {e}")
        return []

    initial_documents = []
    for index, row in df.iterrows():
        ipm_info = str(row['IPM Info']) if pd.notna(row['IPM Info']) else ""
        # Check if essential columns exist and are not empty (removed accuracy check)
        if pd.isna(row['Common Name']) or pd.isna(row['Species']):
            print(f"Skipping row {index+2} in sheet '{sheet_name}' due to missing essential data (Common Name or Species).")
            continue
        doc = Document(
            page_content=ipm_info,
            metadata={
                "source": f"{excel_path}#sheet={sheet_name}#row={index+2}",
                "common_name": row['Common Name'],
                "species": row['Species'],
                "matched_specie_0": row['Species'],
                "region": region
            }
        )
        initial_documents.append(doc)

    if initial_documents:
        print(f"First Document from {sheet_name} (before splitting):\n", initial_documents[0])
    else:
        print(f"No documents created from sheet: {sheet_name}")
        return []  # Return empty list if no documents were created

    split_documents = []
    for doc in initial_documents:
        splits = splitter.split_documents([doc])
        for i, split_doc in enumerate(splits, start=1):
            metadata = split_doc.metadata.copy()
            metadata["source"] = f"{metadata['source']}#chunk{i}"
            split_doc.metadata = metadata
            split_documents.append(split_doc)

    if split_documents:
        print(f"First Document chunk from {sheet_name}:\n", split_documents[0])
    print(f"Finished processing sheet: {sheet_name}. Found {len(split_documents)} chunks.")
    print("---------------------------------------------------")
    return split_documents

# --- Main Script Logic ---

# --- INSECTS DATA PROCESSING ---
insects_data_domain_identifier = "agllm-data-isu-field-insects-all-species"
persist_directory = f'vector-databases-deployed/db5-{insects_data_domain_identifier}'
insects_loader = DirectoryLoader(f'agllm-data/{insects_data_domain_identifier}', glob='**/*.pdf', loader_cls=PyMuPDFLoader)
chunk_size_input = 512
insects_metadata_raw = pd.read_csv(f"./agllm-data/{insects_data_domain_identifier}/matched_species_results_v2.csv")
insects_documents = insects_loader.load()

# --- WEEDS DATA PROCESSING ---
weeds_data_domain_identifier = "agllm-data-isu-field-weeds-all-species"
weeds_loader = DirectoryLoader(f'agllm-data/{weeds_data_domain_identifier}', glob='**/*.pdf', loader_cls=PyMuPDFLoader)
weeds_metadata_raw = pd.read_csv(f"./agllm-data/{weeds_data_domain_identifier}/matched_species_results_v2.csv")
weeds_documents = weeds_loader.load()

# Combine documents from both sources before processing
documents = insects_documents + weeds_documents
metadata_raw = pd.concat([insects_metadata_raw, weeds_metadata_raw], ignore_index=True)
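# metadata_raw maps each PDF file name ("File Name" column) to one or more matched
# species ("Species" column); it is used below to attach matched_specie_N metadata
# to every PDF-derived document.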

## Load Excel File Path (Define once)
excel_file_path = "agllm-data/PestID Species.xlsx"

## Process PDF documents and add metadata
print("--- Processing PDF Documents ---")
pdf_documents_for_splitting = []  # Prepare list to hold docs with added metadata
for doc in documents:
    # Add region for PDF docs
    doc.metadata["region"] = "United States"
    # Add species metadata: look up the species matched to this PDF's file name
    file_name_associated_with_this_doc = doc.metadata["source"].split('/')[-1]
    matching_species_for_this_file_name = metadata_raw[metadata_raw["File Name"].str.lower() == file_name_associated_with_this_doc.lower()]["Species"]
    if not matching_species_for_this_file_name.empty:
        # Store each matched species as matched_specie_0, matched_specie_1, ...
        for specie_index, specie_name in enumerate(matching_species_for_this_file_name):
            doc.metadata[f"matched_specie_{specie_index}"] = specie_name
    else:
        print(f"Warning: No matching species found in CSV for PDF: {file_name_associated_with_this_doc}")
    pdf_documents_for_splitting.append(doc)  # Add modified doc to the new list

# Initialize Text Splitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size_input, chunk_overlap=10)

# Split PDF documents
pdf_splitted_documents = []
for doc in pdf_documents_for_splitting:  # Use the list with added metadata
    splits = text_splitter.split_documents([doc])
    for i, split_doc in enumerate(splits, start=1):
        metadata = split_doc.metadata.copy()
        # Update source for PDF chunks
        source_base = metadata.get('source', 'unknown_source')
        page_num = metadata.get('page', 'unknown_page')
        metadata["source"] = f"{source_base}#page{page_num}#chunk{i}"
        # Remove the raw page number if desired, as it's now in the source string
        # metadata.pop('page', None)
        split_doc.metadata = metadata
        pdf_splitted_documents.append(split_doc)

print("First PDF Document chunk:\n", pdf_splitted_documents[0] if pdf_splitted_documents else "No PDF documents processed")
print(f"Count after PDF processing: {len(pdf_splitted_documents)}")
print("---------------------------------------------------")
# Process Excel Sheets using the function
india_splitted_documents = process_excel_sheet(
    excel_path=excel_file_path,
    sheet_name="India",
    region="India",
    splitter=text_splitter
)
africa_splitted_documents = process_excel_sheet(
    excel_path=excel_file_path,
    sheet_name="Africa",
    region="Africa",
    splitter=text_splitter
)

# Combine lists from all sources
splitted_documents = pdf_splitted_documents + india_splitted_documents + africa_splitted_documents
# print(splitted_documents[0])  # Original print statement - commented out as we print chunks above

print("=== Combined Processing Done ===")
print(f"Total documents after combining PDF, India, and Africa sources: {len(splitted_documents)}")
print("=============================")

# ONLY FOR THE FIRST TIME
# Check if the persist directory exists and delete it to ensure a fresh start
if os.path.exists(persist_directory):
    print(f"Deleting existing vector database directory: {persist_directory}")
    shutil.rmtree(persist_directory)
    print("Directory deleted.")
else:
    print(f"Vector database directory not found, creating a new one: {persist_directory}")

embedding = OpenAIEmbeddings()
vectordb = Chroma.from_documents(documents=splitted_documents,
                                 embedding=embedding,
                                 persist_directory=persist_directory)

# Persist the db to disk
vectordb.persist()
vectordb = None

# Now we can load the persisted database from disk and use it as normal.
vectordb = Chroma(persist_directory=persist_directory,
                  embedding_function=embedding)
print(vectordb.get())

# Just a test script:
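# Chroma metadata filters accept an "$or" clause, so the retriever below returns
# only chunks whose matched_specie_0, matched_specie_1, or matched_specie_2
# metadata equals the selected species.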
specie_selector = "Aphis spiraecola"
species_filter = {
    "$or": [
        {"matched_specie_0": specie_selector},
        {"matched_specie_1": specie_selector},
        {"matched_specie_2": specie_selector},
    ]
}
answer = vectordb.as_retriever(search_kwargs={'k': 10, 'filter': species_filter}).get_relevant_documents(
    "anything else.?")
print(answer)