Spaces:
Sleeping
Sleeping
Implement genus classification
Browse files- app.py +80 -55
- requirements.txt +1 -0
app.py
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
|
|
|
|
|
| 1 |
import re
|
| 2 |
import PIL.Image
|
| 3 |
import pandas as pd
|
|
@@ -12,14 +14,18 @@ import torch
|
|
| 12 |
from torch import nn
|
| 13 |
from transformers import BertConfig, BertForMaskedLM, PreTrainedTokenizerFast
|
| 14 |
from huggingface_hub import PyTorchModelHubMixin
|
|
|
|
| 15 |
|
| 16 |
-
from config import DEFAULT_INPUTS, MODELS, DATASETS
|
| 17 |
|
| 18 |
# We need this for the eco layers because they are too big
|
| 19 |
PIL.Image.MAX_IMAGE_PIXELS = None
|
| 20 |
|
| 21 |
torch.set_grad_enabled(False)
|
| 22 |
|
|
|
|
|
|
|
|
|
|
| 23 |
|
| 24 |
# Load models
|
| 25 |
class DNASeqClassifier(nn.Module, PyTorchModelHubMixin):
|
|
@@ -60,10 +66,8 @@ def set_default_inputs():
|
|
| 60 |
DEFAULT_INPUTS["longitude"])
|
| 61 |
|
| 62 |
|
| 63 |
-
def preprocess(dna_sequence: str, latitude:
|
| 64 |
-
"""
|
| 65 |
-
Prepares app input for downsteram tasks
|
| 66 |
-
"""
|
| 67 |
|
| 68 |
# Preprocess the DNA sequence turning it into an embedding
|
| 69 |
dna_seq_preprocessed: str = re.sub(r"[^ACGT]", "N", dna_sequence)
|
|
@@ -80,58 +84,65 @@ def preprocess(dna_sequence: str, latitude: str, longitude: str):
|
|
| 80 |
# Preprocess the location data
|
| 81 |
coords = (float(latitude), float(longitude))
|
| 82 |
|
| 83 |
-
return dna_embedding, coords
|
| 84 |
-
# ecolayer_data = ecolayers_ds # TODO something something...
|
| 85 |
|
| 86 |
-
# # format lat and lon into coords
|
| 87 |
-
# coords = (inp_lat, inp_lng)
|
| 88 |
-
# # Grab rasters from the tifs
|
| 89 |
-
# ecoLayers = load_dataset("LofiAmazon/Global-Ecolayers")
|
| 90 |
-
# temp = pd.DataFrame([coords, embed], columns = ['coord', 'embeddings'])
|
| 91 |
-
# data = pd.merge(temp, ecoLayers, on='coord', how='left')
|
| 92 |
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
# genuses = infer.infer()
|
| 102 |
-
|
| 103 |
-
# results.append({
|
| 104 |
-
# "sequence": dna_df['nucraw'],
|
| 105 |
-
# # "predictions": pd.concat([dna_genuses, envdna_genuses], axis=0)
|
| 106 |
-
# 'predictions': genuses})
|
| 107 |
-
|
| 108 |
-
# return results
|
| 109 |
|
| 110 |
-
|
| 111 |
-
# data["embeddings"] = data["embeddings"].apply(lambda x: np.array(list(map(float, x[1:-1].split()))))
|
| 112 |
|
| 113 |
-
# # Pick genuses with most samples
|
| 114 |
-
# top_k = 5
|
| 115 |
-
# genus_counts = df["genus"].value_counts()
|
| 116 |
-
# top_genuses = genus_counts.head(top_k).index
|
| 117 |
-
# df = df[df["genus"].isin(top_genuses)]
|
| 118 |
|
| 119 |
-
# # Create a t-SNE plot of the embeddings
|
| 120 |
-
# n_genus = len(df["genus"].unique())
|
| 121 |
-
# tsne = TSNE(n_components=2, perplexity=30, learning_rate=200, n_iter=1000, random_state=0)
|
| 122 |
|
| 123 |
-
|
| 124 |
-
|
|
|
|
|
|
|
| 125 |
|
| 126 |
-
|
| 127 |
|
| 128 |
-
# label_encoder = LabelEncoder()
|
| 129 |
-
# y_encoded = label_encoder.fit_transform(y)
|
| 130 |
|
| 131 |
-
|
| 132 |
-
|
| 133 |
|
| 134 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 135 |
|
| 136 |
|
| 137 |
with gr.Blocks() as demo:
|
|
@@ -156,20 +167,34 @@ with gr.Blocks() as demo:
|
|
| 156 |
|
| 157 |
with gr.Row():
|
| 158 |
btn_run = gr.Button("Predict")
|
| 159 |
-
btn_run.click(
|
|
|
|
|
|
|
|
|
|
| 160 |
|
| 161 |
btn_defaults = gr.Button("I'm feeling lucky")
|
| 162 |
btn_defaults.click(fn=set_default_inputs, outputs=[inp_dna, inp_lat, inp_lng])
|
| 163 |
|
| 164 |
-
|
| 165 |
with gr.Tab("Genus Prediction"):
|
| 166 |
-
|
| 167 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 168 |
|
| 169 |
-
with gr.Row():
|
| 170 |
-
genus_out = gr.Dataframe(headers=["DNA Only Pred Genus", "DNA Only Prob", "DNA & Env Pred Genus", "DNA & Env Prob"])
|
| 171 |
-
# btn_run.click(fn=predict_genus, inputs=[inp_dna, inp_lat, inp_lng], outputs=genus_out)
|
| 172 |
-
|
| 173 |
with gr.Tab('DNA Embedding Space Visualizer'):
|
| 174 |
gr.Markdown("If the highest genus probability is very low for your DNA sequence, we can still examine the DNA embedding of the sequence in relation to known samples for clues.")
|
| 175 |
|
|
|
|
| 1 |
+
from io import BytesIO
|
| 2 |
+
import os
|
| 3 |
import re
|
| 4 |
import PIL.Image
|
| 5 |
import pandas as pd
|
|
|
|
| 14 |
from torch import nn
|
| 15 |
from transformers import BertConfig, BertForMaskedLM, PreTrainedTokenizerFast
|
| 16 |
from huggingface_hub import PyTorchModelHubMixin
|
| 17 |
+
from pinecone import Pinecone
|
| 18 |
|
| 19 |
+
from config import DEFAULT_INPUTS, MODELS, DATASETS, ID_TO_GENUS_MAP
|
| 20 |
|
| 21 |
# We need this for the eco layers because they are too big
|
| 22 |
PIL.Image.MAX_IMAGE_PIXELS = None
|
| 23 |
|
| 24 |
torch.set_grad_enabled(False)
|
| 25 |
|
| 26 |
+
# Configure pinecone
|
| 27 |
+
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
|
| 28 |
+
pc_index = pc.Index("amazon")
|
| 29 |
|
| 30 |
# Load models
|
| 31 |
class DNASeqClassifier(nn.Module, PyTorchModelHubMixin):
|
|
|
|
| 66 |
DEFAULT_INPUTS["longitude"])
|
| 67 |
|
| 68 |
|
| 69 |
+
def preprocess(dna_sequence: str, latitude: float, longitude: float):
|
| 70 |
+
"""Prepares app input for downsteram tasks"""
|
|
|
|
|
|
|
| 71 |
|
| 72 |
# Preprocess the DNA sequence turning it into an embedding
|
| 73 |
dna_seq_preprocessed: str = re.sub(r"[^ACGT]", "N", dna_sequence)
|
|
|
|
| 84 |
# Preprocess the location data
|
| 85 |
coords = (float(latitude), float(longitude))
|
| 86 |
|
| 87 |
+
return dna_embedding, coords[0], coords[1]
|
|
|
|
| 88 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 89 |
|
| 90 |
+
def tokenize(dna_sequence: str) -> dict[str, torch.Tensor]:
|
| 91 |
+
dna_seq_preprocessed: str = re.sub(r"[^ACGT]", "N", dna_sequence)
|
| 92 |
+
dna_seq_preprocessed: str = re.sub(r"N+$", "", dna_sequence)
|
| 93 |
+
dna_seq_preprocessed = dna_seq_preprocessed[:660]
|
| 94 |
+
dna_seq_preprocessed = " ".join([
|
| 95 |
+
dna_seq_preprocessed[i:i+4] for i in range(0, len(dna_seq_preprocessed), 4)
|
| 96 |
+
])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 97 |
|
| 98 |
+
return tokenizer(dna_seq_preprocessed, return_tensors="pt")
|
|
|
|
| 99 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100 |
|
|
|
|
|
|
|
|
|
|
| 101 |
|
| 102 |
+
def get_embedding(dna_sequence: str) -> torch.Tensor:
|
| 103 |
+
dna_embedding: torch.Tensor = embeddings_model(
|
| 104 |
+
**tokenize(dna_sequence)
|
| 105 |
+
).hidden_states[-1].mean(1).squeeze()
|
| 106 |
|
| 107 |
+
return dna_embedding
|
| 108 |
|
|
|
|
|
|
|
| 109 |
|
| 110 |
+
def predict_genus(method: str, dna_sequence: str, latitude: str, longitude: str):
|
| 111 |
+
coords = (float(latitude), float(longitude))
|
| 112 |
|
| 113 |
+
if method == "cosine":
|
| 114 |
+
embedding = get_embedding(dna_sequence)
|
| 115 |
+
result = pc_index.query(
|
| 116 |
+
namespace="all",
|
| 117 |
+
vector=embedding.tolist(),
|
| 118 |
+
top_k=100,
|
| 119 |
+
include_metadata=True,
|
| 120 |
+
)
|
| 121 |
+
top_k = [m["metadata"]["genus"] for m in result["matches"]]
|
| 122 |
+
|
| 123 |
+
top_k = pd.Series(top_k).value_counts()
|
| 124 |
+
top_k = top_k / top_k.sum()
|
| 125 |
+
|
| 126 |
+
if method == "fine_tuned_model":
|
| 127 |
+
bert_inputs = tokenize(dna_sequence)
|
| 128 |
+
logits = classification_model(bert_inputs, torch.zeros(1, 7))
|
| 129 |
+
probs = torch.softmax(logits, dim=1).squeeze()
|
| 130 |
+
top_k = torch.topk(probs, 10)
|
| 131 |
+
top_k = pd.Series(
|
| 132 |
+
top_k.values.detach().numpy(),
|
| 133 |
+
index=[ID_TO_GENUS_MAP[i] for i in top_k.indices.detach().numpy()]
|
| 134 |
+
)
|
| 135 |
+
# top_k = pd.Series(top_k.values.detach().numpy(), index=top_k.indices.detach().numpy())
|
| 136 |
+
|
| 137 |
+
fig, ax = plt.subplots()
|
| 138 |
+
ax.bar(top_k.index.astype(str), top_k.values)
|
| 139 |
+
ax.set_title("Genus Prediction")
|
| 140 |
+
ax.set_xlabel("Genus")
|
| 141 |
+
ax.set_ylabel("Probability")
|
| 142 |
+
ax.set_xticklabels(top_k.index.astype(str), rotation=90)
|
| 143 |
+
fig.canvas.draw()
|
| 144 |
+
|
| 145 |
+
return PIL.Image.frombytes("RGB", fig.canvas.get_width_height(), fig.canvas.tostring_rgb())
|
| 146 |
|
| 147 |
|
| 148 |
with gr.Blocks() as demo:
|
|
|
|
| 167 |
|
| 168 |
with gr.Row():
|
| 169 |
btn_run = gr.Button("Predict")
|
| 170 |
+
btn_run.click(
|
| 171 |
+
fn=preprocess,
|
| 172 |
+
inputs=[inp_dna, inp_lat, inp_lng],
|
| 173 |
+
)
|
| 174 |
|
| 175 |
btn_defaults = gr.Button("I'm feeling lucky")
|
| 176 |
btn_defaults.click(fn=set_default_inputs, outputs=[inp_dna, inp_lat, inp_lng])
|
| 177 |
|
|
|
|
| 178 |
with gr.Tab("Genus Prediction"):
|
| 179 |
+
gr.Interface(
|
| 180 |
+
fn=predict_genus,
|
| 181 |
+
inputs=[
|
| 182 |
+
gr.Dropdown(choices=["cosine", "fine_tuned_model"], value="fine_tuned_model"),
|
| 183 |
+
inp_dna,
|
| 184 |
+
inp_lat,
|
| 185 |
+
inp_lng,
|
| 186 |
+
],
|
| 187 |
+
outputs=["image"],
|
| 188 |
+
)
|
| 189 |
+
|
| 190 |
+
# with gr.Row():
|
| 191 |
+
|
| 192 |
+
# gr.Markdown("Make plot or table for Top 5 species")
|
| 193 |
+
|
| 194 |
+
# with gr.Row():
|
| 195 |
+
# genus_out = gr.Dataframe(headers=["DNA Only Pred Genus", "DNA Only Prob", "DNA & Env Pred Genus", "DNA & Env Prob"])
|
| 196 |
+
# # btn_run.click(fn=predict_genus, inputs=[inp_dna, inp_lat, inp_lng], outputs=genus_out)
|
| 197 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 198 |
with gr.Tab('DNA Embedding Space Visualizer'):
|
| 199 |
gr.Markdown("If the highest genus probability is very low for your DNA sequence, we can still examine the DNA embedding of the sequence in relation to known samples for clues.")
|
| 200 |
|
requirements.txt
CHANGED
|
@@ -1,5 +1,6 @@
|
|
| 1 |
huggingface-hub==0.23.2
|
| 2 |
pandas==2.2.2
|
|
|
|
| 3 |
rasterio==1.3.10
|
| 4 |
torch==2.3.0
|
| 5 |
tqdm==4.66.4
|
|
|
|
| 1 |
huggingface-hub==0.23.2
|
| 2 |
pandas==2.2.2
|
| 3 |
+
pinecone_client==4.1.0
|
| 4 |
rasterio==1.3.10
|
| 5 |
torch==2.3.0
|
| 6 |
tqdm==4.66.4
|