Johnny-Z committed on
Commit 7126f84 · verified · 1 Parent(s): f8efae2

Upload 2 files

Files changed (2)
  1. app.py +473 -0
  2. requirements.txt +8 -0
app.py ADDED
@@ -0,0 +1,473 @@
+ from transformers import CLIPImageProcessor, AutoModel
+ import torch
+ import json
+ import torch.nn as nn
+ from PIL import Image
+ import gradio as gr
+ import os
+ import faiss
+ import time
+ import requests
+ from huggingface_hub import login, snapshot_download
+
+ TITLE = "Danbooru Tagger"
+ DESCRIPTION = """
+ ## Dataset
+ - Source: Cleaned Danbooru
+
+ ## Metrics
+ - Validation Split: 10% of Dataset
+ - Validation Results:
+
+ ### General
+ | Metric          | Value  |
+ |-----------------|--------|
+ | Macro F1        | 0.4678 |
+ | Macro Precision | 0.4605 |
+ | Macro Recall    | 0.5229 |
+ | Micro F1        | 0.6661 |
+ | Micro Precision | 0.6049 |
+ | Micro Recall    | 0.7411 |
+
+ ### Character
+ | Metric          | Value  |
+ |-----------------|--------|
+ | Macro F1        | 0.8925 |
+ | Macro Precision | 0.9099 |
+ | Macro Recall    | 0.8935 |
+ | Micro F1        | 0.9232 |
+ | Micro Precision | 0.9264 |
+ | Micro Recall    | 0.9199 |
+
+ ### Artist
+ | Metric          | Value  |
+ |-----------------|--------|
+ | Macro F1        | 0.7904 |
+ | Macro Precision | 0.8286 |
+ | Macro Recall    | 0.7904 |
+ | Micro F1        | 0.5989 |
+ | Micro Precision | 0.5975 |
+ | Micro Recall    | 0.6004 |
+ """
+ kaomojis = [
+     "0_0",
+     "(o)_(o)",
+     "+_+",
+     "+_-",
+     "._.",
+     "<o>_<o>",
+     "<|>_<|>",
+     "=_=",
+     ">_<",
+     "3_3",
+     "6_9",
+     ">_o",
+     "@_@",
+     "^_^",
+     "o_o",
+     "u_u",
+     "x_x",
+     "|_|",
+     "||_||",
+ ]
+
+ device = torch.device('cpu')
+ dtype = torch.float32
+
+ hf_token = os.getenv("HF_TOKEN")
+ if hf_token:
+     login(token=hf_token)
+ else:
+     raise ValueError("Environment variable HF_TOKEN is not set.")
+
+ repo = snapshot_download('Johnny-Z/vit-e4')
+ model = AutoModel.from_pretrained(repo, dtype=dtype, trust_remote_code=True, device_map=device)
+
+ index_dir = snapshot_download('Johnny-Z/dan_index', repo_type='dataset')
+
+ processor = CLIPImageProcessor.from_pretrained(repo)
+
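+ # Multihead attention pooling (MAP) head: a single learned probe token
+ # cross-attends over the backbone's token embeddings (width 2048 here) and
+ # pools them into one image-level vector.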
+ class MultiheadAttentionPoolingHead(nn.Module):
+     def __init__(self, input_size):
+         super().__init__()
+
+         self.map_probe = nn.Parameter(torch.randn(1, 1, input_size))
+         self.map_layernorm0 = nn.LayerNorm(input_size, eps=1e-08)
+         self.map_attention = torch.nn.MultiheadAttention(input_size, input_size // 64, batch_first=True)
+         self.map_layernorm1 = nn.LayerNorm(input_size, eps=1e-08)
+         self.map_ffn = nn.Sequential(
+             nn.Linear(input_size, input_size * 4),
+             nn.SiLU(),
+             nn.Linear(input_size * 4, input_size)
+         )
+
+     def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
+         batch_size = hidden_state.shape[0]
+         probe = self.map_probe.repeat(batch_size, 1, 1)
+
+         hidden_state = self.map_layernorm0(hidden_state)
+         hidden_state = self.map_attention(probe, hidden_state, hidden_state)[0]
+         hidden_state = self.map_layernorm1(hidden_state)
+
+         residual = hidden_state
+         hidden_state = residual + self.map_ffn(hidden_state)
+         return hidden_state[:, 0]
+
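+ # Classifier head: per-class sigmoid outputs for multi-label tag prediction,
+ # used separately for the general, character, and artist vocabularies.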
+ class MLP(nn.Module):
+     def __init__(self, input_size, class_num):
+         super().__init__()
+         self.mlp_layer0 = nn.Sequential(
+             nn.LayerNorm(input_size, eps=1e-08),
+             nn.Linear(input_size, input_size // 2),
+             nn.SiLU()
+         )
+         self.mlp_layer1 = nn.Linear(input_size // 2, class_num)
+         self.sigmoid = nn.Sigmoid()
+
+     def forward(self, x):
+         x = self.mlp_layer0(x)
+         x = self.mlp_layer1(x)
+         x = self.sigmoid(x)
+         return x
+
+ class MLP_Retrieval(nn.Module):
+     def __init__(self, input_size, class_num):
+         super().__init__()
+         self.mlp_layer0 = nn.Sequential(
+             nn.Linear(input_size, input_size // 2),
+             nn.SiLU()
+         )
+         self.mlp_layer1 = nn.Linear(input_size // 2, class_num)
+
+     def forward(self, x):
+         x = self.mlp_layer0(x)
+         x = self.mlp_layer1(x)
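+         # The logits are split into two groups that are softmaxed
+         # independently. Judging by prediction_to_retrieval below, the first
+         # 15 outputs appear to correspond to the date (year) classes and the
+         # remainder to artist classes, but that split is an assumption.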
+         x1, x2 = x[:, :15], x[:, 15:]
+         x1 = torch.softmax(x1, dim=1)
+         x2 = torch.softmax(x2, dim=1)
+         x = torch.cat([x1, x2], dim=1)
+
+         return x
+
+ class MLP_R(nn.Module):
+     def __init__(self, input_size):
+         super().__init__()
+         self.mlp_layer0 = nn.Sequential(
+             nn.Linear(input_size, 256),
+         )
+
+     def forward(self, x):
+         x = self.mlp_layer0(x)
+         return x
+
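+ # Tag metadata shipped with the model repo. In each tag dictionary a tag
+ # maps to a tuple whose second field is its category ("general",
+ # "character", "artist", "rating", "date") and whose third field is its
+ # 1-based class id; the *_threshold.json files hold tuned per-tag decision
+ # thresholds under the "Threshold" key.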
+ with open(os.path.join(repo, 'general_tag_dict.json'), 'r', encoding='utf-8') as f:
+     general_dict = json.load(f)
+
+ with open(os.path.join(repo, 'character_tag_dict.json'), 'r', encoding='utf-8') as f:
+     character_dict = json.load(f)
+
+ with open(os.path.join(repo, 'artist_tag_dict.json'), 'r', encoding='utf-8') as f:
+     artist_dict = json.load(f)
+
+ with open(os.path.join(repo, 'implications_list.json'), 'r', encoding='utf-8') as f:
+     implications_list = json.load(f)
+
+ with open(os.path.join(repo, 'artist_threshold.json'), 'r', encoding='utf-8') as f:
+     artist_thresholds = json.load(f)
+
+ with open(os.path.join(repo, 'character_threshold.json'), 'r', encoding='utf-8') as f:
+     character_thresholds = json.load(f)
+
+ with open(os.path.join(repo, 'general_threshold.json'), 'r', encoding='utf-8') as f:
+     general_thresholds = json.load(f)
+
+ model_map = MultiheadAttentionPoolingHead(2048)
+ model_map.load_state_dict(torch.load(os.path.join(repo, "map_head.pth"), map_location=device, weights_only=True))
+ model_map.to(device).to(dtype).eval()
+
+ general_class = 9775
+ mlp_general = MLP(2048, general_class)
+ mlp_general.load_state_dict(torch.load(os.path.join(repo, "cls_predictor_general.pth"), map_location=device, weights_only=True))
+ mlp_general.to(device).to(dtype).eval()
+
+ character_class = 7568
+ mlp_character = MLP(2048, character_class)
+ mlp_character.load_state_dict(torch.load(os.path.join(repo, "cls_predictor_character.pth"), map_location=device, weights_only=True))
+ mlp_character.to(device).to(dtype).eval()
+
+ artist_class = 13957
+ mlp_artist = MLP(2048, artist_class)
+ mlp_artist.load_state_dict(torch.load(os.path.join(repo, "cls_predictor_artist.pth"), map_location=device, weights_only=True))
+ mlp_artist.to(device).to(dtype).eval()
+
+ mlp_artist_retrieval = MLP_Retrieval(2048, artist_class)
+ mlp_artist_retrieval.load_state_dict(torch.load(os.path.join(repo, "cls_predictor_artist_retrieval.pth"), map_location=device, weights_only=True))
+ mlp_artist_retrieval.to(device).to(dtype).eval()
+
+ mlp_r = MLP_R(2048)
+ mlp_r.load_state_dict(torch.load(os.path.join(repo, "retrieval_head.pth"), map_location=device, weights_only=True))
+ mlp_r.to(device).to(dtype).eval()
+
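+ # Convert raw sigmoid scores into tag dictionaries: a coarse 0.2 cutoff
+ # first prunes the candidate set, then each surviving tag must clear its own
+ # tuned threshold (defaulting to 0.75 when no per-tag value exists). Only
+ # the single highest-scoring date and rating tag are kept.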
+ def prediction_to_tag(prediction, tag_dict, class_num):
+     prediction = prediction.view(class_num)
+     predicted_ids = (prediction >= 0.2).nonzero(as_tuple=True)[0].cpu().numpy() + 1
+
+     general = {}
+     character = {}
+     artist = {}
+     date = {}
+     rating = {}
+
+     for tag, value in tag_dict.items():
+         if value[2] in predicted_ids:
+             tag_value = round(prediction[value[2] - 1].item(), 6)
+             if value[1] == "general" and tag_value >= general_thresholds.get(tag, {}).get("Threshold", 0.75):
+                 general[tag] = tag_value
+             elif value[1] == "character" and tag_value >= character_thresholds.get(tag, {}).get("Threshold", 0.75):
+                 character[tag] = tag_value
+             elif value[1] == "artist" and tag_value >= artist_thresholds.get(tag, {}).get("Threshold", 0.75):
+                 artist[tag] = tag_value
+             elif value[1] == "rating":
+                 rating[tag] = tag_value
+             elif value[1] == "date":
+                 date[tag] = tag_value
+
+     general = dict(sorted(general.items(), key=lambda item: item[1], reverse=True))
+     character = dict(sorted(character.items(), key=lambda item: item[1], reverse=True))
+     artist = dict(sorted(artist.items(), key=lambda item: item[1], reverse=True))
+
+     if date:
+         date = {max(date, key=date.get): date[max(date, key=date.get)]}
+     if rating:
+         rating = {max(rating, key=rating.get): rating[max(rating, key=rating.get)]}
+
+     return general, character, artist, date, rating
+
+ def prediction_to_retrieval(prediction, tag_dict, class_num, top_k):
+     prediction = prediction.view(class_num)
+     predicted_ids = (prediction >= 0.005).nonzero(as_tuple=True)[0].cpu().numpy() + 1
+
+     artist = {}
+     date = {}
+
+     for tag, value in tag_dict.items():
+         if value[2] in predicted_ids:
+             tag_value = round(prediction[value[2] - 1].item(), 6)
+             if value[1] == "artist":
+                 artist[tag] = tag_value
+             elif value[1] == "date":
+                 date[tag] = tag_value
+
+     artist = dict(sorted(artist.items(), key=lambda item: item[1], reverse=True))
+     artist = dict(list(artist.items())[:top_k])
+
+     if date:
+         date = {max(date, key=date.get): date[max(date, key=date.get)]}
+
+     return artist, date
+
+ def load_id_map(id_map_path):
+     with open(id_map_path, "r") as f:
+         id_map = json.load(f)
+
+     id_map = {int(k): int(v) for k, v in id_map.items()}
+
+     inv_map = {v: k for k, v in id_map.items()}
+     return id_map, inv_map
+
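+ # FAISS L2 indexes report *squared* L2 distances, so the user-facing
+ # thresholds are squared before filtering and the square root is taken again
+ # when reporting. Note the index is re-read from disk on every query, which
+ # is simple but not the fastest option.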
+ def search_index(query_vector, k=32, distance_threshold_min=0, distance_threshold_max=64, nprobe=4):
+     index_path = os.path.join(index_dir, 'danbooru_retrieval.index')
+     id_map_path = os.path.join(index_dir, 'danbooru_retrieval_id_map.json')
+     distance_threshold_min = distance_threshold_min**2
+     distance_threshold_max = distance_threshold_max**2
+
+     index = faiss.read_index(index_path)
+
+     if nprobe is not None and hasattr(index, "nprobe"):
+         index.nprobe = nprobe
+     _, inv_map = load_id_map(id_map_path)
+
+     qv = query_vector.detach().to(torch.float32).cpu().numpy()
+
+     # k may arrive as a float from the UI slider; FAISS expects an int.
+     distances, internal_ids = index.search(qv, int(k))
+     distances = distances[0]
+     internal_ids = internal_ids[0]
+
+     results = []
+     for dist, internal_id in zip(distances, internal_ids):
+         if internal_id == -1:
+             continue
+         if dist < distance_threshold_min or dist > distance_threshold_max:
+             continue
+         original_id = inv_map.get(int(internal_id))
+         if original_id is None:
+             continue
+         results.append({"original_id": original_id, "l2_distance": float(dist**0.5)})
+     results.sort(key=lambda x: x["l2_distance"])
+
+     return results
+
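+ # Resolve retrieved post ids to image URLs via the public Danbooru JSON API,
+ # sleeping between requests to stay polite to the server; the finally block
+ # enforces the delay on every path, including errors.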
+ def fetch_retrieval_image_urls(retrieval_results, sleep_sec=0.25, timeout=4.0):
+     pairs = []
+     for item in retrieval_results:
+         oid = item.get("original_id")
+         if oid is None:
+             continue
+         api_url = f"https://danbooru.donmai.us/posts/{oid}.json"
+         try:
+             resp = requests.get(api_url, timeout=timeout)
+             if resp.status_code != 200:
+                 continue  # skip posts we cannot fetch; finally still sleeps
+             data = resp.json()
+             url = data.get("large_file_url") or data.get("file_url") or data.get("preview_file_url")
+             if not url:
+                 continue
+
+             # Normalize protocol-relative and site-relative URLs.
+             if url.startswith("//"):
+                 url = "https:" + url
+             elif url.startswith("/"):
+                 url = "https://danbooru.donmai.us" + url
+             pairs.append((url, oid))
+         except Exception:
+             # Network or JSON errors: drop this result and move on.
+             pass
+         finally:
+             time.sleep(sleep_sec)
+     return pairs
+
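+ # Full inference pipeline for one image: flatten transparency onto a white
+ # background, embed with the ViT backbone, pool with the MAP head, then run
+ # the classifier heads and the 256-d retrieval head under no_grad.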
+ def process_image(image, k, distance_threshold_min, distance_threshold_max):
+     if image is None:
+         return "", {}, {}, {}, {}, {}, []
+     try:
+         image = image.convert('RGBA')
+         background = Image.new('RGBA', image.size, (255, 255, 255, 255))
+         image = Image.alpha_composite(background, image).convert('RGB')
+
+         image_inputs = processor(images=[image], return_tensors="pt").to(device).to(dtype)
+
+     except (OSError, IOError) as e:
+         print(f"Error opening image: {e}")
+         # Return empty values for all seven outputs so Gradio can render.
+         return "", {}, {}, {}, {}, {}, []
+     with torch.no_grad():
+         embedding = model(image_inputs.pixel_values)
+
+         embedding = model_map(embedding)
+
+         embedding_r = mlp_r(embedding)
+
+         retrieval_results = search_index(embedding_r, k, distance_threshold_min, distance_threshold_max)
+
+         url_id_pairs = fetch_retrieval_image_urls(retrieval_results)
+
+         retrieval_gallery_items = [(url, f"https://danbooru.donmai.us/posts/{oid}") for url, oid in url_id_pairs]
+
+         general_prediction = mlp_general(embedding)
+         general_ = prediction_to_tag(general_prediction, general_dict, general_class)
+         general_tags = general_[0]
+         rating = general_[4]
+
+         character_prediction = mlp_character(embedding)
+         character_ = prediction_to_tag(character_prediction, character_dict, character_class)
+         character_tags = character_[1]
+
+         artist_retrieval_prediction = mlp_artist_retrieval(embedding)
+         artist_retrieval_ = prediction_to_retrieval(artist_retrieval_prediction, artist_dict, artist_class, 10)
+         artist_tags = artist_retrieval_[0]
+         date = artist_retrieval_[1]
+
+         # Drop tags implied by a more specific tag already present, then
+         # convert underscores to spaces (except for kaomoji tags) and escape
+         # parentheses for prompt syntax.
+         tags_list = list(general_tags)
+         remove_list = []
+         for tag in tags_list:
+             if tag in implications_list:
+                 for implication in implications_list[tag]:
+                     remove_list.append(implication)
+         tags_list = [tag for tag in tags_list if tag not in remove_list]
+         tags_list = [tag.replace("_", " ") if tag not in kaomojis else tag for tag in tags_list]
+
+         tags_str = ", ".join(tags_list).replace("(", r"\(").replace(")", r"\)")
+
+         return (
+             tags_str,
+             artist_tags,
+             character_tags,
+             general_tags,
+             rating,
+             date,
+             retrieval_gallery_items,
+         )
+
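+ # Gradio UI: image and retrieval controls on the left; tag string,
+ # per-category labels, and a gallery of visually similar Danbooru posts on
+ # the right.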
+ def main():
+     with gr.Blocks(title=TITLE) as demo:
+         with gr.Column():
+             gr.Markdown(
+                 value=f"<h1 style='text-align: center; margin-bottom: 1rem'>{TITLE}</h1>"
+             )
+             with gr.Row():
+                 with gr.Column(variant="panel"):
+                     submit = gr.Button(value="Submit", variant="primary", size="lg")
+                     image = gr.Image(type="pil", image_mode="RGBA", label="Input")
+                     k_slider = gr.Slider(1, 100, value=32, step=1, label="Top K Results")
+                     distance_min_slider = gr.Slider(0, 128, value=0, step=1, label="Min Distance Threshold")
+                     distance_max_slider = gr.Slider(0, 128, value=80, step=1, label="Max Distance Threshold")
+                     with gr.Row():
+                         clear = gr.ClearButton(
+                             components=[
+                                 image,
+                                 k_slider,
+                                 distance_min_slider,
+                                 distance_max_slider,
+                             ],
+                             variant="secondary",
+                             size="lg",
+                         )
+                     gr.Markdown(value=DESCRIPTION)
+                 with gr.Column(variant="panel"):
+                     tags_str = gr.Textbox(label="Output", lines=4)
+                     with gr.Row():
+                         rating = gr.Label(label="Rating")
+                         date = gr.Label(label="Year")
+                     artist_tags = gr.Label(label="Artist")
+                     character_tags = gr.Label(label="Character")
+                     general_tags = gr.Label(label="General")
+                     with gr.Row():
+                         retrieval_gallery = gr.Gallery(
+                             label="Retrieval Preview",
+                             columns=5,
+                         )
+         clear.add(
+             [
+                 tags_str,
+                 artist_tags,
+                 general_tags,
+                 character_tags,
+                 rating,
+                 date,
+                 retrieval_gallery,
+             ]
+         )
+
+         submit.click(
+             process_image,
+             inputs=[image, k_slider, distance_min_slider, distance_max_slider],
+             outputs=[
+                 tags_str,
+                 artist_tags,
+                 character_tags,
+                 general_tags,
+                 rating,
+                 date,
+                 retrieval_gallery,
+             ],
+         )
+
+     demo.queue(max_size=10)
+     demo.launch()
+
+ if __name__ == "__main__":
+     main()
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ torch
+ transformers
+ Pillow
+ gradio
+ einops
+ timm
+ accelerate
+ faiss-cpu