Spaces:

SZhanZ
/

REC

Sleeping

App Files Files Community

SZhanZ committed on Feb 16, 2025

Commit

d1798f9

1 Parent(s): 15f1882

init

Browse files

Files changed (7) hide show

.gitattributes +1 -0
.gradio/certificate.pem +31 -0
README.md +0 -0
app.py +104 -0
examples/image1.jpg +3 -0
examples/image2.jpg +0 -0
requirements.txt +5 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+examples/image1.jpg filter=lfs diff=lfs merge=lfs -text

.gradio/certificate.pem ADDED Viewed

	@@ -0,0 +1,31 @@

+-----BEGIN CERTIFICATE-----
+MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
+TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
+cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
+WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
+ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
+MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
+h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
+0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
+A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
+T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
+B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
+B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
+KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
+OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
+jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
+qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
+rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
+HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
+hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
+ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
+3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
+NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
+ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
+TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
+jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
+oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
+4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
+mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
+emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
+-----END CERTIFICATE-----

README.md CHANGED Viewed

File without changes

app.py ADDED Viewed

	@@ -0,0 +1,104 @@

+import re
+import torch
+from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
+from PIL import Image, ImageDraw
+def draw_bbox(image, bbox):
+    """Draw a red rectangle for ``bbox`` on ``image`` and return the image.
+
+    The image is modified in place; pass a copy to keep the original intact.
+
+    Args:
+        image: PIL image to draw on.
+        bbox: Four pixel coordinates ``(x1, y1, x2, y2)``.
+
+    Returns:
+        The same ``image`` object with the rectangle drawn on it.
+    """
+    x1, y1, x2, y2 = bbox
+    # The coordinates come from model output and may arrive in the wrong
+    # order; recent Pillow releases raise ValueError when x1 < x0 or y1 < y0,
+    # so sort each axis before drawing.
+    left, right = sorted((x1, x2))
+    top, bottom = sorted((y1, y2))
+    draw = ImageDraw.Draw(image)
+    draw.rectangle((left, top, right, bottom), outline="red", width=5)
+    return image
+def extract_bbox_answer(content):
+    """Extract a ``[x1, y1, x2, y2]`` bounding box from the model output.
+
+    The box is expected inside a JSON-like object, e.g.
+    ``{"bbox": [10, 20, 30, 40]}``. When the output contains an
+    ``<answer>...</answer>`` section only that section is searched, so
+    braces or brackets in the thinking text cannot be mistaken for the
+    answer. Without such a section the whole text is searched.
+
+    Args:
+        content: Decoded text generated by the model.
+
+    Returns:
+        A list of four ints, or ``[0, 0, 0, 0]`` when no box is found.
+    """
+    bbox_pattern = r'\{.*\[(\d+),\s*(\d+),\s*(\d+),\s*(\d+)]\s*.*\}'
+    answer_match = re.search(r'<answer>(.*?)</answer>', content, re.DOTALL)
+    search_text = answer_match.group(1) if answer_match else content
+    # re.DOTALL lets the JSON object span several lines (pretty-printed output).
+    bbox_match = re.search(bbox_pattern, search_text, re.DOTALL)
+    if bbox_match:
+        return [int(value) for value in bbox_match.groups()]
+    return [0, 0, 0, 0]
+def process_image_and_text(image, text):
+    """Run the referring-expression model on an image and a description.
+
+    Relies on the module-level ``model`` and ``processor`` objects, which the
+    ``__main__`` section creates before the Gradio app is launched.
+
+    Args:
+        image: PIL image supplied by the Gradio image input.
+        text: Description of the region to locate.
+
+    Returns:
+        A ``(thinking_process, result_image)`` tuple: the text found between
+        ``<think>`` tags (or a placeholder message when there is none) and a
+        copy of ``image`` with the predicted box drawn on it.
+    """
+    question = f"Please provide the bounding box coordinate of the region this sentence describes: {text}."
+    QUESTION_TEMPLATE = "{Question} First output the thinking process in <think> </think> tags and then output the final answer in <answer> </answer> tags. Output the final answer in JSON format."
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "image"},
+                {"type": "text", "text": QUESTION_TEMPLATE.format(Question=question)},
+            ],
+        }
+    ]
+    # Use a separate name so the caller's ``text`` argument is not overwritten.
+    prompt = processor.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
+    inputs = processor(
+        text=[prompt],
+        images=image,
+        return_tensors="pt",
+        padding=True,
+        padding_side="left",
+        add_special_tokens=False,
+    )
+    # Follow the device the model was loaded on instead of assuming "cuda".
+    inputs = inputs.to(model.device)
+    with torch.no_grad():
+        generated_ids = model.generate(**inputs, use_cache=True, max_new_tokens=256, do_sample=False)
+    # generate() returns prompt + completion; keep only the newly generated ids.
+    prompt_length = inputs.input_ids.shape[1]
+    generated_ids_trimmed = [out_ids[prompt_length:] for out_ids in generated_ids]
+    output_text = processor.batch_decode(
+        generated_ids_trimmed, skip_special_tokens=True
+    )[0]
+    print("output_text: ", output_text)
+    # Extract thinking process
+    think_match = re.search(r'<think>(.*?)</think>', output_text, re.DOTALL)
+    thinking_process = think_match.group(1).strip() if think_match else "No thinking process found"
+    # Draw on a copy so the image object handed in by Gradio is left untouched.
+    bbox = extract_bbox_answer(output_text)
+    result_image = image.copy()
+    # extract_bbox_answer returns [0, 0, 0, 0] when nothing was parsed; skip
+    # drawing then, so no stray mark appears in the top-left corner.
+    if any(bbox):
+        result_image = draw_bbox(result_image, bbox)
+    return thinking_process, result_image
+if __name__ == "__main__":
+    import gradio as gr
+    # Checkpoint identifier passed to from_pretrained for both model and processor.
+    model_path = "SZhanZ/Qwen2.5VL-VLM-R1-REC-step500"
+    # These two names are read as globals by process_image_and_text.
+    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(model_path, device_map="cuda:0")
+    processor = AutoProcessor.from_pretrained(model_path)
+    def gradio_interface(image, text):
+        """Callback for the Gradio form: returns (thinking text, annotated image)."""
+        return process_image_and_text(image, text)
+    input_components = [
+        gr.Image(type="pil", label="Input Image"),
+        gr.Textbox(label="Description Text"),
+    ]
+    output_components = [
+        gr.Textbox(label="Thinking Process"),
+        gr.Image(type="pil", label="Result with Bbox"),
+    ]
+    # Each example row is [image path, description], matching the input order.
+    example_rows = [
+        ["examples/image1.jpg", "food with the highest protein"],
+        ["examples/image2.jpg", "the cheapest laptop"],
+    ]
+    demo = gr.Interface(
+        fn=gradio_interface,
+        inputs=input_components,
+        outputs=output_components,
+        title="Visual Referring Expression Demo",
+        description="Upload an image and input description text, the system will return the thinking process and region annotation",
+        examples=example_rows,
+    )
+    # Listen on all interfaces at port 7860 and request a public share link.
+    demo.launch(server_name="0.0.0.0", server_port=7860, share=True)

examples/image1.jpg ADDED Viewed

Git LFS Details

SHA256: e779913142b5db662be50e6e5e8d9b598913dc3a1c2c27abfbbd1dd44630cdd9
Pointer size: 132 Bytes
Size of remote file: 1.24 MB

examples/image2.jpg ADDED Viewed

requirements.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+torch>=2.0.0
+git+https://github.com/huggingface/transformers
+Pillow>=10.0.0
+httpx[socks]
+accelerate>=0.26.0