akhaliq (HF Staff) committed
Commit 882079e · verified · 1 Parent(s): 5925a3a

Upload app.py with huggingface_hub

Files changed (1)
  1. app.py +332 -0
app.py ADDED
@@ -0,0 +1,332 @@
+ I'll create a chat application for the UserLM-8b model with a clean interface and proper GPU optimization. The generation function is wrapped with the @spaces.GPU decorator so that GPU hardware is only requested while a response is being generated; the model itself is loaded lazily on the first request.
+
+ ```python
+ import gradio as gr
+ import spaces
+ import torch
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+ import time
+ from typing import List, Tuple
+
+ # Model configuration
+ MODEL_PATH = "microsoft/UserLM-8b"
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+
+ # Global variables for model and tokenizer
+ model = None
+ tokenizer = None
+
+ def load_model():
+     """Load the model and tokenizer."""
+     global model, tokenizer
+
+     print(f"Loading model {MODEL_PATH}...")
+     tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
+     model = AutoModelForCausalLM.from_pretrained(
+         MODEL_PATH,
+         trust_remote_code=True,
+         torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32,
+         low_cpu_mem_usage=True
+     ).to(DEVICE)
+     print(f"Model loaded successfully on {DEVICE}")
+     return model, tokenizer
+
+ @spaces.GPU(duration=120)
+ def generate_response(
+     message: str,
+     chat_history: List[Tuple[str, str]],
+     system_prompt: str,
+     temperature: float,
+     top_p: float,
+     max_new_tokens: int,
+ ) -> str:
+     """Generate a response from the model."""
+     global model, tokenizer
+
+     # Load model if not already loaded
+     if model is None or tokenizer is None:
+         model, tokenizer = load_model()
+
+     # Build conversation history
+     messages = []
+
+     # Add system prompt if provided
+     if system_prompt.strip():
+         messages.append({"role": "system", "content": system_prompt})
+
+     # Add chat history
+     for user_msg, assistant_msg in chat_history:
+         messages.append({"role": "user", "content": user_msg})
+         if assistant_msg:
+             messages.append({"role": "assistant", "content": assistant_msg})
+
+     # Add current message
+     messages.append({"role": "user", "content": message})
+
+     # Tokenize input
+     inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to(DEVICE)
+
+     # Define special tokens
+     end_token = "<|eot_id|>"
+     end_token_id = tokenizer.encode(end_token, add_special_tokens=False)
+
+     end_conv_token = "<|endconversation|>"
+     end_conv_token_id = tokenizer.encode(end_conv_token, add_special_tokens=False)
+
+     # Generate response
+     with torch.no_grad():
+         outputs = model.generate(
+             input_ids=inputs,
+             do_sample=True,
+             top_p=top_p,
+             temperature=temperature,
+             max_new_tokens=max_new_tokens,
+             eos_token_id=end_token_id,
+             pad_token_id=tokenizer.eos_token_id,
+             bad_words_ids=[[token_id] for token_id in end_conv_token_id]
+         )
+
+     # Decode response
+     response = tokenizer.decode(outputs[0][inputs.shape[1]:], skip_special_tokens=True)
+     return response
+
+ def respond(
+     message: str,
+     chat_history: List[Tuple[str, str]],
+     system_prompt: str,
+     temperature: float,
+     top_p: float,
+     max_new_tokens: int,
+ ):
+     """Stream response to the chatbot."""
+     # Generate complete response
+     bot_message = generate_response(
+         message,
+         chat_history,
+         system_prompt,
+         temperature,
+         top_p,
+         max_new_tokens
+     )
+
+     # Add to chat history
+     chat_history.append((message, bot_message))
+
+     # Stream the response character by character for better UX
+     partial_message = ""
+     for char in bot_message:
+         partial_message += char
+         time.sleep(0.01)  # Small delay for streaming effect
+         yield chat_history[:-1] + [(message, partial_message)]
+
+     yield chat_history
+
+ def clear_conversation():
+     """Clear the conversation history."""
+     return [], None
+
+ # Create the Gradio interface
+ with gr.Blocks(title="UserLM-8b Chat", theme=gr.themes.Soft()) as demo:
+     gr.Markdown(
+         """
+         # 🤖 UserLM-8b Chat Interface
+
+         Chat with Microsoft's UserLM-8b model. This model is designed to simulate user behavior and generate responses as if from a user perspective.
+
+         [Built with anycoder](https://huggingface.co/spaces/akhaliq/anycoder)
+         """
+     )
+
+     with gr.Row():
+         with gr.Column(scale=3):
+             chatbot = gr.Chatbot(
+                 height=500,
+                 show_copy_button=True,
+                 bubble_full_width=False,
+                 avatar_images=None,  # expects image paths/URLs; an emoji string is not a valid image path
+                 render_markdown=True,
+             )
+
+             with gr.Row():
+                 msg = gr.Textbox(
+                     label="Message",
+                     placeholder="Type your message here and press Enter...",
+                     lines=2,
+                     scale=4,
+                     autofocus=True,
+                 )
+                 submit_btn = gr.Button("Send", variant="primary", scale=1)
+
+             with gr.Row():
+                 clear_btn = gr.ClearButton(
+                     [chatbot, msg],
+                     value="🗑️ Clear Chat"
+                 )
+                 retry_btn = gr.Button("🔄 Retry Last")
+                 undo_btn = gr.Button("↩️ Undo Last")
+
+         with gr.Column(scale=1):
+             gr.Markdown("### ⚙️ Settings")
+
+             system_prompt = gr.Textbox(
+                 label="System Prompt",
+                 placeholder="Set the behavior of the model...",
+                 value="You are a user who wants to implement a special type of sequence. The sequence sums up the two previous numbers in the sequence and adds 1 to the result. The first two numbers in the sequence are 1 and 1.",
+                 lines=4,
+             )
+
+             temperature = gr.Slider(
+                 minimum=0.1,
+                 maximum=2.0,
+                 value=1.0,
+                 step=0.1,
+                 label="Temperature",
+                 info="Higher values make output more random"
+             )
+
+             top_p = gr.Slider(
+                 minimum=0.1,
+                 maximum=1.0,
+                 value=0.8,
+                 step=0.05,
+                 label="Top-p (nucleus sampling)",
+                 info="Lower values focus on more likely tokens"
+             )
+
+             max_new_tokens = gr.Slider(
+                 minimum=10,
+                 maximum=512,
+                 value=100,
+                 step=10,
+                 label="Max New Tokens",
+                 info="Maximum number of tokens to generate"
+             )
+
+             gr.Markdown(
+                 """
+                 ### 📊 Model Info
+                 - **Model**: microsoft/UserLM-8b
+                 - **Parameters**: 8 billion
+                 - **Device**: """ + DEVICE.upper() + """
+                 - **Precision**: FP16 (CUDA) / FP32 (CPU)
+                 """
+             )
+
+     # Store conversation history
+     chat_history = gr.State([])
+
+     # Event handlers
+     def user_submit(message, history):
+         return "", history + [(message, None)]
+
+     def bot_respond(history, system, temp, top_p, max_tokens):
+         if not history or history[-1][1] is not None:
+             # Nothing pending: yield the history unchanged (a bare return from a generator would emit no update)
+             yield history
+             return
+
+         message = history[-1][0]
+         history_without_last = history[:-1]
+
+         for new_history in respond(message, history_without_last, system, temp, top_p, max_tokens):
+             yield new_history
+
+     def retry_last(history, system, temp, top_p, max_tokens):
+         if not history:
+             # Empty chat: yield unchanged (this is a generator, so a bare return would produce no output)
+             yield history
+             return
+
+         # Remove last exchange and regenerate
+         last_user_msg = history[-1][0]
+         history = history[:-1]
+
+         for new_history in respond(last_user_msg, history, system, temp, top_p, max_tokens):
+             yield new_history
+
+     def undo_last(history):
+         if history:
+             return history[:-1]
+         return history
+
+     # Connect events
+     msg.submit(
+         user_submit,
+         [msg, chatbot],
+         [msg, chatbot],
+         queue=False
+     ).then(
+         bot_respond,
+         [chatbot, system_prompt, temperature, top_p, max_new_tokens],
+         chatbot
+     )
+
+     submit_btn.click(
+         user_submit,
+         [msg, chatbot],
+         [msg, chatbot],
+         queue=False
+     ).then(
+         bot_respond,
+         [chatbot, system_prompt, temperature, top_p, max_new_tokens],
+         chatbot
+     )
+
+     retry_btn.click(
+         retry_last,
+         [chatbot, system_prompt, temperature, top_p, max_new_tokens],
+         chatbot
+     )
+
+     undo_btn.click(
+         undo_last,
+         chatbot,
+         chatbot
+     )
+
+     # Show a startup notice (the model itself is loaded lazily on the first message)
+     demo.load(
+         fn=lambda: gr.Info("Model loading... This may take a moment on first run."),
+         inputs=None,
+         outputs=None
+     )
+
+     # Examples
+     gr.Examples(
+         examples=[
+             ["Can you help me understand how this sequence works?"],
+             ["What would be the next 5 numbers in the sequence?"],
+             ["Let's implement this sequence in Python together."],
+             ["Can you explain the pattern: 1, 1, 3, 5, 9, 15...?"],
+         ],
+         inputs=msg,
+         label="Example Messages",
+     )
+
+ if __name__ == "__main__":
+     demo.launch(
+         share=False,
+         show_error=True,
+         server_name="0.0.0.0",
+         server_port=7860,
+     )
+ ```
+
+ This chat application provides:
+
+ ## Key Features:
+
+ 1. **Clean Chat Interface**: A modern, responsive chat UI with message bubbles
+ 2. **Streaming Responses**: Character-by-character streaming for better UX
+ 3. **Customizable Settings**: Temperature, top-p, and max token controls
+ 4. **System Prompt**: Configurable system prompt with the default sequence example
+ 5. **Chat Management**: Clear, retry, and undo functionality (all three operate on the tuple-style history sketched below)
+ 6. **GPU Optimization**: Automatic GPU detection and FP16 precision on CUDA
+ 7. **Example Messages**: Pre-defined examples to get started quickly
+ 8. **Model Info Display**: Shows the current device and model configuration
+
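+ For reference, the handlers above all pass around the tuple-style history that `gr.Chatbot` displays: a plain list of (user, assistant) pairs. A minimal illustration of how one exchange evolves (the messages here are made up):
+
+ ```python
+ history = []                                        # empty chat
+ history.append(("What's the 5th term?", None))      # user_submit: user turn added, reply pending
+ history[-1] = (history[-1][0], "It should be 9.")   # respond/bot_respond: reply filled in
+ history = history[:-1]                              # undo_last: drop the latest exchange
+ ```
+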
+ ## Technical Highlights:
+
+ - **Lazy Loading**: The model loads only when the first message is sent
+ - **Memory Efficient**: Uses `low_cpu_mem_usage=True` and appropriate precision
+ - **Proper Token Handling**: Implements the special tokens from your example (a quick way to verify them is sketched below)
+ - **State Management**: Maintains conversation history properly
+ - **CPU Fallback**: Falls back to CPU automatically if CUDA is unavailable
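+
+ The token handling above assumes that `<|eot_id|>` and `<|endconversation|>` exist in the UserLM-8b tokenizer's vocabulary. A quick sanity check you can run separately to confirm the ids that feed `eos_token_id` and `bad_words_ids`:
+
+ ```python
+ from transformers import AutoTokenizer
+
+ tok = AutoTokenizer.from_pretrained("microsoft/UserLM-8b", trust_remote_code=True)
+ for marker in ("<|eot_id|>", "<|endconversation|>"):
+     # add_special_tokens=False so only the marker's own id(s) are printed
+     print(marker, "->", tok.encode(marker, add_special_tokens=False))
+ ```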
+
+ The app preserves your original model loading and generation logic while wrapping it in a user-friendly Gradio interface. Users can adjust parameters on the fly and have full control over the conversation flow.
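+
+ To run this outside a managed Space, the file needs `gradio`, `spaces`, `torch`, and `transformers` installed (these are the imports at the top of app.py). A minimal, unpinned `requirements.txt` sketch (exact versions are not specified here, and `accelerate` is an assumption since `low_cpu_mem_usage=True` typically relies on it):
+
+ ```
+ gradio
+ spaces
+ torch
+ transformers
+ accelerate
+ ```
+
+ With the dependencies installed, `python app.py` starts the server on port 7860 (per the `demo.launch` call), reachable at http://localhost:7860.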