Spaces:

KingNish
/

Sarvam-M-Demo

Running on Zero

App Files Files Community

KingNish committed on Jun 8

Commit

a56c04c

verified ·

1 Parent(s): 43e2f2f

Update app.py

Browse files

Files changed (1) hide show

app.py +35 -124

app.py CHANGED Viewed

@@ -1,40 +1,28 @@
 import gradio as gr
-from gradio import ChatMessage
 from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
 import torch
 from threading import Thread
 import spaces
-import time
-# --- Model and Tokenizer Setup ---
-print("Loading model and tokenizer...")
 model_name = "sarvamai/sarvam-m"
 tokenizer = AutoTokenizer.from_pretrained(model_name)
-model = AutoModelForCausalLM.from_pretrained(
-    model_name,
-    torch_dtype=torch.bfloat16, # bfloat16 is often better for inference
-    device_map="auto"
-)
-print("Model and tokenizer loaded.")
-# --- Core Generation Logic ---
 @spaces.GPU
-def generate_response(history: list[ChatMessage]):
-    # 1. Format the conversation history for the model
-    # The model expects a list of dictionaries, e.g., [{"role": "user", "content": "Hello"}]
-    # We convert our ChatMessage history to this format.
-    query = [msg.model_dump() for msg in history]
-    # Remove metadata as the model doesn't use it
-    for msg in query:
-        msg.pop('metadata', None)
-    prompt_text = tokenizer.apply_chat_template(query, tokenize=False, add_generation_prompt=True, enable_thinking=True)
-    model_inputs = tokenizer([prompt_text], return_tensors="pt").to(model.device)
-    # 2. Set up the streamer
     streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
-    # 3. Start generation in a separate thread
     generation_kwargs = dict(
         input_ids=model_inputs.input_ids,
         max_new_tokens=8192,
@@ -42,116 +30,39 @@ def generate_response(history: list[ChatMessage]):
         temperature=0.7,
         streamer=streamer,
     )
     thread = Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
-    # 4. Stream and process the output to create structured ChatMessages
-    in_thought_block = False
-    thought_content = ""
-    response_content = ""
-    # Add placeholder messages to the history that we will update
-    # One for thoughts, one for the final answer.
-    history.append(ChatMessage(role="assistant", content="", metadata={"title": "🤔 Thinking..."}))
-    history.append(ChatMessage(role="assistant", content="" ))
-    yield history
-    start_time = time.time()
     for new_text in streamer:
-        # Check if the model is starting to think
-        if "<think>" in new_text and not in_thought_block:
-            in_thought_block = True
-            # Any text after the tag in this chunk is part of the thought
-            thought_content += new_text.split("<think>", 1)[-1]
-            continue # Move to next token
-        # Check if the model has finished thinking
-        if "</think>" in new_text and in_thought_block:
-            in_thought_block = False
-            duration = time.time() - start_time
-            # Update the thought message with the full thought and completion status
-            parts = new_text.split("</think>", 1)
-            thought_content += parts[0]
-            history[-2].content = thought_content.strip() # The first placeholder message
-            history[-2].metadata = {"title": f"✅ Thinking Completed in {duration:.2f}s"}
-            # Any text after the tag is part of the final response
-            response_content += parts[1]
-            history[-1].content = response_content.lstrip() # The second placeholder
-            yield history
-            continue
-        # Accumulate content based on whether we are in a thought block or not
-        if in_thought_block:
-            thought_content += new_text
-            # Update the thinking message in real-time
-            history[-2].content = thought_content.strip()
         else:
-            response_content += new_text
-            # Update the final answer message in real-time
-            history[-1].content = response_content.lstrip()
-        yield history
-    # Final cleanup: if the thought bubble is empty, remove it.
-    if not history[-2].content.strip():
-        history.pop(-2)
-        yield history
-# --- Gradio Interface ---
-with gr.Blocks(theme=gr.themes.Default(primary_hue="blue", secondary_hue="blue")) as demo:
-    gr.Markdown(
-        """
-        # 🧠 Sarvam AI Chatbot with Thinking Process
-        This chatbot uses the `sarvamai/sarvam-m` model.
-        It will show its "thoughts" in a separate, collapsible box before giving the final answer.
-        """
-    )
-    chatbot = gr.Chatbot(
-        [],
-        elem_id="chatbot",
-        bubble_full_width=False,
-        height=600,
-        avatar_images=(None, "https://huggingface.co/sarvamai/sarvam-m/resolve/main/Sarvam.AI.logo.jpeg"),
-        show_copy_button=True,
-        type="messages" # Crucial for using ChatMessage objects
-    )
-    with gr.Row():
-        txt = gr.Textbox(
-            scale=4,
-            show_label=False,
-            placeholder="Enter your message and press enter...",
-            container=False,
-        )
-        btn = gr.Button("Submit", scale=1)
-    # Function to handle user submission
-    def user(user_message, history):
-        # Create a user message and add it to the history
-        history.append(ChatMessage(role="user", content=user_message))
-        return "", history
-    # Chain the events: user submission -> update history -> generate response
-    txt.submit(user, [txt, chatbot], [txt, chatbot], queue=False).then(
-        generate_response, chatbot, chatbot
-    )
-    btn.click(user, [txt, chatbot], [txt, chatbot], queue=False).then(
-        generate_response, chatbot, chatbot
-    )
-    gr.Examples(
-        [
-            "Write a short story about a robot who discovers music.",
-            "Explain the concept of black holes to a 5-year-old.",
-            "Plan a 3-day itinerary for a trip to Paris.",
-        ],
-        inputs=txt,
-        label="Example Prompts"
-    )
 if __name__ == "__main__":
-    demo.launch(debug=True)

 import gradio as gr
 from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
 import torch
 from threading import Thread
 import spaces
+# Load the model and tokenizer
 model_name = "sarvamai/sarvam-m"
 tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto")
 @spaces.GPU
+def generate_response(prompt, chat_history):
+    chat_history.append(dict(role="user", content=prompt ))
+    messages = [{"role": "user", "content": prompt}]
+    text = tokenizer.apply_chat_template(messages, tokenize=False, enable_thinking=True)
+    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
+    # Use TextIteratorStreamer for streaming
     streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+    # Conduct text generation with streaming
     generation_kwargs = dict(
         input_ids=model_inputs.input_ids,
         max_new_tokens=8192,
         temperature=0.7,
         streamer=streamer,
     )
     thread = Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
+    # Initialize variables to capture reasoning content and main content
+    reasoning_content = ""
+    content = ""
+    reasoning_done = False
+    chat_history.append(dict(role="assistant", content=reasoning_content, metadata={"title": "Thinking..."}) )
     for new_text in streamer:
+        if "</think>" in new_text:
+            chat_history[-1]["metadata"] = {"title": "Thinking Completed"}
+            reasoning_done = True
+            chat_history.append(dict(role="assistant", content=content))
+        if not reasoning_done:
+            reasoning_content += new_text
+            chat_history[-1]["content"] = reasoning_content
         else:
+            content += new_text
+            chat_history[-1]["content"] = content
+        yield chat_history
+# Build the Gradio UI. The chatbot uses the "messages" format because
+# generate_response yields a list of role/content dicts (optionally with
+# "metadata" for the thinking bubble), not (user, bot) tuples.
+with gr.Blocks() as demo:
+    gr.Markdown("# Sarvam M Demo")
+    chatbot = gr.Chatbot(height=600, type="messages")
+    msg = gr.Textbox(label="Your Message")
+    # Wire the textbox to the streaming handler defined in this file.
+    # The previous reference to `respond` pointed at a name that is never
+    # defined here and raised NameError while building the UI.
+    msg.submit(generate_response, [msg, chatbot], [chatbot])
 if __name__ == "__main__":
+    demo.launch()