Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -4,234 +4,91 @@ import torch
 from threading import Thread
 import spaces
 import time
-
-
-# For the advanced UI components
-import modelscope_studio.components.antd as antd
-import modelscope_studio.components.antdx as antdx
-import modelscope_studio.components.base as ms
-import modelscope_studio.components.pro as pro
-from modelscope_studio.components.pro.chatbot import (ChatbotBotConfig,
-                                                      ChatbotPromptsConfig,
-                                                      ChatbotUserConfig,
-                                                      ChatbotWelcomeConfig)
-
-# --- 1. Load the Hugging Face Model and Tokenizer ---
 model_name = "sarvamai/sarvam-m"
-print(f"Loading model: {model_name}...")
 tokenizer = AutoTokenizer.from_pretrained(model_name)
-model = AutoModelForCausalLM.from_pretrained(
-    model_name,
-    torch_dtype=torch.float16,
-    device_map="auto"
-)
-print("Model loaded successfully.")
-
-
-# --- 2. Helper and Event Handler Functions ---
-
-def format_history_for_sarvam(history: list) -> list:
-    messages = []
-    if not history:
-        return messages
-    for item in history:
-        role = item.get("role")
-        content = item.get("content")
-        if role == "user":
-            messages.append({"role": "user", "content": content})
-        elif role == "assistant":
-            final_content = ""
-            if isinstance(content, list):
-                for part in content:
-                    if part.get("type") == "text":
-                        final_content = part.get("content", "")
-                        break
-            elif isinstance(content, str):
-                final_content = content
-            if final_content:
-                messages.append({"role": "assistant", "content": final_content})
-    return messages
 
 @spaces.GPU
-def
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    }
-
-    try:
-        history_messages = format_history_for_sarvam(chatbot_value)
-        prompt_text = tokenizer.apply_chat_template(
-            history_messages, tokenize=False, add_generation_prompt=True, enable_thinking=True
-        )
-        model_inputs = tokenizer([prompt_text], return_tensors="pt").to(model.device)
-        streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
-        generation_kwargs = dict(
-            input_ids=model_inputs.input_ids,
-            max_new_tokens=8192,
-            do_sample=True,
-            temperature=0.7,
-            streamer=streamer,
-        )
-        thread = Thread(target=model.generate, kwargs=generation_kwargs)
-        thread.start()
-
-        start_time = time.time()
-        message_content = chatbot_value[-1]["content"]
-        message_content.append({
-            "copyable": False, "editable": False, "type": "tool", "content": "",
-            "options": {"title": "Thinking...", "status": "pending"}
-        })
-        message_content.append({"type": "text", "content": ""})
-        chatbot_value[-1]["loading"] = False
-        full_response = ""
-        thinking_content = ""
-        main_content = ""
-        thinking_done = False
-
-        for new_text in streamer:
-            full_response += new_text
-            if not thinking_done and "</think>" in full_response:
-                thinking_done = True
-                try:
-                    parts = full_response.split("</think>", 1)
-                    thinking_content = parts[0].split("<think>", 1)[1]
-                    main_content = parts[1]
-                    thought_cost_time = "{:.2f}".format(time.time() - start_time)
-                    message_content[0]["content"] = thinking_content.strip()
-                    message_content[0]["options"]["title"] = f"End of Thought ({thought_cost_time}s)"
-                    message_content[0]["options"]["status"] = "done"
-                except IndexError:
-                    main_content = full_response
-            elif not thinking_done:
-                if full_response.lstrip().startswith("<think>"):
-                    thinking_content = full_response.lstrip()[len("<think>"):]
-                    message_content[0]["content"] = thinking_content.strip()
-            else:
-                main_content = full_response.split("</think>", 1)[1]
-
-            message_content[1]["content"] = main_content.lstrip("\n")
-
-            # <-- 3. APPLY FIX HERE
-            yield {chatbot: gr.update(value=copy.deepcopy(chatbot_value))}
-
-        chatbot_value[-1]["footer"] = "{:.2f}s".format(time.time() - start_time)
-        chatbot_value[-1]["status"] = "done"
-
-        # <-- 4. APPLY FIX HERE
-        yield {
-            clear_btn: gr.update(disabled=False),
-            sender: gr.update(loading=False),
-            chatbot: gr.update(value=copy.deepcopy(chatbot_value)),
-        }
-
-    except Exception as e:
-        print(f"An error occurred: {e}")
-        chatbot_value[-1]["loading"] = False
-        chatbot_value[-1]["status"] = "done"
-        chatbot_value[-1]["content"] = f"Failed to respond due to an error: {e}"
-
-        # <-- 5. APPLY FIX HERE
-        yield {
-            clear_btn: gr.update(disabled=False),
-            sender: gr.update(loading=False),
-            chatbot: gr.update(value=copy.deepcopy(chatbot_value)),
-        }
-
-def prompt_select(e: gr.EventData):
-    return gr.update(value=e._data["payload"][0]["value"]["description"])
-
-def clear():
-    return gr.update(value=None)
-
-def retry(chatbot_value: list, e: gr.EventData):
-    index = e._data["payload"][0]["index"]
-    chatbot_value = chatbot_value[:index-1]
-    yield {
-        sender: gr.update(loading=True),
-        chatbot: gr.update(value=chatbot_value),
-        clear_btn: gr.update(disabled=True)
-    }
-    for chunk in submit(None, chatbot_value):
-        yield chunk
-
-def cancel(chatbot_value: list):
-    if chatbot_value and chatbot_value[-1].get("status") == "pending":
-        chatbot_value[-1]["loading"] = False
-        chatbot_value[-1]["status"] = "done"
-        chatbot_value[-1]["footer"] = "Chat completion paused"
-    return {
-        chatbot: gr.update(value=chatbot_value),
-        sender: gr.update(loading=False),
-        clear_btn: gr.update(disabled=False)
-    }
-
-# --- 3. Build the Gradio UI ---
-with gr.Blocks(theme=gr.themes.Default(primary_hue="blue", secondary_hue="blue")) as demo, ms.Application(), antdx.XProvider():
-    with antd.Flex(vertical=True, gap="middle"):
-        chatbot = pro.Chatbot(
-            height=650,
-            welcome_config=ChatbotWelcomeConfig(
-                variant="borderless",
-                icon="https://cdn-avatars.huggingface.co/v1/production/uploads/60270a7c32856987162c641a/umd13GCWVijwTDGZzw3q-.png",
-                title=f"Hello, I'm {model_name.split('/')[-1]}",
-                description="I can show you my thinking process. How can I help you today?",
-                prompts=ChatbotPromptsConfig(
-                    items=[
-                        {"label": "Explain a concept", "children": [{"description": "Explain what a Large Language Model is in simple terms."}]},
-                        {"label": "Help me write", "children": [{"description": "Write a short, futuristic story about AI companions."}]},
-                        {"label": "Creative Ideas", "children": [{"description": "Give me three creative names for a new coffee shop."}]},
-                        {"label": "Code generation", "children": [{"description": "Write a python function to find the factorial of a number."}]}
-                    ]
-                )
-            ),
-            user_config=ChatbotUserConfig(avatar="https://api.dicebear.com/7.x/miniavs/svg?seed=gradio"),
-            bot_config=ChatbotBotConfig(
-                header=model_name,
-                avatar="https://cdn-avatars.huggingface.co/v1/production/uploads/60270a7c32856987162c641a/umd13GCWVijwTDGZzw3q-.png",
-                actions=["copy", "retry"]
-            ),
-        )
-        with antdx.Sender() as sender:
-            with ms.Slot("prefix"):
-                with antd.Button(value=None, color="default", variant="text") as clear_btn:
-                    with ms.Slot("icon"):
-                        antd.Icon("ClearOutlined")
-
-    clear_btn.click(fn=clear, outputs=[chatbot])
-    submit_event = sender.submit(
-        fn=submit,
-        inputs=[sender, chatbot],
-        outputs=[sender, chatbot, clear_btn]
-    )
-    sender.cancel(
-        fn=cancel,
-        inputs=[chatbot],
-        outputs=[chatbot, sender, clear_btn],
-        cancels=[submit_event],
-        queue=False
-    )
-    chatbot.retry(
-        fn=retry,
-        inputs=[chatbot],
-        outputs=[sender, chatbot, clear_btn]
-    )
-    chatbot.welcome_prompt_select(
-        fn=prompt_select,
-        outputs=[sender]
     )
 
 if __name__ == "__main__":
-    demo.

 from threading import Thread
 import spaces
 import time
+
+# Load the model and tokenizer
 model_name = "sarvamai/sarvam-m"
 tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto")
 
 @spaces.GPU
+def generate_response(prompt, chat_history):
+    messages = [{"role": "user", "content": prompt}]
+    text = tokenizer.apply_chat_template(messages, tokenize=False, enable_thinking=True)
+
+    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
+
+    # Use TextIteratorStreamer for streaming
+    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+
+    # Conduct text generation with streaming
+    generation_kwargs = dict(
+        input_ids=model_inputs.input_ids,
+        max_new_tokens=8192,
+        do_sample=True,
+        temperature=0.7,
+        streamer=streamer,
     )
 
+    thread = Thread(target=model.generate, kwargs=generation_kwargs)
+    thread.start()
+
+    # Initialize variables to capture reasoning content and main content
+    reasoning_content = ""
+    content = ""
+    start_time = time.time()
+
+    # First yield: Show thinking has started
+    yield chat_history + [(None, "Thinking...")], ""
+
+    for new_text in streamer:
+        if "</think>" in new_text:
+            parts = new_text.split("</think>")
+            reasoning_content = parts[0].rstrip("\n")
+            content = parts[-1].lstrip("\n").rstrip("</s>")
+
+            # Calculate thinking time
+            thinking_time = time.time() - start_time
+
+            # Yield the thinking process
+            yield chat_history + [
+                (None, f"Thinking..."),
+                (None, f"Thinking completed. Thought for {thinking_time:.1f} seconds."),
+                (None, f"Thought process:\n{reasoning_content}")
+            ], ""
+        else:
+            content += new_text
+            # Yield the content as it's being generated
+            yield chat_history + [
+                (None, f"Thinking..."),
+                (None, f"Thinking completed. Thought for {time.time() - start_time:.1f} seconds."),
+                (None, f"Thought process:\n{reasoning_content}"),
+                (None, content)
+            ], ""
+
+    # Final yield with complete response
+    yield chat_history + [
+        (None, f"Thinking..."),
+        (None, f"Thinking completed. Thought for {time.time() - start_time:.1f} seconds."),
+        (None, f"Thought process:\n{reasoning_content}"),
+        (prompt, f"{reasoning_content}\n{content}" if reasoning_content else content)
+    ], ""
+
+# Create the Gradio interface
+with gr.Blocks() as demo:
+    gr.Markdown("# Sarvam M Demo")
+    chatbot = gr.Chatbot(show_copy_button=True)
+    msg = gr.Textbox(label="Your Message")
+
+    def respond(message, chat_history):
+        # Start with the user message
+        chat_history.append((message, None))
+        yield chat_history, ""
+
+        # Then stream the assistant's response
+        for updated_history, _ in generate_response(message, chat_history):
+            yield updated_history, ""
+
+    msg.submit(respond, [msg, chatbot], [chatbot, msg])
+
 if __name__ == "__main__":
+    demo.launch()
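
For reference, the pattern both versions of app.py build on is the same: a background call to model.generate() feeds a TextIteratorStreamer, and the streamed text is split at "</think>" to separate the reasoning trace from the visible answer. Below is a minimal, standalone sketch of that pattern; it is not part of the commit, the helper name stream_answer and the smaller max_new_tokens are illustrative choices, and it assumes the sarvam-m chat template accepts the enable_thinking flag exactly as it is used above.

# Illustrative sketch only (not from the commit): stream tokens on a worker thread
# and split the reasoning trace from the final answer.
from threading import Thread

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

model_name = "sarvamai/sarvam-m"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name, torch_dtype=torch.float16, device_map="auto"
)

def stream_answer(prompt: str):
    """Yield (reasoning_so_far, answer_so_far) while the model generates."""
    messages = [{"role": "user", "content": prompt}]
    text = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True, enable_thinking=True
    )
    inputs = tokenizer([text], return_tensors="pt").to(model.device)
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    # model.generate() blocks, so it runs on a worker thread while this thread
    # consumes the streamer; the iterator ends when generation finishes.
    Thread(
        target=model.generate,
        kwargs=dict(
            input_ids=inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_new_tokens=1024,  # shorter than the Space's 8192, for illustration
            do_sample=True,
            temperature=0.7,
            streamer=streamer,
        ),
    ).start()

    full = ""
    for chunk in streamer:
        full += chunk
        # Text before "</think>" is the reasoning trace; text after it is the answer.
        reasoning, _, answer = full.partition("</think>")
        yield reasoning.replace("<think>", "").strip(), answer.lstrip("\n")

# Usage: consume the generator and keep only the last (complete) pair.
# *_, (reasoning, answer) = stream_answer("Why is the sky blue?")
# print(answer)

The removed version routes these chunks into a modelscope_studio pro.Chatbot message dict, while the added version rebuilds tuple-style gr.Chatbot history on every chunk; the generation side is identical in both.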