Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -3,20 +3,22 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
 import torch
 from threading import Thread
 import spaces
+import time
 
 # Load the model and tokenizer
 model_name = "sarvamai/sarvam-m"
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, device_map="auto")
 
+indicators = ["Thinking ⠋", "Thinking ⠙", "Thinking ⠹", "Thinking ⠸", "Thinking ⠼", "Thinking ⠴", "Thinking ⠦", "Thinking ⠧", "Thinking ⠇", "Thinking ⠏"]
+
 @spaces.GPU(duration=120)
 def generate_response(prompt, chat_history):
-
-    chat_history.append(dict(role="user", content=prompt ))
+    chat_history.append(dict(role="user", content=prompt))
     yield chat_history
 
     print(chat_history)
-
+
     text = tokenizer.apply_chat_template(chat_history, tokenize=False, enable_thinking=True)
 
     model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
@@ -27,7 +29,7 @@ def generate_response(prompt, chat_history):
     # Conduct text generation with streaming
     generation_kwargs = dict(
         input_ids=model_inputs.input_ids,
-        max_new_tokens=
+        max_new_tokens=8192,
         streamer=streamer,
     )
 
@@ -38,22 +40,28 @@ def generate_response(prompt, chat_history):
     reasoning_content = ""
     content = ""
     reasoning_done = False
+    start_time = time.time()
 
-    chat_history.append(dict(role="assistant", content=reasoning_content, metadata={"title": "Thinking..."})
+    chat_history.append(dict(role="assistant", content=reasoning_content, metadata={"title": "Thinking..."}))
 
+    indicator_index = 0
     for new_text in streamer:
         if "</think>" in new_text:
-            chat_history[-1]["metadata"] = {"title": "Thinking Completed"}
             reasoning_done = True
+            thought_duration = time.time() - start_time
+            chat_history[-1]["metadata"] = {"title": f"Thought for {thought_duration:.2f} seconds"}
             chat_history.append(dict(role="assistant", content=content))
-
+
         if not reasoning_done:
+            # Update the thinking indicator
+            indicator_index = (indicator_index + 1) % len(indicators)
+            chat_history[-1]["metadata"] = {"title": indicators[indicator_index]}
             reasoning_content += new_text
             chat_history[-1]["content"] = reasoning_content
         else:
             content += new_text
             chat_history[-1]["content"] = content
-
+
         yield chat_history
 
 # Create the Gradio interface
@@ -64,4 +72,4 @@ with gr.Blocks() as demo:
     msg.submit(generate_response, [msg, chatbot], [chatbot])
 
 if __name__ == "__main__":
-    demo.launch()
+    demo.launch(mcp_server=True)
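
For reference, the hunks above elide the middle of generate_response, where the streamer consumed by generation_kwargs is created and model.generate is started in the background. A minimal sketch of that elided section, assuming it follows the standard transformers TextIteratorStreamer pattern (streamer, generation_kwargs, and model_inputs are names from the diff; the thread variable and the streamer arguments are assumptions):

    # Assumed reconstruction of the elided lines, not the file's exact code.
    # TextIteratorStreamer turns model.generate output into an iterable of text chunks.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(
        input_ids=model_inputs.input_ids,
        max_new_tokens=8192,
        streamer=streamer,
    )
    # Run generation on a worker thread so the handler can consume the stream
    # and yield incremental chat_history updates back to Gradio.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

Because generation runs on its own thread, the for new_text in streamer loop receives decoded chunks as they are produced, which is what lets the handler animate the spinner and update the chat turn token by token.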
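The interface block is likewise elided except for the msg.submit wiring. A plausible sketch, assuming a messages-format chatbot, which is what the metadata={"title": ...} thought accordions in the handler require (the component arguments are hypothetical):

    # Hypothetical UI block; only `demo`, `chatbot`, `msg`, and the
    # msg.submit(...) line are attested by the diff.
    with gr.Blocks() as demo:
        chatbot = gr.Chatbot(type="messages")  # dict-style history entries
        msg = gr.Textbox(placeholder="Message sarvam-m")
        msg.submit(generate_response, [msg, chatbot], [chatbot])

As for the final change, launching with demo.launch(mcp_server=True) additionally exposes the Space's functions over the Model Context Protocol, which requires a Gradio build with MCP support (e.g. pip install "gradio[mcp]").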