import gradio as gr
from huggingface_hub import InferenceClient
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
import torch
import spaces
from threading import Thread

# set this to the exact name of your HF repo
HF_MODEL_ID = "rieon/DeepCoder-14B-Preview-Suger"

# explicitly tell the client you want text-generation
# client = InferenceClient(model=HF_MODEL_ID)

device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained(HF_MODEL_ID, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(
    HF_MODEL_ID,
    device_map="auto",          # spreads the layers across all available GPUs
    torch_dtype=torch.float16,  # half precision to keep memory use manageable
)
model.eval()
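
# Optional sketch (untested here, assumes the bitsandbytes package is installed):
# if the fp16 weights do not fit on the available GPUs, 4-bit quantization cuts
# weight memory to roughly a quarter of fp16. Swap the loading call above for:
#
#     from transformers import BitsAndBytesConfig
#     bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)
#     model = AutoModelForCausalLM.from_pretrained(
#         HF_MODEL_ID,
#         device_map="auto",
#         quantization_config=bnb_config,
#     )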

# Earlier implementation that streamed from the hosted Inference API via
# InferenceClient instead of running the model locally (kept for reference):
# def respond(
#     message: str,
#     history: list[dict],  # [{"role": "user"/"assistant", "content": ...}, ...]
#     system_message: str,
#     max_tokens: int,
#     temperature: float,
#     top_p: float,
# ):
#     # 1️⃣ Build one raw-text prompt from system + chat history + new user turn
#     prompt = system_message.strip() + "\n"
#     for msg in history:
#         role = msg["role"]
#         content = msg["content"]
#         if role == "user":
#             prompt += f"User: {content}\n"
#         elif role == "assistant":
#             prompt += f"Assistant: {content}\n"
#     prompt += f"User: {message}\nAssistant:"
#     # 2️⃣ Stream tokens from the text-generation endpoint
#     generated = ""
#     for chunk in client.text_generation(
#         prompt,  # first positional arg
#         max_new_tokens=max_tokens,
#         temperature=temperature,
#         top_p=top_p,
#         stream=True,
#     ):
#         generated += chunk.generated_text
#         yield generated


@spaces.GPU  # request GPU time for this call when running on a ZeroGPU Space
def respond(
    message: str,
    history: list[dict],
    system_message: str,
    max_tokens: int,
    temperature: float,
    top_p: float,
):
    # assemble a single prompt from the system message plus the chat history
    prompt = system_message.strip() + "\n"
    for msg in history:
        if msg["role"] == "user":
            prompt += f"User: {msg['content']}\n"
        elif msg["role"] == "assistant":
            prompt += f"Assistant: {msg['content']}\n"
    prompt += f"User: {message}\nAssistant:"

    # stream tokens back as they are generated
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # run generate in a background thread so tokens can be yielded as they arrive
    generation_kwargs = dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=max_tokens,
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
    )
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    output = ""
    for tok in streamer:
        output += tok
        yield output
    thread.join()
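

# The prompt above is a plain "User:/Assistant:" transcript. If the tokenizer
# ships a chat template (many instruct-tuned checkpoints do, and assuming the
# template accepts a "system" role), formatting the conversation through it
# usually matches the model's training format better. Hypothetical, unused
# helper shown only as a sketch:
def build_chat_prompt(message: str, history: list[dict], system_message: str) -> str:
    messages = [{"role": "system", "content": system_message}] + list(history)
    messages.append({"role": "user", "content": message})
    # add_generation_prompt=True appends the assistant header so the model
    # continues with its reply
    return tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )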


demo = gr.ChatInterface(
    fn=respond,
    type="messages",
    title="DeepCoder with Suger",
    description="Chat with rieon/DeepCoder-14B-Preview-Suger and ask it coding questions!",
    additional_inputs=[
        gr.Textbox(value="You are a helpful coding assistant.", label="System message"),
        gr.Slider(1, 2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(0.1, 4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(0.1, 1.0, value=0.95, step=0.05, label="Top-p"),
    ],
)
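
# Gradio streams generator output through its queue; recent versions enable the
# queue by default, but it can also be turned on explicitly before launching:
# demo.queue()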

if __name__ == "__main__":
    demo.launch()