import gradio as gr
import torch
import spaces
from threading import Thread
from huggingface_hub import InferenceClient
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

# set this to the exact name of your HF repo
HF_MODEL_ID = "rieon/DeepCoder-14B-Preview-Suger"

# Inference API client (only needed for the commented-out remote-generation path below)
# client = InferenceClient(model=HF_MODEL_ID)

device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained(HF_MODEL_ID, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(
    HF_MODEL_ID,
    device_map="auto",            # spreads the weights across all available GPUs
    torch_dtype=torch.float16,
)
model.eval()


# Alternative: stream completions from the hosted text-generation endpoint
# instead of running the model locally.
# def respond(
#     message: str,
#     history: list[dict],  # [{"role": "user"/"assistant", "content": …}, …]
#     system_message: str,
#     max_tokens: int,
#     temperature: float,
#     top_p: float,
# ):
#     # 1️⃣ Build one raw-text prompt from system + chat history + new user turn
#     prompt = system_message.strip() + "\n"
#     for msg in history:
#         role = msg["role"]
#         content = msg["content"]
#         if role == "user":
#             prompt += f"User: {content}\n"
#         elif role == "assistant":
#             prompt += f"Assistant: {content}\n"
#     prompt += f"User: {message}\nAssistant:"
#
#     # 2️⃣ Stream tokens from the text-generation endpoint
#     generated = ""
#     for chunk in client.text_generation(
#         prompt,                       # first positional arg
#         max_new_tokens=max_tokens,
#         temperature=temperature,
#         top_p=top_p,
#         stream=True,
#     ):
#         generated += chunk.generated_text
#         yield generated


@spaces.GPU
def respond(
    message: str,
    history: list[dict],  # [{"role": "user"/"assistant", "content": …}, …]
    system_message: str,
    max_tokens: int,
    temperature: float,
    top_p: float,
):
    # assemble a single prompt from the system message, the chat history, and the new user turn
    prompt = system_message.strip() + "\n"
    for msg in history:
        if msg["role"] == "user":
            prompt += f"User: {msg['content']}\n"
        elif msg["role"] == "assistant":
            prompt += f"Assistant: {msg['content']}\n"
    prompt += f"User: {message}\nAssistant:"

    # stream tokens back as they are generated
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    generation_kwargs = dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=max_tokens,
        do_sample=True,               # required for temperature / top_p to take effect
        temperature=temperature,
        top_p=top_p,
    )

    # run generation in a background thread so the streamer can yield tokens as they arrive
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    output = ""
    for tok in streamer:
        output += tok
        yield output
    thread.join()


demo = gr.ChatInterface(
    fn=respond,
    type="messages",
    title="DeepCoder with Suger",
    description="Ask coding questions and get streamed answers from DeepCoder-14B-Preview-Suger.",
    additional_inputs=[
        gr.Textbox(value="You are a helpful coding assistant.", label="System message"),
        gr.Slider(1, 2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(0.1, 4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(0.1, 1.0, value=0.95, step=0.05, label="Top-p"),
    ],
)

if __name__ == "__main__":
    demo.launch()
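
# A minimal local smoke test (a sketch, not part of the Space runtime): it assumes
# the checkpoint fits on the local GPU/CPU and simply drives the respond() generator
# directly, printing each partial completion as it streams. The prompt and parameter
# values below are illustrative only. Uncomment to try it in place of the Gradio UI.
#
# for partial in respond(
#     "Write a Python function that reverses a string.",
#     history=[],
#     system_message="You are a helpful coding assistant.",
#     max_tokens=128,
#     temperature=0.7,
#     top_p=0.95,
# ):
#     print(partial, end="\r", flush=True)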