rieon committed
Commit 8b2a8d4 · 1 Parent(s): d0dc3ee
Files changed (1)
  1. app.py +38 -12
app.py CHANGED
@@ -1,11 +1,22 @@
  import gradio as gr
  from huggingface_hub import InferenceClient
+ from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
+ import torch

  # ←–– set this to the exact name of your HF repo
  HF_MODEL_ID = "rieon/DeepCoder-14B-Preview-Suger"

  # explicitly tell the client you want text-generation
- client = InferenceClient(model=HF_MODEL_ID)
+ # client = InferenceClient(model=HF_MODEL_ID)
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+
+ tokenizer = AutoTokenizer.from_pretrained(HF_MODEL_ID, use_fast=True)
+ model = AutoModelForCausalLM.from_pretrained(
+     HF_MODEL_ID,
+     device_map="auto",  # spreads across all available GPUs
+     torch_dtype=torch.float16
+ )
+ model.eval()

  # def respond(
  #     message: str,
@@ -53,17 +64,32 @@ def respond(
      prompt += f"User: {message}\nAssistant:"

      # stream back tokens
-     generated = ""
-     for chunk in client.text_generation(
-         prompt,
-         max_new_tokens=max_tokens,
-         temperature=temperature,
-         top_p=top_p,
-         stream=True,
-     ):
-         # the API returns a small JSON with .generated_text
-         generated += chunk.generated_text
-         yield generated
+     # generated = ""
+     # for chunk in client.text_generation(
+     #     prompt,
+     #     max_new_tokens=max_tokens,
+     #     temperature=temperature,
+     #     top_p=top_p,
+     #     stream=True,
+     # ):
+     #     # the API returns a small JSON with .generated_text
+     #     generated += chunk.generated_text
+     #     yield generated
+     streamer = TextIteratorStreamer(tokenizer,
+                                     skip_prompt=True,
+                                     skip_special_tokens=True)
+     inputs = tokenizer(prompt, return_tensors="pt").to(device)
+     model.generate(**inputs,
+                    streamer=streamer,
+                    max_new_tokens=max_tokens,
+                    temperature=temperature,
+                    top_p=top_p)
+
+     output = ""
+     for tok in streamer:
+         output += tok
+         yield output
+

  demo = gr.ChatInterface(
      fn=respond,
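
A note on the new streaming path (not part of the commit itself): `TextIteratorStreamer` is designed to be drained while `generate()` runs in a separate thread. Called synchronously as in the hunk above, `model.generate()` only returns after the full completion is produced, so the `for tok in streamer` loop, and therefore the Gradio chat window, receives the whole reply in one burst at the end rather than token by token. Below is a minimal sketch of the threaded variant, reusing the `tokenizer`, `model`, `device` globals and the `TextIteratorStreamer` import from the commit, and assuming `respond()` still receives `max_tokens`, `temperature`, and `top_p` as in the removed client code; `stream_generate` is a hypothetical helper name.

```python
from threading import Thread

def stream_generate(prompt, max_tokens, temperature, top_p):
    """Hypothetical helper: yield the growing completion as tokens arrive."""
    streamer = TextIteratorStreamer(tokenizer,
                                    skip_prompt=True,
                                    skip_special_tokens=True)
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    # Run generation in a worker thread so the streamer can be read concurrently.
    generation_kwargs = dict(inputs,
                             streamer=streamer,
                             max_new_tokens=max_tokens,
                             temperature=temperature,
                             top_p=top_p)
    worker = Thread(target=model.generate, kwargs=generation_kwargs, daemon=True)
    worker.start()

    output = ""
    for chunk in streamer:   # blocks until the next decoded chunk is ready
        output += chunk
        yield output         # ChatInterface redraws the partial reply each time

    worker.join()
```

Inside `respond()`, `yield from stream_generate(prompt, max_tokens, temperature, top_p)` would replace the synchronous `model.generate(...)` call and the trailing loop. Depending on the model's `generation_config`, `do_sample=True` may also need to be passed for `temperature` and `top_p` to have any effect.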