rieon committed
Commit 8b2a8d4 · 1 Parent(s): d0dc3ee
Files changed (1)
  1. app.py +38 -12
app.py CHANGED
@@ -1,11 +1,22 @@
  import gradio as gr
  from huggingface_hub import InferenceClient
+ from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
+ import torch

  # ←–– set this to the exact name of your HF repo
  HF_MODEL_ID = "rieon/DeepCoder-14B-Preview-Suger"

  # explicitly tell the client you want text-generation
- client = InferenceClient(model=HF_MODEL_ID)
+ # client = InferenceClient(model=HF_MODEL_ID)
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+
+ tokenizer = AutoTokenizer.from_pretrained(HF_MODEL_ID, use_fast=True)
+ model = AutoModelForCausalLM.from_pretrained(
+     HF_MODEL_ID,
+     device_map="auto",  # spreads across all available GPUs
+     torch_dtype=torch.float16
+ )
+ model.eval()

  # def respond(
  #     message: str,
@@ -53,17 +64,32 @@ def respond(
      prompt += f"User: {message}\nAssistant:"

      # stream back tokens
-     generated = ""
-     for chunk in client.text_generation(
-         prompt,
-         max_new_tokens=max_tokens,
-         temperature=temperature,
-         top_p=top_p,
-         stream=True,
-     ):
-         # the API returns a small JSON with .generated_text
-         generated += chunk.generated_text
-         yield generated
+     # generated = ""
+     # for chunk in client.text_generation(
+     #     prompt,
+     #     max_new_tokens=max_tokens,
+     #     temperature=temperature,
+     #     top_p=top_p,
+     #     stream=True,
+     # ):
+     #     # the API returns a small JSON with .generated_text
+     #     generated += chunk.generated_text
+     #     yield generated
+     streamer = TextIteratorStreamer(tokenizer,
+                                     skip_prompt=True,
+                                     skip_special_tokens=True)
+     inputs = tokenizer(prompt, return_tensors="pt").to(device)
+     model.generate(**inputs,
+                    streamer=streamer,
+                    max_new_tokens=max_tokens,
+                    temperature=temperature,
+                    top_p=top_p)
+
+     output = ""
+     for tok in streamer:
+         output += tok
+         yield output
+

  demo = gr.ChatInterface(
      fn=respond,
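
A note on the new streaming path (not part of the commit itself): `TextIteratorStreamer` is designed to be drained while `generate()` runs in a separate thread. Called synchronously as in the hunk above, `model.generate()` only returns after the full completion is produced, so the `for tok in streamer` loop, and therefore the Gradio chat window, receives the whole reply in one burst at the end rather than token by token. Below is a minimal sketch of the threaded variant, reusing the `tokenizer`, `model`, `device` globals and the `TextIteratorStreamer` import from the commit, and assuming `respond()` still receives `max_tokens`, `temperature`, and `top_p` as in the removed client code; `stream_generate` is a hypothetical helper name.

```python
from threading import Thread

def stream_generate(prompt, max_tokens, temperature, top_p):
    """Hypothetical helper: yield the growing completion as tokens arrive."""
    streamer = TextIteratorStreamer(tokenizer,
                                    skip_prompt=True,
                                    skip_special_tokens=True)
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    # Run generation in a worker thread so the streamer can be read concurrently.
    generation_kwargs = dict(inputs,
                             streamer=streamer,
                             max_new_tokens=max_tokens,
                             temperature=temperature,
                             top_p=top_p)
    worker = Thread(target=model.generate, kwargs=generation_kwargs, daemon=True)
    worker.start()

    output = ""
    for chunk in streamer:   # blocks until the next decoded chunk is ready
        output += chunk
        yield output         # ChatInterface redraws the partial reply each time

    worker.join()
```

Inside `respond()`, `yield from stream_generate(prompt, max_tokens, temperature, top_p)` would replace the synchronous `model.generate(...)` call and the trailing loop. Depending on the model's `generation_config`, `do_sample=True` may also need to be passed for `temperature` and `top_p` to have any effect.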