Bton committed on
Commit
48a7253
·
verified ·
1 Parent(s): 1b10488

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +20 -23
app.py CHANGED
@@ -2,42 +2,39 @@ import os
2
  import gradio as gr
3
  from llama_cpp import Llama
4
  from huggingface_hub import hf_hub_download
5
- import spaces
6
 
7
- # Download model from Hugging Face
8
  REPO_ID = "Bton/llama3-product-reviewer"
9
  FILENAME = "unsloth.Q4_K_M.gguf"
 
 
10
  model_path = hf_hub_download(repo_id=REPO_ID, filename=FILENAME, local_dir=".", local_dir_use_symlinks=False)
11
 
12
- # Load model with GPU offloading (-1 means all layers offloaded)
13
  llm = Llama(
14
  model_path=model_path,
15
- n_gpu_layers=-1, # Use all available GPU memory
16
  n_ctx=2048,
17
  n_threads=os.cpu_count(),
 
18
  use_mlock=False,
19
- chat_format="chatml", # ✅ Use ChatML for LLaMA3 finetune
20
  verbose=False
21
  )
22
 
23
- # Create chat prompt using ChatML format
24
- def build_prompt(title, price, rating, about):
25
- return [
26
  {"role": "system", "content": "Write a helpful and natural-sounding customer review in JSON format with two fields: 'review_title' and 'review_body' for the product below."},
27
  {"role": "user", "content": f"Product Title: {title}\nRating: {rating}\nPrice: {price}\nAbout This Item: {about}"}
28
  ]
29
 
30
- # Accelerated GPU-backed function
31
- @spaces.GPU
32
- def generate_review(title, price, rating, about, temperature, top_p, top_k):
33
- prompt = build_prompt(title, price, rating, about)
34
  response = llm.create_chat_completion(
35
  messages=prompt,
36
  temperature=temperature,
37
  top_p=top_p,
38
  top_k=top_k,
39
- stream=False,
40
  max_tokens=1024,
 
41
  )
42
  return response["choices"][0]["message"]["content"]
43
 
@@ -45,17 +42,17 @@ def generate_review(title, price, rating, about, temperature, top_p, top_k):
45
  iface = gr.Interface(
46
  fn=generate_review,
47
  inputs=[
48
- gr.Textbox(label="Product Title", placeholder="e.g. Ergonomic Chair"),
49
- gr.Textbox(label="Price", placeholder="e.g. $129.99"),
50
- gr.Textbox(label="Rating", placeholder="e.g. 4.6 out of 5 stars"),
51
- gr.Textbox(label="About This Item", lines=4, placeholder="Breathable mesh back, adjustable lumbar support..."),
52
- gr.Slider(label="Temperature", value=0.7, minimum=0.0, maximum=1.5, step=0.05),
53
- gr.Slider(label="Top P", value=1.0, minimum=0.1, maximum=1.0, step=0.01),
54
- gr.Slider(label="Top K", value=40, minimum=0, maximum=1000, step=1)
55
  ],
56
- outputs="text",
57
- title="🛍️ LLaMA3 Product Review Generator (GPU)",
58
- description="Generates natural-sounding customer reviews using your fine-tuned LLaMA3 GGUF model with GPU acceleration. Format: JSON with 'review_title' and 'review_body'."
59
  )
60
 
61
  if __name__ == "__main__":
 
2
  import gradio as gr
3
  from llama_cpp import Llama
4
  from huggingface_hub import hf_hub_download
 
5
 
6
+ # Hugging Face Model Details
7
  REPO_ID = "Bton/llama3-product-reviewer"
8
  FILENAME = "unsloth.Q4_K_M.gguf"
9
+
10
+ # Download model if needed
11
  model_path = hf_hub_download(repo_id=REPO_ID, filename=FILENAME, local_dir=".", local_dir_use_symlinks=False)
12
 
13
+ # Initialize llama-cpp with GPU offloading
14
  llm = Llama(
15
  model_path=model_path,
16
+ chat_format="chatml", # Required for Unsloth
17
  n_ctx=2048,
18
  n_threads=os.cpu_count(),
19
+ n_gpu_layers=-1, # Offload all layers to GPU
20
  use_mlock=False,
 
21
  verbose=False
22
  )
23
 
24
+ # Inference function (non-streaming for ZeroGPU stability)
25
+ def generate_review(title, price, rating, about, temperature, top_p, top_k):
26
+ prompt = [
27
  {"role": "system", "content": "Write a helpful and natural-sounding customer review in JSON format with two fields: 'review_title' and 'review_body' for the product below."},
28
  {"role": "user", "content": f"Product Title: {title}\nRating: {rating}\nPrice: {price}\nAbout This Item: {about}"}
29
  ]
30
 
 
 
 
 
31
  response = llm.create_chat_completion(
32
  messages=prompt,
33
  temperature=temperature,
34
  top_p=top_p,
35
  top_k=top_k,
 
36
  max_tokens=1024,
37
+ stream=False # ⚠️ Streaming causes task abortion on ZeroGPU sometimes
38
  )
39
  return response["choices"][0]["message"]["content"]
40
 
 
42
  iface = gr.Interface(
43
  fn=generate_review,
44
  inputs=[
45
+ gr.Textbox(label="Product Title", placeholder="Ergonomic Mesh Office Chair"),
46
+ gr.Textbox(label="Price", placeholder="$129.99"),
47
+ gr.Textbox(label="Rating", placeholder="4.6 out of 5 stars"),
48
+ gr.Textbox(label="About This Item", lines=4, placeholder="Adjustable lumbar support, breathable mesh..."),
49
+ gr.Slider(label="Temperature", minimum=0.0, maximum=1.5, value=0.7, step=0.1),
50
+ gr.Slider(label="Top-p", minimum=0.0, maximum=1.0, value=1.0, step=0.05),
51
+ gr.Slider(label="Top-k", minimum=0, maximum=100, value=50, step=1)
52
  ],
53
+ outputs=gr.Textbox(label="Generated Review", lines=8),
54
+ title="🛍️ LLaMA 3 Product Review Bot (ZeroGPU Optimized)",
55
+ description="Generate helpful, natural-sounding product reviews using a fine-tuned LLaMA 3 model. Powered by llama.cpp + Hugging Face + GGUF."
56
  )
57
 
58
  if __name__ == "__main__":