Bton committed on
Commit
48a7253
·
verified ·
1 Parent(s): 1b10488

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +20 -23
app.py CHANGED
@@ -2,42 +2,39 @@ import os
2
  import gradio as gr
3
  from llama_cpp import Llama
4
  from huggingface_hub import hf_hub_download
5
- import spaces
6
 
7
- # Download model from Hugging Face
8
  REPO_ID = "Bton/llama3-product-reviewer"
9
  FILENAME = "unsloth.Q4_K_M.gguf"
 
 
10
  model_path = hf_hub_download(repo_id=REPO_ID, filename=FILENAME, local_dir=".", local_dir_use_symlinks=False)
11
 
12
- # Load model with GPU offloading (-1 means all layers offloaded)
13
  llm = Llama(
14
  model_path=model_path,
15
- n_gpu_layers=-1, # Use all available GPU memory
16
  n_ctx=2048,
17
  n_threads=os.cpu_count(),
 
18
  use_mlock=False,
19
- chat_format="chatml", # ✅ Use ChatML for LLaMA3 finetune
20
  verbose=False
21
  )
22
 
23
- # Create chat prompt using ChatML format
24
- def build_prompt(title, price, rating, about):
25
- return [
26
  {"role": "system", "content": "Write a helpful and natural-sounding customer review in JSON format with two fields: 'review_title' and 'review_body' for the product below."},
27
  {"role": "user", "content": f"Product Title: {title}\nRating: {rating}\nPrice: {price}\nAbout This Item: {about}"}
28
  ]
29
 
30
- # Accelerated GPU-backed function
31
- @spaces.GPU
32
- def generate_review(title, price, rating, about, temperature, top_p, top_k):
33
- prompt = build_prompt(title, price, rating, about)
34
  response = llm.create_chat_completion(
35
  messages=prompt,
36
  temperature=temperature,
37
  top_p=top_p,
38
  top_k=top_k,
39
- stream=False,
40
  max_tokens=1024,
 
41
  )
42
  return response["choices"][0]["message"]["content"]
43
 
@@ -45,17 +42,17 @@ def generate_review(title, price, rating, about, temperature, top_p, top_k):
45
  iface = gr.Interface(
46
  fn=generate_review,
47
  inputs=[
48
- gr.Textbox(label="Product Title", placeholder="e.g. Ergonomic Chair"),
49
- gr.Textbox(label="Price", placeholder="e.g. $129.99"),
50
- gr.Textbox(label="Rating", placeholder="e.g. 4.6 out of 5 stars"),
51
- gr.Textbox(label="About This Item", lines=4, placeholder="Breathable mesh back, adjustable lumbar support..."),
52
- gr.Slider(label="Temperature", value=0.7, minimum=0.0, maximum=1.5, step=0.05),
53
- gr.Slider(label="Top P", value=1.0, minimum=0.1, maximum=1.0, step=0.01),
54
- gr.Slider(label="Top K", value=40, minimum=0, maximum=1000, step=1)
55
  ],
56
- outputs="text",
57
- title="🛍️ LLaMA3 Product Review Generator (GPU)",
58
- description="Generates natural-sounding customer reviews using your fine-tuned LLaMA3 GGUF model with GPU acceleration. Format: JSON with 'review_title' and 'review_body'."
59
  )
60
 
61
  if __name__ == "__main__":
 
2
  import gradio as gr
3
  from llama_cpp import Llama
4
  from huggingface_hub import hf_hub_download
 
5
 
6
+ # Hugging Face Model Details
7
  REPO_ID = "Bton/llama3-product-reviewer"
8
  FILENAME = "unsloth.Q4_K_M.gguf"
9
+
10
+ # Download model if needed
11
  model_path = hf_hub_download(repo_id=REPO_ID, filename=FILENAME, local_dir=".", local_dir_use_symlinks=False)
12
 
13
+ # Initialize llama-cpp with GPU offloading
14
  llm = Llama(
15
  model_path=model_path,
16
+ chat_format="chatml", # Required for Unsloth
17
  n_ctx=2048,
18
  n_threads=os.cpu_count(),
19
+ n_gpu_layers=-1, # Offload all layers to GPU
20
  use_mlock=False,
 
21
  verbose=False
22
  )
23
 
24
+ # Inference function (non-streaming for ZeroGPU stability)
25
+ def generate_review(title, price, rating, about, temperature, top_p, top_k):
26
+ prompt = [
27
  {"role": "system", "content": "Write a helpful and natural-sounding customer review in JSON format with two fields: 'review_title' and 'review_body' for the product below."},
28
  {"role": "user", "content": f"Product Title: {title}\nRating: {rating}\nPrice: {price}\nAbout This Item: {about}"}
29
  ]
30
 
 
 
 
 
31
  response = llm.create_chat_completion(
32
  messages=prompt,
33
  temperature=temperature,
34
  top_p=top_p,
35
  top_k=top_k,
 
36
  max_tokens=1024,
37
+ stream=False # ⚠️ Streaming causes task abortion on ZeroGPU sometimes
38
  )
39
  return response["choices"][0]["message"]["content"]
40
 
 
42
  iface = gr.Interface(
43
  fn=generate_review,
44
  inputs=[
45
+ gr.Textbox(label="Product Title", placeholder="Ergonomic Mesh Office Chair"),
46
+ gr.Textbox(label="Price", placeholder="$129.99"),
47
+ gr.Textbox(label="Rating", placeholder="4.6 out of 5 stars"),
48
+ gr.Textbox(label="About This Item", lines=4, placeholder="Adjustable lumbar support, breathable mesh..."),
49
+ gr.Slider(label="Temperature", minimum=0.0, maximum=1.5, value=0.7, step=0.1),
50
+ gr.Slider(label="Top-p", minimum=0.0, maximum=1.0, value=1.0, step=0.05),
51
+ gr.Slider(label="Top-k", minimum=0, maximum=100, value=50, step=1)
52
  ],
53
+ outputs=gr.Textbox(label="Generated Review", lines=8),
54
+ title="🛍️ LLaMA 3 Product Review Bot (ZeroGPU Optimized)",
55
+ description="Generate helpful, natural-sounding product reviews using a fine-tuned LLaMA 3 model. Powered by llama.cpp + Hugging Face + GGUF."
56
  )
57
 
58
  if __name__ == "__main__":