Spaces:

brendon-ai
/

faq-huggingface-model

Running

App Files Files Community

brendon-ai commited on Jun 20

Commit

5f4cb37

verified ·

1 Parent(s): 764800d

Update app.py

Browse files

Files changed (1) hide show

app.py +73 -54

app.py CHANGED Viewed

@@ -1,77 +1,96 @@
 import gradio as gr
-from transformers import AutoModelForCausalLM, AutoTokenizer
 import torch
-MODEL_NAME = "HuggingFaceTB/SmolVLM-Instruct" #"TinyLlama/TinyLlama-1.1B-Chat-v1.0"
-# Load tokenizer and model
-print("Loading tokenizer...")
-tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
-print("Tokenizer loaded.")
-if tokenizer.pad_token is None:
-    tokenizer.pad_token = tokenizer.eos_token
-print("Loading model...")
-model = AutoModelForCausalLM.from_pretrained(
-    MODEL_NAME,
-    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32
-)
-print("Model loaded.")
-if torch.cuda.is_available():
-    print("Moving model to GPU...")
-    model.to("cuda")
-model.eval()
-print("Model ready.")
-def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50):
     if not prompt:
-        return "Please enter a prompt."
-    messages = [{"role": "user", "content": prompt}]
-    encoded = tokenizer.apply_chat_template(
-        messages,
-        add_generation_prompt=True,
-        return_tensors="pt",
-        padding=True,
-        return_attention_mask=True,
-    )
-    input_ids = encoded["input_ids"]
-    attention_mask = encoded["attention_mask"]
-    if torch.cuda.is_available():
-        input_ids = input_ids.to("cuda")
-        attention_mask = attention_mask.to("cuda")
-    output_ids = model.generate(
-        input_ids,
-        attention_mask=attention_mask,
-        max_new_tokens=max_new_tokens,
-        do_sample=True,
-        temperature=temperature,
-        top_k=top_k,
-        pad_token_id=tokenizer.eos_token_id
-    )
-    response = tokenizer.decode(output_ids[0][input_ids.shape[-1]:], skip_special_tokens=True)
-    return response
-# Gradio interface
 demo = gr.Interface(
-    fn=generate_text,
     inputs=[
-        gr.Textbox(label="Prompt"),
         gr.Slider(minimum=10, maximum=500, value=100, label="Max New Tokens"),
         gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.05, label="Temperature"),
         gr.Slider(minimum=0, maximum=100, value=50, step=1, label="Top K")
     ],
-    outputs=gr.Textbox(label="Generated Text"),
-    title="gemma-3-1b-it Gradio API",
-    description="Use this via UI or API via `/run/predict`"
 )
 if __name__ == "__main__":
     demo.launch(server_name="0.0.0.0", server_port=7860)

 import gradio as gr
 import torch
+from llama_cpp import Llama # New import for GGUF models
+# Define the model name for the GGUF model.
+# IMPORTANT: This assumes you will upload the 'Magistral-Small-2506_gguf' file
+# directly into the root directory of your Hugging Face Space.
+# You will need to download this file from its Hugging Face repository (e.g., from the "Files and versions" tab)
+# and upload it to your Space, naming it exactly as it appears here.
+GGUF_MODEL_FILE = "Magistral-Small-2506_gguf" # Adjust if your uploaded file name is different
+# Global variable for the Llama model instance
+llm = None
+# Function to load the Llama GGUF model
+def load_model():
+    global llm
+    if llm is None:
+        print(f"Loading GGUF model: {GGUF_MODEL_FILE}...")
+        try:
+            # Initialize the Llama model.
+            # `model_path` must point to the local file path of your GGUF model.
+            # `n_gpu_layers` can be set to a positive integer to offload layers to GPU if available.
+            # Set to 0 for CPU-only inference (recommended for simplicity on free Spaces tiers).
+            # `n_ctx` defines the context window size. Adjust as needed for your use case.
+            llm = Llama(model_path=GGUF_MODEL_FILE, n_gpu_layers=0, n_ctx=2048)
+            print("GGUF Model loaded successfully.")
+        except Exception as e:
+            print(f"Error loading GGUF model: {e}")
+            raise RuntimeError(f"Failed to load GGUF model: {e}. Please ensure '{GGUF_MODEL_FILE}' is correctly uploaded and accessible.")
+# Call this function once at the start of the script to load the model.
+load_model()
+# This is the core function that will be exposed as an API endpoint.
+# It takes a prompt and generation parameters, and returns generated text.
+def generate_text(prompt: str, max_new_tokens: int = 100, temperature: float = 0.7, top_k: int = 50) -> str:
+    # Basic input validation for the prompt.
     if not prompt:
+        return "Please enter a prompt to generate text!"
+    if llm is None:
+        return "Model not loaded. Please check Space logs for errors."
+    try:
+        # Generate text using the Llama model's create_completion method.
+        # `prompt` is the input text.
+        # `max_tokens` controls the length of the generated output.
+        # `temperature` controls randomness (higher = more creative).
+        # `top_k` filters the sampling pool.
+        # `stop` can be used to define tokens where the generation should stop (e.g., ["\nUser:"]).
+        # `echo=False` ensures the prompt is not repeated in the output.
+        output = llm.create_completion(
+            prompt=prompt,
+            max_tokens=max_new_tokens,
+            temperature=temperature,
+            top_k=top_k,
+            stop=["\nUser:", "##"], # Example stop sequences
+            echo=False
+        )
+        # The generated text is typically found in the 'choices' list of the output dictionary.
+        generated_text = output['choices'][0]['text']
+        return generated_text
+    except Exception as e:
+        # Log any errors that occur during text generation for debugging.
+        print(f"Error during text generation: {e}")
+        # Return an informative error message to the user/caller.
+        return f"An error occurred: {e}. Please try again with a different prompt or check the Space logs."
+# Create the Gradio interface.
+# This interface will automatically generate a web UI and an API endpoint.
 demo = gr.Interface(
+    fn=generate_text, # The Python function to expose.
     inputs=[
+        # Input component for the prompt.
+        gr.Textbox(label="Enter your prompt here", lines=3),
+        # Slider for maximum number of new tokens to generate.
         gr.Slider(minimum=10, maximum=500, value=100, label="Max New Tokens"),
+        # Slider for generation temperature (randomness).
         gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.05, label="Temperature"),
+        # Slider for Top-K sampling (diversity).
         gr.Slider(minimum=0, maximum=100, value=50, step=1, label="Top K")
     ],
+    outputs=gr.Textbox(label="Generated Text", lines=5), # Output component for generated text.
+    title="Magistral-Small-2506_gguf Text Generation API on Hugging Face Space",
+    description="Enter a prompt and Magistral-Small-2506_gguf will generate a response. Adjust parameters for different results. This function is also exposed as an API endpoint.",
+    allow_flagging="never" # Disables Gradio's data flagging feature.
 )
+# Launch the Gradio application.
+# `server_name="0.0.0.0"` is essential for Hugging Face Spaces to expose the app publicly.
+# `server_port=7860` is the default port used by Hugging Face Spaces.
 if __name__ == "__main__":
     demo.launch(server_name="0.0.0.0", server_port=7860)