Update app.py
app.py
CHANGED
@@ -1,77 +1,96 @@
 import gradio as gr
-from transformers import AutoModelForCausalLM, AutoTokenizer
 import torch
+from llama_cpp import Llama  # New import for GGUF models
 
-tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
-print("Tokenizer loaded.")
-
-model = AutoModelForCausalLM.from_pretrained(
-    MODEL_NAME,
-    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32
-)
-print("Model loaded.")
-
-if torch.cuda.is_available():
-    print("Moving model to GPU...")
-    model.to("cuda")
-model.eval()
-
-def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50):
+# Define the model name for the GGUF model.
+# IMPORTANT: This assumes you will upload the 'Magistral-Small-2506_gguf' file
+# directly into the root directory of your Hugging Face Space.
+# You will need to download this file from its Hugging Face repository (e.g., from the "Files and versions" tab)
+# and upload it to your Space, naming it exactly as it appears here.
+GGUF_MODEL_FILE = "Magistral-Small-2506_gguf"  # Adjust if your uploaded file name is different
+
+# Global variable for the Llama model instance
+llm = None
+
+# Function to load the Llama GGUF model
+def load_model():
+    global llm
+    if llm is None:
+        print(f"Loading GGUF model: {GGUF_MODEL_FILE}...")
+        try:
+            # Initialize the Llama model.
+            # `model_path` must point to the local file path of your GGUF model.
+            # `n_gpu_layers` can be set to a positive integer to offload layers to GPU if available.
+            # Set to 0 for CPU-only inference (recommended for simplicity on free Spaces tiers).
+            # `n_ctx` defines the context window size. Adjust as needed for your use case.
+            llm = Llama(model_path=GGUF_MODEL_FILE, n_gpu_layers=0, n_ctx=2048)
+            print("GGUF Model loaded successfully.")
+        except Exception as e:
+            print(f"Error loading GGUF model: {e}")
+            raise RuntimeError(f"Failed to load GGUF model: {e}. Please ensure '{GGUF_MODEL_FILE}' is correctly uploaded and accessible.")
+
+# Call this function once at the start of the script to load the model.
+load_model()
+
+# This is the core function that will be exposed as an API endpoint.
+# It takes a prompt and generation parameters, and returns generated text.
+def generate_text(prompt: str, max_new_tokens: int = 100, temperature: float = 0.7, top_k: int = 50) -> str:
+    # Basic input validation for the prompt.
     if not prompt:
-        return "Please enter a prompt
-
-    messages = [{"role": "user", "content": prompt}]
-    encoded = tokenizer.apply_chat_template(
-        messages,
-        add_generation_prompt=True,
-        return_tensors="pt",
-        padding=True,
-        return_attention_mask=True,
-    )
-
-    output = model.generate(
-        max_new_tokens=max_new_tokens,
-        do_sample=True,
-        temperature=temperature,
-        top_k=top_k,
-        pad_token_id=tokenizer.eos_token_id
-    )
-
-# Gradio interface
+        return "Please enter a prompt to generate text!"
+
+    if llm is None:
+        return "Model not loaded. Please check Space logs for errors."
+
+    try:
+        # Generate text using the Llama model's create_completion method.
+        # `prompt` is the input text.
+        # `max_tokens` controls the length of the generated output.
+        # `temperature` controls randomness (higher = more creative).
+        # `top_k` filters the sampling pool.
+        # `stop` can be used to define tokens where the generation should stop (e.g., ["\nUser:"]).
+        # `echo=False` ensures the prompt is not repeated in the output.
+        output = llm.create_completion(
+            prompt=prompt,
+            max_tokens=max_new_tokens,
+            temperature=temperature,
+            top_k=top_k,
+            stop=["\nUser:", "##"],  # Example stop sequences
+            echo=False
+        )
+
+        # The generated text is typically found in the 'choices' list of the output dictionary.
+        generated_text = output['choices'][0]['text']
+        return generated_text
+
+    except Exception as e:
+        # Log any errors that occur during text generation for debugging.
+        print(f"Error during text generation: {e}")
+        # Return an informative error message to the user/caller.
+        return f"An error occurred: {e}. Please try again with a different prompt or check the Space logs."
+
+# Create the Gradio interface.
+# This interface will automatically generate a web UI and an API endpoint.
 demo = gr.Interface(
-    fn=generate_text,
+    fn=generate_text,  # The Python function to expose.
     inputs=[
+        # Input component for the prompt.
+        gr.Textbox(label="Enter your prompt here", lines=3),
+        # Slider for maximum number of new tokens to generate.
         gr.Slider(minimum=10, maximum=500, value=100, label="Max New Tokens"),
+        # Slider for generation temperature (randomness).
         gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.05, label="Temperature"),
+        # Slider for Top-K sampling (diversity).
         gr.Slider(minimum=0, maximum=100, value=50, step=1, label="Top K")
     ],
-    outputs=gr.Textbox(label="Generated Text"),
-    title="
-    description="
+    outputs=gr.Textbox(label="Generated Text", lines=5),  # Output component for generated text.
+    title="Magistral-Small-2506_gguf Text Generation API on Hugging Face Space",
+    description="Enter a prompt and Magistral-Small-2506_gguf will generate a response. Adjust parameters for different results. This function is also exposed as an API endpoint.",
+    allow_flagging="never"  # Disables Gradio's data flagging feature.
 )
 
+# Launch the Gradio application.
+# `server_name="0.0.0.0"` is essential for Hugging Face Spaces to expose the app publicly.
+# `server_port=7860` is the default port used by Hugging Face Spaces.
 if __name__ == "__main__":
     demo.launch(server_name="0.0.0.0", server_port=7860)
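
A few deployment notes on this revision. The new code path depends on llama-cpp-python, so it must be listed in the Space's requirements.txt (gradio itself comes from the Space's SDK setting). Note also that `import torch` survives the rewrite even though nothing after the change uses it, so the torch dependency can likely be dropped. Finally, instead of manually uploading a multi-gigabyte GGUF file into the Space, the file could be fetched at startup with `huggingface_hub.hf_hub_download`. The sketch below is only illustrative: the repo_id and filename are placeholders, not taken from this commit.

# Sketch: fetch the GGUF file at startup instead of uploading it by hand.
from huggingface_hub import hf_hub_download

GGUF_MODEL_FILE = hf_hub_download(
    repo_id="mistralai/Magistral-Small-2506_gguf",  # hypothetical repo id
    filename="Magistral-Small-2506_Q4_K_M.gguf",    # hypothetical file name
)
# hf_hub_download returns a local cache path, which can be passed to
# Llama(model_path=GGUF_MODEL_FILE, ...) exactly as app.py already does.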
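
One behavioral difference worth flagging: the removed transformers code wrapped the prompt with `tokenizer.apply_chat_template`, while the new code passes the raw prompt straight to `create_completion`, so an instruction-tuned model may respond worse without its expected chat format. llama-cpp-python also offers `create_chat_completion`, which can apply a chat template carried in the GGUF metadata when one is present. A minimal sketch of that variant, reusing the names from app.py:

# Sketch: chat-formatted generation via llama-cpp-python's chat API.
# Assumes `llm` has already been loaded by load_model() as in app.py.
def generate_chat(prompt: str, max_new_tokens: int = 100,
                  temperature: float = 0.7, top_k: int = 50) -> str:
    result = llm.create_chat_completion(
        messages=[{"role": "user", "content": prompt}],
        max_tokens=max_new_tokens,
        temperature=temperature,
        top_k=top_k,
    )
    # Chat completions return a message object rather than raw text.
    return result["choices"][0]["message"]["content"]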
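
Since the description promises that generate_text is also exposed as an API endpoint, here is a minimal client-side sketch using gradio_client. The Space id is a placeholder; with a single gr.Interface, Gradio exposes the function under the default "/predict" endpoint, and the positional arguments follow the order of the `inputs` list.

# Sketch: call the Space's auto-generated API endpoint from another machine.
from gradio_client import Client

client = Client("your-username/your-space")  # placeholder Space id
result = client.predict(
    "Write a short poem about quantized models.",  # prompt
    100,    # Max New Tokens
    0.7,    # Temperature
    50,     # Top K
    api_name="/predict",
)
print(result)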