brendon-ai committed on
Commit
5f4cb37
·
verified ·
1 Parent(s): 764800d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +73 -54
app.py CHANGED
@@ -1,77 +1,96 @@
1
  import gradio as gr
2
- from transformers import AutoModelForCausalLM, AutoTokenizer
3
  import torch
 
4
 
5
- MODEL_NAME = "HuggingFaceTB/SmolVLM-Instruct" #"TinyLlama/TinyLlama-1.1B-Chat-v1.0"
 
 
 
 
 
6
 
7
- # Load tokenizer and model
8
- print("Loading tokenizer...")
9
- tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
10
- print("Tokenizer loaded.")
11
 
12
- if tokenizer.pad_token is None:
13
- tokenizer.pad_token = tokenizer.eos_token
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
- print("Loading model...")
16
- model = AutoModelForCausalLM.from_pretrained(
17
- MODEL_NAME,
18
- torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32
19
- )
20
- print("Model loaded.")
21
-
22
- if torch.cuda.is_available():
23
- print("Moving model to GPU...")
24
- model.to("cuda")
25
- model.eval()
26
 
27
- print("Model ready.")
28
-
29
- def generate_text(prompt, max_new_tokens=100, temperature=0.7, top_k=50):
 
30
  if not prompt:
31
- return "Please enter a prompt."
32
-
33
- messages = [{"role": "user", "content": prompt}]
34
- encoded = tokenizer.apply_chat_template(
35
- messages,
36
- add_generation_prompt=True,
37
- return_tensors="pt",
38
- padding=True,
39
- return_attention_mask=True,
40
- )
41
 
42
- input_ids = encoded["input_ids"]
43
- attention_mask = encoded["attention_mask"]
44
 
45
- if torch.cuda.is_available():
46
- input_ids = input_ids.to("cuda")
47
- attention_mask = attention_mask.to("cuda")
 
 
 
 
 
 
 
 
 
 
 
 
 
48
 
49
- output_ids = model.generate(
50
- input_ids,
51
- attention_mask=attention_mask,
52
- max_new_tokens=max_new_tokens,
53
- do_sample=True,
54
- temperature=temperature,
55
- top_k=top_k,
56
- pad_token_id=tokenizer.eos_token_id
57
- )
58
 
59
- response = tokenizer.decode(output_ids[0][input_ids.shape[-1]:], skip_special_tokens=True)
60
- return response
 
 
 
61
 
62
- # Gradio interface
 
63
  demo = gr.Interface(
64
- fn=generate_text,
65
  inputs=[
66
- gr.Textbox(label="Prompt"),
 
 
67
  gr.Slider(minimum=10, maximum=500, value=100, label="Max New Tokens"),
 
68
  gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.05, label="Temperature"),
 
69
  gr.Slider(minimum=0, maximum=100, value=50, step=1, label="Top K")
70
  ],
71
- outputs=gr.Textbox(label="Generated Text"),
72
- title="gemma-3-1b-it Gradio API",
73
- description="Use this via UI or API via `/run/predict`"
 
74
  )
75
 
 
 
 
76
  if __name__ == "__main__":
77
  demo.launch(server_name="0.0.0.0", server_port=7860)
 
1
  import gradio as gr
 
2
  import torch
3
+ from llama_cpp import Llama # New import for GGUF models
4
 
5
+ # Path of the local GGUF model file that is passed to Llama(model_path=...).
6
+ # IMPORTANT: This assumes you will upload the 'Magistral-Small-2506_gguf' file
7
+ # directly into the root directory of your Hugging Face Space.
8
+ # You will need to download this file from its Hugging Face repository (e.g., from the "Files and versions" tab)
9
+ # and upload it to your Space, naming it exactly as it appears here.
10
+ GGUF_MODEL_FILE = "Magistral-Small-2506_gguf" # Adjust if your uploaded file name is different
11
 
12
+ # Global variable for the Llama model instance
13
+ llm = None
 
 
14
 
15
+ # Function to load the Llama GGUF model
16
+ def load_model():
17
+ global llm
18
+ if llm is None:
19
+ print(f"Loading GGUF model: {GGUF_MODEL_FILE}...")
20
+ try:
21
+ # Initialize the Llama model.
22
+ # `model_path` must point to the local file path of your GGUF model.
23
+ # `n_gpu_layers` can be set to a positive integer to offload layers to GPU if available.
24
+ # Set to 0 for CPU-only inference (recommended for simplicity on free Spaces tiers).
25
+ # `n_ctx` defines the context window size. Adjust as needed for your use case.
26
+ llm = Llama(model_path=GGUF_MODEL_FILE, n_gpu_layers=0, n_ctx=2048)
27
+ print("GGUF Model loaded successfully.")
28
+ except Exception as e:
29
+ print(f"Error loading GGUF model: {e}")
30
+ raise RuntimeError(f"Failed to load GGUF model: {e}. Please ensure '{GGUF_MODEL_FILE}' is correctly uploaded and accessible.")
31
 
32
+ # Call this function once at the start of the script to load the model.
33
+ load_model()
 
 
 
 
 
 
 
 
 
34
 
35
+ # This is the core function that will be exposed as an API endpoint.
36
+ # It takes a prompt and generation parameters, and returns generated text.
37
+ def generate_text(prompt: str, max_new_tokens: int = 100, temperature: float = 0.7, top_k: int = 50) -> str:
38
+ # Basic input validation for the prompt.
39
  if not prompt:
40
+ return "Please enter a prompt to generate text!"
 
 
 
 
 
 
 
 
 
41
 
42
+ if llm is None:
43
+ return "Model not loaded. Please check Space logs for errors."
44
 
45
+ try:
46
+ # Generate text using the Llama model's create_completion method.
47
+ # `prompt` is the input text.
48
+ # `max_tokens` controls the length of the generated output.
49
+ # `temperature` controls randomness (higher = more creative).
50
+ # `top_k` restricts sampling to the k most likely next tokens.
51
+ # `stop` lists the strings at which generation should stop (e.g., ["\nUser:"]).
52
+ # `echo=False` ensures the prompt is not repeated in the output.
53
+ output = llm.create_completion(
54
+ prompt=prompt,
55
+ max_tokens=max_new_tokens,
56
+ temperature=temperature,
57
+ top_k=top_k,
58
+ stop=["\nUser:", "##"], # Example stop sequences
59
+ echo=False
60
+ )
61
 
62
+ # The generated text is typically found in the 'choices' list of the output dictionary.
63
+ generated_text = output['choices'][0]['text']
64
+ return generated_text
 
 
 
 
 
 
65
 
66
+ except Exception as e:
67
+ # Log any errors that occur during text generation for debugging.
68
+ print(f"Error during text generation: {e}")
69
+ # Return an informative error message to the user/caller.
70
+ return f"An error occurred: {e}. Please try again with a different prompt or check the Space logs."
71
 
72
+ # Create the Gradio interface.
73
+ # This interface will automatically generate a web UI and an API endpoint.
74
  demo = gr.Interface(
75
+ fn=generate_text, # The Python function to expose.
76
  inputs=[
77
+ # Input component for the prompt.
78
+ gr.Textbox(label="Enter your prompt here", lines=3),
79
+ # Slider for maximum number of new tokens to generate.
80
  gr.Slider(minimum=10, maximum=500, value=100, label="Max New Tokens"),
81
+ # Slider for generation temperature (randomness).
82
  gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.05, label="Temperature"),
83
+ # Slider for Top-K sampling (diversity).
84
  gr.Slider(minimum=0, maximum=100, value=50, step=1, label="Top K")
85
  ],
86
+ outputs=gr.Textbox(label="Generated Text", lines=5), # Output component for generated text.
87
+ title="Magistral-Small-2506_gguf Text Generation API on Hugging Face Space",
88
+ description="Enter a prompt and Magistral-Small-2506_gguf will generate a response. Adjust parameters for different results. This function is also exposed as an API endpoint.",
89
+ allow_flagging="never" # Disables Gradio's data flagging feature.
90
  )
91
 
92
+ # Launch the Gradio application.
93
+ # `server_name="0.0.0.0"` binds the server to all network interfaces, which Hugging Face Spaces needs in order to expose the app publicly.
94
+ # `server_port=7860` is the default port used by Hugging Face Spaces.
95
  if __name__ == "__main__":
96
  demo.launch(server_name="0.0.0.0", server_port=7860)