brendon-ai committed
Commit 5811c7f · verified · 1 Parent(s): a9f05b9

Update app.py

Files changed (1)
  1. app.py +21 -95
app.py CHANGED
@@ -1,96 +1,22 @@
-import gradio as gr
 import torch
-from llama_cpp import Llama  # New import for GGUF models
-
-# Define the model name for the GGUF model.
-# IMPORTANT: This assumes you will upload the 'Magistral-Small-2506_gguf' file
-# directly into the root directory of your Hugging Face Space.
-# You will need to download this file from its Hugging Face repository (e.g., from the "Files and versions" tab)
-# and upload it to your Space, naming it exactly as it appears here.
-GGUF_MODEL_FILE = "Magistral-Small-2506_gguf"  # Adjust if your uploaded file name is different
-
-# Global variable for the Llama model instance
-llm = None
-
-# Function to load the Llama GGUF model
-def load_model():
-    global llm
-    if llm is None:
-        print(f"Loading GGUF model: {GGUF_MODEL_FILE}...")
-        try:
-            # Initialize the Llama model.
-            # `model_path` must point to the local file path of your GGUF model.
-            # `n_gpu_layers` can be set to a positive integer to offload layers to GPU if available.
-            # Set to 0 for CPU-only inference (recommended for simplicity on free Spaces tiers).
-            # `n_ctx` defines the context window size. Adjust as needed for your use case.
-            llm = Llama(model_path=GGUF_MODEL_FILE, n_gpu_layers=0, n_ctx=2048)
-            print("GGUF Model loaded successfully.")
-        except Exception as e:
-            print(f"Error loading GGUF model: {e}")
-            raise RuntimeError(f"Failed to load GGUF model: {e}. Please ensure '{GGUF_MODEL_FILE}' is correctly uploaded and accessible.")
-
-# Call this function once at the start of the script to load the model.
-load_model()
-
-# This is the core function that will be exposed as an API endpoint.
-# It takes a prompt and generation parameters, and returns generated text.
-def generate_text(prompt: str, max_new_tokens: int = 100, temperature: float = 0.7, top_k: int = 50) -> str:
-    # Basic input validation for the prompt.
-    if not prompt:
-        return "Please enter a prompt to generate text!"
-
-    if llm is None:
-        return "Model not loaded. Please check Space logs for errors."
-
-    try:
-        # Generate text using the Llama model's create_completion method.
-        # `prompt` is the input text.
-        # `max_tokens` controls the length of the generated output.
-        # `temperature` controls randomness (higher = more creative).
-        # `top_k` filters the sampling pool.
-        # `stop` can be used to define tokens where the generation should stop (e.g., ["\nUser:"]).
-        # `echo=False` ensures the prompt is not repeated in the output.
-        output = llm.create_completion(
-            prompt=prompt,
-            max_tokens=max_new_tokens,
-            temperature=temperature,
-            top_k=top_k,
-            stop=["\nUser:", "##"],  # Example stop sequences
-            echo=False
-        )
-
-        # The generated text is typically found in the 'choices' list of the output dictionary.
-        generated_text = output['choices'][0]['text']
-        return generated_text
-
-    except Exception as e:
-        # Log any errors that occur during text generation for debugging.
-        print(f"Error during text generation: {e}")
-        # Return an informative error message to the user/caller.
-        return f"An error occurred: {e}. Please try again with a different prompt or check the Space logs."
-
-# Create the Gradio interface.
-# This interface will automatically generate a web UI and an API endpoint.
-demo = gr.Interface(
-    fn=generate_text,  # The Python function to expose.
-    inputs=[
-        # Input component for the prompt.
-        gr.Textbox(label="Enter your prompt here", lines=3),
-        # Slider for maximum number of new tokens to generate.
-        gr.Slider(minimum=10, maximum=500, value=100, label="Max New Tokens"),
-        # Slider for generation temperature (randomness).
-        gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.05, label="Temperature"),
-        # Slider for Top-K sampling (diversity).
-        gr.Slider(minimum=0, maximum=100, value=50, step=1, label="Top K")
-    ],
-    outputs=gr.Textbox(label="Generated Text", lines=5),  # Output component for generated text.
-    title="Magistral-Small-2506_gguf Text Generation API on Hugging Face Space",
-    description="Enter a prompt and Magistral-Small-2506_gguf will generate a response. Adjust parameters for different results. This function is also exposed as an API endpoint.",
-    allow_flagging="never"  # Disables Gradio's data flagging feature.
-)
-
-# Launch the Gradio application.
-# `server_name="0.0.0.0"` is essential for Hugging Face Spaces to expose the app publicly.
-# `server_port=7860` is the default port used by Hugging Face Spaces.
-if __name__ == "__main__":
-    demo.launch(server_name="0.0.0.0", server_port=7860)
+from transformers import pipeline
+
+# Check for GPU
+if torch.cuda.is_available():
+    print(f"CUDA is available! Using {torch.cuda.get_device_name(0)}")
+    device = 0  # Use GPU
+else:
+    print("CUDA not available, using CPU.")
+    device = -1  # Use CPU
+
+# Load a text generation pipeline
+# For a free tier/small GPU, consider a smaller model like 'distilgpt2' or 'gpt2'
+# For larger GPUs, you can try models like 'meta-llama/Llama-2-7b-hf' (requires auth)
+# or 'mistralai/Mistral-7B-Instruct-v0.2'
+generator = pipeline('text-generation', model='distilgpt2', device=device)  # or specify a larger model
+
+# Generate text
+prompt = "The quick brown fox jumps over the lazy dog because"
+result = generator(prompt, max_length=50, num_return_sequences=1)
+
+print(result[0]['generated_text'])
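
The generation controls that the removed Gradio sliders exposed (max new tokens, temperature, top-k) are standard generate() keyword arguments, which the transformers text-generation pipeline forwards on each call, so the new script can still set them per request. A minimal sketch, using the same distilgpt2 model as the commit; the parameter values are illustrative and not part of this change:

import torch
from transformers import pipeline

device = 0 if torch.cuda.is_available() else -1  # GPU if available, else CPU
generator = pipeline('text-generation', model='distilgpt2', device=device)

# do_sample=True is required for temperature/top_k to affect the output;
# max_new_tokens counts only generated tokens, unlike max_length, which
# also counts the prompt.
result = generator(
    "The quick brown fox jumps over the lazy dog because",
    max_new_tokens=100,
    do_sample=True,
    temperature=0.7,
    top_k=50,
    num_return_sequences=1,
)
print(result[0]['generated_text'])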