Update app.py
app.py
CHANGED
@@ -1,22 +1,74 @@
+# app.py
+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel
 import torch
 from transformers import pipeline
+import os
 
[old lines 4-22 were removed; their content is not shown in this view]
+app = FastAPI()
+
+# --- Model Loading (Global Scope to load once) ---
+# This part will be executed only once when the FastAPI application starts up.
+# This saves memory and time compared to loading the model on every request.
+
+generator = None  # Initialize generator to None
+
+@app.on_event("startup")
+async def load_model():
+    """
+    Load the model when the FastAPI application starts.
+    """
+    global generator
+    try:
+        # Check for GPU
+        if torch.cuda.is_available():
+            print(f"CUDA is available! Using {torch.cuda.get_device_name(0)}")
+            device = 0  # Use GPU
+        else:
+            print("CUDA not available, using CPU.")
+            device = -1  # Use CPU
+
+        # Load a text generation pipeline
+        # For a free tier/small GPU, consider a smaller model like 'distilgpt2' or 'gpt2'
+        # For larger GPUs, you can try models like 'meta-llama/Llama-2-7b-hf' (requires auth)
+        # or 'mistralai/Mistral-7B-Instruct-v0.2'
+        print(f"Loading model 'distilgpt2' on device: {'cuda' if device == 0 else 'cpu'}")
+        generator = pipeline('text-generation', model='distilgpt2', device=device)
+        print("Model loaded successfully!")
+    except Exception as e:
+        print(f"Error loading model: {e}")
+        # You might want to raise an exception or log this more robustly in production
+        # For a simple app, we'll let it fail and then handle requests later.
+
+# --- Define Request Body Schema ---
+class PromptRequest(BaseModel):
+    prompt: str
+    max_length: int = 50  # Default value, can be overridden by user
+    num_return_sequences: int = 1  # Default value
+
+# --- Define API Endpoint ---
+@app.post("/generate")
+async def generate_text(request: PromptRequest):
+    """
+    Generates text based on a given prompt using the loaded LLM.
+    """
+    if generator is None:
+        raise HTTPException(status_code=503, detail="Model not loaded. Please try again later.")
+
+    try:
+        result = generator(
+            request.prompt,
+            max_length=request.max_length,
+            num_return_sequences=request.num_return_sequences
+        )
+        return {"generated_text": result[0]['generated_text']}
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Error during text generation: {e}")
+
+# --- Basic Health Check Endpoint (Optional but Recommended) ---
+@app.get("/")
+async def read_root():
+    """
+    A simple health check endpoint to confirm the API is running.
+    """
+    return {"message": "LLM Inference API is running!"}
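For quick verification of this change, here is a minimal client sketch. It assumes the app is served locally with uvicorn (e.g. uvicorn app:app --host 0.0.0.0 --port 7860; the port is an assumption, 7860 being the usual Spaces default), that the requests package is installed, and that the file name client.py and the prompt values are illustrative.

# client.py (hypothetical) - smoke-tests the endpoints defined above.
# Assumes the API is reachable at http://localhost:7860; adjust to your deployment.
import requests

BASE_URL = "http://localhost:7860"  # assumption: local uvicorn on port 7860

# Health check: expects {"message": "LLM Inference API is running!"}
print(requests.get(f"{BASE_URL}/", timeout=10).json())

# Generation request: the JSON fields mirror the PromptRequest schema
# (prompt, max_length, num_return_sequences). Note that the pipeline's
# max_length counts tokens including the prompt.
resp = requests.post(
    f"{BASE_URL}/generate",
    json={"prompt": "Once upon a time", "max_length": 50, "num_return_sequences": 1},
    timeout=120,
)
resp.raise_for_status()
print(resp.json()["generated_text"])

Two caveats worth noting: @app.on_event("startup") still works but is deprecated in recent FastAPI releases in favor of lifespan handlers, and the endpoint returns only result[0] even when num_return_sequences is greater than 1.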