Spaces:

brendon-ai
/

faq-huggingface-model

Running

App Files Files Community

brendon-ai committed on Jun 24

Commit

dcbac7e

verified ·

1 Parent(s): 5ccc276

Update app.py

Browse files

Files changed (1) hide show

app.py +19 -88

app.py CHANGED Viewed

@@ -1,91 +1,22 @@
-# app.py
-from fastapi import FastAPI, HTTPException
-from pydantic import BaseModel
 import torch
 from transformers import pipeline
-import os
-from contextlib import asynccontextmanager # Import this!
-# --- Global variable for the model ---
-# It's important to declare this globally so it can be accessed within
-# the lifespan function (which assigns it) and the API endpoint functions (which read it).
-generator = None # Stays None until the lifespan handler stores the loaded pipeline here
-# --- Lifespan Event Handler ---
-@asynccontextmanager
-async def lifespan(app: FastAPI):
-    """
-    Handles startup and shutdown events for the FastAPI application.
-    Loads the model on startup and can optionally clean up on shutdown.
-    """
-    global generator # Declare intent to modify the global 'generator' variable
-    try:
-        # --- Startup Code: Load the model ---
-        # This code runs BEFORE the application starts receiving requests.
-        if torch.cuda.is_available():
-            print(f"CUDA is available! Using {torch.cuda.get_device_name(0)}")
-            device = 0 # Use GPU
-        else:
-            print("CUDA not available, using CPU.")
-            device = -1 # Use CPU
-        print(f"Loading model 'distilgpt2' on device: {'cuda' if device == 0 else 'cpu'}")
-        generator = pipeline('text-generation', model='distilgpt2', device=device)
-        print("Model loaded successfully!")
-        # 'yield' signifies that the startup code has completed, and the application
-        # can now start processing requests.
-        yield
-    except Exception as e:
-        print(f"Error loading model during startup: {e}")
-        # In a real application, you might want to exit here if the model is crucial
-        # sys.exit(1) or raise an exception to prevent the app from starting unhealthy.
-    finally:
-        # --- Shutdown Code (Optional): Clean up resources ---
-        # This code runs AFTER the application has finished handling requests and is shutting down.
-        # For a simple model loaded like this, there might not be explicit cleanup needed.
-        # If you had database connections, external client sessions, etc., you'd close them here.
-        print("Application shutting down. Any cleanup can go here.")
-# --- Initialize FastAPI application with the lifespan handler (model load at startup, notice at shutdown) ---
-app = FastAPI(lifespan=lifespan) # Pass the lifespan function to the FastAPI app
-# --- Define Request Body Schema ---
-class PromptRequest(BaseModel):
-    prompt: str # Required; passed to the generator as the text to continue
-    max_length: int = 50 # Default value, can be overridden by user; forwarded as the generator's max_length
-    num_return_sequences: int = 1 # Default value; forwarded as the generator's num_return_sequences
-# --- Define API Endpoint ---
-@app.post("/generate")
-async def generate_text(request: PromptRequest):
-    """
-    Generates text based on a given prompt using the loaded LLM.
-    """
-    if generator is None:
-        # This indicates a failure during startup, or the app started unhealthy
-        raise HTTPException(status_code=503, detail="Model not loaded. Service unavailable.")
-    try:
-        result = generator(
-            request.prompt,
-            max_length=request.max_length,
-            num_return_sequences=request.num_return_sequences
-        )
-        return {"generated_text": result[0]['generated_text']}
-    except Exception as e:
-        # Log the full exception for debugging in production
-        print(f"Error during text generation: {e}")
-        raise HTTPException(status_code=500, detail=f"Error during text generation: {e}")
-# --- Basic Health Check Endpoint ---
-@app.get("/")
-async def read_root():
-    """
-    A simple health check endpoint to confirm the API is running.
-    """
-    return {"message": "LLM Inference API is running!"} # Fixed reply; does not check whether 'generator' was loaded

 import torch
 from transformers import pipeline
+# Check for GPU
+if torch.cuda.is_available():
+    print(f"CUDA is available! Using {torch.cuda.get_device_name(0)}")
+    device = 0 # Use GPU
+else:
+    print("CUDA not available, using CPU.")
+    device = -1 # Use CPU
+# Load a text generation pipeline
+# For a free tier/small GPU, consider a smaller model like 'distilgpt2' or 'gpt2'
+# For larger GPUs, you can try models like 'meta-llama/Llama-2-7b-hf' (requires auth)
+# or 'mistralai/Mistral-7B-Instruct-v0.2'
+generator = pipeline('text-generation', model='distilgpt2', device=device) # or specify a larger model
+# Generate text
+prompt = "The quick brown fox jumps over the lazy dog because"
+result = generator(prompt, max_length=50, num_return_sequences=1)
+print(result[0]['generated_text'])