import os

import modal
from dotenv import load_dotenv
from openai import OpenAI

load_dotenv()


class Colors:
    """ANSI color codes for terminal output formatting."""

    GREEN = "\033[0;32m"
    RED = "\033[0;31m"
    BLUE = "\033[0;34m"
    GRAY = "\033[0;90m"
    BOLD = "\033[1m"
    END = "\033[0m"


def ask_ai(
    prompt,
    system_prompt,
    model="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w4a16",
    temperature=0.7,
    max_tokens=None,
    stream=True,
    verbose=False,
):
    """
    Send a prompt to the AI model and get a response.

    Args:
        prompt (str): The user prompt to send to the AI
        system_prompt (str): The system instructions for the AI
        model (str): The model name to use
        temperature (float): Controls randomness (0.0-1.0)
        max_tokens (int): Maximum tokens in the response
        stream (bool): Whether to stream the response
        verbose (bool): Whether to print status messages

    Returns:
        str: The AI's response text
    """
    # Create an OpenAI client pointed at the Modal-deployed endpoint
    api_key = os.getenv("Modal_API_KEY")
    client = OpenAI(
        api_key=api_key,
        base_url="https://abhinav77642--llama-3-1-8b-instruct-serve.modal.run/v1",
    )

    # Build the conversation: system instructions followed by the user prompt
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt},
    ]

    # Assemble the completion parameters, dropping any that are unset
    completion_args = {
        "model": model,
        "messages": messages,
        "temperature": temperature,
        "max_tokens": max_tokens,
        "stream": stream,
    }
    completion_args = {k: v for k, v in completion_args.items() if v is not None}

    if verbose:
        print(f"{Colors.GRAY}Sending request to {model}...{Colors.END}")

    try:
        response = client.chat.completions.create(**completion_args)

        # Streaming mode: accumulate the content deltas into a single string
        if stream:
            result = ""
            for chunk in response:
                if chunk.choices and chunk.choices[0].delta.content:
                    result += chunk.choices[0].delta.content
            return result

        # Non-streaming mode: the full message arrives in one response
        return response.choices[0].message.content
    except Exception as e:
        return f"Error during API call: {e}"
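

# Example usage (a minimal sketch, not part of the original module): assumes a
# .env file providing Modal_API_KEY and that the Modal endpoint above is
# deployed and reachable. The prompt and system prompt are illustrative only.
if __name__ == "__main__":
    answer = ask_ai(
        prompt="Explain what a context manager does in Python.",
        system_prompt="You are a concise and helpful programming assistant.",
        max_tokens=256,
        verbose=True,
    )
    print(f"{Colors.BOLD}{Colors.BLUE}Response:{Colors.END}\n{answer}")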