import os
from openai import OpenAI
import modal
from dotenv import load_dotenv

load_dotenv()

class Colors:
    """ANSI color codes for terminal output formatting."""
    GREEN = "\033[0;32m"
    RED = "\033[0;31m" 
    BLUE = "\033[0;34m"
    GRAY = "\033[0;90m"
    BOLD = "\033[1m"
    END = "\033[0m"


def ask_ai(
    prompt, 
    system_prompt,
    temperature=0.7,
    max_tokens=None,
    stream=True,
    verbose=False
):
    """
    Send a prompt to the AI model and get a response.
    
    Args:
        prompt (str): The user prompt to send to the AI
        system_prompt (str): The system instructions for the AI
        model (str): The model name to use
        temperature (float): Controls randomness (0.0-1.0)
        max_tokens (int): Maximum tokens in the response
        stream (bool): Whether to stream the response
        verbose (bool): Whether to print status messages
        
    Returns:
        str: The AI's response text
    """
    
    # Create an OpenAI client pointed at the Modal-deployed endpoint
    api_key = os.getenv("Modal_API_KEY")
    client = OpenAI(
        api_key=api_key,
        base_url="https://abhinav77642--llama-3-1-8b-instruct-serve.modal.run/v1",
    )
    
    # Set up the messages for the conversation
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt}
    ]
    
    # Set up the completion parameters
    completion_args = {
        "model": "neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w4a16",
        "messages": messages,
        "temperature": temperature,
        "max_tokens": max_tokens,
        "stream": stream
    }
    
    # Remove None values
    completion_args = {k: v for k, v in completion_args.items() if v is not None}
    
    try:
        if verbose:
            print(f"{Colors.GRAY}Sending request to the Modal endpoint...{Colors.END}")
        response = client.chat.completions.create(**completion_args)
        
        # Handle the response based on streaming or non-streaming mode
        if stream:
            result = ""
            for chunk in response:
                if chunk.choices and chunk.choices[0].delta.content:
                    content = chunk.choices[0].delta.content
                    result += content
            
            return result
        else:
            result = response.choices[0].message.content
            return result
            
    except Exception as e:
        return f"Error during API call: {e}"