# agent.py - Enhanced LLaMA model wrapper and LangChain agent support

import os
from typing import Optional, Dict, List

# llama_cpp is a hard dependency; fail with a clear message if the package is
# missing or its native library cannot load.
try:
    from llama_cpp import Llama
    llama_cpp_available = True
except Exception as e:
    print("❌ Failed to load llama_cpp:", e)
    raise

from app.chat_memory import PersistentMemory as Memory
from app.embeddings import DocStore
from app.tools import get_tools
from app.langchain_agent import create_langchain_agent
from app.model_utils import download_model_if_missing, list_available_models

# ===============================
# Configuration & Utilities
# ===============================

MODEL_DIR = os.getenv("MODEL_DIR", "models")
# Resolution order: an explicit MODEL_PATH env var, then a model fetched by
# download_model_if_missing(), then a known filename inside MODEL_DIR.
DEFAULT_MODEL_PATH = os.getenv("MODEL_PATH") or (
    download_model_if_missing()
    or os.path.join(MODEL_DIR, "capybarahermes-2.5-mistral-7b.Q5_K_S.gguf")
)

def list_models() -> List[str]:
    """List available .gguf models in the model directory."""
    if not os.path.exists(MODEL_DIR):
        return []
    return [f for f in os.listdir(MODEL_DIR) if f.endswith(".gguf")]

def set_model_path(name: str) -> str:
    """Build and verify full path to a model file."""
    path = os.path.join(MODEL_DIR, name)
    if not os.path.exists(path):
        raise FileNotFoundError(f"⚠️ Model not found: {path}")
    return path

# ===============================
# Core Local LLaMA Wrapper Class
# ===============================

class LocalLLMAgent:
    def __init__(self, model_path: str = DEFAULT_MODEL_PATH, docstore: Optional[DocStore] = None):
        self.model_path = model_path
        self.llm = self._load_llm()
        self.mem = Memory()
        self.docs = docstore

    def _load_llm(self) -> Llama:
        """Initialize and return LLaMA model."""
        return Llama(
            model_path=self.model_path,
            n_ctx=2048,
            n_threads=8,
            n_gpu_layers=40,
            verbose=False
        )

    def chat(self, prompt: str) -> str:
        """Chat with context-aware memory."""
        ctx = self.mem.get_last()
        full_prompt = f"{ctx}\nUser: {prompt}\nAI:"
        response = self.llm(full_prompt, max_tokens=256, stop=["User:", "\n"])
        answer = response["choices"][0]["text"].strip()
        self.mem.add(prompt, answer)
        return answer

    def ask(self, question: str) -> str:
        """Simple Q&A without memory."""
        response = self.llm(f"Q: {question}\nA:", max_tokens=256, stop=["Q:", "\n"])
        return response["choices"][0]["text"].strip()

    def ask_doc(self, question: str) -> Dict[str, str]:
        """Ask a question against the document store."""
        if not self.docs:
            raise ValueError("❌ Document store not initialized.")
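        # Pull the most relevant chunk and its source metadata from the document store.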
        meta, chunk = self.docs.retrieve(question)
        context = f"Relevant content:\n{chunk}\nQuestion: {question}\nAnswer:"
        response = self.llm(context, max_tokens=256, stop=["Question:", "\n"])
        return {
            "source": meta,
            "answer": response["choices"][0]["text"].strip()
        }

    def reset_memory(self):
        """Clear memory context."""
        self.mem.clear()

    def switch_model(self, model_name: str):
        """Dynamically switch the model being used."""
        self.model_path = set_model_path(model_name)
        self.llm = self._load_llm()
        print(f"βœ… Model switched to {model_name}")

# ===============================
# Lightweight One-Shot Chat
# ===============================

# Loaded lazily so importing this module does not immediately allocate a second
# copy of the model alongside the one LocalLLMAgent creates.
_basic_llm: Optional[Llama] = None

def _get_basic_llm() -> Llama:
    """Create the shared one-shot model on first use and reuse it afterwards."""
    global _basic_llm
    if _basic_llm is None:
        _basic_llm = Llama(
            model_path=DEFAULT_MODEL_PATH,
            n_ctx=2048,
            n_threads=8,
            n_gpu_layers=40,
            verbose=False,
        )
    return _basic_llm

def local_llm_chat(prompt: str) -> str:
    """Simple one-shot LLaMA call without memory."""
    response = _get_basic_llm()(f"[INST] {prompt} [/INST]", stop=["</s>"], max_tokens=1024)
    return response["choices"][0]["text"].strip()

# ===============================
# LangChain Tool Agent Interface
# ===============================

def run_agent(message: str) -> str:
    """Execute LangChain agent with tools and memory."""
    tools = get_tools()
    memory = Memory()
    agent_executor = create_langchain_agent(tools, memory)
    return agent_executor.run(message)

# ===============================
# Optional Debug/Test Mode
# ===============================

if __name__ == "__main__":
    print("πŸ“‚ Available Models:", list_models())
    agent = LocalLLMAgent()
    print("πŸ€–", agent.chat("Hello! Who are you?"))
    print("🧠", agent.ask("What is the capital of France?"))