Spaces:
Running
Running
Reduce context window (n_ctx) to 1024 and max_tokens to 128 for performance
Browse files
app.py
CHANGED
@@ -75,7 +75,7 @@ def load_llm():
|
|
75 |
llm = Llama(
|
76 |
model_path=MODEL_FILE,
|
77 |
chat_handler=handler,
|
78 |
-
n_ctx=
|
79 |
verbose=False,
|
80 |
)
|
81 |
logging.info("Llama model loaded successfully.")
|
@@ -129,7 +129,7 @@ def caption_frame(frame):
|
|
129 |
logging.debug("Sending chat completion request...")
|
130 |
resp = llm.create_chat_completion(
|
131 |
messages=messages,
|
132 |
-
max_tokens=
|
133 |
temperature=0.1,
|
134 |
stop=["<end_of_utterance>"],
|
135 |
)
|
|
|
75 |
llm = Llama(
|
76 |
model_path=MODEL_FILE,
|
77 |
chat_handler=handler,
|
78 |
+
n_ctx=1024,
|
79 |
verbose=False,
|
80 |
)
|
81 |
logging.info("Llama model loaded successfully.")
|
|
|
129 |
logging.debug("Sending chat completion request...")
|
130 |
resp = llm.create_chat_completion(
|
131 |
messages=messages,
|
132 |
+
max_tokens=128,
|
133 |
temperature=0.1,
|
134 |
stop=["<end_of_utterance>"],
|
135 |
)
|