Spaces:

Luigi
/

SmolVLM2-on-llama.cpp

Running

Luigi committed on Jun 16

Commit

76a0b57

1 Parent(s): 65b3c3a

reduce ctx and max tokens for performance

Files changed (1) hide show

app.py CHANGED Viewed

@@ -75,7 +75,7 @@ def load_llm():
     llm = Llama(
         model_path=MODEL_FILE,
         chat_handler=handler,
-        n_ctx=8192,
         verbose=False,
     )
     logging.info("Llama model loaded successfully.")
@@ -129,7 +129,7 @@ def caption_frame(frame):
         logging.debug("Sending chat completion request...")
         resp = llm.create_chat_completion(
             messages=messages,
-            max_tokens=256,
             temperature=0.1,
             stop=["<end_of_utterance>"],
         )

     llm = Llama(
         model_path=MODEL_FILE,
         chat_handler=handler,
+        n_ctx=1024,
         verbose=False,
     )
     logging.info("Llama model loaded successfully.")
         logging.debug("Sending chat completion request...")
         resp = llm.create_chat_completion(
             messages=messages,
+            max_tokens=128,
             temperature=0.1,
             stop=["<end_of_utterance>"],
         )