import json
import os
import sys
import time
from datetime import datetime
from unittest.mock import AsyncMock, patch, MagicMock

import pytest
sys.path.insert(
    0, os.path.abspath("../..")
)  # Adds the repo root (two directories up) to the system path so litellm resolves

import litellm
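
# These tests measure the latency overhead that LiteLLM adds on top of a
# completion call. LiteLLM reports its own overhead in
# response._hidden_params["litellm_overhead_time_ms"]; each test compares that
# value against the wall-clock time of the whole request and asserts that the
# overhead stays under 1 second and under 40% of the total request time.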


# NOTE: the parametrization below is a minimal placeholder; "openai/self_hosted"
# is the only model this test references explicitly, so add further model IDs as
# needed. The asyncio marker assumes pytest-asyncio (or an equivalent plugin).
@pytest.mark.asyncio
@pytest.mark.parametrize("model", ["openai/self_hosted"])
async def test_litellm_overhead(model):
    litellm._turn_on_debug()
    start_time = datetime.now()
    if model == "openai/self_hosted":
        response = await litellm.acompletion(
            model=model,
            messages=[{"role": "user", "content": "Hello, world!"}],
            api_base="https://exampleopenaiendpoint-production.up.railway.app/",
        )
    else:
        response = await litellm.acompletion(
            model=model,
            messages=[{"role": "user", "content": "Hello, world!"}],
        )
    end_time = datetime.now()
    total_time_ms = (end_time - start_time).total_seconds() * 1000
    print(response)
    print(response._hidden_params)
    litellm_overhead_ms = response._hidden_params["litellm_overhead_time_ms"]
    # calculate what percentage of the total request time is LiteLLM overhead
    overhead_percent = litellm_overhead_ms * 100 / total_time_ms
    print("##########################\n")
    print("total_time_ms", total_time_ms)
    print("response litellm_overhead_ms", litellm_overhead_ms)
    print("litellm overhead_percent {}%".format(overhead_percent))
    print("##########################\n")
    assert litellm_overhead_ms > 0
    assert litellm_overhead_ms < 1000
    # latency overhead should be less than the total request time
    assert litellm_overhead_ms < total_time_ms
    # latency overhead should be under 40% of the total request time
    assert overhead_percent < 40


# Streaming variant: same placeholder parametrization and pytest-asyncio
# assumption as the non-streaming test above.
@pytest.mark.asyncio
@pytest.mark.parametrize("model", ["openai/self_hosted"])
async def test_litellm_overhead_stream(model):
    litellm._turn_on_debug()
    start_time = datetime.now()
    if model == "openai/self_hosted":
        response = await litellm.acompletion(
            model=model,
            messages=[{"role": "user", "content": "Hello, world!"}],
            api_base="https://exampleopenaiendpoint-production.up.railway.app/",
            stream=True,
        )
    else:
        response = await litellm.acompletion(
            model=model,
            messages=[{"role": "user", "content": "Hello, world!"}],
            stream=True,
        )
    # drain the stream so the measured time covers the full response
    async for chunk in response:
        print(chunk)
    end_time = datetime.now()
    total_time_ms = (end_time - start_time).total_seconds() * 1000
    print(response)
    print(response._hidden_params)
    litellm_overhead_ms = response._hidden_params["litellm_overhead_time_ms"]
    # calculate what percentage of the total request time is LiteLLM overhead
    overhead_percent = litellm_overhead_ms * 100 / total_time_ms
    print("##########################\n")
    print("total_time_ms", total_time_ms)
    print("response litellm_overhead_ms", litellm_overhead_ms)
    print("litellm overhead_percent {}%".format(overhead_percent))
    print("##########################\n")
    assert litellm_overhead_ms > 0
    assert litellm_overhead_ms < 1000
    # latency overhead should be less than the total request time
    assert litellm_overhead_ms < total_time_ms
    # latency overhead should be under 40% of the total request time
    assert overhead_percent < 40
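

# Example invocation (a sketch: assumes pytest and pytest-asyncio are installed
# and that this module lives in the LiteLLM test tree; the filename below is
# hypothetical):
#
#   pytest -s test_litellm_overhead.py -k overhead
#
# The -s flag keeps the timing printouts visible so the overhead numbers can be
# inspected alongside the assertions.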