import json
import os
import sys
import time
from datetime import datetime
from unittest.mock import AsyncMock, patch, MagicMock

import pytest

sys.path.insert(
    0, os.path.abspath("../..")
)  # Adds the parent directory to the system path
import litellm


@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model",
    [
        "bedrock/mistral.mistral-7b-instruct-v0:2",
        "openai/gpt-4o",
        "openai/self_hosted",
        "bedrock/anthropic.claude-3-5-haiku-20241022-v1:0",
    ],
)
async def test_litellm_overhead(model):
    litellm._turn_on_debug()
    start_time = datetime.now()
    if model == "openai/self_hosted":
        response = await litellm.acompletion(
            model=model,
            messages=[{"role": "user", "content": "Hello, world!"}],
            api_base="https://exampleopenaiendpoint-production.up.railway.app/",
        )
    else:
        response = await litellm.acompletion(
            model=model,
            messages=[{"role": "user", "content": "Hello, world!"}],
        )
    end_time = datetime.now()
    total_time_ms = (end_time - start_time).total_seconds() * 1000
    print(response)
    print(response._hidden_params)
    litellm_overhead_ms = response._hidden_params["litellm_overhead_time_ms"]

    # calculate percent of overhead caused by litellm
    overhead_percent = litellm_overhead_ms * 100 / total_time_ms

    print("##########################\n")
    print("total_time_ms", total_time_ms)
    print("response litellm_overhead_ms", litellm_overhead_ms)
    print("litellm overhead_percent {}%".format(overhead_percent))
    print("##########################\n")

    assert litellm_overhead_ms > 0
    assert litellm_overhead_ms < 1000
    # latency overhead should be less than total request time
    assert litellm_overhead_ms < total_time_ms
    # latency overhead should be under 40% of total request time
    assert overhead_percent < 40


@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model",
    [
        "bedrock/mistral.mistral-7b-instruct-v0:2",
        "openai/gpt-4o",
        "bedrock/anthropic.claude-3-5-haiku-20241022-v1:0",
        "openai/self_hosted",
    ],
)
async def test_litellm_overhead_stream(model):
    litellm._turn_on_debug()
    start_time = datetime.now()
    if model == "openai/self_hosted":
        response = await litellm.acompletion(
            model=model,
            messages=[{"role": "user", "content": "Hello, world!"}],
            api_base="https://exampleopenaiendpoint-production.up.railway.app/",
            stream=True,
        )
    else:
        response = await litellm.acompletion(
            model=model,
            messages=[{"role": "user", "content": "Hello, world!"}],
            stream=True,
        )

    # consume the entire stream so the full request duration is measured;
    # the chunk contents themselves are not needed for this test
    async for chunk in response:
        print()

    end_time = datetime.now()
    total_time_ms = (end_time - start_time).total_seconds() * 1000
    print(response)
    print(response._hidden_params)
    litellm_overhead_ms = response._hidden_params["litellm_overhead_time_ms"]

    # calculate percent of overhead caused by litellm
    overhead_percent = litellm_overhead_ms * 100 / total_time_ms

    print("##########################\n")
    print("total_time_ms", total_time_ms)
    print("response litellm_overhead_ms", litellm_overhead_ms)
    print("litellm overhead_percent {}%".format(overhead_percent))
    print("##########################\n")

    assert litellm_overhead_ms > 0
    assert litellm_overhead_ms < 1000
    # latency overhead should be less than total request time
    assert litellm_overhead_ms < total_time_ms
    # latency overhead should be under 40% of total request time
    assert overhead_percent < 40