Update app.py
Browse files
app.py
CHANGED
@@ -1,10 +1,12 @@
|
|
1 |
-
import
|
2 |
-
from math import exp
|
3 |
import re
|
4 |
import struct
|
5 |
-
import requests
|
6 |
-
import io
|
7 |
from enum import IntEnum
|
|
|
|
|
|
|
|
|
|
|
8 |
|
9 |
|
10 |
class GGUFValueType(IntEnum):
|
@@ -274,28 +276,27 @@ def estimate_vram(metadata, gpu_layers, ctx_size, cache_type):
|
|
274 |
gpu_layers = n_layers
|
275 |
|
276 |
# Convert cache_type to numeric
|
277 |
-
|
278 |
-
|
|
|
|
|
|
|
|
|
279 |
|
280 |
# Derived features
|
281 |
size_per_layer = size_in_mb / max(n_layers, 1e-6)
|
282 |
-
|
283 |
-
|
284 |
-
kv_cache_factor = n_kv_heads * cache_type_numeric * ctx_size
|
285 |
-
|
286 |
-
# Helper function for smaller
|
287 |
-
def smaller(x, y):
|
288 |
-
return 1 if x < y else 0
|
289 |
|
290 |
# Calculate VRAM using the model
|
|
|
291 |
vram = (
|
292 |
-
(size_per_layer -
|
293 |
-
*
|
294 |
-
+
|
295 |
-
|
296 |
-
) * (1.363306170123392 + gpu_layers) + 1255.163594536052
|
297 |
|
298 |
-
return
|
299 |
|
300 |
except Exception as e:
|
301 |
print(f"Error in VRAM calculation: {e}")
|
@@ -310,7 +311,7 @@ def estimate_vram_wrapper(model_metadata, gpu_layers, ctx_size, cache_type):
|
|
310 |
# Use cache_type directly (it's already a string from the radio button)
|
311 |
try:
|
312 |
result = estimate_vram(model_metadata, gpu_layers, ctx_size, cache_type)
|
313 |
-
conservative = result +
|
314 |
return f"""<div id="vram-info">
|
315 |
<div>Expected VRAM usage: <span class="value">{result:.0f} MiB</span></div>
|
316 |
<div>Safe estimate: <span class="value">{conservative:.0f} MiB</span> - 95% chance the VRAM is at most this.</div>
|
|
|
1 |
+
import io
|
|
|
2 |
import re
|
3 |
import struct
|
|
|
|
|
4 |
from enum import IntEnum
|
5 |
+
from math import floor
|
6 |
+
|
7 |
+
import requests
|
8 |
+
|
9 |
+
import gradio as gr
|
10 |
|
11 |
|
12 |
class GGUFValueType(IntEnum):
|
|
|
276 |
gpu_layers = n_layers
|
277 |
|
278 |
# Convert cache_type to numeric
|
279 |
+
if cache_type == 'q4_0':
|
280 |
+
cache_type = 4
|
281 |
+
elif cache_type == 'q8_0':
|
282 |
+
cache_type = 8
|
283 |
+
else:
|
284 |
+
cache_type = 16
|
285 |
|
286 |
# Derived features
|
287 |
size_per_layer = size_in_mb / max(n_layers, 1e-6)
|
288 |
+
kv_cache_factor = n_kv_heads * cache_type * ctx_size
|
289 |
+
embedding_per_context = embedding_dim / ctx_size
|
|
|
|
|
|
|
|
|
|
|
290 |
|
291 |
# Calculate VRAM using the model
|
292 |
+
# Details: https://oobabooga.github.io/blog/posts/gguf-vram-formula/
|
293 |
vram = (
|
294 |
+
(size_per_layer - 17.99552795246051 + 3.148552680382576e-05 * kv_cache_factor)
|
295 |
+
* (gpu_layers + max(0.9690636483914102, cache_type - (floor(50.77817218646521 * embedding_per_context) + 9.987899908205632)))
|
296 |
+
+ 1516.522943869404
|
297 |
+
)
|
|
|
298 |
|
299 |
+
return vram
|
300 |
|
301 |
except Exception as e:
|
302 |
print(f"Error in VRAM calculation: {e}")
|
|
|
311 |
# Use cache_type directly (it's already a string from the radio button)
|
312 |
try:
|
313 |
result = estimate_vram(model_metadata, gpu_layers, ctx_size, cache_type)
|
314 |
+
conservative = result + 577
|
315 |
return f"""<div id="vram-info">
|
316 |
<div>Expected VRAM usage: <span class="value">{result:.0f} MiB</span></div>
|
317 |
<div>Safe estimate: <span class="value">{conservative:.0f} MiB</span> - 95% chance the VRAM is at most this.</div>
|