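"""
Hugging Face Model Cross-Checker (Gradio Space).

Compares two Hugging Face models without downloading their weights by fetching the
SHA256 OIDs stored in their .safetensors LFS pointer files and checking whether the
file sets and OIDs match (copy-paste vs. fine-tuned vs. different). An optional
advanced mode downloads a single shard from each model and compares it byte-by-byte.
"""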
import gradio as gr
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
import tempfile
import os
import hashlib
import time
from gradio import Progress
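
# Only public Hugging Face endpoints are used, via plain HTTP requests (no token):
#   - https://huggingface.co/api/models/{model_id}               -> list of repo files ("siblings")
#   - https://huggingface.co/{model_id}/raw/main/{filename}      -> LFS pointer text ("oid sha256:...")
#   - https://huggingface.co/{model_id}/resolve/main/{filename}  -> actual file bytes (advanced bitwise check only)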


# Function to get OID from a raw Hugging Face LFS file URL
def get_lfs_oid(raw_url: str) -> str | None:
    """
    Fetches the content of a raw Hugging Face LFS file URL and extracts the SHA256 OID.
    """
    try:
        response = requests.get(raw_url, timeout=10)
        response.raise_for_status()  # Raise an HTTPError for bad responses (4xx or 5xx)
        content = response.text
        for line in content.splitlines():
            if line.startswith("oid sha256:"):
                return line.split("sha256:")[1].strip()
        return None
    except requests.exceptions.RequestException as e:
        print(f"Error fetching OID from {raw_url}: {e}")
        return None


# Function to get .safetensors file info (file list and OIDs) using only HTTP requests
def get_model_safetensors_info(model_id: str) -> tuple[dict, str]:
    """
    Fetches safetensors file information for a Hugging Face model using HTTP requests.
    Returns {filename: oid} and error_message.
    """
    safetensors_oids = {}
    error_message = ""
    try:
        # Use Hugging Face Hub REST API to get file list
        api_url = f"https://huggingface.co/api/models/{model_id}"
        resp = requests.get(api_url, timeout=10)
        if resp.status_code != 200:
            error_message += f"Could not fetch file list for {model_id}: HTTP {resp.status_code}\n"
            return safetensors_oids, error_message
        data = resp.json()
        files = [f['rfilename'] for f in data.get('siblings', []) if f['rfilename'].endswith('.safetensors')]
        if not files:
            error_message += f"No .safetensors files found for {model_id}.\n"
            return safetensors_oids, error_message

        # Parallel OID fetch
        def fetch_oid(f):
            raw_url = f"https://huggingface.co/{model_id}/raw/main/{f}"
            oid = get_lfs_oid(raw_url)
            return f, oid

        with ThreadPoolExecutor(max_workers=min(8, len(files))) as executor:
            future_to_file = {executor.submit(fetch_oid, f): f for f in files}
            for future in as_completed(future_to_file):
                f, oid = future.result()
                if oid:
                    safetensors_oids[f] = oid
                else:
                    error_message += f"Could not get OID for {f} in {model_id}.\n"
    except Exception as e:
        error_message += f"Error fetching info for {model_id}: {e}\n"
    return safetensors_oids, error_message


# Main comparison function (no config, only file structure and OIDs).
# Produces a plain-text report; not wired into the Gradio UI below.
def compare_hf_models(model_id1: str, model_id2: str) -> str:
    """
    Compares two Hugging Face models based on their safetensors OIDs.
    """
    if not model_id1 or not model_id2:
        return "Please provide both model IDs."

    output = []
    output.append(f"--- Fetching info for Model 1: {model_id1} ---")
    oids1, err1 = get_model_safetensors_info(model_id1)
    if err1:
        output.append(err1)
    output.append(f"Found {len(oids1)} .safetensors files for {model_id1}.")

    output.append(f"\n--- Fetching info for Model 2: {model_id2} ---")
    oids2, err2 = get_model_safetensors_info(model_id2)
    if err2:
        output.append(err2)
    output.append(f"Found {len(oids2)} .safetensors files for {model_id2}.")

    # 1. Compare Safetensors OIDs
    output.append("\n--- Safetensors Weight File Comparison (via OID) ---")
    if not oids1 and not oids2:
        output.append("No .safetensors files found for either model. Cannot compare weights.")
        weights_identical = False
    elif not oids1:
        output.append(f"No .safetensors files found for {model_id1}. Cannot compare weights.")
        weights_identical = False
    elif not oids2:
        output.append(f"No .safetensors files found for {model_id2}. Cannot compare weights.")
        weights_identical = False
    else:
        # Check if file lists are identical
        files1_set = set(oids1.keys())
        files2_set = set(oids2.keys())
        if files1_set != files2_set:
            output.append("The set of .safetensors files differs between models.")
            output.append(f"Files in {model_id1} but not {model_id2}: {files1_set - files2_set}")
            output.append(f"Files in {model_id2} but not {model_id1}: {files2_set - files1_set}")
            weights_identical = False
        else:
            output.append("The models have the same set of .safetensors files.")
            all_oids_match = True
            diff_files = []
            for filename in files1_set:
                if oids1[filename] != oids2[filename]:
                    all_oids_match = False
                    diff_files.append(filename)
            if all_oids_match:
                output.append("All corresponding .safetensors OIDs are IDENTICAL.")
                output.append(f"This strongly suggests '{model_id1}' and '{model_id2}' are 'copy-paste' models at the weight level.")
                weights_identical = True
            else:
                output.append(f"Some .safetensors OIDs DIFFER. Differing files: {', '.join(diff_files)}")
                output.append(f"This indicates different weights. If file structure is identical, '{model_id2}' could be a 'fine-tuned' version of '{model_id1}' (or vice-versa, or both fine-tuned from a common base).")
                weights_identical = False

    output.append("\n--- Summary ---")
    if weights_identical:
        output.append(f"Conclusion: Models '{model_id1}' and '{model_id2}' are IDENTICAL (copy-paste).")
    else:
        output.append(f"Conclusion: Models '{model_id1}' and '{model_id2}' have different weights or file structures. They are distinct or fine-tuned models.")
    return "\n".join(output)


def multi_compare_hf_models(model_ids: list) -> tuple:
    """
    Compares any number of models by their .safetensors OIDs.
    Returns (summary_text, comparison_table, {model_id: {filename: oid}}).
    Standalone helper; not wired into the Gradio UI below.
    """
    if not model_ids or len(model_ids) < 2:
        return "Please provide at least two model IDs.", None, None

    safetensors_data = {}
    errors = {}
    # Fetch all model info in parallel
    with ThreadPoolExecutor(max_workers=min(8, len(model_ids))) as executor:
        future_to_model = {executor.submit(get_model_safetensors_info, mid): mid for mid in model_ids}
        for future in as_completed(future_to_model):
            mid = future_to_model[future]
            oids, err = future.result()
            safetensors_data[mid] = oids
            errors[mid] = err

    # Build summary
    summary = []
    all_files = set()
    for mid, oids in safetensors_data.items():
        all_files.update(oids.keys())
    all_files = sorted(all_files)

    # Table header
    table = [["File"] + model_ids + ["Match"]]
    for f in all_files:
        row = [f]
        oids_for_file = []
        for mid in model_ids:
            oid = safetensors_data.get(mid, {}).get(f, "-")
            oids_for_file.append(oid if oid else "-")
            row.append(oid if oid else "-")
        # Determine if all OIDs for this file match (ignoring missing)
        present_oids = [oid for oid in oids_for_file if oid != "-"]
        if len(present_oids) > 1 and all(oid == present_oids[0] for oid in present_oids):
            row.append("Match")
        else:
            row.append("Unmatch")
        table.append(row)

    # Per-model details
    for mid in model_ids:
        oids = safetensors_data.get(mid, {})
        summary.append(f"{mid}: {len(oids)} .safetensors files.")
        if errors[mid]:
            summary.append(f"Errors for {mid}: {errors[mid]}")

    # File presence summary
    for f in all_files:
        present = [mid for mid in model_ids if f in safetensors_data.get(mid, {})]
        if len(present) != len(model_ids):
            summary.append(f"File '{f}' missing in: {set(model_ids) - set(present)}")
    return "\n".join(summary), table, safetensors_data


def download_file(url, dest):
    """Plain download of `url` to `dest` without progress reporting.
    Returns (success, error_message); the UI uses download_file_with_progress instead."""
    try:
        r = requests.get(url, stream=True, timeout=30)
        r.raise_for_status()
        with open(dest, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
        return True, ""
    except Exception as e:
        return False, str(e)


def download_file_with_progress(url, dest, progress: Progress = None, progress_offset=0, progress_scale=1):
    """
    Downloads `url` to `dest` while reporting progress, speed and ETA through a
    Gradio Progress object. `progress_offset` and `progress_scale` map this file's
    0-1 progress into a sub-range of the overall progress bar.
    Returns (success, error_message).
    """
    try:
        r = requests.get(url, stream=True, timeout=30)
        r.raise_for_status()
        total = int(r.headers.get('content-length', 0))
        downloaded = 0
        start_time = time.time()
        last_update_time = start_time
        update_interval = 1.0  # Update every 1 second for HF Spaces compatibility
        if progress and total:
            mb_total = total // 1024 // 1024
            progress(progress_offset, desc=f"🎯 Starting: {os.path.basename(dest)} ({mb_total}MB)")
        with open(dest, 'wb') as f:
            for chunk in r.iter_content(chunk_size=65536):  # 64KB chunks for better performance on HF Spaces
                if chunk:
                    f.write(chunk)
                    downloaded += len(chunk)
                    current_time = time.time()
                    # Update progress less frequently for HF Spaces
                    if progress and total and (current_time - last_update_time) >= update_interval:
                        file_progress = downloaded / total
                        overall_progress = progress_offset + (file_progress * progress_scale)
                        # Calculate download speed (guard against a zero elapsed time)
                        elapsed_time = current_time - start_time
                        speed_bps = downloaded / elapsed_time if elapsed_time > 0 else 0
                        if speed_bps > 0:
                            speed_mbps = speed_bps / (1024 * 1024)
                            if speed_mbps >= 1:
                                speed_str = f"{speed_mbps:.1f}MB/s"
                            else:
                                speed_str = f"{speed_bps / 1024:.0f}KB/s"
                        else:
                            speed_str = "calculating..."
                        # Calculate ETA
                        if speed_bps > 0:
                            remaining_bytes = total - downloaded
                            eta_seconds = remaining_bytes / speed_bps
                            if eta_seconds < 60:
                                eta_str = f"{eta_seconds:.0f}s"
                            else:
                                eta_str = f"{eta_seconds / 60:.1f}min"
                        else:
                            eta_str = "calculating..."
                        mb_downloaded = downloaded // 1024 // 1024
                        mb_total = total // 1024 // 1024
                        # Simplified progress message for HF Spaces
                        progress(overall_progress,
                                 desc=f"⬇️ {mb_downloaded}/{mb_total}MB ({file_progress*100:.0f}%) • {speed_str} • ETA: {eta_str}")
                        last_update_time = current_time
        if progress:
            total_time = time.time() - start_time
            avg_speed = (downloaded / total_time) / (1024 * 1024) if total_time > 0 else 0
            mb_done = downloaded // 1024 // 1024
            progress(progress_offset + progress_scale,
                     desc=f"✅ Complete: {mb_done}MB downloaded (avg {avg_speed:.1f}MB/s)")
        return True, ""
    except Exception as e:
        if progress:
            progress(progress_offset + progress_scale, desc=f"❌ Download failed: {str(e)[:50]}...")
        return False, str(e)


def file_similarity(file1, file2, chunk_size=1024*1024):
    """
    Compares two files byte-by-byte and returns percent similarity (by identical bytes).
    """
    size1 = os.path.getsize(file1)
    size2 = os.path.getsize(file2)
    if size1 != size2:
        return 0.0, f"File sizes differ: {size1} vs {size2} bytes."
    total = size1
    same = 0
    with open(file1, 'rb') as f1, open(file2, 'rb') as f2:
        while True:
            b1 = f1.read(chunk_size)
            b2 = f2.read(chunk_size)
            if not b1:
                break
            for x, y in zip(b1, b2):
                if x == y:
                    same += 1
    percent = (same / total) * 100 if total else 0.0
    return percent, None


# Gradio Interface
with gr.Blocks(theme="soft") as demo:
    gr.Markdown(
        """
        # 🤖 Hugging Face Model Cross-Checker
        Easily check if two Hugging Face models are **identical (copy-paste)**, **fine-tuned**, or **completely different**—without downloading any weights!
        - Enter two model IDs below (e.g. `deepseek-ai/DeepSeek-R1-0528` and `Parveshiiii/DeepSeek-R1-0528-MathX`).
        - Click **Compare** to see a clear verdict and detailed breakdown.
        """
    )
    with gr.Row():
        model1 = gr.Textbox(label="Model ID 1", placeholder="e.g. deepseek-ai/DeepSeek-R1-0528")
        model2 = gr.Textbox(label="Model ID 2", placeholder="e.g. Parveshiiii/DeepSeek-R1-0528-MathX")
    compare_btn = gr.Button("Compare")
    verdict = gr.HighlightedText(label="Result Verdict", color_map={"Copy-Paste": "green", "Fine-Tuned": "orange", "Different": "red", "Error": "gray"})
    details = gr.Dataframe(headers=["File", "Model 1 OID", "Model 2 OID", "Match"], label="File-by-File Comparison", interactive=False)
    summary = gr.Textbox(label="Summary Details", lines=8, interactive=False)

    def crosscheck_ui(m1, m2):
        """Builds the verdict, the file-by-file table and the summary for two model IDs."""
        if not m1 or not m2:
            return [("Error: Please provide both model IDs.", "Error")], [], ""
        oids1, err1 = get_model_safetensors_info(m1)
        oids2, err2 = get_model_safetensors_info(m2)
        if err1 or err2:
            return [(f"Error: {err1 or ''} {err2 or ''}", "Error")], [], ""
        files = sorted(set(oids1.keys()) | set(oids2.keys()))
        table = []
        all_match = True
        all_present = True
        diff_count = 0
        for f in files:
            oid1 = oids1.get(f, "-")
            oid2 = oids2.get(f, "-")
            if oid1 == oid2 and oid1 != "-":
                match = "Match"
            else:
                match = "Unmatch"
                all_match = False
                if oid1 != "-" and oid2 != "-":
                    diff_count += 1
                if oid1 == "-" or oid2 == "-":
                    all_present = False
            table.append([f, oid1, oid2, match])

        # Verdict logic
        if all_match and all_present and files:
            verdict_text = [("Copy-Paste: Models are identical at the safetensors level!", "Copy-Paste")]
        elif all_present and diff_count > 0:
            verdict_text = [("Fine-Tuned: Same file structure, but weights differ.", "Fine-Tuned")]
        else:
            verdict_text = [("Different: File structure or weights are different.", "Different")]

        # Summary
        summary_lines = [
            f"Model 1: {m1} ({len(oids1)} .safetensors files)",
            f"Model 2: {m2} ({len(oids2)} .safetensors files)",
            f"Files compared: {len(files)}",
            f"Matching files: {sum(1 for row in table if row[3] == 'Match')}",
            f"Unmatched files: {sum(1 for row in table if row[3] == 'Unmatch')}",
        ]
        missing1 = [f for f in files if oids1.get(f) is None]
        missing2 = [f for f in files if oids2.get(f) is None]
        if missing1:
            summary_lines.append(f"Files missing in Model 1: {', '.join(missing1)}")
        if missing2:
            summary_lines.append(f"Files missing in Model 2: {', '.join(missing2)}")
        return verdict_text, table, "\n".join(summary_lines)

    compare_btn.click(
        fn=crosscheck_ui,
        inputs=[model1, model2],
        outputs=[verdict, details, summary]
    )

    with gr.Accordion("Advanced: Compare File Shards Bitwise", open=False):
        gr.Markdown("""
        ## Compare a specific file (shard) from both models, byte-by-byte
        - Enter the file name (e.g. `model-00001-of-00010.safetensors`).
        - The tool will download this file from both models and compare their contents.
        - Shows the percent of identical bytes (100% = exact copy).
        """)
        adv_model1 = gr.Textbox(label="Model ID 1", placeholder="e.g. deepseek-ai/DeepSeek-R1-0528")
        adv_model2 = gr.Textbox(label="Model ID 2", placeholder="e.g. Parveshiiii/DeepSeek-R1-0528-MathX")
        adv_filename = gr.Textbox(label="File Name", placeholder="e.g. model-00001-of-00010.safetensors")
        adv_btn = gr.Button("Download & Compare File")
        adv_result = gr.Textbox(label="Bitwise Comparison Result", lines=3, interactive=False)

        def adv_compare(m1, m2, fname, progress=gr.Progress()):
            """Downloads one shard from each model and reports byte-level similarity."""
            if not m1 or not m2 or not fname:
                return "Please provide both model IDs and the file name."
            progress(0.0, desc="🚀 Initializing comparison...")
            url1 = f"https://huggingface.co/{m1}/resolve/main/{fname}?download=true"
            url2 = f"https://huggingface.co/{m2}/resolve/main/{fname}?download=true"
            with tempfile.TemporaryDirectory() as tmp:
                f1 = os.path.join(tmp, f"model1_{fname}")
                f2 = os.path.join(tmp, f"model2_{fname}")

                # Download first file (5% to 47.5%)
                progress(0.05, desc=f"📡 Connecting to {m1.split('/')[-1]}...")
                ok1, err1 = download_file_with_progress(url1, f1, progress, progress_offset=0.05, progress_scale=0.425)
                if not ok1:
                    return f"❌ Download failed from {m1}: {err1}"

                # Download second file (50% to 92.5%)
                progress(0.5, desc=f"📡 Connecting to {m2.split('/')[-1]}...")
                ok2, err2 = download_file_with_progress(url2, f2, progress, progress_offset=0.5, progress_scale=0.425)
                if not ok2:
                    return f"❌ Download failed from {m2}: {err2}"

                # Compare files (95% to 100%)
                progress(0.95, desc="🔍 Analyzing files byte-by-byte...")
                percent, err = file_similarity(f1, f2)
                if err:
                    return f"❌ Comparison error: {err}"
                progress(1.0, desc="✅ Analysis complete!")

                # Get file info
                size1 = os.path.getsize(f1)
                size_mb = size1 // 1024 // 1024

                # Enhanced result formatting
                if percent == 100:
                    result_icon = "🟢"
                    result_text = "IDENTICAL"
                elif percent >= 99:
                    result_icon = "🟡"
                    result_text = "NEARLY IDENTICAL"
                elif percent >= 90:
                    result_icon = "🟠"
                    result_text = "SIMILAR"
                else:
                    result_icon = "🔴"
                    result_text = "DIFFERENT"
                return f"{result_icon} **{result_text}** ({percent:.3f}% similarity)\n📁 File size: {size_mb}MB\n🔗 Models: {m1.split('/')[-1]} vs {m2.split('/')[-1]}"

        adv_btn.click(
            fn=adv_compare,
            inputs=[adv_model1, adv_model2, adv_filename],
            outputs=[adv_result]
        )

demo.launch() |
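
# To run this app locally (assuming the file is saved as app.py and the two
# dependencies are installed):
#     pip install gradio requests
#     python app.py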