File size: 19,895 Bytes
5f23e61
 
 
 
 
 
8bcc059
ad1f73b
5f23e61
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8bcc059
ad1f73b
 
 
 
 
8bcc059
 
e9c1ec4
8bcc059
 
e9c1ec4
 
8bcc059
ad1f73b
e9c1ec4
ad1f73b
 
 
8bcc059
 
e9c1ec4
 
8bcc059
 
 
 
 
 
 
 
 
 
e9c1ec4
8bcc059
 
e9c1ec4
8bcc059
e9c1ec4
8bcc059
 
 
 
 
 
 
 
 
e9c1ec4
8bcc059
e9c1ec4
8bcc059
 
 
 
e9c1ec4
8bcc059
e9c1ec4
8bcc059
 
 
 
 
 
 
e9c1ec4
8bcc059
e9c1ec4
8bcc059
ad1f73b
 
e9c1ec4
 
ad1f73b
 
5f23e61
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6762e4b
5f23e61
 
6762e4b
e9c1ec4
6762e4b
69e8ec5
 
6762e4b
5f23e61
8bcc059
 
 
e9c1ec4
 
 
6762e4b
8bcc059
e9c1ec4
6762e4b
e9c1ec4
 
 
6762e4b
8bcc059
e9c1ec4
6762e4b
e9c1ec4
 
5f23e61
 
e9c1ec4
6762e4b
e9c1ec4
8bcc059
e9c1ec4
8bcc059
 
 
 
e9c1ec4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5f23e61
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
import gradio as gr
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
import tempfile
import os
import hashlib
import time
from gradio import Progress

# Function to get OID from a raw Hugging Face LFS file URL
def get_lfs_oid(raw_url: str) -> str | None:
    """
    Download the Git-LFS pointer file at *raw_url* and return its SHA256 OID.

    Returns None when the request fails or no "oid sha256:" line is present.
    """
    try:
        resp = requests.get(raw_url, timeout=10)
        resp.raise_for_status()  # turn 4xx/5xx responses into exceptions
    except requests.exceptions.RequestException as e:
        print(f"Error fetching OID from {raw_url}: {e}")
        return None
    # An LFS pointer file contains a line like: "oid sha256:<hex digest>"
    pointer_line = next(
        (ln for ln in resp.text.splitlines() if ln.startswith("oid sha256:")),
        None,
    )
    return pointer_line.split("sha256:")[1].strip() if pointer_line is not None else None

# Function to get .safetensors file info (file list and OIDs) using only HTTP requests
def get_model_safetensors_info(model_id: str) -> tuple[dict, str]:
    """
    Collect {filename: sha256 oid} for every .safetensors file of *model_id*.

    Uses only plain HTTP requests (Hub REST API for the file list, raw LFS
    pointer files for the OIDs). Returns the mapping plus an accumulated
    error string ("" when everything succeeded).
    """
    oid_by_file: dict = {}
    problems: list[str] = []

    try:
        # The Hub REST API lists repo files under the "siblings" key.
        resp = requests.get(f"https://huggingface.co/api/models/{model_id}", timeout=10)
        if resp.status_code != 200:
            return oid_by_file, f"Could not fetch file list for {model_id}: HTTP {resp.status_code}\n"
        siblings = resp.json().get('siblings', [])
        targets = [s['rfilename'] for s in siblings if s['rfilename'].endswith('.safetensors')]
        if not targets:
            return oid_by_file, f"No .safetensors files found for {model_id}.\n"

        def fetch_oid(name):
            # The raw pointer file carries the LFS metadata, not the weights.
            return name, get_lfs_oid(f"https://huggingface.co/{model_id}/raw/main/{name}")

        # Resolve all pointer files concurrently (bounded worker pool).
        with ThreadPoolExecutor(max_workers=min(8, len(targets))) as pool:
            pending = [pool.submit(fetch_oid, name) for name in targets]
            for done in as_completed(pending):
                name, oid = done.result()
                if oid:
                    oid_by_file[name] = oid
                else:
                    problems.append(f"Could not get OID for {name} in {model_id}.\n")

    except Exception as e:
        problems.append(f"Error fetching info for {model_id}: {e}\n")

    return oid_by_file, "".join(problems)

# Main comparison function (no config, only file structure and OIDs)
def compare_hf_models(model_id1: str, model_id2: str) -> str:
    """
    Compare two Hugging Face models via their safetensors OIDs.

    Returns a human-readable multi-line report ending with a verdict:
    identical (copy-paste) vs. distinct/fine-tuned.
    """
    if not model_id1 or not model_id2:
        return "Please provide both model IDs."

    report = [f"--- Fetching info for Model 1: {model_id1} ---"]
    oids1, err1 = get_model_safetensors_info(model_id1)
    if err1:
        report.append(err1)
    report.append(f"Found {len(oids1)} .safetensors files for {model_id1}.")

    report.append(f"\n--- Fetching info for Model 2: {model_id2} ---")
    oids2, err2 = get_model_safetensors_info(model_id2)
    if err2:
        report.append(err2)
    report.append(f"Found {len(oids2)} .safetensors files for {model_id2}.")

    # 1. Compare Safetensors OIDs
    report.append("\n--- Safetensors Weight File Comparison (via OID) ---")

    weights_identical = False
    if not oids1 and not oids2:
        report.append("No .safetensors files found for either model. Cannot compare weights.")
    elif not oids1:
        report.append(f"No .safetensors files found for {model_id1}. Cannot compare weights.")
    elif not oids2:
        report.append(f"No .safetensors files found for {model_id2}. Cannot compare weights.")
    else:
        names1, names2 = set(oids1.keys()), set(oids2.keys())
        if names1 != names2:
            # Different shard layouts: the repos cannot be byte-identical.
            report.append("The set of .safetensors files differs between models.")
            report.append(f"Files in {model_id1} but not {model_id2}: {names1 - names2}")
            report.append(f"Files in {model_id2} but not {model_id1}: {names2 - names1}")
        else:
            report.append("The models have the same set of .safetensors files.")
            diff_files = [name for name in names1 if oids1[name] != oids2[name]]
            if not diff_files:
                report.append("All corresponding .safetensors OIDs are IDENTICAL.")
                report.append(f"This strongly suggests '{model_id1}' and '{model_id2}' are 'copy-paste' models at the weight level.")
                weights_identical = True
            else:
                report.append(f"Some .safetensors OIDs DIFFER. Differing files: {', '.join(diff_files)}")
                report.append(f"This indicates different weights. If file structure is identical, '{model_id2}' could be a 'fine-tuned' version of '{model_id1}' (or vice-versa, or both fine-tuned from a common base).")

    report.append("\n--- Summary ---")
    if weights_identical:
        report.append(f"Conclusion: Models '{model_id1}' and '{model_id2}' are IDENTICAL (copy-paste).")
    else:
        report.append(f"Conclusion: Models '{model_id1}' and '{model_id2}' have different weights or file structures. They are distinct or fine-tuned models.")

    return "\n".join(report)

def multi_compare_hf_models(model_ids: list) -> tuple:
    """
    Compare any number of models by their safetensors OIDs.

    Returns (summary text, table rows, {model_id: {file: oid}}), or
    (error string, None, None) when fewer than two IDs were given.
    """
    if not model_ids or len(model_ids) < 2:
        return "Please provide at least two model IDs.", None, None

    per_model_oids = {}
    per_model_errors = {}
    # Fetch each model's metadata concurrently.
    with ThreadPoolExecutor(max_workers=min(8, len(model_ids))) as pool:
        futures = {pool.submit(get_model_safetensors_info, mid): mid for mid in model_ids}
        for fut in as_completed(futures):
            mid = futures[fut]
            per_model_oids[mid], per_model_errors[mid] = fut.result()

    # Union of file names across all models, in stable sorted order.
    every_file = sorted({name for oids in per_model_oids.values() for name in oids})

    # Header row, then one row per file: file name, each model's OID, match flag.
    table = [["File"] + model_ids + ["Match"]]
    for fname in every_file:
        oid_cells = [per_model_oids.get(mid, {}).get(fname, "-") or "-" for mid in model_ids]
        # "Match" only when at least two models have the file and all OIDs agree.
        present = [oid for oid in oid_cells if oid != "-"]
        verdict = "Match" if len(present) > 1 and len(set(present)) == 1 else "Unmatch"
        table.append([fname] + oid_cells + [verdict])

    # Per-model counts and any fetch errors.
    summary = []
    for mid in model_ids:
        summary.append(f"{mid}: {len(per_model_oids.get(mid, {}))} .safetensors files.")
        if per_model_errors[mid]:
            summary.append(f"Errors for {mid}: {per_model_errors[mid]}")
    # Flag files that are not present in every model.
    for fname in every_file:
        holders = [mid for mid in model_ids if fname in per_model_oids.get(mid, {})]
        if len(holders) != len(model_ids):
            summary.append(f"File '{fname}' missing in: {set(model_ids) - set(holders)}")

    return "\n".join(summary), table, per_model_oids

def download_file(url, dest):
    """
    Stream *url* to the local path *dest*.

    Returns (True, "") on success or (False, error message) on any failure.
    """
    try:
        resp = requests.get(url, stream=True, timeout=30)
        resp.raise_for_status()
        with open(dest, 'wb') as out:
            # Write in fixed-size chunks so large files never sit fully in memory.
            for piece in resp.iter_content(chunk_size=8192):
                out.write(piece)
    except Exception as e:
        return False, str(e)
    return True, ""

def download_file_with_progress(url, dest, progress: Progress | None = None, progress_offset=0, progress_scale=1):
    """
    Stream *url* to *dest*, optionally reporting progress to a Gradio Progress.

    progress_offset / progress_scale map this file's own [0, 1] progress into a
    sub-range of an overall progress bar, so several sequential downloads can
    share one bar. Returns (True, "") on success or (False, error message).
    """
    try:
        r = requests.get(url, stream=True, timeout=30)
        r.raise_for_status()
        total = int(r.headers.get('content-length', 0))  # 0 when the server omits the header
        downloaded = 0
        start_time = time.time()
        last_update_time = start_time
        update_interval = 1.0  # Update every 1 second for HF Spaces compatibility
        # Bug fix: speed_bps was previously first assigned inside the
        # `elapsed_time > 0` branch but read unconditionally in the ETA branch,
        # raising NameError whenever elapsed_time == 0. Initialize it here.
        speed_bps = 0.0

        if progress and total:
            mb_total = total // 1024 // 1024
            progress(progress_offset, desc=f"🎯 Starting: {os.path.basename(dest)} ({mb_total}MB)")

        with open(dest, 'wb') as f:
            for chunk in r.iter_content(chunk_size=65536):  # 64KB chunks for better performance on HF Spaces
                if chunk:
                    f.write(chunk)
                    downloaded += len(chunk)
                    current_time = time.time()

                    # Throttle UI updates so HF Spaces isn't flooded with events.
                    if progress and total and (current_time - last_update_time) >= update_interval:
                        file_progress = downloaded / total
                        overall_progress = progress_offset + (file_progress * progress_scale)

                        # Calculate average download speed so far.
                        elapsed_time = current_time - start_time
                        if elapsed_time > 0:
                            speed_bps = downloaded / elapsed_time
                            speed_mbps = speed_bps / (1024 * 1024)

                            if speed_mbps >= 1:
                                speed_str = f"{speed_mbps:.1f}MB/s"
                            else:
                                speed_kbps = speed_bps / 1024
                                speed_str = f"{speed_kbps:.0f}KB/s"
                        else:
                            speed_str = "calculating..."

                        # Calculate ETA from the current average speed.
                        if speed_bps > 0:
                            remaining_bytes = total - downloaded
                            eta_seconds = remaining_bytes / speed_bps
                            if eta_seconds < 60:
                                eta_str = f"{eta_seconds:.0f}s"
                            else:
                                eta_minutes = eta_seconds / 60
                                eta_str = f"{eta_minutes:.1f}min"
                        else:
                            eta_str = "calculating..."

                        mb_downloaded = downloaded // 1024 // 1024
                        mb_total = total // 1024 // 1024

                        # Simplified progress message for HF Spaces
                        progress(overall_progress,
                                desc=f"⬇️ {mb_downloaded}/{mb_total}MB ({file_progress*100:.0f}%) • {speed_str} • ETA: {eta_str}")

                        last_update_time = current_time

        if progress:
            # Final update: report total size and average throughput.
            final_time = time.time()
            total_time = final_time - start_time
            avg_speed = (downloaded / total_time) / (1024 * 1024) if total_time > 0 else 0
            mb_total = total // 1024 // 1024
            progress(progress_offset + progress_scale,
                    desc=f"✅ Complete: {mb_total}MB downloaded (avg {avg_speed:.1f}MB/s)")

        return True, ""
    except Exception as e:
        if progress:
            progress(progress_offset + progress_scale, desc=f"❌ Download failed: {str(e)[:50]}...")
        return False, str(e)

def file_similarity(file1, file2, chunk_size=1024*1024):
    """
    Compare two files byte-by-byte and return (percent similarity, error).

    Similarity is the percentage of positions holding identical bytes.
    Files of differing size are reported as (0.0, message); otherwise the
    second tuple element is None.
    """
    size1 = os.path.getsize(file1)
    size2 = os.path.getsize(file2)
    if size1 != size2:
        return 0.0, f"File sizes differ: {size1} vs {size2} bytes."
    total = size1
    same = 0
    with open(file1, 'rb') as f1, open(file2, 'rb') as f2:
        while chunk1 := f1.read(chunk_size):
            chunk2 = f2.read(chunk_size)
            if chunk1 == chunk2:
                # Fast path: identical chunks compare at C speed; the original
                # walked every byte in Python even for exact copies.
                same += len(chunk1)
            else:
                # Slow path only for chunks that actually differ.
                same += sum(a == b for a, b in zip(chunk1, chunk2))
    percent = (same / total) * 100 if total else 0.0
    return percent, None

# Gradio Interface
# Builds the main page: two model-ID inputs, a Compare button, and three
# output widgets (verdict, per-file table, text summary).
with gr.Blocks(theme="soft") as demo:
    gr.Markdown(
        """
        # 🤖 Hugging Face Model Cross-Checker
        Easily check if two Hugging Face models are **identical (copy-paste)**, **fine-tuned**, or **completely different**—without downloading any weights!
        
        - Enter two model IDs below (e.g. `deepseek-ai/DeepSeek-R1-0528` and `Parveshiiii/DeepSeek-R1-0528-MathX`).
        - Click **Compare** to see a clear verdict and detailed breakdown.
        """
    )
    # Side-by-side inputs for the two repos to compare.
    with gr.Row():
        model1 = gr.Textbox(label="Model ID 1", placeholder="e.g. deepseek-ai/DeepSeek-R1-0528")
        model2 = gr.Textbox(label="Model ID 2", placeholder="e.g. Parveshiiii/DeepSeek-R1-0528-MathX")
    compare_btn = gr.Button("Compare")
    # Color-coded verdict; labels here must match those produced by crosscheck_ui.
    verdict = gr.HighlightedText(label="Result Verdict", color_map={"Copy-Paste":"green","Fine-Tuned":"orange","Different":"red","Error":"gray"})
    # One row per .safetensors file with both OIDs and a Match/Unmatch flag.
    details = gr.Dataframe(headers=["File","Model 1 OID","Model 2 OID","Match"], label="File-by-File Comparison", interactive=False)
    summary = gr.Textbox(label="Summary Details", lines=8, interactive=False)
    def crosscheck_ui(m1, m2):
        """Build the verdict, per-file OID table, and text summary for two model IDs."""
        if not m1 or not m2:
            return [("Error: Please provide both model IDs.", "Error")], [], ""
        oids1, err1 = get_model_safetensors_info(m1)
        oids2, err2 = get_model_safetensors_info(m2)
        if err1 or err2:
            return [(f"Error: {err1 or ''} {err2 or ''}", "Error")], [], ""

        files = sorted(set(oids1.keys()) | set(oids2.keys()))
        table = []
        all_match, all_present, diff_count = True, True, 0
        for fname in files:
            left = oids1.get(fname, "-")
            right = oids2.get(fname, "-")
            matched = left == right and left != "-"
            if not matched:
                all_match = False
                # Count only genuine OID mismatches, not missing files.
                if left != "-" and right != "-":
                    diff_count += 1
            if left == "-" or right == "-":
                all_present = False
            table.append([fname, left, right, "Match" if matched else "Unmatch"])

        # Verdict logic: identical shards → copy-paste; same layout but
        # differing OIDs → likely fine-tune; anything else → different.
        if all_match and all_present and files:
            verdict_text = [("Copy-Paste: Models are identical at the safetensors level!", "Copy-Paste")]
        elif all_present and diff_count > 0:
            verdict_text = [("Fine-Tuned: Same file structure, but weights differ.", "Fine-Tuned")]
        else:
            verdict_text = [("Different: File structure or weights are different.", "Different")]

        match_total = sum(row[3] == "Match" for row in table)
        summary_lines = [
            f"Model 1: {m1} ({len(oids1)} .safetensors files)",
            f"Model 2: {m2} ({len(oids2)} .safetensors files)",
            f"Files compared: {len(files)}",
            f"Matching files: {match_total}",
            f"Unmatched files: {len(table) - match_total}",
        ]
        missing1 = [fname for fname in files if oids1.get(fname) is None]
        missing2 = [fname for fname in files if oids2.get(fname) is None]
        if missing1:
            summary_lines.append(f"Files missing in Model 1: {', '.join(missing1)}")
        if missing2:
            summary_lines.append(f"Files missing in Model 2: {', '.join(missing2)}")
        return verdict_text, table, "\n".join(summary_lines)

    # Wire the main Compare button to the cross-check handler.
    compare_btn.click(
        fn=crosscheck_ui,
        inputs=[model1, model2],
        outputs=[verdict, details, summary]
    )
    # Optional deep-comparison section: downloads one shard from each model
    # and compares the raw bytes (useful when OIDs alone are inconclusive).
    with gr.Accordion("Advanced: Compare File Shards Bitwise", open=False):
        gr.Markdown("""
        ## Compare a specific file (shard) from both models, byte-by-byte
        - Enter the file name (e.g. `model-00001-of-00010.safetensors`).
        - The tool will download this file from both models and compare their contents.
        - Shows the percent of identical bytes (100% = exact copy).
        """)
        adv_model1 = gr.Textbox(label="Model ID 1", placeholder="e.g. deepseek-ai/DeepSeek-R1-0528")
        adv_model2 = gr.Textbox(label="Model ID 2", placeholder="e.g. Parveshiiii/DeepSeek-R1-0528-MathX")
        adv_filename = gr.Textbox(label="File Name", placeholder="e.g. model-00001-of-00010.safetensors")
        adv_btn = gr.Button("Download & Compare File")
        adv_result = gr.Textbox(label="Bitwise Comparison Result", lines=3, interactive=False)
        def adv_compare(m1, m2, fname, progress=gr.Progress()):
            """Download one shard from each model and report byte-level similarity."""
            if not m1 or not m2 or not fname:
                return "Please provide both model IDs and the file name."

            progress(0.0, desc="🚀 Initializing comparison...")

            src1 = f"https://huggingface.co/{m1}/resolve/main/{fname}?download=true"
            src2 = f"https://huggingface.co/{m2}/resolve/main/{fname}?download=true"

            # Temp dir is deleted automatically, so the shards never persist.
            with tempfile.TemporaryDirectory() as workdir:
                local1 = os.path.join(workdir, f"model1_{fname}")
                local2 = os.path.join(workdir, f"model2_{fname}")

                # First download occupies roughly the 5%–47.5% progress span.
                progress(0.05, desc=f"📡 Connecting to {m1.split('/')[-1]}...")
                ok, why = download_file_with_progress(src1, local1, progress, progress_offset=0.05, progress_scale=0.425)
                if not ok:
                    return f"❌ Download failed from {m1}: {why}"

                # Second download occupies roughly the 50%–92.5% span.
                progress(0.5, desc=f"📡 Connecting to {m2.split('/')[-1]}...")
                ok, why = download_file_with_progress(src2, local2, progress, progress_offset=0.5, progress_scale=0.425)
                if not ok:
                    return f"❌ Download failed from {m2}: {why}"

                # Byte-level comparison fills the final stretch of the bar.
                progress(0.95, desc="🔍 Analyzing files byte-by-byte...")
                percent, cmp_err = file_similarity(local1, local2)
                if cmp_err:
                    return f"❌ Comparison error: {cmp_err}"

                progress(1.0, desc="✅ Analysis complete!")

                size_mb = os.path.getsize(local1) // 1024 // 1024

                # Bucket the similarity into a coarse verdict with a matching icon.
                if percent == 100:
                    result_icon, result_text = "🟢", "IDENTICAL"
                elif percent >= 99:
                    result_icon, result_text = "🟡", "NEARLY IDENTICAL"
                elif percent >= 90:
                    result_icon, result_text = "🟠", "SIMILAR"
                else:
                    result_icon, result_text = "🔴", "DIFFERENT"

                return f"{result_icon} **{result_text}** ({percent:.3f}% similarity)\n📁 File size: {size_mb}MB\n🔗 Models: {m1.split('/')[-1]} vs {m2.split('/')[-1]}"
        # Wire the advanced bitwise-compare button to its handler.
        adv_btn.click(
            fn=adv_compare,
            inputs=[adv_model1, adv_model2, adv_filename],
            outputs=[adv_result]
        )
# Start the Gradio server (blocks until the app is shut down).
demo.launch()