Spaces:
Running
on
Zero
Running
on
Zero
Update file check script to check sizes (#32)
Browse files- bytelatent/data/file_util.py +48 -3
bytelatent/data/file_util.py
CHANGED
|
@@ -65,7 +65,10 @@ def print_local_to_delete(
|
|
| 65 |
|
| 66 |
@app.command()
|
| 67 |
def compare_local_to_blob(
|
| 68 |
-
source_dirs: list[str], dst_dir: str, s3_profile: str = "blt"
|
|
|
|
|
|
|
|
|
|
| 69 |
):
|
| 70 |
for s in source_dirs:
|
| 71 |
assert s.endswith("/"), "Dirs must end with /"
|
|
@@ -75,6 +78,7 @@ def compare_local_to_blob(
|
|
| 75 |
local_fs = fsspec.filesystem("file")
|
| 76 |
dst_fs = fsspec.filesystem("s3", profile=s3_profile)
|
| 77 |
source_to_files = {}
|
|
|
|
| 78 |
all_local_files = set()
|
| 79 |
for s in source_dirs:
|
| 80 |
skipped = []
|
|
@@ -97,14 +101,28 @@ def compare_local_to_blob(
|
|
| 97 |
skipped.append(f)
|
| 98 |
continue
|
| 99 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100 |
source_to_files[s].append(f)
|
| 101 |
-
all_local_files.add(f[len(s) :])
|
| 102 |
print(s, len(source_to_files[s]), "skipped", len(skipped), skipped[:10])
|
| 103 |
|
| 104 |
dst_files = dst_fs.find(dst_dir)
|
| 105 |
print(dst_dir, len(dst_files))
|
| 106 |
|
| 107 |
-
dst_file_set = {f[len(dst_dir) - len(S3_PREFIX) :] for f in dst_files}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 108 |
diff = all_local_files.symmetric_difference(dst_file_set)
|
| 109 |
print("Local files", len(all_local_files))
|
| 110 |
print("DST Files", len(dst_file_set))
|
|
@@ -112,6 +130,33 @@ def compare_local_to_blob(
|
|
| 112 |
dst_only_files = dst_file_set - all_local_files
|
| 113 |
print("DST only", len(dst_only_files), list(dst_only_files)[:10])
|
| 114 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 115 |
|
| 116 |
if __name__ == "__main__":
|
| 117 |
app()
|
|
|
|
| 65 |
|
| 66 |
@app.command()
|
| 67 |
def compare_local_to_blob(
|
| 68 |
+
source_dirs: list[str],
|
| 69 |
+
dst_dir: str,
|
| 70 |
+
s3_profile: str = "blt",
|
| 71 |
+
print_sizes: bool = False,
|
| 72 |
):
|
| 73 |
for s in source_dirs:
|
| 74 |
assert s.endswith("/"), "Dirs must end with /"
|
|
|
|
| 78 |
local_fs = fsspec.filesystem("file")
|
| 79 |
dst_fs = fsspec.filesystem("s3", profile=s3_profile)
|
| 80 |
source_to_files = {}
|
| 81 |
+
source_file_to_size = {}
|
| 82 |
all_local_files = set()
|
| 83 |
for s in source_dirs:
|
| 84 |
skipped = []
|
|
|
|
| 101 |
skipped.append(f)
|
| 102 |
continue
|
| 103 |
|
| 104 |
+
file_without_prefix = f[len(s) :]
|
| 105 |
+
if file_without_prefix not in source_file_to_size:
|
| 106 |
+
source_file_to_size[file_without_prefix] = os.path.getsize(f)
|
| 107 |
+
else:
|
| 108 |
+
source_file_to_size[file_without_prefix] = max(
|
| 109 |
+
source_file_to_size[file_without_prefix], os.path.getsize(f)
|
| 110 |
+
)
|
| 111 |
+
|
| 112 |
source_to_files[s].append(f)
|
| 113 |
+
all_local_files.add(file_without_prefix)
|
| 114 |
print(s, len(source_to_files[s]), "skipped", len(skipped), skipped[:10])
|
| 115 |
|
| 116 |
dst_files = dst_fs.find(dst_dir)
|
| 117 |
print(dst_dir, len(dst_files))
|
| 118 |
|
| 119 |
+
dst_file_to_size = {}
|
| 120 |
+
dst_file_set = set()
|
| 121 |
+
for f in dst_files:
|
| 122 |
+
dst_file_without_prefix = f[len(dst_dir) - len(S3_PREFIX) :]
|
| 123 |
+
dst_file_set.add(dst_file_without_prefix)
|
| 124 |
+
dst_file_to_size[dst_file_without_prefix] = dst_fs.size(f)
|
| 125 |
+
|
| 126 |
diff = all_local_files.symmetric_difference(dst_file_set)
|
| 127 |
print("Local files", len(all_local_files))
|
| 128 |
print("DST Files", len(dst_file_set))
|
|
|
|
| 130 |
dst_only_files = dst_file_set - all_local_files
|
| 131 |
print("DST only", len(dst_only_files), list(dst_only_files)[:10])
|
| 132 |
|
| 133 |
+
all_files = dst_file_set | all_local_files
|
| 134 |
+
print("Check that files match")
|
| 135 |
+
size_success = True
|
| 136 |
+
for f in sorted(all_files):
|
| 137 |
+
if f in source_file_to_size and f in dst_file_to_size:
|
| 138 |
+
if source_file_to_size[f] != dst_file_to_size[f]:
|
| 139 |
+
size_success = False
|
| 140 |
+
print(
|
| 141 |
+
f"Mismatch file size for {f}, Local: {source_file_to_size[f]} Blob: {dst_file_to_size[f]}"
|
| 142 |
+
)
|
| 143 |
+
else:
|
| 144 |
+
if print_sizes:
|
| 145 |
+
print(f"Matching file size: {dst_file_to_size[f]} for {f}")
|
| 146 |
+
elif f not in source_file_to_size:
|
| 147 |
+
size_success = False
|
| 148 |
+
print(f"Missing file in source: {f}")
|
| 149 |
+
elif f not in dst_file_to_size:
|
| 150 |
+
size_success = False
|
| 151 |
+
print(f"missing file in dst: {f}")
|
| 152 |
+
else:
|
| 153 |
+
raise ValueError("Unexpected to be missing file in src and dst")
|
| 154 |
+
|
| 155 |
+
if size_success:
|
| 156 |
+
print("All files pass size check")
|
| 157 |
+
else:
|
| 158 |
+
raise ValueError("At least one file failed size comparison check")
|
| 159 |
+
|
| 160 |
|
| 161 |
if __name__ == "__main__":
|
| 162 |
app()
|