Spaces:
Running
Running
import json | |
import os | |
from github import Github | |
from datetime import datetime, timedelta | |
from huggingface_hub import HfApi, login | |
def check_dataset_updates(dataset_id):
    """Check a HF dataset for new benchmark commits and refresh the local cache.

    Compares the dataset's current revision sha against the sha cached in
    ``dashboard_data/version.json``. If unchanged, writes ``has_updates=false``
    to the file named by $GITHUB_OUTPUT and returns early. Otherwise it scans
    the dataset's ``benchmark_data`` folder for entries newer than six weeks,
    resolves each commit's WhisperKit release version via the GitHub API,
    appends any previously-unseen commit hashes/versions to the cache, and
    writes ``has_updates=true`` to $GITHUB_OUTPUT.

    Args:
        dataset_id: Hugging Face dataset repo id, e.g. "org/name".

    Environment:
        GH_TOKEN: GitHub API token used to query argmaxinc/whisperkit.
        GITHUB_OUTPUT: path of the GitHub Actions step-output file.
    """
    api = HfApi()
    dataset_info = api.dataset_info(dataset_id)
    current_sha = dataset_info.sha
    last_modified = dataset_info.lastModified.isoformat()

    cache_dir = "dashboard_data"
    cache_file = os.path.join(cache_dir, "version.json")

    # Load cached state (if any) up front so the sha short-circuit happens
    # before any GitHub API calls. On a first run the file does not exist yet;
    # fall back to an empty cache instead of crashing.
    cached_data = None
    if os.path.exists(cache_file):
        with open(cache_file, "r") as f:
            cached_data = json.load(f)
        if cached_data.get("sha") == current_sha:
            # Dataset revision unchanged since the last run — nothing to do.
            with open(os.environ["GITHUB_OUTPUT"], "a") as fh:
                print("has_updates=false", file=fh)
            return
    releases = cached_data.get("releases", []) if cached_data else []
    versions = cached_data.get("versions", []) if cached_data else []

    github = Github(os.environ["GH_TOKEN"])
    repo = github.get_repo("argmaxinc/whisperkit")

    # Folder names under benchmark_data look like "<%Y-%m-%dT%H%M%S>_<hash>".
    repo_tree = api.list_repo_tree(
        repo_id=dataset_id,
        repo_type="dataset",
        path_in_repo="benchmark_data",
        recursive=False,
    )
    cutoff_date = datetime.now(dataset_info.lastModified.tzinfo) - timedelta(weeks=6)

    new_commit_hashes = []
    for item in repo_tree:
        folder_name = item.path.split("/")[-1]
        date_str, commit_hash = folder_name.split("_")
        folder_date = datetime.strptime(date_str, "%Y-%m-%dT%H%M%S").replace(
            tzinfo=dataset_info.lastModified.tzinfo
        )
        # Keep only benchmark runs from the last six weeks.
        if folder_date >= cutoff_date:
            new_commit_hashes.append(commit_hash)

    # Resolve each commit's author date and release version from GitHub;
    # skip (but log) commits that cannot be resolved.
    commit_info = []
    for commit_hash in new_commit_hashes:
        try:
            commit = repo.get_commit(commit_hash)
            commit_date = commit.commit.author.date
            release_version = get_commit_version(repo, commit_hash)
            if release_version:
                commit_info.append((commit_hash, commit_date, release_version))
        except Exception as e:
            print(f"Error processing commit {commit_hash}: {str(e)}")
            continue

    # Chronological order by commit author date.
    commit_info.sort(key=lambda x: x[1])
    new_releases = [info[0] for info in commit_info]
    new_versions = [info[2] for info in commit_info]

    # Append only commit hashes the cache has not seen before.
    updated_releases = []
    updated_versions = []
    for release, release_version in zip(new_releases, new_versions):
        if release not in releases:
            updated_releases.append(release)
            updated_versions.append(release_version)

    os.makedirs(cache_dir, exist_ok=True)  # first run: directory may not exist
    with open(cache_file, "w") as f:
        json.dump(
            {
                "last_modified": last_modified,
                "sha": current_sha,
                "releases": releases + updated_releases,
                "versions": versions + updated_versions,
            },
            f,
        )
    with open(os.environ["GITHUB_OUTPUT"], "a") as fh:
        print("has_updates=true", file=fh)
def get_commit_version(repo, commit_hash):
    """Map a commit hash to the release version it belongs to.

    Returns the tag name (leading 'v' stripped) of the earliest release whose
    creation date is at or after the commit's author date. A commit newer than
    every release maps to the latest release's tag. Returns None when the
    lookup fails for any reason (API error, unknown commit, or no releases).
    """
    try:
        ordered = sorted(repo.get_releases(), key=lambda rel: rel.created_at)
        commit_date = repo.get_commit(commit_hash).commit.author.date
        # Earliest release created at-or-after the commit; fall back to the
        # newest release when the commit postdates all of them. (With zero
        # releases, ordered[-1] raises and we land in the except branch.)
        chosen = next(
            (rel for rel in ordered if commit_date <= rel.created_at),
            ordered[-1],
        )
        return chosen.tag_name.lstrip('v')
    except Exception as exc:
        print(f"Error processing commit {commit_hash}: {str(exc)}")
        return None
if __name__ == "__main__":
    # Authenticate with the Hugging Face Hub using the HF_TOKEN env var, then
    # check the evals dataset for new benchmark data. The check writes a
    # has_updates=true/false step output for the surrounding GitHub workflow.
    login(token=os.environ["HF_TOKEN"])
    check_dataset_updates("argmaxinc/whisperkit-evals-dataset")