import json
import os
from datetime import datetime, timedelta

from github import Github
from huggingface_hub import HfApi, login


def check_dataset_updates(dataset_id):
    api = HfApi()
    github = Github(os.environ["GH_TOKEN"])
    repo = github.get_repo("argmaxinc/whisperkit")

    dataset_info = api.dataset_info(dataset_id)
    last_modified = dataset_info.lastModified.isoformat()
    current_sha = dataset_info.sha

    # Load the cached state once, if it exists. If the dataset SHA is
    # unchanged, skip the GitHub API calls below and report no updates.
    cache_dir = "dashboard_data"
    cache_file = os.path.join(cache_dir, "version.json")
    cached_data = {"releases": [], "versions": []}
    if os.path.exists(cache_file):
        with open(cache_file, "r") as f:
            cached_data = json.load(f)
        if cached_data.get("sha") == current_sha:
            with open(os.environ["GITHUB_OUTPUT"], "a") as fh:
                print("has_updates=false", file=fh)
            return
    releases = cached_data.get("releases", [])
    versions = cached_data.get("versions", [])

    # Benchmark folders are named "<date>_<commit hash>"; keep only the
    # commit hashes from the last six weeks.
    repo_tree = api.list_repo_tree(
        repo_id=dataset_id,
        repo_type="dataset",
        path_in_repo="benchmark_data",
        recursive=False,
    )
    cutoff_date = datetime.now(dataset_info.lastModified.tzinfo) - timedelta(weeks=6)
    commit_dates_hashes = [item.path.split("/")[-1] for item in repo_tree]
    new_commit_hashes = []
    for commit_date_hash in commit_dates_hashes:
        date_str, commit_hash = commit_date_hash.split("_")
        commit_date = datetime.strptime(date_str, "%Y-%m-%dT%H%M%S").replace(
            tzinfo=dataset_info.lastModified.tzinfo
        )
        if commit_date < cutoff_date:
            continue
        new_commit_hashes.append(commit_hash)

    # Resolve each commit to its date and the WhisperKit release it maps to.
    commit_info = []
    for commit_hash in new_commit_hashes:
        try:
            commit = repo.get_commit(commit_hash)
            commit_date = commit.commit.author.date
            version = get_commit_version(repo, commit_hash)
            if version:
                commit_info.append((commit_hash, commit_date, version))
        except Exception as e:
            print(f"Error processing commit {commit_hash}: {e}")
            continue

    # Sort by commit date, then split into parallel lists.
    commit_info.sort(key=lambda x: x[1])
    new_releases = [info[0] for info in commit_info]
    new_versions = [info[2] for info in commit_info]

    # Keep only commits that are not already in the cache.
    updated_releases = []
    updated_versions = []
    for release, version in zip(new_releases, new_versions):
        if release not in releases:
            updated_releases.append(release)
            updated_versions.append(version)

    os.makedirs(cache_dir, exist_ok=True)
    with open(cache_file, "w") as f:
        json.dump(
            {
                "last_modified": last_modified,
                "sha": current_sha,
                "releases": releases + updated_releases,
                "versions": versions + updated_versions,
            },
            f,
        )

    with open(os.environ["GITHUB_OUTPUT"], "a") as fh:
        print("has_updates=true", file=fh)


def get_commit_version(repo, commit_hash):
    """Return the version tag of the first release created at or after the
    commit, falling back to the latest release for newer commits."""
    try:
        releases = sorted(repo.get_releases(), key=lambda x: x.created_at)
        commit = repo.get_commit(commit_hash)
        commit_date = commit.commit.author.date
        for release in releases:
            if commit_date <= release.created_at:
                return release.tag_name.lstrip("v")
        return releases[-1].tag_name.lstrip("v")
    except Exception as e:
        print(f"Error processing commit {commit_hash}: {e}")
        return None


if __name__ == "__main__":
    login(token=os.environ["HF_TOKEN"])
    check_dataset_updates("argmaxinc/whisperkit-evals-dataset")
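
# Hypothetical GitHub Actions usage (a sketch, not taken from this repo): the
# step id `check-updates`, the script filename, and the follow-up script name
# are illustrative only. Actions sets GITHUB_OUTPUT automatically, so the
# `has_updates` value written above can gate downstream steps like this:
#
#   - id: check-updates
#     run: python check_dataset_updates.py
#     env:
#       GH_TOKEN: ${{ secrets.GH_TOKEN }}
#       HF_TOKEN: ${{ secrets.HF_TOKEN }}
#   - if: steps.check-updates.outputs.has_updates == 'true'
#     run: python generate_dashboard_data.py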