# whisperkit-benchmarks / .github/scripts/check_dataset_update.py
# Last change by ardaatahan: "No need to convert releases to list" (51eec1a)
import json
import os
from github import Github
from datetime import datetime, timedelta
from huggingface_hub import HfApi, login
def check_dataset_updates(dataset_id):
    """Detect new benchmark data in the HF dataset and update the local cache.

    Compares the dataset's current revision SHA against the one cached in
    dashboard_data/version.json. When the dataset has changed, collects the
    benchmark commit hashes from the last six weeks, resolves each one to a
    WhisperKit release version via the GitHub API, appends any not-yet-cached
    releases/versions to the cache file, and writes ``has_updates=true`` (or
    ``false``) to the file named by ``$GITHUB_OUTPUT``.

    Args:
        dataset_id: HF Hub dataset repo id, e.g.
            "argmaxinc/whisperkit-evals-dataset".

    Side effects:
        Reads ``GH_TOKEN`` and ``GITHUB_OUTPUT`` from the environment, calls
        the HF Hub and GitHub APIs, and rewrites dashboard_data/version.json.
    """
    api = HfApi()
    dataset_info = api.dataset_info(dataset_id)
    last_modified = dataset_info.lastModified.isoformat()
    current_sha = dataset_info.sha

    cache_dir = "dashboard_data"
    cache_file = os.path.join(cache_dir, "version.json")

    # Load the previous state once. A missing cache file means "no history"
    # (first run) rather than a crash — the old code opened the file
    # unconditionally before its own os.path.exists() check.
    cached_data = {}
    if os.path.exists(cache_file):
        with open(cache_file, "r") as f:
            cached_data = json.load(f)

    # Nothing changed upstream: short-circuit before any GitHub API calls.
    if cached_data.get("sha") == current_sha:
        with open(os.environ["GITHUB_OUTPUT"], "a") as fh:
            print("has_updates=false", file=fh)
        return

    github = Github(os.environ["GH_TOKEN"])
    repo = github.get_repo("argmaxinc/whisperkit")

    # benchmark_data/ holds one folder per run, named "<iso-date>_<commit-sha>".
    repo_tree = api.list_repo_tree(
        repo_id=dataset_id,
        repo_type="dataset",
        path_in_repo="benchmark_data",
        recursive=False,
    )
    cutoff_date = datetime.now(dataset_info.lastModified.tzinfo) - timedelta(weeks=6)

    new_commit_hashes = []
    for item in repo_tree:
        folder_name = item.path.split("/")[-1]
        commit_date_str, commit_hash = folder_name.split("_")
        commit_date = datetime.strptime(commit_date_str, "%Y-%m-%dT%H%M%S").replace(
            tzinfo=dataset_info.lastModified.tzinfo
        )
        # Only benchmark runs from the last six weeks are of interest.
        if commit_date >= cutoff_date:
            new_commit_hashes.append(commit_hash)

    # Resolve each commit to (hash, author date, release version); skip
    # commits that fail lookup or fall outside any release.
    commit_info = []
    for commit_hash in new_commit_hashes:
        try:
            commit = repo.get_commit(commit_hash)
            release_version = get_commit_version(repo, commit_hash)
            if release_version:
                commit_info.append((commit_hash, commit.commit.author.date, release_version))
        except Exception as e:
            print(f"Error processing commit {commit_hash}: {str(e)}")

    commit_info.sort(key=lambda info: info[1])  # chronological order

    known_releases = cached_data.get("releases", [])
    known_versions = cached_data.get("versions", [])

    # Append only commits the cache has not seen yet, preserving order.
    updated_releases = []
    updated_versions = []
    for commit_hash, _, release_version in commit_info:
        if commit_hash not in known_releases:
            updated_releases.append(commit_hash)
            updated_versions.append(release_version)

    os.makedirs(cache_dir, exist_ok=True)  # first run: directory may not exist
    with open(cache_file, "w") as f:
        json.dump(
            {
                "last_modified": last_modified,
                "sha": current_sha,
                "releases": known_releases + updated_releases,
                "versions": known_versions + updated_versions,
            },
            f,
        )

    with open(os.environ["GITHUB_OUTPUT"], "a") as fh:
        print("has_updates=true", file=fh)
def get_commit_version(repo, commit_hash):
    """Map a commit to the release version it shipped in.

    Finds the earliest release created at or after the commit's author date;
    a commit newer than every release is attributed to the latest release.

    Args:
        repo: a PyGithub Repository object.
        commit_hash: SHA of the commit to look up.

    Returns:
        The matching release tag as a string with a single leading "v"
        removed, or None when the repo has no releases or the lookup fails.
    """
    try:
        releases = sorted(repo.get_releases(), key=lambda r: r.created_at)
        if not releases:
            # Previously this fell through to releases[-1] and relied on the
            # broad except to catch the IndexError; handle it explicitly.
            return None

        commit_date = repo.get_commit(commit_hash).commit.author.date

        # Earliest release that postdates (or matches) the commit.
        for release in releases:
            if commit_date <= release.created_at:
                matched = release
                break
        else:
            # Commit is newer than every release: use the latest one.
            matched = releases[-1]

        tag = matched.tag_name
        # NOTE: lstrip("v") strips *every* leading "v" (it takes a character
        # set, not a prefix), which would mangle tags like "vv1.0"; remove at
        # most one leading "v" instead.
        return tag[1:] if tag.startswith("v") else tag
    except Exception as e:
        print(f"Error processing commit {commit_hash}: {str(e)}")
        return None
# Entry point for the CI workflow: authenticate with the Hugging Face Hub
# using the HF_TOKEN environment variable, then check the evals dataset
# for new benchmark data.
if __name__ == "__main__":
    login(token=os.environ["HF_TOKEN"])
    check_dataset_updates("argmaxinc/whisperkit-evals-dataset")