jamtur01's picture
Upload folder using huggingface_hub
9c6594c verified
"""Support for parsing GitHub URLs (which might be user provided) into constituent parts."""
import re
from dataclasses import dataclass
from enum import IntEnum
from typing import Optional, Tuple, Union
from wandb.sdk.launch.errors import LaunchError
PREFIX_HTTPS = "https://"
PREFIX_SSH = "git@"
SUFFIX_GIT = ".git"
GIT_COMMIT_REGEX = re.compile(r"[0-9a-f]{40}")
class ReferenceType(IntEnum):
BRANCH = 1
COMMIT = 2
def _parse_netloc(netloc: str) -> Tuple[Optional[str], Optional[str], str]:
"""Parse netloc into username, password, and host.
github.com => None, None, "@github.com"
username@github.com => "username", None, "github.com"
username:password@github.com => "username", "password", "github.com"
"""
parts = netloc.split("@", 1)
if len(parts) == 1:
return None, None, parts[0]
auth, host = parts
parts = auth.split(":", 1)
if len(parts) == 1:
return parts[0], None, host
return parts[0], parts[1], host
@dataclass
class GitReference:
def __init__(self, remote: str, ref: Optional[str] = None) -> None:
"""Initialize a reference from a remote and ref.
Arguments:
remote: A remote URL or URI.
ref: A branch, tag, or commit hash.
"""
self.uri = remote
self.ref = ref
@property
def url(self) -> Optional[str]:
return self.uri
def fetch(self, dst_dir: str) -> None:
"""Fetch the repo into dst_dir and refine githubref based on what we learn."""
# We defer importing git until the last moment, because the import requires that the git
# executable is available on the PATH, so we only want to fail if we actually need it.
import git # type: ignore
repo = git.Repo.init(dst_dir)
self.path = repo.working_dir
origin = repo.create_remote("origin", self.uri or "")
try:
# We fetch the origin so that we have branch and tag references
origin.fetch()
except git.exc.GitCommandError as e:
raise LaunchError(
f"Unable to fetch from git remote repository {self.url}:\n{e}"
)
ref: Union[git.RemoteReference, str]
if self.ref:
if self.ref in origin.refs:
ref = origin.refs[self.ref]
else:
ref = self.ref
head = repo.create_head(self.ref, ref)
head.checkout()
self.commit_hash = head.commit.hexsha
else:
# TODO: Is there a better way to do this?
default_branch = None
for ref in repo.references:
if hasattr(ref, "tag"): # Skip tag references
continue
refname = ref.name
if refname.startswith("origin/"): # Trim off "origin/"
refname = refname[7:]
if refname == "main":
default_branch = "main"
break
if refname == "master":
default_branch = "master"
# Keep looking in case we also have a main, which we let take precedence
# (While the references appear to be sorted, not clear if that's guaranteed.)
if not default_branch:
raise LaunchError(
f"Unable to determine branch or commit to checkout from {self.url}"
)
self.default_branch = default_branch
self.ref = default_branch
head = repo.create_head(default_branch, origin.refs[default_branch])
head.checkout()
self.commit_hash = head.commit.hexsha
repo.submodule_update(init=True, recursive=True)