import copy
from dataclasses import dataclass
from enum import Enum
import json
from pathlib import Path
import sys
from typing import Any, Dict, List, Optional, Tuple, Union

import torch
from torch import Tensor

from .pygit import PyGit
from .sha1_store import SHA1_Store


# Name of the directory inside .wgit that holds the SHA1 object store.
SHA1_STORE_DIR_NAME = "sha1_store"

# Keys used in the per-file metadata JSON files.
SHA1_KEY = "SHA1"
LAST_MODIFIED_TS_KEY = "last_modified_time_stamp"
REL_PATH_KEY = "file_path"


class RepoStatus(Enum):
    """Repo statuses."""

    CLEAN = 1
    CHANGES_NOT_ADDED = 2
    CHANGES_ADDED_NOT_COMMITED = 3


@dataclass
class SizeInfo:
    """Size info for a file or for the whole repo, in bytes.

    Deduplication cannot be disabled, so the deduped size is always computed.

    Both sparsify and gzip are optional. If both are enabled, they are applied
    in the following order:

        sparsify -> gzip

    Therefore, original >= deduped >= sparsified >= gzipped.
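
    For example (hypothetical numbers): a 100 MB checkpoint might measure
    80 MB deduped, 30 MB sparsified, and 18 MB gzipped.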
    """

    original: int
    deduped: int
    sparsified: int
    gzipped: int


@dataclass
class _SHA1_Tensor:
    """Represents a tensor using SHA1(s) from the SHA1 store.

    It can be either a single dense tensor or two sparse ones (SST and DST).
    """

    is_dense: bool = True
    dense_sha1: str = ""
    sst_sha1: str = ""
    dst_sha1: str = ""


def _recursive_apply_to_elements(data: Union[List[Any], Dict[str, Any]], fn: Any, names: List[str]) -> None:
    """Helper function that traverses a dict or list recursively and applies a function to each leaf.

    Args:
        data (dict or list):
            A dict or a list; its nested containers must be dicts and lists only.
        fn (Any):
            A callback function applied to each leaf element. Signature:
            fn(element: Any, names: List[str]) -> Any
        names (list):
            Stack of names used to build each element's path.
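
    Example::

        # A hypothetical callback that records the dotted path of every leaf.
        data = {"a": [1, 2], "b": {"c": 3}}
        paths = []

        def record(element, names):
            paths.append(".".join(names))
            return element

        _recursive_apply_to_elements(data, record, [])
        assert paths == ["a.0", "a.1", "b.c"]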
    """
    if isinstance(data, list):
        for i, _ in enumerate(data):
            names.append(str(i))
            if isinstance(data[i], (list, dict)):
                _recursive_apply_to_elements(data[i], fn, names)
            else:
                data[i] = fn(data[i], names)
            names.pop()
    elif isinstance(data, dict):
        for key in data.keys():
            names.append(str(key))
            if isinstance(data[key], (list, dict)):
                _recursive_apply_to_elements(data[key], fn, names)
            else:
                data[key] = fn(data[key], names)
            names.pop()
    else:
        assert False, f"Unexpected data type: {type(data)}"


class Repo:
    """
    Represents the WeiGit repo for tracking neural network weights and their versions.

    A WeiGit repo is like a git repo. It is a dir, in which a .wgit dir exists to keep
    track of the content.

    Args:
        parent_dir (Path, str):
            Parent dir in which to make or to load a .wgit dir.
            Default: "", which means CWD.
        init (bool, optional):
            - If ``True``, initializes a new WeiGit repo in the parent_dir. Initialization
              creates a `.wgit` directory within the <parent_dir>, triggers an initialization
              of a sha1_store in the ./<parent_dir>/.wgit directory, and makes the
              ./<parent_dir>/.wgit a git repository through git initialization.
            - If ``False``, a new WeiGit repo is not initialized and the existing repo is
              wrapped, populating the `_wgit_parent` and other internal attributes.
            - Default: False
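
    Example::

        # A hypothetical session; the paths are illustrative.
        repo = Repo("/tmp/my_checkpoints", init=True)
        repo.add("/tmp/my_checkpoints/model.pt")
        repo.commit("first version")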
    """

    def __init__(self, parent_dir: Union[Path, str] = "", init: bool = False) -> None:
        # Default to the current working directory when no parent_dir is given.
        self._wgit_parent = Path(parent_dir if parent_dir != "" else Path.cwd())

        # Look for an existing repo, searching upward from _wgit_parent.
        self._dot_wgit_dir_path: Optional[Path] = None
        exists = self._recursive_search_and_may_init_dot_wgit_dir_path(self._wgit_parent)

        if not exists and init:
            # No repo found; create a new one.
            self._dot_wgit_dir_path = self._wgit_parent.joinpath(".wgit")
            self._dot_wgit_dir_path.mkdir(parents=False, exist_ok=True)

            # Initialize the SHA1 object store.
            self._sha1_store = SHA1_Store(self._dot_wgit_dir_path.joinpath(SHA1_STORE_DIR_NAME), init=True)

            # Make .wgit a git repo; the sha1 store itself is excluded from git tracking.
            self._pygit = PyGit(self._dot_wgit_dir_path, gitignore=[SHA1_STORE_DIR_NAME])

        elif exists:
            # A repo exists: wrap it.
            assert self._dot_wgit_dir_path is not None
            self._sha1_store = SHA1_Store(self._dot_wgit_dir_path.joinpath(SHA1_STORE_DIR_NAME))
            self._pygit = PyGit(self._dot_wgit_dir_path)

        else:
            # No repo exists and init was not requested.
            sys.stderr.write("fatal: not a wgit repository!\n")
            sys.exit(1)

        self._sanity_check()

    def _recursive_search_and_may_init_dot_wgit_dir_path(self, check_dir: Path) -> bool:
        """Search for a wgit repo's top-level dir, starting potentially from a subdir of a repo.

        This may set self._dot_wgit_dir_path if a repo is found.

        Args:
            check_dir (Path):
                Path to the directory from where the search is started.

        Returns:
            Returns True if a repo is found.
        """
        assert self._dot_wgit_dir_path is None, f"_dot_wgit_dir_path is already set to {self._dot_wgit_dir_path}"
        if self._weigit_repo_exists(check_dir):
            self._dot_wgit_dir_path = check_dir.joinpath(".wgit")
        else:
            # Walk up the directory tree until the filesystem root.
            root = Path(check_dir.parts[0])
            while check_dir != root:
                check_dir = check_dir.parent
                if self._weigit_repo_exists(check_dir):
                    self._dot_wgit_dir_path = check_dir.joinpath(".wgit")
                    break
        return self._dot_wgit_dir_path is not None

    def _weigit_repo_exists(self, check_dir: Path) -> bool:
        """Returns True if a valid WeiGit repo exists in the path: check_dir."""
        wgit_exists, git_exists, gitignore_exists = self._weigit_repo_file_check(check_dir)
        return wgit_exists and git_exists and gitignore_exists

    def _weigit_repo_file_check(self, check_dir: Path) -> Tuple[bool, bool, bool]:
        """Returns a tuple of booleans indicating the existence of each file
        that a .wgit repo requires internally.
        """
        wgit_exists = check_dir.joinpath(".wgit").exists()
        git_exists = check_dir.joinpath(".wgit/.git").exists()
        gitignore_exists = check_dir.joinpath(".wgit/.gitignore").exists()
        return wgit_exists, git_exists, gitignore_exists

    def _sanity_check(self) -> None:
        """Helper to check that the on-disk state matches what we expect."""
        if not self._weigit_repo_exists(self._wgit_parent):
            sys.stderr.write("fatal: no wgit repo exists!\n")
            sys.exit(1)

    def add(
        self,
        in_file_path: str,
        per_tensor: bool = True,
        gzip: bool = True,
        sparsify: bool = False,
        sparsify_policy: Any = None,
    ) -> Optional[Dict[Any, Any]]:
        """Add a file to the wgit repo.

        This could be a new file or a modified file. Adding an unmodified, existing
        file is allowed but is a noop.

        Args:
            in_file_path (str):
                Path to the file to be added.
            per_tensor (bool, optional):
                Add the file in a per-tensor fashion. This enables more deduplication
                when tensors are identical. Deduplication cannot be disabled
                completely because we use a content-addressable SHA1_Store class.
                Default: True
            gzip (bool, optional):
                Enable gzip based lossless compression on the object being added.
                Default: True
            sparsify (bool, optional):
                Enable sparsification of the tensors, which modifies the values
                of all or some tensors, i.e. lossy compression.
                Default: False
            sparsify_policy (Any):
                TODO (Min): need to add a callback function to control which tensors
                to sparsify and how.
                Default: None

        Returns:
            (Dict, optional)
                None if the content was added without lossy modification.
                Otherwise, returns a state_dict that contains the modified tensors
                to be loaded back into the model, which means those tensors are
                dense, not SST and DST tensors.
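
        Example::

            # Hypothetical: add a checkpoint per-tensor, then restore the
            # (possibly lossy-modified) state_dict into the model.
            state_dict = repo.add("model.pt", per_tensor=True, sparsify=False)
            if state_dict is not None:
                model.load_state_dict(state_dict)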
        """
        self._sanity_check()

        if sparsify and not per_tensor:
            raise ValueError("Sparsity is only supported when per_tensor is True")

        # Create the metadata file for this file, or load it if it already exists.
        file_path = Path(in_file_path)
        rel_file_path = self._rel_file_path(file_path)
        metadata_file = self._process_metadata_file(rel_file_path)

        # In per-tensor mode, we add a state_dict instead of the file itself.
        ret_state_dict = None
        file_path_or_state_dict: Union[Path, Dict] = file_path

        if per_tensor:

            def fn(element: Any, names: List[str]) -> Any:
                """Callback on each leaf object for _recursive_apply_to_elements below."""
                if isinstance(element, Tensor):
                    if sparsify:
                        # Sparsification (SST/DST) is not yet implemented.
                        raise NotImplementedError()
                    sha1 = self._sha1_store.add(element, compress=gzip, name=".".join(names))
                    return _SHA1_Tensor(is_dense=True, dense_sha1=sha1)
                else:
                    return element

            # Load the checkpoint, keep a copy to return, and replace each
            # tensor leaf with its SHA1 reference.
            state_dict = torch.load(file_path)
            ret_state_dict = copy.deepcopy(state_dict)
            _recursive_apply_to_elements(state_dict, fn, [])
            file_path_or_state_dict = state_dict

        # Add the file or the SHA1-referenced state_dict to the sha1 store.
        sha1 = self._sha1_store.add(file_path_or_state_dict, compress=gzip)

        # Write the metadata file and stage the change in the underlying git repo.
        self._write_metadata(metadata_file, file_path, sha1)
        self._pygit.add()

        return ret_state_dict

    def commit(self, message: str) -> None:
        """Commits staged changes to the repo.

        Args:
            message (str):
                The commit message to be added.
        """
        self._sanity_check()
        self._pygit.commit(message)

    def size_info(self, path: Optional[str] = None) -> SizeInfo:
        """Get size info for a file or the whole repo.

        For the whole repo, this just calls size_info from the sha1_store.

        For a file, we need to open the metadata, find the sha1, and then, for a
        per_tensor state_dict, collect size info on all its objects.

        TODO (Min): with delta encoding and deduplication between objects, it is
                    not clear this can be computed precisely.

        Args:
            path (str, optional):
                File path for the query. If None, return the whole repo's info.
                Default: None

        Returns:
            (SizeInfo):
                The dataclass that contains the size info.
        """
        raise NotImplementedError()

    def status(self) -> Dict[str, RepoStatus]:
        """Show the state of the weigit working tree.

        The state can be:
            1. dirty, with changes/modifications not added to the weigit repo.
            2. dirty, with file changes added but not committed.
            3. clean, tracking files after a change has been committed,
               or clean with an empty repo.

        TODO (Min): this needs to return the repo status plus dirty files and
                    untracked files too.

        Returns:
            (dict):
                A dict keyed with files and their status.
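
        Example::

            # Hypothetical output after `repo.add(...)` but before `repo.commit(...)`:
            # {"model.pt": <RepoStatus.CHANGES_ADDED_NOT_COMMITED: 3>}
            print(repo.status())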
        """
        self._sanity_check()

        # Compare the per-file metadata state with the underlying git status.
        pygit_status = self._pygit.status()
        status = self._get_metadata_files()
        if status:
            out_status = dict()
            for metadata_file, is_modified in status.items():
                # Determine the status of each tracked file.
                if is_modified:
                    out_status[str(metadata_file)] = RepoStatus.CHANGES_NOT_ADDED
                elif not is_modified and metadata_file in pygit_status.keys():
                    out_status[str(metadata_file)] = RepoStatus.CHANGES_ADDED_NOT_COMMITED
                elif not is_modified and metadata_file not in pygit_status.keys():
                    out_status[str(metadata_file)] = RepoStatus.CLEAN
            return out_status
        else:
            # No file is tracked yet: the repo is clean.
            return {"": RepoStatus.CLEAN}

    def log(self, file: str = "") -> None:
        """Print the WeiGit log of the commit history.

        Args:
            file (str, optional):
                If given, show the log history of this specific file only.
                Default: "", which means show the log of the whole repo.
        """
        self._sanity_check()

        # Placeholder output; real log retrieval is not implemented yet.
        if file:
            print(f"wgit log of the file: {file}")
        else:
            print("wgit log")

    def checkout(self, sha1: str) -> None:
        """Checkout a previously committed version of the checkpoint.

        Args:
            sha1 (str):
                The sha1 hash of the file version to checkout.
        """
        self._sanity_check()
        raise NotImplementedError()

    def checkout_by_steps(self) -> None:
        """Not implemented: checkout by the step count of the training process."""
        self._sanity_check()
        raise NotImplementedError()

    def _get_metadata_files(self) -> Dict[str, bool]:
        """Walk the directories that contain the metadata files and check the
        status of those files, i.e. whether they have been modified or not.

        The returned Dict[str, bool] maps each file path (as a string) to whether
        that file is_modified.
        """
        metadata_d = dict()
        for file in self._dot_wgit_dir_path.iterdir():
            # Anything under .wgit other than the sha1 store and the git state
            # is considered metadata.
            if file.name not in {SHA1_STORE_DIR_NAME, ".git", ".gitignore"}:
                for path in file.rglob("*"):
                    if path.is_file():
                        rel_path = str(path.relative_to(self._dot_wgit_dir_path))
                        metadata_d[rel_path] = self._is_file_modified(path)
        return metadata_d

    def _is_metadata_file(self, file: Path) -> bool:
        """Checks whether a file is a valid metadata file by matching its keys
        and checking that it contains valid JSON data.
        """
        try:
            with open(file) as f:
                metadata = json.load(f)
            is_metadata = set(metadata.keys()) == {SHA1_KEY, LAST_MODIFIED_TS_KEY, REL_PATH_KEY}
        except json.JSONDecodeError:
            # Not valid JSON, hence not a metadata file.
            return False
        return is_metadata

    def _is_file_modified(self, file: Path) -> bool:
        """Checks whether a file has been modified since the last modification
        time recorded in its metadata file.
        """
        with open(file) as f:
            data = json.load(f)

        # Compare the recorded timestamp with the tracked file's current
        # modification time; any difference means the file was modified.
        last_mod_timestamp = data[LAST_MODIFIED_TS_KEY]
        curr_mod_timestamp = Path(data[REL_PATH_KEY]).stat().st_mtime
        return curr_mod_timestamp != last_mod_timestamp

    def _process_metadata_file(self, metadata_fname: Path) -> Path:
        """Create a metadata file corresponding to the file to be tracked by weigit
        when the first version of the file is encountered. If a version already
        exists, open the file and get the sha1_hash of the last version as parent_sha1.
        """
        metadata_file = self._dot_wgit_dir_path.joinpath(metadata_fname)
        metadata_file.parent.mkdir(parents=True, exist_ok=True)

        if not metadata_file.exists() or not metadata_file.stat().st_size:
            metadata_file.touch()
        else:
            with open(metadata_file, "r") as f:
                # Note: the last version's metadata is loaded here, but the
                # parent sha1 it contains is not consumed yet.
                ref_data = json.load(f)
        return metadata_file

    def _write_metadata(self, metadata_file: Path, file_path: Path, sha1: str) -> None:
        """Write metadata to the metadata file."""
        change_time = Path(file_path).stat().st_mtime
        metadata = {
            SHA1_KEY: sha1,
            LAST_MODIFIED_TS_KEY: change_time,
            REL_PATH_KEY: str(file_path),
        }
        with open(metadata_file, "w", encoding="utf-8") as f:
            json.dump(metadata, f, ensure_ascii=False, indent=4)

    def _rel_file_path(self, filepath: Path) -> Path:
        """Find the part of the filepath relative to the current working
        directory and return that relative path.
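
        Example::

            # Hypothetical paths: with cwd == /data/runs,
            # _rel_file_path(Path("/data/runs/exp1/model.pt"))
            # returns Path("exp1/model.pt").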
        """
        # Get the absolute path.
        filepath = filepath.resolve()
        # Find the longest common prefix with cwd; this assumes the file lives
        # under the current working directory.
        for i, (x, y) in enumerate(zip(filepath.parts, Path.cwd().parts)):
            pass
        # Return the relative part, i.e. the path not common with cwd.
        return Path(*filepath.parts[i + 1 :])