|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
""" |
|
Hashing function for dataset keys using `hashlib.md5`

Requirements for the hash function:

- Provides a uniformly distributed hash from random space
- Is adequately fast
- Works with multiple input types (in this case, `str`, `int` or `bytes`)
- Is platform independent (generates the same hash on different OSes and systems)

The hashing function provides a 128-bit integer hash of the key provided.

The split name is used here as the hash salt, so that identical keys in
different splits do not produce identical hashes.
|
""" |
|
|
|
from typing import Union |
|
|
|
from huggingface_hub.utils import insecure_hashlib |
|
|
|
|
|
def _as_bytes(hash_data: Union[str, int, bytes, bytearray]) -> bytes:
    """
    Returns the input hash_data in its bytes form

    Conversion rules:

    - `bytes` is returned unchanged.
    - `bytearray` is copied into an immutable `bytes` object, so the result
      always matches the declared return type.
    - `str` has every backslash replaced by `/` and is then UTF-8 encoded.
    - `int` is converted with `str()` and then UTF-8 encoded. `bool` is a
      subclass of `int`, so `True`/`False` become `b"True"`/`b"False"`.

    Args:
        hash_data: the hash salt/key to be converted to bytes

    Returns:
        The `bytes` representation of `hash_data`.

    Raises:
        InvalidKeyError: if `hash_data` is not a `str`, `int`, `bytes` or
            `bytearray`.
    """
    if isinstance(hash_data, bytes):
        return hash_data
    if isinstance(hash_data, bytearray):
        return bytes(hash_data)

    if isinstance(hash_data, str):
        # NOTE(review): presumably this makes Windows-style paths hash the
        # same as POSIX-style ones (platform independence) - confirm.
        text = hash_data.replace("\\", "/")
    elif isinstance(hash_data, int):
        text = str(hash_data)
    else:
        raise InvalidKeyError(hash_data)

    return text.encode("utf-8")
|
|
|
|
|
class InvalidKeyError(Exception):
    """Raises an error when given key is of invalid datatype.

    The final message is the concatenation of three parts, each also kept as
    an attribute on the instance:

    - `prefix`: fixed headline of the failure.
    - `err_msg`: the offending key and its type.
    - `suffix`: reminder of the key types named in the message.
    """

    def __init__(self, hash_data):
        """Build the error message for a rejected key.

        Args:
            hash_data: the key that could not be converted; it is interpolated
                into the message together with `type(hash_data)`.
        """
        self.prefix = "\nFAILURE TO GENERATE DATASET: Invalid key type detected"
        self.err_msg = f"\nFound Key {hash_data} of type {type(hash_data)}"
        self.suffix = "\nKeys should be either str, int or bytes type"
        super().__init__(f"{self.prefix}{self.err_msg}{self.suffix}")
|
|
|
|
|
class DuplicatedKeysError(Exception):
    """Raise an error when duplicate key found.

    Attributes set on the instance:

    - `key`: the key that occurred more than once.
    - `duplicate_key_indices`: the indices passed by the caller, unmodified.
    - `fix_msg`: optional hint on how to fix the problem.
    - `prefix`, `err_msg`, `suffix`: the three parts concatenated into the
      final exception message.
    """

    def __init__(self, key, duplicate_key_indices, fix_msg=""):
        """Build the error message for a duplicated key.

        Args:
            key: the duplicated key.
            duplicate_key_indices: sized collection of the example indices
                sharing `key`. Items may be strings or any other objects (for
                example ints); each is rendered with `str()`. Only the first
                20 are listed when there are more than 20.
            fix_msg: optional extra line appended to the message.
        """
        self.key = key
        self.duplicate_key_indices = duplicate_key_indices
        self.fix_msg = fix_msg
        self.prefix = "Found multiple examples generated with the same key"
        # `str.join` only accepts strings, so stringify every index first;
        # this keeps pre-formatted string indices unchanged and accepts ints.
        if len(duplicate_key_indices) <= 20:
            listed = ", ".join(map(str, duplicate_key_indices))
            self.err_msg = f"\nThe examples at index {listed} have the key {key}"
        else:
            listed = ", ".join(map(str, duplicate_key_indices[:20]))
            remaining = len(duplicate_key_indices) - 20
            self.err_msg = f"\nThe examples at index {listed}... ({remaining} more) have the key {key}"
        self.suffix = ("\n" + fix_msg) if fix_msg else ""
        super().__init__(f"{self.prefix}{self.err_msg}{self.suffix}")

    def __reduce__(self):
        """Pickle with the original constructor arguments.

        The default exception reduction replays `self.args` (the single
        formatted message) into `__init__`, which needs at least two
        arguments, so unpickling would fail with a `TypeError`.
        """
        return (type(self), (self.key, self.duplicate_key_indices, self.fix_msg))
|
|
|
|
|
class KeyHasher:
    """KeyHasher class for providing hash using md5

    The salt is fed into an md5 object once, at construction. Every call to
    `hash` works on a copy of that salted state, so the stored state is never
    modified by a key and the same key gives the same result on every call.
    """

    def __init__(self, hash_salt: str):
        """Create a hasher whose md5 state is pre-seeded with `hash_salt`.

        Args:
            hash_salt: the salt, converted to bytes with `_as_bytes` before
                being hashed.

        Raises:
            InvalidKeyError: propagated from `_as_bytes` when `hash_salt` has
                an unsupported type.
        """
        self._split_md5 = insecure_hashlib.md5(_as_bytes(hash_salt))

    def hash(self, key: Union[str, int, bytes]) -> int:
        """Returns 128-bits hash of input key

        Args:
            key: the input key to be hashed (should be str, int or bytes)

        Returns: 128-bit int hash key

        Raises:
            InvalidKeyError: propagated from `_as_bytes` when `key` has an
                unsupported type.
        """
        # Work on a copy so the salted state can be reused for the next key.
        md5 = self._split_md5.copy()
        md5.update(_as_bytes(key))
        # Parse the hexadecimal digest as a base-16 integer.
        return int(md5.hexdigest(), 16)
|
|