jamtur01's picture
Upload folder using huggingface_hub
9c6594c verified
import asyncio
import sys
import zlib
from concurrent.futures import Executor
from typing import Any, Final, Optional, Protocol, TypedDict, cast
if sys.version_info >= (3, 12):
from collections.abc import Buffer
else:
from typing import Union
Buffer = Union[bytes, bytearray, "memoryview[int]", "memoryview[bytes]"]
try:
try:
import brotlicffi as brotli
except ImportError:
import brotli
HAS_BROTLI = True
except ImportError: # pragma: no cover
HAS_BROTLI = False
MAX_SYNC_CHUNK_SIZE = 1024
class ZLibCompressObjProtocol(Protocol):
def compress(self, data: Buffer) -> bytes: ...
def flush(self, mode: int = ..., /) -> bytes: ...
class ZLibDecompressObjProtocol(Protocol):
def decompress(self, data: Buffer, max_length: int = ...) -> bytes: ...
def flush(self, length: int = ..., /) -> bytes: ...
@property
def eof(self) -> bool: ...
class ZLibBackendProtocol(Protocol):
MAX_WBITS: int
Z_FULL_FLUSH: int
Z_SYNC_FLUSH: int
Z_BEST_SPEED: int
Z_FINISH: int
def compressobj(
self,
level: int = ...,
method: int = ...,
wbits: int = ...,
memLevel: int = ...,
strategy: int = ...,
zdict: Optional[Buffer] = ...,
) -> ZLibCompressObjProtocol: ...
def decompressobj(
self, wbits: int = ..., zdict: Buffer = ...
) -> ZLibDecompressObjProtocol: ...
def compress(
self, data: Buffer, /, level: int = ..., wbits: int = ...
) -> bytes: ...
def decompress(
self, data: Buffer, /, wbits: int = ..., bufsize: int = ...
) -> bytes: ...
class CompressObjArgs(TypedDict, total=False):
wbits: int
strategy: int
level: int
class ZLibBackendWrapper:
def __init__(self, _zlib_backend: ZLibBackendProtocol):
self._zlib_backend: ZLibBackendProtocol = _zlib_backend
@property
def name(self) -> str:
return getattr(self._zlib_backend, "__name__", "undefined")
@property
def MAX_WBITS(self) -> int:
return self._zlib_backend.MAX_WBITS
@property
def Z_FULL_FLUSH(self) -> int:
return self._zlib_backend.Z_FULL_FLUSH
@property
def Z_SYNC_FLUSH(self) -> int:
return self._zlib_backend.Z_SYNC_FLUSH
@property
def Z_BEST_SPEED(self) -> int:
return self._zlib_backend.Z_BEST_SPEED
@property
def Z_FINISH(self) -> int:
return self._zlib_backend.Z_FINISH
def compressobj(self, *args: Any, **kwargs: Any) -> ZLibCompressObjProtocol:
return self._zlib_backend.compressobj(*args, **kwargs)
def decompressobj(self, *args: Any, **kwargs: Any) -> ZLibDecompressObjProtocol:
return self._zlib_backend.decompressobj(*args, **kwargs)
def compress(self, data: Buffer, *args: Any, **kwargs: Any) -> bytes:
return self._zlib_backend.compress(data, *args, **kwargs)
def decompress(self, data: Buffer, *args: Any, **kwargs: Any) -> bytes:
return self._zlib_backend.decompress(data, *args, **kwargs)
# Everything not explicitly listed in the Protocol we just pass through
def __getattr__(self, attrname: str) -> Any:
return getattr(self._zlib_backend, attrname)
ZLibBackend: ZLibBackendWrapper = ZLibBackendWrapper(zlib)
def set_zlib_backend(new_zlib_backend: ZLibBackendProtocol) -> None:
ZLibBackend._zlib_backend = new_zlib_backend
def encoding_to_mode(
encoding: Optional[str] = None,
suppress_deflate_header: bool = False,
) -> int:
if encoding == "gzip":
return 16 + ZLibBackend.MAX_WBITS
return -ZLibBackend.MAX_WBITS if suppress_deflate_header else ZLibBackend.MAX_WBITS
class ZlibBaseHandler:
def __init__(
self,
mode: int,
executor: Optional[Executor] = None,
max_sync_chunk_size: Optional[int] = MAX_SYNC_CHUNK_SIZE,
):
self._mode = mode
self._executor = executor
self._max_sync_chunk_size = max_sync_chunk_size
class ZLibCompressor(ZlibBaseHandler):
def __init__(
self,
encoding: Optional[str] = None,
suppress_deflate_header: bool = False,
level: Optional[int] = None,
wbits: Optional[int] = None,
strategy: Optional[int] = None,
executor: Optional[Executor] = None,
max_sync_chunk_size: Optional[int] = MAX_SYNC_CHUNK_SIZE,
):
super().__init__(
mode=(
encoding_to_mode(encoding, suppress_deflate_header)
if wbits is None
else wbits
),
executor=executor,
max_sync_chunk_size=max_sync_chunk_size,
)
self._zlib_backend: Final = ZLibBackendWrapper(ZLibBackend._zlib_backend)
kwargs: CompressObjArgs = {}
kwargs["wbits"] = self._mode
if strategy is not None:
kwargs["strategy"] = strategy
if level is not None:
kwargs["level"] = level
self._compressor = self._zlib_backend.compressobj(**kwargs)
self._compress_lock = asyncio.Lock()
def compress_sync(self, data: bytes) -> bytes:
return self._compressor.compress(data)
async def compress(self, data: bytes) -> bytes:
"""Compress the data and returned the compressed bytes.
Note that flush() must be called after the last call to compress()
If the data size is large than the max_sync_chunk_size, the compression
will be done in the executor. Otherwise, the compression will be done
in the event loop.
"""
async with self._compress_lock:
# To ensure the stream is consistent in the event
# there are multiple writers, we need to lock
# the compressor so that only one writer can
# compress at a time.
if (
self._max_sync_chunk_size is not None
and len(data) > self._max_sync_chunk_size
):
return await asyncio.get_running_loop().run_in_executor(
self._executor, self._compressor.compress, data
)
return self.compress_sync(data)
def flush(self, mode: Optional[int] = None) -> bytes:
return self._compressor.flush(
mode if mode is not None else self._zlib_backend.Z_FINISH
)
class ZLibDecompressor(ZlibBaseHandler):
def __init__(
self,
encoding: Optional[str] = None,
suppress_deflate_header: bool = False,
executor: Optional[Executor] = None,
max_sync_chunk_size: Optional[int] = MAX_SYNC_CHUNK_SIZE,
):
super().__init__(
mode=encoding_to_mode(encoding, suppress_deflate_header),
executor=executor,
max_sync_chunk_size=max_sync_chunk_size,
)
self._zlib_backend: Final = ZLibBackendWrapper(ZLibBackend._zlib_backend)
self._decompressor = self._zlib_backend.decompressobj(wbits=self._mode)
def decompress_sync(self, data: bytes, max_length: int = 0) -> bytes:
return self._decompressor.decompress(data, max_length)
async def decompress(self, data: bytes, max_length: int = 0) -> bytes:
"""Decompress the data and return the decompressed bytes.
If the data size is large than the max_sync_chunk_size, the decompression
will be done in the executor. Otherwise, the decompression will be done
in the event loop.
"""
if (
self._max_sync_chunk_size is not None
and len(data) > self._max_sync_chunk_size
):
return await asyncio.get_running_loop().run_in_executor(
self._executor, self._decompressor.decompress, data, max_length
)
return self.decompress_sync(data, max_length)
def flush(self, length: int = 0) -> bytes:
return (
self._decompressor.flush(length)
if length > 0
else self._decompressor.flush()
)
@property
def eof(self) -> bool:
return self._decompressor.eof
class BrotliDecompressor:
# Supports both 'brotlipy' and 'Brotli' packages
# since they share an import name. The top branches
# are for 'brotlipy' and bottom branches for 'Brotli'
def __init__(self) -> None:
if not HAS_BROTLI:
raise RuntimeError(
"The brotli decompression is not available. "
"Please install `Brotli` module"
)
self._obj = brotli.Decompressor()
def decompress_sync(self, data: bytes) -> bytes:
if hasattr(self._obj, "decompress"):
return cast(bytes, self._obj.decompress(data))
return cast(bytes, self._obj.process(data))
def flush(self) -> bytes:
if hasattr(self._obj, "flush"):
return cast(bytes, self._obj.flush())
return b""