import importlib
import importlib.metadata
import importlib.util
import logging
import os
import platform
from pathlib import Path
from typing import Optional

from huggingface_hub import constants
from packaging import version


logger = logging.getLogger(__name__.split(".", 1)[0])  # the top-level "datasets" logger
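
# Datasets endpoints (S3 bucket, CloudFront CDN, and the GitHub repository)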
S3_DATASETS_BUCKET_PREFIX = "https://s3.amazonaws.com/datasets.huggingface.co/datasets/datasets"
CLOUDFRONT_DATASETS_DISTRIB_PREFIX = "https://cdn-datasets.huggingface.co/datasets/datasets"
REPO_DATASETS_URL = "https://raw.githubusercontent.com/huggingface/datasets/{revision}/datasets/{path}/{name}"
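
# Hub endpoints, configurable via the HF_ENDPOINT environment variable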
HF_ENDPOINT = os.environ.get("HF_ENDPOINT", "https://huggingface.co")
HUB_DATASETS_URL = HF_ENDPOINT + "/datasets/{repo_id}/resolve/{revision}/{path}"
HUB_DATASETS_HFFS_URL = "hf://datasets/{repo_id}@{revision}/{path}"
HUB_DEFAULT_VERSION = "main"

PY_VERSION = version.parse(platform.python_version())
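
# Accepted string values for boolean-like environment variables
# (e.g. "YES" counts as true, "AUTO" as "true or auto", "0" as false)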
ENV_VARS_TRUE_VALUES = {"1", "ON", "YES", "TRUE"}
ENV_VARS_FALSE_VALUES = {"0", "OFF", "NO", "FALSE"}
ENV_VARS_TRUE_AND_AUTO_VALUES = ENV_VARS_TRUE_VALUES.union({"AUTO"})
ENV_VARS_FALSE_AND_AUTO_VALUES = ENV_VARS_FALSE_VALUES.union({"AUTO"})
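
# Versions of key dependencies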
DILL_VERSION = version.parse(importlib.metadata.version("dill"))
FSSPEC_VERSION = version.parse(importlib.metadata.version("fsspec"))
PANDAS_VERSION = version.parse(importlib.metadata.version("pandas"))
PYARROW_VERSION = version.parse(importlib.metadata.version("pyarrow"))
HF_HUB_VERSION = version.parse(importlib.metadata.version("huggingface_hub"))
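
# Optional ML framework integrations, controlled by the USE_TF / USE_TORCH / USE_JAX
# environment variables ("AUTO", the default, enables a framework only if it is installed)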
USE_TF = os.environ.get("USE_TF", "AUTO").upper()
USE_TORCH = os.environ.get("USE_TORCH", "AUTO").upper()
USE_JAX = os.environ.get("USE_JAX", "AUTO").upper()

TORCH_VERSION = "N/A"
TORCH_AVAILABLE = False

if USE_TORCH in ENV_VARS_TRUE_AND_AUTO_VALUES and USE_TF not in ENV_VARS_TRUE_VALUES:
    TORCH_AVAILABLE = importlib.util.find_spec("torch") is not None
    if TORCH_AVAILABLE:
        try:
            TORCH_VERSION = version.parse(importlib.metadata.version("torch"))
            logger.info(f"PyTorch version {TORCH_VERSION} available.")
        except importlib.metadata.PackageNotFoundError:
            pass
else:
    logger.info("Disabling PyTorch because USE_TF is set")
POLARS_VERSION = "N/A"
POLARS_AVAILABLE = importlib.util.find_spec("polars") is not None

if POLARS_AVAILABLE:
    try:
        POLARS_VERSION = version.parse(importlib.metadata.version("polars"))
        logger.info(f"Polars version {POLARS_VERSION} available.")
    except importlib.metadata.PackageNotFoundError:
        pass
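
# Optional DuckDB support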
DUCKDB_VERSION = "N/A"
DUCKDB_AVAILABLE = importlib.util.find_spec("duckdb") is not None

if DUCKDB_AVAILABLE:
    try:
        DUCKDB_VERSION = version.parse(importlib.metadata.version("duckdb"))
        logger.info(f"DuckDB version {DUCKDB_VERSION} available.")
    except importlib.metadata.PackageNotFoundError:
        pass
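
# Optional TensorFlow support (TensorFlow 2 or later is required)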
TF_VERSION = "N/A"
TF_AVAILABLE = False

if USE_TF in ENV_VARS_TRUE_AND_AUTO_VALUES and USE_TORCH not in ENV_VARS_TRUE_VALUES:
    TF_AVAILABLE = importlib.util.find_spec("tensorflow") is not None
    if TF_AVAILABLE:
        # TensorFlow ships under several PyPI distribution names; take the
        # version from whichever one is installed.
        for package in [
            "tensorflow",
            "tensorflow-cpu",
            "tensorflow-gpu",
            "tf-nightly",
            "tf-nightly-cpu",
            "tf-nightly-gpu",
            "intel-tensorflow",
            "tensorflow-rocm",
            "tensorflow-macos",
        ]:
            try:
                TF_VERSION = version.parse(importlib.metadata.version(package))
            except importlib.metadata.PackageNotFoundError:
                continue
            else:
                break
        else:
            TF_AVAILABLE = False
    if TF_AVAILABLE:
        if TF_VERSION.major < 2:
            logger.info(f"TensorFlow found but with version {TF_VERSION}. `datasets` requires version 2 minimum.")
            TF_AVAILABLE = False
        else:
            logger.info(f"TensorFlow version {TF_VERSION} available.")
else:
    logger.info("Disabling TensorFlow because USE_TORCH is set")
JAX_VERSION = "N/A"
JAX_AVAILABLE = False

if USE_JAX in ENV_VARS_TRUE_AND_AUTO_VALUES:
    JAX_AVAILABLE = importlib.util.find_spec("jax") is not None and importlib.util.find_spec("jaxlib") is not None
    if JAX_AVAILABLE:
        try:
            JAX_VERSION = version.parse(importlib.metadata.version("jax"))
            logger.info(f"JAX version {JAX_VERSION} available.")
        except importlib.metadata.PackageNotFoundError:
            pass
else:
    logger.info("Disabling JAX because USE_JAX is set to False")
SQLALCHEMY_AVAILABLE = importlib.util.find_spec("sqlalchemy") is not None
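
# Optional tools for feature decoding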
PIL_AVAILABLE = importlib.util.find_spec("PIL") is not None
# Opus support requires libsndfile >= 1.0.31; MP3 support requires libsndfile >= 1.1.0
IS_OPUS_SUPPORTED = importlib.util.find_spec("soundfile") is not None and version.parse(
    importlib.import_module("soundfile").__libsndfile_version__
) >= version.parse("1.0.31")
IS_MP3_SUPPORTED = importlib.util.find_spec("soundfile") is not None and version.parse(
    importlib.import_module("soundfile").__libsndfile_version__
) >= version.parse("1.1.0")
TORCHVISION_AVAILABLE = importlib.util.find_spec("torchvision") is not None
PDFPLUMBER_AVAILABLE = importlib.util.find_spec("pdfplumber") is not None
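
# Optional compression tools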
RARFILE_AVAILABLE = importlib.util.find_spec("rarfile") is not None
ZSTANDARD_AVAILABLE = importlib.util.find_spec("zstandard") is not None
LZ4_AVAILABLE = importlib.util.find_spec("lz4") is not None
PY7ZR_AVAILABLE = importlib.util.find_spec("py7zr") is not None
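
# Cache location, configurable via HF_HOME, HF_DATASETS_CACHE and HF_MODULES_CACHE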
DEFAULT_XDG_CACHE_HOME = "~/.cache"
XDG_CACHE_HOME = os.getenv("XDG_CACHE_HOME", DEFAULT_XDG_CACHE_HOME)
DEFAULT_HF_CACHE_HOME = os.path.join(XDG_CACHE_HOME, "huggingface")
HF_CACHE_HOME = os.path.expanduser(os.getenv("HF_HOME", DEFAULT_HF_CACHE_HOME))

DEFAULT_HF_DATASETS_CACHE = os.path.join(HF_CACHE_HOME, "datasets")
HF_DATASETS_CACHE = Path(os.getenv("HF_DATASETS_CACHE", DEFAULT_HF_DATASETS_CACHE))

DEFAULT_HF_MODULES_CACHE = os.path.join(HF_CACHE_HOME, "modules")
HF_MODULES_CACHE = Path(os.getenv("HF_MODULES_CACHE", DEFAULT_HF_MODULES_CACHE))

DOWNLOADED_DATASETS_DIR = "downloads"
DEFAULT_DOWNLOADED_DATASETS_PATH = os.path.join(HF_DATASETS_CACHE, DOWNLOADED_DATASETS_DIR)
DOWNLOADED_DATASETS_PATH = Path(os.getenv("HF_DATASETS_DOWNLOADED_DATASETS_PATH", DEFAULT_DOWNLOADED_DATASETS_PATH))

EXTRACTED_DATASETS_DIR = "extracted"
DEFAULT_EXTRACTED_DATASETS_PATH = os.path.join(DEFAULT_DOWNLOADED_DATASETS_PATH, EXTRACTED_DATASETS_DIR)
EXTRACTED_DATASETS_PATH = Path(os.getenv("HF_DATASETS_EXTRACTED_DATASETS_PATH", DEFAULT_EXTRACTED_DATASETS_PATH))
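
# Whether to update dataset download counts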
HF_UPDATE_DOWNLOAD_COUNTS = (
    os.environ.get("HF_UPDATE_DOWNLOAD_COUNTS", "AUTO").upper() in ENV_VARS_TRUE_AND_AUTO_VALUES
)
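
# Maximum number of worker threads for downloading files and checking remote file metadata in parallel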
HF_DATASETS_MULTITHREADING_MAX_WORKERS = 16
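
# Trust for datasets that contain remote code: the env variable resolves to True, False,
# or None (the default "ask", i.e. prompt the user)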
__HF_DATASETS_TRUST_REMOTE_CODE = os.environ.get("HF_DATASETS_TRUST_REMOTE_CODE", "ask")
HF_DATASETS_TRUST_REMOTE_CODE: Optional[bool] = (
    True
    if __HF_DATASETS_TRUST_REMOTE_CODE.upper() in ENV_VARS_TRUE_VALUES
    else False
    if __HF_DATASETS_TRUST_REMOTE_CODE.upper() in ENV_VARS_FALSE_VALUES
    else None
)
TIME_OUT_REMOTE_CODE = 15
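
# Dataset viewer: whether to use the Hub's auto-converted Parquet export when available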
USE_PARQUET_EXPORT = True
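
# Default maximum number of rows per batch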
DEFAULT_MAX_BATCH_SIZE = 1000
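
# Number of rows per record batch preloaded in `Dataset.__iter__`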
ARROW_READER_BATCH_SIZE_IN_DATASET_ITER = 10
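
# Maximum shard size, e.g. when sharding a dataset for upload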
MAX_SHARD_SIZE = "500MB"
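
# Parquet row group sizes (in rows) for media and binary datasets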
PARQUET_ROW_GROUP_SIZE_FOR_AUDIO_DATASETS = 100
PARQUET_ROW_GROUP_SIZE_FOR_IMAGE_DATASETS = 100
PARQUET_ROW_GROUP_SIZE_FOR_BINARY_DATASETS = 100
PARQUET_ROW_GROUP_SIZE_FOR_VIDEO_DATASETS = 10
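
# Offline mode: HF_DATASETS_OFFLINE, when set, overrides huggingface_hub's HF_HUB_OFFLINE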
_offline = os.environ.get("HF_DATASETS_OFFLINE")
HF_HUB_OFFLINE = constants.HF_HUB_OFFLINE if _offline is None else _offline.upper() in ENV_VARS_TRUE_VALUES
HF_DATASETS_OFFLINE = HF_HUB_OFFLINE  # alias, kept in sync with HF_HUB_OFFLINE
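
# Progress bars: `True` disables them globally with no way to re-enable them
# programmatically, `False` enables them globally with no way to disable them,
# and `None` (env variable unset) lets the user toggle them in code.
# In short: the environment variable has priority over code.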
__HF_DATASETS_DISABLE_PROGRESS_BARS = os.environ.get("HF_DATASETS_DISABLE_PROGRESS_BARS")
HF_DATASETS_DISABLE_PROGRESS_BARS: Optional[bool] = (
    __HF_DATASETS_DISABLE_PROGRESS_BARS.upper() in ENV_VARS_TRUE_VALUES
    if __HF_DATASETS_DISABLE_PROGRESS_BARS is not None
    else None
)
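
# Maximum dataset size for loading in memory; 0 disables in-memory loading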
DEFAULT_IN_MEMORY_MAX_SIZE = 0
IN_MEMORY_MAX_SIZE = float(os.environ.get("HF_DATASETS_IN_MEMORY_MAX_SIZE", DEFAULT_IN_MEMORY_MAX_SIZE))
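
# File names and metadata fields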
DATASET_ARROW_FILENAME = "dataset.arrow"
DATASET_INDICES_FILENAME = "indices.arrow"
DATASET_STATE_JSON_FILENAME = "state.json"
DATASET_INFO_FILENAME = "dataset_info.json"
DATASETDICT_INFOS_FILENAME = "dataset_infos.json"
LICENSE_FILENAME = "LICENSE"
DATASETDICT_JSON_FILENAME = "dataset_dict.json"
METADATA_CONFIGS_FIELD = "configs"
REPOCARD_FILENAME = "README.md"
REPOYAML_FILENAME = ".huggingface.yaml"
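
# Name of the dynamically generated Python module that hosts downloaded dataset scripts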
MODULE_NAME_FOR_DYNAMIC_MODULES = "datasets_modules"
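
# Maximum length of the human-readable part of a dataset config ID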
MAX_DATASET_CONFIG_ID_READABLE_LENGTH = 255
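
# Prefix for temporary cache directories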
TEMP_CACHE_DIR_PREFIX = "hf_datasets-"
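
# Streaming: retry policy for reading remote files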
STREAMING_READ_MAX_RETRIES = 20
STREAMING_READ_RETRY_INTERVAL = 5  # in seconds
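
# Maximum numbers of data files to inspect when inferring the module of a dataset
# defined only by its data files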
DATA_FILES_MAX_NUMBER_FOR_MODULE_INFERENCE = 200
GLOBBED_DATA_FILES_MAX_NUMBER_FOR_MODULE_INFERENCE = 10
ARCHIVED_DATA_FILES_MAX_NUMBER_FOR_MODULE_INFERENCE = 200
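
# Maximum number of async map functions running in parallel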
MAX_NUM_RUNNING_ASYNC_MAP_FUNCTIONS_IN_PARALLEL = 1000
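
# Progress bar refresh interval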
PBAR_REFRESH_TIME_INTERVAL = 0.05  # i.e. at most 20 refreshes per second
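
# Maximum number of files uploaded per commit when pushing to the Hub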
UPLOADS_MAX_NUMBER_PER_COMMIT = 50
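
# Maximum Arrow table size for pickling (4 << 30 bytes = 4 GiB)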
MAX_TABLE_NBYTES_FOR_PICKLING = 4 << 30