File size: 1,593 Bytes
9c6594c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
from argparse import ArgumentParser
from typing import Optional

from datasets.commands import BaseDatasetsCLICommand
from datasets.hub import convert_to_parquet


def _command_factory(args):
    """Build a ``ConvertToParquetCommand`` from parsed command-line arguments.

    Args:
        args: Object exposing the attributes ``dataset_id``, ``token``,
            ``revision`` and ``trust_remote_code`` (presumably the
            ``argparse.Namespace`` produced by the parser configured in
            ``ConvertToParquetCommand.register_subcommand``).

    Returns:
        A ``ConvertToParquetCommand`` initialised with those four values.
    """
    return ConvertToParquetCommand(
        dataset_id=args.dataset_id,
        token=args.token,
        revision=args.revision,
        trust_remote_code=args.trust_remote_code,
    )


class ConvertToParquetCommand(BaseDatasetsCLICommand):
    """CLI command that forwards its arguments to ``convert_to_parquet``."""

    @staticmethod
    def register_subcommand(parser):
        """Register the ``convert_to_parquet`` subcommand and its arguments.

        Args:
            parser: Object providing ``add_parser`` (presumably the
                sub-parsers action of the top-level CLI parser).
        """
        subcommand_parser: ArgumentParser = parser.add_parser(
            "convert_to_parquet", help="Convert dataset to Parquet"
        )

        # Single positional argument: the dataset to convert.
        subcommand_parser.add_argument(
            "dataset_id",
            help="source dataset ID, e.g. USERNAME/DATASET_NAME or ORGANIZATION/DATASET_NAME",
        )

        # Optional arguments.
        subcommand_parser.add_argument(
            "--token",
            help="access token to the Hugging Face Hub (defaults to logged-in user's one)",
        )
        subcommand_parser.add_argument("--revision", help="source revision")
        subcommand_parser.add_argument(
            "--trust_remote_code",
            action="store_true",
            help="whether to trust the code execution of the load script",
        )

        # The factory is stored as the parser default ``func`` for this subcommand.
        subcommand_parser.set_defaults(func=_command_factory)

    def __init__(
        self,
        dataset_id: str,
        token: Optional[str],
        revision: Optional[str],
        trust_remote_code: bool,
    ):
        """Store the arguments that ``run`` later passes to ``convert_to_parquet``.

        Args:
            dataset_id: Source dataset ID.
            token: Access token, or ``None``.
            revision: Source revision, or ``None``.
            trust_remote_code: Value forwarded as ``trust_remote_code``.
        """
        self._dataset_id = dataset_id
        self._revision = revision
        self._token = token
        self._trust_remote_code = trust_remote_code

    def run(self) -> None:
        """Call ``convert_to_parquet`` with the stored arguments, discarding its result."""
        convert_to_parquet(
            self._dataset_id,
            revision=self._revision,
            token=self._token,
            trust_remote_code=self._trust_remote_code,
        )