|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from libcpp.unordered_map cimport unordered_map |
|
from libcpp cimport bool as c_bool |
|
|
|
from pyarrow.includes.common cimport * |
|
from pyarrow.includes.libarrow cimport * |
|
from pyarrow.includes.libarrow_acero cimport * |
|
from pyarrow.includes.libarrow_fs cimport * |
|
|
|
|
|
# Declarations taken from the header "arrow/dataset/plan.h", C++ namespace
# arrow::dataset::internal. The extern block is marked nogil.
cdef extern from "arrow/dataset/plan.h" namespace "arrow::dataset::internal" nogil: |
|
|
|
# C++ function Initialize(): takes no arguments and returns nothing.
cdef void Initialize() |
|
|
|
|
|
# Function type taking a CFileWriter* and returning a CStatus. Within this
# file it is the signature wrapped by the writer_pre_finish and
# writer_post_finish members of CFileSystemDatasetWriteOptions.
ctypedef CStatus cb_writer_finish_internal(CFileWriter*) |
|
# Function type taking a Python dict and a CFileWriter*, returning nothing.
ctypedef void cb_writer_finish(dict, CFileWriter*) |
|
|
|
cdef extern from "arrow/dataset/api.h" namespace "arrow::dataset" nogil: |
|
|
|
# Enum bound to the C++ type arrow::dataset::ExistingDataBehavior.
# Each Cython enumerator carries an explicit C name (split over two lines with
# a backslash inside the string) that points at the scoped C++ enumerator:
#   ExistingDataBehavior_DELETE_MATCHING     -> kDeleteMatchingPartitions
#   ExistingDataBehavior_OVERWRITE_OR_IGNORE -> kOverwriteOrIgnore
#   ExistingDataBehavior_ERROR               -> kError
cdef enum ExistingDataBehavior" arrow::dataset::ExistingDataBehavior": |
|
ExistingDataBehavior_DELETE_MATCHING" \ |
|
arrow::dataset::ExistingDataBehavior::kDeleteMatchingPartitions" |
|
ExistingDataBehavior_OVERWRITE_OR_IGNORE" \ |
|
arrow::dataset::ExistingDataBehavior::kOverwriteOrIgnore" |
|
ExistingDataBehavior_ERROR" \ |
|
arrow::dataset::ExistingDataBehavior::kError" |
|
|
|
# Binding for the C++ class arrow::dataset::ScanOptions.
# Only data members are declared: two schema pointers, two boolean flags and
# a filter expression. No constructor or methods are exposed here.
cdef cppclass CScanOptions "arrow::dataset::ScanOptions": |
|
shared_ptr[CSchema] dataset_schema |
|
shared_ptr[CSchema] projected_schema |
|
c_bool use_threads |
|
c_bool cache_metadata |
|
CExpression filter |
|
|
|
# Binding for arrow::dataset::ScanNodeOptions, declared as a subclass of
# CExecNodeOptions.
cdef cppclass CScanNodeOptions "arrow::dataset::ScanNodeOptions"(CExecNodeOptions): |
|
# Constructor: dataset, scan options, then two bint flags
# (require_sequenced_output, implicit_ordering).
CScanNodeOptions(shared_ptr[CDataset] dataset, shared_ptr[CScanOptions] scan_options, bint require_sequenced_output, bint implicit_ordering) |
|
|
|
# Data member of the same type as the constructor's scan_options argument.
shared_ptr[CScanOptions] scan_options |
|
|
|
# Binding for arrow::dataset::FragmentScanOptions. Only type_name() is
# exposed; it is a const method returning a c_string. The CSV and JSON
# fragment scan option classes in this file derive from this class.
cdef cppclass CFragmentScanOptions "arrow::dataset::FragmentScanOptions": |
|
c_string type_name() const |
|
|
|
# Alias CScanTaskIterator = CIterator[shared_ptr[CScanTask]], emitted in C++
# under the name arrow::dataset::ScanTaskIterator.
ctypedef CIterator[shared_ptr[CScanTask]] CScanTaskIterator \ |
|
"arrow::dataset::ScanTaskIterator" |
|
|
|
# Binding for arrow::dataset::ScanTask. The single exposed method, Execute(),
# returns a CResult wrapping a CRecordBatchIterator.
cdef cppclass CScanTask" arrow::dataset::ScanTask": |
|
CResult[CRecordBatchIterator] Execute() |
|
|
|
# Binding for arrow::dataset::Fragment. It is the base class of
# CInMemoryFragment and CFileFragment declared further down in this file.
# Methods returning CResult[...] wrap their value in Arrow's result type.
cdef cppclass CFragment "arrow::dataset::Fragment": |
|
CResult[shared_ptr[CSchema]] ReadPhysicalSchema() |
|
CResult[CScanTaskIterator] Scan(shared_ptr[CScanOptions] options) |
|
c_bool splittable() const |
|
c_string type_name() const |
|
# Returned by const reference rather than by value.
const CExpression& partition_expression() const |
|
|
|
# Alias CFragmentVector = vector[shared_ptr[CFragment]], emitted in C++ as
# arrow::dataset::FragmentVector.
ctypedef vector[shared_ptr[CFragment]] CFragmentVector \ |
|
"arrow::dataset::FragmentVector" |
|
|
|
# Alias CFragmentIterator = CIterator[shared_ptr[CFragment]], emitted in C++
# as arrow::dataset::FragmentIterator. It is the payload type returned by the
# GetFragments() methods of CScanner and CDataset in this file.
ctypedef CIterator[shared_ptr[CFragment]] CFragmentIterator \ |
|
"arrow::dataset::FragmentIterator" |
|
|
|
# Binding for arrow::dataset::InMemoryFragment, a subclass of CFragment.
# Only the constructor is declared: it takes a vector of record batches and a
# partition expression, both by value.
cdef cppclass CInMemoryFragment "arrow::dataset::InMemoryFragment"( |
|
CFragment): |
|
CInMemoryFragment(vector[shared_ptr[CRecordBatch]] record_batches, |
|
CExpression partition_expression) |
|
|
|
# Binding for arrow::dataset::TaggedRecordBatch: two data members, a record
# batch pointer and a fragment pointer.
# NOTE(review): presumably `fragment` is the fragment the batch came from —
# confirm against the Arrow C++ documentation.
cdef cppclass CTaggedRecordBatch "arrow::dataset::TaggedRecordBatch": |
|
shared_ptr[CRecordBatch] record_batch |
|
shared_ptr[CFragment] fragment |
|
|
|
# Alias CTaggedRecordBatchIterator = CIterator[CTaggedRecordBatch], emitted
# in C++ as arrow::dataset::TaggedRecordBatchIterator. CScanner.ScanBatches()
# returns this type wrapped in a CResult.
ctypedef CIterator[CTaggedRecordBatch] CTaggedRecordBatchIterator \ |
|
"arrow::dataset::TaggedRecordBatchIterator" |
|
|
|
# Binding for arrow::dataset::Scanner.
# Two constructors are declared: (dataset, scan options) and
# (fragment, scan options). Every method except options() returns its value
# wrapped in CResult[...].
cdef cppclass CScanner "arrow::dataset::Scanner": |
|
CScanner(shared_ptr[CDataset], shared_ptr[CScanOptions]) |
|
CScanner(shared_ptr[CFragment], shared_ptr[CScanOptions]) |
|
CResult[CScanTaskIterator] Scan() |
|
CResult[CTaggedRecordBatchIterator] ScanBatches() |
|
CResult[shared_ptr[CTable]] ToTable() |
|
# Takes the index array by const reference.
CResult[shared_ptr[CTable]] TakeRows(const CArray& indices) |
|
CResult[shared_ptr[CTable]] Head(int64_t num_rows) |
|
CResult[int64_t] CountRows() |
|
CResult[CFragmentIterator] GetFragments() |
|
CResult[shared_ptr[CRecordBatchReader]] ToRecordBatchReader() |
|
# Returns a const reference to the shared_ptr, not a copy.
const shared_ptr[CScanOptions]& options() |
|
|
|
# Binding for arrow::dataset::ScannerBuilder.
# Declares two constructors, one static factory, a group of configuration
# methods that each return a CStatus, and accessors/finishers returning
# CResult[...] or a schema pointer.
cdef cppclass CScannerBuilder "arrow::dataset::ScannerBuilder": |
|
CScannerBuilder(shared_ptr[CDataset], |
|
shared_ptr[CScanOptions] scan_options) |
|
CScannerBuilder(shared_ptr[CSchema], shared_ptr[CFragment], |
|
shared_ptr[CScanOptions] scan_options) |
|
|
|
# Static factory: builds a scanner builder from a record batch reader and
# returns it directly as a shared_ptr (not wrapped in CResult).
@staticmethod |
|
shared_ptr[CScannerBuilder] FromRecordBatchReader( |
|
shared_ptr[CRecordBatchReader] reader) |
|
# ProjectColumns and Project both map to the C++ method `Project`; they
# differ in argument list (column names only vs. expressions plus names).
# NOTE(review): the separate Cython name is presumably there to keep the two
# overloads distinguishable from Cython code — confirm.
CStatus ProjectColumns "Project"(const vector[c_string]& columns) |
|
CStatus Project(vector[CExpression]& exprs, vector[c_string]& columns) |
|
CStatus Filter(CExpression filter) |
|
CStatus UseThreads(c_bool use_threads) |
|
CStatus CacheMetadata(c_bool cache_metadata) |
|
CStatus Pool(CMemoryPool* pool) |
|
CStatus BatchSize(int64_t batch_size) |
|
CStatus BatchReadahead(int32_t batch_readahead) |
|
CStatus FragmentReadahead(int32_t fragment_readahead) |
|
CStatus FragmentScanOptions( |
|
shared_ptr[CFragmentScanOptions] fragment_scan_options) |
|
CResult[shared_ptr[CScanOptions]] GetScanOptions() |
|
CResult[shared_ptr[CScanner]] Finish() |
|
shared_ptr[CSchema] schema() const |
|
|
|
# Alias CDatasetVector = vector[shared_ptr[CDataset]], emitted in C++ as
# arrow::dataset::DatasetVector. Used by CUnionDataset in this file.
ctypedef vector[shared_ptr[CDataset]] CDatasetVector \ |
|
"arrow::dataset::DatasetVector" |
|
|
|
# Binding for arrow::dataset::Dataset. It is the base class of
# CInMemoryDataset, CUnionDataset and CFileSystemDataset in this file.
cdef cppclass CDataset "arrow::dataset::Dataset": |
|
const shared_ptr[CSchema] & schema() |
|
# Two overloads: without arguments, and with a predicate expression.
CResult[CFragmentIterator] GetFragments() |
|
CResult[CFragmentIterator] GetFragments(CExpression predicate) |
|
const CExpression & partition_expression() |
|
c_string type_name() |
|
|
|
# Takes a schema and returns a dataset pointer wrapped in CResult.
CResult[shared_ptr[CDataset]] ReplaceSchema(shared_ptr[CSchema]) |
|
|
|
# Returns a CScannerBuilder pointer wrapped in CResult.
CResult[shared_ptr[CScannerBuilder]] NewScan() |
|
|
|
# Binding for arrow::dataset::InMemoryDataset, a subclass of CDataset.
# Two constructors are declared: from a record batch reader and from a table.
cdef cppclass CInMemoryDataset "arrow::dataset::InMemoryDataset"( |
|
CDataset): |
|
CInMemoryDataset(shared_ptr[CRecordBatchReader]) |
|
CInMemoryDataset(shared_ptr[CTable]) |
|
|
|
# Binding for arrow::dataset::UnionDataset, a subclass of CDataset.
cdef cppclass CUnionDataset "arrow::dataset::UnionDataset"( |
|
CDataset): |
|
# Static factory taking a schema and a vector of child datasets; the new
# union dataset comes back wrapped in CResult.
@staticmethod |
|
CResult[shared_ptr[CUnionDataset]] Make(shared_ptr[CSchema] schema, |
|
CDatasetVector children) |
|
|
|
# Const accessor returning the child vector by const reference.
const CDatasetVector& children() const |
|
|
|
# Binding for arrow::dataset::InspectOptions with a single int member.
# NOTE(review): `fragments` presumably limits how many fragments are
# inspected — confirm against the Arrow C++ documentation.
cdef cppclass CInspectOptions "arrow::dataset::InspectOptions": |
|
int fragments |
|
|
|
# Binding for arrow::dataset::FinishOptions: a schema pointer, an embedded
# CInspectOptions value and a boolean flag.
cdef cppclass CFinishOptions "arrow::dataset::FinishOptions": |
|
shared_ptr[CSchema] schema |
|
CInspectOptions inspect_options |
|
c_bool validate_fragments |
|
|
|
# Binding for arrow::dataset::DatasetFactory. CFileSystemDatasetFactory in
# this file derives from it.
cdef cppclass CDatasetFactory "arrow::dataset::DatasetFactory": |
|
# Returns a vector of schemas, wrapped in CResult.
CResult[vector[shared_ptr[CSchema]]] InspectSchemas(CInspectOptions) |
|
# Returns a single schema, wrapped in CResult.
CResult[shared_ptr[CSchema]] Inspect(CInspectOptions) |
|
# FinishWithSchema and Finish both map to the C++ method `Finish`:
# one overload takes a schema, the other takes no arguments.
CResult[shared_ptr[CDataset]] FinishWithSchema "Finish"( |
|
const shared_ptr[CSchema]& schema) |
|
CResult[shared_ptr[CDataset]] Finish() |
|
const CExpression& root_partition() |
|
CStatus SetRootPartition(CExpression partition) |
|
|
|
# Binding for arrow::dataset::UnionDatasetFactory. No base class is declared
# on the Cython side. The only member is the static Make(), which takes a
# vector of dataset factories and returns a CDatasetFactory pointer wrapped
# in CResult.
cdef cppclass CUnionDatasetFactory "arrow::dataset::UnionDatasetFactory": |
|
@staticmethod |
|
CResult[shared_ptr[CDatasetFactory]] Make( |
|
vector[shared_ptr[CDatasetFactory]] factories) |
|
|
|
# Binding for arrow::dataset::FileSource. Exposes four const accessors
# (path, filesystem, buffer, size) and one constructor.
cdef cppclass CFileSource "arrow::dataset::FileSource": |
|
const c_string& path() const |
|
const shared_ptr[CFileSystem]& filesystem() const |
|
const shared_ptr[CBuffer]& buffer() const |
|
const int64_t size() const |
|
|
|
|
|
|
|
|
|
# The constructor is declared with a variadic `(...)` signature instead of
# listing the individual C++ overloads, so Cython itself does not check the
# arguments passed at construction sites.
# NOTE(review): presumably a wrong argument list only fails once the
# generated C++ is compiled — confirm.
CFileSource(...) |
|
|
|
# Binding for arrow::dataset::FileWriteOptions. It is the base class of
# CIpcFileWriteOptions and CCsvFileWriteOptions in this file.
# Exposes two const accessors: format() and type_name().
cdef cppclass CFileWriteOptions \ |
|
"arrow::dataset::FileWriteOptions": |
|
const shared_ptr[CFileFormat]& format() const |
|
c_string type_name() const |
|
|
|
# Binding for arrow::dataset::FileWriter. The cb_writer_finish* function
# types near the top of this file take a pointer to this class.
# The four accessors return const references; GetBytesWritten() returns an
# int64_t wrapped in CResult.
cdef cppclass CFileWriter \ |
|
"arrow::dataset::FileWriter": |
|
const shared_ptr[CFileFormat]& format() const |
|
const shared_ptr[CSchema]& schema() const |
|
const shared_ptr[CFileWriteOptions]& options() const |
|
const CFileLocator& destination() const |
|
CResult[int64_t] GetBytesWritten() |
|
|
|
# Binding for arrow::dataset::FileFormat. It is the base class of the IPC,
# ORC, CSV and JSON file format classes declared later in this file.
cdef cppclass CFileFormat "arrow::dataset::FileFormat": |
|
# Data member (not a method).
shared_ptr[CFragmentScanOptions] default_fragment_scan_options |
|
c_string type_name() const |
|
# Takes a file source by const reference; the schema comes back in CResult.
CResult[shared_ptr[CSchema]] Inspect(const CFileSource&) const |
|
# Takes a source, a partition expression and a physical schema, and
# returns a CFileFragment pointer wrapped in CResult.
CResult[shared_ptr[CFileFragment]] MakeFragment( |
|
CFileSource source, |
|
CExpression partition_expression, |
|
shared_ptr[CSchema] physical_schema) |
|
shared_ptr[CFileWriteOptions] DefaultWriteOptions() |
|
|
|
# Binding for arrow::dataset::FileFragment, a subclass of CFragment.
# Adds two const accessors, source() and format(), both returning const
# references.
cdef cppclass CFileFragment "arrow::dataset::FileFragment"( |
|
CFragment): |
|
const CFileSource& source() const |
|
const shared_ptr[CFileFormat]& format() const |
|
|
|
# Binding for arrow::dataset::FileSystemDatasetWriteOptions.
# Only data members are declared. CFileSystemDataset.Write() in this file
# takes a const reference to this class.
cdef cppclass CFileSystemDatasetWriteOptions \ |
|
"arrow::dataset::FileSystemDatasetWriteOptions": |
|
shared_ptr[CFileWriteOptions] file_write_options |
|
shared_ptr[CFileSystem] filesystem |
|
c_string base_dir |
|
shared_ptr[CPartitioning] partitioning |
|
int max_partitions |
|
c_string basename_template |
|
# Both hooks wrap the cb_writer_finish_internal signature
# (CFileWriter* -> CStatus) in a `function[...]` object.
function[cb_writer_finish_internal] writer_pre_finish |
|
function[cb_writer_finish_internal] writer_post_finish |
|
ExistingDataBehavior existing_data_behavior |
|
c_bool create_dir |
|
# Unsigned limits: a 32-bit open-file count and 64-bit row counts.
uint32_t max_open_files |
|
uint64_t max_rows_per_file |
|
uint64_t min_rows_per_group |
|
uint64_t max_rows_per_group |
|
|
|
# Binding for arrow::dataset::FileSystemDataset, a subclass of CDataset.
cdef cppclass CFileSystemDataset \ |
|
"arrow::dataset::FileSystemDataset"(CDataset): |
|
# Static factory. Note the declared result type is the base CDataset
# pointer, not CFileSystemDataset.
@staticmethod |
|
CResult[shared_ptr[CDataset]] Make( |
|
shared_ptr[CSchema] schema, |
|
CExpression source_partition, |
|
shared_ptr[CFileFormat] format, |
|
shared_ptr[CFileSystem] filesystem, |
|
vector[shared_ptr[CFileFragment]] fragments) |
|
|
|
# Static method taking write options (by const reference) and a scanner;
# it reports its outcome as a CStatus.
@staticmethod |
|
CStatus Write( |
|
const CFileSystemDatasetWriteOptions& write_options, |
|
shared_ptr[CScanner] scanner) |
|
|
|
c_string type() |
|
vector[c_string] files() |
|
const shared_ptr[CFileFormat]& format() const |
|
const shared_ptr[CFileSystem]& filesystem() const |
|
const shared_ptr[CPartitioning]& partitioning() const |
|
|
|
# Binding for arrow::dataset::IpcFileWriteOptions, a subclass of
# CFileWriteOptions. Adds one data member holding a CIpcWriteOptions pointer.
cdef cppclass CIpcFileWriteOptions \ |
|
"arrow::dataset::IpcFileWriteOptions"(CFileWriteOptions): |
|
shared_ptr[CIpcWriteOptions] options |
|
|
|
# Binding for arrow::dataset::IpcFileFormat, a subclass of CFileFormat.
# The body is `pass`: no members beyond those of the base are declared.
cdef cppclass CIpcFileFormat "arrow::dataset::IpcFileFormat"( |
|
CFileFormat): |
|
pass |
|
|
|
# Binding for arrow::dataset::OrcFileFormat, a subclass of CFileFormat.
# The body is `pass`: no members beyond those of the base are declared.
cdef cppclass COrcFileFormat "arrow::dataset::OrcFileFormat"( |
|
CFileFormat): |
|
pass |
|
|
|
# Binding for arrow::dataset::CsvFileWriteOptions, a subclass of
# CFileWriteOptions. Adds a CCSVWriteOptions pointer and a raw CMemoryPool
# pointer as data members.
cdef cppclass CCsvFileWriteOptions \ |
|
"arrow::dataset::CsvFileWriteOptions"(CFileWriteOptions): |
|
shared_ptr[CCSVWriteOptions] write_options |
|
CMemoryPool* pool |
|
|
|
# Binding for arrow::dataset::CsvFileFormat, a subclass of CFileFormat.
# Adds one data member, parse_options, held by value.
cdef cppclass CCsvFileFormat "arrow::dataset::CsvFileFormat"( |
|
CFileFormat): |
|
CCSVParseOptions parse_options |
|
|
|
# Binding for arrow::dataset::CsvFragmentScanOptions, a subclass of
# CFragmentScanOptions. Declares CSV convert and read options (by value) and
# a `function[StreamWrapFunc]` member.
# NOTE(review): StreamWrapFunc is not declared in this chunk; it presumably
# comes from one of the cimported modules — confirm.
cdef cppclass CCsvFragmentScanOptions \ |
|
"arrow::dataset::CsvFragmentScanOptions"(CFragmentScanOptions): |
|
CCSVConvertOptions convert_options |
|
CCSVReadOptions read_options |
|
function[StreamWrapFunc] stream_transform_func |
|
|
|
# Binding for arrow::dataset::JsonFileFormat, a subclass of CFileFormat.
# The body is `pass`: no members beyond those of the base are declared.
cdef cppclass CJsonFileFormat "arrow::dataset::JsonFileFormat"(CFileFormat): |
|
pass |
|
|
|
# Binding for arrow::dataset::JsonFragmentScanOptions, a subclass of
# CFragmentScanOptions. Declares JSON parse and read options, both by value.
cdef cppclass CJsonFragmentScanOptions "arrow::dataset::JsonFragmentScanOptions"(CFragmentScanOptions): |
|
CJSONParseOptions parse_options |
|
CJSONReadOptions read_options |
|
|
|
# Struct bound to arrow::dataset::PartitionPathFormat with two string
# members. It is the value type returned by CPartitioning.Format() in this
# file.
cdef struct CPartitionPathFormat "arrow::dataset::PartitionPathFormat": |
|
c_string directory |
|
c_string filename |
|
|
|
# Binding for arrow::dataset::Partitioning. It is the base class of the
# key-value, directory, hive and filename partitioning classes below.
cdef cppclass CPartitioning "arrow::dataset::Partitioning": |
|
c_string type_name() const |
|
# Parse takes a path string and yields an expression; Format takes an
# expression and yields a CPartitionPathFormat. Both return CResult.
CResult[CExpression] Parse(const c_string & path) const |
|
CResult[CPartitionPathFormat] Format(const CExpression & expr) const |
|
const shared_ptr[CSchema] & schema() |
|
c_bool Equals(const CPartitioning& other) const |
|
|
|
# arrow::dataset::SegmentEncoding declared as a cppclass whose only member
# is an equality operator taking another CSegmentEncoding and returning bint.
cdef cppclass CSegmentEncoding" arrow::dataset::SegmentEncoding": |
|
bint operator==(CSegmentEncoding) |
|
|
|
# Two extern values of type CSegmentEncoding, bound by explicit C name to
# arrow::dataset::SegmentEncoding::None and ...::Uri respectively.
CSegmentEncoding CSegmentEncoding_None\ |
|
" arrow::dataset::SegmentEncoding::None" |
|
CSegmentEncoding CSegmentEncoding_Uri\ |
|
" arrow::dataset::SegmentEncoding::Uri" |
|
|
|
# Binding for arrow::dataset::KeyValuePartitioningOptions with a single
# segment_encoding member. The CKeyValuePartitioning constructor in this
# file takes it by value.
cdef cppclass CKeyValuePartitioningOptions \ |
|
"arrow::dataset::KeyValuePartitioningOptions": |
|
CSegmentEncoding segment_encoding |
|
|
|
# Binding for arrow::dataset::HivePartitioningOptions: a segment encoding
# and a null_fallback string. No base class is declared on the Cython side.
# The CHivePartitioning constructor in this file takes it by value.
cdef cppclass CHivePartitioningOptions \ |
|
"arrow::dataset::HivePartitioningOptions": |
|
CSegmentEncoding segment_encoding |
|
c_string null_fallback |
|
|
|
# Binding for arrow::dataset::PartitioningFactoryOptions: a boolean flag, a
# schema pointer and a segment encoding. It is the options argument of the
# MakeFactory() methods on CDirectoryPartitioning and CFilenamePartitioning.
cdef cppclass CPartitioningFactoryOptions \ |
|
"arrow::dataset::PartitioningFactoryOptions": |
|
c_bool infer_dictionary |
|
shared_ptr[CSchema] schema |
|
CSegmentEncoding segment_encoding |
|
|
|
# Binding for arrow::dataset::HivePartitioningFactoryOptions. No base class
# is declared on the Cython side, so the members shared with
# CPartitioningFactoryOptions are listed again here, together with
# null_fallback. It is the argument of CHivePartitioning.MakeFactory().
cdef cppclass CHivePartitioningFactoryOptions \ |
|
"arrow::dataset::HivePartitioningFactoryOptions": |
|
c_bool infer_dictionary |
|
c_string null_fallback |
|
shared_ptr[CSchema] schema |
|
CSegmentEncoding segment_encoding |
|
|
|
# Binding for arrow::dataset::PartitioningFactory. Only type_name() is
# exposed. The MakeFactory() static methods in this file return pointers to
# this class.
cdef cppclass CPartitioningFactory "arrow::dataset::PartitioningFactory": |
|
c_string type_name() const |
|
|
|
# Binding for arrow::dataset::KeyValuePartitioning, a subclass of
# CPartitioning.
cdef cppclass CKeyValuePartitioning \ |
|
"arrow::dataset::KeyValuePartitioning"(CPartitioning): |
|
# Constructor: schema, dictionary arrays, and an options value.
CKeyValuePartitioning(shared_ptr[CSchema] schema, |
|
vector[shared_ptr[CArray]] dictionaries, |
|
CKeyValuePartitioningOptions options) |
|
|
|
# Accessors; both return by value.
vector[shared_ptr[CArray]] dictionaries() const |
|
CSegmentEncoding segment_encoding() |
|
|
|
# Binding for arrow::dataset::DirectoryPartitioning. On the Cython side it is
# declared directly as a subclass of CPartitioning (not CKeyValuePartitioning).
cdef cppclass CDirectoryPartitioning \ |
|
"arrow::dataset::DirectoryPartitioning"(CPartitioning): |
|
# Constructor: schema plus dictionary arrays.
CDirectoryPartitioning(shared_ptr[CSchema] schema, |
|
vector[shared_ptr[CArray]] dictionaries) |
|
|
|
# Static factory-of-factory: takes field names and factory options and
# returns a CPartitioningFactory pointer.
@staticmethod |
|
shared_ptr[CPartitioningFactory] MakeFactory( |
|
vector[c_string] field_names, CPartitioningFactoryOptions) |
|
|
|
vector[shared_ptr[CArray]] dictionaries() const |
|
|
|
# Binding for arrow::dataset::HivePartitioning. On the Cython side it is
# declared directly as a subclass of CPartitioning.
cdef cppclass CHivePartitioning \ |
|
"arrow::dataset::HivePartitioning"(CPartitioning): |
|
# Constructor: schema, dictionary arrays, and hive-specific options.
CHivePartitioning(shared_ptr[CSchema] schema, |
|
vector[shared_ptr[CArray]] dictionaries, |
|
CHivePartitioningOptions options) |
|
|
|
# Static method taking only the hive factory options (no field names,
# unlike the directory and filename variants in this file).
@staticmethod |
|
shared_ptr[CPartitioningFactory] MakeFactory( |
|
CHivePartitioningFactoryOptions) |
|
|
|
vector[shared_ptr[CArray]] dictionaries() const |
|
c_string null_fallback() const |
|
|
|
# Binding for arrow::dataset::FilenamePartitioning, a subclass of
# CPartitioning. The declared members have the same shape as those of
# CDirectoryPartitioning in this file.
cdef cppclass CFilenamePartitioning \ |
|
"arrow::dataset::FilenamePartitioning"(CPartitioning): |
|
# Constructor: schema plus dictionary arrays.
CFilenamePartitioning(shared_ptr[CSchema] schema, |
|
vector[shared_ptr[CArray]] dictionaries) |
|
|
|
# Static method: field names and factory options in, CPartitioningFactory
# pointer out.
@staticmethod |
|
shared_ptr[CPartitioningFactory] MakeFactory( |
|
vector[c_string] field_names, CPartitioningFactoryOptions) |
|
|
|
vector[shared_ptr[CArray]] dictionaries() const |
|
|
|
# Binding for arrow::dataset::PartitioningOrFactory.
# It can be constructed from, or assigned, either a CPartitioning pointer or
# a CPartitioningFactory pointer, and has one accessor for each.
# NOTE(review): presumably only one of the two accessors returns non-null at
# a time — confirm against the Arrow C++ documentation.
cdef cppclass CPartitioningOrFactory \ |
|
"arrow::dataset::PartitioningOrFactory": |
|
CPartitioningOrFactory(shared_ptr[CPartitioning]) |
|
CPartitioningOrFactory(shared_ptr[CPartitioningFactory]) |
|
CPartitioningOrFactory & operator = (shared_ptr[CPartitioning]) |
|
CPartitioningOrFactory & operator = ( |
|
shared_ptr[CPartitioningFactory]) |
|
shared_ptr[CPartitioning] partitioning() const |
|
shared_ptr[CPartitioningFactory] factory() const |
|
|
|
# Binding for arrow::dataset::FileSystemFactoryOptions. Only data members
# are declared. Every CFileSystemDatasetFactory Make* overload in this file
# takes this class as its last argument.
cdef cppclass CFileSystemFactoryOptions \ |
|
"arrow::dataset::FileSystemFactoryOptions": |
|
# Holds either a partitioning or a partitioning factory.
CPartitioningOrFactory partitioning |
|
c_string partition_base_dir |
|
c_bool exclude_invalid_files |
|
vector[c_string] selector_ignore_prefixes |
|
|
|
# Binding for arrow::dataset::FileSystemDatasetFactory, a subclass of
# CDatasetFactory.
# The three static methods below all map to the single C++ name `Make`; each
# Cython name selects one overload, distinguished by its second argument
# (a vector of paths, a CFileSelector, or a vector of CFileInfo). All of them
# return a CDatasetFactory pointer wrapped in CResult.
cdef cppclass CFileSystemDatasetFactory \ |
|
"arrow::dataset::FileSystemDatasetFactory"( |
|
CDatasetFactory): |
|
# Overload taking explicit path strings.
@staticmethod |
|
CResult[shared_ptr[CDatasetFactory]] MakeFromPaths "Make"( |
|
shared_ptr[CFileSystem] filesystem, |
|
vector[c_string] paths, |
|
shared_ptr[CFileFormat] format, |
|
CFileSystemFactoryOptions options |
|
) |
|
|
|
# Overload taking a CFileSelector.
@staticmethod |
|
CResult[shared_ptr[CDatasetFactory]] MakeFromSelector "Make"( |
|
shared_ptr[CFileSystem] filesystem, |
|
CFileSelector, |
|
shared_ptr[CFileFormat] format, |
|
CFileSystemFactoryOptions options |
|
) |
|
|
|
# Overload taking a vector of CFileInfo values.
@staticmethod |
|
CResult[shared_ptr[CDatasetFactory]] MakeFromFileInfos "Make"( |
|
shared_ptr[CFileSystem] filesystem, |
|
vector[CFileInfo] files, |
|
shared_ptr[CFileFormat] format, |
|
CFileSystemFactoryOptions options |
|
) |
|
|