|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from numbers import Integral |
|
import warnings |
|
|
|
from pyarrow.lib import Table |
|
import pyarrow._orc as _orc |
|
from pyarrow.fs import _resolve_filesystem_and_path |
|
|
|
|
|
class ORCFile:
    """
    Reader interface for a single ORC file

    Parameters
    ----------
    source : str or pyarrow.NativeFile
        Readable source. For passing Python file objects or byte buffers,
        see pyarrow.io.PythonFileInterface or pyarrow.io.BufferReader.
    """

    def __init__(self, source):
        # Open eagerly: the native reader validates the source up front.
        self.reader = _orc.ORCReader()
        self.reader.open(source)

    @property
    def metadata(self):
        """File metadata, as an arrow KeyValueMetadata."""
        return self.reader.metadata()

    @property
    def schema(self):
        """Arrow schema of the file."""
        return self.reader.schema()

    @property
    def nrows(self):
        """Total number of rows in the file."""
        return self.reader.nrows()

    @property
    def nstripes(self):
        """Total number of stripes in the file."""
        return self.reader.nstripes()

    @property
    def file_version(self):
        """ORC format version of the file; either 0.11 or 0.12."""
        return self.reader.file_version()

    @property
    def software_version(self):
        """Software instance and version that wrote the file."""
        return self.reader.software_version()

    @property
    def compression(self):
        """Compression codec used by the file."""
        return self.reader.compression()

    @property
    def compression_size(self):
        """Number of bytes buffered for the file's compression codec."""
        return self.reader.compression_size()

    @property
    def writer(self):
        """Name of the writer that produced this file.

        If the writer is unknown then its Writer ID
        (a number) is returned."""
        return self.reader.writer()

    @property
    def writer_version(self):
        """Version of the writer."""
        return self.reader.writer_version()

    @property
    def row_index_stride(self):
        """Rows per entry in the row index, or 0 when the file
        has no row index."""
        return self.reader.row_index_stride()

    @property
    def nstripe_statistics(self):
        """Number of stripe statistics."""
        return self.reader.nstripe_statistics()

    @property
    def content_length(self):
        """Length of the file's data stripes, in bytes."""
        return self.reader.content_length()

    @property
    def stripe_statistics_length(self):
        """Number of compressed bytes in the file's stripe statistics."""
        return self.reader.stripe_statistics_length()

    @property
    def file_footer_length(self):
        """Number of compressed bytes in the file footer."""
        return self.reader.file_footer_length()

    @property
    def file_postscript_length(self):
        """Number of bytes in the file postscript."""
        return self.reader.file_postscript_length()

    @property
    def file_length(self):
        """Total number of bytes in the file."""
        return self.reader.file_length()

    def _select_names(self, columns=None):
        # Translate integer column indices into field names.  A selection
        # containing any non-integer entry is assumed to already hold names
        # and is handed back unchanged.
        if columns is None:
            return None

        schema = self.schema
        resolved = []
        for entry in columns:
            if not isinstance(entry, Integral):
                # Names were supplied directly; pass the list through as-is.
                return columns
            index = int(entry)
            if not (0 <= index < len(schema)):
                raise ValueError("Column indices must be in 0 <= ind < %d,"
                                 " got %d" % (len(schema), index))
            resolved.append(schema[index].name)

        return resolved

    def read_stripe(self, n, columns=None):
        """Read a single stripe from the file.

        Parameters
        ----------
        n : int
            The stripe index
        columns : list
            If not None, only these columns will be read from the stripe. A
            column name may be a prefix of a nested field, e.g. 'a' will select
            'a.b', 'a.c', and 'a.d.e'

        Returns
        -------
        pyarrow.RecordBatch
            Content of the stripe as a RecordBatch.
        """
        return self.reader.read_stripe(n, columns=self._select_names(columns))

    def read(self, columns=None):
        """Read the whole file.

        Parameters
        ----------
        columns : list
            If not None, only these columns will be read from the file. A
            column name may be a prefix of a nested field, e.g. 'a' will select
            'a.b', 'a.c', and 'a.d.e'. Output always follows the
            ordering of the file and not the `columns` list.

        Returns
        -------
        pyarrow.Table
            Content of the file as a Table.
        """
        return self.reader.read(columns=self._select_names(columns))
|
|
|
|
|
_orc_writer_args_docs = """file_version : {"0.11", "0.12"}, default "0.12" |
|
Determine which ORC file version to use. |
|
`Hive 0.11 / ORC v0 <https://orc.apache.org/specification/ORCv0/>`_ |
|
is the older version |
|
while `Hive 0.12 / ORC v1 <https://orc.apache.org/specification/ORCv1/>`_ |
|
is the newer one. |
|
batch_size : int, default 1024 |
|
Number of rows the ORC writer writes at a time. |
|
stripe_size : int, default 64 * 1024 * 1024 |
|
Size of each ORC stripe in bytes. |
|
compression : string, default 'uncompressed' |
|
The compression codec. |
|
Valid values: {'UNCOMPRESSED', 'SNAPPY', 'ZLIB', 'LZ4', 'ZSTD'} |
|
Note that LZ0 is currently not supported. |
|
compression_block_size : int, default 64 * 1024 |
|
Size of each compression block in bytes. |
|
compression_strategy : string, default 'speed' |
|
The compression strategy i.e. speed vs size reduction. |
|
Valid values: {'SPEED', 'COMPRESSION'} |
|
row_index_stride : int, default 10000 |
|
The row index stride i.e. the number of rows per |
|
an entry in the row index. |
|
padding_tolerance : double, default 0.0 |
|
The padding tolerance. |
|
dictionary_key_size_threshold : double, default 0.0 |
|
The dictionary key size threshold. 0 to disable dictionary encoding. |
|
1 to always enable dictionary encoding. |
|
bloom_filter_columns : None, set-like or list-like, default None |
|
Columns that use the bloom filter. |
|
bloom_filter_fpp : double, default 0.05 |
|
Upper limit of the false-positive rate of the bloom filter. |
|
""" |
|
|
|
|
|
class ORCWriter:
    __doc__ = """
Writer interface for a single ORC file

Parameters
----------
where : str or pyarrow.io.NativeFile
    Writable target. For passing Python file objects or byte buffers,
    see pyarrow.io.PythonFileInterface, pyarrow.io.BufferOutputStream
    or pyarrow.io.FixedSizeBufferWriter.
{}
""".format(_orc_writer_args_docs)

    # Class-level default keeps __del__ safe even when __init__ fails
    # before the instance attribute is set.
    is_open = False

    def __init__(self, where, *,
                 file_version='0.12',
                 batch_size=1024,
                 stripe_size=64 * 1024 * 1024,
                 compression='uncompressed',
                 compression_block_size=65536,
                 compression_strategy='speed',
                 row_index_stride=10000,
                 padding_tolerance=0.0,
                 dictionary_key_size_threshold=0.0,
                 bloom_filter_columns=None,
                 bloom_filter_fpp=0.05,
                 ):
        # Collect the writer options once, then forward them unchanged
        # to the native ORC writer.
        options = dict(
            file_version=file_version,
            batch_size=batch_size,
            stripe_size=stripe_size,
            compression=compression,
            compression_block_size=compression_block_size,
            compression_strategy=compression_strategy,
            row_index_stride=row_index_stride,
            padding_tolerance=padding_tolerance,
            dictionary_key_size_threshold=dictionary_key_size_threshold,
            bloom_filter_columns=bloom_filter_columns,
            bloom_filter_fpp=bloom_filter_fpp,
        )
        self.writer = _orc.ORCWriter()
        self.writer.open(where, **options)
        self.is_open = True

    def __del__(self):
        # Best-effort finalization; close() is a no-op once closed.
        self.close()

    def __enter__(self):
        return self

    def __exit__(self, *args, **kwargs):
        # Finalize the file on context exit; exceptions propagate.
        self.close()

    def write(self, table):
        """
        Write the table into an ORC file. The schema of the table must
        be equal to the schema used when opening the ORC file.

        Parameters
        ----------
        table : pyarrow.Table
            The table to be written into the ORC file
        """
        assert self.is_open
        self.writer.write(table)

    def close(self):
        """
        Close the ORC file
        """
        if not self.is_open:
            return
        self.writer.close()
        self.is_open = False
|
|
|
|
|
def read_table(source, columns=None, filesystem=None):
    # Resolve string paths against the (possibly inferred) filesystem;
    # already-open file-like sources come back with filesystem=None.
    filesystem, path = _resolve_filesystem_and_path(source, filesystem)
    if filesystem is not None:
        source = filesystem.open_input_file(path)

    orc_file = ORCFile(source)
    # An explicitly empty selection still needs the row count: read the
    # whole file, then project down to zero columns.
    if columns is not None and len(columns) == 0:
        return orc_file.read().select(columns)
    return orc_file.read(columns=columns)
|
|
|
|
|
# Docstring is attached after the definition so the shared prose lives in one
# place alongside the other module-level doc assignments.
read_table.__doc__ = """
Read a Table from an ORC file.

Parameters
----------
source : str, pyarrow.NativeFile, or file-like object
    If a string passed, can be a single file name. For file-like objects,
    only read a single file. Use pyarrow.BufferReader to read a file
    contained in a bytes or buffer-like object.
columns : list
    If not None, only these columns will be read from the file. A column
    name may be a prefix of a nested field, e.g. 'a' will select 'a.b',
    'a.c', and 'a.d.e'. Output always follows the ordering of the file and
    not the `columns` list. If empty, no columns will be read. Note
    that the table will still have the correct num_rows set despite having
    no columns.
filesystem : FileSystem, default None
    If nothing passed, will be inferred based on path.
    Path will try to be found in the local on-disk filesystem otherwise
    it will be parsed as an URI to determine the filesystem.
"""
|
|
|
|
|
def write_table(table, where, *,
                file_version='0.12',
                batch_size=1024,
                stripe_size=64 * 1024 * 1024,
                compression='uncompressed',
                compression_block_size=65536,
                compression_strategy='speed',
                row_index_stride=10000,
                padding_tolerance=0.0,
                dictionary_key_size_threshold=0.0,
                bloom_filter_columns=None,
                bloom_filter_fpp=0.05):
    # Legacy call order was write_table(where, table); detect it by the
    # Table landing in the 'where' slot, warn, and swap.
    if isinstance(where, Table):
        warnings.warn(
            "The order of the arguments has changed. Pass as "
            "'write_table(table, where)' instead. The old order will raise "
            "an error in the future.", FutureWarning, stacklevel=2
        )
        table, where = where, table

    options = dict(
        file_version=file_version,
        batch_size=batch_size,
        stripe_size=stripe_size,
        compression=compression,
        compression_block_size=compression_block_size,
        compression_strategy=compression_strategy,
        row_index_stride=row_index_stride,
        padding_tolerance=padding_tolerance,
        dictionary_key_size_threshold=dictionary_key_size_threshold,
        bloom_filter_columns=bloom_filter_columns,
        bloom_filter_fpp=bloom_filter_fpp,
    )
    # Context manager guarantees the file is finalized even on error.
    with ORCWriter(where, **options) as sink:
        sink.write(table)
|
|
|
|
|
# Docstring is attached after the definition so the shared writer-option
# descriptions (_orc_writer_args_docs) can be interpolated once.
write_table.__doc__ = """
Write a table into an ORC file.

Parameters
----------
table : pyarrow.lib.Table
    The table to be written into the ORC file
where : str or pyarrow.io.NativeFile
    Writable target. For passing Python file objects or byte buffers,
    see pyarrow.io.PythonFileInterface, pyarrow.io.BufferOutputStream
    or pyarrow.io.FixedSizeBufferWriter.
{}
""".format(_orc_writer_args_docs)
|
|