|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import bz2 |
|
from contextlib import contextmanager |
|
from io import (BytesIO, StringIO, TextIOWrapper, BufferedIOBase, IOBase) |
|
import itertools |
|
import gc |
|
import gzip |
|
import math |
|
import os |
|
import pathlib |
|
import pytest |
|
import random |
|
import sys |
|
import tempfile |
|
import weakref |
|
|
|
try: |
|
import numpy as np |
|
except ImportError: |
|
np = None |
|
|
|
from pyarrow.util import guid |
|
from pyarrow import Codec |
|
import pyarrow as pa |
|
|
|
|
|
def check_large_seeks(file_factory, expected_error=None):
    """Check that *file_factory* produces files supporting seeks past 4GB.

    Creates a sparse file slightly larger than 2**32 bytes, writes a marker
    just past the 4GB boundary, and verifies size/seek/tell/read behave
    correctly on the factory's file object.  If *expected_error* is given
    (a pytest.raises-style context manager), opening the file is expected
    to fail inside it instead.
    """
    if sys.platform in ('win32', 'darwin', 'emscripten'):
        pytest.skip("need sparse file support")
    # BUG FIX: tempfile.mktemp is deprecated and race-prone, and if it
    # raised, `filename` would be unbound when the finally clause ran,
    # turning the failure into a NameError.  mkstemp is atomic and binds
    # `filename` before the try block.
    fd, filename = tempfile.mkstemp(prefix='test_io')
    try:
        with os.fdopen(fd, 'wb') as f:
            # truncate() makes a sparse file: no 4GB of actual disk I/O
            f.truncate(2 ** 32 + 10)
            f.seek(2 ** 32 + 5)
            f.write(b'mark\n')
        if expected_error:
            with expected_error:
                file_factory(filename)
        else:
            with file_factory(filename) as f:
                assert f.size() == 2 ** 32 + 10
                assert f.seek(2 ** 32 + 5) == 2 ** 32 + 5
                assert f.tell() == 2 ** 32 + 5
                assert f.read(5) == b'mark\n'
                assert f.tell() == 2 ** 32 + 10
    finally:
        os.unlink(filename)
|
|
|
|
|
@contextmanager

def assert_file_not_found():

    """Context manager asserting that the enclosed block raises FileNotFoundError."""
    with pytest.raises(FileNotFoundError):

        yield
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_python_file_write():
    """PythonFile wraps a writable binary file and tracks position on writes."""
    sink = BytesIO()
    pf = pa.PythonFile(sink)
    assert pf.tell() == 0

    chunk1 = b'enga\xc3\xb1ado'
    chunk2 = b'foobar'
    pf.write(chunk1)
    assert pf.tell() == len(chunk1)
    pf.write(chunk2)

    # Both writes reached the underlying BytesIO, in order
    assert sink.getvalue() == chunk1 + chunk2

    assert not pf.closed
    pf.close()
    assert pf.closed

    # Text-mode file objects are rejected up front
    with pytest.raises(TypeError, match="binary file expected"):
        pa.PythonFile(StringIO())
|
|
|
|
|
def test_python_file_read():
    """PythonFile exposes size/tell/seek/read over a readable binary file."""
    payload = b'some sample data'
    pf = pa.PythonFile(BytesIO(payload), mode='r')

    assert pf.size() == len(payload)
    assert pf.tell() == 0

    assert pf.read(4) == b'some'
    assert pf.tell() == 4

    pf.seek(0)
    assert pf.tell() == 0
    pf.seek(5)
    assert pf.tell() == 5

    # A request larger than the remainder is truncated to what's available
    tail = pf.read(50)
    assert tail == b'sample data'
    assert len(tail) == 11

    assert pf.size() == len(payload)

    assert not pf.closed
    pf.close()
    assert pf.closed

    with pytest.raises(TypeError, match="binary file expected"):
        pa.PythonFile(StringIO(), mode='r')
|
|
|
|
|
@pytest.mark.parametrize("nbytes", (-1, 0, 1, 5, 100))
@pytest.mark.parametrize("file_offset", (-1, 0, 5, 100))
def test_python_file_get_stream(nbytes, file_offset):
    """get_stream() yields a non-seekable view over a byte range of the file."""
    data = b'data1data2data3data4data5'
    f = pa.PythonFile(BytesIO(data), mode='r')

    # Negative offsets or lengths are rejected up front
    if nbytes < 0 or file_offset < 0:
        with pytest.raises(pa.ArrowInvalid,
                           match="should be a positive value"):
            f.get_stream(file_offset=file_offset, nbytes=nbytes)
        f.close()
        return

    stream = f.get_stream(file_offset=file_offset, nbytes=nbytes)

    # Mirror the expected window with a plain BytesIO and compare reads
    lo = min(file_offset, len(data))
    hi = min(file_offset + nbytes, len(data))
    mirror = BytesIO(data[lo:hi])

    assert stream.read(nbytes=4) == mirror.read(4)
    assert stream.read(nbytes=6) == mirror.read(6)

    # Unbounded read drains the window
    assert stream.read() == mirror.read()

    # Reading far past the end still matches the mirror (empty result)
    overshoot = len(data) * 2
    assert stream.read(overshoot) == mirror.read(overshoot)

    # The view is a stream, not a random-access file
    with pytest.raises(OSError, match="seekable"):
        stream.seek(0)

    stream.close()
    assert stream.closed
|
|
|
|
|
def test_python_file_read_at():
    """read_at() reads nbytes starting at an absolute offset."""
    payload = b'some sample data'
    f = pa.PythonFile(BytesIO(payload), mode='r')

    piece = f.read_at(nbytes=5, offset=3)
    assert piece == b'e sam'
    assert len(piece) == 5

    # Requests extending past EOF are truncated to the available bytes
    whole = f.read_at(nbytes=50, offset=0)
    assert whole == payload
    assert len(whole) == 16
|
|
|
|
|
def test_python_file_readall():
    """readall() returns the file's entire contents."""
    payload = b'some sample data'
    with pa.PythonFile(BytesIO(payload), mode='r') as f:
        assert f.readall() == payload
|
|
|
|
|
def test_python_file_readinto():
    """readinto() fills the destination buffer and reports the byte count."""
    size = 10
    payload = b'some sample data longer than 10'
    dest = bytearray(size)

    with pa.PythonFile(BytesIO(payload), mode='r') as f:
        # Only len(dest) bytes fit, so only that many are read
        assert f.readinto(dest) == 10

    assert dest[:size] == payload[:size]
    assert len(dest) == size
|
|
|
|
|
def test_python_file_read_buffer():
    """read_buffer() may return a zero-copy view of the underlying data."""
    size = 10
    backing = bytearray(b'0123456798')

    class DuckReader:
        """Minimal duck-typed reader exposing a zero-copy read_buffer()."""

        def close(self):
            pass

        @property
        def closed(self):
            return False

        def read_buffer(self, nbytes):
            assert nbytes == size
            # Hand out a view over `backing` without copying
            return memoryview(backing)[:nbytes]

    with pa.PythonFile(DuckReader(), mode='r') as f:
        buf = f.read_buffer(size)
        assert len(buf) == size
        assert memoryview(buf).tobytes() == backing[:size]
        # Mutating through the returned buffer must hit the backing store,
        # proving no copy was made along the way
        memoryview(buf)[0] = ord(b'x')
        assert backing[0] == ord(b'x')
|
|
|
|
|
def test_python_file_correct_abc():
    """PythonFile instances are recognized by the standard io ABCs."""
    with pa.PythonFile(BytesIO(b''), mode='r') as f:
        for abc in (BufferedIOBase, IOBase):
            assert isinstance(f, abc)
|
|
|
|
|
def test_python_file_iterable():
    """Iterating a PythonFile yields the same lines as the wrapped object."""
    data = b'''line1

line2

line3

'''

    reference = BytesIO(data)
    with pa.PythonFile(BytesIO(data), mode='r') as f:
        for got, want in zip(f, reference):
            assert got == want
|
|
|
|
|
def test_python_file_large_seeks():
    """PythonFile over an OS-level file handles seeks past 4GB."""
    check_large_seeks(lambda path: pa.PythonFile(open(path, 'rb')))
|
|
|
|
|
def test_bytes_reader():
    """BufferReader supports size/seek (incl. SEEK_END)/tell/read/close."""
    data = b'some sample data'
    reader = pa.BufferReader(data)

    assert reader.tell() == 0
    assert reader.size() == len(data)

    assert reader.read(4) == b'some'
    assert reader.tell() == 4

    reader.seek(0)
    assert reader.tell() == 0

    # Whence=2 seeks relative to the end
    reader.seek(0, 2)
    assert reader.tell() == len(data)

    reader.seek(5)
    assert reader.tell() == 5

    # Over-long reads are truncated at EOF
    assert reader.read(50) == b'sample data'

    assert not reader.closed
    reader.close()
    assert reader.closed
|
|
|
|
|
def test_bytes_reader_non_bytes():
    """BufferReader rejects str input; only bytes-like objects are accepted."""
    with pytest.raises(TypeError):
        pa.BufferReader('some sample data')
|
|
|
|
|
def test_bytes_reader_retains_parent_reference():
    """A buffer sliced out of a BufferReader keeps its parent data alive.

    The reader goes out of scope inside get_buffer(); after a GC pass the
    returned buffer must still read correctly via its parent reference.
    """
    # NOTE: the redundant function-local `import gc` was removed; gc is
    # already imported at module scope.

    def get_buffer():
        data = b'some sample data' * 1000
        reader = pa.BufferReader(data)
        reader.seek(5)
        return reader.read_buffer(6)

    buf = get_buffer()
    gc.collect()
    assert buf.to_pybytes() == b'sample'
    assert buf.parent is not None
|
|
|
|
|
def test_python_file_implicit_mode(tmpdir):
    """PythonFile infers readability/writability/seekability from the wrapped file."""
    path = os.path.join(str(tmpdir), 'foo.txt')

    # Write-only OS file: writable, neither readable nor seekable
    with open(path, 'wb') as f:
        pf = pa.PythonFile(f)
        assert pf.writable()
        assert not pf.readable()
        assert not pf.seekable()
        f.write(b'foobar\n')

    # Read-only OS file: readable and seekable
    with open(path, 'rb') as f:
        pf = pa.PythonFile(f)
        assert pf.readable()
        assert not pf.writable()
        assert pf.seekable()
        assert pf.read() == b'foobar\n'

    # A BytesIO sink is treated as a plain output stream here
    bio = BytesIO()
    pf = pa.PythonFile(bio)
    assert pf.writable()
    assert not pf.readable()
    assert not pf.seekable()
    pf.write(b'foobar\n')
    assert bio.getvalue() == b'foobar\n'
|
|
|
|
|
def test_python_file_writelines(tmpdir):
    """writelines() writes each chunk; the file then reads back as their join."""
    # BUG FIX: the original list used implicit bytes-literal concatenation
    # (b'line2\n' b'line3'), producing two elements instead of the intended
    # three; the missing comma is restored.  The assertions below hold
    # either way, but the fixture now matches its evident intent.
    lines = [b'line1\n', b'line2\n', b'line3']
    path = os.path.join(str(tmpdir), 'foo.txt')
    with open(path, 'wb') as f:
        try:
            f = pa.PythonFile(f, mode='w')
            assert f.writable()
            f.writelines(lines)
        finally:
            f.close()

    with open(path, 'rb') as f:
        try:
            f = pa.PythonFile(f, mode='r')
            assert f.readable()
            assert f.read() == b''.join(lines)
        finally:
            f.close()
|
|
|
|
|
def test_python_file_closing():
    """Dropping a PythonFile does not close the wrapped object; close() does."""
    bio = BytesIO()
    pf = pa.PythonFile(bio)
    ref = weakref.ref(pf)
    del pf
    # The wrapper is gone, but the underlying stream stays open
    assert ref() is None
    assert not bio.closed

    # An explicit close() propagates to the wrapped object
    pf = pa.PythonFile(bio)
    pf.close()
    assert bio.closed
|
|
|
|
|
|
|
|
|
|
|
|
|
def check_buffer_pickling(buf, pickler):
    """Round-trip *buf* through every pickle protocol and compare contents."""
    for proto in range(pickler.HIGHEST_PROTOCOL + 1):
        restored = pickler.loads(pickler.dumps(buf, protocol=proto))
        assert len(restored) == len(buf)
        assert memoryview(restored) == memoryview(buf)
        assert restored.to_pybytes() == buf.to_pybytes()
        assert restored.is_mutable == buf.is_mutable
|
|
|
|
|
def test_buffer_bytes(pickle_module):
    """py_buffer over bytes is an immutable CPU buffer that pickles cleanly."""
    payload = b'some data'

    buf = pa.py_buffer(payload)
    assert isinstance(buf, pa.Buffer)
    assert not buf.is_mutable
    assert buf.is_cpu
    assert buf.to_pybytes() == payload

    check_buffer_pickling(buf, pickle_module)
|
|
|
|
|
def test_buffer_null_data(pickle_module):
    """A zero-sized foreign buffer at address 0 behaves like empty bytes."""
    null_buf = pa.foreign_buffer(address=0, size=0)
    assert null_buf.to_pybytes() == b""
    assert null_buf.address == 0

    # The buffer protocol cannot expose a NULL pointer, so the exported
    # memoryview must be backed by some valid non-null address
    view = memoryview(null_buf)
    assert view.tobytes() == b""
    assert pa.py_buffer(view).address != 0

    check_buffer_pickling(null_buf, pickle_module)
|
|
|
|
|
def test_buffer_memoryview(pickle_module):
    """memoryview(py_buffer) compares equal to the original bytes."""
    payload = b'some data'

    buf = pa.py_buffer(payload)
    assert isinstance(buf, pa.Buffer)
    assert not buf.is_mutable
    assert buf.is_cpu
    assert memoryview(buf) == payload

    check_buffer_pickling(buf, pickle_module)
|
|
|
|
|
def test_buffer_bytearray(pickle_module):
    """py_buffer over a bytearray yields a mutable CPU buffer."""
    payload = bytearray(b'some data')

    buf = pa.py_buffer(payload)
    assert isinstance(buf, pa.Buffer)
    assert buf.is_mutable
    assert buf.is_cpu
    assert bytearray(buf) == payload

    check_buffer_pickling(buf, pickle_module)
|
|
|
|
|
def test_buffer_invalid():
    """py_buffer rejects objects without the buffer protocol."""
    with pytest.raises(TypeError,
                       match="(bytes-like object|buffer interface)"):
        pa.py_buffer(None)
|
|
|
|
|
def test_buffer_weakref():
    """Buffers support weak references and die when no longer referenced."""
    buf = pa.py_buffer(b'some data')
    ref = weakref.ref(buf)
    assert ref() is not None
    del buf
    assert ref() is None
|
|
|
|
|
@pytest.mark.parametrize('val, expected_hex_buffer',
                         [(b'check', b'636865636B'),
                          (b'\a0', b'0730'),
                          (b'', b'')])
def test_buffer_hex(val, expected_hex_buffer):
    """Buffer.hex() produces the expected uppercase hex encoding."""
    assert pa.py_buffer(val).hex() == expected_hex_buffer
|
|
|
|
|
@pytest.mark.numpy
def test_buffer_to_numpy():
    """np.frombuffer over a Buffer is a zero-copy view with the buffer as base."""
    backing = bytearray(20)
    backing[0] = 42
    buf = pa.py_buffer(backing)
    arr = np.frombuffer(buf, dtype="uint8")
    assert arr[0] == backing[0]
    # A mutation of the backing store is visible through the array: no copy
    backing[0] += 1
    assert arr[0] == backing[0]
    assert arr.base == buf
|
|
|
|
|
@pytest.mark.numpy
def test_buffer_from_numpy():
    """py_buffer accepts C- and F-contiguous arrays, rejects strided ones."""
    arr = np.arange(12, dtype=np.int8).reshape((3, 4))

    # Both the C-contiguous array and its F-contiguous transpose wrap the
    # same underlying memory, so both compare equal to arr.tobytes()
    for source in (arr, arr.T):
        buf = pa.py_buffer(source)
        assert buf.is_cpu
        assert buf.is_mutable
        assert buf.to_pybytes() == arr.tobytes()

    # Non-contiguous views cannot be wrapped
    with pytest.raises(ValueError, match="not contiguous"):
        pa.py_buffer(arr.T[::2])
|
|
|
|
|
@pytest.mark.numpy
def test_buffer_address():
    """Buffer.address reflects the wrapped object's memory address."""
    base = b'some data!'
    copy1 = bytearray(base)
    copy2 = bytearray(base)

    buf_a = pa.py_buffer(base)
    buf_b = pa.py_buffer(base)
    buf_c = pa.py_buffer(copy1)
    buf_d = pa.py_buffer(copy2)

    assert buf_a.address > 0
    # Two buffers over the same bytes object share an address...
    assert buf_a.address == buf_b.address
    # ...while distinct bytearrays have distinct storage
    assert buf_c.address != buf_b.address
    assert buf_d.address != buf_c.address

    # NumPy arrays report the ctypes data pointer
    arr = np.arange(5)
    assert pa.py_buffer(arr).address == arr.ctypes.data
|
|
|
|
|
@pytest.mark.numpy
def test_buffer_equals():
    """Buffer equality is by byte content, regardless of backing object."""
    def expect_equal(a, b):
        assert a.equals(b)
        assert a == b
        assert not (a != b)

    def expect_unequal(a, b):
        assert not a.equals(b)
        assert not (a == b)
        assert a != b

    base = b'some data!'
    same = bytearray(base)
    altered = bytearray(base)
    altered[0] = 42

    buf1 = pa.py_buffer(base)
    buf2 = pa.py_buffer(same)
    buf3 = pa.py_buffer(same)
    buf4 = pa.py_buffer(altered)
    # Same bytes viewed through another dtype still compare equal
    buf5 = pa.py_buffer(np.frombuffer(same, dtype=np.int16))

    expect_equal(buf1, buf1)
    expect_equal(buf1, buf2)
    expect_equal(buf2, buf3)
    expect_unequal(buf2, buf4)
    expect_equal(buf2, buf5)
|
|
|
|
|
def test_buffer_eq_bytes():
    """Buffers compare equal to bytes-like objects with the same content."""
    buf = pa.py_buffer(b'some data')
    assert buf == b'some data'
    assert buf == bytearray(b'some data')
    assert buf != b'some dat1'

    # Comparison against str is a type error, not merely "unequal"
    with pytest.raises(TypeError):
        buf == 'some data'
|
|
|
|
|
def test_buffer_getitem():
    """Indexing a Buffer matches indexing the underlying bytes, incl. negatives."""
    raw = bytearray(b'some data!')
    buf = pa.py_buffer(raw)

    size = len(raw)
    for i in range(-size, size - 1):
        assert buf[i] == raw[i]

    # Out-of-range indices raise, just like for a bytes object
    with pytest.raises(IndexError):
        buf[size]
    with pytest.raises(IndexError):
        buf[-size - 1]
|
|
|
|
|
def test_buffer_slicing():
    """Buffer.slice() and __getitem__ agree with Python slicing semantics."""
    data = b'some data!'
    buf = pa.py_buffer(data)

    assert buf.slice(2).equals(pa.py_buffer(b'me data!'))
    assert buf.slice(2, 4).equals(pa.py_buffer(b'me d'))

    # Trivial slices
    assert buf.slice(0).equals(buf)
    assert len(buf.slice(len(buf))) == 0

    # slice() rejects out-of-range offsets; __getitem__ clamps like Python
    with pytest.raises(IndexError):
        buf.slice(-1)
    with pytest.raises(IndexError):
        buf.slice(len(buf) + 1)
    assert buf[11:].to_pybytes() == b""

    with pytest.raises(IndexError):
        buf.slice(1, len(buf))
    assert buf[1:11].to_pybytes() == buf.to_pybytes()[1:]

    with pytest.raises(IndexError):
        buf.slice(1, -1)

    # __getitem__ with a plain slice delegates to slice()
    assert buf[2:].equals(buf.slice(2))
    assert buf[2:5].equals(buf.slice(2, 3))
    assert buf[-5:].equals(buf.slice(len(buf) - 5))
    assert buf[-5:-2].equals(buf.slice(len(buf) - 5, 3))

    # Steps other than 1 are unsupported
    with pytest.raises(IndexError):
        buf[::-1]
    with pytest.raises(IndexError):
        buf[::2]

    # Exhaustive comparison against bytes slicing over a wide index range
    size = len(buf)
    for lo in range(-size * 2, size * 2):
        for hi in range(-size * 2, size * 2):
            assert buf[lo:hi].to_pybytes() == buf.to_pybytes()[lo:hi]
|
|
|
|
|
def test_buffer_hashing():
    """Buffers are unhashable."""
    with pytest.raises(TypeError, match="unhashable"):
        hash(pa.py_buffer(b'123'))
|
|
|
|
|
@pytest.mark.numpy
def test_buffer_protocol_respects_immutability():
    """An immutable Buffer exports a read-only view through the buffer protocol."""
    source = b'12345'
    buf = pa.py_buffer(source)
    arr = np.frombuffer(buf, dtype=np.uint8)
    # NumPy must not be allowed to write through to the immutable bytes
    assert not arr.flags.writeable
|
|
|
|
|
@pytest.mark.numpy
def test_foreign_buffer():
    """foreign_buffer keeps its base object alive exactly as long as needed."""
    base = np.array([1, 2], dtype=np.int32)
    address = base.__array_interface__["data"][0]
    nbytes = base.nbytes
    buf = pa.foreign_buffer(address, nbytes, base)
    ref = weakref.ref(base)
    del base
    # The buffer still owns a reference, so the data remains valid...
    assert np.frombuffer(buf, dtype=np.int32).tolist() == [1, 2]
    assert ref() is not None
    # ...and releasing the buffer releases the base object too
    del buf
    assert ref() is None
|
|
|
|
|
def test_allocate_buffer():
    """allocate_buffer returns an owned, mutable, fixed-size buffer."""
    buf = pa.allocate_buffer(100)
    assert buf.size == 100
    assert buf.is_mutable
    assert buf.parent is None

    # Write into the allocation through a FixedSizeBufferWriter
    prefix = b'abcde'
    pa.FixedSizeBufferWriter(buf).write(prefix)
    assert buf.to_pybytes()[:5] == prefix
|
|
|
|
|
def test_allocate_buffer_resizable():
    """resizable=True yields a ResizableBuffer that can grow in place."""
    buf = pa.allocate_buffer(100, resizable=True)
    assert isinstance(buf, pa.ResizableBuffer)
    buf.resize(200)
    assert buf.size == 200
|
|
|
|
|
@pytest.mark.numpy
def test_non_cpu_buffer(pickle_module):
    """Device (CUDA) buffers support metadata and slicing but not CPU-only ops."""
    cuda = pytest.importorskip("pyarrow.cuda")
    ctx = cuda.Context(0)

    data = np.array([b'testing'])
    cuda_buf = ctx.buffer_from_data(data)
    arr = pa.FixedSizeBinaryArray.from_buffers(pa.binary(7), 1, [None, cuda_buf])
    buf_on_gpu = arr.buffers()[1]

    # Metadata mirrors the underlying device buffer
    assert buf_on_gpu.size == cuda_buf.size
    assert buf_on_gpu.address == cuda_buf.address
    assert buf_on_gpu.is_cpu == cuda_buf.is_cpu
    assert buf_on_gpu.is_mutable

    for fragment in ("<pyarrow.Buffer address=",
                     "size=7 is_cpu=False is_mutable=True>"):
        assert fragment in repr(buf_on_gpu)

    # Slicing stays on-device; round-trip through CudaBuffer to inspect
    sliced = buf_on_gpu.slice(2)
    assert cuda.CudaBuffer.from_buffer(sliced).to_pybytes() == b'sting'

    sliced = buf_on_gpu[2:4]
    assert cuda.CudaBuffer.from_buffer(sliced).to_pybytes() == b'st'

    # equals() works between two on-device buffers
    assert sliced.equals(cuda_buf[2:4])

    # Comparing buffers on different devices is invalid
    msg_device = "Device on which the data resides differs between buffers"
    with pytest.raises(ValueError, match=msg_device):
        buf_on_gpu.equals(pa.py_buffer(data))

    msg = "Implemented only for data on CPU device"

    arr_short = np.array([b'sting'])
    cuda_buf_short = ctx.buffer_from_data(arr_short)
    with pytest.raises(NotImplementedError, match=msg):
        sliced.equals(cuda_buf_short)
    arr_short = pa.FixedSizeBinaryArray.from_buffers(
        pa.binary(5), 1, [None, cuda_buf_short]
    )
    buf_on_gpu_short = arr_short.buffers()[1]
    with pytest.raises(NotImplementedError, match=msg):
        sliced.equals(buf_on_gpu_short)

    # Every byte-level operation requires CPU-resident data
    for operation in (buf_on_gpu.hex,
                      cuda_buf.hex,
                      lambda: buf_on_gpu[1],
                      buf_on_gpu.to_pybytes,
                      lambda: pickle_module.dumps(buf_on_gpu, protocol=4),
                      lambda: pickle_module.dumps(cuda_buf, protocol=4),
                      lambda: memoryview(buf_on_gpu)):
        with pytest.raises(NotImplementedError, match=msg):
            operation()
|
|
|
|
|
def test_cache_options():
    """CacheOptions defaults, keyword overrides, and equality semantics."""
    default = pa.CacheOptions()
    small_hole = pa.CacheOptions(hole_size_limit=1024)
    sized = pa.CacheOptions(hole_size_limit=4096, range_size_limit=8192)
    prefetching = pa.CacheOptions(hole_size_limit=4096,
                                  range_size_limit=8192, prefetch_limit=5)
    eager = pa.CacheOptions(hole_size_limit=4096,
                            range_size_limit=8192, lazy=False)
    from_metrics = pa.CacheOptions.from_network_metrics(
        time_to_first_byte_millis=100,
        transfer_bandwidth_mib_per_sec=200,
        ideal_bandwidth_utilization_frac=0.9,
        max_ideal_request_size_mib=64)

    # (options, hole_size_limit, range_size_limit, lazy, prefetch_limit)
    expectations = [
        (default, 8192, 32 * 1024 * 1024, True, 0),
        (small_hole, 1024, 32 * 1024 * 1024, True, 0),
        (sized, 4096, 8192, True, 0),
        (prefetching, 4096, 8192, True, 5),
        (eager, 4096, 8192, False, 0),
    ]
    for opts, hole, range_limit, lazy, prefetch in expectations:
        assert opts.hole_size_limit == hole
        assert opts.range_size_limit == range_limit
        assert opts.lazy is lazy
        assert opts.prefetch_limit == prefetch

    # Options derived from network metrics are eager
    assert from_metrics.lazy is False

    assert default == default
    assert default != small_hole
    assert small_hole != sized
    assert sized != prefetching
    assert prefetching != eager
    assert from_metrics != default
|
|
|
|
|
def test_cache_options_pickling(pickle_module):
    """CacheOptions round-trips through pickle, defaults and overrides alike."""
    for opts in (pa.CacheOptions(),
                 pa.CacheOptions(hole_size_limit=4096, range_size_limit=8192,
                                 lazy=True, prefetch_limit=5)):
        assert pickle_module.loads(pickle_module.dumps(opts)) == opts
|
|
|
|
|
@pytest.mark.numpy
@pytest.mark.parametrize("compression", [
    pytest.param(
        "bz2", marks=pytest.mark.xfail(raises=pa.lib.ArrowNotImplementedError)
    ),
    "brotli",
    "gzip",
    "lz4",
    "zstd",
    "snappy"
])
def test_compress_decompress(compression):
    """pa.compress/pa.decompress round-trip, both as Buffer and as bytes."""
    if not Codec.is_available(compression):
        pytest.skip("{} support is not built".format(compression))

    INPUT_SIZE = 10000
    original = (np.random.randint(0, 255, size=INPUT_SIZE)
                .astype(np.uint8)
                .tobytes())
    original_buf = pa.py_buffer(original)

    compressed_buf = pa.compress(original_buf, codec=compression)
    compressed_bytes = pa.compress(original, codec=compression,
                                   asbytes=True)
    assert isinstance(compressed_bytes, bytes)

    roundtrip_buf = pa.decompress(compressed_buf, INPUT_SIZE,
                                  codec=compression)
    roundtrip_bytes = pa.decompress(compressed_bytes, INPUT_SIZE,
                                    codec=compression, asbytes=True)
    assert isinstance(roundtrip_bytes, bytes)

    assert roundtrip_buf.equals(original_buf)
    assert roundtrip_bytes == original

    # The decompressed size argument is mandatory
    with pytest.raises(ValueError):
        pa.decompress(compressed_bytes, codec=compression)
|
|
|
|
|
@pytest.mark.numpy
@pytest.mark.parametrize("compression", [
    pytest.param(
        "bz2", marks=pytest.mark.xfail(raises=pa.lib.ArrowNotImplementedError)
    ),
    "brotli",
    "gzip",
    "lz4",
    "zstd",
    "snappy"
])
def test_compression_level(compression):
    """Codec compression levels: bounds, round-trips, and effect on output size."""
    if not Codec.is_available(compression):
        pytest.skip("{} support is not built".format(compression))

    codec = Codec(compression)
    # snappy has no notion of a compression level
    if codec.name == "snappy":
        assert codec.compression_level is None
    else:
        assert isinstance(codec.compression_level, int)

    if compression in ('snappy',):
        assert not Codec.supports_compression_level(compression)
        # Every level-related API must reject codecs without levels
        for query in (lambda: Codec(compression, 0),
                      lambda: Codec.minimum_compression_level(compression),
                      lambda: Codec.maximum_compression_level(compression),
                      lambda: Codec.default_compression_level(compression)):
            with pytest.raises(ValueError):
                query()
        return

    INPUT_SIZE = 10000
    original = (np.random.randint(0, 255, size=INPUT_SIZE)
                .astype(np.uint8)
                .tobytes())
    original_buf = pa.py_buffer(original)

    min_level = Codec.minimum_compression_level(compression)
    max_level = Codec.maximum_compression_level(compression)
    default_level = Codec.default_compression_level(compression)

    assert min_level < max_level
    assert min_level <= default_level <= max_level

    # Every level must round-trip correctly, in Buffer and bytes form
    for level in range(min_level, max_level + 1):
        codec = Codec(compression, level)
        compressed_buf = codec.compress(original_buf)
        compressed_bytes = codec.compress(original, asbytes=True)
        assert isinstance(compressed_bytes, bytes)
        decompressed_buf = codec.decompress(compressed_buf, INPUT_SIZE)
        decompressed_bytes = codec.decompress(compressed_bytes, INPUT_SIZE,
                                              asbytes=True)
        assert isinstance(decompressed_bytes, bytes)
        assert decompressed_buf.equals(original_buf)
        assert decompressed_bytes == original

    # The decompressed size argument is mandatory
    with pytest.raises(ValueError):
        codec.decompress(compressed_bytes)

    # Higher levels should compress harder on compressible-but-noisy data
    if not hasattr(np.random, 'default_rng'):
        pytest.skip('Requires newer version of numpy')
    rng = np.random.default_rng(seed=42)
    values = rng.integers(0, 100, 1000)
    hard_to_compress_buffer = pa.array(values).buffers()[1]

    weakly_compressed = Codec(compression, min_level).compress(
        hard_to_compress_buffer)
    strongly_compressed = Codec(compression, max_level).compress(
        hard_to_compress_buffer)

    assert len(weakly_compressed) > len(strongly_compressed)
|
|
|
|
|
def test_buffer_memoryview_is_immutable():
    """Read-only buffers export read-only memoryviews and bytes."""
    buf = pa.py_buffer(b'some data')
    assert not buf.is_mutable
    assert isinstance(buf, pa.Buffer)

    view = memoryview(buf)
    assert view.readonly

    with pytest.raises(TypeError) as exc:
        view[0] = b'h'
    assert 'cannot modify read-only' in str(exc.value)

    # NOTE(review): CPython's TypeError for bytes item assignment reads
    # "'bytes' object does not support item assignment" — verify the
    # substring below actually matches on the supported Python versions.
    as_bytes = bytes(buf)
    with pytest.raises(TypeError) as exc:
        as_bytes[0] = b'h'
    assert 'cannot modify read-only' in str(exc.value)
|
|
|
|
|
def test_uninitialized_buffer():
    """pa.Buffer cannot be instantiated directly from Python."""
    with pytest.raises(TypeError):
        pa.Buffer()
|
|
|
|
|
def test_memory_output_stream():
    """BufferOutputStream accumulates repeated writes into one buffer."""
    chunk = b'dataabcdef'
    stream = pa.BufferOutputStream()

    REPEATS = 1000
    for _ in range(REPEATS):
        stream.write(chunk)

    result = stream.getvalue()
    assert len(result) == len(chunk) * REPEATS
    assert result.to_pybytes() == chunk * REPEATS
|
|
|
|
|
def test_inmemory_write_after_closed():
    """getvalue() finalizes the stream; later writes are rejected."""
    stream = pa.BufferOutputStream()
    stream.write(b'ok')
    assert not stream.closed
    stream.getvalue()
    # Extracting the value closes the stream...
    assert stream.closed
    # ...so further writes must fail
    with pytest.raises(ValueError):
        stream.write(b'not ok')
|
|
|
|
|
def test_buffer_protocol_ref_counting():
    """Exporting a buffer must not leak references to the source object."""
    def roundtrip(source):
        # py_buffer acquires the buffer; bytearray() copies and releases it
        return bytearray(pa.py_buffer(source))

    copy = roundtrip(b'foo')
    gc.collect()
    assert copy == b'foo'

    # Repeated exports must leave the source's refcount unchanged
    val = b'foo'
    refcount_before = sys.getrefcount(val)
    for _ in range(10):
        roundtrip(val)
    gc.collect()
    assert sys.getrefcount(val) == refcount_before
|
|
|
|
|
@pytest.mark.numpy
def test_nativefile_write_memoryview():
    """write() accepts any buffer-protocol object, but not str."""
    stream = pa.BufferOutputStream()
    data = b'ok'

    # numpy array, bytearray, and pa.Buffer all go through the
    # buffer protocol
    stream.write(np.frombuffer(data, dtype='S1'))
    stream.write(bytearray(data))
    stream.write(pa.py_buffer(data))
    with pytest.raises(TypeError):
        stream.write(data.decode('utf8'))

    assert stream.getvalue().to_pybytes() == data * 3
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_mock_output_stream():
    """MockOutputStream counts bytes identically to a real output stream."""
    chunk = b'dataabcdef'

    mock = pa.MockOutputStream()
    real = pa.BufferOutputStream()
    for _ in range(1000):
        mock.write(chunk)
        real.write(chunk)
    assert mock.size() == len(real.getvalue())

    # Same equivalence when writing through an IPC stream writer
    record_batch = pa.RecordBatch.from_arrays([pa.array([1, 2, 3])], ['a'])

    mock = pa.MockOutputStream()
    real = pa.BufferOutputStream()

    writer_mock = pa.RecordBatchStreamWriter(mock, record_batch.schema)
    writer_real = pa.RecordBatchStreamWriter(real, record_batch.schema)
    writer_mock.write_batch(record_batch)
    writer_real.write_batch(record_batch)
    writer_mock.close()
    writer_real.close()

    assert mock.size() == len(real.getvalue())
|
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.fixture
def sample_disk_data(request, tmpdir):
    """Write 4KB of random bytes to a temp file; yield (path, data)."""
    SIZE = 4096
    data = bytes(random.randint(0, 255) for _ in range(SIZE))

    path = os.path.join(str(tmpdir), guid())
    with open(path, 'wb') as f:
        f.write(data)

    # Best-effort removal after the test
    request.addfinalizer(lambda: _try_delete(path))
    return path, data
|
|
|
|
|
def _check_native_file_reader(FACTORY, sample_data,
                              allow_read_out_of_bounds=True):
    """Exercise read/seek/tell/size on a native file produced by *FACTORY*.

    Pass allow_read_out_of_bounds=False for implementations (memory maps)
    that cannot position past EOF.
    """
    path, data = sample_data

    f = FACTORY(path, mode='r')

    assert f.read(10) == data[:10]
    assert f.read(0) == b''
    assert f.tell() == 10

    assert f.read() == data[10:]
    assert f.size() == len(data)

    f.seek(0)
    assert f.tell() == 0

    # Seeking past EOF then reading yields an empty result (where supported)
    if allow_read_out_of_bounds:
        f.seek(len(data) + 1)
        assert f.tell() == len(data) + 1
        assert f.read(5) == b''

    # Relative and end-anchored seeks
    assert f.seek(3) == 3
    assert f.seek(3, os.SEEK_CUR) == 6
    assert f.tell() == 6

    expected_pos = len(data) - 2
    assert f.seek(-2, os.SEEK_END) == expected_pos
    assert f.tell() == expected_pos
|
|
|
|
|
def test_memory_map_reader(sample_disk_data):
    """Memory maps pass the common reader checks (no reads past EOF)."""
    _check_native_file_reader(pa.memory_map, sample_disk_data,
                              allow_read_out_of_bounds=False)
|
|
|
|
|
def test_memory_map_retain_buffer_reference(sample_disk_data):
    """Buffers read from a memory map stay valid after the map is closed."""
    path, data = sample_disk_data

    cases = []
    with pa.memory_map(path, 'rb') as f:
        for lo in (0, 100, 200):
            cases.append((f.read_buffer(100), data[lo:lo + 100]))

    # The map is closed; the buffers must still hold live references
    gc.collect()
    for buf, expected in cases:
        assert buf.to_pybytes() == expected
|
|
|
|
|
def test_os_file_reader(sample_disk_data):
    """OSFile passes the common native-file reader checks."""
    _check_native_file_reader(pa.OSFile, sample_disk_data)
|
|
|
|
|
def test_os_file_large_seeks():
    """OSFile handles seeks past the 4GB boundary."""
    check_large_seeks(pa.OSFile)
|
|
|
|
|
def _try_delete(path): |
|
try: |
|
os.remove(path) |
|
except os.error: |
|
pass |
|
|
|
|
|
def test_memory_map_writer(tmpdir):
    """Writes through one memory map are visible through others on the same file."""
    if sys.platform == "emscripten":
        pytest.xfail("Multiple memory maps to same file don't work on emscripten")
    SIZE = 4096
    data = bytes(random.randint(0, 255) for _ in range(SIZE))

    path = os.path.join(str(tmpdir), guid())
    with open(path, 'wb') as f:
        f.write(data)

    mm1 = pa.memory_map(path, mode='r+b')
    mm1.seek(10)
    mm1.write(b'peekaboo')
    assert mm1.tell() == 18

    mm1.seek(10)
    assert mm1.read(8) == b'peekaboo'

    # A second map over the same file writes bytes the first map can see
    mm2 = pa.memory_map(path, mode='r+b')
    mm2.seek(10)
    mm2.write(b'booapeak')
    mm2.seek(10)

    mm1.seek(10)
    assert mm1.read(8) == b'booapeak'

    # Re-mapping in 'w' mode does not truncate the file
    mm3 = pa.memory_map(path, mode='w')
    mm3.write(b'foo')

    with pa.memory_map(path) as mm4:
        assert mm4.size() == SIZE

    # 'w' maps are write-only
    with pytest.raises(IOError):
        mm3.read(5)

    mm1.seek(0)
    assert mm1.read(3) == b'foo'
|
|
|
|
|
def test_memory_map_resize(tmpdir):
    """resize() grows a memory map in place; earlier contents survive.

    Writes the first half into a half-sized map, resizes to the full size,
    writes the second half, and verifies the file matches the whole payload.
    """
    SIZE = 4096
    arr = [random.randint(0, 255) for _ in range(SIZE)]
    first_half = bytes(arr[:SIZE // 2])
    second_half = bytes(arr[SIZE // 2:])

    path = os.path.join(str(tmpdir), guid())

    # BUG FIX: the map size is a byte count and must be an integer;
    # the original passed the float SIZE / 2.
    mmap = pa.create_memory_map(path, SIZE // 2)
    mmap.write(first_half)

    mmap.resize(SIZE)
    mmap.write(second_half)

    mmap.close()

    with open(path, 'rb') as f:
        assert f.read() == bytes(arr[:SIZE])
|
|
|
|
|
def test_memory_zero_length(tmpdir):
    """An empty file can be memory-mapped and reports size 0."""
    path = os.path.join(str(tmpdir), guid())
    # Create an empty file
    f = open(path, 'wb')
    f.close()
    with pa.memory_map(path, mode='r+b') as memory_map:
        assert memory_map.size() == 0
|
|
|
|
|
def test_memory_map_large_seeks():
    """>4GB maps work on 64-bit; on 32-bit they fail with a capacity error."""
    on_64bit = sys.maxsize >= 2**32
    expected_error = None if on_64bit else pytest.raises(
        pa.ArrowCapacityError,
        match="Requested memory map length 4294967306 "
              "does not fit in a C size_t")
    check_large_seeks(pa.memory_map, expected_error=expected_error)
|
|
|
|
|
def test_memory_map_close_remove(tmpdir):
    """A closed memory map releases its file so the file can be removed."""
    path = os.path.join(str(tmpdir), guid())
    mm = pa.create_memory_map(path, 4096)
    mm.close()
    assert mm.closed
    # Would fail (notably on Windows) if the mapping still held the file open
    os.remove(path)
|
|
|
|
|
def test_memory_map_deref_remove(tmpdir):
    """Dropping the last reference to a memory map releases the file."""
    path = os.path.join(str(tmpdir), guid())
    pa.create_memory_map(path, 4096)
    # Would fail (notably on Windows) if the map were still open
    os.remove(path)
|
|
|
|
|
def test_os_file_writer(tmpdir):
    """OSFile 'w' truncates and is write-only; 'ab' appends."""
    SIZE = 4096
    data = bytes(random.randint(0, 255) for _ in range(SIZE))

    path = os.path.join(str(tmpdir), guid())
    with open(path, 'wb') as f:
        f.write(data)

    # Opening in 'w' mode truncates the existing file
    writer = pa.OSFile(path, mode='w')
    writer.write(b'foo')

    with pa.OSFile(path) as reader:
        assert reader.size() == 3

    # Write-only files reject reads
    with pytest.raises(IOError):
        writer.read(5)
    writer.close()

    # Append mode extends the existing contents
    with pa.OSFile(path, mode='ab') as appender:
        appender.write(b'bar')
    with pa.OSFile(path) as reader:
        assert reader.size() == 6
|
|
|
|
|
def test_native_file_write_reject_unicode():
    """Native files accept only bytes-like input, never str."""
    stream = pa.BufferOutputStream()
    with pytest.raises(TypeError):
        stream.write('foo')
|
|
|
|
|
def test_native_file_modes(tmpdir):
    """Mode strings normalize to binary and set readable/writable/seekable."""
    path = os.path.join(str(tmpdir), guid())
    with open(path, 'wb') as f:
        f.write(b'foooo')

    # (requested mode, normalized mode, readable, writable, seekable)
    osfile_cases = [
        ('r', 'rb', True, False, True),
        ('rb', 'rb', True, False, True),
        ('w', 'wb', False, True, False),
        ('wb', 'wb', False, True, False),
        ('ab', 'ab', False, True, False),
        ('a', 'ab', False, True, False),
    ]
    for requested, normalized, readable, writable, seekable in osfile_cases:
        with pa.OSFile(path, mode=requested) as f:
            assert f.mode == normalized
            assert bool(f.readable()) is readable
            assert bool(f.writable()) is writable
            assert bool(f.seekable()) is seekable

    # Recreate the content clobbered by the write modes above
    with open(path, 'wb') as f:
        f.write(b'foooo')

    memory_map_cases = [
        ('r', 'rb', True, False, True),
        ('r+', 'rb+', True, True, True),
        ('r+b', 'rb+', True, True, True),
    ]
    for requested, normalized, readable, writable, seekable in memory_map_cases:
        with pa.memory_map(path, requested) as f:
            assert f.mode == normalized
            assert bool(f.readable()) is readable
            assert bool(f.writable()) is writable
            assert bool(f.seekable()) is seekable
|
|
|
|
|
@pytest.mark.xfail(
    sys.platform == "emscripten", reason="umask doesn't work on Emscripten"
)
def test_native_file_permissions(tmpdir):
    """Files created by OSFile and memory_map honor the process umask."""
    # Read the current umask without permanently changing it
    # (os.umask has no pure getter).
    cur_umask = os.umask(0o002)
    os.umask(cur_umask)

    path = os.path.join(str(tmpdir), guid())
    with pa.OSFile(path, mode='w'):
        pass
    assert os.stat(path).st_mode & 0o777 == 0o666 & ~cur_umask

    path = os.path.join(str(tmpdir), guid())
    with pa.memory_map(path, 'w'):
        pass
    assert os.stat(path).st_mode & 0o777 == 0o666 & ~cur_umask
|
|
|
|
|
def test_native_file_raises_ValueError_after_close(tmpdir):
    """Positioning/metadata methods raise ValueError on a closed native file."""
    path = os.path.join(str(tmpdir), guid())
    with open(path, 'wb') as f:
        f.write(b'foooo')

    # Leaving the with-block closes the files.
    with pa.OSFile(path, mode='rb') as os_file:
        assert not os_file.closed
    assert os_file.closed

    with pa.memory_map(path, mode='rb') as mmap_file:
        assert not mmap_file.closed
    assert mmap_file.closed

    files = [os_file,
             mmap_file]

    # Methods (with their call arguments) that must all fail once closed.
    methods = [('tell', ()),
               ('seek', (0,)),
               ('size', ()),
               ('flush', ()),
               ('readable', ()),
               ('writable', ()),
               ('seekable', ())]

    for f in files:
        for method, args in methods:
            with pytest.raises(ValueError):
                getattr(f, method)(*args)
|
|
|
|
|
def test_native_file_TextIOWrapper(tmpdir):
    """Native files can be wrapped with io.TextIOWrapper for text I/O."""
    data = ('foooo\n'
            'barrr\n'
            'bazzz\n')

    path = os.path.join(str(tmpdir), guid())
    with open(path, 'wb') as f:
        f.write(data.encode('utf-8'))

    with TextIOWrapper(pa.OSFile(path, mode='rb')) as fil:
        assert fil.readable()
        res = fil.read()
        assert res == data
    # Closing the wrapper closes the underlying native file too.
    assert fil.closed

    with TextIOWrapper(pa.OSFile(path, mode='rb')) as fil:
        # Iterating the wrapper yields decoded text lines.
        lines = list(fil)
        assert ''.join(lines) == data

    # Writing through the wrapper.
    path2 = os.path.join(str(tmpdir), guid())
    with TextIOWrapper(pa.OSFile(path2, mode='wb')) as fil:
        assert fil.writable()
        fil.write(data)

    with TextIOWrapper(pa.OSFile(path2, mode='rb')) as fil:
        res = fil.read()
        assert res == data
|
|
|
|
|
def test_native_file_TextIOWrapper_perf(tmpdir):
    """TextIOWrapper over a native file reads in chunks, not the whole file."""
    data = b'foo\nquux\n'
    path = str(tmpdir / 'largefile.txt')
    with open(path, 'wb') as f:
        f.write(data * 100_000)

    binary_file = pa.OSFile(path, mode='rb')
    with TextIOWrapper(binary_file) as f:
        # Wrapping alone must not consume the raw stream.
        assert binary_file.tell() == 0
        nbytes = 20_000
        lines = f.readlines(nbytes)
        assert len(lines) == math.ceil(2 * nbytes / len(data))
        # Only a bounded amount should have been pulled from the raw file.
        assert nbytes <= binary_file.tell() <= nbytes * 2
|
|
|
|
|
def test_native_file_read1(tmpdir):
    """read1() returns partial chunks and, concatenated, the full contents."""
    data = b'123\n' * 1_000_000
    path = str(tmpdir / 'largefile.txt')
    with open(path, 'wb') as f:
        f.write(data)

    chunks = []
    with pa.OSFile(path, mode='rb') as f:
        while True:
            # Unbounded read1() still returns less than the whole file.
            b = f.read1()
            assert len(b) < len(data)
            chunks.append(b)
            # Bounded read1() honors the size limit.
            b = f.read1(30_000)
            assert len(b) <= 30_000
            chunks.append(b)
            if not b:
                break

    # All chunks together reconstruct the original data.
    assert b"".join(chunks) == data
|
|
|
|
|
@pytest.mark.pandas
def test_native_file_pandas_text_reader(tmpdir):
    """pandas' CSV reader must not slurp the whole native file for nrows=10."""
    import pandas as pd
    import pandas.testing as tm
    data = b'a,b\n' * 10_000_000
    path = str(tmpdir / 'largefile.txt')
    with open(path, 'wb') as f:
        f.write(data)

    with pa.OSFile(path, mode='rb') as f:
        df = pd.read_csv(f, nrows=10)
        expected = pd.DataFrame({'a': ['a'] * 10, 'b': ['b'] * 10})
        tm.assert_frame_equal(df, expected)
        # Only a bounded prefix of the huge file should have been consumed.
        assert f.tell() <= 256 * 1024
|
|
|
|
|
def test_native_file_open_error():
    """Opening a missing path raises FileNotFoundError for both factories."""
    for factory in (pa.OSFile, pa.memory_map):
        with assert_file_not_found():
            factory('non_existent_file', 'rb')
|
|
|
|
|
|
|
|
|
|
|
def test_buffered_input_stream():
    """A buffered reader reads ahead from the raw stream and closes it."""
    raw = pa.BufferReader(b"123456789")
    buffered = pa.BufferedInputStream(raw, buffer_size=4)
    assert buffered.read(2) == b"12"
    # Read-ahead: a full buffer was pulled from the raw stream.
    assert raw.tell() == 4
    buffered.close()
    # Closing the buffered stream closes the raw stream too.
    assert buffered.closed
    assert raw.closed
|
|
|
|
|
def test_buffered_input_stream_detach_seekable():
    """detach() releases a seekable raw stream that remains usable."""
    buffered = pa.BufferedInputStream(pa.BufferReader(b"123456789"),
                                      buffer_size=4)
    assert buffered.read(2) == b"12"
    raw = buffered.detach()
    assert buffered.closed
    assert not raw.closed
    assert raw.seekable()
    # The raw position reflects the read-ahead done by the buffer.
    assert raw.read(4) == b"5678"
    raw.seek(2)
    assert raw.read(4) == b"3456"
|
|
|
|
|
def test_buffered_input_stream_detach_non_seekable():
    """detach() from a non-seekable source yields a non-seekable raw stream."""
    # An inner BufferedInputStream is itself not seekable.
    inner = pa.BufferedInputStream(pa.BufferReader(b"123456789"),
                                   buffer_size=4)
    outer = pa.BufferedInputStream(inner, buffer_size=4)
    assert outer.read(2) == b"12"
    raw = outer.detach()
    assert outer.closed
    assert not raw.closed
    assert not raw.seekable()
    assert raw.read(4) == b"5678"
    with pytest.raises(EnvironmentError):
        raw.seek(2)
|
|
|
|
|
@pytest.mark.numpy
def test_buffered_output_stream():
    """Writes are held in the buffer until flushed or the stream is closed."""
    np_buf = np.zeros(100, dtype=np.int8)  # zero-initialized destination
    buf = pa.py_buffer(np_buf)

    raw = pa.FixedSizeBufferWriter(buf)
    f = pa.BufferedOutputStream(raw, buffer_size=4)
    f.write(b"12")
    # Not yet flushed: destination still untouched.
    assert np_buf[:4].tobytes() == b'\0\0\0\0'
    f.flush()
    assert np_buf[:4].tobytes() == b'12\0\0'
    f.write(b"3456789")
    f.close()
    # close() flushes the remaining bytes and closes the sink.
    assert f.closed
    assert raw.closed
    assert np_buf[:10].tobytes() == b'123456789\0'
|
|
|
|
|
@pytest.mark.numpy
def test_buffered_output_stream_detach():
    """detach() flushes pending bytes and hands back the raw sink unclosed."""
    np_buf = np.zeros(100, dtype=np.int8)
    buf = pa.py_buffer(np_buf)

    f = pa.BufferedOutputStream(pa.FixedSizeBufferWriter(buf), buffer_size=4)
    f.write(b"12")
    # Still buffered: destination untouched.
    assert np_buf[:4].tobytes() == b'\0\0\0\0'
    raw = f.detach()
    assert f.closed
    assert not raw.closed
    # Detaching flushed the buffered bytes to the sink.
    assert np_buf[:4].tobytes() == b'12\0\0'
|
|
|
|
|
|
|
|
|
|
|
def check_compressed_input(data, fn, compression):
    """Decompress file *fn* via CompressedInputStream and compare to *data*."""
    raw = pa.OSFile(fn, mode="rb")
    with pa.CompressedInputStream(raw, compression) as compressed:
        assert not compressed.closed
        assert compressed.readable()
        assert not compressed.writable()
        assert not compressed.seekable()
        got = compressed.read()
        assert got == data
    # Closing the compressed stream also closes the underlying raw file.
    assert compressed.closed
    assert raw.closed

    # Same check using the read_buffer() API.
    raw = pa.OSFile(fn, mode="rb")
    with pa.CompressedInputStream(raw, compression) as compressed:
        buf = compressed.read_buffer()
        assert isinstance(buf, pa.Buffer)
        assert buf.to_pybytes() == data
|
|
|
|
|
@pytest.mark.gzip
def test_compressed_input_gzip(tmpdir):
    """Files written by Python's gzip module decompress correctly."""
    payload = b"some test data\n" * 10 + b"eof\n"
    path = str(tmpdir / "compressed_input_test.gz")
    with gzip.open(path, "wb") as f:
        f.write(payload)
    check_compressed_input(payload, path, "gzip")
|
|
|
|
|
def test_compressed_input_bz2(tmpdir):
    """Files written by Python's bz2 module decompress correctly
    (skipped when the bz2 codec is not built)."""
    payload = b"some test data\n" * 10 + b"eof\n"
    path = str(tmpdir / "compressed_input_test.bz2")
    with bz2.BZ2File(path, "w") as f:
        f.write(payload)
    try:
        check_compressed_input(payload, path, "bz2")
    except NotImplementedError as e:
        pytest.skip(str(e))
|
|
|
|
|
@pytest.mark.gzip
def test_compressed_input_openfile(tmpdir):
    """CompressedInputStream accepts str paths, pathlib.Path and Python files."""
    if not Codec.is_available("gzip"):
        pytest.skip("gzip support is not built")

    data = b"some test data\n" * 10 + b"eof\n"
    fn = str(tmpdir / "test_compressed_input_openfile.gz")
    with gzip.open(fn, "wb") as f:
        f.write(data)

    # From a str path.
    with pa.CompressedInputStream(fn, "gzip") as compressed:
        buf = compressed.read_buffer()
        assert buf.to_pybytes() == data
    assert compressed.closed

    # From a pathlib.Path.
    with pa.CompressedInputStream(pathlib.Path(fn), "gzip") as compressed:
        buf = compressed.read_buffer()
        assert buf.to_pybytes() == data
    assert compressed.closed

    # From an open Python file.  Use a context manager so the file
    # descriptor is not leaked if the constructor raises.
    with open(fn, "rb") as f:
        with pa.CompressedInputStream(f, "gzip") as compressed:
            buf = compressed.read_buffer()
            assert buf.to_pybytes() == data
        # Closing the compressed stream closed the Python file as well.
        assert f.closed
|
|
|
|
|
def check_compressed_concatenated(data, fn, compression):
    """Reading *fn*, which holds concatenated compressed members, yields *data*."""
    source = pa.OSFile(fn, mode="rb")
    with pa.CompressedInputStream(source, compression) as stream:
        assert stream.read() == data
|
|
|
|
|
@pytest.mark.gzip
def test_compressed_concatenated_gzip(tmpdir):
    """Two gzip members appended to one file decompress as a single stream."""
    payload = b"some test data\n" * 10 + b"eof\n"
    path = str(tmpdir / "compressed_input_test2.gz")
    # Write the payload as two separate gzip members.
    with gzip.open(path, "wb") as f:
        f.write(payload[:50])
    with gzip.open(path, "ab") as f:
        f.write(payload[50:])
    check_compressed_concatenated(payload, path, "gzip")
|
|
|
|
|
@pytest.mark.gzip
def test_compressed_input_invalid():
    """Invalid codecs raise at construction; invalid data raises on read."""
    data = b"foo" * 10
    raw = pa.BufferReader(data)
    with pytest.raises(ValueError):
        pa.CompressedInputStream(raw, "unknown_compression")
    with pytest.raises(TypeError):
        pa.CompressedInputStream(raw, None)

    # The raw bytes are not valid gzip, so decoding fails lazily on read().
    with pa.CompressedInputStream(raw, "gzip") as compressed:
        with pytest.raises(IOError, match="zlib inflate failed"):
            compressed.read()
|
|
|
|
|
def make_compressed_output(data, fn, compression):
    """Compress *data* with CompressedOutputStream and write the result to *fn*."""
    raw = pa.BufferOutputStream()
    with pa.CompressedOutputStream(raw, compression) as compressed:
        assert not compressed.closed
        assert not compressed.readable()
        assert compressed.writable()
        assert not compressed.seekable()
        compressed.write(data)
    # Closing the compressed stream closes the sink as well.
    assert compressed.closed
    assert raw.closed
    with open(fn, "wb") as f:
        f.write(raw.getvalue())
|
|
|
|
|
@pytest.mark.gzip
def test_compressed_output_gzip(tmpdir):
    """Output compressed via Arrow is readable by Python's gzip module."""
    payload = b"some test data\n" * 10 + b"eof\n"
    path = str(tmpdir / "compressed_output_test.gz")
    make_compressed_output(payload, path, "gzip")
    with gzip.open(path, "rb") as f:
        assert f.read() == payload
|
|
|
|
|
def test_compressed_output_bz2(tmpdir):
    """Output compressed via Arrow is readable by Python's bz2 module
    (skipped when the bz2 codec is not built)."""
    payload = b"some test data\n" * 10 + b"eof\n"
    path = str(tmpdir / "compressed_output_test.bz2")
    try:
        make_compressed_output(payload, path, "bz2")
    except NotImplementedError as e:
        pytest.skip(str(e))
    with bz2.BZ2File(path, "r") as f:
        assert f.read() == payload
|
|
|
|
|
def test_output_stream_constructor(tmpdir):
    """CompressedOutputStream accepts both paths and open Python files."""
    if not Codec.is_available("gzip"):
        pytest.skip("gzip support is not built")
    # From a path-like object.
    with pa.CompressedOutputStream(tmpdir / "ctor.gz", "gzip") as stream:
        stream.write(b"test")
    # From an already-open Python file.
    with (tmpdir / "ctor2.gz").open("wb") as f:
        with pa.CompressedOutputStream(f, "gzip") as stream:
            stream.write(b"test")
|
|
|
|
|
@pytest.mark.parametrize(("path", "expected_compression"), [
    ("file.bz2", "bz2"),
    ("file.lz4", "lz4"),
    (pathlib.Path("file.gz"), "gzip"),
    (pathlib.Path("path/to/file.zst"), "zstd"),
])
def test_compression_detection(path, expected_compression):
    """Codec.detect infers the codec from a path's file extension."""
    if not Codec.is_available(expected_compression):
        # Detection still requires the codec to be built into Arrow.
        with pytest.raises(pa.lib.ArrowNotImplementedError):
            Codec.detect(path)
    else:
        codec = Codec.detect(path)
        assert isinstance(codec, Codec)
        assert codec.name == expected_compression
|
|
|
|
|
def test_unknown_compression_raises():
    """Unknown or missing codec names are rejected with clear errors."""
    with pytest.raises(ValueError):
        Codec.is_available('unknown')
    # Constructor: None is a type error, unrecognized names a value error.
    for bad_name, expected_exc in ((None, TypeError), ('unknown', ValueError)):
        with pytest.raises(expected_exc):
            Codec(bad_name)
|
|
|
|
|
@pytest.mark.parametrize("compression", [
    "bz2",
    "brotli",
    "gzip",
    "lz4",
    "zstd",
    pytest.param(
        "snappy",
        marks=pytest.mark.xfail(raises=pa.lib.ArrowNotImplementedError)
    )
])
def test_compressed_roundtrip(compression):
    """Data written via CompressedOutputStream reads back identically."""
    if not Codec.is_available(compression):
        pytest.skip("{} support is not built".format(compression))

    data = b"some test data\n" * 10 + b"eof\n"
    raw = pa.BufferOutputStream()
    with pa.CompressedOutputStream(raw, compression) as compressed:
        compressed.write(data)

    cdata = raw.getvalue()
    # The repetitive payload must actually compress.
    assert len(cdata) < len(data)
    raw = pa.BufferReader(cdata)
    with pa.CompressedInputStream(raw, compression) as compressed:
        got = compressed.read()
        assert got == data
|
|
|
|
|
@pytest.mark.parametrize(
    "compression",
    ["bz2", "brotli", "gzip", "lz4", "zstd"]
)
def test_compressed_recordbatch_stream(compression):
    """An IPC record batch stream survives a compression round-trip."""
    if not Codec.is_available(compression):
        pytest.skip("{} support is not built".format(compression))

    table = pa.Table.from_arrays([pa.array([1, 2, 3, 4, 5])], ['a'])
    raw = pa.BufferOutputStream()
    stream = pa.CompressedOutputStream(raw, compression)
    writer = pa.RecordBatchStreamWriter(stream, table.schema)
    writer.write_table(table, max_chunksize=3)
    writer.close()
    stream.close()  # also flushes remaining compressed data to `raw`
    buf = raw.getvalue()
    stream = pa.CompressedInputStream(pa.BufferReader(buf), compression)
    got_table = pa.RecordBatchStreamReader(stream).read_all()
    assert got_table == table
|
|
|
|
|
|
|
|
|
|
|
# French pangram-style sample exercising many non-ASCII code points;
# used as the payload for the transcoding tests below.
unicode_transcoding_example = (
    "Dès Noël où un zéphyr haï me vêt de glaçons würmiens "
    "je dîne d’exquis rôtis de bœuf au kir à l’aÿ d’âge mûr & cætera !"
)
|
|
|
|
|
def check_transcoding(data, src_encoding, dest_encoding, chunk_sizes):
    """Pipe *data* through transcoding_input_stream and verify the round-trip.

    *chunk_sizes* yields successive read sizes; reading stops at the
    first empty result.
    """
    chunk_sizes = iter(chunk_sizes)
    stream = pa.transcoding_input_stream(
        pa.BufferReader(data.encode(src_encoding)),
        src_encoding, dest_encoding)
    out = []
    while True:
        buf = stream.read(next(chunk_sizes))
        out.append(buf)
        if not buf:
            break
    out = b''.join(out)
    assert out.decode(dest_encoding) == data
|
|
|
|
|
@pytest.mark.parametrize('src_encoding, dest_encoding',
                         [('utf-8', 'utf-16'),
                          ('utf-16', 'utf-8'),
                          ('utf-8', 'utf-32-le'),
                          ('utf-8', 'utf-32-be'),
                          ])
def test_transcoding_input_stream(src_encoding, dest_encoding):
    """Transcoding works with one large read and with tiny chunked reads."""
    # One big read followed by a terminating empty read.
    check_transcoding(unicode_transcoding_example,
                      src_encoding, dest_encoding, [1000, 0])
    # Small read sizes exercise partial multi-byte sequences.
    check_transcoding(unicode_transcoding_example,
                      src_encoding, dest_encoding,
                      itertools.cycle([1, 2, 3, 5]))
|
|
|
|
|
@pytest.mark.parametrize('src_encoding, dest_encoding',
                         [('utf-8', 'utf-8'),
                          ('utf-8', 'UTF8')])
def test_transcoding_no_ops(src_encoding, dest_encoding):
    """When source and destination encodings match, the stream is returned
    unchanged (no transcoding wrapper is inserted)."""
    reader = pa.BufferReader(b"abc123")
    result = pa.transcoding_input_stream(reader, src_encoding, dest_encoding)
    assert result is reader
|
|
|
|
|
@pytest.mark.parametrize('src_encoding, dest_encoding',
                         [('utf-8', 'ascii'),
                          ('utf-8', 'latin-1'),
                          ])
def test_transcoding_encoding_error(src_encoding, dest_encoding):
    """Characters unrepresentable in the destination encoding raise on read."""
    # U+0100 is valid UTF-8 input but cannot be encoded in the destination.
    source = pa.BufferReader("\u0100".encode(src_encoding))
    stream = pa.transcoding_input_stream(source,
                                         src_encoding,
                                         dest_encoding)
    with pytest.raises(UnicodeEncodeError):
        stream.read(1)
|
|
|
|
|
@pytest.mark.parametrize('src_encoding, dest_encoding',
                         [('utf-8', 'utf-16'),
                          ('utf-16', 'utf-8'),
                          ])
def test_transcoding_decoding_error(src_encoding, dest_encoding):
    """Bytes that are invalid in the source encoding raise on read."""
    # 0xff 0xff ... is not decodable as UTF-8 nor as well-formed UTF-16.
    source = pa.BufferReader(b"\xff\xff\xff\xff")
    stream = pa.transcoding_input_stream(source,
                                         src_encoding,
                                         dest_encoding)
    with pytest.raises(UnicodeError):
        stream.read(1)
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.gzip
def test_input_stream_buffer():
    """input_stream accepts buffer-like objects, optionally decompressing."""
    data = b"some test data\n" * 10 + b"eof\n"
    for arg in [pa.py_buffer(data), memoryview(data)]:
        stream = pa.input_stream(arg)
        assert stream.read() == data

    gz_data = gzip.compress(data)
    # Without explicit compression, buffer contents are read verbatim.
    stream = pa.input_stream(memoryview(gz_data))
    assert stream.read() == gz_data
    stream = pa.input_stream(memoryview(gz_data), compression='gzip')
    assert stream.read() == data
|
|
|
|
|
def test_input_stream_duck_typing():
    """Any object providing close/closed/read is accepted by pa.input_stream."""

    class DuckReader:
        # Minimal reader interface expected by pa.input_stream.

        def close(self):
            pass

        @property
        def closed(self):
            return False

        def read(self, nbytes=None):
            return b'hello'

    stream = pa.input_stream(DuckReader())
    assert stream.read(5) == b'hello'
|
|
|
|
|
def test_input_stream_file_path(tmpdir):
    """pa.input_stream accepts py.path, str and pathlib.Path file paths."""
    data = b"some test data\n" * 10 + b"eof\n"
    file_path = tmpdir / 'input_stream'
    with open(str(file_path), 'wb') as f:
        f.write(data)

    for source in (file_path, str(file_path), pathlib.Path(str(file_path))):
        stream = pa.input_stream(source)
        assert stream.read() == data
|
|
|
|
|
@pytest.mark.gzip
def test_input_stream_file_path_compressed(tmpdir):
    """Compression is auto-detected from the .gz suffix unless overridden."""
    data = b"some test data\n" * 10 + b"eof\n"
    gz_data = gzip.compress(data)
    file_path = tmpdir / 'input_stream.gz'
    with open(str(file_path), 'wb') as f:
        f.write(gz_data)

    # Auto-detection from the file extension, for all path flavors.
    stream = pa.input_stream(file_path)
    assert stream.read() == data
    stream = pa.input_stream(str(file_path))
    assert stream.read() == data
    stream = pa.input_stream(pathlib.Path(str(file_path)))
    assert stream.read() == data

    # Explicit compression, and explicit opt-out of decompression.
    stream = pa.input_stream(file_path, compression='gzip')
    assert stream.read() == data
    stream = pa.input_stream(file_path, compression=None)
    assert stream.read() == gz_data
|
|
|
|
|
def test_input_stream_file_path_buffered(tmpdir):
    """buffer_size > 0 wraps the file in a BufferedInputStream; 0 disables it."""
    data = b"some test data\n" * 10 + b"eof\n"
    file_path = tmpdir / 'input_stream.buffered'
    with open(str(file_path), 'wb') as f:
        f.write(data)

    stream = pa.input_stream(file_path, buffer_size=32)
    assert isinstance(stream, pa.BufferedInputStream)
    assert stream.read() == data
    stream = pa.input_stream(str(file_path), buffer_size=64)
    assert isinstance(stream, pa.BufferedInputStream)
    assert stream.read() == data
    stream = pa.input_stream(pathlib.Path(str(file_path)), buffer_size=1024)
    assert isinstance(stream, pa.BufferedInputStream)
    assert stream.read() == data

    # buffer_size=0 means unbuffered: a plain OSFile is returned.
    unbuffered_stream = pa.input_stream(file_path, buffer_size=0)
    assert isinstance(unbuffered_stream, pa.OSFile)

    # Invalid buffer sizes are rejected.
    msg = 'Buffer size must be larger than zero'
    with pytest.raises(ValueError, match=msg):
        pa.input_stream(file_path, buffer_size=-1)
    with pytest.raises(TypeError):
        pa.input_stream(file_path, buffer_size='million')
|
|
|
|
|
@pytest.mark.gzip
def test_input_stream_file_path_compressed_and_buffered(tmpdir):
    """Buffering composes with explicit or auto-detected decompression."""
    data = b"some test data\n" * 100 + b"eof\n"
    gz_data = gzip.compress(data)
    file_path = tmpdir / 'input_stream_compressed_and_buffered.gz'
    with open(str(file_path), 'wb') as f:
        f.write(gz_data)

    stream = pa.input_stream(file_path, buffer_size=32, compression='gzip')
    assert stream.read() == data
    # Compression inferred from the .gz suffix.
    stream = pa.input_stream(str(file_path), buffer_size=64)
    assert stream.read() == data
    stream = pa.input_stream(pathlib.Path(str(file_path)), buffer_size=1024)
    assert stream.read() == data
|
|
|
|
|
@pytest.mark.gzip
def test_input_stream_python_file(tmpdir):
    """input_stream wraps Python file objects (BytesIO and real files)."""
    data = b"some test data\n" * 10 + b"eof\n"
    bio = BytesIO(data)

    stream = pa.input_stream(bio)
    assert stream.read() == data

    gz_data = gzip.compress(data)
    bio = BytesIO(gz_data)
    # No filename available, so no compression auto-detection: raw bytes.
    stream = pa.input_stream(bio)
    assert stream.read() == gz_data
    bio.seek(0)
    stream = pa.input_stream(bio, compression='gzip')
    assert stream.read() == data

    file_path = tmpdir / 'input_stream'
    with open(str(file_path), 'wb') as f:
        f.write(data)
    with open(str(file_path), 'rb') as f:
        stream = pa.input_stream(f)
        assert stream.read() == data
|
|
|
|
|
@pytest.mark.gzip
def test_input_stream_native_file():
    """Native files pass through unchanged unless decompression is requested."""
    data = b"some test data\n" * 10 + b"eof\n"
    gz_data = gzip.compress(data)

    # Without compression, the reader object itself is returned.
    reader = pa.BufferReader(gz_data)
    assert pa.input_stream(reader) is reader

    # With compression, a decompressing wrapper is returned instead.
    reader = pa.BufferReader(gz_data)
    stream = pa.input_stream(reader, compression='gzip')
    assert stream.read() == data
|
|
|
|
|
def test_input_stream_errors(tmpdir):
    """input_stream rejects bad compression values and unsuitable sources."""
    buf = memoryview(b"")
    with pytest.raises(ValueError):
        pa.input_stream(buf, compression="foo")

    # Mutable bytearrays and text-mode streams are not valid input sources.
    for arg in [bytearray(), StringIO()]:
        with pytest.raises(TypeError):
            pa.input_stream(arg)

    with assert_file_not_found():
        pa.input_stream("non_existent_file")

    # A write-only Python file cannot be used for input.
    with open(str(tmpdir / 'new_file'), 'wb') as f:
        with pytest.raises(TypeError, match="readable file expected"):
            pa.input_stream(f)
|
|
|
|
|
def test_output_stream_buffer():
    """output_stream can write into pre-allocated buffers and memoryviews."""
    data = b"some test data\n" * 10 + b"eof\n"
    for wrap in (pa.py_buffer, memoryview):
        sink = bytearray(len(data))
        stream = pa.output_stream(wrap(sink))
        stream.write(data)
        assert sink == data
|
|
|
|
|
def test_output_stream_duck_typing():
    """Any object providing close/closed/write is accepted by pa.output_stream."""

    class DuckWriter:
        # Minimal writer interface expected by pa.output_stream;
        # captures written bytes in an in-memory sink.
        def __init__(self):
            self.buf = pa.BufferOutputStream()

        def close(self):
            pass

        @property
        def closed(self):
            return False

        def write(self, data):
            self.buf.write(data)

    duck_writer = DuckWriter()
    stream = pa.output_stream(duck_writer)
    assert stream.write(b'hello')
    assert duck_writer.buf.getvalue().to_pybytes() == b'hello'
|
|
|
|
|
def test_output_stream_file_path(tmpdir):
    """pa.output_stream accepts py.path, str and pathlib.Path file paths."""
    data = b"some test data\n" * 10 + b"eof\n"
    file_path = tmpdir / 'output_stream'

    def check_data(target, payload):
        # Write through pa.output_stream, then verify with plain open().
        with pa.output_stream(target) as stream:
            stream.write(payload)
        with open(str(target), 'rb') as f:
            assert f.read() == payload

    for target in (file_path, str(file_path), pathlib.Path(str(file_path))):
        check_data(target, data)
|
|
|
|
|
@pytest.mark.gzip
def test_output_stream_file_path_compressed(tmpdir):
    """Output compression is auto-detected from the .gz suffix unless overridden."""
    data = b"some test data\n" * 10 + b"eof\n"
    file_path = tmpdir / 'output_stream.gz'

    def check_data(file_path, data, **kwargs):
        # Write through pa.output_stream and return the raw on-disk bytes.
        with pa.output_stream(file_path, **kwargs) as stream:
            stream.write(data)
        with open(str(file_path), 'rb') as f:
            return f.read()

    # Auto-detection from the extension, for all path flavors.
    assert gzip.decompress(check_data(file_path, data)) == data
    assert gzip.decompress(check_data(str(file_path), data)) == data
    assert gzip.decompress(
        check_data(pathlib.Path(str(file_path)), data)) == data

    assert gzip.decompress(
        check_data(file_path, data, compression='gzip')) == data
    # compression=None disables the suffix-based detection.
    assert check_data(file_path, data, compression=None) == data

    with pytest.raises(ValueError, match='Invalid value for compression'):
        assert check_data(file_path, data, compression='rabbit') == data
|
|
|
|
|
def test_output_stream_file_path_buffered(tmpdir):
    """buffer_size > 0 wraps the sink in a BufferedOutputStream; 0 disables it."""
    data = b"some test data\n" * 10 + b"eof\n"
    file_path = tmpdir / 'output_stream.buffered'

    def check_data(file_path, data, **kwargs):
        # Write through pa.output_stream and return what landed on disk.
        with pa.output_stream(file_path, **kwargs) as stream:
            if kwargs.get('buffer_size', 0) > 0:
                assert isinstance(stream, pa.BufferedOutputStream)
            stream.write(data)
        with open(str(file_path), 'rb') as f:
            return f.read()

    # buffer_size=0 means unbuffered: a plain OSFile is returned.
    unbuffered_stream = pa.output_stream(file_path, buffer_size=0)
    assert isinstance(unbuffered_stream, pa.OSFile)

    msg = 'Buffer size must be larger than zero'
    with pytest.raises(ValueError, match=msg):
        assert check_data(file_path, data, buffer_size=-128) == data

    assert check_data(file_path, data, buffer_size=32) == data
    assert check_data(file_path, data, buffer_size=1024) == data
    assert check_data(str(file_path), data, buffer_size=32) == data

    result = check_data(pathlib.Path(str(file_path)), data, buffer_size=32)
    assert result == data
|
|
|
|
|
@pytest.mark.gzip
def test_output_stream_file_path_compressed_and_buffered(tmpdir):
    """Buffered writes still produce a valid gzip file."""
    data = b"some test data\n" * 100 + b"eof\n"
    file_path = tmpdir / 'output_stream_compressed_and_buffered.gz'

    def check_data(file_path, data, **kwargs):
        # Write through pa.output_stream and return the raw on-disk bytes.
        with pa.output_stream(file_path, **kwargs) as stream:
            stream.write(data)
        with open(str(file_path), 'rb') as f:
            return f.read()

    # Compression inferred from the .gz suffix.
    result = check_data(file_path, data, buffer_size=32)
    assert gzip.decompress(result) == data

    result = check_data(file_path, data, buffer_size=1024)
    assert gzip.decompress(result) == data

    # Explicit compression combined with buffering.
    result = check_data(file_path, data, buffer_size=1024, compression='gzip')
    assert gzip.decompress(result) == data
|
|
|
|
|
def test_output_stream_destructor(tmpdir):
    """Destroying an output stream flushes and closes it implicitly."""
    data = b"some test data\n"
    file_path = tmpdir / 'output_stream.buffered'

    def check_data(file_path, data, **kwargs):
        stream = pa.output_stream(file_path, **kwargs)
        stream.write(data)
        # Drop the only reference; finalization must flush pending bytes.
        del stream
        gc.collect()
        with open(str(file_path), 'rb') as f:
            return f.read()

    assert check_data(file_path, data, buffer_size=0) == data
    assert check_data(file_path, data, buffer_size=1024) == data
|
|
|
|
|
@pytest.mark.gzip
def test_output_stream_python_file(tmpdir):
    """output_stream can wrap an already-open Python file object."""
    data = b"some test data\n" * 10 + b"eof\n"

    def check_data(data, **kwargs):
        # Write through a wrapped Python file, then read the file back.
        fn = str(tmpdir / 'output_stream_file')
        with open(fn, 'wb') as f:
            with pa.output_stream(f, **kwargs) as stream:
                stream.write(data)
        with open(fn, 'rb') as f:
            return f.read()

    assert check_data(data) == data
    assert gzip.decompress(check_data(data, compression='gzip')) == data
|
|
|
|
|
def test_output_stream_errors(tmpdir):
    """output_stream rejects bad compression values and unsuitable sinks."""
    buf = memoryview(bytearray())
    with pytest.raises(ValueError):
        pa.output_stream(buf, compression="foo")

    # Raw bytearrays and text-mode streams are not valid output sinks.
    for arg in [bytearray(), StringIO()]:
        with pytest.raises(TypeError):
            pa.output_stream(arg)

    # A read-only Python file cannot be used for output.
    fn = str(tmpdir / 'new_file')
    with open(fn, 'wb') as f:
        pass
    with open(fn, 'rb') as f:
        with pytest.raises(TypeError, match="writable file expected"):
            pa.output_stream(f)
|
|