// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <cstdint>
#include <memory>
#include <optional>
#include <vector>

#include "arrow/io/caching.h"
#include "arrow/ipc/type_fwd.h"
#include "arrow/status.h"
#include "arrow/type_fwd.h"
#include "arrow/util/compression.h"
#include "arrow/util/visibility.h"
namespace arrow {
class MemoryPool;
namespace ipc {
// ARROW-109: We set this number arbitrarily to help catch user mistakes. For
// deeply nested schemas, it is expected that the user will explicitly indicate
// the maximum allowed recursion depth.
constexpr int kMaxNestingDepth = 64;

/// \brief Options for writing Arrow IPC messages
struct ARROW_EXPORT IpcWriteOptions {
  /// \brief If true, allow field lengths that don't fit in a signed 32-bit int.
  ///
  /// Some implementations may not be able to parse streams created with this option.
  bool allow_64bit = false;

  /// \brief The maximum permitted schema nesting depth.
  int max_recursion_depth = kMaxNestingDepth;

  /// \brief Write padding after memory buffers up to this multiple of bytes.
  int32_t alignment = 8;

  /// \brief Write the pre-0.15.0 IPC message format
  ///
  /// This legacy format consists of a 4-byte prefix instead of an 8-byte one.
  bool write_legacy_ipc_format = false;

  /// \brief The memory pool to use for allocations made during IPC writing
  ///
  /// While Arrow IPC is predominantly zero-copy, it may have to allocate
  /// memory in some cases (for example if compression is enabled).
  MemoryPool* memory_pool = default_memory_pool();

  /// \brief Compression codec to use for record batch body buffers
  ///
  /// May only be UNCOMPRESSED, LZ4_FRAME or ZSTD.
  std::shared_ptr<util::Codec> codec;

  /// \brief Minimum space savings percentage required for compression to be applied
  ///
  /// Space savings is calculated as (1.0 - compressed_size / uncompressed_size).
  ///
  /// For example, if min_space_savings = 0.1, a 100-byte body buffer won't undergo
  /// compression if its expected compressed size exceeds 90 bytes. If this option is
  /// unset, compression will be used indiscriminately. If no codec was supplied, this
  /// option is ignored.
  ///
  /// Values outside of the range [0,1] are handled as errors.
  ///
  /// Note that enabling this option may result in unreadable data for Arrow C++
  /// versions prior to 12.0.0.
  std::optional<double> min_space_savings;

  /// \brief Use global CPU thread pool to parallelize any computational tasks
  /// like compression
  bool use_threads = true;

  /// \brief Whether to emit dictionary deltas
  ///
  /// If false, a changed dictionary for a given field will emit a full
  /// dictionary replacement.
  /// If true, a changed dictionary will be compared against the previous
  /// version. If possible, a dictionary delta will be emitted, otherwise
  /// a full dictionary replacement.
  ///
  /// Default is false to maximize stream compatibility.
  ///
  /// Also, note that if a changed dictionary is a nested dictionary,
  /// then a delta is never emitted, for compatibility with the read path.
  bool emit_dictionary_deltas = false;

  /// \brief Whether to unify dictionaries for the IPC file format
  ///
  /// The IPC file format doesn't support dictionary replacements.
  /// Therefore, chunks of a column with a dictionary type must have the same
  /// dictionary in each record batch (or an extended dictionary + delta).
  ///
  /// If this option is true, RecordBatchWriter::WriteTable will attempt
  /// to unify dictionaries across each table column. If this option is
  /// false, incompatible dictionaries across a table column will simply
  /// raise an error.
  ///
  /// Note that enabling this option has a runtime cost. Also, not all types
  /// currently support dictionary unification.
  ///
  /// This option is ignored for IPC streams, which support dictionary replacement
  /// and deltas.
  bool unify_dictionaries = false;

  /// \brief Format version to use for IPC messages and their metadata.
  ///
  /// Presently the V5 version is used by default (readable by 1.0.0 and later).
  /// V4 is also available (readable by 0.8.0 and later).
  MetadataVersion metadata_version = MetadataVersion::V5;

  static IpcWriteOptions Defaults();
};
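
// A minimal usage sketch for IpcWriteOptions, assuming MakeFileWriter from
// "arrow/ipc/writer.h" and FileOutputStream from "arrow/io/file.h"; exact
// signatures may differ between Arrow versions. It shows enabling ZSTD
// compression of record batch body buffers and requiring at least 10% space
// savings before a compressed buffer is kept.
//
//   #include "arrow/io/file.h"
//   #include "arrow/ipc/writer.h"
//   #include "arrow/result.h"
//   #include "arrow/table.h"
//
//   arrow::Status WriteCompressedIpcFile(const std::shared_ptr<arrow::Table>& table,
//                                        const std::string& path) {
//     auto options = arrow::ipc::IpcWriteOptions::Defaults();
//     // Compress record batch body buffers with ZSTD.
//     ARROW_ASSIGN_OR_RAISE(auto codec,
//                           arrow::util::Codec::Create(arrow::Compression::ZSTD));
//     options.codec = std::move(codec);
//     // Keep a compressed buffer only if it saves at least 10% of space.
//     options.min_space_savings = 0.1;
//     ARROW_ASSIGN_OR_RAISE(auto sink, arrow::io::FileOutputStream::Open(path));
//     ARROW_ASSIGN_OR_RAISE(auto writer,
//                           arrow::ipc::MakeFileWriter(sink, table->schema(), options));
//     ARROW_RETURN_NOT_OK(writer->WriteTable(*table));
//     return writer->Close();
//   }
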
/// \brief Options for reading Arrow IPC messages
struct ARROW_EXPORT IpcReadOptions {
  /// \brief The maximum permitted schema nesting depth.
  int max_recursion_depth = kMaxNestingDepth;

  /// \brief The memory pool to use for allocations made during IPC reading
  ///
  /// While Arrow IPC is predominantly zero-copy, it may have to allocate
  /// memory in some cases (for example if compression is enabled).
  MemoryPool* memory_pool = default_memory_pool();

  /// \brief Top-level schema fields to include when deserializing RecordBatch.
  ///
  /// If empty (the default), return all deserialized fields.
  /// If non-empty, the values are the indices of fields in the top-level schema.
  std::vector<int> included_fields;

  /// \brief Use global CPU thread pool to parallelize any computational tasks
  /// like decompression
  bool use_threads = true;

  /// \brief Whether to convert incoming data to platform-native endianness
  ///
  /// If the endianness of the received schema is not equal to platform-native
  /// endianness, then all buffers with endian-sensitive data will be byte-swapped.
  /// This includes the value buffers of numeric types, temporal types, decimal
  /// types, as well as the offset buffers of variable-sized binary and list-like
  /// types.
  ///
  /// Endianness conversion is achieved by the RecordBatchFileReader,
  /// RecordBatchStreamReader and StreamDecoder classes.
  bool ensure_native_endian = true;

  /// \brief Options to control caching behavior when pre-buffering is requested
  ///
  /// The lazy property will always be reset to true to deliver the expected behavior.
  io::CacheOptions pre_buffer_cache_options = io::CacheOptions::LazyDefaults();

  static IpcReadOptions Defaults();
};
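
// A minimal usage sketch for IpcReadOptions, assuming RecordBatchFileReader from
// "arrow/ipc/reader.h" and ReadableFile from "arrow/io/file.h"; exact signatures
// may differ between Arrow versions. It shows deserializing only the first two
// top-level fields of an IPC file and collecting the batches into a Table.
//
//   #include "arrow/io/file.h"
//   #include "arrow/ipc/reader.h"
//   #include "arrow/result.h"
//   #include "arrow/table.h"
//
//   arrow::Result<std::shared_ptr<arrow::Table>> ReadFirstTwoColumns(
//       const std::string& path) {
//     auto options = arrow::ipc::IpcReadOptions::Defaults();
//     // Deserialize only fields 0 and 1 of the top-level schema.
//     options.included_fields = {0, 1};
//     ARROW_ASSIGN_OR_RAISE(auto file, arrow::io::ReadableFile::Open(path));
//     ARROW_ASSIGN_OR_RAISE(auto reader,
//                           arrow::ipc::RecordBatchFileReader::Open(file, options));
//     std::vector<std::shared_ptr<arrow::RecordBatch>> batches;
//     for (int i = 0; i < reader->num_record_batches(); ++i) {
//       ARROW_ASSIGN_OR_RAISE(auto batch, reader->ReadRecordBatch(i));
//       batches.push_back(std::move(batch));
//     }
//     return arrow::Table::FromRecordBatches(reader->schema(), batches);
//   }
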
namespace internal {

Status CheckCompressionSupported(Compression::type codec);

}  // namespace internal
}  // namespace ipc
}  // namespace arrow