File size: 9,432 Bytes
9c6594c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 |
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <cstdint>
#include "arrow/util/endian.h"
#include "parquet/platform.h"
#include "parquet/schema.h"
namespace parquet::internal {
struct PARQUET_EXPORT LevelInfo {
LevelInfo()
: null_slot_usage(1), def_level(0), rep_level(0), repeated_ancestor_def_level(0) {}
LevelInfo(int32_t null_slots, int32_t definition_level, int32_t repetition_level,
int32_t repeated_ancestor_definition_level)
: null_slot_usage(null_slots),
def_level(static_cast<int16_t>(definition_level)),
rep_level(static_cast<int16_t>(repetition_level)),
repeated_ancestor_def_level(
static_cast<int16_t>(repeated_ancestor_definition_level)) {}
bool operator==(const LevelInfo& b) const {
return null_slot_usage == b.null_slot_usage && def_level == b.def_level &&
rep_level == b.rep_level &&
repeated_ancestor_def_level == b.repeated_ancestor_def_level;
}
bool HasNullableValues() const { return repeated_ancestor_def_level < def_level; }
// How many slots an undefined but present (i.e. null) element in
// parquet consumes when decoding to Arrow.
// "Slot" is used in the same context as the Arrow specification
// (i.e. a value holder).
// This is only ever >1 for descendents of FixedSizeList.
int32_t null_slot_usage = 1;
// The definition level at which the value for the field
// is considered not null (definition levels greater than
// or equal to this value indicate a not-null
// value for the field). For list fields definition levels
// greater than or equal to this field indicate a present,
// possibly null, child value.
int16_t def_level = 0;
// The repetition level corresponding to this element
// or the closest repeated ancestor. Any repetition
// level less than this indicates either a new list OR
// an empty list (which is determined in conjunction
// with definition levels).
int16_t rep_level = 0;
// The definition level indicating the level at which the closest
// repeated ancestor is not empty. This is used to discriminate
// between a value less than |def_level| being null or excluded entirely.
// For instance if we have an arrow schema like:
// list(struct(f0: int)). Then then there are the following
// definition levels:
// 0 = null list
// 1 = present but empty list.
// 2 = a null value in the list
// 3 = a non null struct but null integer.
// 4 = a present integer.
// When reconstructing, the struct and integer arrays'
// repeated_ancestor_def_level would be 2. Any
// def_level < 2 indicates that there isn't a corresponding
// child value in the list.
// i.e. [null, [], [null], [{f0: null}], [{f0: 1}]]
// has the def levels [0, 1, 2, 3, 4]. The actual
// struct array is only of length 3: [not-set, set, set] and
// the int array is also of length 3: [N/A, null, 1].
//
int16_t repeated_ancestor_def_level = 0;
/// Increments levels according to the cardinality of node.
void Increment(const schema::Node& node) {
if (node.is_repeated()) {
IncrementRepeated();
return;
}
if (node.is_optional()) {
IncrementOptional();
return;
}
}
/// Increments level for a optional node.
void IncrementOptional() { def_level++; }
/// Increments levels for the repeated node. Returns
/// the previous ancestor_list_def_level.
int16_t IncrementRepeated() {
int16_t last_repeated_ancestor = repeated_ancestor_def_level;
// Repeated fields add both a repetition and definition level. This is used
// to distinguish between an empty list and a list with an item in it.
++rep_level;
++def_level;
// For levels >= repeated_ancestor_def_level it indicates the list was
// non-null and had at least one element. This is important
// for later decoding because we need to add a slot for these
// values. for levels < current_def_level no slots are added
// to arrays.
repeated_ancestor_def_level = def_level;
return last_repeated_ancestor;
}
// Calculates and returns LevelInfo for a column descriptor.
static LevelInfo ComputeLevelInfo(const ColumnDescriptor* descr) {
LevelInfo level_info;
level_info.def_level = descr->max_definition_level();
level_info.rep_level = descr->max_repetition_level();
int16_t min_spaced_def_level = descr->max_definition_level();
const ::parquet::schema::Node* node = descr->schema_node().get();
while (node && !node->is_repeated()) {
if (node->is_optional()) {
min_spaced_def_level--;
}
node = node->parent();
}
level_info.repeated_ancestor_def_level = min_spaced_def_level;
return level_info;
}
friend std::ostream& operator<<(std::ostream& os, const LevelInfo& levels) {
// This print method is to silence valgrind issues. What's printed
// is not important because all asserts happen directly on
// members.
os << "{def=" << levels.def_level << ", rep=" << levels.rep_level
<< ", repeated_ancestor_def=" << levels.repeated_ancestor_def_level;
if (levels.null_slot_usage > 1) {
os << ", null_slot_usage=" << levels.null_slot_usage;
}
os << "}";
return os;
}
};
// Bundle of inputs and outputs shared by the validity-bitmap reconstruction
// functions declared in this file.
struct PARQUET_EXPORT ValidityBitmapInputOutput {
  // Input only.
  // Upper bound on values_read: the number of values actually read must be
  // less than or equal to this value. If the bound is exceeded, methods will
  // throw a ParquetException; exceeding it indicates either a corrupt or an
  // incorrectly written file.
  int64_t values_read_upper_bound{0};

  // Output only.
  // The number of values added (logically the number of elements of an
  // Arrow array).
  int64_t values_read{0};

  // Input/Output.
  // The number of nulls encountered.
  int64_t null_count{0};

  // Output only.
  // The validity bitmap to populate. May be null only for
  // DefRepLevelsToList (if all that is needed is list offsets).
  uint8_t* valid_bits{NULLPTR};

  // Input only.
  // Offset into valid_bits to start at.
  int64_t valid_bits_offset{0};
};
// Converts def_levels to validity bitmaps for non-list arrays and for structs that
// have at least one member that is not a list and has no list descendants.
// For lists use DefRepLevelsToList; for structs where every member is a list or
// has a list descendant use DefRepLevelsToBitmap.
//
// level_info carries the level thresholds of the field being reconstructed and
// output carries the in/out state; see ValidityBitmapInputOutput for which of its
// fields are inputs and which are outputs.
// NOTE(review): num_def_levels is presumably the number of entries in
// def_levels - confirm against the implementation.
void PARQUET_EXPORT DefLevelsToBitmap(const int16_t* def_levels, int64_t num_def_levels,
LevelInfo level_info,
ValidityBitmapInputOutput* output);
// Reconstructs a validity bitmap and list offsets for list arrays based on
// def/rep levels. The first element of offsets will not be modified if rep_levels
// starts with a new list. The first element of offsets will be used when calculating
// the next offset. See the documentation on DefLevelsToBitmap for when to use this
// method vs the other ones in this file for reconstruction.
//
// Offsets must be sized to 1 + values_read_upper_bound.
//
// Two overloads are declared; they differ only in the element type of offsets
// (int32_t here, int64_t below).
void PARQUET_EXPORT DefRepLevelsToList(const int16_t* def_levels,
const int16_t* rep_levels, int64_t num_def_levels,
LevelInfo level_info,
ValidityBitmapInputOutput* output,
int32_t* offsets);
// Overload taking 64-bit offsets.
void PARQUET_EXPORT DefRepLevelsToList(const int16_t* def_levels,
const int16_t* rep_levels, int64_t num_def_levels,
LevelInfo level_info,
ValidityBitmapInputOutput* output,
int64_t* offsets);
// Reconstructs a validity bitmap for a struct where every member is a list or has
// a list descendant. See the documentation on DefLevelsToBitmap for more
// details on when to use this method rather than the other ones defined above.
void PARQUET_EXPORT DefRepLevelsToBitmap(const int16_t* def_levels,
const int16_t* rep_levels,
int64_t num_def_levels, LevelInfo level_info,
ValidityBitmapInputOutput* output);
// This is exposed to ensure we can properly test a software simulated pext function
// (i.e. it isn't hidden by runtime dispatch). As the name says, it is meant to be
// called from tests only.
// NOTE(review): presumably returns the bits of `bitmap` at the positions set in
// `selection`, packed into the low-order bits (pext semantics) - confirm against
// the implementation.
uint64_t PARQUET_EXPORT TestOnlyExtractBitsSoftware(uint64_t bitmap, uint64_t selection);
} // namespace parquet::internal
|