// Licensed to the Apache Software Foundation (ASF) under one | |
// or more contributor license agreements. See the NOTICE file | |
// distributed with this work for additional information | |
// regarding copyright ownership. The ASF licenses this file | |
// to you under the Apache License, Version 2.0 (the | |
// "License"); you may not use this file except in compliance | |
// with the License. You may obtain a copy of the License at | |
// | |
// http://www.apache.org/licenses/LICENSE-2.0 | |
// | |
// Unless required by applicable law or agreed to in writing, | |
// software distributed under the License is distributed on an | |
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | |
// KIND, either express or implied. See the License for the | |
// specific language governing permissions and limitations | |
// under the License. | |
namespace parquet::internal { | |
struct PARQUET_EXPORT LevelInfo { | |
LevelInfo() | |
: null_slot_usage(1), def_level(0), rep_level(0), repeated_ancestor_def_level(0) {} | |
LevelInfo(int32_t null_slots, int32_t definition_level, int32_t repetition_level, | |
int32_t repeated_ancestor_definition_level) | |
: null_slot_usage(null_slots), | |
def_level(static_cast<int16_t>(definition_level)), | |
rep_level(static_cast<int16_t>(repetition_level)), | |
repeated_ancestor_def_level( | |
static_cast<int16_t>(repeated_ancestor_definition_level)) {} | |
bool operator==(const LevelInfo& b) const { | |
return null_slot_usage == b.null_slot_usage && def_level == b.def_level && | |
rep_level == b.rep_level && | |
repeated_ancestor_def_level == b.repeated_ancestor_def_level; | |
} | |
bool HasNullableValues() const { return repeated_ancestor_def_level < def_level; } | |
// How many slots an undefined but present (i.e. null) element in | |
// parquet consumes when decoding to Arrow. | |
// "Slot" is used in the same context as the Arrow specification | |
// (i.e. a value holder). | |
// This is only ever >1 for descendents of FixedSizeList. | |
int32_t null_slot_usage = 1; | |
// The definition level at which the value for the field | |
// is considered not null (definition levels greater than | |
// or equal to this value indicate a not-null | |
// value for the field). For list fields definition levels | |
// greater than or equal to this field indicate a present, | |
// possibly null, child value. | |
int16_t def_level = 0; | |
// The repetition level corresponding to this element | |
// or the closest repeated ancestor. Any repetition | |
// level less than this indicates either a new list OR | |
// an empty list (which is determined in conjunction | |
// with definition levels). | |
int16_t rep_level = 0; | |
// The definition level indicating the level at which the closest | |
// repeated ancestor is not empty. This is used to discriminate | |
// between a value less than |def_level| being null or excluded entirely. | |
// For instance if we have an arrow schema like: | |
// list(struct(f0: int)). Then then there are the following | |
// definition levels: | |
// 0 = null list | |
// 1 = present but empty list. | |
// 2 = a null value in the list | |
// 3 = a non null struct but null integer. | |
// 4 = a present integer. | |
// When reconstructing, the struct and integer arrays' | |
// repeated_ancestor_def_level would be 2. Any | |
// def_level < 2 indicates that there isn't a corresponding | |
// child value in the list. | |
// i.e. [null, [], [null], [{f0: null}], [{f0: 1}]] | |
// has the def levels [0, 1, 2, 3, 4]. The actual | |
// struct array is only of length 3: [not-set, set, set] and | |
// the int array is also of length 3: [N/A, null, 1]. | |
// | |
int16_t repeated_ancestor_def_level = 0; | |
/// Increments levels according to the cardinality of node. | |
void Increment(const schema::Node& node) { | |
if (node.is_repeated()) { | |
IncrementRepeated(); | |
return; | |
} | |
if (node.is_optional()) { | |
IncrementOptional(); | |
return; | |
} | |
} | |
/// Increments level for a optional node. | |
void IncrementOptional() { def_level++; } | |
/// Increments levels for the repeated node. Returns | |
/// the previous ancestor_list_def_level. | |
int16_t IncrementRepeated() { | |
int16_t last_repeated_ancestor = repeated_ancestor_def_level; | |
// Repeated fields add both a repetition and definition level. This is used | |
// to distinguish between an empty list and a list with an item in it. | |
++rep_level; | |
++def_level; | |
// For levels >= repeated_ancestor_def_level it indicates the list was | |
// non-null and had at least one element. This is important | |
// for later decoding because we need to add a slot for these | |
// values. for levels < current_def_level no slots are added | |
// to arrays. | |
repeated_ancestor_def_level = def_level; | |
return last_repeated_ancestor; | |
} | |
// Calculates and returns LevelInfo for a column descriptor. | |
static LevelInfo ComputeLevelInfo(const ColumnDescriptor* descr) { | |
LevelInfo level_info; | |
level_info.def_level = descr->max_definition_level(); | |
level_info.rep_level = descr->max_repetition_level(); | |
int16_t min_spaced_def_level = descr->max_definition_level(); | |
const ::parquet::schema::Node* node = descr->schema_node().get(); | |
while (node && !node->is_repeated()) { | |
if (node->is_optional()) { | |
min_spaced_def_level--; | |
} | |
node = node->parent(); | |
} | |
level_info.repeated_ancestor_def_level = min_spaced_def_level; | |
return level_info; | |
} | |
friend std::ostream& operator<<(std::ostream& os, const LevelInfo& levels) { | |
// This print method is to silence valgrind issues. What's printed | |
// is not important because all asserts happen directly on | |
// members. | |
os << "{def=" << levels.def_level << ", rep=" << levels.rep_level | |
<< ", repeated_ancestor_def=" << levels.repeated_ancestor_def_level; | |
if (levels.null_slot_usage > 1) { | |
os << ", null_slot_usage=" << levels.null_slot_usage; | |
} | |
os << "}"; | |
return os; | |
} | |
}; | |
// Input/Output structure for reconstructed validity bitmaps. | |
struct PARQUET_EXPORT ValidityBitmapInputOutput { | |
// Input only. | |
// The maximum number of values_read expected (actual | |
// values read must be less than or equal to this value). | |
// If this number is exceeded methods will throw a | |
// ParquetException. Exceeding this limit indicates | |
// either a corrupt or incorrectly written file. | |
int64_t values_read_upper_bound = 0; | |
// Output only. The number of values added to the encountered | |
// (this is logically the count of the number of elements | |
// for an Arrow array). | |
int64_t values_read = 0; | |
// Input/Output. The number of nulls encountered. | |
int64_t null_count = 0; | |
// Output only. The validity bitmap to populate. Maybe be null only | |
// for DefRepLevelsToListInfo (if all that is needed is list offsets). | |
uint8_t* valid_bits = NULLPTR; | |
// Input only, offset into valid_bits to start at. | |
int64_t valid_bits_offset = 0; | |
}; | |
// Converts def_levels to validity bitmaps for non-list arrays and structs that have | |
// at least one member that is not a list and has no list descendents. | |
// For lists use DefRepLevelsToList and structs where all descendants contain | |
// a list use DefRepLevelsToBitmap. | |
void PARQUET_EXPORT DefLevelsToBitmap(const int16_t* def_levels, int64_t num_def_levels, | |
LevelInfo level_info, | |
ValidityBitmapInputOutput* output); | |
// Reconstructs a validity bitmap and list offsets for a list arrays based on | |
// def/rep levels. The first element of offsets will not be modified if rep_levels | |
// starts with a new list. The first element of offsets will be used when calculating | |
// the next offset. See documentation onf DefLevelsToBitmap for when to use this | |
// method vs the other ones in this file for reconstruction. | |
// | |
// Offsets must be sized to 1 + values_read_upper_bound. | |
void PARQUET_EXPORT DefRepLevelsToList(const int16_t* def_levels, | |
const int16_t* rep_levels, int64_t num_def_levels, | |
LevelInfo level_info, | |
ValidityBitmapInputOutput* output, | |
int32_t* offsets); | |
void PARQUET_EXPORT DefRepLevelsToList(const int16_t* def_levels, | |
const int16_t* rep_levels, int64_t num_def_levels, | |
LevelInfo level_info, | |
ValidityBitmapInputOutput* output, | |
int64_t* offsets); | |
// Reconstructs a validity bitmap for a struct every member is a list or has | |
// a list descendant. See documentation on DefLevelsToBitmap for when more | |
// details on this method compared to the other ones defined above. | |
void PARQUET_EXPORT DefRepLevelsToBitmap(const int16_t* def_levels, | |
const int16_t* rep_levels, | |
int64_t num_def_levels, LevelInfo level_info, | |
ValidityBitmapInputOutput* output); | |
// This is exposed to ensure we can properly test a software simulated pext function | |
// (i.e. it isn't hidden by runtime dispatch). | |
uint64_t PARQUET_EXPORT TestOnlyExtractBitsSoftware(uint64_t bitmap, uint64_t selection); | |
} // namespace parquet::internal | |