File size: 6,204 Bytes
9c6594c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 |
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <cassert>
#include <memory>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "arrow/result.h"
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/type_fwd.h"
#include "parquet/level_conversion.h"
#include "parquet/platform.h"
#include "parquet/schema.h"
namespace parquet {
class ArrowReaderProperties;
class ArrowWriterProperties;
class WriterProperties;
namespace arrow {
/// \defgroup arrow-to-parquet-schema-conversion Functions to convert an Arrow
/// schema into a Parquet schema.
///
/// @{
/// \brief Convert one Arrow field into a Parquet schema node.
///
/// \param[in] field Arrow field to convert.
/// \param[in] properties Parquet writer properties made available to the conversion.
/// \param[in] arrow_properties Arrow-specific writer properties made available to
///   the conversion.
/// \param[out] out Destination for the resulting node (presumably only written on
///   success; confirm against the implementation).
/// \return Status describing the outcome of the conversion.
PARQUET_EXPORT
::arrow::Status FieldToNode(const std::shared_ptr<::arrow::Field>& field,
const WriterProperties& properties,
const ArrowWriterProperties& arrow_properties,
schema::NodePtr* out);
/// \brief Convert an Arrow schema into a Parquet schema descriptor.
///
/// \param[in] arrow_schema Arrow schema to convert.
/// \param[in] properties Parquet writer properties made available to the conversion.
/// \param[in] arrow_properties Arrow-specific writer properties made available to
///   the conversion.
/// \param[out] out Destination for the resulting SchemaDescriptor (presumably only
///   written on success; confirm against the implementation).
/// \return Status describing the outcome of the conversion.
PARQUET_EXPORT
::arrow::Status ToParquetSchema(const ::arrow::Schema* arrow_schema,
const WriterProperties& properties,
const ArrowWriterProperties& arrow_properties,
std::shared_ptr<SchemaDescriptor>* out);
/// \brief Convert an Arrow schema into a Parquet schema descriptor.
///
/// Overload that takes no ArrowWriterProperties argument.
/// NOTE(review): presumably falls back to default Arrow writer properties;
/// confirm against the implementation.
///
/// \param[in] arrow_schema Arrow schema to convert.
/// \param[in] properties Parquet writer properties made available to the conversion.
/// \param[out] out Destination for the resulting SchemaDescriptor.
/// \return Status describing the outcome of the conversion.
PARQUET_EXPORT
::arrow::Status ToParquetSchema(const ::arrow::Schema* arrow_schema,
const WriterProperties& properties,
std::shared_ptr<SchemaDescriptor>* out);
/// @}
/// \defgroup parquet-to-arrow-schema-conversion Functions to convert a Parquet
/// schema into an Arrow schema.
///
/// @{
/// \brief Convert a Parquet schema descriptor into an Arrow schema.
///
/// \param[in] parquet_schema Parquet schema descriptor to convert.
/// \param[in] properties Arrow reader properties made available to the conversion.
/// \param[in] key_value_metadata Key-value metadata passed along with the schema
///   (NOTE(review): how it influences the result is not visible here; confirm
///   against the implementation).
/// \param[out] out Destination for the resulting Arrow schema.
/// \return Status describing the outcome of the conversion.
PARQUET_EXPORT
::arrow::Status FromParquetSchema(
const SchemaDescriptor* parquet_schema, const ArrowReaderProperties& properties,
const std::shared_ptr<const ::arrow::KeyValueMetadata>& key_value_metadata,
std::shared_ptr<::arrow::Schema>* out);
/// \brief Convert a Parquet schema descriptor into an Arrow schema.
///
/// Overload that takes no key-value metadata argument.
///
/// \param[in] parquet_schema Parquet schema descriptor to convert.
/// \param[in] properties Arrow reader properties made available to the conversion.
/// \param[out] out Destination for the resulting Arrow schema.
/// \return Status describing the outcome of the conversion.
PARQUET_EXPORT
::arrow::Status FromParquetSchema(const SchemaDescriptor* parquet_schema,
const ArrowReaderProperties& properties,
std::shared_ptr<::arrow::Schema>* out);
/// \brief Convert a Parquet schema descriptor into an Arrow schema.
///
/// Overload that takes neither reader properties nor key-value metadata.
/// NOTE(review): presumably uses default ArrowReaderProperties; confirm against
/// the implementation.
///
/// \param[in] parquet_schema Parquet schema descriptor to convert.
/// \param[out] out Destination for the resulting Arrow schema.
/// \return Status describing the outcome of the conversion.
PARQUET_EXPORT
::arrow::Status FromParquetSchema(const SchemaDescriptor* parquet_schema,
std::shared_ptr<::arrow::Schema>* out);
/// @}
/// \brief Bridge between an arrow::Field and parquet column indices.
// One node of the tree that links Arrow fields to Parquet column indices.
// Nested fields are represented through `children`.
struct PARQUET_EXPORT SchemaField {
// The Arrow field represented by this node.
std::shared_ptr<::arrow::Field> field;
// Child nodes of this field, stored by value.
std::vector<SchemaField> children;
// Only set for leaf nodes; stays at the sentinel -1 otherwise.
int column_index = -1;
// Level information attached to this node (see parquet/level_conversion.h for
// the LevelInfo definition).
parquet::internal::LevelInfo level_info;
// True when a column index has been assigned, i.e. column_index is not the
// -1 sentinel.
bool is_leaf() const { return column_index != -1; }
};
/// \brief Bridge between a parquet Schema and an arrow Schema.
///
/// Exposes parquet columns as a tree structure. Useful to traverse and link
/// between arrow's Schema and parquet's Schema.
struct PARQUET_EXPORT SchemaManifest {
  /// \brief Populate a manifest from a Parquet schema descriptor.
  ///
  /// Only declared here; the definition lives elsewhere.
  ///
  /// \param[in] schema Parquet schema descriptor to build the manifest from.
  /// \param[in] metadata Key-value metadata passed along with the schema.
  /// \param[in] properties Arrow reader properties made available to the builder.
  /// \param[out] manifest Manifest to populate.
  /// \return Status describing the outcome.
  static ::arrow::Status Make(
      const SchemaDescriptor* schema,
      const std::shared_ptr<const ::arrow::KeyValueMetadata>& metadata,
      const ArrowReaderProperties& properties, SchemaManifest* manifest);

  /// Parquet schema descriptor this manifest refers to. Held as a raw pointer
  /// (this struct does not manage its lifetime). Initialized to null so that a
  /// default-constructed manifest never carries an indeterminate pointer.
  const SchemaDescriptor* descr = NULLPTR;
  /// Arrow schema recorded as the origin of this manifest, if any.
  std::shared_ptr<::arrow::Schema> origin_schema;
  /// Key-value metadata associated with the schema.
  std::shared_ptr<const ::arrow::KeyValueMetadata> schema_metadata;
  /// Top-level fields of the schema tree.
  std::vector<SchemaField> schema_fields;

  /// Lookup table from a parquet column index to its SchemaField.
  std::unordered_map<int, const SchemaField*> column_index_to_field;
  /// Lookup table from a SchemaField to its parent SchemaField.
  std::unordered_map<const SchemaField*, const SchemaField*> child_to_parent;

  /// \brief Look up the SchemaField registered for a column index.
  ///
  /// \param[in] column_index Column index to look up in column_index_to_field.
  /// \param[out] out Receives the matching field; left untouched on failure.
  /// \return KeyError if the index is not present in the manifest, OK otherwise.
  ::arrow::Status GetColumnField(int column_index, const SchemaField** out) const {
    auto it = column_index_to_field.find(column_index);
    if (it == column_index_to_field.end()) {
      return ::arrow::Status::KeyError("Column index ", column_index,
                                       " not found in schema manifest, may be malformed");
    }
    *out = it->second;
    return ::arrow::Status::OK();
  }

  /// \brief Return the parent recorded for `field` in child_to_parent.
  ///
  /// \return The parent field, or nullptr when `field` has no entry in the map.
  const SchemaField* GetParent(const SchemaField* field) const {
    // Returns nullptr also if not found
    auto it = child_to_parent.find(field);
    if (it == child_to_parent.end()) {
      return NULLPTR;
    }
    return it->second;
  }

  /// Coalesce a list of field indices (relative to the equivalent arrow::Schema) which
  /// correspond to the column root (first node below the parquet schema's root group) of
  /// each leaf referenced in column_indices.
  ///
  /// For example, for leaves `a.b.c`, `a.b.d.e`, and `i.j.k` (column_indices=[0,1,3])
  /// the roots are `a` and `i` (return=[0,2]).
  ///
  ///   root
  ///   -- a  <------
  ///   -- -- b  |  |
  ///   -- -- -- c  |
  ///   -- -- -- d  |
  ///   -- -- -- -- e
  ///   -- f
  ///   -- -- g
  ///   -- -- -- h
  ///   -- i  <---
  ///   -- -- j  |
  ///   -- -- -- k
  ///
  /// Each root index appears once in the result, in order of first occurrence.
  ///
  /// \param[in] column_indices Leaf column indices to resolve.
  /// \return The de-duplicated root field indices; Invalid if no schema descriptor
  ///   is attached; IndexError if a column index is out of range or its root is not
  ///   found in the schema's root group.
  ::arrow::Result<std::vector<int>> GetFieldIndices(
      const std::vector<int>& column_indices) const {
    // A manifest that was never populated has no descriptor to resolve against;
    // report that instead of dereferencing a null pointer.
    if (descr == NULLPTR) {
      return ::arrow::Status::Invalid("Schema manifest has no schema descriptor");
    }
    const schema::GroupNode* group = descr->group_node();
    // Tracks which root indices were already emitted so each appears only once.
    std::unordered_set<int> already_added;

    std::vector<int> out;
    for (int column_idx : column_indices) {
      if (column_idx < 0 || column_idx >= descr->num_columns()) {
        return ::arrow::Status::IndexError("Column index ", column_idx, " is not valid");
      }

      auto field_node = descr->GetColumnRoot(column_idx);
      auto field_idx = group->FieldIndex(*field_node);
      if (field_idx == -1) {
        return ::arrow::Status::IndexError("Column index ", column_idx, " is not valid");
      }

      // insert().second is true only the first time a root index is seen.
      if (already_added.insert(field_idx).second) {
        out.push_back(field_idx);
      }
    }
    return out;
  }
};
} // namespace arrow
} // namespace parquet
|