|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#pragma once |
|
|
|
#include <cassert> |
|
#include <memory> |
|
#include <unordered_map> |
|
#include <unordered_set> |
|
#include <vector> |
|
|
|
#include "arrow/result.h" |
|
#include "arrow/status.h" |
|
#include "arrow/type.h" |
|
#include "arrow/type_fwd.h" |
|
|
|
#include "parquet/level_conversion.h" |
|
#include "parquet/platform.h" |
|
#include "parquet/schema.h" |
|
|
|
namespace parquet { |
|
|
|
// Forward declarations. In the declarations visible in this header these
// three types are only used as `const T&` parameters, so an incomplete type
// is sufficient here.
class ArrowReaderProperties; |
|
class ArrowWriterProperties; |
|
class WriterProperties; |
|
|
|
namespace arrow { |
|
|
|
|
|
|
|
|
|
|
|
|
|
/// \brief Declaration only; the definition is not part of this header.
///
/// NOTE(review): judging by the name and signature, this presumably converts
/// a single Arrow field into a Parquet schema node -- confirm against the
/// implementation.
///
/// \param[in] field the Arrow field, passed as a const reference to shared_ptr
/// \param[in] properties Parquet writer properties
/// \param[in] arrow_properties Arrow-specific writer properties
/// \param[out] out pointer to a schema::NodePtr; presumably receives the
///   resulting node on success -- TODO confirm
/// \return an ::arrow::Status
PARQUET_EXPORT |
|
::arrow::Status FieldToNode(const std::shared_ptr<::arrow::Field>& field, |
|
const WriterProperties& properties, |
|
const ArrowWriterProperties& arrow_properties, |
|
schema::NodePtr* out); |
|
|
|
/// \brief Declaration only; the definition is not part of this header.
///
/// NOTE(review): by name and signature this presumably builds a Parquet
/// SchemaDescriptor from an Arrow schema -- confirm against the
/// implementation.
///
/// \param[in] arrow_schema raw pointer to the Arrow schema
/// \param[in] properties Parquet writer properties
/// \param[in] arrow_properties Arrow-specific writer properties
/// \param[out] out pointer to a shared_ptr<SchemaDescriptor>; presumably set
///   on success -- TODO confirm
/// \return an ::arrow::Status
PARQUET_EXPORT |
|
::arrow::Status ToParquetSchema(const ::arrow::Schema* arrow_schema, |
|
const WriterProperties& properties, |
|
const ArrowWriterProperties& arrow_properties, |
|
std::shared_ptr<SchemaDescriptor>* out); |
|
|
|
/// \brief Overload of ToParquetSchema that takes no ArrowWriterProperties.
///
/// Declaration only; the definition is not part of this header.
/// NOTE(review): presumably falls back to default Arrow writer properties --
/// confirm against the implementation.
///
/// \param[in] arrow_schema raw pointer to the Arrow schema
/// \param[in] properties Parquet writer properties
/// \param[out] out pointer to a shared_ptr<SchemaDescriptor>; presumably set
///   on success -- TODO confirm
/// \return an ::arrow::Status
PARQUET_EXPORT |
|
::arrow::Status ToParquetSchema(const ::arrow::Schema* arrow_schema, |
|
const WriterProperties& properties, |
|
std::shared_ptr<SchemaDescriptor>* out); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/// \brief Declaration only; the definition is not part of this header.
///
/// NOTE(review): by name and signature this presumably builds an Arrow schema
/// from a Parquet SchemaDescriptor -- confirm against the implementation.
///
/// \param[in] parquet_schema raw pointer to the Parquet schema descriptor
/// \param[in] properties Arrow reader properties
/// \param[in] key_value_metadata key/value metadata, passed as a const
///   reference to shared_ptr (how it is used is not visible here)
/// \param[out] out pointer to a shared_ptr<::arrow::Schema>; presumably set
///   on success -- TODO confirm
/// \return an ::arrow::Status
PARQUET_EXPORT |
|
::arrow::Status FromParquetSchema( |
|
const SchemaDescriptor* parquet_schema, const ArrowReaderProperties& properties, |
|
const std::shared_ptr<const ::arrow::KeyValueMetadata>& key_value_metadata, |
|
std::shared_ptr<::arrow::Schema>* out); |
|
|
|
/// \brief Overload of FromParquetSchema that takes no key/value metadata.
///
/// Declaration only; the definition is not part of this header.
/// NOTE(review): presumably behaves as if no metadata were supplied --
/// confirm against the implementation.
///
/// \param[in] parquet_schema raw pointer to the Parquet schema descriptor
/// \param[in] properties Arrow reader properties
/// \param[out] out pointer to a shared_ptr<::arrow::Schema>; presumably set
///   on success -- TODO confirm
/// \return an ::arrow::Status
PARQUET_EXPORT |
|
::arrow::Status FromParquetSchema(const SchemaDescriptor* parquet_schema, |
|
const ArrowReaderProperties& properties, |
|
std::shared_ptr<::arrow::Schema>* out); |
|
|
|
/// \brief Overload of FromParquetSchema that takes only the Parquet schema.
///
/// Declaration only; the definition is not part of this header.
/// NOTE(review): presumably uses default reader properties and no key/value
/// metadata -- confirm against the implementation.
///
/// \param[in] parquet_schema raw pointer to the Parquet schema descriptor
/// \param[out] out pointer to a shared_ptr<::arrow::Schema>; presumably set
///   on success -- TODO confirm
/// \return an ::arrow::Status
PARQUET_EXPORT |
|
::arrow::Status FromParquetSchema(const SchemaDescriptor* parquet_schema, |
|
std::shared_ptr<::arrow::Schema>* out); |
|
|
|
|
|
|
|
|
|
struct PARQUET_EXPORT SchemaField { |
|
std::shared_ptr<::arrow::Field> field; |
|
std::vector<SchemaField> children; |
|
|
|
|
|
int column_index = -1; |
|
|
|
parquet::internal::LevelInfo level_info; |
|
|
|
bool is_leaf() const { return column_index != -1; } |
|
}; |
|
|
|
|
|
|
|
|
|
|
|
struct PARQUET_EXPORT SchemaManifest { |
|
static ::arrow::Status Make( |
|
const SchemaDescriptor* schema, |
|
const std::shared_ptr<const ::arrow::KeyValueMetadata>& metadata, |
|
const ArrowReaderProperties& properties, SchemaManifest* manifest); |
|
|
|
const SchemaDescriptor* descr; |
|
std::shared_ptr<::arrow::Schema> origin_schema; |
|
std::shared_ptr<const ::arrow::KeyValueMetadata> schema_metadata; |
|
std::vector<SchemaField> schema_fields; |
|
|
|
std::unordered_map<int, const SchemaField*> column_index_to_field; |
|
std::unordered_map<const SchemaField*, const SchemaField*> child_to_parent; |
|
|
|
::arrow::Status GetColumnField(int column_index, const SchemaField** out) const { |
|
auto it = column_index_to_field.find(column_index); |
|
if (it == column_index_to_field.end()) { |
|
return ::arrow::Status::KeyError("Column index ", column_index, |
|
" not found in schema manifest, may be malformed"); |
|
} |
|
*out = it->second; |
|
return ::arrow::Status::OK(); |
|
} |
|
|
|
const SchemaField* GetParent(const SchemaField* field) const { |
|
|
|
auto it = child_to_parent.find(field); |
|
if (it == child_to_parent.end()) { |
|
return NULLPTR; |
|
} |
|
return it->second; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
::arrow::Result<std::vector<int>> GetFieldIndices( |
|
const std::vector<int>& column_indices) const { |
|
const schema::GroupNode* group = descr->group_node(); |
|
std::unordered_set<int> already_added; |
|
|
|
std::vector<int> out; |
|
for (int column_idx : column_indices) { |
|
if (column_idx < 0 || column_idx >= descr->num_columns()) { |
|
return ::arrow::Status::IndexError("Column index ", column_idx, " is not valid"); |
|
} |
|
|
|
auto field_node = descr->GetColumnRoot(column_idx); |
|
auto field_idx = group->FieldIndex(*field_node); |
|
if (field_idx == -1) { |
|
return ::arrow::Status::IndexError("Column index ", column_idx, " is not valid"); |
|
} |
|
|
|
if (already_added.insert(field_idx).second) { |
|
out.push_back(field_idx); |
|
} |
|
} |
|
return out; |
|
} |
|
}; |
|
|
|
} |
|
} |
|
|