|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#pragma once |
|
|
|
#include <cstdint> |
|
#include <memory> |
|
#include <ostream> |
|
#include <string> |
|
#include <unordered_map> |
|
#include <utility> |
|
#include <vector> |
|
|
|
#include "parquet/platform.h" |
|
#include "parquet/types.h" |
|
#include "parquet/windows_fixup.h" |
|
|
|
namespace parquet { |
|
|
|
class SchemaDescriptor; |
|
|
|
namespace schema { |
|
|
|
class Node; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/// Namespace-like wrapper around the list encoding enumeration.
///
/// The enumerators name one-, two- and three-level list encodings; what each
/// level count means structurally is decided by the code that consumes this
/// enum and is not visible in this header.
struct ListEncoding {
  enum type {
    ONE_LEVEL,
    TWO_LEVEL,
    THREE_LEVEL
  };
};
|
|
|
class PARQUET_EXPORT ColumnPath { |
|
public: |
|
ColumnPath() : path_() {} |
|
explicit ColumnPath(const std::vector<std::string>& path) : path_(path) {} |
|
explicit ColumnPath(std::vector<std::string>&& path) : path_(std::move(path)) {} |
|
|
|
static std::shared_ptr<ColumnPath> FromDotString(const std::string& dotstring); |
|
static std::shared_ptr<ColumnPath> FromNode(const Node& node); |
|
|
|
std::shared_ptr<ColumnPath> extend(const std::string& node_name) const; |
|
std::string ToDotString() const; |
|
const std::vector<std::string>& ToDotVector() const; |
|
|
|
protected: |
|
std::vector<std::string> path_; |
|
}; |
|
|
|
|
|
|
|
class PARQUET_EXPORT Node { |
|
public: |
|
enum type { PRIMITIVE, GROUP }; |
|
|
|
virtual ~Node() {} |
|
|
|
bool is_primitive() const { return type_ == Node::PRIMITIVE; } |
|
|
|
bool is_group() const { return type_ == Node::GROUP; } |
|
|
|
bool is_optional() const { return repetition_ == Repetition::OPTIONAL; } |
|
|
|
bool is_repeated() const { return repetition_ == Repetition::REPEATED; } |
|
|
|
bool is_required() const { return repetition_ == Repetition::REQUIRED; } |
|
|
|
virtual bool Equals(const Node* other) const = 0; |
|
|
|
const std::string& name() const { return name_; } |
|
|
|
Node::type node_type() const { return type_; } |
|
|
|
Repetition::type repetition() const { return repetition_; } |
|
|
|
ConvertedType::type converted_type() const { return converted_type_; } |
|
|
|
const std::shared_ptr<const LogicalType>& logical_type() const { return logical_type_; } |
|
|
|
|
|
|
|
|
|
int field_id() const { return field_id_; } |
|
|
|
const Node* parent() const { return parent_; } |
|
|
|
const std::shared_ptr<ColumnPath> path() const; |
|
|
|
virtual void ToParquet(void* element) const = 0; |
|
|
|
|
|
class Visitor { |
|
public: |
|
virtual ~Visitor() {} |
|
|
|
virtual void Visit(Node* node) = 0; |
|
}; |
|
class ConstVisitor { |
|
public: |
|
virtual ~ConstVisitor() {} |
|
|
|
virtual void Visit(const Node* node) = 0; |
|
}; |
|
|
|
virtual void Visit(Visitor* visitor) = 0; |
|
virtual void VisitConst(ConstVisitor* visitor) const = 0; |
|
|
|
protected: |
|
friend class GroupNode; |
|
|
|
Node(Node::type type, const std::string& name, Repetition::type repetition, |
|
ConvertedType::type converted_type = ConvertedType::NONE, int field_id = -1) |
|
: type_(type), |
|
name_(name), |
|
repetition_(repetition), |
|
converted_type_(converted_type), |
|
field_id_(field_id), |
|
parent_(NULLPTR) {} |
|
|
|
Node(Node::type type, const std::string& name, Repetition::type repetition, |
|
std::shared_ptr<const LogicalType> logical_type, int field_id = -1) |
|
: type_(type), |
|
name_(name), |
|
repetition_(repetition), |
|
logical_type_(std::move(logical_type)), |
|
field_id_(field_id), |
|
parent_(NULLPTR) {} |
|
|
|
Node::type type_; |
|
std::string name_; |
|
Repetition::type repetition_; |
|
ConvertedType::type converted_type_{ConvertedType::NONE}; |
|
std::shared_ptr<const LogicalType> logical_type_; |
|
int field_id_; |
|
|
|
const Node* parent_; |
|
|
|
bool EqualsInternal(const Node* other) const; |
|
void SetParent(const Node* p_parent); |
|
|
|
private: |
|
PARQUET_DISALLOW_COPY_AND_ASSIGN(Node); |
|
}; |
|
|
|
|
|
using NodePtr = std::shared_ptr<Node>; |
|
using NodeVector = std::vector<NodePtr>; |
|
|
|
|
|
|
|
|
|
|
|
/// A schema node that, on top of the Node attributes, carries a physical
/// type, a type length, decimal metadata and a column order.
///
/// Instances are created through the static factories; the constructors are
/// private.
class PARQUET_EXPORT PrimitiveNode : public Node {
 public:
  /// Creates a node from an opaque element pointer.
  /// NOTE(review): the pointee type is known only to the implementation.
  static std::unique_ptr<Node> FromParquet(const void* opaque_element);

  /// Factory for a primitive node annotated with a converted type.
  ///
  /// `length`, `precision`, `scale` and `field_id` all default to -1 and are
  /// forwarded unchanged to the private constructor.
  static inline NodePtr Make(const std::string& name, Repetition::type repetition,
                             Type::type type,
                             ConvertedType::type converted_type = ConvertedType::NONE,
                             int length = -1, int precision = -1, int scale = -1,
                             int field_id = -1) {
    // The constructor is private, so the node is allocated with `new` here
    // rather than through std::make_shared.
    NodePtr node(new PrimitiveNode(name, repetition, type, converted_type, length,
                                   precision, scale, field_id));
    return node;
  }

  /// Factory for a primitive node annotated with a logical type.
  ///
  /// `primitive_length` and `field_id` default to -1 and are forwarded
  /// unchanged to the private constructor.
  static inline NodePtr Make(const std::string& name, Repetition::type repetition,
                             std::shared_ptr<const LogicalType> logical_type,
                             Type::type primitive_type, int primitive_length = -1,
                             int field_id = -1) {
    NodePtr node(new PrimitiveNode(name, repetition, std::move(logical_type),
                                   primitive_type, primitive_length, field_id));
    return node;
  }

  bool Equals(const Node* other) const override;

  Type::type physical_type() const { return physical_type_; }

  ColumnOrder column_order() const { return column_order_; }

  /// Replaces the stored column order.
  void SetColumnOrder(ColumnOrder column_order) { column_order_ = column_order; }

  int32_t type_length() const { return type_length_; }

  const DecimalMetadata& decimal_metadata() const { return decimal_metadata_; }

  void ToParquet(void* element) const override;
  void Visit(Visitor* visitor) override;
  void VisitConst(ConstVisitor* visitor) const override;

 private:
  // Both constructors are defined out of line; use Make() to create nodes.
  PrimitiveNode(const std::string& name, Repetition::type repetition,
                Type::type type,
                ConvertedType::type converted_type = ConvertedType::NONE,
                int length = -1, int precision = -1, int scale = -1,
                int field_id = -1);

  PrimitiveNode(const std::string& name, Repetition::type repetition,
                std::shared_ptr<const LogicalType> logical_type,
                Type::type primitive_type, int primitive_length = -1,
                int field_id = -1);

  Type::type physical_type_;
  int32_t type_length_;
  DecimalMetadata decimal_metadata_;
  ColumnOrder column_order_;

  /// Overwrites the stored type length; private, so only reachable from this
  /// class and the befriended tests.
  void SetTypeLength(int32_t length) { type_length_ = length; }

  /// Comparison helper used by Equals() (defined out of line).
  bool EqualsInternal(const PrimitiveNode* other) const;

  FRIEND_TEST(TestPrimitiveNode, Attrs);
  FRIEND_TEST(TestPrimitiveNode, Equals);
  FRIEND_TEST(TestPrimitiveNode, PhysicalLogicalMapping);
  FRIEND_TEST(TestPrimitiveNode, FromParquet);
};
|
|
|
/// A schema node that owns an ordered list of child nodes ("fields").
///
/// Instances are created through the static factories; the constructors are
/// private.
class PARQUET_EXPORT GroupNode : public Node {
 public:
  /// Creates a group node from an opaque element pointer and its children.
  /// NOTE(review): the pointee type of `opaque_element` is known only to the
  /// implementation.
  static std::unique_ptr<Node> FromParquet(const void* opaque_element,
                                           NodeVector fields = {});

  /// Factory for a group node annotated with a converted type.
  static inline NodePtr Make(const std::string& name, Repetition::type repetition,
                             const NodeVector& fields,
                             ConvertedType::type converted_type = ConvertedType::NONE,
                             int field_id = -1) {
    // The constructor is private, so the node is allocated with `new` here
    // rather than through std::make_shared.
    NodePtr node(new GroupNode(name, repetition, fields, converted_type, field_id));
    return node;
  }

  /// Factory for a group node annotated with a logical type.
  static inline NodePtr Make(const std::string& name, Repetition::type repetition,
                             const NodeVector& fields,
                             std::shared_ptr<const LogicalType> logical_type,
                             int field_id = -1) {
    NodePtr node(
        new GroupNode(name, repetition, fields, std::move(logical_type), field_id));
    return node;
  }

  bool Equals(const Node* other) const override;

  /// Returns the i-th child. No bounds check is performed; `i` must lie in
  /// [0, field_count()).
  const NodePtr& field(int i) const {
    return fields_[static_cast<NodeVector::size_type>(i)];
  }

  /// Looks up a child index by field name (defined out of line).
  /// NOTE(review): the result for a missing or duplicated name is decided by
  /// the implementation -- confirm there.
  int FieldIndex(const std::string& name) const;

  /// Looks up a child index for `node` (defined out of line).
  int FieldIndex(const Node& node) const;

  /// Number of direct children, narrowed to int.
  int field_count() const { return static_cast<int>(fields_.size()); }

  void ToParquet(void* element) const override;
  void Visit(Visitor* visitor) override;
  void VisitConst(ConstVisitor* visitor) const override;

  /// Defined out of line.
  /// NOTE(review): presumably reports whether this group or any descendant is
  /// repeated -- confirm against the implementation.
  bool HasRepeatedFields() const;

 private:
  // Both constructors are defined out of line; use Make() to create nodes.
  GroupNode(const std::string& name, Repetition::type repetition,
            const NodeVector& fields,
            ConvertedType::type converted_type = ConvertedType::NONE,
            int field_id = -1);

  GroupNode(const std::string& name, Repetition::type repetition,
            const NodeVector& fields,
            std::shared_ptr<const LogicalType> logical_type, int field_id = -1);

  // The children, in declaration order.
  NodeVector fields_;

  /// Comparison helper used by Equals() (defined out of line).
  bool EqualsInternal(const GroupNode* other) const;

  // Field name -> child index. A multimap, so one name can map to several
  // indices.
  std::unordered_multimap<std::string, int> field_name_to_idx_;

  FRIEND_TEST(TestGroupNode, Attrs);
  FRIEND_TEST(TestGroupNode, Equals);
  FRIEND_TEST(TestGroupNode, FieldIndex);
  FRIEND_TEST(TestGroupNode, FieldIndexDuplicateName);
};
|
|
|
|
|
|
|
|
|
#define PRIMITIVE_FACTORY(FuncName, TYPE) \ |
|
static inline NodePtr FuncName(const std::string& name, \ |
|
Repetition::type repetition = Repetition::OPTIONAL, \ |
|
int field_id = -1) { \ |
|
return PrimitiveNode::Make(name, repetition, Type::TYPE, ConvertedType::NONE, \ |
|
-1, -1, -1, field_id); \ |
|
} |
|
|
|
PRIMITIVE_FACTORY(Boolean, BOOLEAN) |
|
PRIMITIVE_FACTORY(Int32, INT32) |
|
PRIMITIVE_FACTORY(Int64, INT64) |
|
PRIMITIVE_FACTORY(Int96, INT96) |
|
PRIMITIVE_FACTORY(Float, FLOAT) |
|
PRIMITIVE_FACTORY(Double, DOUBLE) |
|
PRIMITIVE_FACTORY(ByteArray, BYTE_ARRAY) |
|
|
|
/// Writes a textual rendering of the tree rooted at `schema` to `stream`.
///
/// `indent_width` defaults to 2.
/// NOTE(review): presumably the number of spaces per nesting level -- the
/// implementation is not visible in this header.
void PARQUET_EXPORT PrintSchema(const Node* schema, std::ostream& stream,
                                int indent_width = 2);
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
class PARQUET_EXPORT ColumnDescriptor { |
|
public: |
|
ColumnDescriptor(schema::NodePtr node, int16_t max_definition_level, |
|
int16_t max_repetition_level, |
|
const SchemaDescriptor* schema_descr = NULLPTR); |
|
|
|
bool Equals(const ColumnDescriptor& other) const; |
|
|
|
int16_t max_definition_level() const { return max_definition_level_; } |
|
|
|
int16_t max_repetition_level() const { return max_repetition_level_; } |
|
|
|
Type::type physical_type() const { return primitive_node_->physical_type(); } |
|
|
|
ConvertedType::type converted_type() const { return primitive_node_->converted_type(); } |
|
|
|
const std::shared_ptr<const LogicalType>& logical_type() const { |
|
return primitive_node_->logical_type(); |
|
} |
|
|
|
ColumnOrder column_order() const { return primitive_node_->column_order(); } |
|
|
|
SortOrder::type sort_order() const { |
|
const auto& la = logical_type(); |
|
auto pt = physical_type(); |
|
return la ? GetSortOrder(la, pt) : GetSortOrder(converted_type(), pt); |
|
} |
|
|
|
const std::string& name() const { return primitive_node_->name(); } |
|
|
|
const std::shared_ptr<schema::ColumnPath> path() const; |
|
|
|
const schema::NodePtr& schema_node() const { return node_; } |
|
|
|
std::string ToString() const; |
|
|
|
int type_length() const; |
|
|
|
int type_precision() const; |
|
|
|
int type_scale() const; |
|
|
|
private: |
|
schema::NodePtr node_; |
|
const schema::PrimitiveNode* primitive_node_; |
|
|
|
int16_t max_definition_level_; |
|
int16_t max_repetition_level_; |
|
}; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class PARQUET_EXPORT SchemaDescriptor { |
|
public: |
|
SchemaDescriptor() = default; |
|
~SchemaDescriptor() = default; |
|
|
|
|
|
void Init(std::unique_ptr<schema::Node> schema); |
|
void Init(schema::NodePtr schema); |
|
|
|
const ColumnDescriptor* Column(int i) const; |
|
|
|
|
|
|
|
|
|
int ColumnIndex(const std::string& node_path) const; |
|
|
|
int ColumnIndex(const schema::Node& node) const; |
|
|
|
bool Equals(const SchemaDescriptor& other, std::ostream* diff_output = NULLPTR) const; |
|
|
|
|
|
int num_columns() const { return static_cast<int>(leaves_.size()); } |
|
|
|
const schema::NodePtr& schema_root() const { return schema_; } |
|
|
|
const schema::GroupNode* group_node() const { return group_node_; } |
|
|
|
|
|
const schema::Node* GetColumnRoot(int i) const; |
|
|
|
const std::string& name() const { return group_node_->name(); } |
|
|
|
std::string ToString() const; |
|
|
|
void updateColumnOrders(const std::vector<ColumnOrder>& column_orders); |
|
|
|
|
|
|
|
int GetColumnIndex(const schema::PrimitiveNode& node) const; |
|
|
|
|
|
|
|
bool HasRepeatedFields() const; |
|
|
|
private: |
|
friend class ColumnDescriptor; |
|
|
|
|
|
schema::NodePtr schema_; |
|
|
|
|
|
const schema::GroupNode* group_node_; |
|
|
|
void BuildTree(const schema::NodePtr& node, int16_t max_def_level, |
|
int16_t max_rep_level, const schema::NodePtr& base); |
|
|
|
|
|
std::vector<ColumnDescriptor> leaves_; |
|
|
|
std::unordered_map<const schema::PrimitiveNode*, int> node_to_leaf_index_; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
std::unordered_map<int, schema::NodePtr> leaf_to_base_; |
|
|
|
|
|
std::unordered_multimap<std::string, int> leaf_to_idx_; |
|
}; |
|
|
|
} |
|
|