|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#pragma once |
|
|
|
#include <array> |
|
#include <chrono> |
|
#include <cstdint> |
|
#include <memory> |
|
#include <optional> |
|
#include <string> |
|
#include <string_view> |
|
#include <vector> |
|
|
|
#include "parquet/column_writer.h" |
|
#include "parquet/file_writer.h" |
|
|
|
namespace parquet { |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class PARQUET_EXPORT StreamWriter { |
|
public: |
|
template <typename T> |
|
using optional = ::std::optional<T>; |
|
|
|
|
|
|
|
|
|
StreamWriter() = default; |
|
|
|
explicit StreamWriter(std::unique_ptr<ParquetFileWriter> writer); |
|
|
|
~StreamWriter() = default; |
|
|
|
static void SetDefaultMaxRowGroupSize(int64_t max_size); |
|
|
|
void SetMaxRowGroupSize(int64_t max_size); |
|
|
|
int current_column() const { return column_index_; } |
|
|
|
int64_t current_row() const { return current_row_; } |
|
|
|
int num_columns() const; |
|
|
|
|
|
StreamWriter(StreamWriter&&) = default; |
|
StreamWriter& operator=(StreamWriter&&) = default; |
|
|
|
|
|
StreamWriter(const StreamWriter&) = delete; |
|
StreamWriter& operator=(const StreamWriter&) = delete; |
|
|
|
|
|
|
|
StreamWriter& operator<<(bool v); |
|
|
|
StreamWriter& operator<<(int8_t v); |
|
|
|
StreamWriter& operator<<(uint8_t v); |
|
|
|
StreamWriter& operator<<(int16_t v); |
|
|
|
StreamWriter& operator<<(uint16_t v); |
|
|
|
StreamWriter& operator<<(int32_t v); |
|
|
|
StreamWriter& operator<<(uint32_t v); |
|
|
|
StreamWriter& operator<<(int64_t v); |
|
|
|
StreamWriter& operator<<(uint64_t v); |
|
|
|
StreamWriter& operator<<(const std::chrono::milliseconds& v); |
|
|
|
StreamWriter& operator<<(const std::chrono::microseconds& v); |
|
|
|
StreamWriter& operator<<(float v); |
|
|
|
StreamWriter& operator<<(double v); |
|
|
|
StreamWriter& operator<<(char v); |
|
|
|
|
|
|
|
|
|
struct PARQUET_EXPORT FixedStringView { |
|
FixedStringView() = default; |
|
|
|
explicit FixedStringView(const char* data_ptr); |
|
|
|
FixedStringView(const char* data_ptr, std::size_t data_len); |
|
|
|
const char* data{NULLPTR}; |
|
std::size_t size{0}; |
|
}; |
|
|
|
|
|
template <int N> |
|
StreamWriter& operator<<(const char (&v)[N]) { |
|
return WriteFixedLength(v, N); |
|
} |
|
template <std::size_t N> |
|
StreamWriter& operator<<(const std::array<char, N>& v) { |
|
return WriteFixedLength(v.data(), N); |
|
} |
|
StreamWriter& operator<<(FixedStringView v); |
|
|
|
|
|
StreamWriter& operator<<(const char* v); |
|
StreamWriter& operator<<(const std::string& v); |
|
StreamWriter& operator<<(::std::string_view v); |
|
|
|
|
|
template <typename T> |
|
StreamWriter& operator<<(const optional<T>& v) { |
|
if (v) { |
|
return operator<<(*v); |
|
} |
|
SkipOptionalColumn(); |
|
return *this; |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
int64_t SkipColumns(int num_columns_to_skip); |
|
|
|
|
|
|
|
|
|
void EndRow(); |
|
|
|
|
|
void EndRowGroup(); |
|
|
|
protected: |
|
template <typename WriterType, typename T> |
|
StreamWriter& Write(const T v) { |
|
auto writer = static_cast<WriterType*>(row_group_writer_->column(column_index_++)); |
|
|
|
writer->WriteBatch(kBatchSizeOne, &kDefLevelOne, &kRepLevelZero, &v); |
|
|
|
if (max_row_group_size_ > 0) { |
|
row_group_size_ += writer->estimated_buffered_value_bytes(); |
|
} |
|
return *this; |
|
} |
|
|
|
StreamWriter& WriteVariableLength(const char* data_ptr, std::size_t data_len); |
|
|
|
StreamWriter& WriteFixedLength(const char* data_ptr, std::size_t data_len); |
|
|
|
void CheckColumn(Type::type physical_type, ConvertedType::type converted_type, |
|
int length = -1); |
|
|
|
|
|
|
|
|
|
void SkipOptionalColumn(); |
|
|
|
void WriteNullValue(ColumnWriter* writer); |
|
|
|
private: |
|
using node_ptr_type = std::shared_ptr<schema::PrimitiveNode>; |
|
|
|
struct null_deleter { |
|
void operator()(void*) {} |
|
}; |
|
|
|
int32_t column_index_{0}; |
|
int64_t current_row_{0}; |
|
int64_t row_group_size_{0}; |
|
int64_t max_row_group_size_{default_row_group_size_}; |
|
|
|
std::unique_ptr<ParquetFileWriter> file_writer_; |
|
std::unique_ptr<RowGroupWriter, null_deleter> row_group_writer_; |
|
std::vector<node_ptr_type> nodes_; |
|
|
|
static constexpr int16_t kDefLevelZero = 0; |
|
static constexpr int16_t kDefLevelOne = 1; |
|
static constexpr int16_t kRepLevelZero = 0; |
|
static constexpr int64_t kBatchSizeOne = 1; |
|
|
|
static int64_t default_row_group_size_; |
|
}; |
|
|
|
struct PARQUET_EXPORT EndRowType {}; |
|
constexpr EndRowType EndRow = {}; |
|
|
|
struct PARQUET_EXPORT EndRowGroupType {}; |
|
constexpr EndRowGroupType EndRowGroup = {}; |
|
|
|
/// \brief Stream manipulator overload taking the EndRowType tag.
///
/// Returns a StreamWriter reference so that the expression can be chained.
/// NOTE(review): defined out of line - presumably calls StreamWriter::EndRow()
/// on the given writer and returns it; confirm.
PARQUET_EXPORT
StreamWriter& operator<<(StreamWriter&, EndRowType);
|
|
|
/// \brief Stream manipulator overload taking the EndRowGroupType tag.
///
/// Returns a StreamWriter reference so that the expression can be chained.
/// NOTE(review): defined out of line - presumably calls
/// StreamWriter::EndRowGroup() on the given writer and returns it; confirm.
PARQUET_EXPORT
StreamWriter& operator<<(StreamWriter&, EndRowGroupType);
|
|
|
} |
|
|