File size: 9,343 Bytes
9c6594c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 |
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <cstdint>
#include <memory>
#include <utility>
#include "parquet/metadata.h"
#include "parquet/platform.h"
#include "parquet/properties.h"
#include "parquet/schema.h"
namespace parquet {
// Forward declaration: this header only refers to ColumnWriter through pointers.
class ColumnWriter;

// FIXME: copied from reader-internal.cc
// Four-byte magic number spelling "PAR1".
static constexpr uint8_t kParquetMagic[4]{'P', 'A', 'R', '1'};
// Four-byte magic number spelling "PARE".
// NOTE(review): presumably the marker used for files whose footer is encrypted --
// confirm against the Parquet format specification.
static constexpr uint8_t kParquetEMagic[4]{'P', 'A', 'R', 'E'};
/// \brief Writer for the columns of a single row group.
///
/// The writer owns a Contents implementation (contents_) whose interface mirrors the
/// public methods declared here. Columns are obtained either one after another via
/// NextColumn() (row groups created by ParquetFileWriter::AppendRowGroup) or by index
/// via column() (row groups created by ParquetFileWriter::AppendBufferedRowGroup).
/// NOTE(review): the method definitions live in the .cc file and presumably forward
/// to contents_ -- confirm there.
class PARQUET_EXPORT RowGroupWriter {
public:
// Abstract interface 'Contents', declared here to aid dependency injection and to
// make it easier to create test fixtures.
// An implementation of the Contents class is defined in the .cc file
struct Contents {
virtual ~Contents() = default;
/// \brief number of columns
virtual int num_columns() const = 0;
/// \brief number of rows
virtual int64_t num_rows() const = 0;
// to be used only with ParquetFileWriter::AppendRowGroup
virtual ColumnWriter* NextColumn() = 0;
// to be used only with ParquetFileWriter::AppendBufferedRowGroup
virtual ColumnWriter* column(int i) = 0;
/// \brief index of the column currently being written
virtual int current_column() const = 0;
/// \brief finish the row group
virtual void Close() = 0;
/// \brief total uncompressed bytes written by the page writer
virtual int64_t total_bytes_written() const = 0;
/// \brief total bytes still compressed but not written by the page writer
virtual int64_t total_compressed_bytes() const = 0;
/// \brief total compressed bytes written by the page writer
virtual int64_t total_compressed_bytes_written() const = 0;
/// \brief whether this row group is written in buffered mode
virtual bool buffered() const = 0;
};
/// \brief Construct a RowGroupWriter from a Contents implementation.
/// \param[in] contents implementation to use; ownership is transferred to the writer
explicit RowGroupWriter(std::unique_ptr<Contents> contents);
/// Construct a ColumnWriter for the indicated row group-relative column.
///
/// To be used only with ParquetFileWriter::AppendRowGroup
/// Ownership is solely within the RowGroupWriter. The ColumnWriter is only
/// valid until the next call to NextColumn or Close. As the contents are
/// directly written to the sink, once a new column is started, the contents
/// of the previous one cannot be modified anymore.
ColumnWriter* NextColumn();
/// Index of currently written column. Equal to -1 if NextColumn()
/// has not been called yet.
// NOTE(review): not const-qualified although Contents::current_column() is const;
// adding const would also require changing the out-of-line definition.
int current_column();
/// Finish this row group.
///
/// As described for NextColumn() and column(), ColumnWriters handed out by this
/// writer are not valid after Close, and in buffered mode the buffered contents
/// are written to the sink on Close.
void Close();
/// Number of columns.
int num_columns() const;
/// Construct a ColumnWriter for the indicated row group column.
///
/// To be used only with ParquetFileWriter::AppendBufferedRowGroup
/// Ownership is solely within the RowGroupWriter. The ColumnWriter is
/// valid until Close. The contents are buffered in memory and written to sink
/// on Close
ColumnWriter* column(int i);
/// Number of rows that shall be written as part of this RowGroup.
int64_t num_rows() const;
/// \brief total uncompressed bytes written by the page writer
int64_t total_bytes_written() const;
/// \brief total bytes still compressed but not written by the page writer.
/// It will always return 0 from the SerializedPageWriter.
int64_t total_compressed_bytes() const;
/// \brief total compressed bytes written by the page writer
int64_t total_compressed_bytes_written() const;
/// Returns whether the current RowGroupWriter is in the buffered mode and is created
/// by calling ParquetFileWriter::AppendBufferedRowGroup.
bool buffered() const;
private:
// Holds a pointer to an instance of Contents implementation
std::unique_ptr<Contents> contents_;
};
/// \brief Write the given file metadata to an output stream.
///
/// NOTE(review): only the declaration is visible in this header; the exact bytes
/// emitted (e.g. whether a length and magic trailer follow the metadata) are defined
/// in the .cc file -- confirm there.
/// \param[in] file_metadata metadata to write
/// \param[in] sink destination stream, passed as a raw pointer
PARQUET_EXPORT
void WriteFileMetaData(const FileMetaData& file_metadata,
::arrow::io::OutputStream* sink);
/// \brief Write the given file metadata to an output stream.
///
/// NOTE(review): the signature is identical to WriteFileMetaData; the name suggests
/// this produces a standalone metadata file rather than the footer of a data file,
/// but the difference is only defined in the .cc file -- confirm there.
/// \param[in] file_metadata metadata to write
/// \param[in] sink destination stream, passed as a raw pointer
PARQUET_EXPORT
void WriteMetaDataFile(const FileMetaData& file_metadata,
::arrow::io::OutputStream* sink);
/// \brief Write file metadata to an output stream using an Encryptor.
///
/// NOTE(review): how encryptor and encrypt_footer influence the output is defined in
/// the .cc file and is not visible from this header -- confirm there.
/// \param[in] file_metadata metadata to write
/// \param[in] sink destination stream, passed as a raw pointer
/// \param[in] encryptor encryptor handed to the implementation
/// \param[in] encrypt_footer flag handed to the implementation; presumably selects
/// whether the footer itself is encrypted
PARQUET_EXPORT
void WriteEncryptedFileMetadata(const FileMetaData& file_metadata,
ArrowOutputStream* sink,
const std::shared_ptr<Encryptor>& encryptor,
bool encrypt_footer);
/// \brief Same parameters as the declaration above, with defaults for the last two:
/// encryptor defaults to NULLPTR and encrypt_footer to false.
///
/// NOTE(review): ArrowOutputStream appears to be an alias of
/// ::arrow::io::OutputStream (see parquet/platform.h); if so, this is a redeclaration
/// of the same function that only adds the default arguments -- confirm.
PARQUET_EXPORT
void WriteEncryptedFileMetadata(const FileMetaData& file_metadata,
::arrow::io::OutputStream* sink,
const std::shared_ptr<Encryptor>& encryptor = NULLPTR,
bool encrypt_footer = false);
/// \brief Write the given file crypto metadata to an output stream.
///
/// NOTE(review): only the declaration is visible in this header; the serialized
/// form is defined in the .cc file -- confirm there.
/// \param[in] crypto_metadata crypto metadata to write
/// \param[in] sink destination stream, passed as a raw pointer
PARQUET_EXPORT
void WriteFileCryptoMetaData(const FileCryptoMetaData& crypto_metadata,
                             ::arrow::io::OutputStream* sink);
/// \brief Writer for a Parquet file, organised as a sequence of row groups.
///
/// Row groups are started with AppendRowGroup() or AppendBufferedRowGroup(). The
/// writer owns a Contents implementation (contents_) whose interface mirrors the
/// public methods declared here.
/// NOTE(review): the method definitions live in the .cc file and presumably forward
/// to contents_ -- confirm there.
class PARQUET_EXPORT ParquetFileWriter {
 public:
  // Abstract interface 'Contents', declared here to aid dependency injection and to
  // make it easier to create test fixtures.
  // An implementation of the Contents class is defined in the .cc file
  struct Contents {
    /// \brief Initialize the state shared by all Contents implementations.
    /// \param[in] schema root group node; used to initialize the schema descriptor
    /// \param[in] key_value_metadata file-level key-value metadata, stored as given
    Contents(std::shared_ptr<::parquet::schema::GroupNode> schema,
             std::shared_ptr<const KeyValueMetadata> key_value_metadata)
        : schema_(), key_value_metadata_(std::move(key_value_metadata)) {
      // The descriptor is default-constructed first and then initialized from the
      // root node.
      schema_.Init(std::move(schema));
    }

    // Defaulted, matching RowGroupWriter::Contents.
    virtual ~Contents() = default;

    // Perform any cleanup associated with the file contents
    virtual void Close() = 0;

    virtual RowGroupWriter* AppendRowGroup() = 0;
    virtual RowGroupWriter* AppendBufferedRowGroup() = 0;

    virtual int64_t num_rows() const = 0;
    virtual int num_columns() const = 0;
    virtual int num_row_groups() const = 0;

    virtual const std::shared_ptr<WriterProperties>& properties() const = 0;

    const std::shared_ptr<const KeyValueMetadata>& key_value_metadata() const {
      return key_value_metadata_;
    }

    virtual void AddKeyValueMetadata(
        const std::shared_ptr<const KeyValueMetadata>& key_value_metadata) = 0;

    // Return const-pointer to make it clear that this object is not to be copied
    const SchemaDescriptor* schema() const { return &schema_; }

    SchemaDescriptor schema_;

    /// This should be the only place this is stored. Everything else is a const reference
    std::shared_ptr<const KeyValueMetadata> key_value_metadata_;

    /// Accessor for file_metadata_. This base class never assigns file_metadata_.
    const std::shared_ptr<FileMetaData>& metadata() const { return file_metadata_; }

    std::shared_ptr<FileMetaData> file_metadata_;
  };

  ParquetFileWriter();
  ~ParquetFileWriter();

  /// \brief Create a ParquetFileWriter for the given output stream and schema.
  /// \param[in] sink output stream handed to the writer
  /// \param[in] schema root group node of the file schema
  /// \param[in] properties writer configuration; defaults to
  /// default_writer_properties()
  /// \param[in] key_value_metadata optional file-level key-value metadata
  /// \return the newly created writer, owned by the caller
  static std::unique_ptr<ParquetFileWriter> Open(
      std::shared_ptr<::arrow::io::OutputStream> sink,
      std::shared_ptr<schema::GroupNode> schema,
      std::shared_ptr<WriterProperties> properties = default_writer_properties(),
      std::shared_ptr<const KeyValueMetadata> key_value_metadata = NULLPTR);

  /// \brief Use the given Contents implementation (e.g. a test fixture).
  /// \param[in] contents implementation to use; ownership is transferred to the writer
  void Open(std::unique_ptr<Contents> contents);

  /// Finish writing the file.
  ///
  /// As documented on the members below, metadata() is only available after Close(),
  /// and AddKeyValueMetadata() throws once Close() has been called.
  void Close();

  /// Construct a RowGroupWriter with an arbitrary number of rows.
  ///
  /// Ownership is solely within the ParquetFileWriter. The RowGroupWriter is only valid
  /// until the next call to AppendRowGroup or AppendBufferedRowGroup or Close.
  RowGroupWriter* AppendRowGroup();

  /// Construct a RowGroupWriter that buffers all the values until the RowGroup is ready.
  /// Use this if you want to write a RowGroup based on a certain size
  ///
  /// Ownership is solely within the ParquetFileWriter. The RowGroupWriter is only valid
  /// until the next call to AppendRowGroup or AppendBufferedRowGroup or Close.
  RowGroupWriter* AppendBufferedRowGroup();

  /// \brief Add key-value metadata to the file.
  /// \param[in] key_value_metadata the metadata to add.
  /// \note This will overwrite any existing metadata with the same key(s).
  /// \throw ParquetException if Close() has been called.
  void AddKeyValueMetadata(
      const std::shared_ptr<const KeyValueMetadata>& key_value_metadata);

  /// Number of columns.
  ///
  /// This number is fixed during the lifetime of the writer as it is determined via
  /// the schema.
  int num_columns() const;

  /// Number of rows in the RowGroups started so far.
  ///
  /// Changes on the addition of a new RowGroup.
  int64_t num_rows() const;

  /// Number of started RowGroups.
  int num_row_groups() const;

  /// Configuration passed to the writer, e.g. the used Parquet format version.
  const std::shared_ptr<WriterProperties>& properties() const;

  /// Returns the file schema descriptor
  const SchemaDescriptor* schema() const;

  /// Returns a column descriptor in schema
  const ColumnDescriptor* descr(int i) const;

  /// Returns the file custom metadata
  const std::shared_ptr<const KeyValueMetadata>& key_value_metadata() const;

  /// Returns the file metadata, only available after calling Close().
  // NOTE(review): returns a const shared_ptr by value, unlike Contents::metadata()
  // which returns a const reference; changing this would also require changing the
  // out-of-line definition.
  const std::shared_ptr<FileMetaData> metadata() const;

 private:
  // Holds a pointer to an instance of Contents implementation
  std::unique_ptr<Contents> contents_;
  std::shared_ptr<FileMetaData> file_metadata_;
};
} // namespace parquet
|