// Licensed to the Apache Software Foundation (ASF) under one | |
// or more contributor license agreements. See the NOTICE file | |
// distributed with this work for additional information | |
// regarding copyright ownership. The ASF licenses this file | |
// to you under the Apache License, Version 2.0 (the | |
// "License"); you may not use this file except in compliance | |
// with the License. You may obtain a copy of the License at | |
// | |
// http://www.apache.org/licenses/LICENSE-2.0 | |
// | |
// Unless required by applicable law or agreed to in writing, | |
// software distributed under the License is distributed on an | |
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | |
// KIND, either express or implied. See the License for the | |
// specific language governing permissions and limitations | |
// under the License. | |
namespace arrow { | |
namespace fs { | |
namespace internal { | |
// Opaque wrapper for GCS's library credentials to avoid exposing in Arrow headers. | |
struct GcsCredentialsHolder; | |
} // namespace internal | |
class GcsFileSystem; | |
/// \brief Container for GCS Credentials and information necessary to recreate them. | |
class ARROW_EXPORT GcsCredentials { | |
public: | |
bool Equals(const GcsCredentials& other) const; | |
bool anonymous() const { return anonymous_; } | |
const std::string& access_token() const { return access_token_; } | |
TimePoint expiration() const { return expiration_; } | |
const std::string& target_service_account() const { return target_service_account_; } | |
const std::string& json_credentials() const { return json_credentials_; } | |
const std::shared_ptr<internal::GcsCredentialsHolder>& holder() const { | |
return holder_; | |
} | |
private: | |
GcsCredentials() = default; | |
bool anonymous_ = false; | |
std::string access_token_; | |
TimePoint expiration_; | |
std::string target_service_account_; | |
std::string json_credentials_; | |
std::shared_ptr<internal::GcsCredentialsHolder> holder_; | |
friend class GcsFileSystem; | |
friend struct GcsOptions; | |
}; | |
/// Options for the GcsFileSystem implementation. | |
struct ARROW_EXPORT GcsOptions { | |
/// \brief Equivalent to GcsOptions::Defaults(). | |
GcsOptions(); | |
GcsCredentials credentials; | |
std::string endpoint_override; | |
std::string scheme; | |
/// \brief Location to use for creating buckets. | |
std::string default_bucket_location; | |
/// \brief If set used to control total time allowed for retrying underlying | |
/// errors. | |
/// | |
/// The default policy is to retry for up to 15 minutes. | |
std::optional<double> retry_limit_seconds; | |
/// \brief Default metadata for OpenOutputStream. | |
/// | |
/// This will be ignored if non-empty metadata is passed to OpenOutputStream. | |
std::shared_ptr<const KeyValueMetadata> default_metadata; | |
/// \brief The project to use for creating buckets. | |
/// | |
/// If not set, the library uses the GOOGLE_CLOUD_PROJECT environment | |
/// variable. Most I/O operations do not need a project id, only applications | |
/// that create new buckets need a project id. | |
std::optional<std::string> project_id; | |
bool Equals(const GcsOptions& other) const; | |
/// \brief Initialize with Google Default Credentials | |
/// | |
/// Create options configured to use [Application Default Credentials][aip/4110]. The | |
/// details of this mechanism are too involved to describe here, but suffice is to say | |
/// that applications can override any defaults using an environment variable | |
/// (`GOOGLE_APPLICATION_CREDENTIALS`), and that the defaults work with most Google | |
/// Cloud Platform deployment environments (GCE, GKE, Cloud Run, etc.), and that have | |
/// the same behavior as the `gcloud` CLI tool on your workstation. | |
/// | |
/// \see https://cloud.google.com/docs/authentication | |
/// | |
/// [aip/4110]: https://google.aip.dev/auth/4110 | |
static GcsOptions Defaults(); | |
/// \brief Initialize with anonymous credentials | |
static GcsOptions Anonymous(); | |
/// \brief Initialize with access token | |
/// | |
/// These credentials are useful when using an out-of-band mechanism to fetch access | |
/// tokens. Note that access tokens are time limited, you will need to manually refresh | |
/// the tokens created by the out-of-band mechanism. | |
static GcsOptions FromAccessToken(const std::string& access_token, | |
TimePoint expiration); | |
/// \brief Initialize with service account impersonation | |
/// | |
/// Service account impersonation allows one principal (a user or service account) to | |
/// impersonate a service account. It requires that the calling principal has the | |
/// necessary permissions *on* the service account. | |
static GcsOptions FromImpersonatedServiceAccount( | |
const GcsCredentials& base_credentials, const std::string& target_service_account); | |
/// Creates service account credentials from a JSON object in string form. | |
/// | |
/// The @p json_object is expected to be in the format described by [aip/4112]. Such an | |
/// object contains the identity of a service account, as well as a private key that can | |
/// be used to sign tokens, showing the caller was holding the private key. | |
/// | |
/// In GCP one can create several "keys" for each service account, and these keys are | |
/// downloaded as a JSON "key file". The contents of such a file are in the format | |
/// required by this function. Remember that key files and their contents should be | |
/// treated as any other secret with security implications, think of them as passwords | |
/// (because they are!), don't store them or output them where unauthorized persons may | |
/// read them. | |
/// | |
/// Most applications should probably use default credentials, maybe pointing them to a | |
/// file with these contents. Using this function may be useful when the json object is | |
/// obtained from a Cloud Secret Manager or a similar service. | |
/// | |
/// [aip/4112]: https://google.aip.dev/auth/4112 | |
static GcsOptions FromServiceAccountCredentials(const std::string& json_object); | |
/// Initialize from URIs such as "gs://bucket/object". | |
static Result<GcsOptions> FromUri(const arrow::util::Uri& uri, std::string* out_path); | |
static Result<GcsOptions> FromUri(const std::string& uri, std::string* out_path); | |
}; | |
/// \brief GCS-backed FileSystem implementation. | |
/// | |
/// GCS (Google Cloud Storage - https://cloud.google.com/storage) is a scalable object | |
/// storage system for any amount of data. The main abstractions in GCS are buckets and | |
/// objects. A bucket is a namespace for objects, buckets can store any number of objects, | |
/// tens of millions and even billions is not uncommon. Each object contains a single | |
/// blob of data, up to 5TiB in size. Buckets are typically configured to keep a single | |
/// version of each object, but versioning can be enabled. Versioning is important because | |
/// objects are immutable, once created one cannot append data to the object or modify the | |
/// object data in any way. | |
/// | |
/// GCS buckets are in a global namespace, if a Google Cloud customer creates a bucket | |
/// named `foo` no other customer can create a bucket with the same name. Note that a | |
/// principal (a user or service account) may only list the buckets they are entitled to, | |
/// and then only within a project. It is not possible to list "all" the buckets. | |
/// | |
/// Within each bucket objects are in flat namespace. GCS does not have folders or | |
/// directories. However, following some conventions it is possible to emulate | |
/// directories. To this end, this class: | |
/// | |
/// - All buckets are treated as directories at the "root" | |
/// - Creating a root directory results in a new bucket being created, this may be slower | |
/// than most GCS operations. | |
/// - The class creates marker objects for a directory, using a metadata attribute to | |
/// annotate the file. | |
/// - GCS can list all the objects with a given prefix, this is used to emulate listing | |
/// of directories. | |
/// - In object lists GCS can summarize all the objects with a common prefix as a single | |
/// entry, this is used to emulate non-recursive lists. Note that GCS list time is | |
/// proportional to the number of objects in the prefix. Listing recursively takes | |
/// almost the same time as non-recursive lists. | |
/// | |
class ARROW_EXPORT GcsFileSystem : public FileSystem { | |
public: | |
~GcsFileSystem() override = default; | |
std::string type_name() const override; | |
const GcsOptions& options() const; | |
bool Equals(const FileSystem& other) const override; | |
Result<std::string> PathFromUri(const std::string& uri_string) const override; | |
Result<FileInfo> GetFileInfo(const std::string& path) override; | |
Result<FileInfoVector> GetFileInfo(const FileSelector& select) override; | |
Status CreateDir(const std::string& path, bool recursive) override; | |
Status DeleteDir(const std::string& path) override; | |
Status DeleteDirContents(const std::string& path, bool missing_dir_ok = false) override; | |
/// This is not implemented in GcsFileSystem, as it would be too dangerous. | |
Status DeleteRootDirContents() override; | |
Status DeleteFile(const std::string& path) override; | |
Status Move(const std::string& src, const std::string& dest) override; | |
Status CopyFile(const std::string& src, const std::string& dest) override; | |
Result<std::shared_ptr<io::InputStream>> OpenInputStream( | |
const std::string& path) override; | |
Result<std::shared_ptr<io::InputStream>> OpenInputStream(const FileInfo& info) override; | |
Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile( | |
const std::string& path) override; | |
Result<std::shared_ptr<io::RandomAccessFile>> OpenInputFile( | |
const FileInfo& info) override; | |
Result<std::shared_ptr<io::OutputStream>> OpenOutputStream( | |
const std::string& path, | |
const std::shared_ptr<const KeyValueMetadata>& metadata) override; | |
Result<std::shared_ptr<io::OutputStream>> OpenAppendStream( | |
const std::string& path, | |
const std::shared_ptr<const KeyValueMetadata>& metadata) override; | |
/// Create a GcsFileSystem instance from the given options. | |
static Result<std::shared_ptr<GcsFileSystem>> Make( | |
const GcsOptions& options, const io::IOContext& = io::default_io_context()); | |
private: | |
explicit GcsFileSystem(const GcsOptions& options, const io::IOContext& io_context); | |
class Impl; | |
std::shared_ptr<Impl> impl_; | |
}; | |
} // namespace fs | |
} // namespace arrow | |