// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.
/// \file abi.h Arrow C Data Interface
///
/// The Arrow C Data interface defines a very small, stable set
/// of C definitions which can be easily copied into any project's
/// source code and vendored to be used for columnar data interchange
/// in the Arrow format.  For non-C/C++ languages and runtimes,
/// it should be almost as easy to translate the C definitions into
/// the corresponding C FFI declarations.
///
/// Applications and libraries can therefore work with Arrow memory
/// without necessarily using the Arrow libraries or reinventing
/// the wheel.  Developers can choose between tight integration
/// with the Arrow software project or minimal integration with
/// the Arrow format only.
#include <stdint.h>

// Spec and documentation: https://arrow.apache.org/docs/format/CDataInterface.html
// Give the declarations that follow C linkage when this header is compiled
// as C++.  `extern "C"` is not valid C, so the opener is only emitted for C++
// translation units.  The linkage block is closed by the `}` at the end of
// this file.
#ifdef __cplusplus
extern "C" {
#endif
// Describes the type of an Arrow array: a format string, an optional name and
// metadata, flags, child schemas and an optional dictionary schema.
//
// This layout is part of a stable interchange interface shared between
// independent producers and consumers: members must not be reordered,
// resized, removed or added to.
//
// The format-string grammar, the metadata encoding and the meaning of the
// flag bits are defined by the specification at
// https://arrow.apache.org/docs/format/CDataInterface.html
struct ArrowSchema {
  // Array type description
  const char* format;
  const char* name;
  const char* metadata;
  int64_t flags;
  int64_t n_children;
  // Array of pointers to the child schemas (see the spec for its length
  // and ownership rules).
  struct ArrowSchema** children;
  struct ArrowSchema* dictionary;

  // Release callback
  void (*release)(struct ArrowSchema*);
  // Opaque producer-specific data
  void* private_data;
};
// Describes the data of an Arrow array: its length, null count and offset,
// its buffers, its child arrays and an optional dictionary array.
//
// This layout is part of a stable interchange interface shared between
// independent producers and consumers: members must not be reordered,
// resized, removed or added to.
//
// Buffer layout per type and the ownership/release protocol are defined by
// the specification at
// https://arrow.apache.org/docs/format/CDataInterface.html
struct ArrowArray {
  // Array data description
  int64_t length;
  int64_t null_count;
  int64_t offset;
  int64_t n_buffers;
  int64_t n_children;
  // Array of pointers to the array's buffers.
  const void** buffers;
  // Array of pointers to the child arrays.
  struct ArrowArray** children;
  struct ArrowArray* dictionary;

  // Release callback
  void (*release)(struct ArrowArray*);
  // Opaque producer-specific data
  void* private_data;
};
// Spec and Documentation: https://arrow.apache.org/docs/format/CDeviceDataInterface.html

// DeviceType for the allocated memory
typedef int32_t ArrowDeviceType;

// Device type constants usable as ArrowDeviceType values.  The numeric values
// are the ones listed in the Arrow C Device Data Interface specification
// linked above; 5 and 6 are not assigned there.

// CPU device, same as using ArrowArray directly
#define ARROW_DEVICE_CPU 1
// CUDA GPU Device
#define ARROW_DEVICE_CUDA 2
// Pinned CUDA CPU memory by cudaMallocHost
#define ARROW_DEVICE_CUDA_HOST 3
// OpenCL Device
#define ARROW_DEVICE_OPENCL 4
// Vulkan buffer for next-gen graphics
#define ARROW_DEVICE_VULKAN 7
// Metal for Apple GPU
#define ARROW_DEVICE_METAL 8
// Verilog simulator buffer
#define ARROW_DEVICE_VPI 9
// ROCm GPUs for AMD GPUs
#define ARROW_DEVICE_ROCM 10
// Pinned ROCm CPU memory allocated by hipMallocHost
#define ARROW_DEVICE_ROCM_HOST 11
// Reserved for extension
#define ARROW_DEVICE_EXT_DEV 12
// CUDA managed/unified memory allocated by cudaMallocManaged
#define ARROW_DEVICE_CUDA_MANAGED 13
// unified shared memory allocated on a oneAPI non-partitioned device.
#define ARROW_DEVICE_ONEAPI 14
// GPU support for next-gen WebGPU standard
#define ARROW_DEVICE_WEBGPU 15
// Qualcomm Hexagon DSP
#define ARROW_DEVICE_HEXAGON 16
struct ArrowDeviceArray { | |
// the Allocated Array | |
// | |
// the buffers in the array (along with the buffers of any | |
// children) are what is allocated on the device. | |
struct ArrowArray array; | |
// The device id to identify a specific device | |
int64_t device_id; | |
// The type of device which can access this memory. | |
ArrowDeviceType device_type; | |
// An event-like object to synchronize on if needed. | |
void* sync_event; | |
// Reserved bytes for future expansion. | |
int64_t reserved[3]; | |
}; | |
// A producer-provided stream of ArrowArrays that all share one schema.
// The consumer pulls data by invoking the callbacks below, always passing the
// stream itself as the first argument.
//
// This layout is part of a stable interchange interface: members must not be
// reordered, resized, removed or added to.
struct ArrowArrayStream {
  // Callback to get the stream type
  // (will be the same for all arrays in the stream).
  //
  // Return value: 0 if successful, an `errno`-compatible error code otherwise.
  //
  // If successful, the ArrowSchema must be released independently from the stream.
  int (*get_schema)(struct ArrowArrayStream*, struct ArrowSchema* out);

  // Callback to get the next array
  // (if no error and the array is released, the stream has ended)
  //
  // Return value: 0 if successful, an `errno`-compatible error code otherwise.
  //
  // If successful, the ArrowArray must be released independently from the stream.
  int (*get_next)(struct ArrowArrayStream*, struct ArrowArray* out);

  // Callback to get optional detailed error information.
  // This must only be called if the last stream operation failed
  // with a non-0 return code.
  //
  // Return value: pointer to a null-terminated character array describing
  // the last error, or NULL if no description is available.
  //
  // The returned pointer is only valid until the next operation on this stream
  // (including release).
  const char* (*get_last_error)(struct ArrowArrayStream*);

  // Release callback: release the stream's own resources.
  // Note that arrays returned by `get_next` must be individually released.
  void (*release)(struct ArrowArrayStream*);

  // Opaque producer-specific data
  void* private_data;
};
// Equivalent to ArrowArrayStream, but for ArrowDeviceArrays. | |
// | |
// This stream is intended to provide a stream of data on a single | |
// device, if a producer wants data to be produced on multiple devices | |
// then multiple streams should be provided. One per device. | |
struct ArrowDeviceArrayStream { | |
// The device that this stream produces data on. | |
ArrowDeviceType device_type; | |
// Callback to get the stream schema | |
// (will be the same for all arrays in the stream). | |
// | |
// Return value 0 if successful, an `errno`-compatible error code otherwise. | |
// | |
// If successful, the ArrowSchema must be released independently from the stream. | |
// The schema should be accessible via CPU memory. | |
int (*get_schema)(struct ArrowDeviceArrayStream* self, struct ArrowSchema* out); | |
// Callback to get the next array | |
// (if no error and the array is released, the stream has ended) | |
// | |
// Return value: 0 if successful, an `errno`-compatible error code otherwise. | |
// | |
// If successful, the ArrowDeviceArray must be released independently from the stream. | |
int (*get_next)(struct ArrowDeviceArrayStream* self, struct ArrowDeviceArray* out); | |
// Callback to get optional detailed error information. | |
// This must only be called if the last stream operation failed | |
// with a non-0 return code. | |
// | |
// Return value: pointer to a null-terminated character array describing | |
// the last error, or NULL if no description is available. | |
// | |
// The returned pointer is only valid until the next operation on this stream | |
// (including release). | |
const char* (*get_last_error)(struct ArrowDeviceArrayStream* self); | |
// Release callback: release the stream's own resources. | |
// Note that arrays returned by `get_next` must be individually released. | |
void (*release)(struct ArrowDeviceArrayStream* self); | |
// Opaque producer-specific data | |
void* private_data; | |
}; | |
// EXPERIMENTAL: ArrowAsyncTask represents available data from a producer that was passed
// to an invocation of `on_next_task` on the ArrowAsyncDeviceStreamHandler.
//
// The reason for this Task approach instead of the Async interface returning
// the Array directly is to allow for more complex thread handling and reducing
// context switching and data transfers between CPU cores (e.g. from one L1/L2
// cache to another) if desired.
//
// For example, the `on_next_task` callback can be called when data is ready, while
// the producer puts potential "decoding" logic in the `ArrowAsyncTask` object. This
// allows for the producer to manage the I/O on one thread which calls `on_next_task`
// and the consumer can determine when the decoding (producer logic in the `extract_data`
// callback of the task) occurs and on which thread, to avoid a CPU core transfer
// (data staying in the L2 cache).
//
// This layout is part of a stable interchange interface: members must not be
// reordered, resized, removed or added to.
struct ArrowAsyncTask {
  // This callback should populate the ArrowDeviceArray associated with this task.
  // The order of ArrowAsyncTasks provided by the producer enables a consumer to
  // ensure the order of data to process.
  //
  // This function is expected to be synchronous, but should not perform any blocking
  // I/O. Ideally it should be as cheap as possible so as to not tie up the consumer
  // thread unnecessarily.
  //
  // Returns: 0 if successful, errno-compatible error otherwise.
  //
  // If a non-0 value is returned then it should be followed by a call to `on_error`
  // on the appropriate ArrowAsyncDeviceStreamHandler. This is because it's highly
  // likely that whatever is calling this function may be entirely disconnected from
  // the current control flow. Indicating an error here with a non-zero return allows
  // the current flow to be aware of the error occurring, while still allowing any
  // logging or error handling to still be centralized in the `on_error` callback of
  // the original Async handler.
  //
  // Rather than a release callback, any required cleanup should be performed as part
  // of the invocation of `extract_data`. Ownership of the Array is passed to the consumer
  // calling this, and so it must be released separately.
  //
  // It is only valid to call this method exactly once.
  int (*extract_data)(struct ArrowAsyncTask* self, struct ArrowDeviceArray* out);

  // opaque task-specific data
  void* private_data;
};
// EXPERIMENTAL: ArrowAsyncProducer represents a 1-to-1 relationship between an async | |
// producer and consumer. This object allows the consumer to perform backpressure and flow | |
// control on the asynchronous stream processing. This object must be owned by the | |
// producer who creates it, and thus is responsible for cleaning it up. | |
struct ArrowAsyncProducer { | |
// The device type that this stream produces data on. | |
ArrowDeviceType device_type; | |
// A consumer must call this function to start receiving on_next_task calls. | |
// | |
// It *must* be valid to call this synchronously from within `on_next_task` or | |
// `on_schema`, but this function *must not* immediately call `on_next_task` so as | |
// to avoid recursion and reentrant callbacks. | |
// | |
// After cancel has been called, additional calls to this function must be NOPs, | |
// but allowed. While not cancelled, calling this function must register the | |
// given number of additional arrays/batches to be produced with the producer. | |
// The producer should only call `on_next_task` at most the registered number | |
// of arrays before propagating backpressure. | |
// | |
// Any error encountered by calling request must be propagated by calling the `on_error` | |
// callback of the ArrowAsyncDeviceStreamHandler. | |
// | |
// While not cancelled, any subsequent calls to `on_next_task`, `on_error` or | |
// `release` should be scheduled by the producer to be called later. | |
// | |
// It is invalid for a consumer to call this with a value of n <= 0, producers should | |
// error if given such a value. | |
void (*request)(struct ArrowAsyncProducer* self, int64_t n); | |
// This cancel callback signals a producer that it must eventually stop making calls | |
// to on_next_task. It must be idempotent and thread-safe. After calling cancel once, | |
// subsequent calls must be NOPs. This must not call any consumer-side handlers other | |
// than `on_error`. | |
// | |
// It is not required that calling cancel affect the producer immediately, only that it | |
// must eventually stop calling on_next_task and subsequently call release on the | |
// async handler. As such, a consumer must be prepared to receive one or more calls to | |
// `on_next_task` even after calling cancel if there are still requested arrays pending. | |
// | |
// Successful cancellation should *not* result in the producer calling `on_error`, it | |
// should finish out any remaining tasks and eventually call `release`. | |
// | |
// Any error encountered during handling a call to cancel must be reported via the | |
// on_error callback on the async stream handler. | |
void (*cancel)(struct ArrowAsyncProducer* self); | |
// Any additional metadata tied to a specific stream of data. This must either be NULL | |
// or a valid pointer to metadata which is encoded in the same way schema metadata | |
// would be. Non-null metadata must be valid for the lifetime of this object. As an | |
// example a producer could use this to provide the total number of rows and/or batches | |
// in the stream if known. | |
const char* additional_metadata; | |
// producer-specific opaque data. | |
void* private_data; | |
}; | |
// EXPERIMENTAL: Similar to ArrowDeviceArrayStream, except designed for an asynchronous
// style of interaction. While ArrowDeviceArrayStream provides producer
// defined callbacks, this is intended to be created by the consumer instead.
// The consumer passes this handler to the producer, which in turn uses the
// callbacks to inform the consumer of events in the stream.
//
// This layout is part of a stable interchange interface: members must not be
// reordered, resized, removed or added to.
struct ArrowAsyncDeviceStreamHandler {
  // Handler for receiving a schema. The passed in stream_schema must be
  // released or moved by the handler (producer is giving ownership of the schema to
  // the handler, but not ownership of the top level object itself).
  //
  // With the exception of an error occurring (on_error), this must be the first
  // callback function which is called by a producer and must only be called exactly
  // once. As such, the producer should provide a valid ArrowAsyncProducer instance
  // so the consumer can control the flow. See the documentation on ArrowAsyncProducer
  // for how it works. The ArrowAsyncProducer is owned by the producer who calls this
  // function and thus the producer is responsible for cleaning it up when calling
  // the release callback of this handler.
  //
  // If there is any additional metadata tied to this stream, it will be provided as
  // a non-null value for the `additional_metadata` field of the ArrowAsyncProducer
  // which will be valid at least until the release callback is called.
  //
  // Return value: 0 if successful, `errno`-compatible error otherwise
  //
  // A producer that receives a non-zero return here should stop producing and eventually
  // call release instead.
  int (*on_schema)(struct ArrowAsyncDeviceStreamHandler* self,
                   struct ArrowSchema* stream_schema);

  // Handler for receiving data. This is called when data is available providing an
  // ArrowAsyncTask struct to signify it. The producer indicates the end of the stream
  // by passing NULL as the value for the task rather than a valid pointer to a task.
  // The task object is only valid for the lifetime of this function call, if a consumer
  // wants to utilize it after this function returns, it must copy or move the contents
  // of it to a new ArrowAsyncTask object.
  //
  // The `request` callback of a provided ArrowAsyncProducer must be called in order
  // to start receiving calls to this handler.
  //
  // The metadata argument can be null or can be used by a producer
  // to pass arbitrary extra information to the consumer (such as total number
  // of rows, context info, or otherwise). The data should be passed using the same
  // encoding as the metadata within the ArrowSchema struct itself (defined in
  // the spec at
  // https://arrow.apache.org/docs/format/CDataInterface.html#c.ArrowSchema.metadata)
  //
  // If metadata is non-null then it only needs to exist for the lifetime of this call,
  // a consumer who wants it to live after that must copy it to ensure lifetime.
  //
  // A producer *must not* call this concurrently from multiple different threads.
  //
  // A consumer must be prepared to receive one or more calls to this callback even
  // after calling cancel on the corresponding ArrowAsyncProducer, as cancel does not
  // guarantee it happens immediately.
  //
  // Return value: 0 if successful, `errno`-compatible error otherwise.
  //
  // If the consumer returns a non-zero return from this method, that indicates to the
  // producer that it should stop propagating data as an error occurred. After receiving
  // such a return, the only interaction with this object is for the producer to call
  // the `release` callback.
  int (*on_next_task)(struct ArrowAsyncDeviceStreamHandler* self,
                      struct ArrowAsyncTask* task, const char* metadata);

  // Handler for encountering an error. The producer should call release after
  // this returns to clean up any resources. The `code` passed in can be any error
  // code that a producer wants, but should be errno-compatible for consistency.
  //
  // If the message or metadata are non-null, they will only last as long as this
  // function call. The consumer would need to perform a copy of the data if it is
  // necessary for them to live past the lifetime of this call.
  //
  // Error metadata should be encoded as with metadata in ArrowSchema, defined in
  // the spec at
  // https://arrow.apache.org/docs/format/CDataInterface.html#c.ArrowSchema.metadata
  //
  // It is valid for this to be called by a producer with or without a preceding call
  // to ArrowAsyncProducer.request.
  //
  // This callback must not call any methods of an ArrowAsyncProducer object.
  void (*on_error)(struct ArrowAsyncDeviceStreamHandler* self, int code,
                   const char* message, const char* metadata);

  // Release callback to release any resources for the handler. Should always be
  // called by a producer when it is done utilizing a handler. No callbacks should
  // be called after this is called.
  //
  // It is valid for the release callback to be called by a producer with or without
  // a preceding call to ArrowAsyncProducer.request.
  //
  // The release callback must not call any methods of an ArrowAsyncProducer object.
  void (*release)(struct ArrowAsyncDeviceStreamHandler* self);

  // MUST be populated by the producer BEFORE calling any callbacks other than release.
  // This provides the connection between a handler and its producer, and must exist until
  // the release callback is called.
  struct ArrowAsyncProducer* producer;

  // Opaque handler-specific data
  void* private_data;
};
// Closes the `extern "C"` linkage block opened near the top of this file.
// The brace is only emitted for C++ translation units, since a C compiler
// has no linkage block to close.
#ifdef __cplusplus
}  // extern "C"
#endif