File size: 20,318 Bytes
9c6594c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

/// \file abi.h Arrow C Data Interface
///
/// The Arrow C Data interface defines a very small, stable set
/// of C definitions which can be easily copied into any project's
/// source code and vendored to be used for columnar data interchange
/// in the Arrow format. For non-C/C++ languages and runtimes,
/// it should be almost as easy to translate the C definitions into
/// the corresponding C FFI declarations.
///
/// Applications and libraries can therefore work with Arrow memory
/// without necessarily using the Arrow libraries or reinventing
/// the wheel. Developers can choose between tight integration
/// with the Arrow software project or minimal integration with
/// the Arrow format only.

#pragma once

#include <stdint.h>

// Spec and documentation: https://arrow.apache.org/docs/format/CDataInterface.html

#ifdef __cplusplus
extern "C" {
#endif

#ifndef ARROW_C_DATA_INTERFACE
#  define ARROW_C_DATA_INTERFACE

#  define ARROW_FLAG_DICTIONARY_ORDERED 1
#  define ARROW_FLAG_NULLABLE 2
#  define ARROW_FLAG_MAP_KEYS_SORTED 4

struct ArrowSchema {
  // Array type description
  const char* format;
  const char* name;
  const char* metadata;
  int64_t flags;
  int64_t n_children;
  struct ArrowSchema** children;
  struct ArrowSchema* dictionary;

  // Release callback
  void (*release)(struct ArrowSchema*);
  // Opaque producer-specific data
  void* private_data;
};

struct ArrowArray {
  // Array data description
  int64_t length;
  int64_t null_count;
  int64_t offset;
  int64_t n_buffers;
  int64_t n_children;
  const void** buffers;
  struct ArrowArray** children;
  struct ArrowArray* dictionary;

  // Release callback
  void (*release)(struct ArrowArray*);
  // Opaque producer-specific data
  void* private_data;
};

#  define ARROW_STATISTICS_KEY_AVERAGE_BYTE_WIDTH_EXACT "ARROW:average_byte_width:exact"
#  define ARROW_STATISTICS_KEY_AVERAGE_BYTE_WIDTH_APPROXIMATE \
    "ARROW:average_byte_width:approximate"
#  define ARROW_STATISTICS_KEY_DISTINCT_COUNT_EXACT "ARROW:distinct_count:exact"
#  define ARROW_STATISTICS_KEY_DISTINCT_COUNT_APPROXIMATE \
    "ARROW:distinct_count:approximate"
#  define ARROW_STATISTICS_KEY_MAX_BYTE_WIDTH_EXACT "ARROW:max_byte_width:exact"
#  define ARROW_STATISTICS_KEY_MAX_BYTE_WIDTH_APPROXIMATE \
    "ARROW:max_byte_width:approximate"
#  define ARROW_STATISTICS_KEY_MAX_VALUE_EXACT "ARROW:max_value:exact"
#  define ARROW_STATISTICS_KEY_MAX_VALUE_APPROXIMATE "ARROW:max_value:approximate"
#  define ARROW_STATISTICS_KEY_MIN_VALUE_EXACT "ARROW:min_value:exact"
#  define ARROW_STATISTICS_KEY_MIN_VALUE_APPROXIMATE "ARROW:min_value:approximate"
#  define ARROW_STATISTICS_KEY_NULL_COUNT_EXACT "ARROW:null_count:exact"
#  define ARROW_STATISTICS_KEY_NULL_COUNT_APPROXIMATE "ARROW:null_count:approximate"
#  define ARROW_STATISTICS_KEY_ROW_COUNT_EXACT "ARROW:row_count:exact"
#  define ARROW_STATISTICS_KEY_ROW_COUNT_APPROXIMATE "ARROW:row_count:approximate"

#endif  // ARROW_C_DATA_INTERFACE

#ifndef ARROW_C_DEVICE_DATA_INTERFACE
#  define ARROW_C_DEVICE_DATA_INTERFACE

// Spec and Documentation: https://arrow.apache.org/docs/format/CDeviceDataInterface.html

// DeviceType for the allocated memory
typedef int32_t ArrowDeviceType;

// CPU device, same as using ArrowArray directly
#  define ARROW_DEVICE_CPU 1
// CUDA GPU Device
#  define ARROW_DEVICE_CUDA 2
// Pinned CUDA CPU memory by cudaMallocHost
#  define ARROW_DEVICE_CUDA_HOST 3
// OpenCL Device
#  define ARROW_DEVICE_OPENCL 4
// Vulkan buffer for next-gen graphics
#  define ARROW_DEVICE_VULKAN 7
// Metal for Apple GPU
#  define ARROW_DEVICE_METAL 8
// Verilog simulator buffer
#  define ARROW_DEVICE_VPI 9
// ROCm GPUs for AMD GPUs
#  define ARROW_DEVICE_ROCM 10
// Pinned ROCm CPU memory allocated by hipMallocHost
#  define ARROW_DEVICE_ROCM_HOST 11
// Reserved for extension
#  define ARROW_DEVICE_EXT_DEV 12
// CUDA managed/unified memory allocated by cudaMallocManaged
#  define ARROW_DEVICE_CUDA_MANAGED 13
// unified shared memory allocated on a oneAPI non-partitioned device.
#  define ARROW_DEVICE_ONEAPI 14
// GPU support for next-gen WebGPU standard
#  define ARROW_DEVICE_WEBGPU 15
// Qualcomm Hexagon DSP
#  define ARROW_DEVICE_HEXAGON 16

struct ArrowDeviceArray {
  // the Allocated Array
  //
  // the buffers in the array (along with the buffers of any
  // children) are what is allocated on the device.
  struct ArrowArray array;
  // The device id to identify a specific device
  int64_t device_id;
  // The type of device which can access this memory.
  ArrowDeviceType device_type;
  // An event-like object to synchronize on if needed.
  void* sync_event;
  // Reserved bytes for future expansion.
  int64_t reserved[3];
};

#endif  // ARROW_C_DEVICE_DATA_INTERFACE

#ifndef ARROW_C_STREAM_INTERFACE
#  define ARROW_C_STREAM_INTERFACE

struct ArrowArrayStream {
  // Callback to get the stream type
  // (will be the same for all arrays in the stream).
  //
  // Return value: 0 if successful, an `errno`-compatible error code otherwise.
  //
  // If successful, the ArrowSchema must be released independently from the stream.
  int (*get_schema)(struct ArrowArrayStream*, struct ArrowSchema* out);

  // Callback to get the next array
  // (if no error and the array is released, the stream has ended)
  //
  // Return value: 0 if successful, an `errno`-compatible error code otherwise.
  //
  // If successful, the ArrowArray must be released independently from the stream.
  int (*get_next)(struct ArrowArrayStream*, struct ArrowArray* out);

  // Callback to get optional detailed error information.
  // This must only be called if the last stream operation failed
  // with a non-0 return code.
  //
  // Return value: pointer to a null-terminated character array describing
  // the last error, or NULL if no description is available.
  //
  // The returned pointer is only valid until the next operation on this stream
  // (including release).
  const char* (*get_last_error)(struct ArrowArrayStream*);

  // Release callback: release the stream's own resources.
  // Note that arrays returned by `get_next` must be individually released.
  void (*release)(struct ArrowArrayStream*);

  // Opaque producer-specific data
  void* private_data;
};

#endif  // ARROW_C_STREAM_INTERFACE

#ifndef ARROW_C_DEVICE_STREAM_INTERFACE
#  define ARROW_C_DEVICE_STREAM_INTERFACE

// Equivalent to ArrowArrayStream, but for ArrowDeviceArrays.
//
// This stream is intended to provide a stream of data on a single
// device, if a producer wants data to be produced on multiple devices
// then multiple streams should be provided. One per device.
struct ArrowDeviceArrayStream {
  // The device that this stream produces data on.
  ArrowDeviceType device_type;

  // Callback to get the stream schema
  // (will be the same for all arrays in the stream).
  //
  // Return value 0 if successful, an `errno`-compatible error code otherwise.
  //
  // If successful, the ArrowSchema must be released independently from the stream.
  // The schema should be accessible via CPU memory.
  int (*get_schema)(struct ArrowDeviceArrayStream* self, struct ArrowSchema* out);

  // Callback to get the next array
  // (if no error and the array is released, the stream has ended)
  //
  // Return value: 0 if successful, an `errno`-compatible error code otherwise.
  //
  // If successful, the ArrowDeviceArray must be released independently from the stream.
  int (*get_next)(struct ArrowDeviceArrayStream* self, struct ArrowDeviceArray* out);

  // Callback to get optional detailed error information.
  // This must only be called if the last stream operation failed
  // with a non-0 return code.
  //
  // Return value: pointer to a null-terminated character array describing
  // the last error, or NULL if no description is available.
  //
  // The returned pointer is only valid until the next operation on this stream
  // (including release).
  const char* (*get_last_error)(struct ArrowDeviceArrayStream* self);

  // Release callback: release the stream's own resources.
  // Note that arrays returned by `get_next` must be individually released.
  void (*release)(struct ArrowDeviceArrayStream* self);

  // Opaque producer-specific data
  void* private_data;
};

#endif  // ARROW_C_DEVICE_STREAM_INTERFACE

#ifndef ARROW_C_ASYNC_STREAM_INTERFACE
#  define ARROW_C_ASYNC_STREAM_INTERFACE

// EXPERIMENTAL: ArrowAsyncTask represents available data from a producer that was passed
// to an invocation of `on_next_task` on the ArrowAsyncDeviceStreamHandler.
//
// The reason for this Task approach instead of the Async interface returning
// the Array directly is to allow for more complex thread handling and reducing
// context switching and data transfers between CPU cores (e.g. from one L1/L2
// cache to another) if desired.
//
// For example, the `on_next_task` callback can be called when data is ready, while
// the producer puts potential "decoding" logic in the `ArrowAsyncTask` object. This
// allows for the producer to manage the I/O on one thread which calls `on_next_task`
// and the consumer can determine when the decoding (producer logic in the `extract_data`
// callback of the task) occurs and on which thread, to avoid a CPU core transfer
// (data staying in the L2 cache).
struct ArrowAsyncTask {
  // This callback should populate the ArrowDeviceArray associated with this task.
  // The order of ArrowAsyncTasks provided by the producer enables a consumer to
  // ensure the order of data to process.
  //
  // This function is expected to be synchronous, but should not perform any blocking
  // I/O. Ideally it should be as cheap as possible so as to not tie up the consumer
  // thread unnecessarily.
  //
  // Returns: 0 if successful, errno-compatible error otherwise.
  //
  // If a non-0 value is returned then it should be followed by a call to `on_error`
  // on the appropriate ArrowAsyncDeviceStreamHandler. This is because it's highly
  // likely that whatever is calling this function may be entirely disconnected from
  // the current control flow. Indicating an error here with a non-zero return allows
  // the current flow to be aware of the error occurring, while still allowing any
  // logging or error handling to still be centralized in the `on_error` callback of
  // the original Async handler.
  //
  // Rather than a release callback, any required cleanup should be performed as part
  // of the invocation of `extract_data`. Ownership of the Array is passed to the consumer
  // calling this, and so it must be released separately.
  //
  // It is only valid to call this method exactly once.
  int (*extract_data)(struct ArrowAsyncTask* self, struct ArrowDeviceArray* out);

  // opaque task-specific data
  void* private_data;
};

// EXPERIMENTAL: ArrowAsyncProducer represents a 1-to-1 relationship between an async
// producer and consumer. This object allows the consumer to perform backpressure and flow
// control on the asynchronous stream processing. This object must be owned by the
// producer who creates it, and thus is responsible for cleaning it up.
struct ArrowAsyncProducer {
  // The device type that this stream produces data on.
  ArrowDeviceType device_type;

  // A consumer must call this function to start receiving on_next_task calls.
  //
  // It *must* be valid to call this synchronously from within `on_next_task` or
  // `on_schema`, but this function *must not* immediately call `on_next_task` so as
  // to avoid recursion and reentrant callbacks.
  //
  // After cancel has been called, additional calls to this function must be NOPs,
  // but allowed. While not cancelled, calling this function must register the
  // given number of additional arrays/batches to be produced with the producer.
  // The producer should only call `on_next_task` at most the registered number
  // of arrays before propagating backpressure.
  //
  // Any error encountered by calling request must be propagated by calling the `on_error`
  // callback of the ArrowAsyncDeviceStreamHandler.
  //
  // While not cancelled, any subsequent calls to `on_next_task`, `on_error` or
  // `release` should be scheduled by the producer to be called later.
  //
  // It is invalid for a consumer to call this with a value of n <= 0, producers should
  // error if given such a value.
  void (*request)(struct ArrowAsyncProducer* self, int64_t n);

  // This cancel callback signals a producer that it must eventually stop making calls
  // to on_next_task. It must be idempotent and thread-safe. After calling cancel once,
  // subsequent calls must be NOPs. This must not call any consumer-side handlers other
  // than `on_error`.
  //
  // It is not required that calling cancel affect the producer immediately, only that it
  // must eventually stop calling on_next_task and subsequently call release on the
  // async handler. As such, a consumer must be prepared to receive one or more calls to
  // `on_next_task` even after calling cancel if there are still requested arrays pending.
  //
  // Successful cancellation should *not* result in the producer calling `on_error`, it
  // should finish out any remaining tasks and eventually call `release`.
  //
  // Any error encountered during handling a call to cancel must be reported via the
  // on_error callback on the async stream handler.
  void (*cancel)(struct ArrowAsyncProducer* self);

  // Any additional metadata tied to a specific stream of data. This must either be NULL
  // or a valid pointer to metadata which is encoded in the same way schema metadata
  // would be. Non-null metadata must be valid for the lifetime of this object. As an
  // example a producer could use this to provide the total number of rows and/or batches
  // in the stream if known.
  const char* additional_metadata;

  // producer-specific opaque data.
  void* private_data;
};

// EXPERIMENTAL: Similar to ArrowDeviceArrayStream, except designed for an asynchronous
// style of interaction. While ArrowDeviceArrayStream provides producer
// defined callbacks, this is intended to be created by the consumer instead.
// The consumer passes this handler to the producer, which in turn uses the
// callbacks to inform the consumer of events in the stream.
struct ArrowAsyncDeviceStreamHandler {
  // Handler for receiving a schema. The passed in stream_schema must be
  // released or moved by the handler (producer is giving ownership of the schema to
  // the handler, but not ownership of the top level object itself).
  //
  // With the exception of an error occurring (on_error), this must be the first
  // callback function which is called by a producer and must only be called exactly
  // once. As such, the producer should provide a valid ArrowAsyncProducer instance
  // so the consumer can control the flow. See the documentation on ArrowAsyncProducer
  // for how it works. The ArrowAsyncProducer is owned by the producer who calls this
  // function and thus the producer is responsible for cleaning it up when calling
  // the release callback of this handler.
  //
  // If there is any additional metadata tied to this stream, it will be provided as
  // a non-null value for the `additional_metadata` field of the ArrowAsyncProducer
  // which will be valid at least until the release callback is called.
  //
  // Return value: 0 if successful, `errno`-compatible error otherwise
  //
  // A producer that receives a non-zero return here should stop producing and eventually
  // call release instead.
  int (*on_schema)(struct ArrowAsyncDeviceStreamHandler* self,
                   struct ArrowSchema* stream_schema);

  // Handler for receiving data. This is called when data is available providing an
  // ArrowAsyncTask struct to signify it. The producer indicates the end of the stream
  // by passing NULL as the value for the task rather than a valid pointer to a task.
  // The task object is only valid for the lifetime of this function call, if a consumer
  // wants to utilize it after this function returns, it must copy or move the contents
  // of it to a new ArrowAsyncTask object.
  //
  // The `request` callback of a provided ArrowAsyncProducer must be called in order
  // to start receiving calls to this handler.
  //
  // The metadata argument can be null or can be used by a producer
  // to pass arbitrary extra information to the consumer (such as total number
  // of rows, context info, or otherwise). The data should be passed using the same
  // encoding as the metadata within the ArrowSchema struct itself (defined in
  // the spec at
  // https://arrow.apache.org/docs/format/CDataInterface.html#c.ArrowSchema.metadata)
  //
  // If metadata is non-null then it only needs to exist for the lifetime of this call,
  // a consumer who wants it to live after that must copy it to ensure lifetime.
  //
  // A producer *must not* call this concurrently from multiple different threads.
  //
  // A consumer must be prepared to receive one or more calls to this callback even
  // after calling cancel on the corresponding ArrowAsyncProducer, as cancel does not
  // guarantee it happens immediately.
  //
  // Return value: 0 if successful, `errno`-compatible error otherwise.
  //
  // If the consumer returns a non-zero return from this method, that indicates to the
  // producer that it should stop propagating data as an error occurred. After receiving
  // such a return, the only interaction with this object is for the producer to call
  // the `release` callback.
  int (*on_next_task)(struct ArrowAsyncDeviceStreamHandler* self,
                      struct ArrowAsyncTask* task, const char* metadata);

  // Handler for encountering an error. The producer should call release after
  // this returns to clean up any resources. The `code` passed in can be any error
  // code that a producer wants, but should be errno-compatible for consistency.
  //
  // If the message or metadata are non-null, they will only last as long as this
  // function call. The consumer would need to perform a copy of the data if it is
  // necessary for them to live past the lifetime of this call.
  //
  // Error metadata should be encoded as with metadata in ArrowSchema, defined in
  // the spec at
  // https://arrow.apache.org/docs/format/CDataInterface.html#c.ArrowSchema.metadata
  //
  // It is valid for this to be called by a producer with or without a preceding call
  // to ArrowAsyncProducer.request.
  //
  // This callback must not call any methods of an ArrowAsyncProducer object.
  void (*on_error)(struct ArrowAsyncDeviceStreamHandler* self, int code,
                   const char* message, const char* metadata);

  // Release callback to release any resources for the handler. Should always be
  // called by a producer when it is done utilizing a handler. No callbacks should
  // be called after this is called.
  //
  // It is valid for the release callback to be called by a producer with or without
  // a preceding call to ArrowAsyncProducer.request.
  //
  // The release callback must not call any methods of an ArrowAsyncProducer object.
  void (*release)(struct ArrowAsyncDeviceStreamHandler* self);

  // MUST be populated by the producer BEFORE calling any callbacks other than release.
  // This provides the connection between a handler and its producer, and must exist until
  // the release callback is called.
  struct ArrowAsyncProducer* producer;

  // Opaque handler-specific data
  void* private_data;
};

#endif  // ARROW_C_ASYNC_STREAM_INTERFACE

#ifdef __cplusplus
}
#endif