File size: 33,083 Bytes
9c6594c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
#pragma once

#include <stddef.h>
#include <stdint.h>
#include <stdbool.h>

#include <pthreadpool.h>

#ifdef __cplusplus
extern "C" {
#endif

/**
 * @brief Status code for any NNPACK function call.
 *
 * Grouping convention: 0 is success, codes in the 2-19 (and 30) range report invalid
 * arguments, codes 20-29 report valid-but-unsupported arguments, and codes 50+ report
 * library/environment failures.
 */
enum nnp_status {
	/** The call succeeded, and all output arguments now contain valid data. */
	nnp_status_success = 0,
	/** NNPACK function was called with batch_size == 0. */
	nnp_status_invalid_batch_size = 2,
	/** NNPACK function was called with channels == 0. */
	nnp_status_invalid_channels = 3,
	/** NNPACK function was called with input_channels == 0. */
	nnp_status_invalid_input_channels = 4,
	/** NNPACK function was called with output_channels == 0. */
	nnp_status_invalid_output_channels = 5,
	/** NNPACK function was called with input_size.height == 0 or input_size.width == 0 */
	nnp_status_invalid_input_size = 10,
	/** NNPACK function was called with input_stride.height == 0 or input_stride.width == 0 */
	nnp_status_invalid_input_stride = 11,
	/** NNPACK function was called with input_padding not less than respective kernel (or pooling) size, i.e.:
	 *
	 *  - input_padding.left   >= kernel_size.width  (>= pooling_size.width)
	 *  - input_padding.right  >= kernel_size.width  (>= pooling_size.width)
	 *  - input_padding.top    >= kernel_size.height (>= pooling_size.height)
	 *  - input_padding.bottom >= kernel_size.height (>= pooling_size.height)
	 */
	nnp_status_invalid_input_padding = 12,
	/** NNPACK function was called with kernel_size.height == 0 or kernel_size.width == 0 */
	nnp_status_invalid_kernel_size = 13,
	/** NNPACK function was called with pooling_size.height == 0 or pooling_size.width == 0 */
	nnp_status_invalid_pooling_size = 14,
	/** NNPACK function was called with pooling_stride.height == 0 or pooling_stride.width == 0 */
	nnp_status_invalid_pooling_stride = 15,
	/** NNPACK function was called with convolution algorithm not in nnp_convolution_algorithm enumeration */
	nnp_status_invalid_algorithm = 16,
	/** NNPACK function was called with convolution transform strategy not in nnp_convolution_transform_strategy enum */
	nnp_status_invalid_transform_strategy = 17,
	/** NNPACK function was called with output_subsampling.height == 0 or output_subsampling.width == 0.
	 *  Renumbered from 13, which collided with nnp_status_invalid_kernel_size and made the two
	 *  failure conditions indistinguishable to callers. */
	nnp_status_invalid_output_subsampling = 18,
	/** NNPACK function was called with activation not in nnp_activation enum.
	 *  Renumbered from 14, which collided with nnp_status_invalid_pooling_size. */
	nnp_status_invalid_activation = 19,
	/** NNPACK function was called with invalid activation parameters.
	 *  Renumbered from 15, which collided with nnp_status_invalid_pooling_stride.
	 *  Value 30 is used to keep the 20-29 "unsupported" range intact. */
	nnp_status_invalid_activation_parameters = 30,

	/** NNPACK does not support the particular input size for the function */
	nnp_status_unsupported_input_size = 20,
	/** NNPACK does not support the particular input stride for the function */
	nnp_status_unsupported_input_stride = 21,
	/** NNPACK does not support the particular input padding for the function */
	nnp_status_unsupported_input_padding = 22,
	/** NNPACK does not support the particular kernel size for the function */
	nnp_status_unsupported_kernel_size = 23,
	/** NNPACK does not support the particular pooling size for the function */
	nnp_status_unsupported_pooling_size = 24,
	/** NNPACK does not support the particular pooling stride for the function */
	nnp_status_unsupported_pooling_stride = 25,
	/** NNPACK does not support the particular convolution algorithm for the function */
	nnp_status_unsupported_algorithm = 26,
	/** NNPACK does not support the particular convolution transform strategy for the algorithm */
	nnp_status_unsupported_transform_strategy = 27,
	/** NNPACK does not support the particular activation function for the function */
	nnp_status_unsupported_activation = 28,
	/** NNPACK does not support the particular activation function parameters for the function */
	nnp_status_unsupported_activation_parameters = 29,

	/** NNPACK function was called before the library was initialized */
	nnp_status_uninitialized = 50,
	/** NNPACK does not implement this function for the host CPU */
	nnp_status_unsupported_hardware = 51,
	/** NNPACK failed to allocate memory for temporary buffers */
	nnp_status_out_of_memory = 52,
	/** Scratch space buffer is too small */
	nnp_status_insufficient_buffer = 53,
	/** Scratch space buffer is not properly aligned */
	nnp_status_misaligned_buffer = 54
};

/**
 * @brief Activation function applied after a convolutional or fully-connected layer.
 */
enum nnp_activation {
	/** Identity activation f(x) := x, i.e. no transformation */
	nnp_activation_identity = 0,
	/** ReLU activation f(x) := max(0, x) */
	nnp_activation_relu = 1,
};

/**
 * @brief Algorithm for computing convolutional layers.
 *
 * The tiled (FFT- and Winograd-based) algorithms constrain the supported kernel size;
 * see the per-value notes below.
 */
enum nnp_convolution_algorithm {
	/** Let NNPACK choose the algorithm depending on layer parameters */
	nnp_convolution_algorithm_auto = 0,
	/** Tiled convolution based on 2D Fourier transform with 8x8 blocks. Supports kernels up to 8x8. */
	nnp_convolution_algorithm_ft8x8 = 1,
	/** Tiled convolution based on 2D Fourier transform with 16x16 blocks. Supports kernels up to 16x16. */
	nnp_convolution_algorithm_ft16x16 = 2,
	/** Tiled convolution based on 2D Winograd transform F(3x3, 6x6) with 8x8 blocks. Supports only 3x3 kernels. */
	nnp_convolution_algorithm_wt8x8 = 3,
	/** Direct convolution via implicit GEMM. */
	nnp_convolution_algorithm_implicit_gemm = 4,
	/** Direct convolution implementation. */
	nnp_convolution_algorithm_direct = 5,
	/**
	 * Tiled convolution based on 2D Winograd transform F(3x3, 6x6) with 8x8 blocks in FP16.
	 * Supports only 3x3 kernels. Implemented only for new ARM processors (with NEON-HP),
	 * on non-supported processors falls back to nnp_convolution_algorithm_wt8x8.
	 */
	nnp_convolution_algorithm_wt8x8_fp16 = 6,
};

/**
 * @brief Strategy that guides computation of transform coefficients in convolution functions.
 *
 * NOTE(review): per-value semantics below are inferred from the value names and the
 * nnp_convolution_inference documentation -- confirm against the implementation.
 */
enum nnp_convolution_transform_strategy {
	/** Compute transform coefficients during the convolution call. */
	nnp_convolution_transform_strategy_compute = 1,
	/** Presumably: only precompute (kernel) transform coefficients -- confirm. */
	nnp_convolution_transform_strategy_precompute = 2,
	/** Presumably: reuse previously precomputed transform coefficients -- confirm. */
	nnp_convolution_transform_strategy_reuse = 3
};

/* For backward compatibility: the old block-based and tuple-based strategy names
 * are both aliases for the unified "compute" strategy. */
#define nnp_convolution_transform_strategy_block_based nnp_convolution_transform_strategy_compute
#define nnp_convolution_transform_strategy_tuple_based nnp_convolution_transform_strategy_compute

/**
 * @brief Size of images, kernels, and pooling filters in NNPACK.
 * Note: width (horizontal) precedes height (vertical) in member order.
 */
struct nnp_size {
	/** Width (horizontal size) of an image, kernel, or pooling filter. */
	size_t width;
	/** Height (vertical size) of an image, kernel, or pooling filter. */
	size_t height;
};

/**
 * @brief Padding of images in NNPACK.
 * Note: member order is top, right, bottom, left (clockwise, CSS-style).
 */
struct nnp_padding {
	/** Padding above the image data */
	size_t top;
	/** Padding to the right of the image data */
	size_t right;
	/** Padding below the image data */
	size_t bottom;
	/** Padding to the left of the image data */
	size_t left;
};

/**
 * @brief Profiling information about time spent in different phases of a function call.
 */
struct nnp_profile {
	/** Time spent inside the function call, in seconds. */
	double total;
	/** Time spent on transformation of the input or input gradient tensor, in seconds. */
	double input_transform;
	/** Time spent on transformation of the kernel or kernel gradient tensor, in seconds. */
	double kernel_transform;
	/** Time spent on transformation of the output or output gradient tensor, in seconds. */
	double output_transform;
	/** Time spent on multiplication-accumulation of transformed coefficients, in seconds. */
	double block_multiplication;
};

/**
 * @brief Initializes the NNPACK library.
 * Must be called before any other NNPACK function; functions called earlier return
 * nnp_status_uninitialized (see enum nnp_status).
 * @return nnp_status_success on success; nnp_status_unsupported_hardware if NNPACK
 *         does not implement its functions for the host CPU.
 */
enum nnp_status nnp_initialize(void);

/**
 * @brief Deinitializes the NNPACK library, undoing the effect of nnp_initialize.
 */
enum nnp_status nnp_deinitialize(void);

/**
 * @brief Computes output of a 2D convolutional layer from input and kernel tensors.
 * @details This function targets training of convolutional neural networks and performs forward propagation.
 *          It is optimized for moderate minibatch sizes (64-128) and can be inefficient on a small minibatch.
 *          For minibatch size 1, use nnp_convolution_inference for optimal performance.
 * @param algorithm The type of algorithm to use for convolution. Possible values are:
 *
 *    - nnp_convolution_algorithm_auto    -- let the function choose the algorithm.
 *    - nnp_convolution_algorithm_ft8x8   -- tiled convolution based on 2D Fourier transform with 8x8 blocks.
 *                                           Supports kernels up to 8x8.
 *    - nnp_convolution_algorithm_ft16x16 -- tiled convolution based on 2D Fourier transform with 16x16 blocks.
 *                                           Supports kernels up to 16x16.
 *    - nnp_convolution_algorithm_wt8x8   -- tiled convolution based on 2D Winograd transform F(3x3, 6x6).
 *                                           Supports only 3x3 kernels.
 *
 * @param batch_size The number of images on the input and output of the convolutional layer.
 * @param input_channels The number of channels (AKA features, dimensions) in the input images.
 * @param output_channels The number of channels (AKA features, dimensions) in the output images.
 * @param input_size Size of input images, excluding implicit zero-padding.
 * @param input_padding Implicit zero-padding of input images.
 * @param kernel_size Kernel size.
 * @param[in]  input  A 4D tensor input[batch_size][input_channels][input_size.height][input_size.width].
 * @param[in]  kernel A 4D tensor kernel[output_channels][input_channels][kernel_size.height][kernel_size.width].
 * @param[in]  bias   A 1D array bias[output_channels].
 * @param[out] output A 4D tensor output[batch_size][output_channels][output_size.height][output_size.width] where
 *                        output_size.height = (input_padding.top + input_size.height + input_padding.bottom) -
 *                                             (kernel_size.height - 1)
 *                        output_size.width  = (input_padding.left + input_size.width + input_padding.right) -
 *                                             (kernel_size.width - 1)
 * @param[in] workspace_buffer Buffer for scratch memory used during computation, or NULL.
 *                             See nnp_convolution_inference for the workspace_buffer/workspace_size contract
 *                             (NULL pointers trigger a size query or one-shot internal allocation).
 * @param[in,out] workspace_size Pointer to the size, in bytes, of the workspace buffer, or NULL.
 * @param activation Activation function to apply to the layer output; must be a valid enum nnp_activation value.
 * @param activation_parameters Parameters of the activation function.
 *                              NOTE(review): format not specified in this header -- confirm against implementation.
 * @param threadpool A thread pool for parallelization of the computation.
 *                   If threadpool is NULL, the computation would run on the caller thread without parallelization.
 * @param[out] profile An optional pointer to profiling structure.
 *                     If provided, the structure would record time spent in different phases of the computation.
 */

enum nnp_status nnp_convolution_output(
	enum nnp_convolution_algorithm algorithm,
	size_t batch_size,
	size_t input_channels,
	size_t output_channels,
	struct nnp_size input_size,
	struct nnp_padding input_padding,
	struct nnp_size kernel_size,
	const float* input,
	const float* kernel,
	const float* bias,
	float* output,
	void* workspace_buffer,
	size_t* workspace_size,
	enum nnp_activation activation,
	const void* activation_parameters,
	pthreadpool_t threadpool,
	struct nnp_profile* profile);

/**
 * @brief Computes gradient of input of a 2D convolutional layer from gradient of output and kernel tensors.
 * @details This function targets training of convolutional neural networks and performs backward propagation.
 *          It is optimized for moderate minibatch sizes (64-128) and can be inefficient on a small minibatch.
 * @param algorithm The type of algorithm to use for convolution. Possible values are:
 *
 *    - nnp_convolution_algorithm_auto    -- let the function choose the algorithm.
 *    - nnp_convolution_algorithm_ft8x8   -- tiled convolution based on 2D Fourier transform with 8x8 blocks.
 *                                           Supports kernels up to 8x8.
 *    - nnp_convolution_algorithm_ft16x16 -- tiled convolution based on 2D Fourier transform with 16x16 blocks.
 *                                           Supports kernels up to 16x16.
 *    - nnp_convolution_algorithm_wt8x8   -- tiled convolution based on 2D Winograd transform F(3x3, 6x6).
 *                                           Supports only 3x3 kernels.
 *
 * @param batch_size The number of images (and their gradients) on the input and output of the convolutional layer.
 * @param input_channels The number of channels (AKA features, dimensions) in the input images (and gradients).
 * @param output_channels The number of channels (AKA features, dimensions) in the output images (and gradients).
 * @param input_size Size of input images and their gradients, excluding implicit zero-padding.
 * @param input_padding Implicit zero-padding of input images.
 * @param kernel_size Kernel size.
 * @param[in]  grad_output A 4D tensor grad_output[batch_size][output_channels][output_size.height][output_size.width]
 *                         where
 *                           output_size.height = (input_padding.top + input_size.height + input_padding.bottom) -
 *                                                (kernel_size.height - 1)
 *                           output_size.width  = (input_padding.left + input_size.width + input_padding.right) -
 *                                                (kernel_size.width - 1)
 * @param[in]  kernel      A 4D tensor kernel[output_channels][input_channels][kernel_size.height][kernel_size.width].
 * @param[out] grad_input  A 4D tensor grad_input[batch_size][input_channels][input_size.height][input_size.width].
 * @param[in] workspace_buffer Buffer for scratch memory used during computation, or NULL.
 *                             See nnp_convolution_inference for the workspace_buffer/workspace_size contract
 *                             (NULL pointers trigger a size query or one-shot internal allocation).
 * @param[in,out] workspace_size Pointer to the size, in bytes, of the workspace buffer, or NULL.
 * @param activation Activation function of the layer; must be a valid enum nnp_activation value.
 * @param activation_parameters Parameters of the activation function.
 *                              NOTE(review): format not specified in this header -- confirm against implementation.
 * @param threadpool A thread pool for parallelization of the computation.
 *                   If threadpool is NULL, the computation would run on the caller thread without parallelization.
 * @param[out] profile An optional pointer to profiling structure.
 *                     If provided, the structure would record time spent in different phases of the computation.
 */
enum nnp_status nnp_convolution_input_gradient(
	enum nnp_convolution_algorithm algorithm,
	size_t batch_size,
	size_t input_channels,
	size_t output_channels,
	struct nnp_size input_size,
	struct nnp_padding input_padding,
	struct nnp_size kernel_size,
	const float* grad_output,
	const float* kernel,
	float* grad_input,
	void* workspace_buffer,
	size_t* workspace_size,
	enum nnp_activation activation,
	const void* activation_parameters,
	pthreadpool_t threadpool,
	struct nnp_profile* profile);

/**
 * @brief Computes gradient of kernel of a 2D convolutional layer from gradient of output and input tensors.
 * @details This function targets training of convolutional neural networks and performs backward propagation.
 *          It is optimized for moderate minibatch sizes (64-128) and can be inefficient on a small minibatch.
 * @param algorithm The type of algorithm to use for convolution. Possible values are:
 *
 *    - nnp_convolution_algorithm_auto    -- let the function choose the algorithm.
 *    - nnp_convolution_algorithm_ft8x8   -- tiled convolution based on 2D Fourier transform with 8x8 blocks.
 *                                           Supports kernels up to 8x8.
 *    - nnp_convolution_algorithm_ft16x16 -- tiled convolution based on 2D Fourier transform with 16x16 blocks.
 *                                           Supports kernels up to 16x16.
 *
 * @param batch_size The number of images (and their gradients) on the input and output of the convolutional layer.
 * @param input_channels The number of channels (AKA features, dimensions) in the input images.
 * @param output_channels The number of channels (AKA features, dimensions) in the output images (and gradients).
 * @param input_size Size of input images and their gradients, excluding implicit zero-padding.
 * @param input_padding Implicit zero-padding of input images.
 * @param kernel_size Kernel size.
 * @param[in]  input       A 4D tensor input[batch_size][input_channels][input_size.height][input_size.width].
 * @param[in]  grad_output A 4D tensor grad_output[batch_size][output_channels][output_size.height][output_size.width]
 *                         where
 *                           output_size.height = (input_padding.top + input_size.height + input_padding.bottom) -
 *                                                (kernel_size.height - 1)
 *                           output_size.width  = (input_padding.left + input_size.width + input_padding.right) -
 *                                                (kernel_size.width - 1)
 * @param[out] grad_kernel A 4D tensor
 *                         grad_kernel[output_channels][input_channels][kernel_size.height][kernel_size.width].
 * @param[in] workspace_buffer Buffer for scratch memory used during computation, or NULL.
 *                             See nnp_convolution_inference for the workspace_buffer/workspace_size contract
 *                             (NULL pointers trigger a size query or one-shot internal allocation).
 * @param[in,out] workspace_size Pointer to the size, in bytes, of the workspace buffer, or NULL.
 * @param activation Activation function of the layer; must be a valid enum nnp_activation value.
 * @param activation_parameters Parameters of the activation function.
 *                              NOTE(review): format not specified in this header -- confirm against implementation.
 * @param threadpool A thread pool for parallelization of the computation.
 *                   If threadpool is NULL, the computation would run on the caller thread without parallelization.
 * @param[out] profile An optional pointer to profiling structure.
 *                     If provided, the structure would record time spent in different phases of the computation.
 */
enum nnp_status nnp_convolution_kernel_gradient(
	enum nnp_convolution_algorithm algorithm,
	size_t batch_size,
	size_t input_channels,
	size_t output_channels,
	struct nnp_size input_size,
	struct nnp_padding input_padding,
	struct nnp_size kernel_size,
	const float* input,
	const float* grad_output,
	float* grad_kernel,
	void* workspace_buffer,
	size_t* workspace_size,
	enum nnp_activation activation,
	const void* activation_parameters,
	pthreadpool_t threadpool,
	struct nnp_profile* profile);

/**
 * @brief Computes output of a 2D convolutional layer for a single input image and a kernel tensor.
 * @details This function targets prediction with convolutional neural networks and performs forward propagation.
 * @param algorithm The type of algorithm to use for convolution. Possible values are:
 *
 *    - nnp_convolution_algorithm_auto    -- let the function choose the algorithm.
 *    - nnp_convolution_algorithm_ft8x8   -- tiled convolution based on 2D Fourier transform with 8x8 blocks.
 *                                           Supports kernels up to 8x8.
 *    - nnp_convolution_algorithm_ft16x16 -- tiled convolution based on 2D Fourier transform with 16x16 blocks.
 *                                           Supports kernels up to 16x16.
 *    - nnp_convolution_algorithm_wt8x8   -- tiled convolution based on 2D Winograd transform F(3x3, 6x6).
 *                                           Supports only 3x3 kernels.
 *
 * @param transform_strategy A strategy that guides computation of kernel transform coefficients.
 *                           Possible values are:
 *
 *    - nnp_convolution_transform_strategy_compute    -- compute transform coefficients during the call.
 *                                                       (The legacy names
 *                                                       nnp_convolution_transform_strategy_block_based and
 *                                                       nnp_convolution_transform_strategy_tuple_based are
 *                                                       backward-compatibility aliases for this value.)
 *    - nnp_convolution_transform_strategy_precompute -- NOTE(review): presumably only precomputes kernel
 *                                                       transform coefficients -- confirm against implementation.
 *    - nnp_convolution_transform_strategy_reuse      -- NOTE(review): presumably reuses previously precomputed
 *                                                       coefficients -- confirm against implementation.
 *
 * @param input_channels The number of channels (AKA features, dimensions) in the input image.
 * @param output_channels The number of channels (AKA features, dimensions) in the output image.
 * @param input_size Size of input image, excluding implicit zero-padding.
 * @param input_padding Implicit zero-padding of input image.
 * @param kernel_size Kernel size.
 * @param output_subsampling Subsample region for output, also known as convolution stride.
 * @param[in]  input  A 3D tensor input[input_channels][input_size.height][input_size.width].
 * @param[in]  kernel A 4D tensor kernel[output_channels][input_channels][kernel_size.height][kernel_size.width].
 * @param[in]  bias   A 1D array bias[output_channels].
 * @param[out] output A 3D tensor output[output_channels][output_size.height][output_size.width] where
 *                        output_size.height = (input_padding.top + input_size.height + input_padding.bottom) -
 *                                             (kernel_size.height - 1)
 *                        output_size.width  = (input_padding.left + input_size.width + input_padding.right) -
 *                                             (kernel_size.width - 1)
 * @param[in] workspace_buffer Buffer for scratch memory used during computation. Buffer must be aligned on 64 bytes.
 *                             If workspace_buffer is NULL and workspace_size is non-NULL, NNPACK would store the size
 *                             of required workspace memory at the workspace_size location, and exit without
 *                             computations.
 *                             If workspace_buffer is NULL and workspace_size is NULL, NNPACK would allocate memory
 *                             before and deallocate after this computation, potentially at significant runtime cost.
 * @param[in,out] workspace_size Pointer to the size of workspace buffer.
 *                               If workspace_buffer is NULL, NNPACK will write the size of required scratch memory to
 *                               the location specified by this pointer.
 *                               If workspace_buffer is non-NULL, NNPACK expects workspace_size to specify the size of
 *                               the buffer, in bytes.
 *                               If workspace_size is NULL, workspace_buffer must be NULL as well. In this case NNPACK
 *                               would allocate memory before and deallocate after this computation, potentially at
 *                               significant runtime cost.
 * @param activation Activation function to apply to the layer output; must be a valid enum nnp_activation value.
 * @param activation_parameters Parameters of the activation function.
 *                              NOTE(review): format not specified in this header -- confirm against implementation.
 * @param threadpool A thread pool for parallelization of the computation.
 *                   If threadpool is NULL, the computation would run on the caller thread without parallelization.
 * @param[out] profile An optional pointer to profiling structure.
 *                     If provided, the structure would record time spent in different phases of the computation.
 */
enum nnp_status nnp_convolution_inference(
	enum nnp_convolution_algorithm algorithm,
	enum nnp_convolution_transform_strategy transform_strategy,
	size_t input_channels,
	size_t output_channels,
	struct nnp_size input_size,
	struct nnp_padding input_padding,
	struct nnp_size kernel_size,
	struct nnp_size output_subsampling,
	const float* input,
	const float* kernel,
	const float* bias,
	float* output,
	void* workspace_buffer,
	size_t* workspace_size,
	enum nnp_activation activation,
	const void* activation_parameters,
	pthreadpool_t threadpool,
	struct nnp_profile* profile);

/**
 * @brief Computes output of a fully connected layer from input and kernel matrices.
 * @details This function targets training of convolutional neural networks and performs forward propagation.
 *          It is optimized for moderate minibatch sizes (64-128) and can be inefficient on a small minibatch.
 *          For minibatch size 1, use nnp_fully_connected_inference for optimal performance.
 * @param batch_size The number of vectors on the input and output of the fully connected layer.
 * @param input_channels The number of channels (AKA features, dimensions) in the input matrix.
 * @param output_channels The number of channels (AKA features, dimensions) in the output matrix.
 * @param[in]  input  A 2D matrix input[batch_size][input_channels].
 * @param[in]  kernel A 2D matrix kernel[output_channels][input_channels].
 * @param[out] output A 2D matrix output[batch_size][output_channels].
 * @param threadpool A thread pool for parallelization of the computation.
 *                   If threadpool is NULL, the computation would run on the caller thread without parallelization.
 * @param[out] profile An optional pointer to profiling structure.
 *                     If provided, the structure would record time spent in different phases of the computation.
 */
enum nnp_status nnp_fully_connected_output(
	size_t batch_size,
	size_t input_channels,
	size_t output_channels,
	const float input[],
	const float kernel[],
	float output[],
	pthreadpool_t threadpool,
	struct nnp_profile* profile);

/**
 * @brief Computes output of a fully connected layer for a single input vector and a kernel matrix.
 * @details This function targets prediction with convolutional neural networks and performs forward propagation.
 *          Unlike nnp_fully_connected_output, this variant takes no profiling structure.
 * @param input_channels The number of channels (AKA features, dimensions) in the input vector.
 * @param output_channels The number of channels (AKA features, dimensions) in the output vector.
 * @param[in]  input  A 1D array input[input_channels] of FP32 elements.
 * @param[in]  kernel A 2D matrix kernel[output_channels][input_channels] of FP32 elements.
 * @param[out] output A 1D array output[output_channels] of FP32 elements.
 * @param threadpool A thread pool for parallelization of the computation.
 *                   If threadpool is NULL, the computation would run on the caller thread without parallelization.
 */
enum nnp_status nnp_fully_connected_inference(
	size_t input_channels,
	size_t output_channels,
	const float* input,
	const float* kernel,
	float* output,
	pthreadpool_t threadpool);

/**
 * @brief Computes output of a fully connected layer for a single input vector and a kernel matrix.
 * @details This function targets prediction with convolutional neural networks and performs forward propagation.
 *          The kernel is passed as const void* because its elements are 16-bit half-precision values
 *          (ARM alternative format) rather than native C floating-point types.
 * @param input_channels The number of channels (AKA features, dimensions) in the input vector.
 * @param output_channels The number of channels (AKA features, dimensions) in the output vector.
 * @param[in]  input  A 1D array input[input_channels] of FP32 elements.
 * @param[in]  kernel A 2D matrix kernel[output_channels][input_channels] of FP16 (ARM alternative format) elements.
 * @param[out] output A 1D array output[output_channels] of FP32 elements.
 * @param threadpool A thread pool for parallelization of the computation.
 *                   If threadpool is NULL, the computation would run on the caller thread without parallelization.
 */
enum nnp_status nnp_fully_connected_inference_f16f32(
	size_t input_channels,
	size_t output_channels,
	const float* input,
	const void* kernel,
	float* output,
	pthreadpool_t threadpool);

/**
 * @brief Computes output of a max-pooling layer for an input tensor.
 * @details This function targets both prediction and training of convolutional neural networks and performs forward
 *          propagation. It is optimized for both large and small minibatch sizes.
 * @param batch_size The number of images on the input and output of the max-pooling layer.
 * @param channels   The number of channels (AKA features, dimensions) in both input and output images.
 * @param input_size Size of input images, excluding implicit zero-padding.
 * @param input_padding Implicit padding of input images. The padding pixels are ignored by the pooling filter, but
 *                      affect the output size.
 * @param pooling_size   Size of the pooling filter. Only 2x2 filter are currently supported.
 * @param pooling_stride Stride of the pooling filter. Only 2x2 strides are currently supported.
 * @param[in]  input  A 4D tensor input[batch_size][channels][input_size.height][input_size.width].
 * @param[out] output A 4D tensor output[batch_size][channels][output_size.height][output_size.width] where
 *                    output_size.height = ceil(
 *                      (input_padding.top + input_size.height + input_padding.bottom - pooling_size.height) /
 *                        pooling_stride.height) + 1
 *                    output_size.width = ceil(
 *                      (input_padding.left + input_size.width + input_padding.right - pooling_size.width) /
 *                        pooling_stride.width) + 1
 * @param threadpool A thread pool for parallelization of the computation.
 *                   If threadpool is NULL, the computation would run on the caller thread without parallelization.
 */
enum nnp_status nnp_max_pooling_output(
	size_t batch_size,
	size_t channels,
	struct nnp_size input_size,
	struct nnp_padding input_padding,
	struct nnp_size pooling_size,
	struct nnp_size pooling_stride,
	const float input[],
	float output[],
	pthreadpool_t threadpool);

/**
 * @brief Computes output of a softmax layer for an input matrix.
 * @details This function targets both prediction and training of convolutional neural networks and performs forward
 *          propagation. It is optimized for both large and small minibatch sizes.
 * @param batch_size The number of vectors on the input and output of the softmax layer.
 * @param channels   The number of channels (AKA features, dimensions) in both input and output vectors.
 * @param[in]  input  A 2D matrix input[batch_size][channels].
 * @param[out] output A 2D matrix output[batch_size][channels].
 * @param threadpool A thread pool for parallelization of the computation.
 *                   If threadpool is NULL, the computation would run on the caller thread without parallelization.
 */
enum nnp_status nnp_softmax_output(
	size_t batch_size,
	size_t channels,
	const float input[],
	float output[],
	pthreadpool_t threadpool);

/**
 * @brief Computes output of a rectified linear unit (ReLU) layer for an input matrix.
 * @details This function targets both prediction and training of convolutional neural networks and performs forward
 *          propagation. It is optimized for both large and small minibatch sizes.
 * @param batch_size The number of vectors on the input and output of the ReLU layer.
 * @param channels   The number of channels (AKA features, dimensions) in both input and output matrices.
 * @param[in]  input  A 2D matrix input[batch_size][channels].
 * @param[out] output A 2D matrix output[batch_size][channels].
 * @param negative_slope Scale factor applied to negative input elements (0.0 yields standard ReLU; a non-zero
 *                       value presumably yields leaky ReLU semantics — confirm against the implementation).
 * @param threadpool A thread pool for parallelization of the computation.
 *                   If threadpool is NULL, the computation would run on the caller thread without parallelization.
 */
enum nnp_status nnp_relu_output(
	size_t batch_size,
	size_t channels,
	const float input[],
	float output[],
	float negative_slope,
	pthreadpool_t threadpool);

/**
 * @brief Computes gradient of input of a rectified linear unit (ReLU) layer from gradient of output and input matrices.
 * @details This function targets training of convolutional neural networks and performs backward propagation.
 *          It is optimized for both large and small minibatch sizes.
 * @param batch_size The number of vectors on the input and output of the ReLU layer.
 * @param channels   The number of channels (AKA features, dimensions) in both input and output matrices.
 * @param[in]  grad_output A 2D matrix grad_output[batch_size][channels] with the gradient of the layer output.
 * @param[in]  input       A 2D matrix input[batch_size][channels] with the layer input from the forward pass.
 * @param[out] grad_input  A 2D matrix grad_input[batch_size][channels] that receives the gradient of the layer input.
 * @param negative_slope Scale factor for negative input elements; should match the value used in the forward pass.
 * @param threadpool A thread pool for parallelization of the computation.
 *                   If threadpool is NULL, the computation would run on the caller thread without parallelization.
 */
enum nnp_status nnp_relu_input_gradient(
	size_t batch_size,
	size_t channels,
	const float grad_output[],
	const float input[],
	float grad_input[],
	float negative_slope,
	pthreadpool_t threadpool);

#ifdef __cplusplus
} /* extern "C" */
#endif

#ifdef __cplusplus
// Backward-compatible C++ overloads of the nnp_convolution_* functions: they
// preserve the historical signatures and forward to the extended C API.
/**
 * @brief Backward-compatible C++ overload of nnp_convolution_output.
 * @details Keeps the historical signature and forwards to the extended C API,
 *          requesting no caller-managed workspace and the identity activation.
 */
inline enum nnp_status nnp_convolution_output(
	enum nnp_convolution_algorithm algorithm,
	size_t batch_size,
	size_t input_channels,
	size_t output_channels,
	struct nnp_size input_size,
	struct nnp_padding input_padding,
	struct nnp_size kernel_size,
	const float input[],
	const float kernel[],
	const float bias[],
	float output[],
	pthreadpool_t threadpool,
	struct nnp_profile* profile)
{
	return nnp_convolution_output(
		algorithm,
		batch_size,
		input_channels,
		output_channels,
		input_size,
		input_padding,
		kernel_size,
		input,
		kernel,
		bias,
		output,
		/*workspace_buffer=*/NULL,
		/*workspace_size=*/NULL,
		nnp_activation_identity,
		/*activation_parameters=*/NULL,
		threadpool,
		profile);
}

/**
 * @brief Backward-compatible C++ overload of nnp_convolution_input_gradient.
 * @details Keeps the historical signature and forwards to the extended C API,
 *          requesting no caller-managed workspace and the identity activation.
 */
inline enum nnp_status nnp_convolution_input_gradient(
	enum nnp_convolution_algorithm algorithm,
	size_t batch_size,
	size_t input_channels,
	size_t output_channels,
	struct nnp_size input_size,
	struct nnp_padding input_padding,
	struct nnp_size kernel_size,
	const float grad_output[],
	const float kernel[],
	float grad_input[],
	pthreadpool_t threadpool,
	struct nnp_profile* profile)
{
	return nnp_convolution_input_gradient(
		algorithm,
		batch_size,
		input_channels,
		output_channels,
		input_size,
		input_padding,
		kernel_size,
		grad_output,
		kernel,
		grad_input,
		/*workspace_buffer=*/NULL,
		/*workspace_size=*/NULL,
		nnp_activation_identity,
		/*activation_parameters=*/NULL,
		threadpool,
		profile);
}

/**
 * @brief Backward-compatible C++ overload of nnp_convolution_kernel_gradient.
 * @details Keeps the historical signature and forwards to the extended C API,
 *          requesting no caller-managed workspace and the identity activation.
 */
inline enum nnp_status nnp_convolution_kernel_gradient(
	enum nnp_convolution_algorithm algorithm,
	size_t batch_size,
	size_t input_channels,
	size_t output_channels,
	struct nnp_size input_size,
	struct nnp_padding input_padding,
	struct nnp_size kernel_size,
	const float input[],
	const float grad_output[],
	float grad_kernel[],
	pthreadpool_t threadpool,
	struct nnp_profile* profile)
{
	return nnp_convolution_kernel_gradient(
		algorithm,
		batch_size,
		input_channels,
		output_channels,
		input_size,
		input_padding,
		kernel_size,
		input,
		grad_output,
		grad_kernel,
		/*workspace_buffer=*/NULL,
		/*workspace_size=*/NULL,
		nnp_activation_identity,
		/*activation_parameters=*/NULL,
		threadpool,
		profile);
}

/**
 * @brief Backward-compatible C++ overload of nnp_convolution_inference.
 * @details Keeps the historical signature and forwards to the extended C API,
 *          requesting no caller-managed workspace and the identity activation.
 */
inline enum nnp_status nnp_convolution_inference(
	enum nnp_convolution_algorithm algorithm,
	enum nnp_convolution_transform_strategy transform_strategy,
	size_t input_channels,
	size_t output_channels,
	struct nnp_size input_size,
	struct nnp_padding input_padding,
	struct nnp_size kernel_size,
	struct nnp_size output_subsampling,
	const float input[],
	const float kernel[],
	const float bias[],
	float output[],
	pthreadpool_t threadpool,
	struct nnp_profile* profile)
{
	return nnp_convolution_inference(
		algorithm,
		transform_strategy,
		input_channels,
		output_channels,
		input_size,
		input_padding,
		kernel_size,
		output_subsampling,
		input,
		kernel,
		bias,
		output,
		/*workspace_buffer=*/NULL,
		/*workspace_size=*/NULL,
		nnp_activation_identity,
		/*activation_parameters=*/NULL,
		threadpool,
		profile);
}

#endif // __cplusplus