|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#pragma once |
|
|
|
#include <atomic> |
|
#include <cstdint> |
|
#include <optional> |
|
#include <thread> |
|
#include <unordered_map> |
|
#include <vector> |
|
|
|
#include "arrow/compute/expression.h" |
|
#include "arrow/compute/type_fwd.h" |
|
#include "arrow/result.h" |
|
#include "arrow/util/cpu_info.h" |
|
#include "arrow/util/simd.h" |
|
|
|
#if defined(__clang__) || defined(__GNUC__)
// GCC/Clang: byte swap via compiler builtin; rotates use the standard
// "(x << n) | (x >> ((-n) & (width - 1)))" idiom, which compilers recognize
// and lower to a single rotate instruction while staying well-defined for
// n == 0 (a plain `x >> (width - n)` would be UB there).
// NOTE: `n` must be parenthesized under the unary minus too; otherwise an
// argument like `a + b` expands to `(-a + b) & mask` and computes the wrong
// shift amount.
# define BYTESWAP(x) __builtin_bswap64(x)
# define ROTL(x, n) (((x) << (n)) | ((x) >> ((-(n)) & 31)))
# define ROTL64(x, n) (((x) << (n)) | ((x) >> ((-(n)) & 63)))
#elif defined(_MSC_VER)
// MSVC: use the <intrin.h> intrinsics directly.
# include <intrin.h>
# define BYTESWAP(x) _byteswap_uint64(x)
# define ROTL(x, n) _rotl((x), (n))
# define ROTL64(x, n) _rotl64((x), (n))
#endif
|
|
|
namespace arrow { |
|
namespace util { |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Pointer-element type for 64-bit SIMD gather intrinsics: some platforms
// typedef int64_t as `long int` rather than `long long int`, while gather
// intrinsics such as _mm256_i32gather_epi64 take `const long long int*` --
// presumably why this exact spelling is used (confirm at the SIMD call sites).
using int64_for_gather_t = const long long int;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Work is processed in mini-batches of at most kMiniBatchLength rows at a
// time, so per-batch temporary buffers stay small (likely to keep them
// cache-resident -- the exact motivation is not visible in this header).
class MiniBatch {
 public:
  // log2 of the mini-batch size; the batch length itself is 1 << 10 == 1024.
  static constexpr int kLogMiniBatchLength = 10;
  static constexpr int kMiniBatchLength = 1 << kLogMiniBatchLength;
};
|
|
|
namespace bit_util { |
|
|
|
// Write to `indexes` the positions of every bit in
// `bits[bit_offset .. bit_offset + num_bits)` whose value equals
// `bit_to_search` (0 or 1); the number of positions written is returned via
// `num_indexes`. `hardware_flags` selects a SIMD implementation when one is
// available (see the avx2 namespace below). Output indexes are uint16_t, so
// callers presumably keep num_bits within 16-bit range -- confirm at call
// sites.
ARROW_EXPORT void bits_to_indexes(int bit_to_search, int64_t hardware_flags,
                                  const int num_bits, const uint8_t* bits,
                                  int* num_indexes, uint16_t* indexes,
                                  int bit_offset = 0);

// Like bits_to_indexes, but instead of scanning all bits, keep only those
// entries of `input_indexes` whose referenced bit equals `bit_to_search`,
// writing the survivors to `indexes` and their count to `num_indexes`.
ARROW_EXPORT void bits_filter_indexes(int bit_to_search, int64_t hardware_flags,
                                      const int num_bits, const uint8_t* bits,
                                      const uint16_t* input_indexes, int* num_indexes,
                                      uint16_t* indexes, int bit_offset = 0);

// Partition bit positions into two output arrays: positions of cleared bits
// go to `indexes_bit0`, positions of set bits to `indexes_bit1`. Only the
// zero-bit count is returned (via `num_indexes_bit0`); the set-bit count is
// presumably num_bits minus that value -- confirm in the implementation.
ARROW_EXPORT void bits_split_indexes(int64_t hardware_flags, const int num_bits,
                                     const uint8_t* bits, int* num_indexes_bit0,
                                     uint16_t* indexes_bit0, uint16_t* indexes_bit1,
                                     int bit_offset = 0);

// Expand a bitmap into one output byte per bit. The exact byte values used
// for set/cleared bits are defined by the implementation (not visible here).
ARROW_EXPORT void bits_to_bytes(int64_t hardware_flags, const int num_bits,
                                const uint8_t* bits, uint8_t* bytes, int bit_offset = 0);

// Inverse of bits_to_bytes: pack one input byte per bit back into a bitmap.
ARROW_EXPORT void bytes_to_bits(int64_t hardware_flags, const int num_bits,
                                const uint8_t* bytes, uint8_t* bits, int bit_offset = 0);

// Return true iff all `num_bytes` bytes starting at `bytes` are zero.
ARROW_EXPORT bool are_all_bytes_zero(int64_t hardware_flags, const uint8_t* bytes,
                                     uint32_t num_bytes);
|
|
|
#if defined(ARROW_HAVE_RUNTIME_AVX2) && defined(ARROW_HAVE_RUNTIME_BMI2)

// AVX2 (+BMI2) specializations of the routines above. They mirror the
// dispatching functions but take no `hardware_flags`; the dispatchers are
// expected to call them only when the CPU supports AVX2 and BMI2.
namespace avx2 {
ARROW_EXPORT void bits_filter_indexes_avx2(int bit_to_search, const int num_bits,
                                           const uint8_t* bits,
                                           const uint16_t* input_indexes,
                                           int* num_indexes, uint16_t* indexes);
// `base_index` is presumably added to each emitted position (useful when a
// bitmap is processed in chunks) -- confirm in the implementation.
ARROW_EXPORT void bits_to_indexes_avx2(int bit_to_search, const int num_bits,
                                       const uint8_t* bits, int* num_indexes,
                                       uint16_t* indexes, uint16_t base_index = 0);
ARROW_EXPORT void bits_to_bytes_avx2(const int num_bits, const uint8_t* bits,
                                     uint8_t* bytes);
ARROW_EXPORT void bytes_to_bits_avx2(const int num_bits, const uint8_t* bytes,
                                     uint8_t* bits);
ARROW_EXPORT bool are_all_bytes_zero_avx2(const uint8_t* bytes, uint32_t num_bytes);
}  // namespace avx2

#endif
|
|
|
} |
|
} |
|
|
|
namespace compute { |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
template <typename PreVisit, typename PostVisitCall> |
|
Result<Expression> ModifyExpression(Expression expr, const PreVisit& pre, |
|
const PostVisitCall& post_call) { |
|
ARROW_ASSIGN_OR_RAISE(expr, Result<Expression>(pre(std::move(expr)))); |
|
|
|
auto call = expr.call(); |
|
if (!call) return expr; |
|
|
|
bool at_least_one_modified = false; |
|
std::vector<Expression> modified_arguments; |
|
|
|
for (size_t i = 0; i < call->arguments.size(); ++i) { |
|
ARROW_ASSIGN_OR_RAISE(auto modified_argument, |
|
ModifyExpression(call->arguments[i], pre, post_call)); |
|
|
|
if (Identical(modified_argument, call->arguments[i])) { |
|
continue; |
|
} |
|
|
|
if (!at_least_one_modified) { |
|
modified_arguments = call->arguments; |
|
at_least_one_modified = true; |
|
} |
|
|
|
modified_arguments[i] = std::move(modified_argument); |
|
} |
|
|
|
if (at_least_one_modified) { |
|
|
|
auto modified_call = *call; |
|
modified_call.arguments = std::move(modified_arguments); |
|
return post_call(Expression(std::move(modified_call)), &expr); |
|
} |
|
|
|
return post_call(std::move(expr), NULLPTR); |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class TailSkipForSIMD { |
|
public: |
|
static int64_t FixBitAccess(int num_bytes_accessed_together, int64_t num_rows, |
|
int bit_offset) { |
|
int64_t num_bytes = bit_util::BytesForBits(num_rows + bit_offset); |
|
int64_t num_bytes_safe = |
|
std::max(static_cast<int64_t>(0LL), num_bytes - num_bytes_accessed_together + 1); |
|
int64_t num_rows_safe = |
|
std::max(static_cast<int64_t>(0LL), 8 * num_bytes_safe - bit_offset); |
|
return std::min(num_rows_safe, num_rows); |
|
} |
|
static int64_t FixBinaryAccess(int num_bytes_accessed_together, int64_t num_rows, |
|
int64_t length) { |
|
int64_t num_rows_to_skip = bit_util::CeilDiv(length, num_bytes_accessed_together); |
|
int64_t num_rows_safe = |
|
std::max(static_cast<int64_t>(0LL), num_rows - num_rows_to_skip); |
|
return num_rows_safe; |
|
} |
|
static int64_t FixVarBinaryAccess(int num_bytes_accessed_together, int64_t num_rows, |
|
const uint32_t* offsets) { |
|
|
|
|
|
|
|
int64_t num_rows_safe = num_rows; |
|
while (num_rows_safe > 0 && |
|
offsets[num_rows_safe] + num_bytes_accessed_together > offsets[num_rows]) { |
|
--num_rows_safe; |
|
} |
|
return num_rows_safe; |
|
} |
|
static int FixSelection(int64_t num_rows_safe, int num_selected, |
|
const uint16_t* selection) { |
|
int num_selected_safe = num_selected; |
|
while (num_selected_safe > 0 && selection[num_selected_safe - 1] >= num_rows_safe) { |
|
--num_selected_safe; |
|
} |
|
return num_selected_safe; |
|
} |
|
}; |
|
|
|
} |
|
} |
|
|