|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import struct as _struct |
|
|
|
|
|
cdef class Tensor(_Weakrefable):
    """
    A n-dimensional array a.k.a Tensor.

    Instances are not created through the constructor; use one of the
    ``pyarrow.Tensor.from_*`` functions instead.

    Examples
    --------
    >>> import pyarrow as pa
    >>> import numpy as np
    >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32)
    >>> pa.Tensor.from_numpy(x, dim_names=["dim1","dim2"])
    <pyarrow.Tensor>
    type: int32
    shape: (2, 3)
    strides: (12, 4)
    """

    def __init__(self):
        raise TypeError("Do not call Tensor's constructor directly, use one "
                        "of the `pyarrow.Tensor.from_*` functions instead.")

    cdef void init(self, const shared_ptr[CTensor]& sp_tensor):
        # Keep the shared pointer (ownership) and a raw pointer (fast access).
        self.sp_tensor = sp_tensor
        self.tp = sp_tensor.get()
        self.type = pyarrow_wrap_data_type(self.tp.type())
        # Pre-pack shape and strides as Py_ssize_t arrays; __getbuffer__
        # hands out pointers into these bytes objects.
        self._ssize_t_shape = self._make_shape_or_strides_buffer(self.shape)
        self._ssize_t_strides = self._make_shape_or_strides_buffer(
            self.strides)

    def _make_shape_or_strides_buffer(self, values):
        """
        Make a bytes object holding an array of `values` cast to `Py_ssize_t`.

        Parameters
        ----------
        values : sequence of int
            The integers to pack.

        Returns
        -------
        bytes
        """
        # "n" is the struct format code for a native ssize_t.
        packing_format = f"{len(values)}n"
        return _struct.pack(packing_format, *values)

    def __repr__(self):
        return (f"<pyarrow.Tensor>\n"
                f"type: {self.type}\n"
                f"shape: {self.shape}\n"
                f"strides: {self.strides}")

    @staticmethod
    def from_numpy(obj, dim_names=None):
        """
        Create a Tensor from a numpy array.

        Parameters
        ----------
        obj : numpy.ndarray
            The source numpy array
        dim_names : list, optional
            Names of each dimension of the Tensor.

        Returns
        -------
        pyarrow.Tensor

        Examples
        --------
        >>> import pyarrow as pa
        >>> import numpy as np
        >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32)
        >>> pa.Tensor.from_numpy(x, dim_names=["dim1","dim2"])
        <pyarrow.Tensor>
        type: int32
        shape: (2, 3)
        strides: (12, 4)
        """
        cdef:
            vector[c_string] c_dim_names
            shared_ptr[CTensor] ctensor

        if dim_names is not None:
            for name in dim_names:
                c_dim_names.push_back(tobytes(name))

        check_status(NdarrayToTensor(c_default_memory_pool(), obj,
                                     c_dim_names, &ctensor))
        return pyarrow_wrap_tensor(ctensor)

    def to_numpy(self):
        """
        Convert arrow::Tensor to numpy.ndarray with zero copy

        Returns
        -------
        numpy.ndarray

        Raises
        ------
        ImportError
            If NumPy is not available.

        Examples
        --------
        >>> import pyarrow as pa
        >>> import numpy as np
        >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32)
        >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1","dim2"])
        >>> tensor.to_numpy()
        array([[  2,   2,   4],
               [  4,   5, 100]], dtype=int32)
        """
        cdef PyObject* out

        if np is None:
            raise ImportError(
                "Cannot return a numpy.ndarray if NumPy is not present")

        # `self` is passed as the base object of the resulting array.
        check_status(TensorToNdarray(self.sp_tensor, self, &out))
        return PyObject_to_object(out)

    def equals(self, Tensor other not None):
        """
        Return true if the tensors contains exactly equal data.

        Parameters
        ----------
        other : Tensor
            The other tensor to compare for equality.

        Returns
        -------
        bool

        Raises
        ------
        TypeError
            If `other` is None or not a Tensor.

        Examples
        --------
        >>> import pyarrow as pa
        >>> import numpy as np
        >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32)
        >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1","dim2"])
        >>> y = np.array([[2, 2, 4], [4, 5, 10]], np.int32)
        >>> tensor2 = pa.Tensor.from_numpy(y, dim_names=["a","b"])
        >>> tensor.equals(tensor)
        True
        >>> tensor.equals(tensor2)
        False
        """
        # `not None` matters: other.tp is a C-level attribute, and reading it
        # from None would be an invalid memory access rather than an error.
        return self.tp.Equals(deref(other.tp))

    def __eq__(self, other):
        if not isinstance(other, Tensor):
            return NotImplemented
        return self.equals(other)

    def dim_name(self, i):
        """
        Returns the name of the i-th tensor dimension.

        Parameters
        ----------
        i : int
            The physical index of the tensor dimension.

        Returns
        -------
        str

        Examples
        --------
        >>> import pyarrow as pa
        >>> import numpy as np
        >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32)
        >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1","dim2"])
        >>> tensor.dim_name(0)
        'dim1'
        >>> tensor.dim_name(1)
        'dim2'
        """
        return frombytes(self.tp.dim_name(i))

    @property
    def dim_names(self):
        """
        Names of this tensor dimensions.

        Examples
        --------
        >>> import pyarrow as pa
        >>> import numpy as np
        >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32)
        >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1","dim2"])
        >>> tensor.dim_names
        ['dim1', 'dim2']
        """
        raw_names = tuple(self.tp.dim_names())
        return [frombytes(raw) for raw in raw_names]

    @property
    def is_mutable(self):
        """
        Is this tensor mutable or immutable.

        Examples
        --------
        >>> import pyarrow as pa
        >>> import numpy as np
        >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32)
        >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1","dim2"])
        >>> tensor.is_mutable
        True
        """
        return self.tp.is_mutable()

    @property
    def is_contiguous(self):
        """
        Is this tensor contiguous in memory.

        Examples
        --------
        >>> import pyarrow as pa
        >>> import numpy as np
        >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32)
        >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1","dim2"])
        >>> tensor.is_contiguous
        True
        """
        return self.tp.is_contiguous()

    @property
    def ndim(self):
        """
        The dimension (n) of this tensor.

        Examples
        --------
        >>> import pyarrow as pa
        >>> import numpy as np
        >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32)
        >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1","dim2"])
        >>> tensor.ndim
        2
        """
        return self.tp.ndim()

    @property
    def size(self):
        """
        The size of this tensor.

        Examples
        --------
        >>> import pyarrow as pa
        >>> import numpy as np
        >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32)
        >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1","dim2"])
        >>> tensor.size
        6
        """
        return self.tp.size()

    @property
    def shape(self):
        """
        The shape of this tensor.

        Examples
        --------
        >>> import pyarrow as pa
        >>> import numpy as np
        >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32)
        >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1","dim2"])
        >>> tensor.shape
        (2, 3)
        """
        return tuple(self.tp.shape())

    @property
    def strides(self):
        """
        Strides of this tensor.

        Examples
        --------
        >>> import pyarrow as pa
        >>> import numpy as np
        >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32)
        >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1","dim2"])
        >>> tensor.strides
        (12, 4)
        """
        return tuple(self.tp.strides())

    def __getbuffer__(self, cp.Py_buffer* buffer, int flags):
        # Reject unsupported types before touching the Py_buffer so that a
        # failed request never leaves a half-populated view behind.
        pep3118_format = self.type.pep3118_format
        if pep3118_format is None:
            raise NotImplementedError("type %s not supported for buffer "
                                      "protocol" % (self.type,))

        buffer.buf = <char *> self.tp.data().get().data()
        buffer.format = pep3118_format
        buffer.itemsize = self.type.bit_width // 8
        buffer.internal = NULL
        buffer.len = self.tp.size() * buffer.itemsize
        buffer.ndim = self.tp.ndim()
        buffer.obj = self
        buffer.readonly = 0 if self.tp.is_mutable() else 1
        # These point into the bytes objects packed in init(), which are
        # attributes of `self` (and `self` is the buffer's owner).
        buffer.shape = <Py_ssize_t *> cp.PyBytes_AsString(self._ssize_t_shape)
        buffer.strides = <Py_ssize_t *> cp.PyBytes_AsString(
            self._ssize_t_strides)
        buffer.suboffsets = NULL
|
|
|
|
|
# Pointer alias used by SparseCOOTensor.has_canonical_format to cast the
# tensor's generic sparse index to the COO-specific index type.
ctypedef CSparseCOOIndex* _CSparseCOOIndexPtr |
|
|
|
|
|
cdef class SparseCOOTensor(_Weakrefable):
    """
    A sparse COO tensor.

    Instances are not created through the constructor; use one of the
    ``pyarrow.SparseCOOTensor.from_*`` functions instead.
    """

    def __init__(self):
        raise TypeError("Do not call SparseCOOTensor's constructor directly, "
                        "use one of the `pyarrow.SparseCOOTensor.from_*` "
                        "functions instead.")

    cdef void init(self, const shared_ptr[CSparseCOOTensor]& sp_sparse_tensor):
        # Keep the shared pointer (ownership) and a raw pointer (fast access).
        self.sp_sparse_tensor = sp_sparse_tensor
        self.stp = sp_sparse_tensor.get()
        self.type = pyarrow_wrap_data_type(self.stp.type())

    def __repr__(self):
        return (f"<pyarrow.SparseCOOTensor>\n"
                f"type: {self.type}\n"
                f"shape: {self.shape}")

    @classmethod
    def from_dense_numpy(cls, obj, dim_names=None):
        """
        Convert numpy.ndarray to arrow::SparseCOOTensor

        Parameters
        ----------
        obj : numpy.ndarray
            Data used to populate the rows.
        dim_names : list[str], optional
            Names of the dimensions.

        Returns
        -------
        pyarrow.SparseCOOTensor
        """
        dense = Tensor.from_numpy(obj, dim_names=dim_names)
        return cls.from_tensor(dense)

    @staticmethod
    def from_numpy(data, coords, shape, dim_names=None):
        """
        Create arrow::SparseCOOTensor from numpy.ndarrays

        Parameters
        ----------
        data : numpy.ndarray
            Data used to populate the rows.
        coords : numpy.ndarray
            Coordinates of the data.
        shape : tuple
            Shape of the tensor.
        dim_names : list, optional
            Names of the dimensions.

        Returns
        -------
        pyarrow.SparseCOOTensor

        Raises
        ------
        ValueError
            If `coords` is not 2-dimensional.
        """
        cdef:
            shared_ptr[CSparseCOOTensor] csparse_tensor
            vector[int64_t] c_shape
            vector[c_string] c_dim_names

        for extent in shape:
            c_shape.push_back(extent)
        if dim_names is not None:
            for name in dim_names:
                c_dim_names.push_back(tobytes(name))

        # Enforce precondition for SparseCOOTensor indices
        coords = np.require(coords, dtype='i8', requirements='C')
        if coords.ndim != 2:
            raise ValueError("Expected 2-dimensional array for "
                             "SparseCOOTensor indices")

        check_status(NdarraysToSparseCOOTensor(c_default_memory_pool(),
                                               data, coords, c_shape,
                                               c_dim_names, &csparse_tensor))
        return pyarrow_wrap_sparse_coo_tensor(csparse_tensor)

    @staticmethod
    def from_scipy(obj, dim_names=None):
        """
        Convert scipy.sparse.coo_matrix to arrow::SparseCOOTensor

        Parameters
        ----------
        obj : scipy.sparse.coo_matrix
            The scipy matrix that should be converted.
        dim_names : list, optional
            Names of the dimensions.

        Returns
        -------
        pyarrow.SparseCOOTensor

        Raises
        ------
        TypeError
            If `obj` is not a scipy.sparse.coo_matrix.
        """
        cdef:
            shared_ptr[CSparseCOOTensor] csparse_tensor
            vector[int64_t] c_shape
            vector[c_string] c_dim_names

        import scipy.sparse
        if not isinstance(obj, scipy.sparse.coo_matrix):
            raise TypeError(
                "Expected scipy.sparse.coo_matrix, got {}".format(type(obj)))

        for extent in obj.shape:
            c_shape.push_back(extent)
        if dim_names is not None:
            for name in dim_names:
                c_dim_names.push_back(tobytes(name))

        row = obj.row
        col = obj.col
        data = obj.data

        # A canonical matrix is re-sorted into row-major order (row is the
        # primary key, col the secondary one).  The values must follow the
        # same permutation as the coordinates, otherwise each value would
        # end up attached to another entry's (row, col) pair whenever the
        # sort is not the identity.
        if obj.has_canonical_format:
            order = np.lexsort((col, row))
            row = row[order]
            col = col[order]
            data = data[order]
        coords = np.vstack([row, col]).T
        coords = np.require(coords, dtype='i8', requirements='C')

        check_status(NdarraysToSparseCOOTensor(c_default_memory_pool(),
                                               data, coords, c_shape,
                                               c_dim_names, &csparse_tensor))
        return pyarrow_wrap_sparse_coo_tensor(csparse_tensor)

    @staticmethod
    def from_pydata_sparse(obj, dim_names=None):
        """
        Convert pydata/sparse.COO to arrow::SparseCOOTensor.

        Parameters
        ----------
        obj : pydata.sparse.COO
            The sparse multidimensional array that should be converted.
        dim_names : list, optional
            Names of the dimensions.

        Returns
        -------
        pyarrow.SparseCOOTensor

        Raises
        ------
        TypeError
            If `obj` is not a sparse.COO.
        """
        cdef:
            shared_ptr[CSparseCOOTensor] csparse_tensor
            vector[int64_t] c_shape
            vector[c_string] c_dim_names

        import sparse
        if not isinstance(obj, sparse.COO):
            raise TypeError(
                "Expected sparse.COO, got {}".format(type(obj)))

        for extent in obj.shape:
            c_shape.push_back(extent)
        if dim_names is not None:
            for name in dim_names:
                c_dim_names.push_back(tobytes(name))

        # obj.coords is transposed so that each row is one coordinate tuple.
        coords = np.require(obj.coords.T, dtype='i8', requirements='C')

        check_status(NdarraysToSparseCOOTensor(c_default_memory_pool(),
                                               obj.data, coords, c_shape,
                                               c_dim_names, &csparse_tensor))
        return pyarrow_wrap_sparse_coo_tensor(csparse_tensor)

    @staticmethod
    def from_tensor(obj):
        """
        Convert arrow::Tensor to arrow::SparseCOOTensor.

        Parameters
        ----------
        obj : Tensor
            The tensor that should be converted.

        Returns
        -------
        pyarrow.SparseCOOTensor
        """
        cdef:
            shared_ptr[CSparseCOOTensor] csparse_tensor
            shared_ptr[CTensor] ctensor = pyarrow_unwrap_tensor(obj)

        with nogil:
            check_status(TensorToSparseCOOTensor(ctensor, &csparse_tensor))

        return pyarrow_wrap_sparse_coo_tensor(csparse_tensor)

    def to_numpy(self):
        """
        Convert arrow::SparseCOOTensor to numpy.ndarrays with zero copy.

        Returns
        -------
        tuple
            A ``(data, coords)`` pair.

        Raises
        ------
        ImportError
            If NumPy is not available.
        """
        cdef:
            PyObject* out_data
            PyObject* out_coords

        if np is None:
            raise ImportError(
                "Cannot return a numpy.ndarray if NumPy is not present")

        check_status(SparseCOOTensorToNdarray(self.sp_sparse_tensor, self,
                                              &out_data, &out_coords))
        return PyObject_to_object(out_data), PyObject_to_object(out_coords)

    def to_scipy(self):
        """
        Convert arrow::SparseCOOTensor to scipy.sparse.coo_matrix.

        Returns
        -------
        scipy.sparse.coo_matrix
        """
        cdef:
            PyObject* out_data
            PyObject* out_coords

        from scipy.sparse import coo_matrix

        check_status(SparseCOOTensorToNdarray(self.sp_sparse_tensor, self,
                                              &out_data, &out_coords))
        data = PyObject_to_object(out_data)
        coords = PyObject_to_object(out_coords)
        row, col = coords[:, 0], coords[:, 1]
        result = coo_matrix((data[:, 0], (row, col)), shape=self.shape)

        # When this tensor's index is canonical, call sum_duplicates() so
        # that the resulting coo_matrix is in canonical format as well.
        if self.has_canonical_format:
            result.sum_duplicates()
        return result

    def to_pydata_sparse(self):
        """
        Convert arrow::SparseCOOTensor to pydata/sparse.COO.

        Returns
        -------
        sparse.COO
        """
        cdef:
            PyObject* out_data
            PyObject* out_coords

        from sparse import COO

        check_status(SparseCOOTensorToNdarray(self.sp_sparse_tensor, self,
                                              &out_data, &out_coords))
        data = PyObject_to_object(out_data)
        coords = PyObject_to_object(out_coords)
        return COO(data=data[:, 0], coords=coords.T, shape=self.shape)

    def to_tensor(self):
        """
        Convert arrow::SparseCOOTensor to arrow::Tensor.

        Returns
        -------
        pyarrow.Tensor
        """
        cdef shared_ptr[CTensor] ctensor

        with nogil:
            ctensor = GetResultValue(self.stp.ToTensor())

        return pyarrow_wrap_tensor(ctensor)

    def equals(self, SparseCOOTensor other not None):
        """
        Return true if sparse tensors contains exactly equal data.

        Parameters
        ----------
        other : SparseCOOTensor
            The other tensor to compare for equality.

        Returns
        -------
        bool

        Raises
        ------
        TypeError
            If `other` is None or not a SparseCOOTensor.
        """
        # `not None` matters: other.stp is a C-level attribute, and reading
        # it from None would be an invalid memory access.
        return self.stp.Equals(deref(other.stp))

    def __eq__(self, other):
        if not isinstance(other, SparseCOOTensor):
            return NotImplemented
        return self.equals(other)

    @property
    def is_mutable(self):
        """Whether the underlying C++ sparse tensor reports itself mutable."""
        return self.stp.is_mutable()

    @property
    def ndim(self):
        """Number of dimensions reported by the C++ sparse tensor."""
        return self.stp.ndim()

    @property
    def shape(self):
        """Shape of this sparse tensor, as a tuple."""
        return tuple(self.stp.shape())

    @property
    def size(self):
        """Size reported by the C++ sparse tensor."""
        return self.stp.size()

    def dim_name(self, i):
        """
        Returns the name of the i-th tensor dimension.

        Parameters
        ----------
        i : int
            The physical index of the tensor dimension.

        Returns
        -------
        str
        """
        return frombytes(self.stp.dim_name(i))

    @property
    def dim_names(self):
        """Names of this tensor's dimensions, as a tuple of str."""
        raw_names = tuple(self.stp.dim_names())
        return tuple(frombytes(raw) for raw in raw_names)

    @property
    def non_zero_length(self):
        """Non-zero length reported by the C++ sparse tensor."""
        return self.stp.non_zero_length()

    @property
    def has_canonical_format(self):
        """
        Whether the COO index reports itself canonical.

        Returns True when the tensor has no sparse index object.
        """
        cdef:
            _CSparseCOOIndexPtr csi

        csi = <_CSparseCOOIndexPtr>(self.stp.sparse_index().get())
        if csi != nullptr:
            return csi.is_canonical()
        return True
|
|
|
cdef class SparseCSRMatrix(_Weakrefable):
    """
    A sparse CSR matrix.

    Instances are not created through the constructor; use one of the
    ``pyarrow.SparseCSRMatrix.from_*`` functions instead.
    """

    def __init__(self):
        raise TypeError("Do not call SparseCSRMatrix's constructor directly, "
                        "use one of the `pyarrow.SparseCSRMatrix.from_*` "
                        "functions instead.")

    cdef void init(self, const shared_ptr[CSparseCSRMatrix]& sp_sparse_tensor):
        # Keep the shared pointer (ownership) and a raw pointer (fast access).
        self.sp_sparse_tensor = sp_sparse_tensor
        self.stp = sp_sparse_tensor.get()
        self.type = pyarrow_wrap_data_type(self.stp.type())

    def __repr__(self):
        return (f"<pyarrow.SparseCSRMatrix>\n"
                f"type: {self.type}\n"
                f"shape: {self.shape}")

    @classmethod
    def from_dense_numpy(cls, obj, dim_names=None):
        """
        Convert numpy.ndarray to arrow::SparseCSRMatrix

        Parameters
        ----------
        obj : numpy.ndarray
            The dense numpy array that should be converted.
        dim_names : list, optional
            The names of the dimensions.

        Returns
        -------
        pyarrow.SparseCSRMatrix
        """
        dense = Tensor.from_numpy(obj, dim_names=dim_names)
        return cls.from_tensor(dense)

    @staticmethod
    def from_numpy(data, indptr, indices, shape, dim_names=None):
        """
        Create arrow::SparseCSRMatrix from numpy.ndarrays.

        Parameters
        ----------
        data : numpy.ndarray
            Data used to populate the sparse matrix.
        indptr : numpy.ndarray
            Range of the rows,
            The i-th row spans from `indptr[i]` to `indptr[i+1]` in the data.
        indices : numpy.ndarray
            Column indices of the corresponding non-zero values.
        shape : tuple
            Shape of the matrix.
        dim_names : list, optional
            Names of the dimensions.

        Returns
        -------
        pyarrow.SparseCSRMatrix

        Raises
        ------
        ValueError
            If `indptr` or `indices` is not 1-dimensional.
        """
        cdef:
            shared_ptr[CSparseCSRMatrix] csparse_tensor
            vector[int64_t] c_shape
            vector[c_string] c_dim_names

        for extent in shape:
            c_shape.push_back(extent)
        if dim_names is not None:
            for name in dim_names:
                c_dim_names.push_back(tobytes(name))

        # Enforce precondition for SparseCSRMatrix indices
        indptr = np.require(indptr, dtype='i8')
        indices = np.require(indices, dtype='i8')
        if indptr.ndim != 1:
            raise ValueError("Expected 1-dimensional array for "
                             "SparseCSRMatrix indptr")
        if indices.ndim != 1:
            raise ValueError("Expected 1-dimensional array for "
                             "SparseCSRMatrix indices")

        check_status(NdarraysToSparseCSRMatrix(c_default_memory_pool(),
                                               data, indptr, indices, c_shape,
                                               c_dim_names, &csparse_tensor))
        return pyarrow_wrap_sparse_csr_matrix(csparse_tensor)

    @staticmethod
    def from_scipy(obj, dim_names=None):
        """
        Convert scipy.sparse.csr_matrix to arrow::SparseCSRMatrix.

        Parameters
        ----------
        obj : scipy.sparse.csr_matrix
            The scipy matrix that should be converted.
        dim_names : list, optional
            Names of the dimensions.

        Returns
        -------
        pyarrow.SparseCSRMatrix

        Raises
        ------
        TypeError
            If `obj` is not a scipy.sparse.csr_matrix.
        """
        cdef:
            shared_ptr[CSparseCSRMatrix] csparse_tensor
            vector[int64_t] c_shape
            vector[c_string] c_dim_names

        import scipy.sparse
        if not isinstance(obj, scipy.sparse.csr_matrix):
            raise TypeError(
                "Expected scipy.sparse.csr_matrix, got {}".format(type(obj)))

        for extent in obj.shape:
            c_shape.push_back(extent)
        if dim_names is not None:
            for name in dim_names:
                c_dim_names.push_back(tobytes(name))

        # Enforce precondition for CSparseCSRMatrix indices
        indptr = np.require(obj.indptr, dtype='i8')
        indices = np.require(obj.indices, dtype='i8')

        check_status(NdarraysToSparseCSRMatrix(c_default_memory_pool(),
                                               obj.data, indptr, indices,
                                               c_shape, c_dim_names,
                                               &csparse_tensor))
        return pyarrow_wrap_sparse_csr_matrix(csparse_tensor)

    @staticmethod
    def from_tensor(obj):
        """
        Convert arrow::Tensor to arrow::SparseCSRMatrix.

        Parameters
        ----------
        obj : Tensor
            The dense tensor that should be converted.

        Returns
        -------
        pyarrow.SparseCSRMatrix
        """
        cdef:
            shared_ptr[CSparseCSRMatrix] csparse_tensor
            shared_ptr[CTensor] ctensor = pyarrow_unwrap_tensor(obj)

        with nogil:
            check_status(TensorToSparseCSRMatrix(ctensor, &csparse_tensor))

        return pyarrow_wrap_sparse_csr_matrix(csparse_tensor)

    def to_numpy(self):
        """
        Convert arrow::SparseCSRMatrix to numpy.ndarrays with zero copy.

        Returns
        -------
        tuple
            A ``(data, indptr, indices)`` triple.

        Raises
        ------
        ImportError
            If NumPy is not available.
        """
        cdef:
            PyObject* out_data
            PyObject* out_indptr
            PyObject* out_indices

        if np is None:
            raise ImportError(
                "Cannot return a numpy.ndarray if NumPy is not present")

        check_status(SparseCSRMatrixToNdarray(self.sp_sparse_tensor, self,
                                              &out_data, &out_indptr,
                                              &out_indices))
        return (PyObject_to_object(out_data), PyObject_to_object(out_indptr),
                PyObject_to_object(out_indices))

    def to_scipy(self):
        """
        Convert arrow::SparseCSRMatrix to scipy.sparse.csr_matrix.

        Returns
        -------
        scipy.sparse.csr_matrix
        """
        cdef:
            PyObject* out_data
            PyObject* out_indptr
            PyObject* out_indices

        from scipy.sparse import csr_matrix

        check_status(SparseCSRMatrixToNdarray(self.sp_sparse_tensor, self,
                                              &out_data, &out_indptr,
                                              &out_indices))

        data = PyObject_to_object(out_data)
        indptr = PyObject_to_object(out_indptr)
        indices = PyObject_to_object(out_indices)
        return csr_matrix((data[:, 0], indices, indptr), shape=self.shape)

    def to_tensor(self):
        """
        Convert arrow::SparseCSRMatrix to arrow::Tensor.

        Returns
        -------
        pyarrow.Tensor
        """
        cdef shared_ptr[CTensor] ctensor

        with nogil:
            ctensor = GetResultValue(self.stp.ToTensor())

        return pyarrow_wrap_tensor(ctensor)

    def equals(self, SparseCSRMatrix other not None):
        """
        Return true if sparse tensors contains exactly equal data.

        Parameters
        ----------
        other : SparseCSRMatrix
            The other tensor to compare for equality.

        Returns
        -------
        bool

        Raises
        ------
        TypeError
            If `other` is None or not a SparseCSRMatrix.
        """
        # `not None` matters: other.stp is a C-level attribute, and reading
        # it from None would be an invalid memory access.
        return self.stp.Equals(deref(other.stp))

    def __eq__(self, other):
        if not isinstance(other, SparseCSRMatrix):
            return NotImplemented
        return self.equals(other)

    @property
    def is_mutable(self):
        """Whether the underlying C++ sparse matrix reports itself mutable."""
        return self.stp.is_mutable()

    @property
    def ndim(self):
        """Number of dimensions reported by the C++ sparse matrix."""
        return self.stp.ndim()

    @property
    def shape(self):
        """Shape of this sparse matrix, as a tuple."""
        return tuple(self.stp.shape())

    @property
    def size(self):
        """Size reported by the C++ sparse matrix."""
        return self.stp.size()

    def dim_name(self, i):
        """
        Returns the name of the i-th tensor dimension.

        Parameters
        ----------
        i : int
            The physical index of the tensor dimension.

        Returns
        -------
        str
        """
        return frombytes(self.stp.dim_name(i))

    @property
    def dim_names(self):
        """Names of this matrix's dimensions, as a tuple of str."""
        raw_names = tuple(self.stp.dim_names())
        return tuple(frombytes(raw) for raw in raw_names)

    @property
    def non_zero_length(self):
        """Non-zero length reported by the C++ sparse matrix."""
        return self.stp.non_zero_length()
|
|
|
cdef class SparseCSCMatrix(_Weakrefable):
    """
    A sparse CSC matrix.

    Instances are not created through the constructor; use one of the
    ``pyarrow.SparseCSCMatrix.from_*`` functions instead.
    """

    def __init__(self):
        raise TypeError("Do not call SparseCSCMatrix's constructor directly, "
                        "use one of the `pyarrow.SparseCSCMatrix.from_*` "
                        "functions instead.")

    cdef void init(self, const shared_ptr[CSparseCSCMatrix]& sp_sparse_tensor):
        # Keep the shared pointer (ownership) and a raw pointer (fast access).
        self.sp_sparse_tensor = sp_sparse_tensor
        self.stp = sp_sparse_tensor.get()
        self.type = pyarrow_wrap_data_type(self.stp.type())

    def __repr__(self):
        return (f"<pyarrow.SparseCSCMatrix>\n"
                f"type: {self.type}\n"
                f"shape: {self.shape}")

    @classmethod
    def from_dense_numpy(cls, obj, dim_names=None):
        """
        Convert numpy.ndarray to arrow::SparseCSCMatrix

        Parameters
        ----------
        obj : numpy.ndarray
            Data used to populate the rows.
        dim_names : list[str], optional
            Names of the dimensions.

        Returns
        -------
        pyarrow.SparseCSCMatrix
        """
        dense = Tensor.from_numpy(obj, dim_names=dim_names)
        return cls.from_tensor(dense)

    @staticmethod
    def from_numpy(data, indptr, indices, shape, dim_names=None):
        """
        Create arrow::SparseCSCMatrix from numpy.ndarrays

        Parameters
        ----------
        data : numpy.ndarray
            Data used to populate the sparse matrix.
        indptr : numpy.ndarray
            Range of the columns,
            The i-th column spans from `indptr[i]` to `indptr[i+1]`
            in the data.
        indices : numpy.ndarray
            Row indices of the corresponding non-zero values.
        shape : tuple
            Shape of the matrix.
        dim_names : list, optional
            Names of the dimensions.

        Returns
        -------
        pyarrow.SparseCSCMatrix

        Raises
        ------
        ValueError
            If `indptr` or `indices` is not 1-dimensional.
        """
        cdef:
            shared_ptr[CSparseCSCMatrix] csparse_tensor
            vector[int64_t] c_shape
            vector[c_string] c_dim_names

        for extent in shape:
            c_shape.push_back(extent)
        if dim_names is not None:
            for name in dim_names:
                c_dim_names.push_back(tobytes(name))

        # Enforce precondition for SparseCSCMatrix indices
        indptr = np.require(indptr, dtype='i8')
        indices = np.require(indices, dtype='i8')
        if indptr.ndim != 1:
            raise ValueError("Expected 1-dimensional array for "
                             "SparseCSCMatrix indptr")
        if indices.ndim != 1:
            raise ValueError("Expected 1-dimensional array for "
                             "SparseCSCMatrix indices")

        check_status(NdarraysToSparseCSCMatrix(c_default_memory_pool(),
                                               data, indptr, indices, c_shape,
                                               c_dim_names, &csparse_tensor))
        return pyarrow_wrap_sparse_csc_matrix(csparse_tensor)

    @staticmethod
    def from_scipy(obj, dim_names=None):
        """
        Convert scipy.sparse.csc_matrix to arrow::SparseCSCMatrix

        Parameters
        ----------
        obj : scipy.sparse.csc_matrix
            The scipy matrix that should be converted.
        dim_names : list, optional
            Names of the dimensions.

        Returns
        -------
        pyarrow.SparseCSCMatrix

        Raises
        ------
        TypeError
            If `obj` is not a scipy.sparse.csc_matrix.
        """
        cdef:
            shared_ptr[CSparseCSCMatrix] csparse_tensor
            vector[int64_t] c_shape
            vector[c_string] c_dim_names

        import scipy.sparse
        if not isinstance(obj, scipy.sparse.csc_matrix):
            raise TypeError(
                "Expected scipy.sparse.csc_matrix, got {}".format(type(obj)))

        for extent in obj.shape:
            c_shape.push_back(extent)
        if dim_names is not None:
            for name in dim_names:
                c_dim_names.push_back(tobytes(name))

        # Enforce precondition for CSparseCSCMatrix indices
        indptr = np.require(obj.indptr, dtype='i8')
        indices = np.require(obj.indices, dtype='i8')

        check_status(NdarraysToSparseCSCMatrix(c_default_memory_pool(),
                                               obj.data, indptr, indices,
                                               c_shape, c_dim_names,
                                               &csparse_tensor))
        return pyarrow_wrap_sparse_csc_matrix(csparse_tensor)

    @staticmethod
    def from_tensor(obj):
        """
        Convert arrow::Tensor to arrow::SparseCSCMatrix

        Parameters
        ----------
        obj : Tensor
            The dense tensor that should be converted.

        Returns
        -------
        pyarrow.SparseCSCMatrix
        """
        cdef:
            shared_ptr[CSparseCSCMatrix] csparse_tensor
            shared_ptr[CTensor] ctensor = pyarrow_unwrap_tensor(obj)

        with nogil:
            check_status(TensorToSparseCSCMatrix(ctensor, &csparse_tensor))

        return pyarrow_wrap_sparse_csc_matrix(csparse_tensor)

    def to_numpy(self):
        """
        Convert arrow::SparseCSCMatrix to numpy.ndarrays with zero copy

        Returns
        -------
        tuple
            A ``(data, indptr, indices)`` triple.

        Raises
        ------
        ImportError
            If NumPy is not available.
        """
        cdef:
            PyObject* out_data
            PyObject* out_indptr
            PyObject* out_indices

        if np is None:
            raise ImportError(
                "Cannot return a numpy.ndarray if NumPy is not present")

        check_status(SparseCSCMatrixToNdarray(self.sp_sparse_tensor, self,
                                              &out_data, &out_indptr,
                                              &out_indices))
        return (PyObject_to_object(out_data), PyObject_to_object(out_indptr),
                PyObject_to_object(out_indices))

    def to_scipy(self):
        """
        Convert arrow::SparseCSCMatrix to scipy.sparse.csc_matrix

        Returns
        -------
        scipy.sparse.csc_matrix
        """
        cdef:
            PyObject* out_data
            PyObject* out_indptr
            PyObject* out_indices

        from scipy.sparse import csc_matrix

        check_status(SparseCSCMatrixToNdarray(self.sp_sparse_tensor, self,
                                              &out_data, &out_indptr,
                                              &out_indices))

        data = PyObject_to_object(out_data)
        indptr = PyObject_to_object(out_indptr)
        indices = PyObject_to_object(out_indices)
        return csc_matrix((data[:, 0], indices, indptr), shape=self.shape)

    def to_tensor(self):
        """
        Convert arrow::SparseCSCMatrix to arrow::Tensor

        Returns
        -------
        pyarrow.Tensor
        """
        cdef shared_ptr[CTensor] ctensor

        with nogil:
            ctensor = GetResultValue(self.stp.ToTensor())

        return pyarrow_wrap_tensor(ctensor)

    def equals(self, SparseCSCMatrix other not None):
        """
        Return true if sparse tensors contains exactly equal data

        Parameters
        ----------
        other : SparseCSCMatrix
            The other tensor to compare for equality.

        Returns
        -------
        bool

        Raises
        ------
        TypeError
            If `other` is None or not a SparseCSCMatrix.
        """
        # `not None` matters: other.stp is a C-level attribute, and reading
        # it from None would be an invalid memory access.
        return self.stp.Equals(deref(other.stp))

    def __eq__(self, other):
        if not isinstance(other, SparseCSCMatrix):
            return NotImplemented
        return self.equals(other)

    @property
    def is_mutable(self):
        """Whether the underlying C++ sparse matrix reports itself mutable."""
        return self.stp.is_mutable()

    @property
    def ndim(self):
        """Number of dimensions reported by the C++ sparse matrix."""
        return self.stp.ndim()

    @property
    def shape(self):
        """Shape of this sparse matrix, as a tuple."""
        return tuple(self.stp.shape())

    @property
    def size(self):
        """Size reported by the C++ sparse matrix."""
        return self.stp.size()

    def dim_name(self, i):
        """
        Returns the name of the i-th tensor dimension.

        Parameters
        ----------
        i : int
            The physical index of the tensor dimension.

        Returns
        -------
        str
        """
        return frombytes(self.stp.dim_name(i))

    @property
    def dim_names(self):
        """Names of this matrix's dimensions, as a tuple of str."""
        raw_names = tuple(self.stp.dim_names())
        return tuple(frombytes(raw) for raw in raw_names)

    @property
    def non_zero_length(self):
        """Non-zero length reported by the C++ sparse matrix."""
        return self.stp.non_zero_length()
|
|
|
|
|
cdef class SparseCSFTensor(_Weakrefable):
    """
    A sparse CSF tensor.

    CSF is a generalization of compressed sparse row (CSR) index.

    CSF index recursively compresses each dimension of a tensor into a set
    of prefix trees. Each path from a root to leaf forms one tensor
    non-zero index. CSF is implemented with two arrays of buffers and one
    arrays of integers.

    Instances are not created through the constructor; use one of the
    ``pyarrow.SparseCSFTensor.from_*`` functions instead.
    """

    def __init__(self):
        raise TypeError("Do not call SparseCSFTensor's constructor directly, "
                        "use one of the `pyarrow.SparseCSFTensor.from_*` "
                        "functions instead.")

    cdef void init(self, const shared_ptr[CSparseCSFTensor]& sp_sparse_tensor):
        # Keep the shared pointer (ownership) and a raw pointer (fast access).
        self.sp_sparse_tensor = sp_sparse_tensor
        self.stp = sp_sparse_tensor.get()
        self.type = pyarrow_wrap_data_type(self.stp.type())

    def __repr__(self):
        return (f"<pyarrow.SparseCSFTensor>\n"
                f"type: {self.type}\n"
                f"shape: {self.shape}")

    @classmethod
    def from_dense_numpy(cls, obj, dim_names=None):
        """
        Convert numpy.ndarray to arrow::SparseCSFTensor

        Parameters
        ----------
        obj : numpy.ndarray
            Data used to populate the rows.
        dim_names : list[str], optional
            Names of the dimensions.

        Returns
        -------
        pyarrow.SparseCSFTensor
        """
        dense = Tensor.from_numpy(obj, dim_names=dim_names)
        return cls.from_tensor(dense)

    @staticmethod
    def from_numpy(data, indptr, indices, shape, axis_order=None,
                   dim_names=None):
        """
        Create arrow::SparseCSFTensor from numpy.ndarrays

        Parameters
        ----------
        data : numpy.ndarray
            Data used to populate the sparse tensor.
        indptr : list or tuple of numpy.ndarray
            The sparsity structure.
            Each two consecutive dimensions in a tensor correspond to
            a buffer in indices.
            A pair of consecutive values at `indptr[dim][i]`
            `indptr[dim][i + 1]` signify a range of nodes in
            `indices[dim + 1]` who are children of `indices[dim][i]` node.
            Must contain ``len(shape) - 1`` 1-dimensional arrays.
        indices : list or tuple of numpy.ndarray
            Stores values of nodes.
            Each tensor dimension corresponds to a buffer in indptr.
            Must contain ``len(shape)`` 1-dimensional arrays.
        shape : tuple
            Shape of the matrix.
        axis_order : list, optional
            the sequence in which dimensions were traversed to
            produce the prefix tree.
            Defaults to ``np.argsort(shape)`` when None or empty.
        dim_names : list, optional
            Names of the dimensions.

        Returns
        -------
        pyarrow.SparseCSFTensor

        Raises
        ------
        TypeError
            If `indptr` or `indices` is not a list or tuple.
        ValueError
            If `indptr` or `indices` has the wrong number of arrays, or
            contains an array that is not 1-dimensional.
        """
        cdef:
            shared_ptr[CSparseCSFTensor] csparse_tensor
            vector[int64_t] c_axis_order
            vector[int64_t] c_shape
            vector[c_string] c_dim_names

        for extent in shape:
            c_shape.push_back(extent)
        # Explicit None/empty test: `not axis_order` raises ValueError for a
        # NumPy array with more than one element, which is the natural type
        # to pass here (the default itself is an ndarray).
        if axis_order is None or len(axis_order) == 0:
            axis_order = np.argsort(shape)
        for axis in axis_order:
            c_axis_order.push_back(axis)
        if dim_names is not None:
            for name in dim_names:
                c_dim_names.push_back(tobytes(name))

        # Enforce preconditions for SparseCSFTensor indices
        if not (isinstance(indptr, (list, tuple)) and
                isinstance(indices, (list, tuple))):
            raise TypeError("Expected list or tuple, got {}, {}"
                            .format(type(indptr), type(indices)))
        if len(indptr) != len(shape) - 1:
            # One indptr buffer per pair of consecutive dimensions, so the
            # expected count is ndim - 1 (not ndim).
            raise ValueError("Expected list of {ndim} np.arrays for "
                             "SparseCSFTensor.indptr"
                             .format(ndim=len(shape) - 1))
        if len(indices) != len(shape):
            raise ValueError("Expected list of {ndim} np.arrays for "
                             "SparseCSFTensor.indices".format(ndim=len(shape)))
        if any([x.ndim != 1 for x in indptr]):
            raise ValueError("Expected a list of 1-dimensional arrays for "
                             "SparseCSFTensor.indptr")
        if any([x.ndim != 1 for x in indices]):
            raise ValueError("Expected a list of 1-dimensional arrays for "
                             "SparseCSFTensor.indices")
        indptr = [np.require(arr, dtype='i8') for arr in indptr]
        indices = [np.require(arr, dtype='i8') for arr in indices]

        check_status(NdarraysToSparseCSFTensor(c_default_memory_pool(), data,
                                               indptr, indices, c_shape,
                                               c_axis_order, c_dim_names,
                                               &csparse_tensor))
        return pyarrow_wrap_sparse_csf_tensor(csparse_tensor)

    @staticmethod
    def from_tensor(obj):
        """
        Convert arrow::Tensor to arrow::SparseCSFTensor

        Parameters
        ----------
        obj : Tensor
            The dense tensor that should be converted.

        Returns
        -------
        pyarrow.SparseCSFTensor
        """
        cdef:
            shared_ptr[CSparseCSFTensor] csparse_tensor
            shared_ptr[CTensor] ctensor = pyarrow_unwrap_tensor(obj)

        with nogil:
            check_status(TensorToSparseCSFTensor(ctensor, &csparse_tensor))

        return pyarrow_wrap_sparse_csf_tensor(csparse_tensor)

    def to_numpy(self):
        """
        Convert arrow::SparseCSFTensor to numpy.ndarrays with zero copy

        Returns
        -------
        tuple
            A ``(data, indptr, indices)`` triple.

        Raises
        ------
        ImportError
            If NumPy is not available.
        """
        cdef:
            PyObject* out_data
            PyObject* out_indptr
            PyObject* out_indices

        if np is None:
            raise ImportError(
                "Cannot return a numpy.ndarray if NumPy is not present")

        check_status(SparseCSFTensorToNdarray(self.sp_sparse_tensor, self,
                                              &out_data, &out_indptr,
                                              &out_indices))
        return (PyObject_to_object(out_data), PyObject_to_object(out_indptr),
                PyObject_to_object(out_indices))

    def to_tensor(self):
        """
        Convert arrow::SparseCSFTensor to arrow::Tensor

        Returns
        -------
        pyarrow.Tensor
        """
        cdef shared_ptr[CTensor] ctensor

        with nogil:
            ctensor = GetResultValue(self.stp.ToTensor())

        return pyarrow_wrap_tensor(ctensor)

    def equals(self, SparseCSFTensor other not None):
        """
        Return true if sparse tensors contains exactly equal data

        Parameters
        ----------
        other : SparseCSFTensor
            The other tensor to compare for equality.

        Returns
        -------
        bool

        Raises
        ------
        TypeError
            If `other` is None or not a SparseCSFTensor.
        """
        # `not None` matters: other.stp is a C-level attribute, and reading
        # it from None would be an invalid memory access.
        return self.stp.Equals(deref(other.stp))

    def __eq__(self, other):
        if not isinstance(other, SparseCSFTensor):
            return NotImplemented
        return self.equals(other)

    @property
    def is_mutable(self):
        """Whether the underlying C++ sparse tensor reports itself mutable."""
        return self.stp.is_mutable()

    @property
    def ndim(self):
        """Number of dimensions reported by the C++ sparse tensor."""
        return self.stp.ndim()

    @property
    def shape(self):
        """Shape of this sparse tensor, as a tuple."""
        return tuple(self.stp.shape())

    @property
    def size(self):
        """Size reported by the C++ sparse tensor."""
        return self.stp.size()

    def dim_name(self, i):
        """
        Returns the name of the i-th tensor dimension.

        Parameters
        ----------
        i : int
            The physical index of the tensor dimension.

        Returns
        -------
        str
        """
        return frombytes(self.stp.dim_name(i))

    @property
    def dim_names(self):
        """Names of this tensor's dimensions, as a tuple of str."""
        raw_names = tuple(self.stp.dim_names())
        return tuple(frombytes(raw) for raw in raw_names)

    @property
    def non_zero_length(self):
        """Non-zero length reported by the C++ sparse tensor."""
        return self.stp.non_zero_length()
|
|