from typing import List, Optional, Union

import numpy as np
import tensorflow as tf

from .feature_extraction_utils import BatchFeature
from .tokenization_utils_base import BatchEncoding
from .utils import logging


logger = logging.get_logger(__name__)


def shape_list(tensor: Union[tf.Tensor, np.ndarray]) -> List[int]:
    """
    Deal with dynamic shape in tensorflow cleanly.

    Args:
        tensor (`tf.Tensor` or `np.ndarray`): The tensor we want the shape of.

    Returns:
        `List[int]`: The shape of the tensor as a list.
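
    Example (a minimal illustration; with a fully static shape the entries are plain Python
    ints, while dynamic dimensions come back as scalar tensors):

    ```python
    >>> shape_list(tf.zeros((2, 3)))
    [2, 3]
    ```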
    """
    if isinstance(tensor, np.ndarray):
        return list(tensor.shape)

    dynamic = tf.shape(tensor)

    if tensor.shape == tf.TensorShape(None):
        return dynamic

    static = tensor.shape.as_list()

    return [dynamic[i] if s is None else s for i, s in enumerate(static)]


def stable_softmax(logits: tf.Tensor, axis: Optional[int] = None, name: Optional[str] = None) -> tf.Tensor:
    """
    Stable wrapper that returns the same output as `tf.nn.softmax`, but that works reliably with XLA on CPU. It is
    meant as a workaround for the [following issue](https://github.com/tensorflow/tensorflow/issues/55682), and will be
    removed after it gets fixed. The arguments and outputs are the same as `tf.nn.softmax`, and it relies on the fact
    that `softmax(x) = softmax(x + c)` (see https://ogunlao.github.io/2020/04/26/you_dont_really_know_softmax.html).

    Args:
        logits (`tf.Tensor`):
            Must be one of the following types: half, float32, float64.
        axis (`int`, *optional*):
            The dimension softmax would be performed on. The default is -1 which indicates the last dimension.
        name (`str`, *optional*):
            A name for the operation.

    Returns:
        `tf.Tensor`:
            A Tensor. Has the same type and shape as logits.
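
    Example (illustrative only; the tiny constant shift leaves the result unchanged up to
    float precision):

    ```python
    >>> logits = tf.constant([[1.0, 2.0, 3.0]])
    >>> bool(tf.reduce_all(tf.abs(stable_softmax(logits) - tf.nn.softmax(logits)) < 1e-6))
    True
    ```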
    """
    # Shifting the logits by a small constant leaves the softmax output unchanged
    # (softmax(x) == softmax(x + c)) while avoiding the problematic XLA-on-CPU kernel
    # described in the issue linked above
    return tf.nn.softmax(logits=logits + 1e-9, axis=axis, name=name)


def functional_layernorm(inputs, weight, bias, epsilon=1e-5, axis=-1):
    """
    A simplified functional layer norm, mirroring the behavior of `torch.nn.functional.layer_norm`
    for the common case of a single normalization axis with 1D `weight` and `bias` tensors.
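
    Example (illustrative; with unit weight and zero bias this just standardizes the last axis):

    ```python
    >>> x = tf.random.normal((2, 5, 8))
    >>> functional_layernorm(x, tf.ones((8,)), tf.zeros((8,))).shape
    TensorShape([2, 5, 8])
    ```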
|
    """
    if weight.shape.rank != 1 or bias.shape.rank != 1 or not isinstance(axis, int):
        raise NotImplementedError("Only 1D weight and bias tensors are supported for now, with only a single axis.")

    # Get the mean and variance on the axis to be normalized
    mean, variance = tf.nn.moments(inputs, axes=[axis], keepdims=True)

    if axis != -1:
        # Reshape weight and bias to the rank of inputs, with size 1 on every dimension except
        # `axis`, so they broadcast correctly in the batch_normalization call below
        shape = [1] * inputs.shape.rank
        shape[axis] = shape_list(inputs)[axis]
        weight = tf.reshape(weight, shape)
        bias = tf.reshape(bias, shape)
|
    # With keepdims statistics, batch_normalization applies exactly the layer norm formula:
    # (inputs - mean) / sqrt(variance + epsilon) * weight + bias
    outputs = tf.nn.batch_normalization(
        inputs,
        mean,
        variance,
        offset=bias,
        scale=weight,
        variance_epsilon=epsilon,
    )
    return outputs


def scaled_dot_product_attention(
    query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False, scale: Optional[float] = None
):
    """
    TF equivalent for torch's `nn.functional.scaled_dot_product_attention`, without dropout support.
|
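    Example (an illustrative call with a causal mask; shapes are (batch, heads, seq_len, head_dim)):

    ```python
    >>> q = tf.random.normal((1, 2, 4, 8))
    >>> scaled_dot_product_attention(q, q, q, is_causal=True).shape
    TensorShape([1, 2, 4, 8])
    ```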
    """
    if dropout_p != 0.0:
        raise ValueError(
            "Dropout is not supported in this implementation - file an issue "
            "with Transformers and ping @Rocketknight1 if you need it for a port!"
        )
    if is_causal and attn_mask is not None:
        raise ValueError("You cannot specify an attn_mask and is_causal at the same time!")
    if is_causal:
        # Build a lower-triangular (query_len, key_len) causal mask
        attn_mask = tf.ones((tf.shape(query)[-2], tf.shape(key)[-2]), dtype=tf.int32)
        attn_mask = tf.experimental.numpy.tril(attn_mask, k=0)
    if attn_mask is not None and (attn_mask.dtype.is_integer or attn_mask.dtype.is_bool):
        # Convert boolean/integer masks to an additive float mask: 0 where attention is allowed,
        # a large negative value where it is blocked
        attn_mask = tf.where(attn_mask > 0, tf.cast(0.0, query.dtype), tf.cast(-1000.0, query.dtype))
    # Attention probabilities: softmax over scaled query-key similarities, then weight the values
    logits = tf.einsum("...qd, ...kd -> ...qk", query, key)
    if scale is None:
        # Default to 1/sqrt(head_dim), as in the standard formulation
        scale = tf.cast(tf.shape(key)[-1], logits.dtype) ** -0.5
    logits *= scale
    if attn_mask is not None:
        logits += attn_mask
    probs = tf.nn.softmax(logits)
    return probs @ value


def flatten(input, start_dim=0, end_dim=-1):
    """
    Replicates the behavior of `torch.flatten` in TF: collapses the dimensions from `start_dim`
    to `end_dim` (inclusive) into a single dimension.
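
    Example (illustrative):

    ```python
    >>> flatten(tf.zeros((2, 3, 4)), start_dim=1).shape
    TensorShape([2, 12])
    ```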
|
    """
    # Negative dims are counted from the end, as in PyTorch
    if end_dim < 0:
        end_dim += input.shape.rank
    if start_dim < 0:
        start_dim += input.shape.rank

    if start_dim == end_dim:
        return input

    in_shape = tf.shape(input)
    flattened_dim = tf.math.reduce_prod(in_shape[start_dim : end_dim + 1])
    out_shape = tf.concat([in_shape[:start_dim], [flattened_dim], in_shape[end_dim + 1 :]], axis=0)
    return tf.reshape(input, out_shape)


def invert_attention_mask(encoder_attention_mask: tf.Tensor) -> tf.Tensor:
    """
    Invert an attention mask (e.g., switches 0. and 1.).

    Args:
        encoder_attention_mask (`tf.Tensor`): An attention mask.

    Returns:
        `tf.Tensor`: The inverted attention mask.
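
    Example (illustrative; attended positions become 0 and masked positions a large negative
    number suitable for adding to attention logits):

    ```python
    >>> mask = tf.constant([[1.0, 1.0, 0.0]])
    >>> invert_attention_mask(mask).shape
    TensorShape([1, 1, 1, 3])
    ```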
    """
    if not isinstance(encoder_attention_mask, tf.Tensor):
        encoder_attention_mask = tf.convert_to_tensor(encoder_attention_mask)  # Catches stray NumPy inputs
    if encoder_attention_mask.shape.rank == 3:
        encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :]
    if encoder_attention_mask.shape.rank == 2:
        encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :]

    # Flip the mask (1 -> 0, 0 -> 1) and scale it by the most negative value of its dtype, so
    # that adding it to attention scores effectively removes masked positions from the softmax
    encoder_extended_attention_mask = (
        tf.cast(1, encoder_attention_mask.dtype) - encoder_extended_attention_mask
    ) * encoder_extended_attention_mask.dtype.min

    return encoder_extended_attention_mask


def check_embeddings_within_bounds(tensor: tf.Tensor, embed_dim: int, tensor_name: str = "input_ids") -> None:
    """
    `tf.gather`, on which TF embedding layers are based, won't check positive out of bound indices on GPU, returning
    zeros instead. This function adds a check against that dangerous silent behavior.

    Args:
        tensor (`tf.Tensor`): The tensor of indices to check.
        embed_dim (`int`): The embedding dimension.
        tensor_name (`str`, *optional*): The name of the tensor to use in the error message.
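
    Example (illustrative; in-range ids pass silently, out-of-range ids raise an
    `InvalidArgumentError` in eager mode):

    ```python
    >>> check_embeddings_within_bounds(tf.constant([[0, 5, 9]]), embed_dim=10)
    ```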
    """
    tf.debugging.assert_less(
        tensor,
        tf.cast(embed_dim, dtype=tensor.dtype),
        message=(
            f"The maximum value of {tensor_name} ({tf.math.reduce_max(tensor)}) must be smaller than the embedding "
            f"layer's input dimension ({embed_dim}). The likely cause is some problem at tokenization time."
        ),
    )


def save_attributes_to_hdf5_group(group, name, data):
    """Saves attributes (data) of the specified name into the HDF5 group.

    This method deals with an inherent problem of HDF5 files, which are not able to store data larger than
    HDF5_OBJECT_HEADER_LIMIT bytes.

    Args:
        group: A pointer to a HDF5 group.
        name: A name of the attributes to save.
        data: Attributes data to store.

    Raises:
        RuntimeError: If any single attribute is too large to be saved.

    Copied from Keras to Transformers to avoid versioning issues.
    """
    HDF5_OBJECT_HEADER_LIMIT = 64512
|
    # Check that no single item in `data` is larger than `HDF5_OBJECT_HEADER_LIMIT`,
    # because in that case even chunking the data would not make saving possible
    bad_attributes = [x for x in data if len(x) > HDF5_OBJECT_HEADER_LIMIT]

    # Expecting this to never be true
    if bad_attributes:
        raise RuntimeError(
            "The following attributes cannot be saved to HDF5 file because "
            f"they are larger than {HDF5_OBJECT_HEADER_LIMIT} "
            f"bytes: {bad_attributes}"
        )

    data_npy = np.asarray(data)

    num_chunks = 1
    chunked_data = np.array_split(data_npy, num_chunks)

    # Split into more and more chunks until each one fits; the size check above guarantees
    # that this loop terminates
    while any(x.nbytes > HDF5_OBJECT_HEADER_LIMIT for x in chunked_data):
        num_chunks += 1
        chunked_data = np.array_split(data_npy, num_chunks)

    if num_chunks > 1:
        # Store the chunks under suffixed names: name0, name1, ...
        for chunk_id, chunk_data in enumerate(chunked_data):
            group.attrs["%s%d" % (name, chunk_id)] = chunk_data
    else:
        group.attrs[name] = data


def load_attributes_from_hdf5_group(group, name):
    """Loads attributes of the specified name from the HDF5 group.

    This method deals with an inherent problem of HDF5 files, which are not able to store data larger than
    HDF5_OBJECT_HEADER_LIMIT bytes.

    Args:
        group: A pointer to a HDF5 group.
        name: A name of the attributes to load.

    Returns:
        data: Attributes data.

    Copied from Keras to Transformers to avoid versioning issues.
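
    Example (an illustrative round trip through an in-memory HDF5 file, assuming `h5py` is
    installed):

    ```python
    >>> import h5py
    >>> with h5py.File("attrs.h5", "w", driver="core", backing_store=False) as f:
    ...     save_attributes_to_hdf5_group(f, "layer_names", [b"dense", b"conv"])
    ...     load_attributes_from_hdf5_group(f, "layer_names")
    ['dense', 'conv']
    ```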
    """
    if name in group.attrs:
        data = [n.decode("utf8") if hasattr(n, "decode") else n for n in group.attrs[name]]
    else:
        data = []
        chunk_id = 0
        # Reassemble chunked attributes saved under suffixed names: name0, name1, ...
        while "%s%d" % (name, chunk_id) in group.attrs:
            data.extend(
                [n.decode("utf8") if hasattr(n, "decode") else n for n in group.attrs["%s%d" % (name, chunk_id)]]
            )
            chunk_id += 1
    return data


def expand_1d(data):
    """
    Expands 1-dimensional `Tensor`s into 2-dimensional `Tensor`s.
    Copied from Keras to here to avoid versioning issues.
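
    Example (illustrative; the transformation maps over nested structures):

    ```python
    >>> expand_1d({"labels": tf.constant([0, 1])})["labels"].shape
    TensorShape([2, 1])
    ```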
    """

    def _expand_single_1d_tensor(t):
        # Only expand rank-1 dense tensors; pass everything else through unchanged
        if isinstance(t, tf.Tensor) and t.shape.rank == 1:
            return tf.expand_dims(t, axis=-1)
        return t

    return tf.nest.map_structure(_expand_single_1d_tensor, data)


def convert_batch_encoding(*args, **kwargs):
    """
    Converts `BatchEncoding` and `BatchFeature` objects in the inputs to plain dicts that Keras understands.
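
    Example (illustrative; assumes a `BatchEncoding` built directly from a dict):

    ```python
    >>> enc = BatchEncoding({"input_ids": [[0, 1, 2]]})
    >>> args, kwargs = convert_batch_encoding(enc, training=False)
    >>> type(args[0])
    <class 'dict'>
    ```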
|
    """
    if args and isinstance(args[0], (BatchEncoding, BatchFeature)):
        args = list(args)
        args[0] = dict(args[0])
    elif "x" in kwargs and isinstance(kwargs["x"], (BatchEncoding, BatchFeature)):
        kwargs["x"] = dict(kwargs["x"])
    return args, kwargs
|
|