File size: 4,448 Bytes
9c6594c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 |
from dataclasses import dataclass, field
from typing import TYPE_CHECKING, Any, ClassVar, Optional, Union
import pyarrow as pa
if TYPE_CHECKING:
from .features import FeatureType
@dataclass
class Translation:
"""`Feature` for translations with fixed languages per example.
Here for compatibility with tfds.
Args:
languages (`dict`):
A dictionary for each example mapping string language codes to string translations.
Example:
```python
>>> # At construction time:
>>> datasets.features.Translation(languages=['en', 'fr', 'de'])
>>> # During data generation:
>>> yield {
... 'en': 'the cat',
... 'fr': 'le chat',
... 'de': 'die katze'
... }
```
"""
languages: list[str]
id: Optional[str] = None
# Automatically constructed
dtype: ClassVar[str] = "dict"
pa_type: ClassVar[Any] = None
_type: str = field(default="Translation", init=False, repr=False)
def __call__(self):
return pa.struct({lang: pa.string() for lang in sorted(self.languages)})
def flatten(self) -> Union["FeatureType", dict[str, "FeatureType"]]:
"""Flatten the Translation feature into a dictionary."""
from .features import Value
return {k: Value("string") for k in sorted(self.languages)}
@dataclass
class TranslationVariableLanguages:
"""`Feature` for translations with variable languages per example.
Here for compatibility with tfds.
Args:
languages (`dict`):
A dictionary for each example mapping string language codes to one or more string translations.
The languages present may vary from example to example.
Returns:
- `language` or `translation` (variable-length 1D `tf.Tensor` of `tf.string`):
Language codes sorted in ascending order or plain text translations, sorted to align with language codes.
Example:
```python
>>> # At construction time:
>>> datasets.features.TranslationVariableLanguages(languages=['en', 'fr', 'de'])
>>> # During data generation:
>>> yield {
... 'en': 'the cat',
... 'fr': ['le chat', 'la chatte,']
... 'de': 'die katze'
... }
>>> # Tensor returned :
>>> {
... 'language': ['en', 'de', 'fr', 'fr'],
... 'translation': ['the cat', 'die katze', 'la chatte', 'le chat'],
... }
```
"""
languages: Optional[list] = None
num_languages: Optional[int] = None
id: Optional[str] = None
# Automatically constructed
dtype: ClassVar[str] = "dict"
pa_type: ClassVar[Any] = None
_type: str = field(default="TranslationVariableLanguages", init=False, repr=False)
def __post_init__(self):
self.languages = sorted(set(self.languages)) if self.languages else None
self.num_languages = len(self.languages) if self.languages else None
def __call__(self):
return pa.struct({"language": pa.list_(pa.string()), "translation": pa.list_(pa.string())})
def encode_example(self, translation_dict):
lang_set = set(self.languages)
if set(translation_dict) == {"language", "translation"}:
return translation_dict
elif self.languages and set(translation_dict) - lang_set:
raise ValueError(
f"Some languages in example ({', '.join(sorted(set(translation_dict) - lang_set))}) are not in valid set ({', '.join(lang_set)})."
)
# Convert dictionary into tuples, splitting out cases where there are
# multiple translations for a single language.
translation_tuples = []
for lang, text in translation_dict.items():
if isinstance(text, str):
translation_tuples.append((lang, text))
else:
translation_tuples.extend([(lang, el) for el in text])
# Ensure translations are in ascending order by language code.
languages, translations = zip(*sorted(translation_tuples))
return {"language": languages, "translation": translations}
def flatten(self) -> Union["FeatureType", dict[str, "FeatureType"]]:
"""Flatten the TranslationVariableLanguages feature into a dictionary."""
from .features import Sequence, Value
return {
"language": Sequence(Value("string")),
"translation": Sequence(Value("string")),
}
|