Commit d4e78fd (verified) · committed by sdadas · 1 parent: a56dc66

Upload 9 files

added_tokens.json ADDED
@@ -0,0 +1,66 @@
+{
+  "<mask>": 128000,
+  "<user_token_10>": 128010,
+  "<user_token_11>": 128011,
+  "<user_token_12>": 128012,
+  "<user_token_13>": 128013,
+  "<user_token_14>": 128014,
+  "<user_token_15>": 128015,
+  "<user_token_16>": 128016,
+  "<user_token_17>": 128017,
+  "<user_token_18>": 128018,
+  "<user_token_19>": 128019,
+  "<user_token_1>": 128001,
+  "<user_token_20>": 128020,
+  "<user_token_21>": 128021,
+  "<user_token_22>": 128022,
+  "<user_token_23>": 128023,
+  "<user_token_24>": 128024,
+  "<user_token_25>": 128025,
+  "<user_token_26>": 128026,
+  "<user_token_27>": 128027,
+  "<user_token_28>": 128028,
+  "<user_token_29>": 128029,
+  "<user_token_2>": 128002,
+  "<user_token_30>": 128030,
+  "<user_token_31>": 128031,
+  "<user_token_32>": 128032,
+  "<user_token_33>": 128033,
+  "<user_token_34>": 128034,
+  "<user_token_35>": 128035,
+  "<user_token_36>": 128036,
+  "<user_token_37>": 128037,
+  "<user_token_38>": 128038,
+  "<user_token_39>": 128039,
+  "<user_token_3>": 128003,
+  "<user_token_40>": 128040,
+  "<user_token_41>": 128041,
+  "<user_token_42>": 128042,
+  "<user_token_43>": 128043,
+  "<user_token_44>": 128044,
+  "<user_token_45>": 128045,
+  "<user_token_46>": 128046,
+  "<user_token_47>": 128047,
+  "<user_token_48>": 128048,
+  "<user_token_49>": 128049,
+  "<user_token_4>": 128004,
+  "<user_token_50>": 128050,
+  "<user_token_51>": 128051,
+  "<user_token_52>": 128052,
+  "<user_token_53>": 128053,
+  "<user_token_54>": 128054,
+  "<user_token_55>": 128055,
+  "<user_token_56>": 128056,
+  "<user_token_57>": 128057,
+  "<user_token_58>": 128058,
+  "<user_token_59>": 128059,
+  "<user_token_5>": 128005,
+  "<user_token_60>": 128060,
+  "<user_token_61>": 128061,
+  "<user_token_62>": 128062,
+  "<user_token_63>": 128063,
+  "<user_token_6>": 128006,
+  "<user_token_7>": 128007,
+  "<user_token_8>": 128008,
+  "<user_token_9>": 128009
+}
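
For reference, a minimal sketch of how these entries surface through the tokenizer once the checkpoint is loaded; the repository id below is a placeholder, not the actual model name, and the printed ids are simply the values listed in added_tokens.json above.

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("sdadas/<this-repo>")  # hypothetical repo id

# Each added token should resolve to the id recorded in added_tokens.json.
print(tokenizer.convert_tokens_to_ids("<mask>"))           # expected: 128000
print(tokenizer.convert_tokens_to_ids("<user_token_1>"))   # expected: 128001
print(tokenizer.convert_tokens_to_ids("<user_token_63>"))  # expected: 128063
```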
config.json ADDED
@@ -0,0 +1,42 @@
+{
+  "architectures": [
+    "RobertaForSequenceClassification"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "auto_map": {
+    "AutoConfig": "configuration_roberta.RobertaConfig",
+    "AutoModel": "modeling_roberta.RobertaModel",
+    "AutoModelForCausalLM": "modeling_roberta.RobertaForCausalLM",
+    "AutoModelForMaskedLM": "modeling_roberta.RobertaForMaskedLM",
+    "AutoModelForMultipleChoice": "modeling_roberta.RobertaForMultipleChoice",
+    "AutoModelForQuestionAnswering": "modeling_roberta.RobertaForQuestionAnswering",
+    "AutoModelForSequenceClassification": "modeling_roberta.RobertaForSequenceClassification",
+    "AutoModelForTokenClassification": "modeling_roberta.RobertaForTokenClassification"
+  },
+  "bos_token_id": 0,
+  "classifier_dropout": null,
+  "eos_token_id": 2,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 1024,
+  "id2label": {
+    "0": "LABEL_0"
+  },
+  "initializer_range": 0.02,
+  "intermediate_size": 4096,
+  "label2id": {
+    "LABEL_0": 0
+  },
+  "layer_norm_eps": 1e-05,
+  "max_position_embeddings": 8194,
+  "model_type": "roberta",
+  "num_attention_heads": 16,
+  "num_hidden_layers": 24,
+  "pad_token_id": 1,
+  "position_embedding_type": "absolute",
+  "torch_dtype": "float32",
+  "transformers_version": "4.48.3",
+  "type_vocab_size": 1,
+  "use_cache": true,
+  "vocab_size": 128064
+}
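
Because `auto_map` points the `Auto*` classes at the bundled `configuration_roberta.py` and `modeling_roberta.py`, loading this checkpoint goes through `trust_remote_code=True`. A minimal sketch, assuming a placeholder repository id and, optionally, a working flash-attn install:

```python
import torch
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    "sdadas/<this-repo>",                     # hypothetical repo id
    trust_remote_code=True,                   # resolve classes via "auto_map" in config.json
    torch_dtype=torch.bfloat16,               # optional; the checkpoint itself is stored as float32
    attn_implementation="flash_attention_2",  # or "sdpa" / "eager", per ROBERTA_ATTENTION_CLASSES below
)
```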
configuration_roberta.py ADDED
@@ -0,0 +1,151 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""RoBERTa configuration"""
+from collections import OrderedDict
+from typing import Mapping
+
+from transformers import PretrainedConfig
+from transformers.onnx import OnnxConfig
+from transformers.utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class RobertaConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`RobertaModel`] or a [`TFRobertaModel`]. It is
+    used to instantiate a RoBERTa model according to the specified arguments, defining the model architecture.
+    Instantiating a configuration with the defaults will yield a similar configuration to that of the RoBERTa
+    [FacebookAI/roberta-base](https://huggingface.co/FacebookAI/roberta-base) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 50265):
+            Vocabulary size of the RoBERTa model. Defines the number of different tokens that can be represented by
+            the `inputs_ids` passed when calling [`RobertaModel`] or [`TFRobertaModel`].
+        hidden_size (`int`, *optional*, defaults to 768):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_hidden_layers (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        intermediate_size (`int`, *optional*, defaults to 3072):
+            Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
+        hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the attention probabilities.
+        max_position_embeddings (`int`, *optional*, defaults to 512):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        type_vocab_size (`int`, *optional*, defaults to 2):
+            The vocabulary size of the `token_type_ids` passed when calling [`RobertaModel`] or [`TFRobertaModel`].
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
+            The epsilon used by the layer normalization layers.
+        position_embedding_type (`str`, *optional*, defaults to `"absolute"`):
+            Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For
+            positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to
+            [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155).
+            For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models
+            with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658).
+        is_decoder (`bool`, *optional*, defaults to `False`):
+            Whether the model is used as a decoder or not. If `False`, the model is used as an encoder.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        classifier_dropout (`float`, *optional*):
+            The dropout ratio for the classification head.
+
+    Examples:
+
+    ```python
+    >>> from transformers import RobertaConfig, RobertaModel
+
+    >>> # Initializing a RoBERTa configuration
+    >>> configuration = RobertaConfig()
+
+    >>> # Initializing a model (with random weights) from the configuration
+    >>> model = RobertaModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "roberta"
+
+    def __init__(
+        self,
+        vocab_size=50265,
+        hidden_size=768,
+        num_hidden_layers=12,
+        num_attention_heads=12,
+        intermediate_size=3072,
+        hidden_act="gelu",
+        hidden_dropout_prob=0.1,
+        attention_probs_dropout_prob=0.1,
+        max_position_embeddings=512,
+        type_vocab_size=2,
+        initializer_range=0.02,
+        layer_norm_eps=1e-12,
+        pad_token_id=1,
+        bos_token_id=0,
+        eos_token_id=2,
+        position_embedding_type="absolute",
+        use_cache=True,
+        classifier_dropout=None,
+        **kwargs,
+    ):
+        super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
+
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.hidden_act = hidden_act
+        self.intermediate_size = intermediate_size
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.max_position_embeddings = max_position_embeddings
+        self.type_vocab_size = type_vocab_size
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
+        self.position_embedding_type = position_embedding_type
+        self.use_cache = use_cache
+        self.classifier_dropout = classifier_dropout
+
+
+class RobertaOnnxConfig(OnnxConfig):
+    @property
+    def inputs(self) -> Mapping[str, Mapping[int, str]]:
+        if self.task == "multiple-choice":
+            dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"}
+        else:
+            dynamic_axis = {0: "batch", 1: "sequence"}
+        return OrderedDict(
+            [
+                ("input_ids", dynamic_axis),
+                ("attention_mask", dynamic_axis),
+            ]
+        )
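
To tie the two configuration files together, here is a short illustration of instantiating this `RobertaConfig` with the values that config.json above actually uses instead of the class defaults; it is illustrative only and assumes the module is importable from the working directory.

```python
from configuration_roberta import RobertaConfig

# Values copied from config.json; everything else keeps the defaults defined above.
config = RobertaConfig(
    vocab_size=128064,
    hidden_size=1024,
    num_hidden_layers=24,
    num_attention_heads=16,
    intermediate_size=4096,
    max_position_embeddings=8194,
    type_vocab_size=1,
    layer_norm_eps=1e-5,
)
print(config.model_type)  # "roberta"
```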
model.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3043a8d96704ebec7dc9921ba4bc97dee6ba24b9ced9c8ca9813b79e8ec2b535
+size 1771613516
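
The entry above is a Git LFS pointer rather than the weights themselves. A hedged sketch of verifying a downloaded copy against the sha256 recorded in the pointer (the repository id is again a placeholder):

```python
import hashlib
from huggingface_hub import hf_hub_download

path = hf_hub_download("sdadas/<this-repo>", "model.safetensors")  # hypothetical repo id

sha = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
        sha.update(chunk)
print(sha.hexdigest() == "3043a8d96704ebec7dc9921ba4bc97dee6ba24b9ced9c8ca9813b79e8ec2b535")
```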
modeling_roberta.py ADDED
@@ -0,0 +1,1973 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch RoBERTa model."""
+
+import math
+from typing import List, Optional, Tuple, Union
+
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from transformers.activations import ACT2FN, gelu
+from transformers.modeling_outputs import (
+    BaseModelOutputWithPastAndCrossAttentions,
+    BaseModelOutputWithPoolingAndCrossAttentions,
+    CausalLMOutputWithCrossAttentions,
+    MaskedLMOutput,
+    MultipleChoiceModelOutput,
+    QuestionAnsweringModelOutput,
+    SequenceClassifierOutput,
+    TokenClassifierOutput,
+)
+from transformers.modeling_utils import PreTrainedModel
+from transformers.pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
+from transformers.utils import (
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    is_flash_attn_2_available,
+    is_flash_attn_greater_or_equal_2_10,
+    logging,
+    replace_return_docstrings,
+)
+from .configuration_roberta import RobertaConfig
+
+
+if is_flash_attn_2_available():
+    from flash_attn import flash_attn_func, flash_attn_varlen_func
+    from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input  # noqa
+
+
+logger = logging.get_logger(__name__)
+
+_CHECKPOINT_FOR_DOC = "FacebookAI/roberta-base"
+_CONFIG_FOR_DOC = "RobertaConfig"
+
+
+# Copied from https://github.com/MeetKai/functionary/blob/main/functionary/train/packing/monkey_patch_packing.py
+def _get_max_seqlen_in_batch(attention_mask):
+    max_num = torch.max(attention_mask)
+    # attention_mask: B x N
+    counts = []
+    for i in range(1, max_num + 1):
+        counts.append(
+            torch.sum(attention_mask == i, axis=-1)
+        )  # shape: B, count length of data point masked with i
+    result = torch.stack(counts, axis=1)
+    result = result.flatten()
+    return result[result.nonzero()].squeeze(-1).to(dtype=torch.int32)
+
+
+@torch.compiler.disable(recursive=False)
+def get_unpad_data(attention_mask):
+    seqlens_in_batch = _get_max_seqlen_in_batch(
+        attention_mask
+    )  # attention_mask.sum(dim=-1, dtype=torch.int32)
+    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
+    max_seqlen_in_batch = seqlens_in_batch.max().item()
+    cu_seqlens = F.pad(
+        torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0)
+    )
+    return (
+        indices,
+        cu_seqlens,
+        max_seqlen_in_batch,
+    )
+
+
+class RobertaEmbeddings(nn.Module):
+    """
+    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
+    """
+
+    # Copied from transformers.models.bert.modeling_bert.BertEmbeddings.__init__
+    def __init__(self, config):
+        super().__init__()
+        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
+        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
+        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
+
+        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
+        # any TensorFlow checkpoint file
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
+        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
+        self.register_buffer(
+            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
+        )
+        self.register_buffer(
+            "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
+        )
+
+        # End copy
+        self.padding_idx = config.pad_token_id
+        self.position_embeddings = nn.Embedding(
+            config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
+        )
+
+    def forward(
+        self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0
+    ):
+        if position_ids is None:
+            if input_ids is not None:
+                # Create the position ids from the input token ids. Any padded tokens remain padded.
+                position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length)
+            else:
+                position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)
+
+        if input_ids is not None:
+            input_shape = input_ids.size()
+        else:
+            input_shape = inputs_embeds.size()[:-1]
+
+        seq_length = input_shape[1]
+
+        # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs
+        # when it's auto-generated; the registered buffer helps users when tracing the model without passing token_type_ids,
+        # solves issue #5664
+        if token_type_ids is None:
+            if hasattr(self, "token_type_ids"):
+                buffered_token_type_ids = self.token_type_ids[:, :seq_length]
+                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
+                token_type_ids = buffered_token_type_ids_expanded
+            else:
+                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
+
+        if inputs_embeds is None:
+            inputs_embeds = self.word_embeddings(input_ids)
+        token_type_embeddings = self.token_type_embeddings(token_type_ids)
+
+        embeddings = inputs_embeds + token_type_embeddings
+        if self.position_embedding_type == "absolute":
+            position_embeddings = self.position_embeddings(position_ids)
+            embeddings += position_embeddings
+        embeddings = self.LayerNorm(embeddings)
+        embeddings = self.dropout(embeddings)
+        return embeddings
+
+    def create_position_ids_from_inputs_embeds(self, inputs_embeds):
+        """
+        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.
+
+        Args:
+            inputs_embeds: torch.Tensor
+
+        Returns: torch.Tensor
+        """
+        input_shape = inputs_embeds.size()[:-1]
+        sequence_length = input_shape[1]
+
+        position_ids = torch.arange(
+            self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
+        )
+        return position_ids.unsqueeze(0).expand(input_shape)
+
+
+# Copied from transformers.models.bert.modeling_bert.BertSelfAttention with Bert->Roberta
+class RobertaSelfAttention(nn.Module):
+    def __init__(self, config, position_embedding_type=None):
+        super().__init__()
+        self.config = config
+        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
+            raise ValueError(
+                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
+                f"heads ({config.num_attention_heads})"
+            )
+
+        self.num_attention_heads = config.num_attention_heads
+        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+
+        self.query = nn.Linear(config.hidden_size, self.all_head_size)
+        self.key = nn.Linear(config.hidden_size, self.all_head_size)
+        self.value = nn.Linear(config.hidden_size, self.all_head_size)
+
+        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+        self.position_embedding_type = position_embedding_type or getattr(
+            config, "position_embedding_type", "absolute"
+        )
+        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
+            self.max_position_embeddings = config.max_position_embeddings
+            self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)
+
+        self.is_decoder = config.is_decoder
+
+    def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
+        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
+        x = x.view(new_x_shape)
+        return x.permute(0, 2, 1, 3)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+        output_attentions: Optional[bool] = False,
+        original_attention_mask: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor]:
+        mixed_query_layer = self.query(hidden_states)
+
+        # If this is instantiated as a cross-attention module, the keys
+        # and values come from an encoder; the attention mask needs to be
+        # such that the encoder's padding tokens are not attended to.
+        is_cross_attention = encoder_hidden_states is not None
+
+        if is_cross_attention and past_key_value is not None:
+            # reuse k,v, cross_attentions
+            key_layer = past_key_value[0]
+            value_layer = past_key_value[1]
+            attention_mask = encoder_attention_mask
+        elif is_cross_attention:
+            key_layer = self.transpose_for_scores(self.key(encoder_hidden_states))
+            value_layer = self.transpose_for_scores(self.value(encoder_hidden_states))
+            attention_mask = encoder_attention_mask
+        elif past_key_value is not None:
+            key_layer = self.transpose_for_scores(self.key(hidden_states))
+            value_layer = self.transpose_for_scores(self.value(hidden_states))
+            key_layer = torch.cat([past_key_value[0], key_layer], dim=2)
+            value_layer = torch.cat([past_key_value[1], value_layer], dim=2)
+        else:
+            key_layer = self.transpose_for_scores(self.key(hidden_states))
+            value_layer = self.transpose_for_scores(self.value(hidden_states))
+
+        query_layer = self.transpose_for_scores(mixed_query_layer)
+
+        use_cache = past_key_value is not None
+        if self.is_decoder:
+            # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
+            # Further calls to cross_attention layer can then reuse all cross-attention
+            # key/value_states (first "if" case)
+            # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
+            # all previous decoder key/value_states. Further calls to uni-directional self-attention
+            # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
+            # if encoder bi-directional self-attention `past_key_value` is always `None`
+            past_key_value = (key_layer, value_layer)
+
+        # Take the dot product between "query" and "key" to get the raw attention scores.
+        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
+
+        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
+            query_length, key_length = query_layer.shape[2], key_layer.shape[2]
+            if use_cache:
+                position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=hidden_states.device).view(
+                    -1, 1
+                )
+            else:
+                position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1)
+            position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1)
+            distance = position_ids_l - position_ids_r
+
+            positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)
+            positional_embedding = positional_embedding.to(dtype=query_layer.dtype)  # fp16 compatibility
+
+            if self.position_embedding_type == "relative_key":
+                relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
+                attention_scores = attention_scores + relative_position_scores
+            elif self.position_embedding_type == "relative_key_query":
+                relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
+                relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
+                attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key
+
+        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
+        if attention_mask is not None:
+            # Apply the attention mask (precomputed for all layers in RobertaModel forward() function)
+            attention_scores = attention_scores + attention_mask
+
+        # Normalize the attention scores to probabilities.
+        attention_probs = nn.functional.softmax(attention_scores, dim=-1)
+
+        # This is actually dropping out entire tokens to attend to, which might
+        # seem a bit unusual, but is taken from the original Transformer paper.
+        attention_probs = self.dropout(attention_probs)
+
+        # Mask heads if we want to
+        if head_mask is not None:
+            attention_probs = attention_probs * head_mask
+
+        context_layer = torch.matmul(attention_probs, value_layer)
+
+        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
+        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
+        context_layer = context_layer.view(new_context_layer_shape)
+
+        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
+
+        if self.is_decoder:
+            outputs = outputs + (past_key_value,)
+        return outputs
+
+
+class RobertaFlashAttention2(RobertaSelfAttention):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
+        # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignment, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
+        # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
+        self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
+
+        self.is_causal = False
+
+        if self.position_embedding_type != "absolute":
+            raise ValueError("RobertaFlashAttention2 only supports absolute position embeddings")
+
+    def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
+        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
+        x = x.view(new_x_shape)
+        return x
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+        output_attentions: Optional[bool] = False,
+        original_attention_mask: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, ...]:
+        """
+        Parameters:
+            query: torch.tensor(bs, seq_length, dim)
+            key: torch.tensor(bs, seq_length, dim)
+            value: torch.tensor(bs, seq_length, dim)
+            mask: torch.tensor(bs, seq_length)
+
+        Returns:
+            weights: torch.tensor(bs, n_heads, seq_length, seq_length) Attention weights context: torch.tensor(bs,
+            seq_length, dim) Contextualized layer. Optional: only if `output_attentions=True`
+        """
+        if output_attentions:
+            raise ValueError("RobertaFlashAttention2 attention does not support output_attentions")
+        if head_mask is not None:
+            raise ValueError("RobertaFlashAttention2 attention does not support head_mask")
+
+        mixed_query_layer = self.query(hidden_states)
+
+        # If this is instantiated as a cross-attention module, the keys
+        # and values come from an encoder; the attention mask needs to be
+        # such that the encoder's padding tokens are not attended to.
+        is_cross_attention = encoder_hidden_states is not None
+
+        if is_cross_attention and past_key_value is not None:
+            # reuse k,v, cross_attentions
+            key_states = past_key_value[0]
+            value_states = past_key_value[1]
+            attention_mask = encoder_attention_mask
+        elif is_cross_attention:
+            key_states = self.transpose_for_scores(self.key(encoder_hidden_states))
+            value_states = self.transpose_for_scores(self.value(encoder_hidden_states))
+            attention_mask = encoder_attention_mask
+        elif past_key_value is not None:
+            key_states = self.transpose_for_scores(self.key(hidden_states))
+            value_states = self.transpose_for_scores(self.value(hidden_states))
+            key_states = torch.cat([past_key_value[0], key_states], dim=2)
+            value_states = torch.cat([past_key_value[1], value_states], dim=2)
+        else:
+            key_states = self.transpose_for_scores(self.key(hidden_states))
+            value_states = self.transpose_for_scores(self.value(hidden_states))
+
+        # attention_mask is in the "extended attention mask" format at this stage, i.e. it's 0 for positions that need attention
+        # and the lowest possible value for positions that should be masked. So, an "all attention" mask sums to 0.
+        # In that case, we can safely set it to None to avoid unnecessary computation for variable length attention.
+        if original_attention_mask is not None:
+            attention_mask = original_attention_mask
+        elif attention_mask.sum().item() == 0:
+            attention_mask = None
+        else:
+            # Otherwise, we want to undo the "extended attention mask" format, as flash attention doesn't work with it.
+            attention_mask = torch.where(attention_mask[:, 0, 0, :] == 0, 1.0, 0.0)
+
+        query_states = self.transpose_for_scores(mixed_query_layer)
+        # At this stage, the key, value and query states all have the shape of
+        # batch_size x seq_len x head_dim x hidden_dim
+
+        if self.is_decoder:
+            # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
+            # Further calls to cross_attention layer can then reuse all cross-attention
+            # key/value_states (first "if" case)
+            # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
+            # all previous decoder key/value_states. Further calls to uni-directional self-attention
+            # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
+            # if encoder bi-directional self-attention `past_key_value` is always `None`
+            past_key_value = (key_states, value_states)
+
+        seq_len = query_states.shape[1]
+
+        attn_dropout = self.config.attention_probs_dropout_prob if self.training else 0.0
+
+        # In PEFT, usually we cast the layer norms in float32 for training stability reasons,
+        # therefore the input hidden states get silently cast to float32. Hence, we need to
+        # cast them back to the correct dtype just to be sure everything works as expected.
+        # This might slow down training & inference so it is recommended to not cast the LayerNorms
+        # in fp32.
+
+        if query_states.dtype == torch.float32:
+            if torch.is_autocast_enabled():
+                target_dtype = torch.get_autocast_gpu_dtype()
+            # Handle the case where the model is quantized
+            elif hasattr(self.config, "_pre_quantization_dtype"):
+                target_dtype = self.config._pre_quantization_dtype
+            else:
+                target_dtype = self.query.weight.dtype
+
+            logger.warning_once(
+                f"The input hidden states seems to be silently casted in float32, this might be related to"
+                f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
+                f" {target_dtype}."
+            )
+
+            query_states = query_states.to(target_dtype)
+            key_states = key_states.to(target_dtype)
+            value_states = value_states.to(target_dtype)
+
+        attn_weights = self._flash_attention_forward(
+            query_states, key_states, value_states, attention_mask, seq_len, dropout=attn_dropout
+        )
+
+        new_shape = attn_weights.size()[:-2] + (self.all_head_size,)
+        attn_output = attn_weights.view(new_shape)
+
+        outputs = (attn_output,)
+
+        if self.is_decoder:
+            outputs = outputs + (past_key_value,)
+        return outputs
+
+    # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2._flash_attention_forward
+    def _flash_attention_forward(
+        self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None
+    ):
+        """
+        Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
+        first unpad the input, then computes the attention scores and pad the final attention scores.
+
+        Args:
+            query_states (`torch.Tensor`):
+                Input query states to be passed to Flash Attention API
+            key_states (`torch.Tensor`):
+                Input key states to be passed to Flash Attention API
+            value_states (`torch.Tensor`):
+                Input value states to be passed to Flash Attention API
+            attention_mask (`torch.Tensor`):
+                The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
+                position of padding tokens and 1 for the position of non-padding tokens.
+            dropout (`float`):
+                Attention dropout
+            softmax_scale (`float`, *optional*):
+                The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
+        """
+        if not self._flash_attn_uses_top_left_mask:
+            causal = self.is_causal
+        else:
+            # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__.
+            causal = self.is_causal and query_length != 1
+
+        # Contains at least one padding token in the sequence
+        if attention_mask is not None:
+            batch_size = query_states.shape[0]
+            query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
+                query_states, key_states, value_states, attention_mask, query_length
+            )
+
+            cu_seqlens_q, cu_seqlens_k = cu_seq_lens
+            max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
+
+            attn_output_unpad = flash_attn_varlen_func(
+                query_states,
+                key_states,
+                value_states,
+                cu_seqlens_q=cu_seqlens_q,
+                cu_seqlens_k=cu_seqlens_k,
+                max_seqlen_q=max_seqlen_in_batch_q,
+                max_seqlen_k=max_seqlen_in_batch_k,
+                dropout_p=dropout,
+                softmax_scale=softmax_scale,
+                causal=causal,
+            )
+
+            attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
+        else:
+            attn_output = flash_attn_func(
+                query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal
+            )
+
+        return attn_output
+
+    # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2._upad_input with num_heads->num_attention_heads
+    def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
+        indices_k, cu_seqlens_k, max_seqlen_in_batch_k = get_unpad_data(attention_mask)
+        batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape
+
+        key_layer = index_first_axis(
+            key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
+        )
+        value_layer = index_first_axis(
+            value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
+        )
+        if query_length == kv_seq_len:
+            query_layer = index_first_axis(
+                query_layer.reshape(batch_size * kv_seq_len, self.num_attention_heads, head_dim), indices_k
+            )
+            cu_seqlens_q = cu_seqlens_k
+            max_seqlen_in_batch_q = max_seqlen_in_batch_k
+            indices_q = indices_k
+        elif query_length == 1:
+            max_seqlen_in_batch_q = 1
+            cu_seqlens_q = torch.arange(
+                batch_size + 1, dtype=torch.int32, device=query_layer.device
+            )  # There is a memcpy here, that is very bad.
+            indices_q = cu_seqlens_q[:-1]
+            query_layer = query_layer.squeeze(1)
+        else:
+            # The -q_len: slice assumes left padding.
+            attention_mask = attention_mask[:, -query_length:]
+            query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)
+
+        return (
+            query_layer,
+            key_layer,
+            value_layer,
+            indices_q,
+            (cu_seqlens_q, cu_seqlens_k),
+            (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
+        )
+
+
+class RobertaSdpaAttention(RobertaSelfAttention):
+    """
+    Roberta attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
+    `RobertaSelfAttention` as the weights of the module stay untouched. The only changes are on the forward pass to
+    adapt to the SDPA API.
+    """
+
+    def __init__(self, config, position_embedding_type=None):
+        super().__init__(config, position_embedding_type)
+
+        self.is_causal = False
+
+        if self.position_embedding_type != "absolute":
+            raise ValueError("RobertaSdpaAttention only supports absolute position embeddings")
+
+    # Adapted from LlamaAttention.forward and RobertaFlashAttention2.forward
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+        output_attentions: Optional[bool] = False,
+        original_attention_mask: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        if output_attentions:
+            # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
+            logger.warning_once(
+                "RobertaModel is using RobertaSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
+                'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
+            )
+            return super().forward(
+                hidden_states=hidden_states,
+                attention_mask=attention_mask,
+                head_mask=head_mask,
+                encoder_hidden_states=encoder_hidden_states,
+                encoder_attention_mask=encoder_attention_mask,
+                past_key_value=past_key_value,
+                output_attentions=output_attentions,
+            )
+
+        mixed_query_layer = self.query(hidden_states)
+
+        # If this is instantiated as a cross-attention module, the keys
+        # and values come from an encoder; the attention mask needs to be
+        # such that the encoder's padding tokens are not attended to.
+        is_cross_attention = encoder_hidden_states is not None
+
+        if is_cross_attention and past_key_value is not None:
+            # reuse k,v, cross_attentions
+            key_states = past_key_value[0]
+            value_states = past_key_value[1]
+            attention_mask = encoder_attention_mask
+        elif is_cross_attention:
+            key_states = self.transpose_for_scores(self.key(encoder_hidden_states))
+            value_states = self.transpose_for_scores(self.value(encoder_hidden_states))
+            attention_mask = encoder_attention_mask
+        elif past_key_value is not None:
+            key_states = self.transpose_for_scores(self.key(hidden_states))
+            value_states = self.transpose_for_scores(self.value(hidden_states))
+            key_states = torch.cat([past_key_value[0], key_states], dim=2)
+            value_states = torch.cat([past_key_value[1], value_states], dim=2)
+        else:
+            key_states = self.transpose_for_scores(self.key(hidden_states))
+            value_states = self.transpose_for_scores(self.value(hidden_states))
+
+        query_states = self.transpose_for_scores(mixed_query_layer)
+        # At this stage, the key, value and query states all have the shape of
+        # batch_size x head_dim x seq_len x hidden_dim
+
+        if self.is_decoder:
+            # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
+            # Further calls to cross_attention layer can then reuse all cross-attention
+            # key/value_states (first "if" case)
+            # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
+            # all previous decoder key/value_states. Further calls to uni-directional self-attention
+            # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
+            # if encoder bi-directional self-attention `past_key_value` is always `None`
+            past_key_value = (key_states, value_states)
+
+        batch_size, _, seq_len, _ = query_states.size()
+
+        attn_dropout = self.config.attention_probs_dropout_prob if self.training else 0.0
+
+        # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
+        # Reference: https://github.com/pytorch/pytorch/issues/112577.
+        if query_states.device.type == "cuda" and attention_mask is not None:
+            query_states = query_states.contiguous()
+            key_states = key_states.contiguous()
+            value_states = value_states.contiguous()
+
+        # In case we are not compiling, we may set `causal_mask` to None, which is required to dispatch to SDPA's Flash Attention 2 backend,
+        # rather than relying on the `is_causal` argument.
+        attn_output = torch.nn.functional.scaled_dot_product_attention(
+            query_states,
+            key_states,
+            value_states,
+            attn_mask=attention_mask,
+            dropout_p=attn_dropout,
+            is_causal=self.is_causal and attention_mask is None and seq_len > 1,
+        )
+
+        if attn_output.size() != (batch_size, self.num_attention_heads, seq_len, self.attention_head_size):
+            raise ValueError(
+                f"`attn_output` should be of size {(batch_size, self.num_attention_heads, seq_len, self.attention_head_size)}, but is"
+                f" {attn_output.size()}"
+            )
+
+        attn_output = attn_output.transpose(1, 2)
+        attn_output = attn_output.reshape(batch_size, seq_len, self.all_head_size)
+
+        outputs = (attn_output,)
+
+        if self.is_decoder:
+            outputs = outputs + (past_key_value,)
+        return outputs
+
+
+ROBERTA_ATTENTION_CLASSES = {
+    "eager": RobertaSelfAttention,
+    "sdpa": RobertaSdpaAttention,
+    "flash_attention_2": RobertaFlashAttention2,
+}
+
+
+# Copied from transformers.models.bert.modeling_bert.BertSelfOutput
+class RobertaSelfOutput(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = self.LayerNorm(hidden_states + input_tensor)
+        return hidden_states
+
+
+class RobertaAttention(nn.Module):
+    def __init__(self, config, position_embedding_type=None):
+        super().__init__()
+        self.self = ROBERTA_ATTENTION_CLASSES[config._attn_implementation](
+            config,
+            position_embedding_type=position_embedding_type,
+        )
+        self.output = RobertaSelfOutput(config)
+        self.pruned_heads = set()
+
+    # Copied from transformers.models.bert.modeling_bert.BertAttention.prune_heads
+    def prune_heads(self, heads):
+        if len(heads) == 0:
+            return
+        heads, index = find_pruneable_heads_and_indices(
+            heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
+        )
+
+        # Prune linear layers
+        self.self.query = prune_linear_layer(self.self.query, index)
+        self.self.key = prune_linear_layer(self.self.key, index)
+        self.self.value = prune_linear_layer(self.self.value, index)
+        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
+
+        # Update hyper params and store pruned heads
+        self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
+        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
+        self.pruned_heads = self.pruned_heads.union(heads)
+
+    # Copied from transformers.models.bert.modeling_bert.BertAttention.forward
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+        output_attentions: Optional[bool] = False,
+        original_attention_mask: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor]:
+        self_outputs = self.self(
+            hidden_states,
+            attention_mask,
+            head_mask,
+            encoder_hidden_states,
+            encoder_attention_mask,
+            past_key_value,
+            output_attentions,
+            original_attention_mask,
+        )
+        attention_output = self.output(self_outputs[0], hidden_states)
+        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
+        return outputs
+
+
+# Copied from transformers.models.bert.modeling_bert.BertIntermediate
+class RobertaIntermediate(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
+        if isinstance(config.hidden_act, str):
+            self.intermediate_act_fn = ACT2FN[config.hidden_act]
+        else:
+            self.intermediate_act_fn = config.hidden_act
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.intermediate_act_fn(hidden_states)
+        return hidden_states
+
+
+# Copied from transformers.models.bert.modeling_bert.BertOutput
+class RobertaOutput(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = self.LayerNorm(hidden_states + input_tensor)
+        return hidden_states
+
+
+# Copied from transformers.models.bert.modeling_bert.BertLayer with Bert->Roberta
+class RobertaLayer(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.chunk_size_feed_forward = config.chunk_size_feed_forward
+        self.seq_len_dim = 1
+        self.attention = RobertaAttention(config)
+        self.is_decoder = config.is_decoder
+        self.add_cross_attention = config.add_cross_attention
+        if self.add_cross_attention:
+            if not self.is_decoder:
+                raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
+            self.crossattention = RobertaAttention(config, position_embedding_type="absolute")
+        self.intermediate = RobertaIntermediate(config)
+        self.output = RobertaOutput(config)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+        output_attentions: Optional[bool] = False,
+        original_attention_mask: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor]:
+        # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
+        self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
+        self_attention_outputs = self.attention(
+            hidden_states,
+            attention_mask,
+            head_mask,
+            output_attentions=output_attentions,
+            past_key_value=self_attn_past_key_value,
+            original_attention_mask=original_attention_mask,
+        )
+        attention_output = self_attention_outputs[0]
+
+        # if decoder, the last output is tuple of self-attn cache
+        if self.is_decoder:
+            outputs = self_attention_outputs[1:-1]
+            present_key_value = self_attention_outputs[-1]
+        else:
+            outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights
+
+        cross_attn_present_key_value = None
+        if self.is_decoder and encoder_hidden_states is not None:
+            if not hasattr(self, "crossattention"):
+                raise ValueError(
+                    f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers"
+                    " by setting `config.add_cross_attention=True`"
+                )
+
+            # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple
+            cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
+            cross_attention_outputs = self.crossattention(
+                attention_output,
+                attention_mask,
+                head_mask,
+                encoder_hidden_states,
+                encoder_attention_mask,
+                cross_attn_past_key_value,
+                output_attentions,
+            )
+            attention_output = cross_attention_outputs[0]
+            outputs = outputs + cross_attention_outputs[1:-1]  # add cross attentions if we output attention weights
+
+            # add cross-attn cache to positions 3,4 of present_key_value tuple
+            cross_attn_present_key_value = cross_attention_outputs[-1]
+            present_key_value = present_key_value + cross_attn_present_key_value
+
+        layer_output = apply_chunking_to_forward(
+            self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
+        )
+        outputs = (layer_output,) + outputs
+
+        # if decoder, return the attn key/values as the last output
+        if self.is_decoder:
+            outputs = outputs + (present_key_value,)
+
+        return outputs
+
+    def feed_forward_chunk(self, attention_output):
+        intermediate_output = self.intermediate(attention_output)
+        layer_output = self.output(intermediate_output, attention_output)
+        return layer_output
+
+
+# Copied from transformers.models.bert.modeling_bert.BertEncoder with Bert->Roberta
+class RobertaEncoder(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.layer = nn.ModuleList([RobertaLayer(config) for _ in range(config.num_hidden_layers)])
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = False,
+        output_hidden_states: Optional[bool] = False,
+        return_dict: Optional[bool] = True,
+        original_attention_mask: Optional[torch.Tensor] = None,
+    ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attentions = () if output_attentions else None
+        all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
+
+        if self.gradient_checkpointing and self.training:
+            if use_cache:
+                logger.warning_once(
+                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                )
+                use_cache = False
+
+        next_decoder_cache = () if use_cache else None
+        for i, layer_module in enumerate(self.layer):
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+            layer_head_mask = head_mask[i] if head_mask is not None else None
+            past_key_value = past_key_values[i] if past_key_values is not None else None
+
+            if self.gradient_checkpointing and self.training:
+                layer_outputs = self._gradient_checkpointing_func(
+                    layer_module.__call__,
+                    hidden_states,
+                    attention_mask,
+                    layer_head_mask,
+                    encoder_hidden_states,
+                    encoder_attention_mask,
+                    past_key_value,
+                    output_attentions,
+                    original_attention_mask,
+                )
+            else:
+                layer_outputs = layer_module(
+                    hidden_states,
+                    attention_mask,
+                    layer_head_mask,
+                    encoder_hidden_states,
+                    encoder_attention_mask,
+                    past_key_value,
+                    output_attentions,
+                    original_attention_mask,
+                )
+
+            hidden_states = layer_outputs[0]
+            if use_cache:
+                next_decoder_cache += (layer_outputs[-1],)
+            if output_attentions:
+                all_self_attentions = all_self_attentions + (layer_outputs[1],)
+                if self.config.add_cross_attention:
+                    all_cross_attentions = all_cross_attentions + (layer_outputs[2],)
+
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(
+                v
+                for v in [
+                    hidden_states,
+                    next_decoder_cache,
+                    all_hidden_states,
+                    all_self_attentions,
+                    all_cross_attentions,
+                ]
+                if v is not None
+            )
+        return BaseModelOutputWithPastAndCrossAttentions(
+            last_hidden_state=hidden_states,
+            past_key_values=next_decoder_cache,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attentions,
+            cross_attentions=all_cross_attentions,
+        )
+
+
+# Copied from transformers.models.bert.modeling_bert.BertPooler
+class RobertaPooler(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.activation = nn.Tanh()
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        # We "pool" the model by simply taking the hidden state corresponding
+        # to the first token.
+        first_token_tensor = hidden_states[:, 0]
+        pooled_output = self.dense(first_token_tensor)
+        pooled_output = self.activation(pooled_output)
+        return pooled_output
+
+
+class RobertaPreTrainedModel(PreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+
+    config_class = RobertaConfig
+    base_model_prefix = "roberta"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["RobertaEmbeddings", "RobertaSelfAttention"]
+    _supports_flash_attn_2 = True
+ _supports_sdpa = True
1001
+
1002
+ # Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel._init_weights
1003
+ def _init_weights(self, module):
1004
+ """Initialize the weights"""
1005
+ if isinstance(module, nn.Linear):
1006
+ # Slightly different from the TF version which uses truncated_normal for initialization
1007
+ # cf https://github.com/pytorch/pytorch/pull/5617
1008
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
1009
+ if module.bias is not None:
1010
+ module.bias.data.zero_()
1011
+ elif isinstance(module, nn.Embedding):
1012
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
1013
+ if module.padding_idx is not None:
1014
+ module.weight.data[module.padding_idx].zero_()
1015
+ elif isinstance(module, nn.LayerNorm):
1016
+ module.bias.data.zero_()
1017
+ module.weight.data.fill_(1.0)
1018
+
1019
+
1020
+ ROBERTA_START_DOCSTRING = r"""
1021
+
1022
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
1023
+ library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
1024
+ etc.)
1025
+
1026
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
1027
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
1028
+ and behavior.
1029
+
1030
+ Parameters:
1031
+ config ([`RobertaConfig`]): Model configuration class with all the parameters of the
1032
+ model. Initializing with a config file does not load the weights associated with the model, only the
1033
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
1034
+ """
1035
+
1036
+ ROBERTA_INPUTS_DOCSTRING = r"""
1037
+ Args:
1038
+ input_ids (`torch.LongTensor` of shape `({0})`):
1039
+ Indices of input sequence tokens in the vocabulary.
1040
+
1041
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
1042
+ [`PreTrainedTokenizer.__call__`] for details.
1043
+
1044
+ [What are input IDs?](../glossary#input-ids)
1045
+ attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
1046
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
1047
+
1048
+ - 1 for tokens that are **not masked**,
1049
+ - 0 for tokens that are **masked**.
1050
+
1051
+ [What are attention masks?](../glossary#attention-mask)
1052
+ token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
1053
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
1054
+
1055
+ - 0 corresponds to a *sentence A* token,
1056
+ - 1 corresponds to a *sentence B* token.
1057
+ This parameter can only be used when the model is initialized with a `type_vocab_size` parameter with value
1058
+ >= 2. All the values in this tensor should always be < type_vocab_size.
1059
+
1060
+ [What are token type IDs?](../glossary#token-type-ids)
1061
+ position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
1062
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
1063
+ config.max_position_embeddings - 1]`.
1064
+
1065
+ [What are position IDs?](../glossary#position-ids)
1066
+ head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
1067
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
1068
+
1069
+ - 1 indicates the head is **not masked**,
1070
+ - 0 indicates the head is **masked**.
1071
+
1072
+ inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
1073
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
1074
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
1075
+ model's internal embedding lookup matrix.
1076
+ output_attentions (`bool`, *optional*):
1077
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
1078
+ tensors for more detail.
1079
+ output_hidden_states (`bool`, *optional*):
1080
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
1081
+ more detail.
1082
+ return_dict (`bool`, *optional*):
1083
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
1084
+ """
1085
+
1086
+
1087
+ @add_start_docstrings(
1088
+ "The bare RoBERTa Model transformer outputting raw hidden-states without any specific head on top.",
1089
+ ROBERTA_START_DOCSTRING,
1090
+ )
1091
+ class RobertaModel(RobertaPreTrainedModel):
1092
+ """
1093
+
1094
+ The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
1095
+ cross-attention is added between the self-attention layers, following the architecture described in *Attention is
1096
+ all you need*_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz
1097
+ Kaiser and Illia Polosukhin.
1098
+
1099
+ To behave as a decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
1100
+ to `True`. To be used in a Seq2Seq model, the model needs to be initialized with both the `is_decoder` argument and
1101
+ `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.
1102
+
1103
+ .. _*Attention is all you need*: https://arxiv.org/abs/1706.03762
1104
+
1105
+ """
1106
+
1107
+ # Copied from transformers.models.bert.modeling_bert.BertModel.__init__ with Bert->Roberta
1108
+ def __init__(self, config, add_pooling_layer=True):
1109
+ super().__init__(config)
1110
+ self.config = config
1111
+
1112
+ self.embeddings = RobertaEmbeddings(config)
1113
+ self.encoder = RobertaEncoder(config)
1114
+
1115
+ self.pooler = RobertaPooler(config) if add_pooling_layer else None
1116
+
1117
+ # Initialize weights and apply final processing
1118
+ self.post_init()
1119
+
1120
+ def get_input_embeddings(self):
1121
+ return self.embeddings.word_embeddings
1122
+
1123
+ def set_input_embeddings(self, value):
1124
+ self.embeddings.word_embeddings = value
1125
+
1126
+ def _prune_heads(self, heads_to_prune):
1127
+ """
1128
+ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
1129
+ class PreTrainedModel
1130
+ """
1131
+ for layer, heads in heads_to_prune.items():
1132
+ self.encoder.layer[layer].attention.prune_heads(heads)
1133
+
1134
+ @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
1135
+ @add_code_sample_docstrings(
1136
+ checkpoint=_CHECKPOINT_FOR_DOC,
1137
+ output_type=BaseModelOutputWithPoolingAndCrossAttentions,
1138
+ config_class=_CONFIG_FOR_DOC,
1139
+ )
1140
+ # Copied from transformers.models.bert.modeling_bert.BertModel.forward
1141
+ def forward(
1142
+ self,
1143
+ input_ids: Optional[torch.Tensor] = None,
1144
+ attention_mask: Optional[torch.Tensor] = None,
1145
+ token_type_ids: Optional[torch.Tensor] = None,
1146
+ position_ids: Optional[torch.Tensor] = None,
1147
+ head_mask: Optional[torch.Tensor] = None,
1148
+ inputs_embeds: Optional[torch.Tensor] = None,
1149
+ encoder_hidden_states: Optional[torch.Tensor] = None,
1150
+ encoder_attention_mask: Optional[torch.Tensor] = None,
1151
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
1152
+ use_cache: Optional[bool] = None,
1153
+ output_attentions: Optional[bool] = None,
1154
+ output_hidden_states: Optional[bool] = None,
1155
+ return_dict: Optional[bool] = None,
1156
+ ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]:
1157
+ r"""
1158
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
1159
+ Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
1160
+ the model is configured as a decoder.
1161
+ encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
1162
+ Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
1163
+ the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
1164
+
1165
+ - 1 for tokens that are **not masked**,
1166
+ - 0 for tokens that are **masked**.
1167
+ past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
1168
+ Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
1169
+
1170
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
1171
+ don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
1172
+ `decoder_input_ids` of shape `(batch_size, sequence_length)`.
1173
+ use_cache (`bool`, *optional*):
1174
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
1175
+ `past_key_values`).
1176
+ """
1177
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1178
+ output_hidden_states = (
1179
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1180
+ )
1181
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1182
+
1183
+ if self.config.is_decoder:
1184
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
1185
+ else:
1186
+ use_cache = False
1187
+
1188
+ if input_ids is not None and inputs_embeds is not None:
1189
+ raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
1190
+ elif input_ids is not None:
1191
+ self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
1192
+ input_shape = input_ids.size()
1193
+ elif inputs_embeds is not None:
1194
+ input_shape = inputs_embeds.size()[:-1]
1195
+ else:
1196
+ raise ValueError("You have to specify either input_ids or inputs_embeds")
1197
+
1198
+ batch_size, seq_length = input_shape
1199
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
1200
+
1201
+ # past_key_values_length
1202
+ past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0
1203
+
1204
+ if attention_mask is None:
1205
+ attention_mask = torch.ones((batch_size, seq_length + past_key_values_length), device=device)
1206
+
1207
+ if token_type_ids is None:
1208
+ if hasattr(self.embeddings, "token_type_ids"):
1209
+ buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length]
1210
+ buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length)
1211
+ token_type_ids = buffered_token_type_ids_expanded
1212
+ else:
1213
+ token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
1214
+
1215
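+ # reduce the incoming mask to {0, 1} before building the standard broadcastable extended mask;
+ # the mask as passed by the caller is forwarded to the encoder unchanged as `original_attention_mask`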
+ binary_attention_mask = torch.where(attention_mask > 0, 1.0, 0.0)
1216
+ # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
1217
+ # ourselves in which case we just need to make it broadcastable to all heads.
1218
+ extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(binary_attention_mask, input_shape)
1219
+
1220
+ # If a 2D or 3D attention mask is provided for the cross-attention
1221
+ # we need to make it broadcastable to [batch_size, num_heads, seq_length, seq_length]
1222
+ if self.config.is_decoder and encoder_hidden_states is not None:
1223
+ encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
1224
+ encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
1225
+ if encoder_attention_mask is None:
1226
+ encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
1227
+ encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
1228
+ else:
1229
+ encoder_extended_attention_mask = None
1230
+
1231
+ # Prepare head mask if needed
1232
+ # 1.0 in head_mask indicate we keep the head
1233
+ # attention_probs has shape bsz x n_heads x N x N
1234
+ # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
1235
+ # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
1236
+ head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
1237
+
1238
+ embedding_output = self.embeddings(
1239
+ input_ids=input_ids,
1240
+ position_ids=position_ids,
1241
+ token_type_ids=token_type_ids,
1242
+ inputs_embeds=inputs_embeds,
1243
+ past_key_values_length=past_key_values_length,
1244
+ )
1245
+ encoder_outputs = self.encoder(
1246
+ embedding_output,
1247
+ attention_mask=extended_attention_mask,
1248
+ head_mask=head_mask,
1249
+ encoder_hidden_states=encoder_hidden_states,
1250
+ encoder_attention_mask=encoder_extended_attention_mask,
1251
+ past_key_values=past_key_values,
1252
+ use_cache=use_cache,
1253
+ output_attentions=output_attentions,
1254
+ output_hidden_states=output_hidden_states,
1255
+ return_dict=return_dict,
1256
+ original_attention_mask=attention_mask
1257
+ )
1258
+ sequence_output = encoder_outputs[0]
1259
+ pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
1260
+
1261
+ if not return_dict:
1262
+ return (sequence_output, pooled_output) + encoder_outputs[1:]
1263
+
1264
+ return BaseModelOutputWithPoolingAndCrossAttentions(
1265
+ last_hidden_state=sequence_output,
1266
+ pooler_output=pooled_output,
1267
+ past_key_values=encoder_outputs.past_key_values,
1268
+ hidden_states=encoder_outputs.hidden_states,
1269
+ attentions=encoder_outputs.attentions,
1270
+ cross_attentions=encoder_outputs.cross_attentions,
1271
+ )
1272
+
1273
+
1274
+ @add_start_docstrings(
1275
+ """RoBERTa Model with a `language modeling` head on top for CLM fine-tuning.""", ROBERTA_START_DOCSTRING
1276
+ )
1277
+ class RobertaForCausalLM(RobertaPreTrainedModel):
1278
+ _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]
1279
+
1280
+ def __init__(self, config):
1281
+ super().__init__(config)
1282
+
1283
+ if not config.is_decoder:
1284
+ logger.warning("If you want to use `RobertaLMHeadModel` as a standalone, add `is_decoder=True.`")
1285
+
1286
+ self.roberta = RobertaModel(config, add_pooling_layer=False)
1287
+ self.lm_head = RobertaLMHead(config)
1288
+
1289
+ # Initialize weights and apply final processing
1290
+ self.post_init()
1291
+
1292
+ def get_output_embeddings(self):
1293
+ return self.lm_head.decoder
1294
+
1295
+ def set_output_embeddings(self, new_embeddings):
1296
+ self.lm_head.decoder = new_embeddings
1297
+
1298
+ @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
1299
+ @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC)
1300
+ def forward(
1301
+ self,
1302
+ input_ids: Optional[torch.LongTensor] = None,
1303
+ attention_mask: Optional[torch.FloatTensor] = None,
1304
+ token_type_ids: Optional[torch.LongTensor] = None,
1305
+ position_ids: Optional[torch.LongTensor] = None,
1306
+ head_mask: Optional[torch.FloatTensor] = None,
1307
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1308
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
1309
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
1310
+ labels: Optional[torch.LongTensor] = None,
1311
+ past_key_values: Tuple[Tuple[torch.FloatTensor]] = None,
1312
+ use_cache: Optional[bool] = None,
1313
+ output_attentions: Optional[bool] = None,
1314
+ output_hidden_states: Optional[bool] = None,
1315
+ return_dict: Optional[bool] = None,
1316
+ ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]:
1317
+ r"""
1318
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
1319
+ Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
1320
+ the model is configured as a decoder.
1321
+ encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
1322
+ Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
1323
+ the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
1324
+
1325
+ - 1 for tokens that are **not masked**,
1326
+ - 0 for tokens that are **masked**.
1327
+
1328
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1329
+ Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
1330
+ `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
1331
+ ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
1332
+ past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
1333
+ Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
1334
+
1335
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
1336
+ don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
1337
+ `decoder_input_ids` of shape `(batch_size, sequence_length)`.
1338
+ use_cache (`bool`, *optional*):
1339
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
1340
+ `past_key_values`).
1341
+
1342
+ Returns:
1343
+
1344
+ Example:
1345
+
1346
+ ```python
1347
+ >>> from transformers import AutoTokenizer, RobertaForCausalLM, AutoConfig
1348
+ >>> import torch
1349
+
1350
+ >>> tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
1351
+ >>> config = AutoConfig.from_pretrained("FacebookAI/roberta-base")
1352
+ >>> config.is_decoder = True
1353
+ >>> model = RobertaForCausalLM.from_pretrained("FacebookAI/roberta-base", config=config)
1354
+
1355
+ >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
1356
+ >>> outputs = model(**inputs)
1357
+
1358
+ >>> prediction_logits = outputs.logits
1359
+ ```"""
1360
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1361
+ if labels is not None:
1362
+ use_cache = False
1363
+
1364
+ outputs = self.roberta(
1365
+ input_ids,
1366
+ attention_mask=attention_mask,
1367
+ token_type_ids=token_type_ids,
1368
+ position_ids=position_ids,
1369
+ head_mask=head_mask,
1370
+ inputs_embeds=inputs_embeds,
1371
+ encoder_hidden_states=encoder_hidden_states,
1372
+ encoder_attention_mask=encoder_attention_mask,
1373
+ past_key_values=past_key_values,
1374
+ use_cache=use_cache,
1375
+ output_attentions=output_attentions,
1376
+ output_hidden_states=output_hidden_states,
1377
+ return_dict=return_dict,
1378
+ )
1379
+
1380
+ sequence_output = outputs[0]
1381
+ prediction_scores = self.lm_head(sequence_output)
1382
+
1383
+ lm_loss = None
1384
+ if labels is not None:
1385
+ # move labels to correct device to enable model parallelism
1386
+ labels = labels.to(prediction_scores.device)
1387
+ # we are doing next-token prediction; shift prediction scores and input ids by one
1388
+ shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous()
1389
+ labels = labels[:, 1:].contiguous()
1390
+ loss_fct = CrossEntropyLoss()
1391
+ lm_loss = loss_fct(shifted_prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
1392
+
1393
+ if not return_dict:
1394
+ output = (prediction_scores,) + outputs[2:]
1395
+ return ((lm_loss,) + output) if lm_loss is not None else output
1396
+
1397
+ return CausalLMOutputWithCrossAttentions(
1398
+ loss=lm_loss,
1399
+ logits=prediction_scores,
1400
+ past_key_values=outputs.past_key_values,
1401
+ hidden_states=outputs.hidden_states,
1402
+ attentions=outputs.attentions,
1403
+ cross_attentions=outputs.cross_attentions,
1404
+ )
1405
+
1406
+ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, **model_kwargs):
1407
+ input_shape = input_ids.shape
1408
+ # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly
1409
+ if attention_mask is None:
1410
+ attention_mask = input_ids.new_ones(input_shape)
1411
+
1412
+ # cut decoder_input_ids if past_key_values is used
1413
+ if past_key_values is not None:
1414
+ past_length = past_key_values[0][0].shape[2]
1415
+
1416
+ # Some generation methods already pass only the last input ID
1417
+ if input_ids.shape[1] > past_length:
1418
+ remove_prefix_length = past_length
1419
+ else:
1420
+ # Default to old behavior: keep only final ID
1421
+ remove_prefix_length = input_ids.shape[1] - 1
1422
+
1423
+ input_ids = input_ids[:, remove_prefix_length:]
1424
+
1425
+ return {"input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": past_key_values}
1426
+
1427
+ def _reorder_cache(self, past_key_values, beam_idx):
1428
+ reordered_past = ()
1429
+ for layer_past in past_key_values:
1430
+ reordered_past += (
1431
+ tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
1432
+ )
1433
+ return reordered_past
1434
+
1435
+
1436
+ @add_start_docstrings("""RoBERTa Model with a `language modeling` head on top.""", ROBERTA_START_DOCSTRING)
1437
+ class RobertaForMaskedLM(RobertaPreTrainedModel):
1438
+ _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]
1439
+
1440
+ def __init__(self, config):
1441
+ super().__init__(config)
1442
+
1443
+ if config.is_decoder:
1444
+ logger.warning(
1445
+ "If you want to use `RobertaForMaskedLM` make sure `config.is_decoder=False` for "
1446
+ "bi-directional self-attention."
1447
+ )
1448
+
1449
+ self.roberta = RobertaModel(config, add_pooling_layer=False)
1450
+ self.lm_head = RobertaLMHead(config)
1451
+
1452
+ # Initialize weights and apply final processing
1453
+ self.post_init()
1454
+
1455
+ def get_output_embeddings(self):
1456
+ return self.lm_head.decoder
1457
+
1458
+ def set_output_embeddings(self, new_embeddings):
1459
+ self.lm_head.decoder = new_embeddings
1460
+
1461
+ @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
1462
+ @add_code_sample_docstrings(
1463
+ checkpoint=_CHECKPOINT_FOR_DOC,
1464
+ output_type=MaskedLMOutput,
1465
+ config_class=_CONFIG_FOR_DOC,
1466
+ mask="<mask>",
1467
+ expected_output="' Paris'",
1468
+ expected_loss=0.1,
1469
+ )
1470
+ def forward(
1471
+ self,
1472
+ input_ids: Optional[torch.LongTensor] = None,
1473
+ attention_mask: Optional[torch.FloatTensor] = None,
1474
+ token_type_ids: Optional[torch.LongTensor] = None,
1475
+ position_ids: Optional[torch.LongTensor] = None,
1476
+ head_mask: Optional[torch.FloatTensor] = None,
1477
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1478
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
1479
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
1480
+ labels: Optional[torch.LongTensor] = None,
1481
+ output_attentions: Optional[bool] = None,
1482
+ output_hidden_states: Optional[bool] = None,
1483
+ return_dict: Optional[bool] = None,
1484
+ ) -> Union[Tuple[torch.Tensor], MaskedLMOutput]:
1485
+ r"""
1486
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1487
+ Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
1488
+ config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
1489
+ loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
1490
+ kwargs (`Dict[str, any]`, optional, defaults to *{}*):
1491
+ Used to hide legacy arguments that have been deprecated.
1492
+ """
1493
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1494
+
1495
+ outputs = self.roberta(
1496
+ input_ids,
1497
+ attention_mask=attention_mask,
1498
+ token_type_ids=token_type_ids,
1499
+ position_ids=position_ids,
1500
+ head_mask=head_mask,
1501
+ inputs_embeds=inputs_embeds,
1502
+ encoder_hidden_states=encoder_hidden_states,
1503
+ encoder_attention_mask=encoder_attention_mask,
1504
+ output_attentions=output_attentions,
1505
+ output_hidden_states=output_hidden_states,
1506
+ return_dict=return_dict,
1507
+ )
1508
+ sequence_output = outputs[0]
1509
+ prediction_scores = self.lm_head(sequence_output)
1510
+
1511
+ masked_lm_loss = None
1512
+ if labels is not None:
1513
+ # move labels to correct device to enable model parallelism
1514
+ labels = labels.to(prediction_scores.device)
1515
+ loss_fct = CrossEntropyLoss()
1516
+ masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
1517
+
1518
+ if not return_dict:
1519
+ output = (prediction_scores,) + outputs[2:]
1520
+ return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
1521
+
1522
+ return MaskedLMOutput(
1523
+ loss=masked_lm_loss,
1524
+ logits=prediction_scores,
1525
+ hidden_states=outputs.hidden_states,
1526
+ attentions=outputs.attentions,
1527
+ )
1528
+
1529
+
1530
+ class RobertaLMHead(nn.Module):
1531
+ """Roberta Head for masked language modeling."""
1532
+
1533
+ def __init__(self, config):
1534
+ super().__init__()
1535
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
1536
+ self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
1537
+
1538
+ self.decoder = nn.Linear(config.hidden_size, config.vocab_size)
1539
+ self.bias = nn.Parameter(torch.zeros(config.vocab_size))
1540
+ self.decoder.bias = self.bias
1541
+
1542
+ def forward(self, features, **kwargs):
1543
+ x = self.dense(features)
1544
+ x = gelu(x)
1545
+ x = self.layer_norm(x)
1546
+
1547
+ # project back to size of vocabulary with bias
1548
+ x = self.decoder(x)
1549
+
1550
+ return x
1551
+
1552
+ def _tie_weights(self):
1553
+ # To tie those two weights if they get disconnected (on TPU or when the bias is resized)
1554
+ # For accelerate compatibility and to not break backward compatibility
1555
+ if self.decoder.bias.device.type == "meta":
1556
+ self.decoder.bias = self.bias
1557
+ else:
1558
+ self.bias = self.decoder.bias
1559
+
1560
+
1561
+ @add_start_docstrings(
1562
+ """
1563
+ RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the
1564
+ pooled output) e.g. for GLUE tasks.
1565
+ """,
1566
+ ROBERTA_START_DOCSTRING,
1567
+ )
1568
+ class RobertaForSequenceClassification(RobertaPreTrainedModel):
1569
+ def __init__(self, config):
1570
+ super().__init__(config)
1571
+ self.num_labels = config.num_labels
1572
+ self.config = config
1573
+
1574
+ self.roberta = RobertaModel(config, add_pooling_layer=False)
1575
+ self.classifier = RobertaClassificationHead(config)
1576
+
1577
+ # Initialize weights and apply final processing
1578
+ self.post_init()
1579
+
1580
+ @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
1581
+ @add_code_sample_docstrings(
1582
+ checkpoint="cardiffnlp/twitter-roberta-base-emotion",
1583
+ output_type=SequenceClassifierOutput,
1584
+ config_class=_CONFIG_FOR_DOC,
1585
+ expected_output="'optimism'",
1586
+ expected_loss=0.08,
1587
+ )
1588
+ def forward(
1589
+ self,
1590
+ input_ids: Optional[torch.LongTensor] = None,
1591
+ attention_mask: Optional[torch.FloatTensor] = None,
1592
+ token_type_ids: Optional[torch.LongTensor] = None,
1593
+ position_ids: Optional[torch.LongTensor] = None,
1594
+ head_mask: Optional[torch.FloatTensor] = None,
1595
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1596
+ labels: Optional[torch.LongTensor] = None,
1597
+ output_attentions: Optional[bool] = None,
1598
+ output_hidden_states: Optional[bool] = None,
1599
+ return_dict: Optional[bool] = None,
1600
+ ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
1601
+ r"""
1602
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1603
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
1604
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss). If
1605
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
1606
+ """
1607
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1608
+
1609
+ outputs = self.roberta(
1610
+ input_ids,
1611
+ attention_mask=attention_mask,
1612
+ token_type_ids=token_type_ids,
1613
+ position_ids=position_ids,
1614
+ head_mask=head_mask,
1615
+ inputs_embeds=inputs_embeds,
1616
+ output_attentions=output_attentions,
1617
+ output_hidden_states=output_hidden_states,
1618
+ return_dict=return_dict,
1619
+ )
1620
+ sequence_output = outputs[0]
1621
+ logits = self.classifier(sequence_output)
1622
+
1623
+ loss = None
1624
+ if labels is not None:
1625
+ # move labels to correct device to enable model parallelism
1626
+ labels = labels.to(logits.device)
1627
+ if self.config.problem_type is None:
1628
+ if self.num_labels == 1:
1629
+ self.config.problem_type = "regression"
1630
+ elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
1631
+ self.config.problem_type = "single_label_classification"
1632
+ else:
1633
+ self.config.problem_type = "multi_label_classification"
1634
+
1635
+ if self.config.problem_type == "regression":
1636
+ loss_fct = MSELoss()
1637
+ if self.num_labels == 1:
1638
+ loss = loss_fct(logits.squeeze(), labels.squeeze())
1639
+ else:
1640
+ loss = loss_fct(logits, labels)
1641
+ elif self.config.problem_type == "single_label_classification":
1642
+ loss_fct = CrossEntropyLoss()
1643
+ loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
1644
+ elif self.config.problem_type == "multi_label_classification":
1645
+ loss_fct = BCEWithLogitsLoss()
1646
+ loss = loss_fct(logits, labels)
1647
+
1648
+ if not return_dict:
1649
+ output = (logits,) + outputs[2:]
1650
+ return ((loss,) + output) if loss is not None else output
1651
+
1652
+ return SequenceClassifierOutput(
1653
+ loss=loss,
1654
+ logits=logits,
1655
+ hidden_states=outputs.hidden_states,
1656
+ attentions=outputs.attentions,
1657
+ )
1658
+
1659
+
1660
+ @add_start_docstrings(
1661
+ """
1662
+ Roberta Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
1663
+ softmax) e.g. for RocStories/SWAG tasks.
1664
+ """,
1665
+ ROBERTA_START_DOCSTRING,
1666
+ )
1667
+ class RobertaForMultipleChoice(RobertaPreTrainedModel):
1668
+ def __init__(self, config):
1669
+ super().__init__(config)
1670
+
1671
+ self.roberta = RobertaModel(config)
1672
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
1673
+ self.classifier = nn.Linear(config.hidden_size, 1)
1674
+
1675
+ # Initialize weights and apply final processing
1676
+ self.post_init()
1677
+
1678
+ @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
1679
+ @add_code_sample_docstrings(
1680
+ checkpoint=_CHECKPOINT_FOR_DOC,
1681
+ output_type=MultipleChoiceModelOutput,
1682
+ config_class=_CONFIG_FOR_DOC,
1683
+ )
1684
+ def forward(
1685
+ self,
1686
+ input_ids: Optional[torch.LongTensor] = None,
1687
+ token_type_ids: Optional[torch.LongTensor] = None,
1688
+ attention_mask: Optional[torch.FloatTensor] = None,
1689
+ labels: Optional[torch.LongTensor] = None,
1690
+ position_ids: Optional[torch.LongTensor] = None,
1691
+ head_mask: Optional[torch.FloatTensor] = None,
1692
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1693
+ output_attentions: Optional[bool] = None,
1694
+ output_hidden_states: Optional[bool] = None,
1695
+ return_dict: Optional[bool] = None,
1696
+ ) -> Union[Tuple[torch.Tensor], MultipleChoiceModelOutput]:
1697
+ r"""
1698
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1699
+ Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
1700
+ num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
1701
+ `input_ids` above)
1702
+ """
1703
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1704
+ num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
1705
+
1706
+ flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
1707
+ flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
1708
+ flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
1709
+ flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
1710
+ flat_inputs_embeds = (
1711
+ inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
1712
+ if inputs_embeds is not None
1713
+ else None
1714
+ )
1715
+
1716
+ outputs = self.roberta(
1717
+ flat_input_ids,
1718
+ position_ids=flat_position_ids,
1719
+ token_type_ids=flat_token_type_ids,
1720
+ attention_mask=flat_attention_mask,
1721
+ head_mask=head_mask,
1722
+ inputs_embeds=flat_inputs_embeds,
1723
+ output_attentions=output_attentions,
1724
+ output_hidden_states=output_hidden_states,
1725
+ return_dict=return_dict,
1726
+ )
1727
+ pooled_output = outputs[1]
1728
+
1729
+ pooled_output = self.dropout(pooled_output)
1730
+ logits = self.classifier(pooled_output)
1731
+ reshaped_logits = logits.view(-1, num_choices)
1732
+
1733
+ loss = None
1734
+ if labels is not None:
1735
+ # move labels to correct device to enable model parallelism
1736
+ labels = labels.to(reshaped_logits.device)
1737
+ loss_fct = CrossEntropyLoss()
1738
+ loss = loss_fct(reshaped_logits, labels)
1739
+
1740
+ if not return_dict:
1741
+ output = (reshaped_logits,) + outputs[2:]
1742
+ return ((loss,) + output) if loss is not None else output
1743
+
1744
+ return MultipleChoiceModelOutput(
1745
+ loss=loss,
1746
+ logits=reshaped_logits,
1747
+ hidden_states=outputs.hidden_states,
1748
+ attentions=outputs.attentions,
1749
+ )
1750
+
1751
+
1752
+ @add_start_docstrings(
1753
+ """
1754
+ Roberta Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
1755
+ Named-Entity-Recognition (NER) tasks.
1756
+ """,
1757
+ ROBERTA_START_DOCSTRING,
1758
+ )
1759
+ class RobertaForTokenClassification(RobertaPreTrainedModel):
1760
+ def __init__(self, config):
1761
+ super().__init__(config)
1762
+ self.num_labels = config.num_labels
1763
+
1764
+ self.roberta = RobertaModel(config, add_pooling_layer=False)
1765
+ classifier_dropout = (
1766
+ config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
1767
+ )
1768
+ self.dropout = nn.Dropout(classifier_dropout)
1769
+ self.classifier = nn.Linear(config.hidden_size, config.num_labels)
1770
+
1771
+ # Initialize weights and apply final processing
1772
+ self.post_init()
1773
+
1774
+ @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
1775
+ @add_code_sample_docstrings(
1776
+ checkpoint="Jean-Baptiste/roberta-large-ner-english",
1777
+ output_type=TokenClassifierOutput,
1778
+ config_class=_CONFIG_FOR_DOC,
1779
+ expected_output="['O', 'ORG', 'ORG', 'O', 'O', 'O', 'O', 'O', 'LOC', 'O', 'LOC', 'LOC']",
1780
+ expected_loss=0.01,
1781
+ )
1782
+ def forward(
1783
+ self,
1784
+ input_ids: Optional[torch.LongTensor] = None,
1785
+ attention_mask: Optional[torch.FloatTensor] = None,
1786
+ token_type_ids: Optional[torch.LongTensor] = None,
1787
+ position_ids: Optional[torch.LongTensor] = None,
1788
+ head_mask: Optional[torch.FloatTensor] = None,
1789
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1790
+ labels: Optional[torch.LongTensor] = None,
1791
+ output_attentions: Optional[bool] = None,
1792
+ output_hidden_states: Optional[bool] = None,
1793
+ return_dict: Optional[bool] = None,
1794
+ ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
1795
+ r"""
1796
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1797
+ Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
1798
+ """
1799
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1800
+
1801
+ outputs = self.roberta(
1802
+ input_ids,
1803
+ attention_mask=attention_mask,
1804
+ token_type_ids=token_type_ids,
1805
+ position_ids=position_ids,
1806
+ head_mask=head_mask,
1807
+ inputs_embeds=inputs_embeds,
1808
+ output_attentions=output_attentions,
1809
+ output_hidden_states=output_hidden_states,
1810
+ return_dict=return_dict,
1811
+ )
1812
+
1813
+ sequence_output = outputs[0]
1814
+
1815
+ sequence_output = self.dropout(sequence_output)
1816
+ logits = self.classifier(sequence_output)
1817
+
1818
+ loss = None
1819
+ if labels is not None:
1820
+ # move labels to correct device to enable model parallelism
1821
+ labels = labels.to(logits.device)
1822
+ loss_fct = CrossEntropyLoss()
1823
+ loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
1824
+
1825
+ if not return_dict:
1826
+ output = (logits,) + outputs[2:]
1827
+ return ((loss,) + output) if loss is not None else output
1828
+
1829
+ return TokenClassifierOutput(
1830
+ loss=loss,
1831
+ logits=logits,
1832
+ hidden_states=outputs.hidden_states,
1833
+ attentions=outputs.attentions,
1834
+ )
1835
+
1836
+
1837
+ class RobertaClassificationHead(nn.Module):
1838
+ """Head for sentence-level classification tasks."""
1839
+
1840
+ def __init__(self, config):
1841
+ super().__init__()
1842
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
1843
+ classifier_dropout = (
1844
+ config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
1845
+ )
1846
+ self.dropout = nn.Dropout(classifier_dropout)
1847
+ self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
1848
+
1849
+ def forward(self, features, **kwargs):
1850
+ x = features[:, 0, :] # take <s> token (equiv. to [CLS])
1851
+ x = self.dropout(x)
1852
+ x = self.dense(x)
1853
+ x = torch.tanh(x)
1854
+ x = self.dropout(x)
1855
+ x = self.out_proj(x)
1856
+ return x
1857
+
1858
+
1859
+ @add_start_docstrings(
1860
+ """
1861
+ Roberta Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
1862
+ layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
1863
+ """,
1864
+ ROBERTA_START_DOCSTRING,
1865
+ )
1866
+ class RobertaForQuestionAnswering(RobertaPreTrainedModel):
1867
+ def __init__(self, config):
1868
+ super().__init__(config)
1869
+ self.num_labels = config.num_labels
1870
+
1871
+ self.roberta = RobertaModel(config, add_pooling_layer=False)
1872
+ self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
1873
+
1874
+ # Initialize weights and apply final processing
1875
+ self.post_init()
1876
+
1877
+ @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
1878
+ @add_code_sample_docstrings(
1879
+ checkpoint="deepset/roberta-base-squad2",
1880
+ output_type=QuestionAnsweringModelOutput,
1881
+ config_class=_CONFIG_FOR_DOC,
1882
+ expected_output="' puppet'",
1883
+ expected_loss=0.86,
1884
+ )
1885
+ def forward(
1886
+ self,
1887
+ input_ids: Optional[torch.LongTensor] = None,
1888
+ attention_mask: Optional[torch.FloatTensor] = None,
1889
+ token_type_ids: Optional[torch.LongTensor] = None,
1890
+ position_ids: Optional[torch.LongTensor] = None,
1891
+ head_mask: Optional[torch.FloatTensor] = None,
1892
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1893
+ start_positions: Optional[torch.LongTensor] = None,
1894
+ end_positions: Optional[torch.LongTensor] = None,
1895
+ output_attentions: Optional[bool] = None,
1896
+ output_hidden_states: Optional[bool] = None,
1897
+ return_dict: Optional[bool] = None,
1898
+ ) -> Union[Tuple[torch.Tensor], QuestionAnsweringModelOutput]:
1899
+ r"""
1900
+ start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1901
+ Labels for position (index) of the start of the labelled span for computing the token classification loss.
1902
+ Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
1903
+ are not taken into account for computing the loss.
1904
+ end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1905
+ Labels for position (index) of the end of the labelled span for computing the token classification loss.
1906
+ Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
1907
+ are not taken into account for computing the loss.
1908
+ """
1909
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1910
+
1911
+ outputs = self.roberta(
1912
+ input_ids,
1913
+ attention_mask=attention_mask,
1914
+ token_type_ids=token_type_ids,
1915
+ position_ids=position_ids,
1916
+ head_mask=head_mask,
1917
+ inputs_embeds=inputs_embeds,
1918
+ output_attentions=output_attentions,
1919
+ output_hidden_states=output_hidden_states,
1920
+ return_dict=return_dict,
1921
+ )
1922
+
1923
+ sequence_output = outputs[0]
1924
+
1925
+ logits = self.qa_outputs(sequence_output)
1926
+ start_logits, end_logits = logits.split(1, dim=-1)
1927
+ start_logits = start_logits.squeeze(-1).contiguous()
1928
+ end_logits = end_logits.squeeze(-1).contiguous()
1929
+
1930
+ total_loss = None
1931
+ if start_positions is not None and end_positions is not None:
1932
+ # If we are on multi-GPU, the split adds a dimension, so squeeze it
1933
+ if len(start_positions.size()) > 1:
1934
+ start_positions = start_positions.squeeze(-1)
1935
+ if len(end_positions.size()) > 1:
1936
+ end_positions = end_positions.squeeze(-1)
1937
+ # sometimes the start/end positions are outside our model inputs, we ignore these terms
1938
+ ignored_index = start_logits.size(1)
1939
+ start_positions = start_positions.clamp(0, ignored_index)
1940
+ end_positions = end_positions.clamp(0, ignored_index)
1941
+
1942
+ loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
1943
+ start_loss = loss_fct(start_logits, start_positions)
1944
+ end_loss = loss_fct(end_logits, end_positions)
1945
+ total_loss = (start_loss + end_loss) / 2
1946
+
1947
+ if not return_dict:
1948
+ output = (start_logits, end_logits) + outputs[2:]
1949
+ return ((total_loss,) + output) if total_loss is not None else output
1950
+
1951
+ return QuestionAnsweringModelOutput(
1952
+ loss=total_loss,
1953
+ start_logits=start_logits,
1954
+ end_logits=end_logits,
1955
+ hidden_states=outputs.hidden_states,
1956
+ attentions=outputs.attentions,
1957
+ )
1958
+
1959
+
1960
+ def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
1961
+ """
1962
+ Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
1963
+ are ignored. This is modified from fairseq's `utils.make_positions`.
1964
+
1965
+ Args:
1966
+ input_ids (torch.Tensor), padding_idx (int), past_key_values_length (int, defaults to 0)
1967
+
1968
+ Returns: torch.Tensor
1969
+ """
1970
+ # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA.
1971
+ mask = input_ids.ne(padding_idx).int()
1972
+ incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
1973
+ return incremental_indices.long() + padding_idx
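A minimal usage sketch for the custom modeling code above, assuming the repository routes the Auto classes to these implementations; the repository id below is a placeholder and the example sentence is arbitrary. `trust_remote_code=True` is required because the classes are defined in this repository rather than in `transformers`:

```python
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM

repo_id = "namespace/this-model"  # placeholder: substitute the actual repository id

tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForMaskedLM.from_pretrained(repo_id, trust_remote_code=True)
model.eval()

# fill the <mask> token defined in special_tokens_map.json below
inputs = tokenizer("The capital of France is <mask>.", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits

# locate the masked position(s) and decode the top prediction for each
mask_positions = (inputs.input_ids == tokenizer.mask_token_id).nonzero(as_tuple=True)[1]
predicted_ids = logits[0, mask_positions].argmax(dim=-1)
print(tokenizer.decode(predicted_ids))
```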
special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "cls_token": {
10
+ "content": "<s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "eos_token": {
17
+ "content": "</s>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "mask_token": {
24
+ "content": "<mask>",
25
+ "lstrip": true,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "pad_token": {
31
+ "content": "<pad>",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ },
37
+ "sep_token": {
38
+ "content": "</s>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false
43
+ },
44
+ "unk_token": {
45
+ "content": "<unk>",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false
50
+ }
51
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,562 @@
1
+ {
2
+ "add_prefix_space": true,
3
+ "added_tokens_decoder": {
4
+ "0": {
5
+ "content": "<s>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "1": {
13
+ "content": "<pad>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "2": {
21
+ "content": "</s>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "3": {
29
+ "content": "<unk>",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "128000": {
37
+ "content": "<mask>",
38
+ "lstrip": true,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ },
44
+ "128001": {
45
+ "content": "<user_token_1>",
46
+ "lstrip": false,
47
+ "normalized": true,
48
+ "rstrip": false,
49
+ "single_word": false,
50
+ "special": false
51
+ },
52
+ "128002": {
53
+ "content": "<user_token_2>",
54
+ "lstrip": false,
55
+ "normalized": true,
56
+ "rstrip": false,
57
+ "single_word": false,
58
+ "special": false
59
+ },
60
+ "128003": {
61
+ "content": "<user_token_3>",
62
+ "lstrip": false,
63
+ "normalized": true,
64
+ "rstrip": false,
65
+ "single_word": false,
66
+ "special": false
67
+ },
68
+ "128004": {
69
+ "content": "<user_token_4>",
70
+ "lstrip": false,
71
+ "normalized": true,
72
+ "rstrip": false,
73
+ "single_word": false,
74
+ "special": false
75
+ },
76
+ "128005": {
77
+ "content": "<user_token_5>",
78
+ "lstrip": false,
79
+ "normalized": true,
80
+ "rstrip": false,
81
+ "single_word": false,
82
+ "special": false
83
+ },
84
+ "128006": {
85
+ "content": "<user_token_6>",
86
+ "lstrip": false,
87
+ "normalized": true,
88
+ "rstrip": false,
89
+ "single_word": false,
90
+ "special": false
91
+ },
92
+ "128007": {
93
+ "content": "<user_token_7>",
94
+ "lstrip": false,
95
+ "normalized": true,
96
+ "rstrip": false,
97
+ "single_word": false,
98
+ "special": false
99
+ },
100
+ "128008": {
101
+ "content": "<user_token_8>",
102
+ "lstrip": false,
103
+ "normalized": true,
104
+ "rstrip": false,
105
+ "single_word": false,
106
+ "special": false
107
+ },
108
+ "128009": {
109
+ "content": "<user_token_9>",
110
+ "lstrip": false,
111
+ "normalized": true,
112
+ "rstrip": false,
113
+ "single_word": false,
114
+ "special": false
115
+ },
116
+ "128010": {
117
+ "content": "<user_token_10>",
118
+ "lstrip": false,
119
+ "normalized": true,
120
+ "rstrip": false,
121
+ "single_word": false,
122
+ "special": false
123
+ },
124
+ "128011": {
125
+ "content": "<user_token_11>",
126
+ "lstrip": false,
127
+ "normalized": true,
128
+ "rstrip": false,
129
+ "single_word": false,
130
+ "special": false
131
+ },
132
+ "128012": {
133
+ "content": "<user_token_12>",
134
+ "lstrip": false,
135
+ "normalized": true,
136
+ "rstrip": false,
137
+ "single_word": false,
138
+ "special": false
139
+ },
140
+ "128013": {
141
+ "content": "<user_token_13>",
142
+ "lstrip": false,
143
+ "normalized": true,
144
+ "rstrip": false,
145
+ "single_word": false,
146
+ "special": false
147
+ },
148
+ "128014": {
149
+ "content": "<user_token_14>",
150
+ "lstrip": false,
151
+ "normalized": true,
152
+ "rstrip": false,
153
+ "single_word": false,
154
+ "special": false
155
+ },
156
+ "128015": {
157
+ "content": "<user_token_15>",
158
+ "lstrip": false,
159
+ "normalized": true,
160
+ "rstrip": false,
161
+ "single_word": false,
162
+ "special": false
163
+ },
164
+ "128016": {
165
+ "content": "<user_token_16>",
166
+ "lstrip": false,
167
+ "normalized": true,
168
+ "rstrip": false,
169
+ "single_word": false,
170
+ "special": false
171
+ },
172
+ "128017": {
173
+ "content": "<user_token_17>",
174
+ "lstrip": false,
175
+ "normalized": true,
176
+ "rstrip": false,
177
+ "single_word": false,
178
+ "special": false
179
+ },
180
+ "128018": {
181
+ "content": "<user_token_18>",
182
+ "lstrip": false,
183
+ "normalized": true,
184
+ "rstrip": false,
185
+ "single_word": false,
186
+ "special": false
187
+ },
188
+ "128019": {
189
+ "content": "<user_token_19>",
190
+ "lstrip": false,
191
+ "normalized": true,
192
+ "rstrip": false,
193
+ "single_word": false,
194
+ "special": false
195
+ },
196
+ "128020": {
197
+ "content": "<user_token_20>",
198
+ "lstrip": false,
199
+ "normalized": true,
200
+ "rstrip": false,
201
+ "single_word": false,
202
+ "special": false
203
+ },
204
+ "128021": {
205
+ "content": "<user_token_21>",
206
+ "lstrip": false,
207
+ "normalized": true,
208
+ "rstrip": false,
209
+ "single_word": false,
210
+ "special": false
211
+ },
212
+ "128022": {
213
+ "content": "<user_token_22>",
214
+ "lstrip": false,
215
+ "normalized": true,
216
+ "rstrip": false,
217
+ "single_word": false,
218
+ "special": false
219
+ },
220
+ "128023": {
221
+ "content": "<user_token_23>",
222
+ "lstrip": false,
223
+ "normalized": true,
224
+ "rstrip": false,
225
+ "single_word": false,
226
+ "special": false
227
+ },
228
+ "128024": {
229
+ "content": "<user_token_24>",
230
+ "lstrip": false,
231
+ "normalized": true,
232
+ "rstrip": false,
233
+ "single_word": false,
234
+ "special": false
235
+ },
236
+ "128025": {
237
+ "content": "<user_token_25>",
238
+ "lstrip": false,
239
+ "normalized": true,
240
+ "rstrip": false,
241
+ "single_word": false,
242
+ "special": false
243
+ },
244
+ "128026": {
245
+ "content": "<user_token_26>",
246
+ "lstrip": false,
247
+ "normalized": true,
248
+ "rstrip": false,
249
+ "single_word": false,
250
+ "special": false
251
+ },
252
+ "128027": {
253
+ "content": "<user_token_27>",
254
+ "lstrip": false,
255
+ "normalized": true,
256
+ "rstrip": false,
257
+ "single_word": false,
258
+ "special": false
259
+ },
260
+ "128028": {
261
+ "content": "<user_token_28>",
262
+ "lstrip": false,
263
+ "normalized": true,
264
+ "rstrip": false,
265
+ "single_word": false,
266
+ "special": false
267
+ },
268
+ "128029": {
269
+ "content": "<user_token_29>",
270
+ "lstrip": false,
271
+ "normalized": true,
272
+ "rstrip": false,
273
+ "single_word": false,
274
+ "special": false
275
+ },
276
+ "128030": {
277
+ "content": "<user_token_30>",
278
+ "lstrip": false,
279
+ "normalized": true,
280
+ "rstrip": false,
281
+ "single_word": false,
282
+ "special": false
283
+ },
284
+ "128031": {
285
+ "content": "<user_token_31>",
286
+ "lstrip": false,
287
+ "normalized": true,
288
+ "rstrip": false,
289
+ "single_word": false,
290
+ "special": false
291
+ },
292
+ "128032": {
293
+ "content": "<user_token_32>",
294
+ "lstrip": false,
295
+ "normalized": true,
296
+ "rstrip": false,
297
+ "single_word": false,
298
+ "special": false
299
+ },
300
+ "128033": {
301
+ "content": "<user_token_33>",
302
+ "lstrip": false,
303
+ "normalized": true,
304
+ "rstrip": false,
305
+ "single_word": false,
306
+ "special": false
307
+ },
308
+ "128034": {
309
+ "content": "<user_token_34>",
310
+ "lstrip": false,
311
+ "normalized": true,
312
+ "rstrip": false,
313
+ "single_word": false,
314
+ "special": false
315
+ },
316
+ "128035": {
317
+ "content": "<user_token_35>",
318
+ "lstrip": false,
319
+ "normalized": true,
320
+ "rstrip": false,
321
+ "single_word": false,
322
+ "special": false
323
+ },
324
+ "128036": {
325
+ "content": "<user_token_36>",
326
+ "lstrip": false,
327
+ "normalized": true,
328
+ "rstrip": false,
329
+ "single_word": false,
330
+ "special": false
331
+ },
332
+ "128037": {
333
+ "content": "<user_token_37>",
334
+ "lstrip": false,
335
+ "normalized": true,
336
+ "rstrip": false,
337
+ "single_word": false,
338
+ "special": false
339
+ },
340
+ "128038": {
341
+ "content": "<user_token_38>",
342
+ "lstrip": false,
343
+ "normalized": true,
344
+ "rstrip": false,
345
+ "single_word": false,
346
+ "special": false
347
+ },
348
+ "128039": {
349
+ "content": "<user_token_39>",
350
+ "lstrip": false,
351
+ "normalized": true,
352
+ "rstrip": false,
353
+ "single_word": false,
354
+ "special": false
355
+ },
356
+ "128040": {
357
+ "content": "<user_token_40>",
358
+ "lstrip": false,
359
+ "normalized": true,
360
+ "rstrip": false,
361
+ "single_word": false,
362
+ "special": false
363
+ },
364
+ "128041": {
365
+ "content": "<user_token_41>",
366
+ "lstrip": false,
367
+ "normalized": true,
368
+ "rstrip": false,
369
+ "single_word": false,
370
+ "special": false
371
+ },
372
+ "128042": {
373
+ "content": "<user_token_42>",
374
+ "lstrip": false,
375
+ "normalized": true,
376
+ "rstrip": false,
377
+ "single_word": false,
378
+ "special": false
379
+ },
380
+ "128043": {
381
+ "content": "<user_token_43>",
382
+ "lstrip": false,
383
+ "normalized": true,
384
+ "rstrip": false,
385
+ "single_word": false,
386
+ "special": false
387
+ },
388
+ "128044": {
389
+ "content": "<user_token_44>",
390
+ "lstrip": false,
391
+ "normalized": true,
392
+ "rstrip": false,
393
+ "single_word": false,
394
+ "special": false
395
+ },
396
+ "128045": {
397
+ "content": "<user_token_45>",
398
+ "lstrip": false,
399
+ "normalized": true,
400
+ "rstrip": false,
401
+ "single_word": false,
402
+ "special": false
403
+ },
404
+ "128046": {
405
+ "content": "<user_token_46>",
406
+ "lstrip": false,
407
+ "normalized": true,
408
+ "rstrip": false,
409
+ "single_word": false,
410
+ "special": false
411
+ },
412
+ "128047": {
413
+ "content": "<user_token_47>",
414
+ "lstrip": false,
415
+ "normalized": true,
416
+ "rstrip": false,
417
+ "single_word": false,
418
+ "special": false
419
+ },
420
+ "128048": {
421
+ "content": "<user_token_48>",
422
+ "lstrip": false,
423
+ "normalized": true,
424
+ "rstrip": false,
425
+ "single_word": false,
426
+ "special": false
427
+ },
428
+ "128049": {
429
+ "content": "<user_token_49>",
430
+ "lstrip": false,
431
+ "normalized": true,
432
+ "rstrip": false,
433
+ "single_word": false,
434
+ "special": false
435
+ },
436
+ "128050": {
437
+ "content": "<user_token_50>",
438
+ "lstrip": false,
439
+ "normalized": true,
440
+ "rstrip": false,
441
+ "single_word": false,
442
+ "special": false
443
+ },
444
+ "128051": {
445
+ "content": "<user_token_51>",
446
+ "lstrip": false,
447
+ "normalized": true,
448
+ "rstrip": false,
449
+ "single_word": false,
450
+ "special": false
451
+ },
452
+ "128052": {
453
+ "content": "<user_token_52>",
454
+ "lstrip": false,
455
+ "normalized": true,
456
+ "rstrip": false,
457
+ "single_word": false,
458
+ "special": false
459
+ },
460
+ "128053": {
461
+ "content": "<user_token_53>",
462
+ "lstrip": false,
463
+ "normalized": true,
464
+ "rstrip": false,
465
+ "single_word": false,
466
+ "special": false
467
+ },
468
+ "128054": {
469
+ "content": "<user_token_54>",
470
+ "lstrip": false,
471
+ "normalized": true,
472
+ "rstrip": false,
473
+ "single_word": false,
474
+ "special": false
475
+ },
476
+ "128055": {
477
+ "content": "<user_token_55>",
478
+ "lstrip": false,
479
+ "normalized": true,
480
+ "rstrip": false,
481
+ "single_word": false,
482
+ "special": false
483
+ },
484
+ "128056": {
485
+ "content": "<user_token_56>",
486
+ "lstrip": false,
487
+ "normalized": true,
488
+ "rstrip": false,
489
+ "single_word": false,
490
+ "special": false
491
+ },
492
+ "128057": {
493
+ "content": "<user_token_57>",
494
+ "lstrip": false,
495
+ "normalized": true,
496
+ "rstrip": false,
497
+ "single_word": false,
498
+ "special": false
499
+ },
500
+ "128058": {
501
+ "content": "<user_token_58>",
502
+ "lstrip": false,
503
+ "normalized": true,
504
+ "rstrip": false,
505
+ "single_word": false,
506
+ "special": false
507
+ },
508
+ "128059": {
509
+ "content": "<user_token_59>",
510
+ "lstrip": false,
511
+ "normalized": true,
512
+ "rstrip": false,
513
+ "single_word": false,
514
+ "special": false
515
+ },
516
+ "128060": {
517
+ "content": "<user_token_60>",
518
+ "lstrip": false,
519
+ "normalized": true,
520
+ "rstrip": false,
521
+ "single_word": false,
522
+ "special": false
523
+ },
524
+ "128061": {
525
+ "content": "<user_token_61>",
526
+ "lstrip": false,
527
+ "normalized": true,
528
+ "rstrip": false,
529
+ "single_word": false,
530
+ "special": false
531
+ },
532
+ "128062": {
533
+ "content": "<user_token_62>",
534
+ "lstrip": false,
535
+ "normalized": true,
536
+ "rstrip": false,
537
+ "single_word": false,
538
+ "special": false
539
+ },
540
+ "128063": {
541
+ "content": "<user_token_63>",
542
+ "lstrip": false,
543
+ "normalized": true,
544
+ "rstrip": false,
545
+ "single_word": false,
546
+ "special": false
547
+ }
548
+ },
549
+ "bos_token": "<s>",
550
+ "clean_up_tokenization_spaces": false,
551
+ "cls_token": "<s>",
552
+ "eos_token": "</s>",
553
+ "errors": "replace",
554
+ "extra_special_tokens": {},
555
+ "mask_token": "<mask>",
556
+ "model_max_length": 8192,
557
+ "pad_token": "<pad>",
558
+ "sep_token": "</s>",
559
+ "tokenizer_class": "RobertaTokenizer",
560
+ "trim_offsets": true,
561
+ "unk_token": "<unk>"
562
+ }
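The per-id entries above register each <user_token_N> placeholder as a regular added token ("special": false, "normalized": true), and the closing block pins down the standard RoBERTa special tokens (<s>, </s>, <pad>, <unk>, <mask>) with a model_max_length of 8192. A minimal sketch of loading this configuration and checking the mapping; the repository id below is a placeholder, and trust_remote_code=True is only an assumption in case the repo ships custom classes:

    from transformers import AutoTokenizer

    # Hypothetical repository id -- replace with the actual model path.
    REPO_ID = "sdadas/example-long-roberta"

    # trust_remote_code is an assumption; drop it if the repo needs no custom code.
    tokenizer = AutoTokenizer.from_pretrained(REPO_ID, trust_remote_code=True)

    # User tokens are plain added tokens, so they map directly to the ids above.
    print(tokenizer.convert_tokens_to_ids("<user_token_14>"))  # expected: 128014
    print(tokenizer.mask_token)                                # <mask>
    print(tokenizer.model_max_length)                          # 8192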
unigram.json ADDED
The diff for this file is too large to render.
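unigram.json is listed as added but its diff is not rendered because of its size; it presumably holds the serialized unigram vocabulary backing this tokenizer. A structure-agnostic sketch for peeking at the raw file once it has been downloaded (for example with huggingface_hub); the filename is the only detail taken from the listing above, everything else is an assumption:

    import json

    # Path to the locally downloaded file; adjust as needed.
    with open("unigram.json", encoding="utf-8") as f:
        data = json.load(f)

    # Report the top-level shape without assuming a particular schema.
    if isinstance(data, dict):
        for key, value in data.items():
            size = len(value) if hasattr(value, "__len__") else value
            print(key, "->", size)
    else:
        print(type(data).__name__, "with", len(data), "entries")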