lvwerra HF Staff committed on
Commit
40a20a1
·
1 Parent(s): 3826872

add npmi logic

Browse files
Files changed (2) hide show
  1. npmi.py +192 -59
  2. requirements.txt +3 -2
npmi.py CHANGED
@@ -1,4 +1,4 @@
1
- # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
2
  #
3
  # Licensed under the Apache License, Version 2.0 (the "License");
4
  # you may not use this file except in compliance with the License.
@@ -11,85 +11,218 @@
11
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
  # See the License for the specific language governing permissions and
13
  # limitations under the License.
14
- """TODO: Add a description here."""
15
 
16
- import evaluate
17
- import datasets
 
18
 
 
 
 
 
 
19
 
20
- # TODO: Add BibTeX citation
21
  _CITATION = """\
22
- @InProceedings{huggingface:module,
23
- title = {A great new module},
24
- authors={huggingface, Inc.},
25
- year={2020}
26
- }
27
  """
28
 
29
- # TODO: Add description of the module here
30
  _DESCRIPTION = """\
31
- This new module is designed to solve this great ML task and is crafted with a lot of care.
 
32
  """
33
 
34
-
35
- # TODO: Add description of the arguments of the module here
36
- _KWARGS_DESCRIPTION = """
37
- Calculates how good are predictions given some references, using certain scores
38
  Args:
39
- predictions: list of predictions to score. Each predictions
40
- should be a string with tokens separated by spaces.
41
- references: list of reference for each prediction. Each
42
- reference should be a string with tokens separated by spaces.
43
  Returns:
44
- accuracy: description of the first score,
45
- another_score: description of the second score,
46
- Examples:
47
- Examples should be written in doctest format, and should illustrate how
48
- to use the function.
49
-
50
- >>> my_new_module = evaluate.load("my_new_module")
51
- >>> results = my_new_module.compute(references=[0, 1], predictions=[0, 1])
52
- >>> print(results)
53
- {'accuracy': 1.0}
54
  """
55
 
56
- # TODO: Define external resources urls if needed
57
- BAD_WORDS_URL = "http://url/to/external/resource/bad_words.txt"
 
 
58
 
 
 
59
 
60
- @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
61
- class nPMI(evaluate.Measurement):
62
- """TODO: Short description of my evaluation module."""
 
 
63
 
 
64
  def _info(self):
65
- # TODO: Specifies the evaluate.EvaluationModuleInfo object
66
  return evaluate.MeasurementInfo(
67
- # This is the description that will appear on the modules page.
68
  module_type="measurement",
69
  description=_DESCRIPTION,
70
  citation=_CITATION,
71
  inputs_description=_KWARGS_DESCRIPTION,
72
- # This defines the format of each prediction and reference
73
- features=datasets.Features({
74
- 'predictions': datasets.Value('int64'),
75
- 'references': datasets.Value('int64'),
76
- }),
77
- # Homepage of the module for documentation
78
- homepage="http://module.homepage",
79
- # Additional links to the codebase or references
80
- codebase_urls=["http://github.com/path/to/codebase/of/new_module"],
81
- reference_urls=["http://path.to.reference.url/new_module"]
 
 
 
 
 
 
82
  )
83
 
84
- def _download_and_prepare(self, dl_manager):
85
- """Optional: download external resources useful to compute the scores"""
86
- # TODO: Download external resources if needed
87
- pass
88
-
89
- def _compute(self, predictions, references):
90
- """Returns the scores"""
91
- # TODO: Compute the different scores of the module
92
- accuracy = sum(i == j for i, j in zip(predictions, references)) / len(predictions)
93
- return {
94
- "accuracy": accuracy,
95
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2021 The HuggingFace Team. All rights reserved.
2
  #
3
  # Licensed under the Apache License, Version 2.0 (the "License");
4
  # you may not use this file except in compliance with the License.
 
11
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
  # See the License for the specific language governing permissions and
13
  # limitations under the License.
 
14
 
15
+ # TODO: Change print statements to logging?
16
+ # from evaluate import logging as logs
17
+ import warnings
18
 
19
+ import datasets
20
+ import evaluate
21
+ import numpy as np
22
+ import pandas as pd
23
+ from sklearn.preprocessing import MultiLabelBinarizer
24
 
 
25
# Reference for the measurement implemented in this module.
_CITATION = """\
Osman Aka, Ken Burke, Alex Bauerle, Christina Greer, and Margaret Mitchell. \
2021. Measuring Model Biases in the Absence of Ground Truth. \
In Proceedings of the 2021 AAAI/ACM Conference on AI, Ethics, and Society \
(AIES '21). Association for Computing Machinery, New York, NY, USA, 327–335. \
https://doi.org/10.1145/3461702.3462557
"""

_DESCRIPTION = """\
Normalized Pointwise Mutual Information (nPMI) is an entropy-based measurement
of association, used here to measure the association between words.
"""

_KWARGS_DESCRIPTION = """\
Args:
    data (list of lists or list of str): Sentences, either already tokenized
        (one list of tokens per sentence) or as plain strings, which are
        split on whitespace.
    vocab_counts (dict or dataframe): Vocab terms and their counts. A dict
        maps each term to its count; a dataframe is indexed by term and has
        a "count" column.
    subgroup (str): The vocab term whose association with every other vocab
        term is measured. It must be present in vocab_counts.
Returns:
    A dict with the following entries:
    bias: the maximum nPMI score plus the absolute value of the minimum
        nPMI score.
    co-occurrences: A dataframe with, for each vocab term, its
        co-occurrence count with the subgroup term.
    pmi: A dataframe with the PMI score of each vocab term.
    npmi: A dataframe with the nPMI score of each vocab term.
"""

# TODO: Is this necessary?
warnings.filterwarnings(action="ignore", category=UserWarning)
# When we divide by 0 in log
np.seterr(divide="ignore")

# Treat inf values as NaN as well, so that dropna() also removes them.
# The option is deprecated in recent pandas releases: silence the deprecation
# warning it emits, and do not fail at import time when the installed pandas
# no longer registers the option at all (the lookup error is a KeyError).
try:
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=FutureWarning)
        pd.set_option("use_inf_as_na", True)
except KeyError:
    pass

# This can be changed to whatever a person likes;
# it is the number of batches to use when iterating through the vocabulary.
_NUM_BATCHES = 500
# Column names used in the vocabulary dataframe.
PROP = "proportion"
CNT = "count"
60
 
61
class nPMI(evaluate.Measurement):
    """Measure how strongly each vocab term is associated with a subgroup term.

    The association is computed from sentence-level co-occurrence counts and
    reported as PMI and normalized PMI (nPMI) scores.
    """

    def _info(self):
        """Return the module metadata (description, citation, input schema)."""
        return evaluate.MeasurementInfo(
            module_type="measurement",
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            # Two accepted schemas for "data": a sequence of strings
            # (tokenized sentence) or a single string (raw sentence).
            features=[
                datasets.Features(
                    {
                        "data": datasets.Sequence(
                            datasets.Value("string", id="sequence"),
                            id="data"),
                    }
                ),
                datasets.Features(
                    {
                        "data": datasets.Value("string", id="data"),
                    }
                ),
            ],
            # TODO: Create docs for this.
            # reference_urls=["https://huggingface.co/docs/..."],
        )

    def _compute(self, data, vocab_counts, subgroup):
        """Compute co-occurrence, PMI and nPMI scores for ``subgroup``.

        Args:
            data: Sentences, either lists of tokens or plain strings (which
                are split on whitespace).
            vocab_counts: Either a dict mapping each vocab term to its count,
                or a dataframe indexed by vocab term with a ``CNT`` column.
            subgroup: The vocab term to measure associations against.

        Returns:
            A dict with the entries "bias", "co-occurrences", "pmi" and
            "npmi", or None (after printing a message) when ``vocab_counts``
            is neither a dict nor a dataframe.

        Raises:
            KeyError: If ``subgroup`` is not a term of the vocabulary.
        """
        if isinstance(vocab_counts, dict):
            vocab_counts_df = pd.DataFrame.from_dict(
                vocab_counts, orient="index", columns=[CNT])
        elif isinstance(vocab_counts, pd.DataFrame):
            # Work on a copy: a proportion column is added below, and the
            # caller's dataframe must not be modified as a side effect.
            vocab_counts_df = vocab_counts.copy()
        else:
            print("Can't support the data structure for the vocab counts. =(")
            return
        if subgroup not in vocab_counts_df.index:
            raise KeyError(
                "Subgroup term %r is not in the vocabulary." % (subgroup,))
        # Raw strings are tokenized on whitespace; the length check avoids an
        # IndexError on empty input.
        if len(data) > 0 and isinstance(data[0], str):
            data = [sentence.split() for sentence in data]
        vocab_counts_df[PROP] = (
            vocab_counts_df[CNT] / vocab_counts_df[CNT].sum())
        # These are used throughout the rest of the functions
        self.data = data
        self.vocab_counts_df = vocab_counts_df
        # self.mlb_list holds one binarized matrix per batch of sentences
        self.mlb_list = []
        # Position of the subgroup word in the vocabulary, i.e. its column
        # in the binarized sentence matrices.
        subgroup_idx = vocab_counts_df.index.get_loc(subgroup)
        print("Calculating co-occurrences...")
        df_coo = self.calc_cooccurrences(subgroup, subgroup_idx)
        vocab_cooc_df = self.set_idx_cols(df_coo, subgroup)
        print("Calculating PMI...")
        pmi_df = self.calc_PMI(vocab_cooc_df, subgroup)
        print("Calculating nPMI...")
        npmi_df = self.calc_nPMI(pmi_df, vocab_cooc_df, subgroup)
        npmi_bias = npmi_df.max(axis=0) + abs(npmi_df.min(axis=0))
        return {"bias": npmi_bias, "co-occurrences": vocab_cooc_df,
                "pmi": pmi_df, "npmi": npmi_df}

    @staticmethod
    def _drop_non_finite(df):
        """Return ``df`` without the rows that hold NaN or +/-inf values."""
        # Done explicitly so the result does not depend on the deprecated
        # global pandas option that makes dropna() treat inf as missing.
        return df.replace([np.inf, -np.inf], np.nan).dropna()

    def _binarize_words_in_sentence(self):
        """Fill ``self.mlb_list`` with one word-occurrence matrix per batch.

        Each matrix has one row per sentence of the batch and one column per
        vocab term (in the order of ``self.vocab_counts_df.index``).
        """
        print("Creating co-occurrence matrix for PMI calculations.")
        num_sentences = len(self.data)
        # N batches need N + 1 boundaries. Never make more batches than there
        # are sentences (they would be empty), but always make at least one so
        # that a matrix exists even for empty input.
        num_boundaries = max(2, min(_NUM_BATCHES, num_sentences + 1))
        batches = np.linspace(0, num_sentences, num_boundaries).astype(int)
        num_batches = len(batches) - 1
        # The classes are fixed to the vocabulary, so one binarizer can be
        # reused for every batch.
        mlb = MultiLabelBinarizer(classes=self.vocab_counts_df.index)
        for i in range(num_batches):
            print(
                "%s of %s sentence binarize batches." % (
                    str(i), str(num_batches))
            )
            # Returns an array of shape (batch size, num_words) marking the
            # occurrence of each word per sentence.
            mlb_series = mlb.fit_transform(
                self.data[batches[i]:batches[i + 1]]
            )
            self.mlb_list.append(mlb_series)

    def calc_cooccurrences(self, subgroup, subgroup_idx):
        """Count, for every vocab term, its co-occurrences with the subgroup.

        Args:
            subgroup: The subgroup term (not used in the computation itself;
                kept for interface compatibility).
            subgroup_idx: Column of the subgroup term in the binarized
                sentence matrices.

        Returns:
            A single-column dataframe with one row per vocab position,
            holding the co-occurrence counts summed over all batches.
        """
        coo_df = None
        # Big computation here! Should only happen once.
        print(
            "Approaching big computation! Here, we binarize all words in the sentences, making a sparse matrix of sentences."
        )
        if not self.mlb_list:
            self._binarize_words_in_sentence()
        num_batches = len(self.mlb_list)
        for batch_id, batch_sentence_row in enumerate(self.mlb_list):
            print(
                "%s of %s co-occurrence count batches"
                % (str(batch_id), str(num_batches))
            )
            # Dataframe of # sentences in batch x vocabulary size
            sent_batch_df = pd.DataFrame(batch_sentence_row)
            # Keep only the sentences that contain the subgroup term.
            # This way we have less computation & resources needs.
            subgroup_rows = sent_batch_df[sent_batch_df[subgroup_idx] > 0]
            # Subgroup occurrence per remaining sentence.
            subgroup_occurrences = subgroup_rows[subgroup_idx]
            # Co-occurrence of the subgroup with every word: for each word,
            # sum over sentences of (word occurrence * subgroup occurrence).
            batch_coo_df = pd.DataFrame(
                subgroup_rows.T.dot(subgroup_occurrences))
            # Accumulate the per-batch counts into a running total.
            if coo_df is None:
                coo_df = batch_coo_df
            else:
                coo_df = coo_df.add(batch_coo_df, fill_value=0)
        print("Returning co-occurrence matrix")
        return pd.DataFrame(coo_df)

    def set_idx_cols(self, df_coo, subgroup):
        """Label the co-occurrence counts with the vocab terms.

        :param df_coo: Co-occurrence counts for subgroup, length is num_words
        :param subgroup: The subgroup term, used to name the count column
        :return: Dataframe indexed by vocab term with a single integer
            column named "<subgroup>-count"
        """
        count_col = subgroup + "-count"
        count_df = df_coo.set_index(self.vocab_counts_df.index)
        count_df.columns = [count_col]
        count_df[count_col] = count_df[count_col].astype(int)
        return count_df

    def calc_PMI(self, vocab_cooc_df, subgroup):
        """Compute the PMI of every vocab term with the subgroup term.

        # PMI(x;y) = h(y) - h(y|x)
        #          = h(subgroup) - h(subgroup|word)
        #          = log (p(subgroup|word) / p(subgroup))
        # nPMI additionally divides by -log(p(x,y)) = -log(p(x|y)p(y))

        :param vocab_cooc_df: Dataframe with a "<subgroup>-count" column,
            indexed by vocab term
        :param subgroup: The subgroup term
        :return: Dataframe with a "<subgroup>-pmi" column; terms whose score
            is NaN or infinite (e.g. zero co-occurrences) are dropped
        """
        counts = self.vocab_counts_df[CNT]
        # Calculation of p(subgroup)
        # TODO: Is this better?
        # subgroup_prob = vocab_counts_df.loc[subgroup][PROP]
        subgroup_prob = self.vocab_counts_df.loc[subgroup, CNT] / counts.sum()
        # Calculation of p(subgroup|word) = count(subgroup,word) / count(word)
        # Both series are indexed by the vocab words, so the division
        # aligns on the index.
        p_subgroup_g_word = vocab_cooc_df[subgroup + "-count"] / counts
        pmi_df = pd.DataFrame(
            {subgroup + "-pmi": np.log(p_subgroup_g_word / subgroup_prob)})
        return self._drop_non_finite(pmi_df)

    def calc_nPMI(self, pmi_df, vocab_cooc_df, subgroup):
        """Normalize the PMI scores of the vocab terms.

        # nPMI additionally divides by -log(p(x,y)) = -log(p(x|y)p(y))
        # The normalizer computed here is -log(p(word|subgroup)p(word))

        NOTE(review): with x = word and y = subgroup, the identity above
        gives p(word|subgroup) * p(subgroup) as the joint probability, while
        the code multiplies by p(word). The original computation is kept
        unchanged here; confirm which one is intended.

        :param pmi_df: Dataframe with a "<subgroup>-pmi" column, as returned
            by calc_PMI
        :param vocab_cooc_df: Dataframe with a "<subgroup>-count" column,
            indexed by vocab term
        :param subgroup: The subgroup term
        :return: Dataframe with a "<subgroup>-npmi" column; terms whose score
            is NaN or infinite are dropped
        """
        cooc_counts = vocab_cooc_df[subgroup + "-count"]
        p_word_g_subgroup = cooc_counts / cooc_counts.sum()
        # Proportion of each term that has a PMI score; a vectorized lookup
        # that also works when pmi_df is empty.
        p_word = self.vocab_counts_df.loc[pmi_df.index, PROP]
        normalize_pmi = -np.log(p_word_g_subgroup * p_word)
        npmi_df = pd.DataFrame(
            {subgroup + "-npmi": pmi_df[subgroup + "-pmi"] / normalize_pmi})
        return self._drop_non_finite(npmi_df)
requirements.txt CHANGED
@@ -1,2 +1,3 @@
1
- git+https://github.com/huggingface/evaluate@a45df1eb9996eec64ec3282ebe554061cb366388
2
- datasets~=2.0
 
 
1
+ git+https://github.com/huggingface/evaluate@main
2
+ scikit-learn
3
+ pandas