add npmi logic
- npmi.py +192 -59
- requirements.txt +3 -2
npmi.py
CHANGED
@@ -1,4 +1,4 @@
-# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
+# Copyright 2021 The HuggingFace Team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -11,85 +11,218 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""TODO: Add a description here."""
 
-
-import evaluate
-import datasets
+# TODO: Change print statements to logging?
+# from evaluate import logging as logs
+import warnings
+
+import datasets
+import evaluate
+import numpy as np
+import pandas as pd
+from sklearn.preprocessing import MultiLabelBinarizer
 
-# TODO: Add BibTeX citation
 _CITATION = """\
-@InProceedings{huggingface:module,
-title = {A great new module},
-authors={huggingface, Inc.},
-year={2020}
-}
+Osman Aka, Ken Burke, Alex Bauerle, Christina Greer, and Margaret Mitchell. \
+2021. Measuring Model Biases in the Absence of Ground Truth. \
+In Proceedings of the 2021 AAAI/ACM Conference on AI, Ethics, and Society \
+(AIES '21). Association for Computing Machinery, New York, NY, USA, 327–335. \
+https://doi.org/10.1145/3461702.3462557
 """
 
-# TODO: Add description of the module here
 _DESCRIPTION = """\
-This new module is designed to solve this great ML task and is crafted with a lot of care.
+Normalized Pointwise Mutual Information (nPMI) is an entropy-based measure
+of association, used here to measure the association between words.
 """
 
-# TODO: Add description of the arguments of the module here
-_KWARGS_DESCRIPTION = """
-Calculates how good are predictions given some references, using certain scores
+_KWARGS_DESCRIPTION = """\
 Args:
-    predictions: list of predictions to score. Each predictions
-        should be a string with tokens separated by spaces.
-    references: list of reference for each prediction. Each
-        reference should be a string with tokens separated by spaces.
+    data (list of lists): List of tokenized sentences.
+    vocab_counts (dict or dataframe): Vocab terms and their counts.
+    subgroup (str): The term whose word associations are measured.
 Returns:
-    accuracy: description of the first score,
-    another_score: description of the second score,
-Examples:
-    Examples should be written in doctest format, and should illustrate how
-    to use the function.
-
-    >>> my_new_module = evaluate.load("my_new_module")
-    >>> results = my_new_module.compute(references=[0, 1], predictions=[0, 1])
-    >>> print(results)
-    {'accuracy': 1.0}
+    A dict with the subgroup's nPMI bias, the co-occurrence counts, and
+    the PMI and nPMI association scores for each vocabulary term.
 """
 
-# TODO: Define external resources urls if needed
-BAD_WORDS_URL = "http://url/to/external/resource/bad_words.txt"
+# TODO: Is this necessary?
+warnings.filterwarnings(action="ignore", category=UserWarning)
+# When we divide by 0 in log
+np.seterr(divide="ignore")
 
+# Treating inf values as NaN as well
+pd.set_option("use_inf_as_na", True)
 
-@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
-class NewModule(evaluate.Measurement):
-    """TODO: Short description of my evaluation module."""
+# This can be changed to whatever a person likes;
+# it is the number of batches to use when iterating through the vocabulary.
+_NUM_BATCHES = 500
+PROP = "proportion"
+CNT = "count"
 
+
+class nPMI(evaluate.Measurement):
     def _info(self):
-        # TODO: Specifies the evaluate.EvaluationModuleInfo object
         return evaluate.MeasurementInfo(
-            # This is the description that will appear on the modules page.
             module_type="measurement",
             description=_DESCRIPTION,
             citation=_CITATION,
             inputs_description=_KWARGS_DESCRIPTION,
-            # This defines the format of each prediction and reference
-            features=datasets.Features({
-                'predictions': datasets.Value('int64'),
-                'references': datasets.Value('int64'),
-            }),
-            # Homepage of the module for documentation
-            homepage="http://module.homepage",
-            # Additional links to the codebase or references
-            codebase_urls=["http://github.com/path/to/codebase/of/new_module"],
-            reference_urls=["http://path.to.reference.url/new_module"]
+            features=[
+                datasets.Features(
+                    {
+                        "data": datasets.Sequence(
+                            datasets.Value("string", id="sequence"),
+                            id="data",
+                        ),
+                    }
+                ),
+                datasets.Features(
+                    {
+                        "data": datasets.Value("string", id="data"),
+                    }
+                ),
+            ],
+            # TODO: Create docs for this.
+            # reference_urls=["https://huggingface.co/docs/..."],
         )
 
-    def _download_and_prepare(self, dl_manager):
-        """Optional: download external resources useful to compute the scores"""
-        # TODO: Download external resources if needed
-        pass
-
-    def _compute(self, predictions, references):
-        """Returns the scores"""
-        # TODO: Compute the different scores of the module
-        accuracy = sum(i == j for i, j in zip(predictions, references)) / len(predictions)
-        return {
-            "accuracy": accuracy,
-        }
+    def _compute(self, data, vocab_counts, subgroup):
+        if isinstance(vocab_counts, dict):
+            vocab_counts_df = pd.DataFrame.from_dict(
+                vocab_counts, orient="index", columns=[CNT]
+            )
+        elif isinstance(vocab_counts, pd.DataFrame):
+            vocab_counts_df = vocab_counts
+        else:
+            print("Can't support the data structure for the vocab counts. =(")
+            return
+        if isinstance(data[0], str):
+            data = [d.split() for d in data]
+        # These are used throughout the rest of the functions
+        self.data = data
+        self.vocab_counts_df = vocab_counts_df
+        self.vocab_counts_df[PROP] = vocab_counts_df[CNT] / sum(vocab_counts_df[CNT])
+        # self.mlb_list holds num batches x num_sentences
+        self.mlb_list = []
+        # Index of the subgroup word in the sparse vector
+        subgroup_idx = vocab_counts_df.index.get_loc(subgroup)
+        print("Calculating co-occurrences...")
+        df_coo = self.calc_cooccurrences(subgroup, subgroup_idx)
+        vocab_cooc_df = self.set_idx_cols(df_coo, subgroup)
+        print("Calculating PMI...")
+        pmi_df = self.calc_PMI(vocab_cooc_df, subgroup)
+        print("Calculating nPMI...")
+        npmi_df = self.calc_nPMI(pmi_df, vocab_cooc_df, subgroup)
+        npmi_bias = npmi_df.max(axis=0) + abs(npmi_df.min(axis=0))
+        return {"bias": npmi_bias, "co-occurrences": vocab_cooc_df,
+                "pmi": pmi_df, "npmi": npmi_df}
+
+    def _binarize_words_in_sentence(self):
+        print("Creating co-occurrence matrix for PMI calculations.")
+        batches = np.linspace(0, len(self.data), _NUM_BATCHES).astype(int)
+        i = 0
+        # Creates list of size (# batches x # sentences)
+        while i < len(batches) - 1:
+            # Makes a sparse matrix (shape: # sentences x # words),
+            # with the occurrence of each word per sentence.
+            mlb = MultiLabelBinarizer(classes=self.vocab_counts_df.index)
+            print(
+                "%s of %s sentence binarize batches." % (str(i), str(len(batches)))
+            )
+            # Returns matrix: batch size x num_words
+            mlb_series = mlb.fit_transform(self.data[batches[i]:batches[i + 1]])
+            i += 1
+            self.mlb_list.append(mlb_series)
+
+    def calc_cooccurrences(self, subgroup, subgroup_idx):
+        initialize = True
+        coo_df = None
+        # Big computation here! Should only happen once.
+        print(
+            "Approaching big computation! Here, we binarize all words in the "
+            "sentences, making a sparse matrix of sentences."
+        )
+        if not self.mlb_list:
+            self._binarize_words_in_sentence()
+        for batch_id in range(len(self.mlb_list)):
+            print(
+                "%s of %s co-occurrence count batches"
+                % (str(batch_id), str(len(self.mlb_list)))
+            )
+            # List of all the sentences (list of vocab) in that batch
+            batch_sentence_row = self.mlb_list[batch_id]
+            # Dataframe of # sentences in batch x vocabulary size
+            sent_batch_df = pd.DataFrame(batch_sentence_row)
+            # Subgroup counts per-sentence for the given batch
+            subgroup_df = sent_batch_df[subgroup_idx]
+            subgroup_df.columns = [subgroup]
+            # Remove the sentences where the count of the subgroup is 0.
+            # This way we need less computation and fewer resources.
+            subgroup_df = subgroup_df[subgroup_df > 0]
+            mlb_subgroup_only = sent_batch_df[sent_batch_df[subgroup_idx] > 0]
+            # Create co-occurrence matrix for the given subgroup and all words.
+            batch_coo_df = pd.DataFrame(mlb_subgroup_only.T.dot(subgroup_df))
+
+            # Creates a batch-sized dataframe of co-occurrence counts.
+            # Note these could just be summed rather than be batch size.
+            if initialize:
+                coo_df = batch_coo_df
+            else:
+                coo_df = coo_df.add(batch_coo_df, fill_value=0)
+            initialize = False
+        print("Returning co-occurrence matrix")
+        return pd.DataFrame(coo_df)
+
+    def set_idx_cols(self, df_coo, subgroup):
+        """
+        :param df_coo: Co-occurrence counts for subgroup, length is num_words
+        :return: Co-occurrence counts indexed by the vocabulary terms
+        """
+        count_df = df_coo.set_index(self.vocab_counts_df.index)
+        count_df.columns = [subgroup + "-count"]
+        count_df[subgroup + "-count"] = count_df[subgroup + "-count"].astype(int)
+        return count_df
+
+    def calc_PMI(self, vocab_cooc_df, subgroup):
+        """
+        # PMI(x;y) = h(y) - h(y|x)
+        #          = h(subgroup) - h(subgroup|word)
+        #          = log (p(subgroup|word) / p(subgroup))
+        # nPMI additionally divides by -log(p(x,y)) = -log(p(x|y)p(y))
+        """
+        # Calculation of p(subgroup)
+        # TODO: Is this better?
+        # subgroup_prob = vocab_counts_df.loc[subgroup][PROP]
+        subgroup_prob = self.vocab_counts_df.loc[subgroup][CNT] / sum(
+            self.vocab_counts_df[CNT]
+        )
+        # Calculation of p(subgroup|word) = count(subgroup,word) / count(word)
+        # Because the indices match (the vocab words),
+        # this division doesn't need to specify the index (I think?!)
+        p_subgroup_g_word = (
+            vocab_cooc_df[subgroup + "-count"] / self.vocab_counts_df[CNT]
+        )
+        pmi_df = pd.DataFrame()
+        pmi_df[subgroup + "-pmi"] = np.log(p_subgroup_g_word / subgroup_prob)
+        # Note: A potentially faster solution for adding count, npmi,
+        # can be based on this zip idea:
+        # df_test['size_kb'], df_test['size_mb'], df_test['size_gb'] =
+        # zip(*df_test['size'].apply(sizes))
+        return pmi_df.dropna()
+
+    def calc_nPMI(self, pmi_df, vocab_cooc_df, subgroup):
+        """
+        # nPMI additionally divides by -log(p(x,y)) = -log(p(x|y)p(y))
+        #                                           = -log(p(word|subgroup)p(word))
+        """
+        p_word_g_subgroup = vocab_cooc_df[subgroup + "-count"] / sum(
+            vocab_cooc_df[subgroup + "-count"]
+        )
+        p_word = pmi_df.apply(
+            lambda x: self.vocab_counts_df.loc[x.name][PROP], axis=1
+        )
+        normalize_pmi = -np.log(p_word_g_subgroup * p_word)
+        npmi_df = pd.DataFrame()
+        npmi_df[subgroup + "-npmi"] = pmi_df[subgroup + "-pmi"] / normalize_pmi
+        return npmi_df.dropna()
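For reference, the quantities computed in calc_PMI and calc_nPMI above reduce to a few lines of arithmetic. Below is a minimal sketch of those formulas with hypothetical toy counts (the numbers are illustrative, not from the commit):

    import numpy as np

    # Hypothetical statistics: 100 tokens total; "she" occurs 10 times,
    # "doctor" occurs 5 times, and the two co-occur in 3 sentences.
    total, subgroup_count, word_count, cooc = 100, 10, 5, 3

    p_subgroup = subgroup_count / total         # p(she)
    p_subgroup_g_word = cooc / word_count       # p(she|doctor)
    # PMI = log(p(subgroup|word) / p(subgroup)), as in calc_PMI
    pmi = np.log(p_subgroup_g_word / p_subgroup)        # ~1.79

    # nPMI divides PMI by -log(p(word|subgroup) * p(word)), as in calc_nPMI
    p_word_g_subgroup = cooc / subgroup_count   # p(doctor|she)
    p_word = word_count / total                 # p(doctor)
    npmi = pmi / -np.log(p_word_g_subgroup * p_word)    # ~0.43

The "bias" score returned by _compute is then the spread of these per-word nPMI values for the subgroup: the maximum plus the absolute value of the minimum.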
requirements.txt
CHANGED
@@ -1,2 +1,3 @@
-git+https://github.com/huggingface/evaluate@
-
+git+https://github.com/huggingface/evaluate@main
+sklearn
+pandas
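Once the requirements are installed, the new measurement might be exercised as below. This is an untested sketch: the local module path, the toy sentences, and the subgroup term "she" are illustrative assumptions, not part of the commit. (Note that "sklearn" is the deprecated PyPI alias for scikit-learn.)

    from collections import Counter

    import evaluate

    data = [
        "she is a doctor",
        "she is a nurse",
        "he is a doctor",
        "he went home",
    ]
    # Vocab counts in the form _compute expects: term -> count
    vocab_counts = dict(Counter(tok for sent in data for tok in sent.split()))

    # Assumes the script above is saved locally as ./npmi/npmi.py
    npmi = evaluate.load("./npmi")
    results = npmi.compute(data=data, vocab_counts=vocab_counts, subgroup="she")
    print(results["bias"])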