de-Rodrigo committed on
Commit
8336799
2 Parent(s): ac0a5d0 af68571

Merge branch 'main' of https://huggingface.co/spaces/de-Rodrigo/Embeddings

Browse files
app.py CHANGED
@@ -1,10 +1,13 @@
1
  import streamlit as st
2
  import pandas as pd
 
3
  from bokeh.plotting import figure
4
- from bokeh.models import ColumnDataSource
5
- from bokeh.palettes import Category10
 
6
  from sklearn.decomposition import PCA
7
  from sklearn.manifold import TSNE
 
8
 
9
  TOOLTIPS = """
10
  <div>
@@ -17,149 +20,283 @@ TOOLTIPS = """
17
  </div>
18
  """
19
 
20
- def render_plot(selected_labels, df, plot_placeholder):
21
- if not selected_labels:
22
- st.write("No data to display. Please select at least one subset.")
23
- return
 
 
 
 
 
24
 
25
- filtered_data = df[df['label'].isin(selected_labels)]
26
- p = figure(width=400, height=400, tooltips=TOOLTIPS)
27
-
28
- num_labels = len(selected_labels)
29
- # Ajuste de la paleta
30
- if num_labels < 3:
31
- palette = Category10[3][:num_labels]
32
- elif num_labels in [3, 4, 5, 6, 7, 8, 9, 10]:
33
- palette = Category10[num_labels]
34
  else:
35
- palette = Category10[10][:num_labels]
 
 
36
 
37
- # Graficar cada label por separado
38
- for label, color in zip(selected_labels, palette):
39
- subset = filtered_data[filtered_data['label'] == label]
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  source = ColumnDataSource(data=dict(
41
  x=subset['x'],
42
  y=subset['y'],
43
  label=subset['label'],
44
  img=subset['img']
45
  ))
46
- p.scatter('x', 'y', size=12, source=source, color=color, legend_label=label)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
 
48
- p.legend.title = "Subsets"
49
- p.legend.location = "top_right"
50
- p.legend.click_policy = "hide"
51
 
52
- plot_placeholder.bokeh_chart(p)
 
 
 
 
 
 
 
53
 
54
- def config_style():
55
- st.markdown(
56
- """
57
- <style>
58
- .main-title {
59
- font-size: 50px;
60
- color: #4CAF50;
61
- text-align: center;
62
- }
63
- .sub-title {
64
- font-size: 30px;
65
- color: #555;
66
- }
67
- .custom-text {
68
- font-size: 18px;
69
- line-height: 1.5;
70
- }
71
- </style>
72
- """,
73
- unsafe_allow_html=True
74
- )
75
 
76
- st.markdown('<h1 class="main-title">Merit Secret Embeddings 馃帓馃搩馃弳</h1>', unsafe_allow_html=True)
77
- st.markdown('<h2 class="sub-title">Donut</h2>', unsafe_allow_html=True)
78
- st.markdown(
79
- """
80
- <p class="custom-text">
81
- Explore how Donut perceives real data.
82
- </p>
83
- """,
84
- unsafe_allow_html=True
85
- )
86
 
87
- if __name__ == "__main__":
88
- config_style()
 
 
 
 
 
89
 
90
- # --- Primer gr谩fico: datos de Donut ---
91
- # Se asume que "embeddings_donut.csv" contiene las columnas "dim_0", "dim_1", ..., "dim_N", adem谩s de "label" e "img"
92
- df_donut = pd.read_csv("data/donut_de_Rodrigo_merit_secret_all_embeddings.csv")
93
-
94
- # Selecci贸n de visualizaci贸n
95
- donut_mode = st.selectbox(
96
- "Seleccione visualizaci贸n para Donut:",
97
- options=["PCA", "t-SNE"]
98
- )
99
-
100
- # Extraer columnas de embedding (aquellas que empiezan con "dim_")
101
- embedding_cols = [col for col in df_donut.columns if col.startswith("dim_")]
102
- all_embeddings = df_donut[embedding_cols].values
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
 
104
- if donut_mode == "PCA":
105
- pca = PCA(n_components=2)
106
- reduced = pca.fit_transform(all_embeddings)
 
107
  else:
108
- tsne = TSNE(n_components=2, random_state=42, perplexity=30, learning_rate=200)
109
- reduced = tsne.fit_transform(all_embeddings)
110
-
111
- # A帽adir las coordenadas resultantes al DataFrame
112
- df_donut['x'] = reduced[:, 0]
113
- df_donut['y'] = reduced[:, 1]
114
 
115
- unique_labels = df_donut['label'].unique().tolist()
116
- plot_placeholder = st.empty()
 
117
 
118
- # Mostrar gr谩fico inicial con todas las etiquetas
119
- render_plot(unique_labels, df_donut, plot_placeholder)
 
 
 
 
 
 
 
 
120
 
121
- # Desplegable para filtrar etiquetas
122
- selected_labels = st.multiselect(
123
- "Seleccione subsets para visualizar (Donut):",
124
- options=unique_labels,
125
- default=unique_labels
126
- )
127
- render_plot(selected_labels, df_donut, plot_placeholder)
128
 
129
- # --- Segundo gr谩fico: datos de Idefics2 ---
130
- st.markdown('<h2 class="sub-title">Idefics2</h2>', unsafe_allow_html=True)
131
-
132
- # Se asume que "embeddings_idefics2.csv" tiene la misma estructura
133
- df_idefics2 = pd.read_csv("data/embeddings_idefics2.csv")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
134
 
135
- idefics2_mode = st.selectbox(
136
- "Seleccione visualizaci贸n para Idefics2:",
137
- options=["PCA", "t-SNE"],
138
- key="idefics2_mode"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
139
  )
140
 
141
- embedding_cols2 = [col for col in df_idefics2.columns if col.startswith("dim_")]
142
- all_embeddings2 = df_idefics2[embedding_cols2].values
143
-
144
- if idefics2_mode == "PCA":
145
- pca2 = PCA(n_components=2)
146
- reduced2 = pca2.fit_transform(all_embeddings2)
147
- else:
148
- tsne2 = TSNE(n_components=2, random_state=42, perplexity=30, learning_rate=200)
149
- reduced2 = tsne2.fit_transform(all_embeddings2)
150
-
151
- df_idefics2['x'] = reduced2[:, 0]
152
- df_idefics2['y'] = reduced2[:, 1]
153
-
154
- unique_labels2 = df_idefics2['label'].unique().tolist()
155
- plot_placeholder2 = st.empty()
156
 
157
- render_plot(unique_labels2, df_idefics2, plot_placeholder2)
 
 
158
 
159
- selected_labels2 = st.multiselect(
160
- "Seleccione subsets para visualizar (Idefics2):",
161
- options=unique_labels2,
162
- default=unique_labels2,
163
- key="idefics2"
164
- )
165
- render_plot(selected_labels2, df_idefics2, plot_placeholder2)
 
1
  import streamlit as st
2
  import pandas as pd
3
+ import numpy as np
4
  from bokeh.plotting import figure
5
+ from bokeh.models import ColumnDataSource, DataTable, TableColumn, CustomJS, Select, Button
6
+ from bokeh.layouts import row, column
7
+ from bokeh.palettes import Reds9, Blues9
8
  from sklearn.decomposition import PCA
9
  from sklearn.manifold import TSNE
10
+ import io
11
 
12
  TOOLTIPS = """
13
  <div>
 
20
  </div>
21
  """
22
 
23
def config_style():
    """Inject the page-wide CSS classes and render the main title.

    Two ``st.markdown`` calls are made with ``unsafe_allow_html=True``: the
    first registers the ``main-title``, ``sub-title`` and ``custom-text`` CSS
    classes, the second emits the ``<h1>`` page title.
    """
    st.markdown(
        """
        <style>
        .main-title { font-size: 50px; color: #4CAF50; text-align: center; }
        .sub-title { font-size: 30px; color: #555; }
        .custom-text { font-size: 18px; line-height: 1.5; }
        </style>
        """,
        unsafe_allow_html=True,
    )
    # The emoji had been stored as mis-decoded UTF-8 (rendered as CJK
    # glyphs); they are restored to the intended characters here.
    st.markdown(
        '<h1 class="main-title">Merit Embeddings 🎒📃🏆</h1>',
        unsafe_allow_html=True,
    )
32
 
33
# load_embeddings takes the name of the model whose embeddings should be loaded.
def load_embeddings(model):
    """Read the real and synthetic embedding CSVs for ``model``.

    Args:
        model: Either ``"Donut"`` or ``"Idefics2"``.

    Returns:
        A dict with the keys ``"real"`` and ``"es-digital-seq"``, each holding
        the DataFrame read from the corresponding CSV, or ``None`` (after
        reporting the problem with ``st.error``) when ``model`` is not one of
        the two supported names.
    """
    # (real CSV, synthetic CSV) for every supported model.
    csv_paths = {
        "Donut": (
            "data/donut_de_Rodrigo_merit_secret_all_embeddings.csv",
            "data/donut_de_Rodrigo_merit_es-digital-seq_embeddings.csv",
        ),
        "Idefics2": (
            "data/idefics2_de_Rodrigo_merit_secret_britanico_embeddings.csv",
            "data/idefics2_de_Rodrigo_merit_es-digital-seq_embeddings.csv",
        ),
    }
    if model not in csv_paths:
        st.error("Modelo no reconocido")
        return None

    real_path, synthetic_path = csv_paths[model]
    real_frame = pd.read_csv(real_path)
    synthetic_frame = pd.read_csv(synthetic_path)
    return {"real": real_frame, "es-digital-seq": synthetic_frame}
45
 
46
# Helper functions.
def reducer_selector(df_combined, embedding_cols):
    """Ask the user for a reduction method and project the embeddings to 2-D.

    Args:
        df_combined: DataFrame containing the embedding columns.
        embedding_cols: Names of the columns that hold the embedding values.

    Returns:
        The result of ``fit_transform`` of the chosen reducer (PCA or t-SNE,
        both configured with ``n_components=2``) on the embedding values.
    """
    method = st.selectbox(
        "Select Dimensionality Reduction Method:",
        options=["PCA", "t-SNE"],
    )
    features = df_combined[embedding_cols].values
    reducer = (
        PCA(n_components=2)
        if method == "PCA"
        else TSNE(n_components=2, random_state=42, perplexity=30, learning_rate=200)
    )
    return reducer.fit_transform(features)
55
+
56
def add_dataset_to_fig(fig, df, selected_labels, marker, color_mapping):
    """Draw one glyph renderer per label of ``df`` onto ``fig``.

    Args:
        fig: Bokeh figure the glyphs are added to.
        df: DataFrame with the columns ``label``, ``x``, ``y`` and ``img``.
        selected_labels: Labels to draw; labels without rows in ``df`` are
            skipped.
        marker: ``"circle"`` (size 10, legend suffix ``(Real)``) or
            ``"square"`` (size 6, legend suffix ``(Synthetic)``).
        color_mapping: Mapping from label to the fill/line color to use.

    Returns:
        Dict mapping each drawn label to its glyph renderer.

    Raises:
        ValueError: If ``marker`` is not one of the supported values.
    """
    # marker -> (figure method name, glyph size, legend suffix)
    marker_styles = {
        "circle": ("circle", 10, "Real"),
        "square": ("square", 6, "Synthetic"),
    }
    # Validate up front: previously an unknown marker left the renderer
    # variable unbound (or silently reused the previous label's renderer).
    if marker not in marker_styles:
        raise ValueError(
            f"Unsupported marker {marker!r}; expected one of {sorted(marker_styles)}"
        )
    method_name, size, legend_suffix = marker_styles[marker]
    draw = getattr(fig, method_name)

    renderers = {}
    for label in selected_labels:
        subset = df[df['label'] == label]
        if subset.empty:
            continue
        source = ColumnDataSource(data=dict(
            x=subset['x'],
            y=subset['y'],
            label=subset['label'],
            img=subset['img']
        ))
        color = color_mapping[label]
        renderers[label] = draw(
            'x', 'y', size=size, source=source,
            fill_color=color, line_color=color,
            legend_label=f"{label} ({legend_suffix})",
        )
    return renderers
79
+
80
def _assign_palette_colors(labels, palette):
    """Map each label, in sorted order, to a palette color, cycling if needed."""
    return {
        label: palette[position % len(palette)]
        for position, label in enumerate(sorted(labels))
    }


def get_color_maps(selected_subsets: dict):
    """Build the label -> color mappings for the real and synthetic subsets.

    Real labels take their colors from ``Reds9`` and ``es-digital-seq`` labels
    from ``Blues9``. Labels are sorted before colors are assigned, and the
    palette wraps around when there are more labels than palette entries.

    Args:
        selected_subsets: Dict with the keys ``"real"`` and
            ``"es-digital-seq"``, each holding a collection of labels.

    Returns:
        Dict with the same two keys, each holding a ``{label: color}`` dict.
    """
    return {
        "real": _assign_palette_colors(selected_subsets["real"], Reds9),
        "es-digital-seq": _assign_palette_colors(selected_subsets["es-digital-seq"], Blues9),
    }
 
 
90
 
91
def split_versions(df_combined, reduced):
    """Attach the 2-D coordinates and split the frame by its ``version`` column.

    ``df_combined`` is modified in place: columns ``x`` and ``y`` are set from
    the first and second column of ``reduced``.

    Args:
        df_combined: DataFrame with at least ``version`` and ``label`` columns.
        reduced: 2-D array whose rows line up with ``df_combined``.

    Returns:
        A pair ``(frames, labels)``. Both are dicts keyed by ``"real"`` and
        ``"es-digital-seq"``: ``frames`` holds a copy of the rows of each
        version, ``labels`` the sorted unique labels of each version.
    """
    df_combined['x'], df_combined['y'] = reduced[:, 0], reduced[:, 1]

    # Output key -> value stored in the "version" column.
    version_tags = {"real": "real", "es-digital-seq": "es_digital_seq"}
    frames = {
        key: df_combined[df_combined["version"] == tag].copy()
        for key, tag in version_tags.items()
    }
    labels = {
        key: sorted(frame['label'].unique().tolist())
        for key, frame in frames.items()
    }
    return frames, labels
99
 
100
def create_figure(dfs_reduced, selected_subsets: dict, color_maps: dict):
    """Create the scatter figure with real (circle) and synthetic (square) points.

    Args:
        dfs_reduced: Dict with ``"real"`` and ``"es-digital-seq"`` DataFrames.
        selected_subsets: Dict with the labels to draw for each of those keys.
        color_maps: Dict with a ``{label: color}`` mapping for each key.

    Returns:
        A tuple ``(fig, real_renderers, synthetic_renderers)``, where the two
        renderer dicts are the results of ``add_dataset_to_fig``.
    """
    fig = figure(width=400, height=400, tooltips=TOOLTIPS, title="")

    # Real data first, then synthetic, so renderers are added in that order.
    renderers_by_version = {}
    for version, marker in (("real", "circle"), ("es-digital-seq", "square")):
        renderers_by_version[version] = add_dataset_to_fig(
            fig,
            dfs_reduced[version],
            selected_subsets[version],
            marker=marker,
            color_mapping=color_maps[version],
        )

    fig.legend.location = "top_right"
    fig.legend.click_policy = "hide"
    return fig, renderers_by_version["real"], renderers_by_version["es-digital-seq"]
 
 
 
 
 
 
 
 
 
 
 
 
109
 
110
def calculate_cluster_centers(df: pd.DataFrame, selected_labels: list) -> dict:
    """Return the mean ``(x, y)`` position of every selected label.

    Args:
        df: DataFrame with the columns ``label``, ``x`` and ``y``.
        selected_labels: Labels whose centers are wanted.

    Returns:
        Dict mapping each label that has at least one row in ``df`` to an
        ``(x_mean, y_mean)`` tuple, in the order of ``selected_labels``.
        Labels without rows are omitted; an empty frame yields ``{}``.
    """
    if df.empty:
        return {}
    # One grouped pass over the frame instead of a full boolean scan per label.
    means = df.groupby('label', sort=False)[['x', 'y']].mean()
    centers = {}
    for label in selected_labels:
        if label in means.index:
            centers[label] = (means.at[label, 'x'], means.at[label, 'y'])
    return centers
 
 
 
117
 
118
def compute_distances(centers_es: dict, centers_real: dict) -> pd.DataFrame:
    """Compute the Euclidean distance between every synthetic and real center.

    Args:
        centers_es: Mapping from synthetic label to its ``(x, y)`` center.
        centers_real: Mapping from real label to its ``(x, y)`` center.

    Returns:
        DataFrame with one row per synthetic label and one column per real
        label, holding the distance between the two centers.
    """
    # np.hypot avoids the overflow/underflow of squaring large or tiny offsets.
    distances = {
        es_label: {
            real_label: np.hypot(x_es - x_real, y_es - y_real)
            for real_label, (x_real, y_real) in centers_real.items()
        }
        for es_label, (x_es, y_es) in centers_es.items()
    }
    # Outer keys become columns on construction; transpose to get them as rows.
    return pd.DataFrame(distances).T
125
 
126
def create_table(df_distances):
    """Wrap the distance matrix in a Bokeh ``DataTable``.

    The index of ``df_distances`` is moved into a column named ``Synthetic``;
    every other column gets a table column titled with its own name. The
    table height is a 30 px header plus 28 px per row.

    Args:
        df_distances: Distance DataFrame (rows: synthetic labels).

    Returns:
        A tuple ``(data_table, df_table, source_table)``: the widget, the
        flattened DataFrame and the ``ColumnDataSource`` backing the widget.
    """
    header_height, row_height = 30, 28

    # reset_index returns a new frame, so the caller's DataFrame is untouched.
    df_table = df_distances.reset_index().rename(columns={'index': 'Synthetic'})
    source_table = ColumnDataSource(df_table)

    value_columns = [
        TableColumn(field=name, title=name)
        for name in df_table.columns
        if name != 'Synthetic'
    ]
    columns = [TableColumn(field='Synthetic', title='Synthetic')] + value_columns

    data_table = DataTable(
        source=source_table,
        columns=columns,
        sizing_mode='stretch_width',
        height=header_height + len(df_table) * row_height,
    )
    return data_table, df_table, source_table
141
+
142
# Runs the whole pipeline (load, reduce, plot, table, export) for one model.
def run_model(model_name):
    """Render the embedding explorer for ``model_name`` in the current container.

    Steps: load the real and synthetic embeddings, tag each frame with a
    ``version`` column, reduce the ``dim_*`` columns to 2-D with the method
    picked in a select box, draw the scatter plot, compute the distances
    between cluster centers, and show them in a table that is wired to the
    plot through JavaScript callbacks. The table can also be downloaded as an
    Excel file.

    Args:
        model_name: Model to display; also used as the Streamlit widget key,
            so it must be unique per call within one page.

    Returns:
        None. Returns early when the embeddings cannot be loaded or when no
        center-to-center distance can be computed.
    """
    embeddings = load_embeddings(model_name)
    if embeddings is None:
        return

    # Tag each frame so split_versions can separate them after concatenation.
    embeddings["real"]["version"] = "real"
    embeddings["es-digital-seq"]["version"] = "es_digital_seq"
    embedding_cols = [col for col in embeddings["real"].columns if col.startswith("dim_")]
    df_combined = pd.concat([embeddings["real"], embeddings["es-digital-seq"]], ignore_index=True)

    st.markdown('<h6 class="sub-title">Select Dimensionality Reduction Method</h6>', unsafe_allow_html=True)
    reduction_method = st.selectbox("", options=["t-SNE", "PCA"], key=model_name)
    if reduction_method == "PCA":
        reducer = PCA(n_components=2)
    else:
        reducer = TSNE(n_components=2, random_state=42, perplexity=30, learning_rate=200)
    reduced = reducer.fit_transform(df_combined[embedding_cols].values)

    dfs_reduced, unique_subsets = split_versions(df_combined, reduced)
    selected_subsets = {"real": unique_subsets["real"], "es-digital-seq": unique_subsets["es-digital-seq"]}
    color_maps = get_color_maps(selected_subsets)

    fig, real_renderers, synthetic_renderers = create_figure(dfs_reduced, selected_subsets, color_maps)
    centers_real = calculate_cluster_centers(dfs_reduced["real"], selected_subsets["real"])
    centers_es = calculate_cluster_centers(dfs_reduced["es-digital-seq"], selected_subsets["es-digital-seq"])
    df_distances = compute_distances(centers_es, centers_real)
    data_table, df_table, source_table = create_table(df_distances)

    real_subset_names = list(df_table.columns[1:])
    if not real_subset_names:
        # Without at least one real and one synthetic cluster there is nothing
        # to select; indexing real_subset_names[0] below would raise IndexError.
        st.warning("No cluster distances to display for this model.")
        return

    real_select = Select(title="", value=real_subset_names[0], options=real_subset_names)
    reset_button = Button(label="Reset Colors", button_type="primary")
    # Holds the segment drawn between the selected synthetic and real centers.
    line_source = ColumnDataSource(data={'x': [], 'y': []})
    fig.line('x', 'y', source=line_source, line_width=2, line_color='black')

    # Plain lists so the centers can be passed to the JavaScript callbacks.
    synthetic_centers_js = {k: [v[0], v[1]] for k, v in centers_es.items()}
    real_centers_js = {k: [v[0], v[1]] for k, v in centers_real.items()}

    # JavaScript shared by both callbacks: remove the connecting line and
    # give every renderer its original color back.
    reset_js = """
        line_source.data = { 'x': [], 'y': [] };
        line_source.change.emit();
        for (var key in synthetic_renderers) {
            if (synthetic_renderers.hasOwnProperty(key)) {
                var renderer = synthetic_renderers[key];
                renderer.glyph.fill_color = synthetic_colors[key];
                renderer.glyph.line_color = synthetic_colors[key];
            }
        }
        for (var key in real_renderers) {
            if (real_renderers.hasOwnProperty(key)) {
                var renderer = real_renderers[key];
                renderer.glyph.fill_color = real_colors[key];
                renderer.glyph.line_color = real_colors[key];
            }
        }
    """

    # JavaScript run on table selection / select change: connect the selected
    # synthetic center with the chosen real center and gray out every other
    # renderer. With no row selected it falls back to the reset behavior.
    highlight_js = """
        var selected = source.selected.indices;
        if (selected.length > 0) {
            var row = selected[0];
            var data = source.data;
            var synthetic_label = data['Synthetic'][row];
            var real_label = real_select.value;
            var syn_coords = synthetic_centers[synthetic_label];
            var real_coords = real_centers[real_label];
            line_source.data = { 'x': [syn_coords[0], real_coords[0]], 'y': [syn_coords[1], real_coords[1]] };
            line_source.change.emit();

            for (var key in synthetic_renderers) {
                if (synthetic_renderers.hasOwnProperty(key)) {
                    var renderer = synthetic_renderers[key];
                    if (key === synthetic_label) {
                        renderer.glyph.fill_color = synthetic_colors[key];
                        renderer.glyph.line_color = synthetic_colors[key];
                    } else {
                        renderer.glyph.fill_color = "lightgray";
                        renderer.glyph.line_color = "lightgray";
                    }
                }
            }
            for (var key in real_renderers) {
                if (real_renderers.hasOwnProperty(key)) {
                    var renderer = real_renderers[key];
                    if (key === real_label) {
                        renderer.glyph.fill_color = real_colors[key];
                        renderer.glyph.line_color = real_colors[key];
                    } else {
                        renderer.glyph.fill_color = "lightgray";
                        renderer.glyph.line_color = "lightgray";
                    }
                }
            }
        } else {
    """ + reset_js + """
        }
    """

    # Arguments needed by both callbacks.
    shared_args = dict(
        line_source=line_source,
        synthetic_renderers=synthetic_renderers,
        real_renderers=real_renderers,
        synthetic_colors=color_maps["es-digital-seq"],
        real_colors=color_maps["real"],
    )

    # Callback that updates the plot.
    callback = CustomJS(
        args=dict(
            shared_args,
            source=source_table,
            synthetic_centers=synthetic_centers_js,
            real_centers=real_centers_js,
            real_select=real_select,
        ),
        code=highlight_js,
    )
    source_table.selected.js_on_change('indices', callback)
    real_select.js_on_change('value', callback)

    reset_callback = CustomJS(args=shared_args, code=reset_js)
    reset_button.js_on_event("button_click", reset_callback)

    buffer = io.BytesIO()
    df_table.to_excel(buffer, index=False)
    buffer.seek(0)

    # Streamlit download button for the distance table. The explicit key keeps
    # the widget unique when run_model is called once per tab.
    st.download_button(
        label="Exportar tabla a Excel",
        data=buffer,
        file_name="tabla.xlsx",
        mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        key=f"download_{model_name}",
    )

    layout = column(fig, column(real_select, reset_button, data_table))
    st.bokeh_chart(layout, use_container_width=True)
286
+
287
+
288
# Main entry point: one tab per model.
def main():
    """Render the page: global style first, then one tab per model.

    Each tab shows a sub-title followed by the output of ``run_model`` for
    that model.
    """
    config_style()

    model_names = ["Donut", "Idefics2"]
    for tab, model_name in zip(st.tabs(model_names), model_names):
        with tab:
            # NOTE(review): the trailing glyph looks like a mis-decoded emoji;
            # confirm the intended character against the original file.
            st.markdown(
                f'<h2 class="sub-title">Modelo {model_name} 馃</h2>',
                unsafe_allow_html=True,
            )
            run_model(model_name)


if __name__ == "__main__":
    main()
 
data/donut_de_Rodrigo_merit_es-digital-seq_embeddings.csv CHANGED
The diff for this file is too large to render. See raw diff
 
data/donut_de_Rodrigo_merit_secret_all_embeddings.csv CHANGED
The diff for this file is too large to render. See raw diff
 
data/idefics2_de_Rodrigo_merit_es-digital-seq_embeddings.csv ADDED
The diff for this file is too large to render. See raw diff
 
data/idefics2_de_Rodrigo_merit_secret_britanico_embeddings.csv ADDED
The diff for this file is too large to render. See raw diff