sumuks HF Staff committed on
Commit
5455df8
·
verified ·
1 Parent(s): 60daa70

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +195 -37
app.py CHANGED
@@ -1,54 +1,212 @@
1
  import gradio as gr
2
  from datasets import load_dataset
 
 
 
 
 
 
 
3
 
4
- DATASET_NAME = "sumuks/fineweb-10BT-annotated"
5
  SPLIT = "train"
6
 
 
7
  SCORE_COLUMN = "score"
8
  TEXT_COLUMN = "text"
9
  ID_COLUMN = "id"
 
 
 
 
 
10
 
11
- # Load the dataset once when the app starts
12
- try:
13
- dataset = load_dataset(DATASET_NAME, split=SPLIT)
14
- except Exception as e:
15
- dataset = None
16
- load_error = str(e)
17
- else:
18
- load_error = None
19
 
 
 
 
 
 
 
 
 
 
 
 
20
 
21
- def get_examples_by_score(score: int, n_examples: int = 5):
22
- if dataset is None:
23
- return [f"Dataset could not be loaded: {load_error}"]
24
- subset = dataset.filter(lambda x: x.get(SCORE_COLUMN) == score)
 
 
 
 
25
  n = min(len(subset), n_examples)
26
- examples = []
27
- for item in subset.select(range(n)):
 
 
 
 
 
 
 
28
  text = item.get(TEXT_COLUMN, "")
29
- examples.append(text)
30
- if not examples:
31
- examples.append("No examples found for this score")
32
- return examples
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
 
35
- def build_tabs():
36
- tabs = []
37
- with gr.Tab("About"):
38
- gr.Markdown(
39
- f"# Dataset Inspector\nUsing dataset `{DATASET_NAME}`\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  )
41
- if load_error:
42
- gr.Markdown(f"**Dataset failed to load:** {load_error}")
43
- for score in range(6):
44
- with gr.Tab(f"Score {score}"):
45
- examples = get_examples_by_score(score, 5)
46
- for i, example in enumerate(examples):
47
- gr.Markdown(f"### Example {i+1}\n{example}")
48
- return tabs
49
-
50
-
51
- with gr.Blocks(theme="default") as demo:
52
- build_tabs()
 
 
53
 
54
- demo.launch()
 
 
 
1
  import gradio as gr
2
  from datasets import load_dataset
3
+ import random
4
+
5
# Available datasets: label shown in the UI -> Hugging Face Hub repository id.
DATASETS = {
    "Main Dataset": "sumuks/fineweb-10BT-annotated",
    "Ablation Dataset": "sumuks/fineweb-10BT-annotated-ablation-1",
}

# Split requested whenever a dataset is loaded.
SPLIT = "train"

# Column names (from build.py)
SCORE_COLUMN = "score"
TEXT_COLUMN = "text"
ID_COLUMN = "id"
SUMMARY_COLUMN = "summary"
JUSTIFICATION_COLUMN = "justification"
THINKING_COLUMN = "thinking"
MODEL_COLUMN = "annotation_model"
DATE_COLUMN = "annotation_date"

# Global state shared by the handlers in this module.
current_dataset = None  # currently loaded dataset, or None when nothing is loaded
dataset_name = None     # repository id of the most recently requested dataset
seen_ids = set()        # document ids already shown by the random sampler
 
 
 
 
27
 
28
def load_selected_dataset(selected_dataset):
    """Load the dataset picked in the UI and make it the active dataset.

    Args:
        selected_dataset: A key of ``DATASETS`` (the label shown in the
            dropdown).

    Returns:
        A status string: a success message with the number of loaded
        examples, or an error message when the selection is unknown or the
        load fails.

    Side effects:
        For a known selection, updates the module globals ``dataset_name``
        and ``current_dataset`` and resets ``seen_ids``. ``current_dataset``
        is set to ``None`` when loading fails. An unknown selection leaves
        all global state untouched.
    """
    global current_dataset, dataset_name, seen_ids

    repo_id = DATASETS.get(selected_dataset)
    if repo_id is None:
        # A cleared dropdown (None) or an unexpected label used to raise
        # KeyError; report it as a status message instead.
        return f"❌ Unknown dataset selection: {selected_dataset!r}"

    dataset_name = repo_id
    seen_ids = set()  # Reset seen examples when switching datasets

    try:
        current_dataset = load_dataset(dataset_name, split=SPLIT)
    except Exception as e:
        # UI boundary: surface any load failure as a status message rather
        # than crashing the event handler.
        current_dataset = None
        return f"❌ Failed to load {dataset_name}: {e}"

    return f"✅ Loaded {len(current_dataset)} examples from {dataset_name}"
39
 
40
def get_examples_by_score(score: int, n_examples: int = 5, show_details: bool = False):
    """Return a Markdown string with random examples that have the given score.

    Args:
        score: Value that the ``SCORE_COLUMN`` field must equal.
        n_examples: Maximum number of examples to show. Non-integer numeric
            values are truncated to an integer and negative values are
            treated as zero.
        show_details: When true, the non-empty summary, justification,
            thinking and model/date annotation fields are included.

    Returns:
        One Markdown section per sampled example, joined by newlines, or a
        short message when no dataset is loaded or no row has this score.
    """
    if current_dataset is None:
        return "Please select and load a dataset first."

    subset = current_dataset.filter(lambda x: x.get(SCORE_COLUMN) == score)
    total_available = len(subset)
    if total_available == 0:
        return "No examples found for this score."

    # The value comes from a gr.Slider, which is documented as passing a
    # float; random.sample() rejects a float or negative count, so coerce
    # and clamp before sampling.
    n = max(0, min(total_available, int(n_examples)))

    examples_text = []

    # Randomly sample indices instead of taking the first n
    for idx in random.sample(range(total_available), n):
        item = subset[idx]
        example_id = item.get(ID_COLUMN, "Unknown")
        text = item.get(TEXT_COLUMN, "")

        # Build the example display
        parts = [f"**Document ID:** {example_id}\n\n"]

        if show_details:
            summary = item.get(SUMMARY_COLUMN, "")
            justification = item.get(JUSTIFICATION_COLUMN, "")
            thinking = item.get(THINKING_COLUMN, "")
            model = item.get(MODEL_COLUMN, "")
            date = item.get(DATE_COLUMN, "")

            if summary:
                parts.append(f"**Summary:** {summary}\n\n")
            if justification:
                parts.append(f"**Justification:** {justification}\n\n")
            if thinking:
                parts.append(f"**Thinking Process:** {thinking}\n\n")
            if model:
                # The date is only shown together with the model name.
                parts.append(f"**Model:** {model} | **Date:** {date}\n\n")

        parts.append(f"**Text:**\n{text}\n\n---\n")
        examples_text.append("".join(parts))

    return "\n".join(examples_text)
84
 
85
def get_random_unseen_example(show_details: bool = False):
    """Return a Markdown string for one random example not shown before.

    IDs of examples that were already returned are tracked in the module
    global ``seen_ids``. Once every ID has been shown, the set is cleared
    and sampling starts over.

    Args:
        show_details: When true, the non-empty summary, justification,
            thinking and model/date annotation fields are included.

    Returns:
        The formatted example, or a short message when no dataset is loaded
        or the dataset has no rows.
    """
    if current_dataset is None:
        return "Please select and load a dataset first."

    # Read the ID column once and keep each ID's row position, so the chosen
    # row can be fetched directly instead of re-reading the column and doing
    # a linear .index() search.
    all_ids = list(current_dataset[ID_COLUMN])
    candidates = [
        position for position, doc_id in enumerate(all_ids) if doc_id not in seen_ids
    ]

    if not candidates:
        # Reset if we've seen everything
        seen_ids.clear()
        candidates = range(len(all_ids))
        if not candidates:
            return "No examples available in dataset."

    # Pick a random unseen row and remember its ID. If IDs repeat, marking
    # the ID as seen also excludes the other rows that share it.
    item_idx = random.choice(candidates)
    random_id = all_ids[item_idx]
    seen_ids.add(random_id)

    item = current_dataset[item_idx]

    # Extract data
    text = item.get(TEXT_COLUMN, "")
    score = item.get(SCORE_COLUMN, "N/A")

    # Build display
    parts = [f"**Document ID:** {random_id} | **Score:** {score}\n\n"]

    if show_details:
        summary = item.get(SUMMARY_COLUMN, "")
        justification = item.get(JUSTIFICATION_COLUMN, "")
        thinking = item.get(THINKING_COLUMN, "")
        model = item.get(MODEL_COLUMN, "")
        date = item.get(DATE_COLUMN, "")

        if summary:
            parts.append(f"**Summary:** {summary}\n\n")
        if justification:
            parts.append(f"**Justification:** {justification}\n\n")
        if thinking:
            parts.append(f"**Thinking Process:** {thinking}\n\n")
        if model:
            # The date is only shown together with the model name.
            parts.append(f"**Model:** {model} | **Date:** {date}\n\n")

    parts.append(f"**Text:**\n{text}")

    return "".join(parts)
135
 
136
def build_interface():
    """Assemble the Gradio Blocks UI and return it without launching it.

    Layout, top to bottom: a title, a dataset dropdown with a load button,
    a status line, a global "show annotation details" checkbox, and a tab
    strip with one random-sampling tab followed by one tab per score in
    ``range(6)``.

    Returns:
        The configured ``gr.Blocks`` instance.
    """
    default_choice = "Main Dataset"

    def add_score_tab(score, details_checkbox):
        # One tab for a single score: a slider for the example count, a
        # button, and a Markdown area that the button fills.
        with gr.Tab(f"⭐ Score {score}"):
            gr.Markdown(f"Browse examples with quality score {score}")
            with gr.Row():
                n_examples = gr.Slider(
                    minimum=1,
                    maximum=20,
                    value=3,
                    step=1,
                    label="Number of examples",
                )
                show_btn = gr.Button(
                    f"Show Score {score} Examples", variant="secondary"
                )

            score_output = gr.Markdown("")

            # Set up the click handler for this score; the score is bound
            # through a default argument so each tab keeps its own value.
            show_btn.click(
                fn=lambda n, details, s=score: get_examples_by_score(s, n, details),
                inputs=[n_examples, details_checkbox],
                outputs=score_output,
            )

    with gr.Blocks(theme="default", title="Dataset Inspector") as demo:
        gr.Markdown("# 📊 Expert Content Classification Dataset Inspector")

        # Dataset picker and load button
        with gr.Row():
            with gr.Column(scale=2):
                dataset_dropdown = gr.Dropdown(
                    choices=list(DATASETS.keys()),
                    label="Select Dataset",
                    value=default_choice,
                )
            with gr.Column(scale=1):
                load_btn = gr.Button("Load Dataset", variant="primary")

        status_display = gr.Markdown("")

        # One checkbox controls the detail level on every tab
        with gr.Row():
            show_details_global = gr.Checkbox(
                label="Show annotation details (summary, justification, thinking)",
                value=False,
            )

        with gr.Tabs():
            # Random sampling tab
            with gr.Tab("🎲 Random Sampling"):
                gr.Markdown("Sample random examples you haven't seen before")
                with gr.Row():
                    sample_btn = gr.Button(
                        "Get Random Example", variant="secondary", size="lg"
                    )
                random_output = gr.Markdown("")

            # Score-based browsing tabs
            for score in range(6):
                add_score_tab(score, show_details_global)

        # Event handlers
        load_btn.click(
            fn=load_selected_dataset,
            inputs=dataset_dropdown,
            outputs=status_display,
        )

        sample_btn.click(
            fn=get_random_unseen_example,
            inputs=show_details_global,
            outputs=random_output,
        )

        # Load default dataset on startup
        demo.load(
            fn=lambda: load_selected_dataset(default_choice),
            outputs=status_display,
        )

    return demo
209
 
210
# Script entry point: build the interface and start the Gradio server.
if __name__ == "__main__":
    demo = build_interface()
    demo.launch()