Bor Hodošček committed
Commit 17327cb · 1 Parent(s): dd4089f

feat: improved pca, ca, hclust code and support for both modes

Files changed (3)
  1. app.py +388 -186
  2. pyproject.toml +1 -0
  3. uv.lock +16 -0
app.py CHANGED
@@ -9,6 +9,7 @@
9
  # "pandas==2.3.0",
10
  # "pca==2.10.0",
11
  # "plotly==6.2.0",
 
12
  # "pyarrow",
13
  # "scattertext==0.2.2",
14
  # "scikit-learn==1.7.0",
@@ -24,11 +25,10 @@
24
  import marimo
25
 
26
  __generated_with = "0.14.9"
27
- app = marimo.App(width="full", app_title="Scattertext on Japanese novels")
28
 
29
  with app.setup:
30
  import marimo as mo
31
- import itertools
32
  import spacy
33
  import pandas as pd
34
  import scipy
@@ -37,6 +37,7 @@ with app.setup:
37
  import re
38
  import scattertext as st
39
  from pca import pca
 
40
  import matplotlib.pyplot as plt
41
  from pathlib import Path
42
  from types import SimpleNamespace
@@ -61,7 +62,7 @@ def function_export():
61
 
62
  @mo.cache
63
  def parse_texts(texts: list[str], nlp=load_nlp()) -> list[str]:
64
- """Tokenize English text via spaCy and emit a whitespacejoined string."""
65
  return [" ".join(tok.text for tok in doc) for doc in nlp.pipe(texts)]
66
 
67
  @mo.cache
@@ -84,37 +85,74 @@ def function_export():
84
  .compact(st.AssociationCompactor(2000))
85
  )
86
 
 
 
 
87
  @mo.cache
88
  def chunk_texts(
89
- texts: list[str],
90
- categories: list[str],
91
- filenames: list[str],
92
  chunk_size: int = 2000,
93
- ) -> tuple[list[str], list[str], list[str]]:
94
- """Chunk each text into segments of chunk_size tokens, preserving category and filename."""
95
- chunked_texts: list[str] = []
96
- chunked_cats: list[str] = []
97
- chunked_fnames: list[str] = []
98
- for text, cat, fname in zip(texts, categories, filenames):
99
- # compute a short “Initials‐Initials” label for author‐title
100
- stem = Path(fname).stem.replace("_advanced", "")
101
- author, title = stem.split("_", 1)
102
-
103
- def _initials(s: str) -> str:
104
- return "".join(tok[0].upper() for tok in s.split("-"))
105
-
106
- short_label = f"{_initials(author)}-{_initials(title)}"
107
- tokens = text.split()
108
- for i in range(0, len(tokens), chunk_size):
109
- chunk = " ".join(tokens[i : i + chunk_size])
110
- chunked_texts.append(chunk)
111
- chunked_cats.append(cat)
112
- chunked_fnames.append(f"{short_label}({cat})#{i // chunk_size + 1}")
113
- else:
114
- chunked_texts.append(chunk)
115
- chunked_cats.append(cat)
116
- chunked_fnames.append(f"{short_label}({cat})#last")
117
- return chunked_texts, chunked_cats, chunked_fnames
 
 
 
118
 
119
  @mo.cache
120
  def train_scikit_cached(
@@ -182,51 +220,63 @@ def function_export():
182
  texts = [Path(fn).read_text(encoding="utf-8") for fn in defaults]
183
  return names, texts
184
 
185
- def make_speech_df(uploaded_files):
 
 
186
  """
187
- Build a DataFrame of speech vs non-speech segments, one row per segment,
188
- carrying over the source filename.
189
  """
190
- defaults = [
191
- "e-r-eddison_the-worm-ouroboros_advanced.txt",
192
- "h-g-wells_the-wonderful-visit_advanced.txt",
193
- ]
194
- names, raws = _load_files(uploaded_files, defaults)
195
-
196
- speech_segs, nonspeech_segs = [], []
197
- speech_files, nonspeech_files = [], []
198
 
 
 
199
  for name, raw in zip(names, raws):
200
- sp, ns = split_speech_text(raw)
201
- speech_segs.append(sp)
202
- nonspeech_segs.append(ns)
203
- speech_files.append(name)
204
- nonspeech_files.append(name)
205
-
206
- tok_sp = parse_texts(speech_segs)
207
- tok_ns = parse_texts(nonspeech_segs)
208
-
209
- ratios = [
210
- len(s) / (len(s) + len(ns)) for s, ns in zip(speech_segs, nonspeech_segs)
211
- ]
 
 
212
 
213
- ratios_full = ratios + ratios
 
 
214
 
215
- return pd.DataFrame(
216
- {
217
- "category": ["speech"] * len(tok_sp) + ["non-speech"] * len(tok_ns),
218
- "filename": speech_files + nonspeech_files,
219
- "text": tok_sp + tok_ns,
220
- "speech_ratio": ratios_full,
221
- }
222
- )
223
 
224
  return (
225
  build_corpus_cached,
226
  chunk_texts,
227
- make_speech_df,
228
  parse_texts,
229
- split_speech_text,
230
  train_scikit_cached,
231
  )
232
 
@@ -250,7 +300,7 @@ def intro():
250
  2. データ内容を確認・修正
251
  3. チャンク&サンプリング設定
252
  4. Scattertextによる可視化
253
- 5. PCAのbiplot, 階層的クラスタリングのデンドログラムでサンプルの分布と素性の関係を観察
254
  6. 気になるサンプルをドロップダウンで選択し、内容を確認
255
 
256
  > 単語分割には、[spaCy](https://spacy.io/)([en_core_web_sm](https://spacy.io/models/en#en_core_web_sm)モデル)を使用しています。
@@ -274,7 +324,9 @@ def data_settings():
274
  full_width=True,
275
  )
276
  files_a = mo.ui.file(
277
- label="Aのファイルアップロード(UTF-8、.txt形式)", multiple=True, kind="area"
 
 
278
  )
279
  ### Category form
280
  label_b = mo.ui.text(
@@ -284,7 +336,9 @@ def data_settings():
284
  full_width=True,
285
  )
286
  files_b = mo.ui.file(
287
- label="Bのファイルアップロード(UTF-8、.txt形式)", multiple=True, kind="area"
 
 
288
  )
289
  split_speech = mo.ui.switch(
290
  label="Split speech vs non-speech segments?",
@@ -353,12 +407,11 @@ def data_settings():
353
  @app.cell
354
  def data_check(
355
  category_form,
356
- make_speech_df,
357
  mode_tabs,
358
  parse_texts,
 
359
  speech_form,
360
  split_speech,
361
- split_speech_text,
362
  ):
363
  mo.stop(mode_tabs.value == "Speech vs Non-Speech" and speech_form.value is None)
364
  mo.stop(mode_tabs.value == "Category Comparison" and category_form.value is None)
@@ -366,12 +419,22 @@ def data_check(
366
  validation_messages: list[str] = []
367
 
368
  if mode_tabs.value == "Speech vs Non-Speech":
369
- data = make_speech_df(speech_form.value.get("files_s", []))
 
 
370
  mo.md(
371
  f"## Data preview (speech vs non-speech)\n"
372
- f"{mo.ui.table(data, selection='multi')}"
373
  )
374
- # fake data_form so all scattertext cells see the same API
375
  data_form = SimpleNamespace(
376
  value={
377
  "category_name": "Speech vs Non-speech",
@@ -381,7 +444,6 @@ def data_check(
381
  )
382
  elif category_form.value is not None and mode_tabs.value == "Category Comparison":
383
  # Category vs Category
384
-
385
  if category_form.value["label_a"] == category_form.value["label_b"]:
386
  validation_messages.append(
387
  "⚠️ **警告**: グループAとBのラベルが同じです。AとBは異なるラベルを設定してください。\n"
@@ -392,89 +454,30 @@ def data_check(
392
  "ℹ️ ファイルが未指定のため、デフォルトサンプルを使用しています。\n"
393
  )
394
 
395
- try:
396
- # Group A: either uploaded files or default
397
- if category_form.value["files_a"]:
398
- category_a_texts = (
399
- f.contents.decode("utf-8") for f in category_form.value["files_a"]
400
- )
401
- category_a_names = (f.name for f in category_form.value["files_a"])
402
- else:
403
- # Default Group A: E. R. Eddison: The Worm Ouroboros
404
- default_a = "e-r-eddison_the-worm-ouroboros_advanced.txt"
405
- category_a_texts = [Path(default_a).read_text(encoding="utf-8")]
406
- category_a_names = [default_a]
407
-
408
- if split_speech.value:
409
- texts_list = list(category_a_texts)
410
- names_list = list(category_a_names)
411
- expanded_txt, expanded_names = [], []
412
- for nm, raw in zip(names_list, texts_list):
413
- sp, ns = split_speech_text(raw)
414
- expanded_txt.extend([sp, ns])
415
- expanded_names.extend([f"{nm} (speech)", f"{nm} (non-speech)"])
416
- category_a_texts, category_a_names = expanded_txt, expanded_names
417
-
418
- # Group B: either uploaded files or default
419
- if category_form.value["files_b"]:
420
- category_b_texts = (
421
- f.contents.decode("utf-8") for f in category_form.value["files_b"]
422
- )
423
- category_b_names = (f.name for f in category_form.value["files_b"])
424
- else:
425
- # Default Group B: H. G. Wells: The Wonderful Visit
426
- default_b = "h-g-wells_the-wonderful-visit_advanced.txt"
427
- category_b_texts = [Path(default_b).read_text(encoding="utf-8")]
428
- category_b_names = [default_b]
429
-
430
- # same splitting for B‐side
431
- if split_speech.value:
432
- texts_list = list(category_b_texts)
433
- names_list = list(category_b_names)
434
- expanded_txt, expanded_names = [], []
435
- for nm, raw in zip(names_list, texts_list):
436
- sp, ns = split_speech_text(raw)
437
- expanded_txt.extend([sp, ns])
438
- expanded_names.extend([f"{nm} (speech)", f"{nm} (non-speech)"])
439
- category_b_texts, category_b_names = expanded_txt, expanded_names
440
-
441
- # infer categories: use UI labels when files uploaded,
442
- # otherwise derive from filename‐stem
443
- # (e.g. "e-r-eddison_..." -> "E R Eddison")
444
- if category_form.value["files_a"]:
445
- cats_a = [category_form.value["label_a"]] * len(category_a_names)
446
- else:
447
- cats_a = [
448
- Path(fn).stem.split("_", 1)[0].replace("-", " ").title()
449
- for fn in category_a_names
450
- ]
451
-
452
- if category_form.value["files_b"]:
453
- cats_b = [category_form.value["label_b"]] * len(category_b_names)
454
- else:
455
- cats_b = [
456
- Path(fn).stem.split("_", 1)[0].replace("-", " ").title()
457
- for fn in category_b_names
458
- ]
459
-
460
- data = pd.DataFrame(
461
- {
462
- "category": cats_a + cats_b,
463
- "filename": itertools.chain(category_a_names, category_b_names),
464
- "text": itertools.chain(category_a_texts, category_b_texts),
465
- }
466
- )
467
 
468
- with mo.status.spinner("コーパスを解析中..."):
469
- data["text"] = parse_texts(list(data["text"]))
470
- # pass through the real form
471
- data_form = category_form
 
 
 
472
 
473
- except Exception as e:
474
- data = None
475
- validation_messages.append(
476
- f"❌ **エラー**: ファイルの読み込みに失敗しました: {str(e)}\n"
477
- )
478
  else:
479
  data = None
480
  validation_messages.append(
@@ -491,7 +494,9 @@ def data_check(
491
  解析済テキスト一覧:
492
  {
493
  mo.ui.table(
494
- data, selection="multi", format_mapping={"text": lambda s: s[:20] + "..."}
 
 
495
  )
496
  if (data is not None and not data.empty)
497
  else "No data"
@@ -532,26 +537,21 @@ def _(build_corpus_cached, chunk_texts, data, sample_frac, sampling_form):
532
  mo.stop(sampling_form.value is None)
533
 
534
  with mo.status.spinner("コーパスをサンプリング中…"):
535
- texts, cats, fnames = chunk_texts(
536
- list(data.text),
537
- list(data.category),
538
- list(data.filename),
539
- sampling_form.value["chunk_size"],
540
- )
541
-
542
  if sample_frac.value < 1.0:
543
- N = len(texts)
544
- k = int(N * sampling_form.value["sample_frac"])
545
- idx = random.sample(range(N), k)
546
- texts = [texts[i] for i in idx]
547
- cats = [cats[i] for i in idx]
548
- fnames = [fnames[i] for i in idx]
549
-
550
- corpus = build_corpus_cached(
551
- texts,
552
- cats,
553
- )
554
- return cats, corpus, fnames, texts
555
 
556
 
557
  @app.cell
@@ -621,11 +621,41 @@ def _():
621
 
622
  @app.cell
623
  def _():
624
- min_df_setting = mo.ui.slider(start=0.0, stop=1.0, step=0.05, value=0.25, show_value=True, label="Minimum proportion of samples feature appears in")
625
- max_df_setting = mo.ui.slider(start=0.0, stop=1.0, step=0.05, value=0.8, show_value=True, label="Maximum proportion of samples feature appears in")
626
- max_features_setting = mo.ui.slider(start=10, stop=10_000, step=1, value=100, show_value=True, label="Maximum number of features to use")
 
 
627
 
628
  - mo.vstack([mo.md("### 素性設定\n\nどのような単語を分析に使用するかを下記のスライダーで決めます。標準では、ほとんど全ての文章に現る単語、または極端に少ない文章にしか現れない単語が除外されています。そのうえで、$\\mathrm{tfidf}$の値上位100件まで素性としています。"), min_df_setting, max_df_setting, max_features_setting])
 
 
629
  return max_df_setting, max_features_setting, min_df_setting
630
 
631
 
@@ -640,7 +670,12 @@ def _(
640
  train_scikit_cached,
641
  ):
642
  scikit_corpus, tfidf_X, vectorizer, chunk_cats, chunk_fnames = train_scikit_cached(
643
- texts, cats, fnames, min_df=min_df_setting.value, max_df=max_df_setting.value, max_features=max_features_setting.value,
 
 
644
  )
645
  return chunk_cats, chunk_fnames, tfidf_X, vectorizer
646
 
@@ -667,6 +702,11 @@ def _(X_train, chunk_fnames, vectorizer):
667
  idf_formula = rf"$\mathrm{{idf}}(t,D)=\log{{\frac{{N}}{{{D_formula}}}}}$"
668
  tf_formula = r"${\displaystyle \mathrm {tf} (t,d)=\textrm{number of times }t\textrm{ appears in }d}$"
669
 
 
 
670
 
671
  mo.md(rf"""
672
  ### サンプルと素性の行列
@@ -685,9 +725,9 @@ def _(X_train, chunk_fnames, vectorizer):
685
  - ${{\displaystyle N}}$: total number of documents in the corpus ${{\displaystyle N={{|D|}}}}$
686
  - ${D_formula}$: number of documents with $t$
687
 
688
- {mo.ui.table(pd.DataFrame(X_train.toarray(), index=chunk_fnames, columns=vectorizer.get_feature_names_out()))}
689
  """)
690
- return
691
 
692
 
693
  @app.cell
@@ -695,7 +735,7 @@ def pca_biplot(chunk_cats, tfidf_X, vectorizer):
695
  X = tfidf_X.toarray() if hasattr(tfidf_X, "toarray") else tfidf_X
696
  feature_names = vectorizer.get_feature_names_out()
697
 
698
- model = pca(normalize=True, n_components=3)
699
  results = model.fit_transform(
700
  X,
701
  col_labels=feature_names,
@@ -714,6 +754,7 @@ def _(model, results, three_switch):
714
  figsize=(12, 8),
715
  fontsize=12,
716
  s=20,
 
717
  PC=[0, 1, 2] if three_switch.value else [0, 1],
718
  )
719
  # labels=np.array(chunk_fnames)
@@ -722,7 +763,14 @@ def _(model, results, three_switch):
722
  mo.vstack(
723
  [
724
  mo.md(
725
- """## [PCA](https://erdogant.github.io/pca/pages/html/index.html)のbiplot
 
 
726
  """
727
  ),
728
  mo.mpl.interactive(plt.gcf()),
@@ -732,6 +780,112 @@ def _(model, results, three_switch):
732
  return
733
 
734
 
 
 
 
 
735
  @app.cell
736
  def _():
737
  linkage_methods = mo.ui.dropdown(
@@ -759,7 +913,15 @@ def _():
759
  d_stack = mo.hstack([linkage_methods, distance_metrics], justify="start")
760
 
761
  mo.md(f"""
762
- ## 階層的クラスタリング
 
 
763
 
764
  {d_stack}
765
  {dendrogram_height}
@@ -783,9 +945,33 @@ def _(X, chunk_fnames, dendrogram_height, distance_metrics, linkage_methods):
783
  distfun=distfun,
784
  linkagefun=linkagefun,
785
  )
786
- fig.update_layout(width=800, height=dendrogram_height.value)
787
 
788
  mo.ui.plotly(fig)
 
 
789
  return
790
 
791
 
@@ -809,6 +995,22 @@ def sample_viewer(fnames, text_selector, texts):
809
  return
810
 
811
 
 
 
812
  @app.cell
813
  def _():
814
  return
 
9
  # "pandas==2.3.0",
10
  # "pca==2.10.0",
11
  # "plotly==6.2.0",
12
+ # "prince==0.16.0",
13
  # "pyarrow",
14
  # "scattertext==0.2.2",
15
  # "scikit-learn==1.7.0",
 
25
  import marimo
26
 
27
  __generated_with = "0.14.9"
28
+ app = marimo.App(width="full", app_title="Scattertext on English novels")
29
 
30
  with app.setup:
31
  import marimo as mo
 
32
  import spacy
33
  import pandas as pd
34
  import scipy
 
37
  import re
38
  import scattertext as st
39
  from pca import pca
40
+ import prince
41
  import matplotlib.pyplot as plt
42
  from pathlib import Path
43
  from types import SimpleNamespace
 
62
 
63
  @mo.cache
64
  def parse_texts(texts: list[str], nlp=load_nlp()) -> list[str]:
65
+ """Tokenize English text via spaCy and emit a whitespace-joined string."""
66
  return [" ".join(tok.text for tok in doc) for doc in nlp.pipe(texts)]
67
 
68
  @mo.cache
 
85
  .compact(st.AssociationCompactor(2000))
86
  )
87
 
88
+ def _strip_advanced(fn: str) -> str:
89
+ """
90
+ Strip trailing '_advanced' from a filename stem.
91
+ """
92
+ from pathlib import Path
93
+
94
+ stem = Path(fn).stem
95
+ return stem.replace("_advanced", "")
96
+
97
+ def make_short_label(fn: str) -> str:
98
+ """
99
+ Generate an initials-based short label from filename.
100
+ E.g., 'e-r-eddison_the-worm-ouroboros.txt' -> 'ERE-TWO'.
101
+ """
102
+ stem = _strip_advanced(fn)
103
+ author, title = stem.split("_", 1)
104
+ initials = lambda s: "".join(part[0].upper() for part in s.split("-"))
105
+ return f"{initials(author)}-{initials(title)}"
106
+
107
+ def format_chunk_label(
108
+ fn: str,
109
+ category: str,
110
+ speech_type: str,
111
+ chunk_idx: int | str,
112
+ ) -> str:
113
+ """
114
+ Create a chunk label 'SHORTLABEL(CATEGORY[-speech_type])#INDEX'.
115
+ """
116
+ sl = make_short_label(fn)
117
+ # append speech_type only if it differs from category and isn't 'mixed'
118
+ if speech_type and speech_type != "mixed" and speech_type != category:
119
+ label = f"{category}-{speech_type}"
120
+ else:
121
+ label = category
122
+ return f"{sl}({label})#{chunk_idx}"
123
+
124
  @mo.cache
125
  def chunk_texts(
126
+ df: pd.DataFrame,
 
 
127
  chunk_size: int = 2000,
128
+ ) -> pd.DataFrame:
129
+ """
130
+ Turn each row of df into token‐chunks of size chunk_size,
131
+ preserving category, filename, author, work, and producing
132
+ a `chunk_label`.
133
+ """
134
+ records: list[dict] = []
135
+ for _, row in df.iterrows():
136
+ tokens = row["text"].split()
137
+ n_chunks = (len(tokens) + chunk_size - 1) // chunk_size
138
+ for idx in range(n_chunks):
139
+ seg = " ".join(tokens[idx * chunk_size : (idx + 1) * chunk_size])
140
+ label_idx = idx + 1 if idx + 1 < n_chunks else "last"
141
+ records.append({
142
+ "text": seg,
143
+ "category": row["category"],
144
+ "speech_type": row["speech_type"],
145
+ "filename": row["filename"],
146
+ "author": row["author"],
147
+ "work": row["work"],
148
+ "chunk_label": format_chunk_label(
149
+ row["filename"],
150
+ row["category"],
151
+ row["speech_type"],
152
+ label_idx,
153
+ ),
154
+ })
155
+ return pd.DataFrame(records)
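
For reference, a minimal usage sketch of the helpers above, assuming `make_short_label` and `chunk_texts` are in scope (as they are inside this `function_export` cell); the one-row toy DataFrame is hypothetical:

```python
import pandas as pd

# Hypothetical single-work DataFrame with the columns chunk_texts expects.
doc = pd.DataFrame([{
    "text": " ".join(["word"] * 4500),  # 4500 tokens -> two full chunks + remainder
    "category": "speech",
    "speech_type": "speech",
    "filename": "e-r-eddison_the-worm-ouroboros_advanced.txt",
    "author": "E R Eddison",
    "work": "The Worm Ouroboros",
}])

print(make_short_label(doc.loc[0, "filename"]))  # expected: ERE-TWO

chunks = chunk_texts(doc, chunk_size=2000)
print(chunks["chunk_label"].tolist())
# expected: ['ERE-TWO(speech)#1', 'ERE-TWO(speech)#2', 'ERE-TWO(speech)#last']
```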
156
 
157
  @mo.cache
158
  def train_scikit_cached(
 
220
  texts = [Path(fn).read_text(encoding="utf-8") for fn in defaults]
221
  return names, texts
222
 
223
+ def prepare_files(
224
+ uploaded: list, defaults: list[str], split: bool = False
225
+ ) -> pd.DataFrame:
226
  """
227
+ Ingest uploaded vs. default files into a DataFrame with columns:
228
+ ['filename','raw_text','category' (if split),'author','work'].
229
  """
230
+ import pandas as pd
 
 
231
 
232
+ names, raws = _load_files(uploaded, defaults)
233
+ records: list[dict] = []
234
  for name, raw in zip(names, raws):
235
+ if split:
236
+ sp, ns = split_speech_text(raw)
237
+ records.append(
238
+ {
239
+ "filename": name,
240
+ "raw_text": sp,
241
+ "speech_type": "speech",
242
+ }
243
+ )
244
+ records.append(
245
+ {
246
+ "filename": name,
247
+ "raw_text": ns,
248
+ "speech_type": "non-speech",
249
+ }
250
+ )
251
+ else:
252
+ records.append(
253
+ {
254
+ "filename": name,
255
+ "raw_text": raw,
256
+ "speech_type": "mixed",
257
+ }
258
+ )
259
 
260
+ df_p = pd.DataFrame(records)
261
+ # infer author & work from the file's true stem (no extension, no "_advanced")
262
+ def _extract_auth_work(fn: str) -> tuple[str, str]:
263
+ base = Path(fn).stem.replace("_advanced", "")
264
+ auth, *rest = base.split("_", 1)
265
+ work_raw = rest[0] if rest else base
266
+ return (
267
+ auth.replace("-", " ").title(),
268
+ work_raw.replace("-", " ").title(),
269
+ )
270
 
271
+ aw = df_p["filename"].apply(_extract_auth_work)
272
+ df_p["author"], df_p["work"] = zip(*aw)
273
+ return df_p
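
The author/work inference above amounts to title-casing the two halves of the filename stem; a standalone restatement for illustration (the function name here is made up, the app keeps this logic in the local `_extract_auth_work` helper):

```python
from pathlib import Path

def extract_author_work(fn: str) -> tuple[str, str]:
    # Mirror _extract_auth_work: drop extension and "_advanced", split author/work on "_".
    base = Path(fn).stem.replace("_advanced", "")
    author, *rest = base.split("_", 1)
    work_raw = rest[0] if rest else base
    return author.replace("-", " ").title(), work_raw.replace("-", " ").title()

print(extract_author_work("e-r-eddison_the-worm-ouroboros_advanced.txt"))
# expected: ('E R Eddison', 'The Worm Ouroboros')
```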
 
274
 
275
  return (
276
  build_corpus_cached,
277
  chunk_texts,
 
278
  parse_texts,
279
+ prepare_files,
280
  train_scikit_cached,
281
  )
282
 
 
300
  2. データ内容を確認・修正
301
  3. チャンク&サンプリング設定
302
  4. Scattertextによる可視化
303
+ 5. PCAとCAのbiplot、階層的クラスタリングのデンドログラムでサンプル、カテゴリと素性の分布と関係を観察
304
  6. 気になるサンプルをドロップダウンで選択し、内容を確認
305
 
306
  > 単語分割には、[spaCy](https://spacy.io/)([en_core_web_sm](https://spacy.io/models/en#en_core_web_sm)モデル)を使用しています。
 
324
  full_width=True,
325
  )
326
  files_a = mo.ui.file(
327
+ label="Aのファイルアップロード(UTF-8、.txt形式)",
328
+ multiple=True,
329
+ kind="area",
330
  )
331
  ### Category form
332
  label_b = mo.ui.text(
 
336
  full_width=True,
337
  )
338
  files_b = mo.ui.file(
339
+ label="Bのファイルアップロード(UTF-8、.txt形式)",
340
+ multiple=True,
341
+ kind="area",
342
  )
343
  split_speech = mo.ui.switch(
344
  label="Split speech vs non-speech segments?",
 
407
  @app.cell
408
  def data_check(
409
  category_form,
 
410
  mode_tabs,
411
  parse_texts,
412
+ prepare_files,
413
  speech_form,
414
  split_speech,
 
415
  ):
416
  mo.stop(mode_tabs.value == "Speech vs Non-Speech" and speech_form.value is None)
417
  mo.stop(mode_tabs.value == "Category Comparison" and category_form.value is None)
 
419
  validation_messages: list[str] = []
420
 
421
  if mode_tabs.value == "Speech vs Non-Speech":
422
+ defaults = [
423
+ "e-r-eddison_the-worm-ouroboros_advanced.txt",
424
+ "h-g-wells_the-wonderful-visit_advanced.txt",
425
+ ]
426
+ df_pre = prepare_files(
427
+ speech_form.value.get("files_s", []),
428
+ defaults,
429
+ split=True,
430
+ )
431
+ data = df_pre.rename(columns={"raw_text": "text"})
432
+ # use the speech‐vs‐non‐speech flag as our category
433
+ data["category"] = data["speech_type"]
434
  mo.md(
435
  f"## Data preview (speech vs non-speech)\n"
436
+ f"{mo.ui.table(data, selection=None)}"
437
  )
 
438
  data_form = SimpleNamespace(
439
  value={
440
  "category_name": "Speech vs Non-speech",
 
444
  )
445
  elif category_form.value is not None and mode_tabs.value == "Category Comparison":
446
  # Category vs Category
 
447
  if category_form.value["label_a"] == category_form.value["label_b"]:
448
  validation_messages.append(
449
  "⚠️ **警告**: グループAとBのラベルが同じです。AとBは異なるラベルを設定してください。\n"
 
454
  "ℹ️ ファイルが未指定のため、デフォルトサンプルを使用しています。\n"
455
  )
456
 
457
+ defaults_a = ["e-r-eddison_the-worm-ouroboros_advanced.txt"]
458
+ df_a = prepare_files(
459
+ category_form.value["files_a"],
460
+ defaults_a,
461
+ split=split_speech.value,
462
+ )
463
+ df_a["category"] = (
464
+ [category_form.value["label_a"]] * len(df_a)
465
+ if category_form.value["files_a"]
466
+ else [category_form.value["label_a"]] * len(df_a)
467
+ )
 
 
 
 
 
468
 
469
+ defaults_b = ["h-g-wells_the-wonderful-visit_advanced.txt"]
470
+ df_b = prepare_files(
471
+ category_form.value["files_b"],
472
+ defaults_b,
473
+ split=split_speech.value,
474
+ )
475
+ df_b["category"] = [category_form.value["label_b"]] * len(df_b)
476
 
477
+ data = pd.concat([df_a, df_b], ignore_index=True)
478
+ # tokenize text if not already (optional)
479
+ data["text"] = parse_texts(list(data["raw_text"]))
480
+ data_form = category_form
 
481
  else:
482
  data = None
483
  validation_messages.append(
 
494
  解析済テキスト一覧:
495
  {
496
  mo.ui.table(
497
+ data,
498
+ selection=None,
499
+ format_mapping={"text": lambda s: s[:20] + "..."},
500
  )
501
  if (data is not None and not data.empty)
502
  else "No data"
 
537
  mo.stop(sampling_form.value is None)
538
 
539
  with mo.status.spinner("コーパスをサンプリング中…"):
540
+ # chunk the DataFrame
541
+ chunk_df = chunk_texts(data, sampling_form.value["chunk_size"])
542
+ # optional subsampling
 
 
 
 
543
  if sample_frac.value < 1.0:
544
+ chunk_df = chunk_df.sample(frac=sample_frac.value, random_state=RANDOM_SEED)
545
+
546
+ texts = chunk_df["text"].tolist()
547
+ cats = chunk_df["category"].tolist()
548
+ fnames = chunk_df["chunk_label"].tolist()
549
+ authors = chunk_df["author"].tolist()
550
+ works = chunk_df["work"].tolist()
551
+ speech_types = chunk_df["speech_type"].tolist()
552
+
553
+ corpus = build_corpus_cached(texts, cats)
554
+ return authors, cats, corpus, fnames, speech_types, texts, works
 
555
 
556
 
557
  @app.cell
 
621
 
622
  @app.cell
623
  def _():
624
+ min_df_setting = mo.ui.slider(
625
+ start=0.0,
626
+ stop=1.0,
627
+ step=0.05,
628
+ value=0.25,
629
+ show_value=True,
630
+ label="Minimum proportion of samples feature appears in",
631
+ )
632
+ max_df_setting = mo.ui.slider(
633
+ start=0.0,
634
+ stop=1.0,
635
+ step=0.05,
636
+ value=0.8,
637
+ show_value=True,
638
+ label="Maximum proportion of samples feature appears in",
639
+ )
640
+ max_features_setting = mo.ui.slider(
641
+ start=10,
642
+ stop=10_000,
643
+ step=1,
644
+ value=100,
645
+ show_value=True,
646
+ label="Maximum number of features to use",
647
+ )
648
 
649
+ mo.vstack(
650
+ [
651
+ mo.md(
652
+ "### 素性設定\n\nどのような単語を分析に使用するかを下記のスライダーで決めます。標準では、ほとんど全ての文章に現る単語、または極端に少ない文章にしか現れない単語が除外されています。そのうえで、$\\mathrm{tfidf}$の値上位100件まで素性としています。"
653
+ ),
654
+ min_df_setting,
655
+ max_df_setting,
656
+ max_features_setting,
657
+ ]
658
+ )
659
  return max_df_setting, max_features_setting, min_df_setting
660
 
661
 
 
670
  train_scikit_cached,
671
  ):
672
  scikit_corpus, tfidf_X, vectorizer, chunk_cats, chunk_fnames = train_scikit_cached(
673
+ texts,
674
+ cats,
675
+ fnames,
676
+ min_df=min_df_setting.value,
677
+ max_df=max_df_setting.value,
678
+ max_features=max_features_setting.value,
679
  )
680
  return chunk_cats, chunk_fnames, tfidf_X, vectorizer
681
 
 
702
  idf_formula = rf"$\mathrm{{idf}}(t,D)=\log{{\frac{{N}}{{{D_formula}}}}}$"
703
  tf_formula = r"${\displaystyle \mathrm {tf} (t,d)=\textrm{number of times }t\textrm{ appears in }d}$"
704
 
705
+ X_df = pd.DataFrame(
706
+ X_train.toarray(),
707
+ index=chunk_fnames,
708
+ columns=vectorizer.get_feature_names_out(),
709
+ )
710
 
711
  mo.md(rf"""
712
  ### サンプルと素性の行列
 
725
  - ${{\displaystyle N}}$: total number of documents in the corpus ${{\displaystyle N={{|D|}}}}$
726
  - ${D_formula}$: number of documents with $t$
727
 
728
+ {mo.ui.table(X_df, selection=None)}
729
  """)
730
+ return (X_df,)
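
The vectorizer inside `train_scikit_cached` is not shown in this diff; as a rough sketch, the `min_df` / `max_df` / `max_features` slider values passed to it correspond to scikit-learn's `TfidfVectorizer` parameters roughly like this (the toy documents and row labels are made up):

```python
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

docs = [
    "the worm ouroboros speech chunk",
    "the wonderful visit narration chunk",
    "the worm ouroboros narration chunk",
]

vec = TfidfVectorizer(
    min_df=0.25,       # drop terms appearing in fewer than 25% of chunks
    max_df=0.8,        # drop terms appearing in more than 80% of chunks
    max_features=100,  # then keep at most the 100 most frequent terms
)
X_train = vec.fit_transform(docs)  # sparse samples-by-features matrix

# Same layout as the table above: rows are chunk labels, columns are terms.
X_df = pd.DataFrame(
    X_train.toarray(),
    index=["ERE-TWO(speech)#1", "HGW-TWV(non-speech)#1", "ERE-TWO(non-speech)#last"],
    columns=vec.get_feature_names_out(),
)
print(X_df.round(2))
```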
731
 
732
 
733
  @app.cell
 
735
  X = tfidf_X.toarray() if hasattr(tfidf_X, "toarray") else tfidf_X
736
  feature_names = vectorizer.get_feature_names_out()
737
 
738
+ model = pca(normalize=False, n_components=3)
739
  results = model.fit_transform(
740
  X,
741
  col_labels=feature_names,
 
754
  figsize=(12, 8),
755
  fontsize=12,
756
  s=20,
757
+ arrowdict={"alpha": 0.0},
758
  PC=[0, 1, 2] if three_switch.value else [0, 1],
759
  )
760
  # labels=np.array(chunk_fnames)
 
763
  mo.vstack(
764
  [
765
  mo.md(
766
+ r"""## Principal Components Analysis / 主成分分析
767
+
768
+ [Principal Components Analysis](https://erdogant.github.io/pca/pages/html/index.html) (PCA)は、$\mathrm{{tfidf}}$スコアを連続的な数値データとして扱い、データセット内の分散を最も多く説明する単語の線形結合を特定します。この分析により、以下の点が明らかになります。
769
+
770
+ - 主成分によって会話文と地の文(あるいは他の分析カテゴリ)を最も効果的に区別する単語の組み合わせが判明します。
771
+ - 会話文と地の文サンプル間の分散に最も寄与する共起語彙パターン、および判別力の高い語彙が特定されます。
772
+ - PCAは傾度に沿った線形関係を仮定するため、言語スタイルの緩やかな変化も示されます。
773
+ - $\mathrm{{tfidf}}$スコアの連続性を保持したまま、次元削減が実現されます。
774
  """
775
  ),
776
  mo.mpl.interactive(plt.gcf()),
 
780
  return
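
A standalone sketch of the pca-library calls used in the fitting and biplot cells above, run on a random stand-in for the tfidf matrix (the toy shape and term labels are assumptions):

```python
import numpy as np
from pca import pca

rng = np.random.default_rng(0)
X_toy = rng.random((20, 6))              # 20 chunks x 6 tfidf features (toy data)
terms = [f"term_{i}" for i in range(6)]  # stand-in feature names

model = pca(normalize=False, n_components=3)
results = model.fit_transform(X_toy, col_labels=terms)  # transformed PCs, loadings, explained variance

# 2-D biplot: samples as points, feature loadings as labelled arrows, as in the cell above.
model.biplot(figsize=(12, 8), fontsize=12, s=20, PC=[0, 1])
```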
781
 
782
 
783
+ @app.cell
784
+ def _():
785
+ mo.md(
786
+ r"""
787
+ ## Correspondence Analysis / 対応分析
788
+
789
+ 対応分析(CA)のbiplotでは、主成分分析のbiplotと似ているような分析として、サンプルと素性の関係が観察できますが、いくつかの違いがあります。
790
+ 対応分析を行うには、$\mathrm{tfidf}$行列をカテゴリカルな形式の分割表(contingency table)に変換する必要があります。次に、そのデータを連関表として解析します。この手法により、
791
+
792
+ - 会話文と地の文カテゴリと特定単語出現パターンとの関連性を検討
793
+ - サンプルのカテゴリと単語特徴量との離散的な関連として関係性を示すバイプロットを作成
794
  + - 各カテゴリに最も特徴的な単語を、PCAでのユークリッド距離ではなくカイ二乗距離を用いて抽出
795
+ - サンプルと単語の両方をランダムな観測値として対称的に扱うことができる
796
+
797
+ といった分析が可能となります。
798
+ """
799
+ )
800
+ return
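
A minimal standalone sketch of the CA step described above, using prince on a small hypothetical category-by-term count table (the app builds its own table by grouping the tfidf matrix in the cells that follow; the counts and group names here are made up):

```python
import pandas as pd
import prince

# Hypothetical contingency table: four category groups x four terms.
ct = pd.DataFrame(
    {
        "said":  [40, 5, 35, 4],
        "thou":  [25, 2, 1, 0],
        "hill":  [3, 30, 2, 25],
        "angel": [0, 1, 6, 20],
    },
    index=["ERE|speech", "ERE|non-speech", "HGW|speech", "HGW|non-speech"],
)

ca = prince.CA(n_components=2, random_state=42).fit(ct)
print(ca.row_coordinates(ct))     # where each category group sits in CA space
print(ca.column_coordinates(ct))  # where each term sits in the same space
```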
801
+
802
+
803
+ @app.cell
804
+ def _(X_df, authors, chunk_cats, speech_types, works):
805
+ import itertools
806
+
807
+ # Build a small DF to test each dim‐combo
808
+ df_chk = X_df.copy()
809
+ df_chk["author"] = authors
810
+ df_chk["category"] = chunk_cats
811
+ df_chk["work"] = works
812
+ df_chk["speech_type"] = speech_types
813
+
814
+ dims_all = ["author", "category", "work", "speech_type"]
815
+ options: list[str] = []
816
+ # Enumerate all non-empty combinations; keep those yielding >2 groups
817
+ for r in range(1, len(dims_all) + 1):
818
+ for combo in itertools.combinations(dims_all, r):
819
+ if df_chk.groupby(list(combo)).ngroups > 2:
820
+ options.append("|".join(combo))
821
+
822
+ mo.stop(
823
+ not options,
824
+ "No category combination yielding more than two rows, so cannot perform CA.",
825
+ )
826
+
827
+ ca_group_by = mo.ui.dropdown(
828
+ options=options,
829
+ value=options[0],
830
+ label="Group by (dims that yield >2 rows)",
831
+ full_width=True,
832
+ )
833
+ ca_group_by
834
+ return (ca_group_by,)
835
+
836
+
837
+ @app.cell
838
+ def _(X_df, authors, ca_group_by, chunk_cats, speech_types, works):
839
+ df = X_df.copy()
840
+ df["author"] = authors
841
+ df["category"] = chunk_cats
842
+ df["work"] = works
843
+ df["speech_type"] = speech_types
844
+
845
+ # split "author|work" (etc.) into the actual list of dims
846
+ dims = ca_group_by.value.split("|")
847
+
848
+ # sum only numeric (feature) columns by group
849
+ num_cols = df.select_dtypes(include="number").columns.tolist()
850
+ ct = df.groupby(dims)[num_cols].sum()
851
+
852
+ # flatten MultiIndex into a single‐level index
853
+ if len(dims) > 1:
854
+ ct.index = ["|".join(idx) for idx in ct.index]
855
+ else:
856
+ ct.index = ct.index.astype(str)
857
+
858
+ mo.md(f"""
859
+ ### カテゴリと素性の行列
860
+
861
+ {mo.ui.table(ct, selection=None)}
862
+ """)
863
+ return (ct,)
864
+
865
+
866
+ @app.cell
867
+ def _(ct):
868
+ ca_model = prince.CA(
869
+ n_components=2,
870
+ n_iter=10,
871
+ copy=True,
872
+ check_input=True,
873
+ engine="sklearn",
874
+ random_state=RANDOM_SEED,
875
+ )
876
+ ca_model = ca_model.fit(ct)
877
+ ca_model.plot(
878
+ ct,
879
+ x_component=0,
880
+ y_component=1,
881
+ show_row_markers=True,
882
+ show_column_markers=True,
883
+ show_row_labels=True,
884
+ show_column_labels=True,
885
+ )
886
+ return
887
+
888
+
889
  @app.cell
890
  def _():
891
  linkage_methods = mo.ui.dropdown(
 
913
  d_stack = mo.hstack([linkage_methods, distance_metrics], justify="start")
914
 
915
  mo.md(f"""
916
+ ## Hierarchical Clustering / 階層的クラスタリング
917
+
918
+ 階層的クラスタリングは、(予め設定したカテゴリに関わらず)サンプル間の$\\mathrm{{tfidf}}$単語使用パターンの類似性に基づき、直接的にグループ化を行います。
919
+
920
+ - サンプル同士が異なる類似度レベルでどのようにグループ化されるかを示す樹状図(デンドログラム)を生成
921
+ - サンプル間の距離計算において、定めた全ての$\\mathrm{{tfidf}}$特徴量を保持
922
+ - PCA/CAと比べ、特徴量間の関係ではなく、サンプル間の関係性に着目(ただし、行列を回転し、逆の分析もできる)
923
+ - 高次元$\\mathrm{{tfidf}}$ベクトル間の類似度を測定するために、ユークリッド距離やコサイン距離といった距離尺度を用いる
924
+ - 類似した単語使用パターンを有するサンプル群の離散的なクラスタを構築
925
 
926
  {d_stack}
927
  {dendrogram_height}
 
945
  distfun=distfun,
946
  linkagefun=linkagefun,
947
  )
948
+ fig.update_layout(width=800, height=dendrogram_height.value, title=f"Dendrogram using {linkage_methods.value} link method and {distance_metrics.value} distance on samples",)
949
 
950
  mo.ui.plotly(fig)
951
+ return distfun, ff, linkagefun
952
+
953
+
954
+ @app.cell
955
+ def _(
956
+ X,
957
+ X_df,
958
+ dendrogram_height,
959
+ distance_metrics,
960
+ distfun,
961
+ ff,
962
+ linkage_methods,
963
+ linkagefun,
964
+ ):
965
+ fig_T = ff.create_dendrogram(
966
+ X.T,
967
+ orientation="left",
968
+ labels=X_df.columns,
969
+ distfun=distfun,
970
+ linkagefun=linkagefun,
971
+ )
972
+ fig_T.update_layout(width=800, height=dendrogram_height.value, title=f"Dendrogram using {linkage_methods.value} link method and {distance_metrics.value} distance on features")
973
+
974
+ mo.ui.plotly(fig_T)
975
  return
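
The same clustering can be reproduced with scipy directly; the `distfun` / `linkagefun` arguments passed to `ff.create_dendrogram` above wrap calls like these. "cosine" with "average" linkage is just one of the selectable combinations, and the toy matrix stands in for the tfidf features:

```python
import numpy as np
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import pdist

rng = np.random.default_rng(0)
X_toy = rng.random((8, 5))                 # 8 chunks x 5 tfidf features (toy data)
labels = [f"chunk_{i}" for i in range(8)]  # stand-in chunk labels

dists = pdist(X_toy, metric="cosine")      # condensed pairwise distance matrix
Z = linkage(dists, method="average")       # hierarchical merge tree

fig, ax = plt.subplots(figsize=(8, 4))
dendrogram(Z, labels=labels, orientation="left", ax=ax)
ax.set_title("Average linkage on cosine distances (toy data)")
plt.show()
```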
976
 
977
 
 
995
  return
996
 
997
 
998
+ @app.cell
999
+ def _():
1000
+ mo.md(
1001
+ r"""
1002
+ # まとめ
1003
+
1004
+ これ3つのアプローチをすべて用いることで、異なる視点を得ることができます:
1005
+
1006
+ - **階層的クラ���タリング**: データ内の"自然な"グループ分けを明らかにします。例えば、特定の著者の話し方のパターンが一緒にクラスタ化されたり、叙述部分と会話部分が明確に異なるグループを形成したりすることが考えられます。
1007
+ - **対応分析**: カテゴリ間の関連性を明らかにします。例えば、異なる著者や発話タイプに最も特徴的な単語がどれであるかを調べることができます。
1008
+ - **主成分分析**: 最も識別力の高い単語の組み合わせを特定します。例えば、どの語彙パターンが会話文/地の文や著者間の区別に最も寄与しているかを示すことができます。
1009
+ """
1010
+ )
1011
+ return
1012
+
1013
+
1014
  @app.cell
1015
  def _():
1016
  return
pyproject.toml CHANGED
@@ -12,6 +12,7 @@ dependencies = [
12
  "pandas>=2.3.0",
13
  "pca>=2.10.0",
14
  "plotly>=6.2.0",
 
15
  "pyarrow>=20.0.0",
16
  "scattertext==0.2.2",
17
  "scikit-learn==1.7.0",
 
12
  "pandas>=2.3.0",
13
  "pca>=2.10.0",
14
  "plotly>=6.2.0",
15
+ "prince>=0.16.0",
16
  "pyarrow>=20.0.0",
17
  "scattertext==0.2.2",
18
  "scikit-learn==1.7.0",
uv.lock CHANGED
@@ -953,6 +953,20 @@ wheels = [
953
  { url = "https://files.pythonhosted.org/packages/fa/8c/d3e30f80b2ef21f267f09f0b7d18995adccc928ede5b73ea3fe54e1303f4/preshed-3.0.10-cp313-cp313-win_amd64.whl", hash = "sha256:97e0e2edfd25a7dfba799b49b3c5cc248ad0318a76edd9d5fd2c82aa3d5c64ed", size = 115769, upload-time = "2025-05-26T15:18:21.842Z" },
954
  ]
955
 
 
 
956
  [[package]]
957
  name = "psutil"
958
  version = "7.0.0"
@@ -1276,6 +1290,7 @@ dependencies = [
1276
  { name = "pandas" },
1277
  { name = "pca" },
1278
  { name = "plotly" },
 
1279
  { name = "pyarrow" },
1280
  { name = "scattertext" },
1281
  { name = "scikit-learn" },
@@ -1293,6 +1308,7 @@ requires-dist = [
1293
  { name = "pandas", specifier = ">=2.3.0" },
1294
  { name = "pca", specifier = ">=2.10.0" },
1295
  { name = "plotly", specifier = ">=6.2.0" },
 
1296
  { name = "pyarrow", specifier = ">=20.0.0" },
1297
  { name = "scattertext", specifier = "==0.2.2" },
1298
  { name = "scikit-learn", specifier = "==1.7.0" },
 
953
  { url = "https://files.pythonhosted.org/packages/fa/8c/d3e30f80b2ef21f267f09f0b7d18995adccc928ede5b73ea3fe54e1303f4/preshed-3.0.10-cp313-cp313-win_amd64.whl", hash = "sha256:97e0e2edfd25a7dfba799b49b3c5cc248ad0318a76edd9d5fd2c82aa3d5c64ed", size = 115769, upload-time = "2025-05-26T15:18:21.842Z" },
954
  ]
955
 
956
+ [[package]]
957
+ name = "prince"
958
+ version = "0.16.0"
959
+ source = { registry = "https://pypi.org/simple" }
960
+ dependencies = [
961
+ { name = "altair" },
962
+ { name = "pandas" },
963
+ { name = "scikit-learn" },
964
+ ]
965
+ sdist = { url = "https://files.pythonhosted.org/packages/ae/bd/fde5962680ad17f8402848a6849344717c3ee341d47f60864f2d78bf720e/prince-0.16.0.tar.gz", hash = "sha256:8b3b9e74fc84ad066a1e6ef4fc076a55d80b7a46db2541a76902e47951c39b16", size = 414243, upload-time = "2025-03-09T21:38:43.631Z" }
966
+ wheels = [
967
+ { url = "https://files.pythonhosted.org/packages/18/d5/b4480a0f381cbbcfad31f4d118732ab717216857508a730938ee615669a1/prince-0.16.0-py3-none-any.whl", hash = "sha256:7e21a78d4dd06ca3ec526ee714a50b349f26de3fca6b79664150a951b31688f3", size = 417759, upload-time = "2025-03-09T21:38:41.001Z" },
968
+ ]
969
+
970
  [[package]]
971
  name = "psutil"
972
  version = "7.0.0"
 
1290
  { name = "pandas" },
1291
  { name = "pca" },
1292
  { name = "plotly" },
1293
+ { name = "prince" },
1294
  { name = "pyarrow" },
1295
  { name = "scattertext" },
1296
  { name = "scikit-learn" },
 
1308
  { name = "pandas", specifier = ">=2.3.0" },
1309
  { name = "pca", specifier = ">=2.10.0" },
1310
  { name = "plotly", specifier = ">=6.2.0" },
1311
+ { name = "prince", specifier = ">=0.16.0" },
1312
  { name = "pyarrow", specifier = ">=20.0.0" },
1313
  { name = "scattertext", specifier = "==0.2.2" },
1314
  { name = "scikit-learn", specifier = "==1.7.0" },