Spaces: Sleeping

Bor Hodošček committed · Commit 17327cb · Parent(s): dd4089f

feat: improved pca, ca, hclust code and support for both modes

Browse files:
- app.py +388 -186
- pyproject.toml +1 -0
- uv.lock +16 -0
app.py
CHANGED
@@ -9,6 +9,7 @@
 # "pandas==2.3.0",
 # "pca==2.10.0",
 # "plotly==6.2.0",
+# "prince==0.16.0",
 # "pyarrow",
 # "scattertext==0.2.2",
 # "scikit-learn==1.7.0",
@@ -24,11 +25,10 @@
 import marimo
 
 __generated_with = "0.14.9"
-app = marimo.App(width="full", app_title="Scattertext on
+app = marimo.App(width="full", app_title="Scattertext on English novels")
 
 with app.setup:
     import marimo as mo
-    import itertools
     import spacy
     import pandas as pd
     import scipy
@@ -37,6 +37,7 @@ with app.setup:
     import re
     import scattertext as st
     from pca import pca
+    import prince
     import matplotlib.pyplot as plt
     from pathlib import Path
     from types import SimpleNamespace
@@ -61,7 +62,7 @@ def function_export():
 
     @mo.cache
     def parse_texts(texts: list[str], nlp=load_nlp()) -> list[str]:
-        """Tokenize English text via spaCy and emit a whitespace
+        """Tokenize English text via spaCy and emit a whitespace-joined string."""
        return [" ".join(tok.text for tok in doc) for doc in nlp.pipe(texts)]
 
     @mo.cache
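For readers unfamiliar with this step, a standalone sketch of what `parse_texts` does (hypothetical snippet, not part of the commit; the printed output is approximate):

```python
# Standalone sketch of parse_texts: run texts through spaCy's pipeline and
# re-join the tokens with spaces, so that downstream whitespace-based
# tokenizers (scattertext, scikit-learn vectorizers) see clean word tokens.
import spacy

nlp = spacy.load("en_core_web_sm")  # the model named in the app's intro

texts = ["The angel didn't answer Mr. Wells."]
tokenized = [" ".join(tok.text for tok in doc) for doc in nlp.pipe(texts)]
print(tokenized[0])
# -> "The angel did n't answer Mr. Wells ."
```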
@@ -84,37 +85,74 @@ def function_export():
         .compact(st.AssociationCompactor(2000))
     )
 
+    def _strip_advanced(fn: str) -> str:
+        """
+        Strip trailing '_advanced' from a filename stem.
+        """
+        from pathlib import Path
+
+        stem = Path(fn).stem
+        return stem.replace("_advanced", "")
+
+    def make_short_label(fn: str) -> str:
+        """
+        Generate an initials-based short label from filename.
+        E.g., 'e-r-eddison_the-worm-ouroboros.txt' -> 'ERE-TWO'.
+        """
+        stem = _strip_advanced(fn)
+        author, title = stem.split("_", 1)
+        initials = lambda s: "".join(part[0].upper() for part in s.split("-"))
+        return f"{initials(author)}-{initials(title)}"
+
+    def format_chunk_label(
+        fn: str,
+        category: str,
+        speech_type: str,
+        chunk_idx: int | str,
+    ) -> str:
+        """
+        Create a chunk label 'SHORTLABEL(CATEGORY[-speech_type])#INDEX'.
+        """
+        sl = make_short_label(fn)
+        # append speech_type only if it differs from category and isn't 'mixed'
+        if speech_type and speech_type != "mixed" and speech_type != category:
+            label = f"{category}-{speech_type}"
+        else:
+            label = category
+        return f"{sl}({label})#{chunk_idx}"
+
     @mo.cache
     def chunk_texts(
-        …
-        categories: list[str],
-        filenames: list[str],
+        df: pd.DataFrame,
         chunk_size: int = 2000,
-    ) ->
-        """
-        … (old body truncated in the page source)
+    ) -> pd.DataFrame:
+        """
+        Turn each row of df into token chunks of size chunk_size,
+        preserving category, filename, author, work, and producing
+        a `chunk_label`.
+        """
+        records: list[dict] = []
+        for _, row in df.iterrows():
+            tokens = row["text"].split()
+            n_chunks = (len(tokens) + chunk_size - 1) // chunk_size
+            for idx in range(n_chunks):
+                seg = " ".join(tokens[idx * chunk_size : (idx + 1) * chunk_size])
+                label_idx = idx + 1 if idx + 1 < n_chunks else "last"
+                records.append({
+                    "text": seg,
+                    "category": row["category"],
+                    "speech_type": row["speech_type"],
+                    "filename": row["filename"],
+                    "author": row["author"],
+                    "work": row["work"],
+                    "chunk_label": format_chunk_label(
+                        row["filename"],
+                        row["category"],
+                        row["speech_type"],
+                        label_idx,
+                    ),
+                })
+        return pd.DataFrame(records)
 
     @mo.cache
     def train_scikit_cached(
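A quick, hypothetical check of what the new labelling helpers produce for the bundled sample files (outputs inferred by tracing the code above, not taken from the page):

```python
# Hypothetical REPL check of the new label helpers defined in this commit.
fn = "e-r-eddison_the-worm-ouroboros_advanced.txt"

make_short_label(fn)
# stem -> 'e-r-eddison_the-worm-ouroboros'; initials of author and title:
# -> 'ERE-TWO'

format_chunk_label(fn, "E R Eddison", "speech", 3)
# speech_type differs from category and isn't 'mixed', so it is appended:
# -> 'ERE-TWO(E R Eddison-speech)#3'

format_chunk_label(fn, "speech", "speech", "last")
# speech_type equals category, so only the category is shown; the final
# (usually short) chunk gets the index 'last':
# -> 'ERE-TWO(speech)#last'
```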
@@ -182,51 +220,63 @@ def function_export():
         texts = [Path(fn).read_text(encoding="utf-8") for fn in defaults]
         return names, texts
 
-    def make_speech_df(
-        … (old signature and docstring truncated in the page source)
+    def prepare_files(
+        uploaded: list, defaults: list[str], split: bool = False
+    ) -> pd.DataFrame:
         """
+        Ingest uploaded vs. default files into a DataFrame with columns:
+        ['filename','raw_text','category' (if split),'author','work'].
         """
-        …
-            "e-r-eddison_the-worm-ouroboros_advanced.txt",
-            "h-g-wells_the-wonderful-visit_advanced.txt",
-        ]
-        names, raws = _load_files(uploaded_files, defaults)
-
-        speech_segs, nonspeech_segs = [], []
-        speech_files, nonspeech_files = [], []
+        import pandas as pd
+
+        names, raws = _load_files(uploaded, defaults)
+        records: list[dict] = []
         for name, raw in zip(names, raws):
-            … (old loop body truncated in the page source)
+            if split:
+                sp, ns = split_speech_text(raw)
+                records.append(
+                    {
+                        "filename": name,
+                        "raw_text": sp,
+                        "speech_type": "speech",
+                    }
+                )
+                records.append(
+                    {
+                        "filename": name,
+                        "raw_text": ns,
+                        "speech_type": "non-speech",
+                    }
+                )
+            else:
+                records.append(
+                    {
+                        "filename": name,
+                        "raw_text": raw,
+                        "speech_type": "mixed",
+                    }
+                )
 
-        …
-                "filename": speech_files + nonspeech_files,
-                "text": tok_sp + tok_ns,
-                "speech_ratio": ratios_full,
-            }
-        )
+        df_p = pd.DataFrame(records)
+        # infer author & work from the file's true stem (no extension, no "_advanced")
+        def _extract_auth_work(fn: str) -> tuple[str, str]:
+            base = Path(fn).stem.replace("_advanced", "")
+            auth, *rest = base.split("_", 1)
+            work_raw = rest[0] if rest else base
+            return (
+                auth.replace("-", " ").title(),
+                work_raw.replace("-", " ").title(),
+            )
+
+        aw = df_p["filename"].apply(_extract_auth_work)
+        df_p["author"], df_p["work"] = zip(*aw)
+        return df_p
 
     return (
         build_corpus_cached,
         chunk_texts,
-        make_speech_df,
         parse_texts,
+        prepare_files,
         train_scikit_cached,
    )
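To make the new ingestion path concrete, a hypothetical call with no uploads (values inferred from the code above; column layout illustrative):

```python
# Hypothetical sketch: prepare_files with no uploads falls back to the
# bundled defaults; author/work are parsed out of the filename stem.
df = prepare_files(
    uploaded=[],
    defaults=[
        "e-r-eddison_the-worm-ouroboros_advanced.txt",
        "h-g-wells_the-wonderful-visit_advanced.txt",
    ],
    split=False,
)
print(df[["filename", "speech_type", "author", "work"]].to_string(index=False))
#                                     filename speech_type      author                work
#  e-r-eddison_the-worm-ouroboros_advanced.txt       mixed E R Eddison  The Worm Ouroboros
#   h-g-wells_the-wonderful-visit_advanced.txt       mixed   H G Wells The Wonderful Visit
```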
@@ -250,7 +300,7 @@ def intro():
     2. データ内容を確認・修正
     3. チャンク&サンプリング設定
     4. Scattertextによる可視化
-    5. PCAのbiplot
+    5. PCAとCAのbiplot、階層的クラスタリングのデンドログラムでサンプル、カテゴリと素性の分布と関係を観察
     6. 気になるサンプルをドロップダウンで選択し、内容を確認
 
     > 単語分割には、[spaCy](https://spacy.io/)([en_core_web_sm](https://spacy.io/models/en#en_core_web_sm)モデル)を使用しています。
@@ -274,7 +324,9 @@ def data_settings():
         full_width=True,
     )
     files_a = mo.ui.file(
-        label="Aのファイルアップロード(UTF-8、.txt形式)",
+        label="Aのファイルアップロード(UTF-8、.txt形式)",
+        multiple=True,
+        kind="area",
     )
     ### Category form
     label_b = mo.ui.text(
@@ -284,7 +336,9 @@ def data_settings():
         full_width=True,
     )
     files_b = mo.ui.file(
-        label="Bのファイルアップロード(UTF-8、.txt形式)",
+        label="Bのファイルアップロード(UTF-8、.txt形式)",
+        multiple=True,
+        kind="area",
     )
     split_speech = mo.ui.switch(
         label="Split speech vs non-speech segments?",
@@ -353,12 +407,11 @@ def data_settings():
 @app.cell
 def data_check(
     category_form,
-    make_speech_df,
     mode_tabs,
     parse_texts,
+    prepare_files,
     speech_form,
     split_speech,
-    split_speech_text,
 ):
     mo.stop(mode_tabs.value == "Speech vs Non-Speech" and speech_form.value is None)
     mo.stop(mode_tabs.value == "Category Comparison" and category_form.value is None)
@@ -366,12 +419,22 @@ def data_check(
     validation_messages: list[str] = []
 
     if mode_tabs.value == "Speech vs Non-Speech":
-        … (old branch truncated in the page source)
+        defaults = [
+            "e-r-eddison_the-worm-ouroboros_advanced.txt",
+            "h-g-wells_the-wonderful-visit_advanced.txt",
+        ]
+        df_pre = prepare_files(
+            speech_form.value.get("files_s", []),
+            defaults,
+            split=True,
+        )
+        data = df_pre.rename(columns={"raw_text": "text"})
+        # use the speech-vs-non-speech flag as our category
+        data["category"] = data["speech_type"]
         mo.md(
             f"## Data preview (speech vs non-speech)\n"
-            f"{mo.ui.table(data, selection=
+            f"{mo.ui.table(data, selection=None)}"
         )
-        # fake data_form so all scattertext cells see the same API
         data_form = SimpleNamespace(
             value={
                 "category_name": "Speech vs Non-speech",
@@ -381,7 +444,6 @@ def data_check(
         )
     elif category_form.value is not None and mode_tabs.value == "Category Comparison":
         # Category vs Category
-
         if category_form.value["label_a"] == category_form.value["label_b"]:
             validation_messages.append(
                 "⚠️ **警告**: グループAとBのラベルが同じです。AとBは異なるラベルを設定してください。\n"
@@ -392,89 +454,30 @@ def data_check(
             "ℹ️ ファイルが未指定のため、デフォルトサンプルを使用しています。\n"
         )
 
-        … (old Group A loading truncated in the page source)
-            category_a_names = [default_a]
-
-        if split_speech.value:
-            texts_list = list(category_a_texts)
-            names_list = list(category_a_names)
-            expanded_txt, expanded_names = [], []
-            for nm, raw in zip(names_list, texts_list):
-                sp, ns = split_speech_text(raw)
-                expanded_txt.extend([sp, ns])
-                expanded_names.extend([f"{nm} (speech)", f"{nm} (non-speech)"])
-            category_a_texts, category_a_names = expanded_txt, expanded_names
-
-        # Group B: either uploaded files or default
-        if category_form.value["files_b"]:
-            category_b_texts = (
-                f.contents.decode("utf-8") for f in category_form.value["files_b"]
-            )
-            category_b_names = (f.name for f in category_form.value["files_b"])
-        else:
-            # Default Group B: H. G. Wells: The Wonderful Visit
-            default_b = "h-g-wells_the-wonderful-visit_advanced.txt"
-            category_b_texts = [Path(default_b).read_text(encoding="utf-8")]
-            category_b_names = [default_b]
-
-        # same splitting for B-side
-        if split_speech.value:
-            texts_list = list(category_b_texts)
-            names_list = list(category_b_names)
-            expanded_txt, expanded_names = [], []
-            for nm, raw in zip(names_list, texts_list):
-                sp, ns = split_speech_text(raw)
-                expanded_txt.extend([sp, ns])
-                expanded_names.extend([f"{nm} (speech)", f"{nm} (non-speech)"])
-            category_b_texts, category_b_names = expanded_txt, expanded_names
-
-        # infer categories: use UI labels when files uploaded,
-        # otherwise derive from filename stem
-        # (e.g. "e-r-eddison_..." -> "E R Eddison")
-        if category_form.value["files_a"]:
-            cats_a = [category_form.value["label_a"]] * len(category_a_names)
-        else:
-            cats_a = [
-                Path(fn).stem.split("_", 1)[0].replace("-", " ").title()
-                for fn in category_a_names
-            ]
-
-        if category_form.value["files_b"]:
-            cats_b = [category_form.value["label_b"]] * len(category_b_names)
-        else:
-            cats_b = [
-                Path(fn).stem.split("_", 1)[0].replace("-", " ").title()
-                for fn in category_b_names
-            ]
-
-        data = pd.DataFrame(
-            {
-                "category": cats_a + cats_b,
-                "filename": itertools.chain(category_a_names, category_b_names),
-                "text": itertools.chain(category_a_texts, category_b_texts),
-            }
-        )
-        … (old trailing lines truncated in the page source)
+        defaults_a = ["e-r-eddison_the-worm-ouroboros_advanced.txt"]
+        df_a = prepare_files(
+            category_form.value["files_a"],
+            defaults_a,
+            split=split_speech.value,
+        )
+        df_a["category"] = (
+            [category_form.value["label_a"]] * len(df_a)
+            if category_form.value["files_a"]
+            else [category_form.value["label_a"]] * len(df_a)
+        )
 
+        defaults_b = ["h-g-wells_the-wonderful-visit_advanced.txt"]
+        df_b = prepare_files(
+            category_form.value["files_b"],
+            defaults_b,
+            split=split_speech.value,
+        )
+        df_b["category"] = [category_form.value["label_b"]] * len(df_b)
 
+        data = pd.concat([df_a, df_b], ignore_index=True)
+        # tokenize text if not already (optional)
+        data["text"] = parse_texts(list(data["raw_text"]))
+        data_form = category_form
     else:
         data = None
         validation_messages.append(
@@ -491,7 +494,9 @@ def data_check(
     解析済テキスト一覧:
     {
         mo.ui.table(
-            data,
+            data,
+            selection=None,
+            format_mapping={"text": lambda s: s[:20] + "..."},
         )
         if (data is not None and not data.empty)
         else "No data"
@@ -532,26 +537,21 @@ def _(build_corpus_cached, chunk_texts, data, sample_frac, sampling_form):
     mo.stop(sampling_form.value is None)
 
     with mo.status.spinner("コーパスをサンプリング中…"):
-        … (old chunking code truncated in the page source)
-            list(data.filename),
-            sampling_form.value["chunk_size"],
-        )
-
+        # chunk the DataFrame
+        chunk_df = chunk_texts(data, sampling_form.value["chunk_size"])
+        # optional subsampling
         if sample_frac.value < 1.0:
-            … (old sampling code truncated in the page source)
-        return cats, corpus, fnames, texts
+            chunk_df = chunk_df.sample(frac=sample_frac.value, random_state=RANDOM_SEED)
+
+        texts = chunk_df["text"].tolist()
+        cats = chunk_df["category"].tolist()
+        fnames = chunk_df["chunk_label"].tolist()
+        authors = chunk_df["author"].tolist()
+        works = chunk_df["work"].tolist()
+        speech_types = chunk_df["speech_type"].tolist()
+
+        corpus = build_corpus_cached(texts, cats)
+        return authors, cats, corpus, fnames, speech_types, texts, works
 
 
 @app.cell
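The subsampling step relies on pandas' `DataFrame.sample`; a tiny, hypothetical demonstration of why the fixed `random_state` matters (`RANDOM_SEED` itself is defined elsewhere in app.py and not shown in this diff):

```python
# Hypothetical demo of the subsampling step: a fixed random_state makes the
# kept subset of chunks reproducible across reruns of the cell.
import pandas as pd

chunk_df = pd.DataFrame(
    {"chunk_label": [f"ERE-TWO(speech)#{i}" for i in range(1, 41)]}
)
subset = chunk_df.sample(frac=0.25, random_state=42)  # keeps 10 of 40 rows
print(len(subset), sorted(subset.index)[:3])
```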
@@ -621,11 +621,41 @@ def _():
 
 @app.cell
 def _():
-    min_df_setting = mo.ui.slider(
-        … (old slider settings truncated in the page source)
+    min_df_setting = mo.ui.slider(
+        start=0.0,
+        stop=1.0,
+        step=0.05,
+        value=0.25,
+        show_value=True,
+        label="Minimum proportion of samples feature appears in",
+    )
+    max_df_setting = mo.ui.slider(
+        start=0.0,
+        stop=1.0,
+        step=0.05,
+        value=0.8,
+        show_value=True,
+        label="Maximum proportion of samples feature appears in",
+    )
+    max_features_setting = mo.ui.slider(
+        start=10,
+        stop=10_000,
+        step=1,
+        value=100,
+        show_value=True,
+        label="Maximum number of features to use",
+    )
 
-    mo.vstack(
-        … (old layout truncated in the page source)
+    mo.vstack(
+        [
+            mo.md(
+                "### 素性設定\n\nどのような単語を分析に使用するかを下記のスライダーで決めます。標準では、ほとんど全ての文章に現る単語、または極端に少ない文章にしか現れない単語が除外されています。そのうえで、$\\mathrm{tfidf}$の値上位100件まで素性としています。"
+            ),
+            min_df_setting,
+            max_df_setting,
+            max_features_setting,
+        ]
+    )
     return max_df_setting, max_features_setting, min_df_setting
@@ -640,7 +670,12 @@ def _(
     train_scikit_cached,
 ):
     scikit_corpus, tfidf_X, vectorizer, chunk_cats, chunk_fnames = train_scikit_cached(
-        texts,
+        texts,
+        cats,
+        fnames,
+        min_df=min_df_setting.value,
+        max_df=max_df_setting.value,
+        max_features=max_features_setting.value,
     )
     return chunk_cats, chunk_fnames, tfidf_X, vectorizer
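The slider values are presumably forwarded to a scikit-learn tf-idf vectorizer inside `train_scikit_cached`; a minimal sketch under that assumption (the parameter names do match scikit-learn's real `TfidfVectorizer` API, but the function's body is not shown in this diff):

```python
# Minimal sketch, assuming train_scikit_cached passes the slider values
# through to sklearn's TfidfVectorizer (an assumption; its body isn't shown).
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(
    min_df=0.25,       # drop terms appearing in fewer than 25% of chunks
    max_df=0.8,        # drop terms appearing in more than 80% of chunks
    max_features=100,  # then keep at most the 100 most frequent remaining terms
)
texts = ["the quick brown fox", "the lazy dog", "the quick dog"]
tfidf_X = vectorizer.fit_transform(texts)  # sparse (n_samples, n_features) matrix
print(vectorizer.get_feature_names_out())  # 'the' is gone: df = 3/3 > 0.8
```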
@@ -667,6 +702,11 @@ def _(X_train, chunk_fnames, vectorizer):
     idf_formula = rf"$\mathrm{{idf}}(t,D)=\log{{\frac{{N}}{{{D_formula}}}}}$"
     tf_formula = r"${\displaystyle \mathrm {tf} (t,d)=\textrm{number of times }t\textrm{ appears in }d}$"
 
+    X_df = pd.DataFrame(
+        X_train.toarray(),
+        index=chunk_fnames,
+        columns=vectorizer.get_feature_names_out(),
+    )
 
     mo.md(rf"""
     ### サンプルと素性の行列
@@ -685,9 +725,9 @@ def _(X_train, chunk_fnames, vectorizer):
     - ${{\displaystyle N}}$: total number of documents in the corpus ${{\displaystyle N={{|D|}}}}$
     - ${D_formula}$: number of documents with $t$
 
-    {mo.ui.table(
+    {mo.ui.table(X_df, selection=None)}
     """)
-    return
+    return (X_df,)
 
 
 @app.cell
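A worked instance of the formulas rendered in that cell (standard tf-idf definitions; the numbers are made up):

```python
# Worked example of the displayed formulas:
#   tf(t, d)  = number of times t appears in d
#   idf(t, D) = log(N / |{d in D : t in d}|)
#   tfidf     = tf * idf
import math

N = 10      # documents (chunks) in the corpus
df_t = 2    # documents containing term t
tf_td = 7   # occurrences of t in one document d

idf = math.log(N / df_t)  # log(5) ≈ 1.609
print(tf_td * idf)        # ≈ 11.27
# (scikit-learn's TfidfVectorizer uses a smoothed variant of this idf.)
```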
@@ -695,7 +735,7 @@ def pca_biplot(chunk_cats, tfidf_X, vectorizer):
     X = tfidf_X.toarray() if hasattr(tfidf_X, "toarray") else tfidf_X
     feature_names = vectorizer.get_feature_names_out()
 
-    model = pca(normalize=
+    model = pca(normalize=False, n_components=3)
     results = model.fit_transform(
         X,
         col_labels=feature_names,
@@ -714,6 +754,7 @@ def _(model, results, three_switch):
         figsize=(12, 8),
         fontsize=12,
         s=20,
+        arrowdict={"alpha": 0.0},
         PC=[0, 1, 2] if three_switch.value else [0, 1],
     )
     # labels=np.array(chunk_fnames)
@@ -722,7 +763,14 @@ def _(model, results, three_switch):
     mo.vstack(
         [
             mo.md(
-                """##
+                r"""## Principal Components Analysis / 主成分分析
+
+                [Principal Components Analysis](https://erdogant.github.io/pca/pages/html/index.html) (PCA)は、$\mathrm{{tfidf}}$スコアを連続的な数値データとして扱い、データセット内の分散を最も多く説明する単語の線形結合を特定します。この分析により、以下の点が明らかになります。
+
+                - 主成分によって会話文と地の文(あるいは他の分析カテゴリ)を最も効果的に区別する単語の組み合わせが判明します。
+                - 会話文と地の文サンプル間の分散に最も寄与する共起語彙パターン、および判別力の高い語彙が特定されます。
+                - PCAは傾度に沿った線形関係を仮定するため、言語スタイルの緩やかな変化も示されます。
+                - $\mathrm{{tfidf}}$スコアの連続性を保持したまま、次元削減が実現されます。
                 """
             ),
             mo.mpl.interactive(plt.gcf()),
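A self-contained sketch of the erdogant `pca` biplot flow used in these cells, run on synthetic data (the keyword names are taken from the diff; the random matrix stands in for the real tf-idf matrix):

```python
# Standalone sketch of the PCA biplot cells, on synthetic data.
import numpy as np
from pca import pca

rng = np.random.default_rng(0)
X = rng.random((40, 8))                      # 40 chunks x 8 tf-idf features
model = pca(normalize=False, n_components=3)
results = model.fit_transform(
    X,
    col_labels=[f"word{i}" for i in range(8)],  # feature (loading) labels
)
# Samples as points, feature loadings as arrows; in the app,
# arrowdict={"alpha": 0.0} hides the arrows and keeps only their labels.
fig, ax = model.biplot(figsize=(12, 8), fontsize=12, s=20, PC=[0, 1])
```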
@@ -732,6 +780,112 @@ def _(model, results, three_switch):
     return
 
 
+@app.cell
+def _():
+    mo.md(
+        r"""
+        ## Correspondence Analysis / 対応分析
+
+        対応分析(CA)のbiplotでは、主成分分析のbiplotと似ているような分析として、サンプルと素性の関係が観察できますが、いくつかの違いがあります。
+        対応分析を行うには、$\mathrm{tfidf}$行列をカテゴリカルな形式の分割表(contingency table)に変換する必要があります。次に、そのデータを連関表として解析します。この手法により、
+
+        - 会話文と地の文カテゴリと特定単語出現パターンとの関連性を検討
+        - サンプルのカテゴリと単語特徴量との離散的な関連として関係性を示すバイプロットを作成
+        - 各カテゴリに最も特徴的な単語を、PCAでのユークリッド距離ではなくカイ二乗距離を用いて抽出
+        - サンプルと単語の両方をランダムな観測値として対称的に扱うことができる
+
+        といった分析が可能となります。
+        """
+    )
+    return
+
+
+@app.cell
+def _(X_df, authors, chunk_cats, speech_types, works):
+    import itertools
+
+    # Build a small DF to test each dim-combo
+    df_chk = X_df.copy()
+    df_chk["author"] = authors
+    df_chk["category"] = chunk_cats
+    df_chk["work"] = works
+    df_chk["speech_type"] = speech_types
+
+    dims_all = ["author", "category", "work", "speech_type"]
+    options: list[str] = []
+    # Enumerate all non-empty combinations; keep those yielding >2 groups
+    for r in range(1, len(dims_all) + 1):
+        for combo in itertools.combinations(dims_all, r):
+            if df_chk.groupby(list(combo)).ngroups > 2:
+                options.append("|".join(combo))
+
+    mo.stop(
+        not options,
+        "No category combination yielding more than two rows, so cannot perform CA.",
+    )
+
+    ca_group_by = mo.ui.dropdown(
+        options=options,
+        value=options[0],
+        label="Group by (dims that yield >2 rows)",
+        full_width=True,
+    )
+    ca_group_by
+    return (ca_group_by,)
+
+
+@app.cell
+def _(X_df, authors, ca_group_by, chunk_cats, speech_types, works):
+    df = X_df.copy()
+    df["author"] = authors
+    df["category"] = chunk_cats
+    df["work"] = works
+    df["speech_type"] = speech_types
+
+    # split "author|work" (etc.) into the actual list of dims
+    dims = ca_group_by.value.split("|")
+
+    # sum only numeric (feature) columns by group
+    num_cols = df.select_dtypes(include="number").columns.tolist()
+    ct = df.groupby(dims)[num_cols].sum()
+
+    # flatten MultiIndex into a single-level index
+    if len(dims) > 1:
+        ct.index = ["|".join(idx) for idx in ct.index]
+    else:
+        ct.index = ct.index.astype(str)
+
+    mo.md(f"""
+    ### カテゴリと素性の行列
+
+    {mo.ui.table(ct, selection=None)}
+    """)
+    return (ct,)
+
+
+@app.cell
+def _(ct):
+    ca_model = prince.CA(
+        n_components=2,
+        n_iter=10,
+        copy=True,
+        check_input=True,
+        engine="sklearn",
+        random_state=RANDOM_SEED,
+    )
+    ca_model = ca_model.fit(ct)
+    ca_model.plot(
+        ct,
+        x_component=0,
+        y_component=1,
+        show_row_markers=True,
+        show_column_markers=True,
+        show_row_labels=True,
+        show_column_labels=True,
+    )
+    return
+
+
 @app.cell
 def _():
     linkage_methods = mo.ui.dropdown(
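For orientation, a minimal, self-contained `prince.CA` run on a toy contingency table shaped like the one the cell above builds (the row index mimics an 'author|speech_type' grouping; all counts are invented):

```python
# Toy correspondence analysis mirroring the cells above (invented counts).
import pandas as pd
import prince

ct = pd.DataFrame(
    {
        "said": [120, 30, 90, 25],
        "thou": [80, 60, 5, 2],
        "you":  [210, 40, 180, 35],
    },
    index=["E R Eddison|speech", "E R Eddison|non-speech",
           "H G Wells|speech", "H G Wells|non-speech"],
)
ca = prince.CA(n_components=2, engine="sklearn", random_state=0).fit(ct)
print(ca.row_coordinates(ct))     # group positions in chi-square space
print(ca.column_coordinates(ct))  # word positions in the same space
```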
@@ -759,7 +913,15 @@ def _():
     d_stack = mo.hstack([linkage_methods, distance_metrics], justify="start")
 
     mo.md(f"""
-    ## 階層的クラスタリング
+    ## Hierarchical Clustering / 階層的クラスタリング
+
+    階層的クラスタリングは、(予め設定したカテゴリに関わらず)サンプル間の$\\mathrm{{tfidf}}$単語使用パターンの類似性に基づき、直接的にグループ化を行います。
+
+    - サンプル同士が異なる類似度レベルでどのようにグループ化されるかを示す樹状図(デンドログラム)を生成
+    - サンプル間の距離計算において、定めた全ての$\\mathrm{{tfidf}}$特徴量を保持
+    - PCA/CAと比べ、特徴量間の関係ではなく、サンプル間の関係性に着目(ただし、行列を回転し、逆の分析もできる)
+    - 高次元$\\mathrm{{tfidf}}$ベクトル間の類似度を測定するために、ユークリッド距離やコサイン距離といった距離尺度を用いる
+    - 類似した単語使用パターンを有するサンプル群の離散的なクラスタを構築
 
     {d_stack}
     {dendrogram_height}
@@ -783,9 +945,33 @@ def _(X, chunk_fnames, dendrogram_height, distance_metrics, linkage_methods):
         distfun=distfun,
         linkagefun=linkagefun,
     )
-    fig.update_layout(width=800, height=dendrogram_height.value)
+    fig.update_layout(width=800, height=dendrogram_height.value, title=f"Dendrogram using {linkage_methods.value} link method and {distance_metrics.value} distance on samples",)
 
     mo.ui.plotly(fig)
+    return distfun, ff, linkagefun
+
+
+@app.cell
+def _(
+    X,
+    X_df,
+    dendrogram_height,
+    distance_metrics,
+    distfun,
+    ff,
+    linkage_methods,
+    linkagefun,
+):
+    fig_T = ff.create_dendrogram(
+        X.T,
+        orientation="left",
+        labels=X_df.columns,
+        distfun=distfun,
+        linkagefun=linkagefun,
+    )
+    fig_T.update_layout(width=800, height=dendrogram_height.value, title=f"Dendrogram using {linkage_methods.value} link method and {distance_metrics.value} distance on features")
+
+    mo.ui.plotly(fig_T)
     return
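Both dendrogram cells lean on plotly's `figure_factory.create_dendrogram` hooks; a self-contained sketch with scipy-based `distfun`/`linkagefun` (the app presumably builds these from the dropdown values, but that wiring is not shown in the diff):

```python
# Standalone sketch of the dendrogram cells, on synthetic data.
import numpy as np
import plotly.figure_factory as ff
from scipy.cluster.hierarchy import linkage
from scipy.spatial.distance import pdist

rng = np.random.default_rng(0)
X = rng.random((12, 8))                              # 12 samples x 8 features

distfun = lambda m: pdist(m, metric="cosine")        # condensed distance matrix
linkagefun = lambda d: linkage(d, method="average")  # merge strategy

fig = ff.create_dendrogram(
    X,
    orientation="left",
    labels=[f"chunk{i}" for i in range(12)],
    distfun=distfun,
    linkagefun=linkagefun,
)
fig.update_layout(width=800, height=600)

# Transposing X clusters the features instead of the samples,
# exactly as the second cell does with X.T and the feature names.
fig_T = ff.create_dendrogram(
    X.T,
    orientation="left",
    labels=[f"word{i}" for i in range(8)],
    distfun=distfun,
    linkagefun=linkagefun,
)
```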
@@ -809,6 +995,22 @@ def sample_viewer(fnames, text_selector, texts):
     return
 
 
+@app.cell
+def _():
+    mo.md(
+        r"""
+        # まとめ
+
+        これ3つのアプローチをすべて用いることで、異なる視点を得ることができます:
+
+        - **階層的クラスタリング**: データ内の"自然な"グループ分けを明らかにします。例えば、特定の著者の話し方のパターンが一緒にクラスタ化されたり、叙述部分と会話部分が明確に異なるグループを形成したりすることが考えられます。
+        - **対応分析**: カテゴリ間の関連性を明らかにします。例えば、異なる著者や発話タイプに最も特徴的な単語がどれであるかを調べることができます。
+        - **主成分分析**: 最も識別力の高い単語の組み合わせを特定します。例えば、どの語彙パターンが会話文/地の文や著者間の区別に最も寄与しているかを示すことができます。
+        """
+    )
+    return
+
+
 @app.cell
 def _():
     return
pyproject.toml
CHANGED
@@ -12,6 +12,7 @@ dependencies = [
     "pandas>=2.3.0",
     "pca>=2.10.0",
     "plotly>=6.2.0",
+    "prince>=0.16.0",
     "pyarrow>=20.0.0",
     "scattertext==0.2.2",
     "scikit-learn==1.7.0",
uv.lock
CHANGED
@@ -953,6 +953,20 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/fa/8c/d3e30f80b2ef21f267f09f0b7d18995adccc928ede5b73ea3fe54e1303f4/preshed-3.0.10-cp313-cp313-win_amd64.whl", hash = "sha256:97e0e2edfd25a7dfba799b49b3c5cc248ad0318a76edd9d5fd2c82aa3d5c64ed", size = 115769, upload-time = "2025-05-26T15:18:21.842Z" },
 ]
 
+[[package]]
+name = "prince"
+version = "0.16.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "altair" },
+    { name = "pandas" },
+    { name = "scikit-learn" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/ae/bd/fde5962680ad17f8402848a6849344717c3ee341d47f60864f2d78bf720e/prince-0.16.0.tar.gz", hash = "sha256:8b3b9e74fc84ad066a1e6ef4fc076a55d80b7a46db2541a76902e47951c39b16", size = 414243, upload-time = "2025-03-09T21:38:43.631Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/18/d5/b4480a0f381cbbcfad31f4d118732ab717216857508a730938ee615669a1/prince-0.16.0-py3-none-any.whl", hash = "sha256:7e21a78d4dd06ca3ec526ee714a50b349f26de3fca6b79664150a951b31688f3", size = 417759, upload-time = "2025-03-09T21:38:41.001Z" },
+]
+
 [[package]]
 name = "psutil"
 version = "7.0.0"
@@ -1276,6 +1290,7 @@ dependencies = [
     { name = "pandas" },
     { name = "pca" },
     { name = "plotly" },
+    { name = "prince" },
     { name = "pyarrow" },
     { name = "scattertext" },
     { name = "scikit-learn" },
@@ -1293,6 +1308,7 @@ requires-dist = [
     { name = "pandas", specifier = ">=2.3.0" },
     { name = "pca", specifier = ">=2.10.0" },
     { name = "plotly", specifier = ">=6.2.0" },
+    { name = "prince", specifier = ">=0.16.0" },
     { name = "pyarrow", specifier = ">=20.0.0" },
     { name = "scattertext", specifier = "==0.2.2" },
     { name = "scikit-learn", specifier = "==1.7.0" },