Bor Hodošček committed
Commit 17327cb · 1 Parent(s): dd4089f

feat: improved pca, ca, hclust code and support for both modes

Files changed (3)
  1. app.py +388 -186
  2. pyproject.toml +1 -0
  3. uv.lock +16 -0
app.py CHANGED
@@ -9,6 +9,7 @@
9
  # "pandas==2.3.0",
10
  # "pca==2.10.0",
11
  # "plotly==6.2.0",
 
12
  # "pyarrow",
13
  # "scattertext==0.2.2",
14
  # "scikit-learn==1.7.0",
@@ -24,11 +25,10 @@
24
  import marimo
25
 
26
  __generated_with = "0.14.9"
27
- app = marimo.App(width="full", app_title="Scattertext on Japanese novels")
28
 
29
  with app.setup:
30
  import marimo as mo
31
- import itertools
32
  import spacy
33
  import pandas as pd
34
  import scipy
@@ -37,6 +37,7 @@ with app.setup:
37
  import re
38
  import scattertext as st
39
  from pca import pca
 
40
  import matplotlib.pyplot as plt
41
  from pathlib import Path
42
  from types import SimpleNamespace
@@ -61,7 +62,7 @@ def function_export():
61
 
62
  @mo.cache
63
  def parse_texts(texts: list[str], nlp=load_nlp()) -> list[str]:
64
- """Tokenize English text via spaCy and emit a whitespacejoined string."""
65
  return [" ".join(tok.text for tok in doc) for doc in nlp.pipe(texts)]
66
 
67
  @mo.cache
@@ -84,37 +85,74 @@ def function_export():
84
  .compact(st.AssociationCompactor(2000))
85
  )
86
 
 
 
 
87
  @mo.cache
88
  def chunk_texts(
89
- texts: list[str],
90
- categories: list[str],
91
- filenames: list[str],
92
  chunk_size: int = 2000,
93
- ) -> tuple[list[str], list[str], list[str]]:
94
- """Chunk each text into segments of chunk_size tokens, preserving category and filename."""
95
- chunked_texts: list[str] = []
96
- chunked_cats: list[str] = []
97
- chunked_fnames: list[str] = []
98
- for text, cat, fname in zip(texts, categories, filenames):
99
- # compute a short “Initials‐Initials” label for author‐title
100
- stem = Path(fname).stem.replace("_advanced", "")
101
- author, title = stem.split("_", 1)
102
-
103
- def _initials(s: str) -> str:
104
- return "".join(tok[0].upper() for tok in s.split("-"))
105
-
106
- short_label = f"{_initials(author)}-{_initials(title)}"
107
- tokens = text.split()
108
- for i in range(0, len(tokens), chunk_size):
109
- chunk = " ".join(tokens[i : i + chunk_size])
110
- chunked_texts.append(chunk)
111
- chunked_cats.append(cat)
112
- chunked_fnames.append(f"{short_label}({cat})#{i // chunk_size + 1}")
113
- else:
114
- chunked_texts.append(chunk)
115
- chunked_cats.append(cat)
116
- chunked_fnames.append(f"{short_label}({cat})#last")
117
- return chunked_texts, chunked_cats, chunked_fnames
 
 
 
118
 
119
  @mo.cache
120
  def train_scikit_cached(
@@ -182,51 +220,63 @@ def function_export():
182
  texts = [Path(fn).read_text(encoding="utf-8") for fn in defaults]
183
  return names, texts
184
 
185
- def make_speech_df(uploaded_files):
 
 
186
  """
187
- Build a DataFrame of speech vs non-speech segments, one row per segment,
188
- carrying over the source filename.
189
  """
190
- defaults = [
191
- "e-r-eddison_the-worm-ouroboros_advanced.txt",
192
- "h-g-wells_the-wonderful-visit_advanced.txt",
193
- ]
194
- names, raws = _load_files(uploaded_files, defaults)
195
-
196
- speech_segs, nonspeech_segs = [], []
197
- speech_files, nonspeech_files = [], []
198
 
 
 
199
  for name, raw in zip(names, raws):
200
- sp, ns = split_speech_text(raw)
201
- speech_segs.append(sp)
202
- nonspeech_segs.append(ns)
203
- speech_files.append(name)
204
- nonspeech_files.append(name)
205
-
206
- tok_sp = parse_texts(speech_segs)
207
- tok_ns = parse_texts(nonspeech_segs)
208
-
209
- ratios = [
210
- len(s) / (len(s) + len(ns)) for s, ns in zip(speech_segs, nonspeech_segs)
211
- ]
 
 
212
 
213
- ratios_full = ratios + ratios
 
 
214
 
215
- return pd.DataFrame(
216
- {
217
- "category": ["speech"] * len(tok_sp) + ["non-speech"] * len(tok_ns),
218
- "filename": speech_files + nonspeech_files,
219
- "text": tok_sp + tok_ns,
220
- "speech_ratio": ratios_full,
221
- }
222
- )
223
 
224
  return (
225
  build_corpus_cached,
226
  chunk_texts,
227
- make_speech_df,
228
  parse_texts,
229
- split_speech_text,
230
  train_scikit_cached,
231
  )
232
 
@@ -250,7 +300,7 @@ def intro():
250
  2. データ内容を確認・修正
251
  3. チャンク&サンプリング設定
252
  4. Scattertextによる可視化
253
- 5. PCAのbiplot, 階層的クラスタリングのデンドログラムでサンプルの分布と素性の関係を観察
254
  6. 気になるサンプルをドロップダウンで選択し、内容を確認
255
 
256
  > 単語分割には、[spaCy](https://spacy.io/)([en_core_web_sm](https://spacy.io/models/en#en_core_web_sm)モデル)を使用しています。
@@ -274,7 +324,9 @@ def data_settings():
274
  full_width=True,
275
  )
276
  files_a = mo.ui.file(
277
- label="Aのファイルアップロード(UTF-8、.txt形式)", multiple=True, kind="area"
 
 
278
  )
279
  ### Category form
280
  label_b = mo.ui.text(
@@ -284,7 +336,9 @@ def data_settings():
284
  full_width=True,
285
  )
286
  files_b = mo.ui.file(
287
- label="Bのファイルアップロード(UTF-8、.txt形式)", multiple=True, kind="area"
 
 
288
  )
289
  split_speech = mo.ui.switch(
290
  label="Split speech vs non-speech segments?",
@@ -353,12 +407,11 @@ def data_settings():
353
  @app.cell
354
  def data_check(
355
  category_form,
356
- make_speech_df,
357
  mode_tabs,
358
  parse_texts,
 
359
  speech_form,
360
  split_speech,
361
- split_speech_text,
362
  ):
363
  mo.stop(mode_tabs.value == "Speech vs Non-Speech" and speech_form.value is None)
364
  mo.stop(mode_tabs.value == "Category Comparison" and category_form.value is None)
@@ -366,12 +419,22 @@ def data_check(
366
  validation_messages: list[str] = []
367
 
368
  if mode_tabs.value == "Speech vs Non-Speech":
369
- data = make_speech_df(speech_form.value.get("files_s", []))
 
 
370
  mo.md(
371
  f"## Data preview (speech vs non-speech)\n"
372
- f"{mo.ui.table(data, selection='multi')}"
373
  )
374
- # fake data_form so all scattertext cells see the same API
375
  data_form = SimpleNamespace(
376
  value={
377
  "category_name": "Speech vs Non-speech",
@@ -381,7 +444,6 @@ def data_check(
381
  )
382
  elif category_form.value is not None and mode_tabs.value == "Category Comparison":
383
  # Category vs Category
384
-
385
  if category_form.value["label_a"] == category_form.value["label_b"]:
386
  validation_messages.append(
387
  "⚠️ **警告**: グループAとBのラベルが同じです。AとBは異なるラベルを設定してください。\n"
@@ -392,89 +454,30 @@ def data_check(
392
  "ℹ️ ファイルが未指定のため、デフォルトサンプルを使用しています。\n"
393
  )
394
 
395
- try:
396
- # Group A: either uploaded files or default
397
- if category_form.value["files_a"]:
398
- category_a_texts = (
399
- f.contents.decode("utf-8") for f in category_form.value["files_a"]
400
- )
401
- category_a_names = (f.name for f in category_form.value["files_a"])
402
- else:
403
- # Default Group A: E. R. Eddison: The Worm Ouroboros
404
- default_a = "e-r-eddison_the-worm-ouroboros_advanced.txt"
405
- category_a_texts = [Path(default_a).read_text(encoding="utf-8")]
406
- category_a_names = [default_a]
407
-
408
- if split_speech.value:
409
- texts_list = list(category_a_texts)
410
- names_list = list(category_a_names)
411
- expanded_txt, expanded_names = [], []
412
- for nm, raw in zip(names_list, texts_list):
413
- sp, ns = split_speech_text(raw)
414
- expanded_txt.extend([sp, ns])
415
- expanded_names.extend([f"{nm} (speech)", f"{nm} (non-speech)"])
416
- category_a_texts, category_a_names = expanded_txt, expanded_names
417
-
418
- # Group B: either uploaded files or default
419
- if category_form.value["files_b"]:
420
- category_b_texts = (
421
- f.contents.decode("utf-8") for f in category_form.value["files_b"]
422
- )
423
- category_b_names = (f.name for f in category_form.value["files_b"])
424
- else:
425
- # Default Group B: H. G. Wells: The Wonderful Visit
426
- default_b = "h-g-wells_the-wonderful-visit_advanced.txt"
427
- category_b_texts = [Path(default_b).read_text(encoding="utf-8")]
428
- category_b_names = [default_b]
429
-
430
- # same splitting for B‐side
431
- if split_speech.value:
432
- texts_list = list(category_b_texts)
433
- names_list = list(category_b_names)
434
- expanded_txt, expanded_names = [], []
435
- for nm, raw in zip(names_list, texts_list):
436
- sp, ns = split_speech_text(raw)
437
- expanded_txt.extend([sp, ns])
438
- expanded_names.extend([f"{nm} (speech)", f"{nm} (non-speech)"])
439
- category_b_texts, category_b_names = expanded_txt, expanded_names
440
-
441
- # infer categories: use UI labels when files uploaded,
442
- # otherwise derive from filename‐stem
443
- # (e.g. "e-r-eddison_..." -> "E R Eddison")
444
- if category_form.value["files_a"]:
445
- cats_a = [category_form.value["label_a"]] * len(category_a_names)
446
- else:
447
- cats_a = [
448
- Path(fn).stem.split("_", 1)[0].replace("-", " ").title()
449
- for fn in category_a_names
450
- ]
451
-
452
- if category_form.value["files_b"]:
453
- cats_b = [category_form.value["label_b"]] * len(category_b_names)
454
- else:
455
- cats_b = [
456
- Path(fn).stem.split("_", 1)[0].replace("-", " ").title()
457
- for fn in category_b_names
458
- ]
459
-
460
- data = pd.DataFrame(
461
- {
462
- "category": cats_a + cats_b,
463
- "filename": itertools.chain(category_a_names, category_b_names),
464
- "text": itertools.chain(category_a_texts, category_b_texts),
465
- }
466
- )
467
 
468
- with mo.status.spinner("コーパスを解析中..."):
469
- data["text"] = parse_texts(list(data["text"]))
470
- # pass through the real form
471
- data_form = category_form
 
 
 
472
 
473
- except Exception as e:
474
- data = None
475
- validation_messages.append(
476
- f"❌ **エラー**: ファイルの読み込みに失敗しました: {str(e)}\n"
477
- )
478
  else:
479
  data = None
480
  validation_messages.append(
@@ -491,7 +494,9 @@ def data_check(
491
  解析済テキスト一覧:
492
  {
493
  mo.ui.table(
494
- data, selection="multi", format_mapping={"text": lambda s: s[:20] + "..."}
 
 
495
  )
496
  if (data is not None and not data.empty)
497
  else "No data"
@@ -532,26 +537,21 @@ def _(build_corpus_cached, chunk_texts, data, sample_frac, sampling_form):
532
  mo.stop(sampling_form.value is None)
533
 
534
  with mo.status.spinner("コーパスをサンプリング中…"):
535
- texts, cats, fnames = chunk_texts(
536
- list(data.text),
537
- list(data.category),
538
- list(data.filename),
539
- sampling_form.value["chunk_size"],
540
- )
541
-
542
  if sample_frac.value < 1.0:
543
- N = len(texts)
544
- k = int(N * sampling_form.value["sample_frac"])
545
- idx = random.sample(range(N), k)
546
- texts = [texts[i] for i in idx]
547
- cats = [cats[i] for i in idx]
548
- fnames = [fnames[i] for i in idx]
549
-
550
- corpus = build_corpus_cached(
551
- texts,
552
- cats,
553
- )
554
- return cats, corpus, fnames, texts
555
 
556
 
557
  @app.cell
@@ -621,11 +621,41 @@ def _():
621
 
622
  @app.cell
623
  def _():
624
- min_df_setting = mo.ui.slider(start=0.0, stop=1.0, step=0.05, value=0.25, show_value=True, label="Minimum proportion of samples feature appears in")
625
- max_df_setting = mo.ui.slider(start=0.0, stop=1.0, step=0.05, value=0.8, show_value=True, label="Maximum proportion of samples feature appears in")
626
- max_features_setting = mo.ui.slider(start=10, stop=10_000, step=1, value=100, show_value=True, label="Maximum number of features to use")
 
 
627
 
628
  - mo.vstack([mo.md("### 素性設定\n\nどのような単語を分析に使用するかを下記のスライダーで決めます。標準では、ほとんど全ての文章に現る単語、または極端に少ない文章にしか現れない単語が除外されています。そのうえで、$\\mathrm{tfidf}$の値上位100件まで素性としています。"), min_df_setting, max_df_setting, max_features_setting])
 
 
629
  return max_df_setting, max_features_setting, min_df_setting
630
 
631
 
@@ -640,7 +670,12 @@ def _(
640
  train_scikit_cached,
641
  ):
642
  scikit_corpus, tfidf_X, vectorizer, chunk_cats, chunk_fnames = train_scikit_cached(
643
- texts, cats, fnames, min_df=min_df_setting.value, max_df=max_df_setting.value, max_features=max_features_setting.value,
 
 
644
  )
645
  return chunk_cats, chunk_fnames, tfidf_X, vectorizer
646
 
@@ -667,6 +702,11 @@ def _(X_train, chunk_fnames, vectorizer):
667
  idf_formula = rf"$\mathrm{{idf}}(t,D)=\log{{\frac{{N}}{{{D_formula}}}}}$"
668
  tf_formula = r"${\displaystyle \mathrm {tf} (t,d)=\textrm{number of times }t\textrm{ appears in }d}$"
669
 
 
 
670
 
671
  mo.md(rf"""
672
  ### サンプルと素性の行列
@@ -685,9 +725,9 @@ def _(X_train, chunk_fnames, vectorizer):
685
  - ${{\displaystyle N}}$: total number of documents in the corpus ${{\displaystyle N={{|D|}}}}$
686
  - ${D_formula}$: number of documents with $t$
687
 
688
- {mo.ui.table(pd.DataFrame(X_train.toarray(), index=chunk_fnames, columns=vectorizer.get_feature_names_out()))}
689
  """)
690
- return
691
 
692
 
693
  @app.cell
@@ -695,7 +735,7 @@ def pca_biplot(chunk_cats, tfidf_X, vectorizer):
695
  X = tfidf_X.toarray() if hasattr(tfidf_X, "toarray") else tfidf_X
696
  feature_names = vectorizer.get_feature_names_out()
697
 
698
- model = pca(normalize=True, n_components=3)
699
  results = model.fit_transform(
700
  X,
701
  col_labels=feature_names,
@@ -714,6 +754,7 @@ def _(model, results, three_switch):
714
  figsize=(12, 8),
715
  fontsize=12,
716
  s=20,
 
717
  PC=[0, 1, 2] if three_switch.value else [0, 1],
718
  )
719
  # labels=np.array(chunk_fnames)
@@ -722,7 +763,14 @@ def _(model, results, three_switch):
722
  mo.vstack(
723
  [
724
  mo.md(
725
- """## [PCA](https://erdogant.github.io/pca/pages/html/index.html)のbiplot
 
 
726
  """
727
  ),
728
  mo.mpl.interactive(plt.gcf()),
@@ -732,6 +780,112 @@ def _(model, results, three_switch):
732
  return
733
 
734
 
 
 
 
 
735
  @app.cell
736
  def _():
737
  linkage_methods = mo.ui.dropdown(
@@ -759,7 +913,15 @@ def _():
759
  d_stack = mo.hstack([linkage_methods, distance_metrics], justify="start")
760
 
761
  mo.md(f"""
762
- ## 階層的クラスタリング
 
 
763
 
764
  {d_stack}
765
  {dendrogram_height}
@@ -783,9 +945,33 @@ def _(X, chunk_fnames, dendrogram_height, distance_metrics, linkage_methods):
783
  distfun=distfun,
784
  linkagefun=linkagefun,
785
  )
786
- fig.update_layout(width=800, height=dendrogram_height.value)
787
 
788
  mo.ui.plotly(fig)
 
 
789
  return
790
 
791
 
@@ -809,6 +995,22 @@ def sample_viewer(fnames, text_selector, texts):
809
  return
810
 
811
 
 
 
812
  @app.cell
813
  def _():
814
  return
 
9
  # "pandas==2.3.0",
10
  # "pca==2.10.0",
11
  # "plotly==6.2.0",
12
+ # "prince==0.16.0",
13
  # "pyarrow",
14
  # "scattertext==0.2.2",
15
  # "scikit-learn==1.7.0",
 
25
  import marimo
26
 
27
  __generated_with = "0.14.9"
28
+ app = marimo.App(width="full", app_title="Scattertext on English novels")
29
 
30
  with app.setup:
31
  import marimo as mo
 
32
  import spacy
33
  import pandas as pd
34
  import scipy
 
37
  import re
38
  import scattertext as st
39
  from pca import pca
40
+ import prince
41
  import matplotlib.pyplot as plt
42
  from pathlib import Path
43
  from types import SimpleNamespace
 
62
 
63
  @mo.cache
64
  def parse_texts(texts: list[str], nlp=load_nlp()) -> list[str]:
65
+ """Tokenize English text via spaCy and emit a whitespace-joined string."""
66
  return [" ".join(tok.text for tok in doc) for doc in nlp.pipe(texts)]
67
 
68
  @mo.cache
 
85
  .compact(st.AssociationCompactor(2000))
86
  )
87
 
88
+ def _strip_advanced(fn: str) -> str:
89
+ """
90
+ Strip trailing '_advanced' from a filename stem.
91
+ """
92
+ from pathlib import Path
93
+
94
+ stem = Path(fn).stem
95
+ return stem.replace("_advanced", "")
96
+
97
+ def make_short_label(fn: str) -> str:
98
+ """
99
+ Generate an initials-based short label from filename.
100
+ E.g., 'e-r-eddison_the-worm-ouroboros.txt' -> 'ERE-TWO'.
101
+ """
102
+ stem = _strip_advanced(fn)
103
+ author, title = stem.split("_", 1)
104
+ initials = lambda s: "".join(part[0].upper() for part in s.split("-"))
105
+ return f"{initials(author)}-{initials(title)}"
106
+
107
+ def format_chunk_label(
108
+ fn: str,
109
+ category: str,
110
+ speech_type: str,
111
+ chunk_idx: int | str,
112
+ ) -> str:
113
+ """
114
+ Create a chunk label 'SHORTLABEL(CATEGORY[-speech_type])#INDEX'.
115
+ """
116
+ sl = make_short_label(fn)
117
+ # append speech_type only if it differs from category and isn't 'mixed'
118
+ if speech_type and speech_type != "mixed" and speech_type != category:
119
+ label = f"{category}-{speech_type}"
120
+ else:
121
+ label = category
122
+ return f"{sl}({label})#{chunk_idx}"
123
+
124
  @mo.cache
125
  def chunk_texts(
126
+ df: pd.DataFrame,
 
 
127
  chunk_size: int = 2000,
128
+ ) -> pd.DataFrame:
129
+ """
130
+ Turn each row of df into token‐chunks of size chunk_size,
131
+ preserving category, filename, author, work, and producing
132
+ a `chunk_label`.
133
+ """
134
+ records: list[dict] = []
135
+ for _, row in df.iterrows():
136
+ tokens = row["text"].split()
137
+ n_chunks = (len(tokens) + chunk_size - 1) // chunk_size
138
+ for idx in range(n_chunks):
139
+ seg = " ".join(tokens[idx * chunk_size : (idx + 1) * chunk_size])
140
+ label_idx = idx + 1 if idx + 1 < n_chunks else "last"
141
+ records.append({
142
+ "text": seg,
143
+ "category": row["category"],
144
+ "speech_type": row["speech_type"],
145
+ "filename": row["filename"],
146
+ "author": row["author"],
147
+ "work": row["work"],
148
+ "chunk_label": format_chunk_label(
149
+ row["filename"],
150
+ row["category"],
151
+ row["speech_type"],
152
+ label_idx,
153
+ ),
154
+ })
155
+ return pd.DataFrame(records)
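
For reference, a minimal usage sketch of the helpers above, assuming `make_short_label` and `chunk_texts` are in scope (as they are inside this `function_export` cell); the one-row toy DataFrame is hypothetical:

```python
import pandas as pd

# Hypothetical single-work DataFrame with the columns chunk_texts expects.
doc = pd.DataFrame([{
    "text": " ".join(["word"] * 4500),  # 4500 tokens -> two full chunks + remainder
    "category": "speech",
    "speech_type": "speech",
    "filename": "e-r-eddison_the-worm-ouroboros_advanced.txt",
    "author": "E R Eddison",
    "work": "The Worm Ouroboros",
}])

print(make_short_label(doc.loc[0, "filename"]))  # expected: ERE-TWO

chunks = chunk_texts(doc, chunk_size=2000)
print(chunks["chunk_label"].tolist())
# expected: ['ERE-TWO(speech)#1', 'ERE-TWO(speech)#2', 'ERE-TWO(speech)#last']
```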
156
 
157
  @mo.cache
158
  def train_scikit_cached(
 
220
  texts = [Path(fn).read_text(encoding="utf-8") for fn in defaults]
221
  return names, texts
222
 
223
+ def prepare_files(
224
+ uploaded: list, defaults: list[str], split: bool = False
225
+ ) -> pd.DataFrame:
226
  """
227
+ Ingest uploaded vs. default files into a DataFrame with columns:
228
+ ['filename','raw_text','category' (if split),'author','work'].
229
  """
230
+ import pandas as pd
 
 
231
 
232
+ names, raws = _load_files(uploaded, defaults)
233
+ records: list[dict] = []
234
  for name, raw in zip(names, raws):
235
+ if split:
236
+ sp, ns = split_speech_text(raw)
237
+ records.append(
238
+ {
239
+ "filename": name,
240
+ "raw_text": sp,
241
+ "speech_type": "speech",
242
+ }
243
+ )
244
+ records.append(
245
+ {
246
+ "filename": name,
247
+ "raw_text": ns,
248
+ "speech_type": "non-speech",
249
+ }
250
+ )
251
+ else:
252
+ records.append(
253
+ {
254
+ "filename": name,
255
+ "raw_text": raw,
256
+ "speech_type": "mixed",
257
+ }
258
+ )
259
 
260
+ df_p = pd.DataFrame(records)
261
+ # infer author & work from the file's true stem (no extension, no "_advanced")
262
+ def _extract_auth_work(fn: str) -> tuple[str, str]:
263
+ base = Path(fn).stem.replace("_advanced", "")
264
+ auth, *rest = base.split("_", 1)
265
+ work_raw = rest[0] if rest else base
266
+ return (
267
+ auth.replace("-", " ").title(),
268
+ work_raw.replace("-", " ").title(),
269
+ )
270
 
271
+ aw = df_p["filename"].apply(_extract_auth_work)
272
+ df_p["author"], df_p["work"] = zip(*aw)
273
+ return df_p
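
The author/work inference above amounts to title-casing the two halves of the filename stem; a standalone restatement for illustration (the function name here is made up, the app keeps this logic in the local `_extract_auth_work` helper):

```python
from pathlib import Path

def extract_author_work(fn: str) -> tuple[str, str]:
    # Mirror _extract_auth_work: drop extension and "_advanced", split author/work on "_".
    base = Path(fn).stem.replace("_advanced", "")
    author, *rest = base.split("_", 1)
    work_raw = rest[0] if rest else base
    return author.replace("-", " ").title(), work_raw.replace("-", " ").title()

print(extract_author_work("e-r-eddison_the-worm-ouroboros_advanced.txt"))
# expected: ('E R Eddison', 'The Worm Ouroboros')
```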
 
274
 
275
  return (
276
  build_corpus_cached,
277
  chunk_texts,
 
278
  parse_texts,
279
+ prepare_files,
280
  train_scikit_cached,
281
  )
282
 
 
300
  2. データ内容を確認・修正
301
  3. チャンク&サンプリング設定
302
  4. Scattertextによる可視化
303
+ 5. PCAとCAのbiplot、階層的クラスタリングのデンドログラムでサンプル、カテゴリと素性の分布と関係を観察
304
  6. 気になるサンプルをドロップダウンで選択し、内容を確認
305
 
306
  > 単語分割には、[spaCy](https://spacy.io/)([en_core_web_sm](https://spacy.io/models/en#en_core_web_sm)モデル)を使用しています。
 
324
  full_width=True,
325
  )
326
  files_a = mo.ui.file(
327
+ label="Aのファイルアップロード(UTF-8、.txt形式)",
328
+ multiple=True,
329
+ kind="area",
330
  )
331
  ### Category form
332
  label_b = mo.ui.text(
 
336
  full_width=True,
337
  )
338
  files_b = mo.ui.file(
339
+ label="Bのファイルアップロード(UTF-8、.txt形式)",
340
+ multiple=True,
341
+ kind="area",
342
  )
343
  split_speech = mo.ui.switch(
344
  label="Split speech vs non-speech segments?",
 
407
  @app.cell
408
  def data_check(
409
  category_form,
 
410
  mode_tabs,
411
  parse_texts,
412
+ prepare_files,
413
  speech_form,
414
  split_speech,
 
415
  ):
416
  mo.stop(mode_tabs.value == "Speech vs Non-Speech" and speech_form.value is None)
417
  mo.stop(mode_tabs.value == "Category Comparison" and category_form.value is None)
 
419
  validation_messages: list[str] = []
420
 
421
  if mode_tabs.value == "Speech vs Non-Speech":
422
+ defaults = [
423
+ "e-r-eddison_the-worm-ouroboros_advanced.txt",
424
+ "h-g-wells_the-wonderful-visit_advanced.txt",
425
+ ]
426
+ df_pre = prepare_files(
427
+ speech_form.value.get("files_s", []),
428
+ defaults,
429
+ split=True,
430
+ )
431
+ data = df_pre.rename(columns={"raw_text": "text"})
432
+ # use the speech‐vs‐non‐speech flag as our category
433
+ data["category"] = data["speech_type"]
434
  mo.md(
435
  f"## Data preview (speech vs non-speech)\n"
436
+ f"{mo.ui.table(data, selection=None)}"
437
  )
 
438
  data_form = SimpleNamespace(
439
  value={
440
  "category_name": "Speech vs Non-speech",
 
444
  )
445
  elif category_form.value is not None and mode_tabs.value == "Category Comparison":
446
  # Category vs Category
 
447
  if category_form.value["label_a"] == category_form.value["label_b"]:
448
  validation_messages.append(
449
  "⚠️ **警告**: グループAとBのラベルが同じです。AとBは異なるラベルを設定してください。\n"
 
454
  "ℹ️ ファイルが未指定のため、デフォルトサンプルを使用しています。\n"
455
  )
456
 
457
+ defaults_a = ["e-r-eddison_the-worm-ouroboros_advanced.txt"]
458
+ df_a = prepare_files(
459
+ category_form.value["files_a"],
460
+ defaults_a,
461
+ split=split_speech.value,
462
+ )
463
+ df_a["category"] = (
464
+ [category_form.value["label_a"]] * len(df_a)
465
+ if category_form.value["files_a"]
466
+ else [category_form.value["label_a"]] * len(df_a)
467
+ )
 
 
 
 
 
468
 
469
+ defaults_b = ["h-g-wells_the-wonderful-visit_advanced.txt"]
470
+ df_b = prepare_files(
471
+ category_form.value["files_b"],
472
+ defaults_b,
473
+ split=split_speech.value,
474
+ )
475
+ df_b["category"] = [category_form.value["label_b"]] * len(df_b)
476
 
477
+ data = pd.concat([df_a, df_b], ignore_index=True)
478
+ # tokenize text if not already (optional)
479
+ data["text"] = parse_texts(list(data["raw_text"]))
480
+ data_form = category_form
 
481
  else:
482
  data = None
483
  validation_messages.append(
 
494
  解析済テキスト一覧:
495
  {
496
  mo.ui.table(
497
+ data,
498
+ selection=None,
499
+ format_mapping={"text": lambda s: s[:20] + "..."},
500
  )
501
  if (data is not None and not data.empty)
502
  else "No data"
 
537
  mo.stop(sampling_form.value is None)
538
 
539
  with mo.status.spinner("コーパスをサンプリング中…"):
540
+ # chunk the DataFrame
541
+ chunk_df = chunk_texts(data, sampling_form.value["chunk_size"])
542
+ # optional subsampling
 
 
 
 
543
  if sample_frac.value < 1.0:
544
+ chunk_df = chunk_df.sample(frac=sample_frac.value, random_state=RANDOM_SEED)
545
+
546
+ texts = chunk_df["text"].tolist()
547
+ cats = chunk_df["category"].tolist()
548
+ fnames = chunk_df["chunk_label"].tolist()
549
+ authors = chunk_df["author"].tolist()
550
+ works = chunk_df["work"].tolist()
551
+ speech_types = chunk_df["speech_type"].tolist()
552
+
553
+ corpus = build_corpus_cached(texts, cats)
554
+ return authors, cats, corpus, fnames, speech_types, texts, works
 
555
 
556
 
557
  @app.cell
 
621
 
622
  @app.cell
623
  def _():
624
+ min_df_setting = mo.ui.slider(
625
+ start=0.0,
626
+ stop=1.0,
627
+ step=0.05,
628
+ value=0.25,
629
+ show_value=True,
630
+ label="Minimum proportion of samples feature appears in",
631
+ )
632
+ max_df_setting = mo.ui.slider(
633
+ start=0.0,
634
+ stop=1.0,
635
+ step=0.05,
636
+ value=0.8,
637
+ show_value=True,
638
+ label="Maximum proportion of samples feature appears in",
639
+ )
640
+ max_features_setting = mo.ui.slider(
641
+ start=10,
642
+ stop=10_000,
643
+ step=1,
644
+ value=100,
645
+ show_value=True,
646
+ label="Maximum number of features to use",
647
+ )
648
 
649
+ mo.vstack(
650
+ [
651
+ mo.md(
652
+ "### 素性設定\n\nどのような単語を分析に使用するかを下記のスライダーで決めます。標準では、ほとんど全ての文章に現る単語、または極端に少ない文章にしか現れない単語が除外されています。そのうえで、$\\mathrm{tfidf}$の値上位100件まで素性としています。"
653
+ ),
654
+ min_df_setting,
655
+ max_df_setting,
656
+ max_features_setting,
657
+ ]
658
+ )
659
  return max_df_setting, max_features_setting, min_df_setting
660
 
661
 
 
670
  train_scikit_cached,
671
  ):
672
  scikit_corpus, tfidf_X, vectorizer, chunk_cats, chunk_fnames = train_scikit_cached(
673
+ texts,
674
+ cats,
675
+ fnames,
676
+ min_df=min_df_setting.value,
677
+ max_df=max_df_setting.value,
678
+ max_features=max_features_setting.value,
679
  )
680
  return chunk_cats, chunk_fnames, tfidf_X, vectorizer
681
 
 
702
  idf_formula = rf"$\mathrm{{idf}}(t,D)=\log{{\frac{{N}}{{{D_formula}}}}}$"
703
  tf_formula = r"${\displaystyle \mathrm {tf} (t,d)=\textrm{number of times }t\textrm{ appears in }d}$"
704
 
705
+ X_df = pd.DataFrame(
706
+ X_train.toarray(),
707
+ index=chunk_fnames,
708
+ columns=vectorizer.get_feature_names_out(),
709
+ )
710
 
711
  mo.md(rf"""
712
  ### サンプルと素性の行列
 
725
  - ${{\displaystyle N}}$: total number of documents in the corpus ${{\displaystyle N={{|D|}}}}$
726
  - ${D_formula}$: number of documents with $t$
727
 
728
+ {mo.ui.table(X_df, selection=None)}
729
  """)
730
+ return (X_df,)
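
The vectorizer inside `train_scikit_cached` is not shown in this diff; as a rough sketch, the `min_df` / `max_df` / `max_features` slider values passed to it correspond to scikit-learn's `TfidfVectorizer` parameters roughly like this (the toy documents and row labels are made up):

```python
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

docs = [
    "the worm ouroboros speech chunk",
    "the wonderful visit narration chunk",
    "the worm ouroboros narration chunk",
]

vec = TfidfVectorizer(
    min_df=0.25,       # drop terms appearing in fewer than 25% of chunks
    max_df=0.8,        # drop terms appearing in more than 80% of chunks
    max_features=100,  # then keep at most the 100 most frequent terms
)
X_train = vec.fit_transform(docs)  # sparse samples-by-features matrix

# Same layout as the table above: rows are chunk labels, columns are terms.
X_df = pd.DataFrame(
    X_train.toarray(),
    index=["ERE-TWO(speech)#1", "HGW-TWV(non-speech)#1", "ERE-TWO(non-speech)#last"],
    columns=vec.get_feature_names_out(),
)
print(X_df.round(2))
```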
731
 
732
 
733
  @app.cell
 
735
  X = tfidf_X.toarray() if hasattr(tfidf_X, "toarray") else tfidf_X
736
  feature_names = vectorizer.get_feature_names_out()
737
 
738
+ model = pca(normalize=False, n_components=3)
739
  results = model.fit_transform(
740
  X,
741
  col_labels=feature_names,
 
754
  figsize=(12, 8),
755
  fontsize=12,
756
  s=20,
757
+ arrowdict={"alpha": 0.0},
758
  PC=[0, 1, 2] if three_switch.value else [0, 1],
759
  )
760
  # labels=np.array(chunk_fnames)
 
763
  mo.vstack(
764
  [
765
  mo.md(
766
+ r"""## Principal Components Analysis / 主成分分析
767
+
768
+ [Principal Components Analysis](https://erdogant.github.io/pca/pages/html/index.html) (PCA)は、$\mathrm{{tfidf}}$スコアを連続的な数値データとして扱い、データセット内の分散を最も多く説明する単語の線形結合を特定します。この分析により、以下の点が明らかになります。
769
+
770
+ - 主成分によって会話文と地の文(あるいは他の分析カテゴリ)を最も効果的に区別する単語の組み合わせが判明します。
771
+ - 会話文と地の文サンプル間の分散に最も寄与する共起語彙パターン、および判別力の高い語彙が特定されます。
772
+ - PCAは傾度に沿った線形関係を仮定するため、言語スタイルの緩やかな変化も示されます。
773
+ - $\mathrm{{tfidf}}$スコアの連続性を保持したまま、次元削減が実現されます。
774
  """
775
  ),
776
  mo.mpl.interactive(plt.gcf()),
 
780
  return
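
A standalone sketch of the pca-library calls used in the fitting and biplot cells above, run on a random stand-in for the tfidf matrix (the toy shape and term labels are assumptions):

```python
import numpy as np
from pca import pca

rng = np.random.default_rng(0)
X_toy = rng.random((20, 6))              # 20 chunks x 6 tfidf features (toy data)
terms = [f"term_{i}" for i in range(6)]  # stand-in feature names

model = pca(normalize=False, n_components=3)
results = model.fit_transform(X_toy, col_labels=terms)  # transformed PCs, loadings, explained variance

# 2-D biplot: samples as points, feature loadings as labelled arrows, as in the cell above.
model.biplot(figsize=(12, 8), fontsize=12, s=20, PC=[0, 1])
```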
781
 
782
 
783
+ @app.cell
784
+ def _():
785
+ mo.md(
786
+ r"""
787
+ ## Correspondence Analysis / 対応分析
788
+
789
+ 対応分析(CA)のbiplotでは、主成分分析のbiplotと似ているような分析として、サンプルと素性の関係が観察できますが、いくつかの違いがあります。
790
+ 対応分析を行うには、$\mathrm{tfidf}$行列をカテゴリカルな形式の分割表(contingency table)に変換する必要があります。次に、そのデータを連関表として解析します。この手法により、
791
+
792
+ - 会話文と地の文カテゴリと特定単語出現パターンとの関連性を検討
793
+ - サンプルのカテゴリと単語特徴量との離散的な関連として関係性を示すバイプロットを作成
794
  + - 各カテゴリに最も特徴的な単語を、PCAでのユークリッド距離ではなくカイ二乗距離を用いて抽出
795
+ - サンプルと単語の両方をランダムな観測値として対称的に扱うことができる
796
+
797
+ といった分析が可能となります。
798
+ """
799
+ )
800
+ return
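
A minimal standalone sketch of the CA step described above, using prince on a small hypothetical category-by-term count table (the app builds its own table by grouping the tfidf matrix in the cells that follow; the counts and group names here are made up):

```python
import pandas as pd
import prince

# Hypothetical contingency table: four category groups x four terms.
ct = pd.DataFrame(
    {
        "said":  [40, 5, 35, 4],
        "thou":  [25, 2, 1, 0],
        "hill":  [3, 30, 2, 25],
        "angel": [0, 1, 6, 20],
    },
    index=["ERE|speech", "ERE|non-speech", "HGW|speech", "HGW|non-speech"],
)

ca = prince.CA(n_components=2, random_state=42).fit(ct)
print(ca.row_coordinates(ct))     # where each category group sits in CA space
print(ca.column_coordinates(ct))  # where each term sits in the same space
```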
801
+
802
+
803
+ @app.cell
804
+ def _(X_df, authors, chunk_cats, speech_types, works):
805
+ import itertools
806
+
807
+ # Build a small DF to test each dim‐combo
808
+ df_chk = X_df.copy()
809
+ df_chk["author"] = authors
810
+ df_chk["category"] = chunk_cats
811
+ df_chk["work"] = works
812
+ df_chk["speech_type"] = speech_types
813
+
814
+ dims_all = ["author", "category", "work", "speech_type"]
815
+ options: list[str] = []
816
+ # Enumerate all non-empty combinations; keep those yielding >2 groups
817
+ for r in range(1, len(dims_all) + 1):
818
+ for combo in itertools.combinations(dims_all, r):
819
+ if df_chk.groupby(list(combo)).ngroups > 2:
820
+ options.append("|".join(combo))
821
+
822
+ mo.stop(
823
+ not options,
824
+ "No category combination yielding more than two rows, so cannot perform CA.",
825
+ )
826
+
827
+ ca_group_by = mo.ui.dropdown(
828
+ options=options,
829
+ value=options[0],
830
+ label="Group by (dims that yield >2 rows)",
831
+ full_width=True,
832
+ )
833
+ ca_group_by
834
+ return (ca_group_by,)
835
+
836
+
837
+ @app.cell
838
+ def _(X_df, authors, ca_group_by, chunk_cats, speech_types, works):
839
+ df = X_df.copy()
840
+ df["author"] = authors
841
+ df["category"] = chunk_cats
842
+ df["work"] = works
843
+ df["speech_type"] = speech_types
844
+
845
+ # split "author|work" (etc.) into the actual list of dims
846
+ dims = ca_group_by.value.split("|")
847
+
848
+ # sum only numeric (feature) columns by group
849
+ num_cols = df.select_dtypes(include="number").columns.tolist()
850
+ ct = df.groupby(dims)[num_cols].sum()
851
+
852
+ # flatten MultiIndex into a single‐level index
853
+ if len(dims) > 1:
854
+ ct.index = ["|".join(idx) for idx in ct.index]
855
+ else:
856
+ ct.index = ct.index.astype(str)
857
+
858
+ mo.md(f"""
859
+ ### カテゴリと素性の行列
860
+
861
+ {mo.ui.table(ct, selection=None)}
862
+ """)
863
+ return (ct,)
864
+
865
+
866
+ @app.cell
867
+ def _(ct):
868
+ ca_model = prince.CA(
869
+ n_components=2,
870
+ n_iter=10,
871
+ copy=True,
872
+ check_input=True,
873
+ engine="sklearn",
874
+ random_state=RANDOM_SEED,
875
+ )
876
+ ca_model = ca_model.fit(ct)
877
+ ca_model.plot(
878
+ ct,
879
+ x_component=0,
880
+ y_component=1,
881
+ show_row_markers=True,
882
+ show_column_markers=True,
883
+ show_row_labels=True,
884
+ show_column_labels=True,
885
+ )
886
+ return
887
+
888
+
889
  @app.cell
890
  def _():
891
  linkage_methods = mo.ui.dropdown(
 
913
  d_stack = mo.hstack([linkage_methods, distance_metrics], justify="start")
914
 
915
  mo.md(f"""
916
+ ## Hierarchical Clustering / 階層的クラスタリング
917
+
918
+ 階層的クラスタリングは、(予め設定したカテゴリに関わらず)サンプル間の$\\mathrm{{tfidf}}$単語使用パターンの類似性に基づき、直接的にグループ化を行います。
919
+
920
+ - サンプル同士が異なる類似度レベルでどのようにグループ化されるかを示す樹状図(デンドログラム)を生成
921
+ - サンプル間の距離計算において、定めた全ての$\\mathrm{{tfidf}}$特徴量を保持
922
+ - PCA/CAと比べ、特徴量間の関係ではなく、サンプル間の関係性に着目(ただし、行列を回転し、逆の分析もできる)
923
+ - 高次元$\\mathrm{{tfidf}}$ベクトル間の類似度を測定するために、ユークリッド距離やコサイン距離といった距離尺度を用いる
924
+ - 類似した単語使用パターンを有するサンプル群の離散的なクラスタを構築
925
 
926
  {d_stack}
927
  {dendrogram_height}
 
945
  distfun=distfun,
946
  linkagefun=linkagefun,
947
  )
948
+ fig.update_layout(width=800, height=dendrogram_height.value, title=f"Dendrogram using {linkage_methods.value} link method and {distance_metrics.value} distance on samples",)
949
 
950
  mo.ui.plotly(fig)
951
+ return distfun, ff, linkagefun
952
+
953
+
954
+ @app.cell
955
+ def _(
956
+ X,
957
+ X_df,
958
+ dendrogram_height,
959
+ distance_metrics,
960
+ distfun,
961
+ ff,
962
+ linkage_methods,
963
+ linkagefun,
964
+ ):
965
+ fig_T = ff.create_dendrogram(
966
+ X.T,
967
+ orientation="left",
968
+ labels=X_df.columns,
969
+ distfun=distfun,
970
+ linkagefun=linkagefun,
971
+ )
972
+ fig_T.update_layout(width=800, height=dendrogram_height.value, title=f"Dendrogram using {linkage_methods.value} link method and {distance_metrics.value} distance on features")
973
+
974
+ mo.ui.plotly(fig_T)
975
  return
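
The same clustering can be reproduced with scipy directly; the `distfun` / `linkagefun` arguments passed to `ff.create_dendrogram` above wrap calls like these. "cosine" with "average" linkage is just one of the selectable combinations, and the toy matrix stands in for the tfidf features:

```python
import numpy as np
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import pdist

rng = np.random.default_rng(0)
X_toy = rng.random((8, 5))                 # 8 chunks x 5 tfidf features (toy data)
labels = [f"chunk_{i}" for i in range(8)]  # stand-in chunk labels

dists = pdist(X_toy, metric="cosine")      # condensed pairwise distance matrix
Z = linkage(dists, method="average")       # hierarchical merge tree

fig, ax = plt.subplots(figsize=(8, 4))
dendrogram(Z, labels=labels, orientation="left", ax=ax)
ax.set_title("Average linkage on cosine distances (toy data)")
plt.show()
```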
976
 
977
 
 
995
  return
996
 
997
 
998
+ @app.cell
999
+ def _():
1000
+ mo.md(
1001
+ r"""
1002
+ # まとめ
1003
+
1004
+ これ3つのアプローチをすべて用いることで、異なる視点を得ることができます:
1005
+
1006
+ - **階層的クラ���タリング**: データ内の"自然な"グループ分けを明らかにします。例えば、特定の著者の話し方のパターンが一緒にクラスタ化されたり、叙述部分と会話部分が明確に異なるグループを形成したりすることが考えられます。
1007
+ - **対応分析**: カテゴリ間の関連性を明らかにします。例えば、異なる著者や発話タイプに最も特徴的な単語がどれであるかを調べることができます。
1008
+ - **主成分分析**: 最も識別力の高い単語の組み合わせを特定します。例えば、どの語彙パターンが会話文/地の文や著者間の区別に最も寄与しているかを示すことができます。
1009
+ """
1010
+ )
1011
+ return
1012
+
1013
+
1014
  @app.cell
1015
  def _():
1016
  return
pyproject.toml CHANGED
@@ -12,6 +12,7 @@ dependencies = [
12
  "pandas>=2.3.0",
13
  "pca>=2.10.0",
14
  "plotly>=6.2.0",
 
15
  "pyarrow>=20.0.0",
16
  "scattertext==0.2.2",
17
  "scikit-learn==1.7.0",
 
12
  "pandas>=2.3.0",
13
  "pca>=2.10.0",
14
  "plotly>=6.2.0",
15
+ "prince>=0.16.0",
16
  "pyarrow>=20.0.0",
17
  "scattertext==0.2.2",
18
  "scikit-learn==1.7.0",
uv.lock CHANGED
@@ -953,6 +953,20 @@ wheels = [
953
  { url = "https://files.pythonhosted.org/packages/fa/8c/d3e30f80b2ef21f267f09f0b7d18995adccc928ede5b73ea3fe54e1303f4/preshed-3.0.10-cp313-cp313-win_amd64.whl", hash = "sha256:97e0e2edfd25a7dfba799b49b3c5cc248ad0318a76edd9d5fd2c82aa3d5c64ed", size = 115769, upload-time = "2025-05-26T15:18:21.842Z" },
954
  ]
955
 
 
 
956
  [[package]]
957
  name = "psutil"
958
  version = "7.0.0"
@@ -1276,6 +1290,7 @@ dependencies = [
1276
  { name = "pandas" },
1277
  { name = "pca" },
1278
  { name = "plotly" },
 
1279
  { name = "pyarrow" },
1280
  { name = "scattertext" },
1281
  { name = "scikit-learn" },
@@ -1293,6 +1308,7 @@ requires-dist = [
1293
  { name = "pandas", specifier = ">=2.3.0" },
1294
  { name = "pca", specifier = ">=2.10.0" },
1295
  { name = "plotly", specifier = ">=6.2.0" },
 
1296
  { name = "pyarrow", specifier = ">=20.0.0" },
1297
  { name = "scattertext", specifier = "==0.2.2" },
1298
  { name = "scikit-learn", specifier = "==1.7.0" },
 
953
  { url = "https://files.pythonhosted.org/packages/fa/8c/d3e30f80b2ef21f267f09f0b7d18995adccc928ede5b73ea3fe54e1303f4/preshed-3.0.10-cp313-cp313-win_amd64.whl", hash = "sha256:97e0e2edfd25a7dfba799b49b3c5cc248ad0318a76edd9d5fd2c82aa3d5c64ed", size = 115769, upload-time = "2025-05-26T15:18:21.842Z" },
954
  ]
955
 
956
+ [[package]]
957
+ name = "prince"
958
+ version = "0.16.0"
959
+ source = { registry = "https://pypi.org/simple" }
960
+ dependencies = [
961
+ { name = "altair" },
962
+ { name = "pandas" },
963
+ { name = "scikit-learn" },
964
+ ]
965
+ sdist = { url = "https://files.pythonhosted.org/packages/ae/bd/fde5962680ad17f8402848a6849344717c3ee341d47f60864f2d78bf720e/prince-0.16.0.tar.gz", hash = "sha256:8b3b9e74fc84ad066a1e6ef4fc076a55d80b7a46db2541a76902e47951c39b16", size = 414243, upload-time = "2025-03-09T21:38:43.631Z" }
966
+ wheels = [
967
+ { url = "https://files.pythonhosted.org/packages/18/d5/b4480a0f381cbbcfad31f4d118732ab717216857508a730938ee615669a1/prince-0.16.0-py3-none-any.whl", hash = "sha256:7e21a78d4dd06ca3ec526ee714a50b349f26de3fca6b79664150a951b31688f3", size = 417759, upload-time = "2025-03-09T21:38:41.001Z" },
968
+ ]
969
+
970
  [[package]]
971
  name = "psutil"
972
  version = "7.0.0"
 
1290
  { name = "pandas" },
1291
  { name = "pca" },
1292
  { name = "plotly" },
1293
+ { name = "prince" },
1294
  { name = "pyarrow" },
1295
  { name = "scattertext" },
1296
  { name = "scikit-learn" },
 
1308
  { name = "pandas", specifier = ">=2.3.0" },
1309
  { name = "pca", specifier = ">=2.10.0" },
1310
  { name = "plotly", specifier = ">=6.2.0" },
1311
+ { name = "prince", specifier = ">=0.16.0" },
1312
  { name = "pyarrow", specifier = ">=20.0.0" },
1313
  { name = "scattertext", specifier = "==0.2.2" },
1314
  { name = "scikit-learn", specifier = "==1.7.0" },