#!/usr/bin/env python3
# analyze_aspects.py

# Example (local run):
#   python /Users/fischer/Desktop/HanserMVP/scraping/analyze_aspects.py --isbn "9783446264199" --db-path /Users/fischer/Desktop/buch_datenbank.sqlite --languages de
# Usage:
#   python analyze_aspects.py --isbn "9783446264199" --db-path /path/to/sqlite.db --languages de
# Fixing Punkt tokenizer bug

import sqlite3
import argparse
import logging
from pathlib import Path
import nltk
from transformers import pipeline
from collections import defaultdict
import matplotlib.pyplot as plt

# Download the Punkt sentence tokenizer models. NLTK >= 3.9 ships Punkt as
# 'punkt_tab'; downloading both keeps this working across NLTK versions.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
from nltk import sent_tokenize

# Logging Configuration
def configure_logging():
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
    return logging.getLogger(__name__)

logger = configure_logging()

# Aspect label maps: fine-grained candidate labels (fed to the zero-shot
# classifier) grouped under coarse aspect names. The German labels are
# functional model inputs and are deliberately kept in German.
ASPECT_LABEL_MAP = {
    "Handlung": ["Handlung", "Plot", "Story", "Aufbau"],
    "Charaktere": ["Charaktere", "Figuren", "Protagonisten", "Nebenfiguren", "Beziehungen"],
    "Stil": ["Stil", "Sprachstil", "Sprache", "Erzählweise"],
    "Emotionale Wirkung": ["Lesevergnügen", "Berührend", "Bewegend", "Begeisternd", "Spannend"],
    "Tiefgang": ["Tiefgang", "Nachdenklich", "Philosophisch", "kritisch"],
    "Thema & Kontext": ["Thema", "Motiv", "Zeitgeschehen", "Historischer Kontext", "Gesellschaft"],
    "Originalität": ["Originalität", "Kreativität", "Innovativ", "Idee", 'Humor'],
    "Recherche & Authentizität": ["Recherche", "Authentizität", "Realismus", "Fakten"]
}

ASPECT_LABEL_MAP_EN = {
    "Plot": ["Plot", "Story", "Narrative", "Structure"],
    "Characters": ["Characters", "Protagonists", "Antagonists", "Relationships"],
    "Style": ["Style", "Language", "Tone", "Narration"],
    "Emotional Impact": ["Touching", "Funny", "Exciting", "Moving", "Engaging"],
    "Depth": ["Philosophical", "Thought-provoking", "Insightful", "Critical"],
    "Theme & Context": ["Theme", "Motif", "Historical Context", "Social Issues"],
    "Originality": ["Originality", "Creativity", "Innovation", "Idea"],
    "Research & Authenticity": ["Research", "Authenticity", "Realism", "Facts"]
}

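# Flat list of every German candidate label passed to the zero-shot classifier.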
ALL_LABELS = [label for labels in ASPECT_LABEL_MAP.values() for label in labels]

# --- Database access ---

def load_reviews(db_path: Path, isbn: str) -> list:
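    """Load cleaned review texts (German and English) for one ISBN.

    Returns a list of (review_id, text, language_code) tuples, one entry per
    available language variant of each review.
    """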
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    cursor.execute(
        "SELECT id, cleaned_text, cleaned_text_en FROM reviews_und_notizen WHERE buch_isbn = ?",
        (isbn,)
    )
    rows = cursor.fetchall()
    conn.close()
    texts_to_analyze = []
    for review_id, text_de, text_en in rows:
        if text_de and isinstance(text_de, str):
            texts_to_analyze.append((review_id, text_de, 'de'))
        if text_en and isinstance(text_en, str):
            texts_to_analyze.append((review_id, text_en, 'en'))
    return texts_to_analyze

# --- Analysis function ---

def analyze_quickwin(db_path: Path, isbn: str, device: int = -1, languages: tuple[str, ...] = ("de", "en")) -> dict:
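    """Run zero-shot aspect detection plus per-language sentiment analysis.

    Each sentence of each review is classified against the candidate labels;
    sentences that clear the confidence threshold contribute a signed sentiment
    score to their aspect. Returns {aspect: [scores]}.
    """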
    reviews = load_reviews(db_path, isbn)
    reviews = [r for r in reviews if r[2] in languages]
    if not reviews:
        logger.warning(f"Keine gesäuberten Reviews für ISBN {isbn} in den gewählten Sprachen gefunden.")
        return {}

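    # One zero-shot NLI model for aspect detection, plus a dedicated sentiment
    # model per language.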
    zsl = pipeline("zero-shot-classification", model="facebook/bart-large-mnli", device=device, multi_label=True)
    sent_de = pipeline("sentiment-analysis", model="oliverguhr/german-sentiment-bert", device=device)
    sent_en = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english", device=device)

    aspect_results = defaultdict(list)
    total_aspects = 0

    for review_id, text, lang in reviews:
        if not text:
            continue

        logger.info(f"Review ID {review_id} ({lang}) wird verarbeitet.")

        # Sentence-split with the language-appropriate Punkt model via the
        # public API (the direct pickle load breaks on newer NLTK releases).
        lang_map = {'de': 'german', 'en': 'english'}
        sentences = sent_tokenize(text, language=lang_map.get(lang, 'english'))

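        # Select per-language resources; the hypothesis templates deliberately
        # stay in the review's language so they match the NLI model's input.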
        if lang == 'de':
            aspect_map = ASPECT_LABEL_MAP
            all_labels = ALL_LABELS
            sent_pipeline = sent_de
            hypothesis_template = "Dieser Satz handelt von {}."
        elif lang == 'en':
            aspect_map = ASPECT_LABEL_MAP_EN
            all_labels = [label for labels in aspect_map.values() for label in labels]
            sent_pipeline = sent_en
            hypothesis_template = "This sentence is about {}."
        else:
            continue

        for sent in sentences:
            if not sent.strip() or len(sent) < 15:
                continue

            result = zsl(sent, candidate_labels=all_labels, hypothesis_template=hypothesis_template)

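            # result["labels"] comes back sorted by score (descending), so only
            # the top candidate can clear the 0.8 threshold; map it back to its
            # coarse aspect group.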
            main_label = ""
            best_score = 0.0
            for label, score in zip(result["labels"], result["scores"]):
                if score > 0.8:
                    main_label = next((k for k, v in aspect_map.items() if label in v), label)
                    best_score = score
                    break

            if not main_label:
                continue

            ml_sentiment = sent_pipeline(sent)[0]
            # Map the model output to a signed score. The German model also
            # emits 'neutral', which must map to 0.0 rather than a negated score.
            label = ml_sentiment['label'].upper()
            if label.startswith('POS'):
                final_score = ml_sentiment['score']
            elif label.startswith('NEG'):
                final_score = -ml_sentiment['score']
            else:
                final_score = 0.0
            final_label = 'POS' if final_score > 0.1 else 'NEG' if final_score < -0.1 else 'NEU'

            print(
                f"Review {review_id} ({lang}) | Sentence: {sent}\n"
                f"  Aspect: {main_label} (via '{result['labels'][0]}', {best_score:.2f}) | "
                f"ML: {ml_sentiment['label']}({ml_sentiment['score']:.2f}) -> Final: {final_label}({final_score:.2f})"
            )

            aspect_results[main_label].append(final_score)
            total_aspects += 1

    logger.info(f"Total aspects found: {total_aspects}")
    return aspect_results

def visualize_aspects(aspect_results: dict[str, list[float]], output_dir: Path, filename: str = "sentiment_aspekte.png"):
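    """Render a horizontal bar chart of average sentiment per aspect and save it as a PNG."""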
    output_dir.mkdir(parents=True, exist_ok=True)
    aspects = list(aspect_results.keys())
    avg_scores = [sum(scores) / len(scores) for scores in aspect_results.values()]
    colors = ['green' if score > 0.1 else 'red' if score < -0.1 else 'gray' for score in avg_scores]
    plt.figure(figsize=(10, 6))
    bars = plt.barh(aspects, avg_scores, color=colors)
    plt.axvline(x=0, color='black', linewidth=0.8)
    plt.xlabel("Durchschnittlicher Sentiment-Score")
    plt.title("Sentiment-Analyse pro Aspekt")
    for bar, score in zip(bars, avg_scores):
        plt.text(bar.get_width() + 0.01, bar.get_y() + bar.get_height() / 2,
                 f"{score:.2f}", va='center')
    plt.tight_layout()
    plt.gca().invert_yaxis()
    output_path = output_dir / filename
    plt.savefig(output_path, dpi=300)
    plt.close()
    logger.info(f"Diagramm gespeichert unter: {output_path}")

# --- Entry Point ---

def main():
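    """Parse CLI arguments, run the analysis, and visualize the results."""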
    parser = argparse.ArgumentParser(description="Quick-win ABSA without SentiWS")
    parser.add_argument("--db-path", required=True, help="Path to the SQLite database")
    parser.add_argument("--isbn", required=True, help="ISBN of the book")
    parser.add_argument("--gpu", action="store_true", help="Use GPU (device=0)")
    parser.add_argument("--languages", nargs="+", choices=["de", "en"], default=["de", "en"],
                        help="Languages of the reviews, e.g. --languages de or --languages de en")
    args = parser.parse_args()

    device = 0 if args.gpu else -1
    aspect_results = analyze_quickwin(
        Path(args.db_path), args.isbn,
        device=device,
        languages=args.languages
    )

    if aspect_results:
        output_dir = Path("output")
        visualize_aspects(aspect_results, output_dir)
    else:
        logger.info("No aspect data available for visualization.")

if __name__ == "__main__":
    main()