#!/usr/bin/env python3
# analyze_aspects.py
#
# Usage:
#   python analyze_aspects.py --isbn "9783446264199" --db-path /path/to/buch_datenbank.sqlite --languages de
import sqlite3
import argparse
import logging
from pathlib import Path
import nltk
from transformers import pipeline
from collections import defaultdict
import matplotlib.pyplot as plt
# Ensure the NLTK Punkt sentence-tokenizer models are available.
# quiet=True: nltk.download is a no-op once cached, but without it the
# availability check prints noisy output on every start.
nltk.download('punkt', quiet=True)
from nltk import sent_tokenize  # NOTE(review): appears unused below — sentence splitting goes through nltk.data.load; confirm before removing
# --- Logging setup ---
def configure_logging():
    """Configure root logging once and return this module's logger."""
    log_format = '%(asctime)s - %(levelname)s - %(message)s'
    logging.basicConfig(level=logging.INFO, format=log_format)
    return logging.getLogger(__name__)


logger = configure_logging()
# --- Aspect label maps ---
# Candidate labels for the zero-shot classifier, grouped under the aspect
# name they are mapped back to. German reviews:
ASPECT_LABEL_MAP = {
    "Handlung": ["Handlung", "Plot", "Story", "Aufbau"],
    "Charaktere": ["Charaktere", "Figuren", "Protagonisten", "Nebenfiguren", "Beziehungen"],
    "Stil": ["Stil", "Sprachstil", "Sprache", "Erzählweise"],
    "Emotionale Wirkung": ["Lesevergnügen", "Berührend", "Bewegend", "Begeisternd", "Spannend"],
    "Tiefgang": ["Tiefgang", "Nachdenklich", "Philosophisch", "kritisch"],
    "Thema & Kontext": ["Thema", "Motiv", "Zeitgeschehen", "Historischer Kontext", "Gesellschaft"],
    "Originalität": ["Originalität", "Kreativität", "Innovativ", "Idee", "Humor"],
    "Recherche & Authentizität": ["Recherche", "Authentizität", "Realismus", "Fakten"],
}
# Same grouping for English reviews:
ASPECT_LABEL_MAP_EN = {
    "Plot": ["Plot", "Story", "Narrative", "Structure"],
    "Characters": ["Characters", "Protagonists", "Antagonists", "Relationships"],
    "Style": ["Style", "Language", "Tone", "Narration"],
    "Emotional Impact": ["Touching", "Funny", "Exciting", "Moving", "Engaging"],
    "Depth": ["Philosophical", "Thought-provoking", "Insightful", "Critical"],
    "Theme & Context": ["Theme", "Motif", "Historical Context", "Social Issues"],
    "Originality": ["Originality", "Creativity", "Innovation", "Idea"],
    "Research & Authenticity": ["Research", "Authenticity", "Realism", "Facts"],
}
# Flat list of every German candidate label, in map order.
ALL_LABELS = []
for _labels in ASPECT_LABEL_MAP.values():
    ALL_LABELS.extend(_labels)
# --- Database access ---
def load_reviews(db_path: Path, isbn: str) -> list:
    """Load the cleaned review texts of one book from the SQLite database.

    Args:
        db_path: path to the SQLite database file.
        isbn: ISBN identifying the book.

    Returns:
        List of (review_id, text, lang) tuples — one entry per non-empty
        language variant: 'de' from cleaned_text, 'en' from cleaned_text_en.
    """
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.execute(
            "SELECT id, cleaned_text, cleaned_text_en FROM reviews_und_notizen WHERE buch_isbn = ?",
            (isbn,)
        )
        rows = cursor.fetchall()
    finally:
        # Close even if the query raises (connection leaked on error before).
        conn.close()
    texts_to_analyze = []
    for review_id, text_de, text_en in rows:
        # Guard against NULLs and non-string blobs coming back from SQLite.
        if text_de and isinstance(text_de, str):
            texts_to_analyze.append((review_id, text_de, 'de'))
        if text_en and isinstance(text_en, str):
            texts_to_analyze.append((review_id, text_en, 'en'))
    return texts_to_analyze
# --- Analysis ---
def analyze_quickwin(db_path: Path, isbn: str, device: int = -1,
                     languages: "list[str] | None" = None) -> dict:
    """Aspect-based sentiment analysis ("quick win") over one book's reviews.

    Each review is split into sentences; every sentence is assigned to an
    aspect via zero-shot classification (first candidate label scoring > 0.8,
    mapped back to its aspect group) and scored with a language-specific
    sentiment model.

    Args:
        db_path: path to the SQLite database.
        isbn: ISBN of the book to analyze.
        device: transformers device id (-1 = CPU, 0 = first GPU).
        languages: subset of ["de", "en"] to analyze; None means both.

    Returns:
        Mapping aspect name -> list of signed sentiment scores (positive
        value = positive sentiment); empty dict when no cleaned reviews
        exist for the requested languages.
    """
    # None instead of a mutable default list (shared-state pitfall).
    if languages is None:
        languages = ["de", "en"]
    reviews = load_reviews(db_path, isbn)
    reviews = [r for r in reviews if r[2] in languages]
    if not reviews:
        logger.warning(f"Keine gesäuberten Reviews für ISBN {isbn} in den gewählten Sprachen gefunden.")
        return {}
    zsl = pipeline("zero-shot-classification", model="facebook/bart-large-mnli", device=device, multi_label=True)
    sent_de = pipeline("sentiment-analysis", model="oliverguhr/german-sentiment-bert", device=device)
    sent_en = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english", device=device)
    # Hoisted loop invariants: per-language tokenizer cache (the Punkt pickle
    # was reloaded from disk for every review) and the English label list
    # (was rebuilt for every English review).
    lang_map = {'de': 'german', 'en': 'english'}
    tokenizers = {}
    en_labels = [label for labels in ASPECT_LABEL_MAP_EN.values() for label in labels]
    aspect_results = defaultdict(list)
    total_aspects = 0
    for review_id, text, lang in reviews:
        if not text:
            continue
        logger.info(f"Review ID {review_id} ({lang}) wird verarbeitet.")
        if lang not in tokenizers:
            tokenizers[lang] = nltk.data.load(f"tokenizers/punkt/{lang_map.get(lang, 'english')}.pickle")
        sentences = tokenizers[lang].tokenize(text)
        if lang == 'de':
            aspect_map = ASPECT_LABEL_MAP
            all_labels = ALL_LABELS
            sent_pipeline = sent_de
            hypothesis_template = "Dieser Satz handelt von {}."
        elif lang == 'en':
            aspect_map = ASPECT_LABEL_MAP_EN
            all_labels = en_labels
            sent_pipeline = sent_en
            hypothesis_template = "This sentence is about {}."
        else:
            continue
        for sent in sentences:
            # Very short fragments carry too little aspect signal; skip them.
            if not sent.strip() or len(sent) < 15:
                continue
            result = zsl(sent, candidate_labels=all_labels, hypothesis_template=hypothesis_template)
            main_label = ""
            best_score = 0.0
            # Labels come back sorted by score; take the first above the
            # threshold and map it back to its aspect group name.
            for label, score in zip(result["labels"], result["scores"]):
                if score > 0.8:
                    main_label = next((k for k, v in aspect_map.items() if label in v), label)
                    best_score = score
                    break
            if not main_label:
                continue
            ml_sentiment = sent_pipeline(sent)[0]
            # Signed score: POSITIVE keeps its score, everything else negated.
            ml_score = ml_sentiment['score'] if ml_sentiment['label'].upper().startswith('POS') else -ml_sentiment['score']
            final_score = ml_score
            final_label = 'POS' if final_score > 0.1 else 'NEG' if final_score < -0.1 else 'NEU'
            print(
                f"Review {review_id} ({lang}) | Satz: {sent}\n"
                f" Aspekt: {main_label} (via '{result['labels'][0]}', {best_score:.2f}) | "
                f"ML: {ml_sentiment['label']}({ml_sentiment['score']:.2f}) -> Final: {final_label}({final_score:.2f})"
            )
            aspect_results[main_label].append(final_score)
            total_aspects += 1
    logger.info(f"Total aspects found: {total_aspects}")
    return aspect_results
def visualize_aspects(aspect_results: dict[str, list[float]], output_dir: Path, filename: str = "sentiment_aspekte.png"):
    """Render a horizontal bar chart of mean sentiment per aspect and save it.

    Args:
        aspect_results: mapping aspect -> list of signed sentiment scores.
        output_dir: directory for the PNG (created if missing).
        filename: name of the output file.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    # Skip aspects without scores (avoids ZeroDivisionError on the mean).
    items = [(aspect, scores) for aspect, scores in aspect_results.items() if scores]
    if not items:
        logger.info("Keine Aspekt-Daten zur Visualisierung verfügbar.")
        return
    aspects = [aspect for aspect, _ in items]
    avg_scores = [sum(scores) / len(scores) for _, scores in items]
    # Green = positive, red = negative, gray = neutral (|score| <= 0.1).
    colors = ['green' if score > 0.1 else 'red' if score < -0.1 else 'gray' for score in avg_scores]
    # (removed redundant local "import matplotlib.pyplot as plt" — already
    # imported at module level)
    plt.figure(figsize=(10, 6))
    bars = plt.barh(aspects, avg_scores, color=colors)
    plt.axvline(x=0, color='black', linewidth=0.8)
    plt.xlabel("Durchschnittlicher Sentiment-Score")
    plt.title("Sentiment-Analyse pro Aspekt")
    # Annotate each bar with its mean score.
    for bar, score in zip(bars, avg_scores):
        plt.text(bar.get_width() + 0.01, bar.get_y() + bar.get_height() / 2,
                 f"{score:.2f}", va='center')
    plt.tight_layout()
    plt.gca().invert_yaxis()
    output_path = output_dir / filename
    plt.savefig(output_path, dpi=300)
    plt.close()
    logger.info(f"Diagramm gespeichert unter: {output_path}")
# --- Entry Point ---
def main():
    """CLI entry point: parse arguments, run the analysis, plot the results."""
    parser = argparse.ArgumentParser(description="Quick-Win ABSA ohne SentiWS")
    parser.add_argument("--db-path", required=True, help="Pfad zur SQLite-Datenbank")
    parser.add_argument("--isbn", required=True, help="ISBN des Buchs")
    parser.add_argument("--gpu", action="store_true", help="GPU verwenden (device=0)")
    parser.add_argument("--languages", nargs="+", choices=["de", "en"], default=["de", "en"],
                        help="Sprachen der Reviews, z. B. --languages de oder --languages de en")
    args = parser.parse_args()
    device = 0 if args.gpu else -1
    aspect_results = analyze_quickwin(
        Path(args.db_path), args.isbn,
        device=device,
        languages=args.languages
    )
    if aspect_results:
        output_dir = Path("output")
        visualize_aspects(aspect_results, output_dir)
    else:
        logger.info("Keine Aspekt-Daten zur Visualisierung verfügbar.")


if __name__ == "__main__":
    # Was missing: the usage comments at the top of the file document direct
    # script invocation, but main() was never called.
    main()