# aspect-sentiment-analyzer / analyze_aspects.py
# Author: Till Fischer — commit c7ad5e5 ("Update all changes")
#!/usr/bin/env python3
# analyze_aspects.py
#
# Aspect-based sentiment analysis (ABSA) for book reviews stored in SQLite.
#
# Example invocation:
#   python analyze_aspects.py --isbn "9783446264199" --db-path /path/to/sqlite.db --languages de

# Standard library
import argparse
import logging
import sqlite3
from collections import defaultdict
from pathlib import Path

# Third party
import matplotlib.pyplot as plt
import nltk
from transformers import pipeline

# Ensure the Punkt sentence-tokenizer models are available locally
# (analyze_quickwin loads language-specific Punkt pickles at runtime).
nltk.download('punkt')
from nltk import sent_tokenize
# --- Logging setup ---
def configure_logging():
    """Configure root logging once and return this module's logger."""
    log_format = '%(asctime)s - %(levelname)s - %(message)s'
    logging.basicConfig(level=logging.INFO, format=log_format)
    return logging.getLogger(__name__)


logger = configure_logging()
# --- Aspect label maps ---
# Canonical aspect name (dict key) -> list of candidate labels handed to the
# zero-shot classifier; a hit on any candidate label is folded back onto the
# canonical key in analyze_quickwin.
ASPECT_LABEL_MAP = {
    "Handlung": ["Handlung", "Plot", "Story", "Aufbau"],
    "Charaktere": ["Charaktere", "Figuren", "Protagonisten", "Nebenfiguren", "Beziehungen"],
    "Stil": ["Stil", "Sprachstil", "Sprache", "Erzählweise"],
    "Emotionale Wirkung": ["Lesevergnügen", "Berührend", "Bewegend", "Begeisternd", "Spannend"],
    "Tiefgang": ["Tiefgang", "Nachdenklich", "Philosophisch", "kritisch"],
    "Thema & Kontext": ["Thema", "Motiv", "Zeitgeschehen", "Historischer Kontext", "Gesellschaft"],
    "Originalität": ["Originalität", "Kreativität", "Innovativ", "Idee", 'Humor'],
    "Recherche & Authentizität": ["Recherche", "Authentizität", "Realismus", "Fakten"]
}
# English counterpart of ASPECT_LABEL_MAP, used for reviews tagged 'en'.
ASPECT_LABEL_MAP_EN = {
    "Plot": ["Plot", "Story", "Narrative", "Structure"],
    "Characters": ["Characters", "Protagonists", "Antagonists", "Relationships"],
    "Style": ["Style", "Language", "Tone", "Narration"],
    "Emotional Impact": ["Touching", "Funny", "Exciting", "Moving", "Engaging"],
    "Depth": ["Philosophical", "Thought-provoking", "Insightful", "Critical"],
    "Theme & Context": ["Theme", "Motif", "Historical Context", "Social Issues"],
    "Originality": ["Originality", "Creativity", "Innovation", "Idea"],
    "Research & Authenticity": ["Research", "Authenticity", "Realism", "Facts"]
}
# Flat list of all German candidate labels; the English labels are flattened
# on demand inside analyze_quickwin.
ALL_LABELS = [label for labels in ASPECT_LABEL_MAP.values() for label in labels]
# --- Database access ---
def load_reviews(db_path: Path, isbn: str) -> list:
    """Fetch cleaned review texts (German and English) for one book.

    Args:
        db_path: Path to the SQLite database file.
        isbn: ISBN used to filter the reviews_und_notizen table.

    Returns:
        List of (review_id, text, language) tuples, with language 'de' for
        cleaned_text and 'en' for cleaned_text_en. A single row can
        contribute both a 'de' and an 'en' entry.
    """
    # try/finally guarantees the connection is closed even when the query
    # raises (the previous version leaked the handle on error).
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        cursor.execute(
            "SELECT id, cleaned_text, cleaned_text_en FROM reviews_und_notizen WHERE buch_isbn = ?",
            (isbn,)
        )
        rows = cursor.fetchall()
    finally:
        conn.close()

    texts_to_analyze = []
    for review_id, text_de, text_en in rows:
        # Keep only non-empty string payloads; NULL columns arrive as None.
        if text_de and isinstance(text_de, str):
            texts_to_analyze.append((review_id, text_de, 'de'))
        if text_en and isinstance(text_en, str):
            texts_to_analyze.append((review_id, text_en, 'en'))
    return texts_to_analyze
# --- Analysis ---
def analyze_quickwin(db_path: Path, isbn: str, device: int = -1,
                     languages: list[str] | None = None) -> dict:
    """Run aspect-based sentiment analysis over a book's cleaned reviews.

    Sentences are classified against per-language aspect labels via
    zero-shot classification; hits above a 0.8 confidence threshold get a
    signed sentiment score from a language-specific sentiment model.

    Args:
        db_path: Path to the SQLite database.
        isbn: ISBN of the book whose reviews are analyzed.
        device: transformers device index (-1 = CPU, 0 = first GPU).
        languages: Languages to include ('de' and/or 'en'); defaults to both.

    Returns:
        Mapping of canonical aspect name -> list of signed sentiment scores,
        one per matched sentence. Empty dict when no reviews are found.
    """
    # None sentinel instead of a mutable list default, which would be shared
    # across calls and silently corrupted if ever mutated.
    if languages is None:
        languages = ["de", "en"]
    reviews = [r for r in load_reviews(db_path, isbn) if r[2] in languages]
    if not reviews:
        logger.warning(f"Keine gesäuberten Reviews für ISBN {isbn} in den gewählten Sprachen gefunden.")
        return {}

    # Heavy models are instantiated once per call, never per review.
    zsl = pipeline("zero-shot-classification", model="facebook/bart-large-mnli", device=device, multi_label=True)
    sent_de = pipeline("sentiment-analysis", model="oliverguhr/german-sentiment-bert", device=device)
    sent_en = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english", device=device)

    # Per-language config hoisted out of the review loop:
    # (aspect map, flat candidate labels, sentiment pipeline, hypothesis template).
    en_labels = [label for labels in ASPECT_LABEL_MAP_EN.values() for label in labels]
    lang_config = {
        'de': (ASPECT_LABEL_MAP, ALL_LABELS, sent_de, "Dieser Satz handelt von {}."),
        'en': (ASPECT_LABEL_MAP_EN, en_labels, sent_en, "This sentence is about {}."),
    }
    lang_map = {'de': 'german', 'en': 'english'}
    tokenizer_cache: dict[str, object] = {}  # Punkt models, loaded once per language

    aspect_results = defaultdict(list)
    total_aspects = 0
    for review_id, text, lang in reviews:
        # Skip empty texts and languages without a configured pipeline.
        if not text or lang not in lang_config:
            continue
        logger.info(f"Review ID {review_id} ({lang}) wird verarbeitet.")
        # Load (and cache) the language-specific Punkt sentence tokenizer;
        # the previous version re-loaded the pickle for every review.
        if lang not in tokenizer_cache:
            tokenizer_cache[lang] = nltk.data.load(
                f"tokenizers/punkt/{lang_map.get(lang, 'english')}.pickle")
        sentences = tokenizer_cache[lang].tokenize(text)
        aspect_map, all_labels, sent_pipeline, hypothesis_template = lang_config[lang]

        for sent in sentences:
            # Skip blanks and very short sentences (< 15 chars) that the
            # classifier cannot score reliably.
            if not sent.strip() or len(sent) < 15:
                continue
            result = zsl(sent, candidate_labels=all_labels, hypothesis_template=hypothesis_template)
            # Labels come back sorted by score descending: take the first one
            # above the 0.8 threshold and fold it onto its canonical aspect.
            main_label = ""
            best_score = 0.0
            for label, score in zip(result["labels"], result["scores"]):
                if score > 0.8:
                    main_label = next((k for k, v in aspect_map.items() if label in v), label)
                    best_score = score
                    break
            if not main_label:
                continue
            ml_sentiment = sent_pipeline(sent)[0]
            # Signed score: positive model labels keep the confidence,
            # everything else (negative/neutral) is treated as negative.
            ml_score = ml_sentiment['score'] if ml_sentiment['label'].upper().startswith('POS') else -ml_sentiment['score']
            final_score = ml_score
            final_label = 'POS' if final_score > 0.1 else 'NEG' if final_score < -0.1 else 'NEU'
            print(
                f"Review {review_id} ({lang}) | Satz: {sent}\n"
                f" Aspekt: {main_label} (via '{result['labels'][0]}', {best_score:.2f}) | "
                f"ML: {ml_sentiment['label']}({ml_sentiment['score']:.2f}) -> Final: {final_label}({final_score:.2f})"
            )
            aspect_results[main_label].append(final_score)
            total_aspects += 1
    logger.info(f"Total aspects found: {total_aspects}")
    return aspect_results
def visualize_aspects(aspect_results: dict[str, list[float]], output_dir: Path, filename: str = "sentiment_aspekte.png"):
    """Render a horizontal bar chart of the average sentiment per aspect.

    Args:
        aspect_results: Mapping of aspect name -> list of sentiment scores.
            Aspects with an empty score list are skipped (the previous
            version raised ZeroDivisionError on them).
        output_dir: Directory the PNG is written to (created if missing).
        filename: Output file name inside output_dir.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    # Average each aspect's scores, guarding against empty score lists.
    aspects = []
    avg_scores = []
    for aspect, scores in aspect_results.items():
        if scores:
            aspects.append(aspect)
            avg_scores.append(sum(scores) / len(scores))
    # Color-code by polarity: > 0.1 positive, < -0.1 negative, else neutral.
    colors = ['green' if score > 0.1 else 'red' if score < -0.1 else 'gray'
              for score in avg_scores]
    # Uses the module-level matplotlib import; the redundant function-local
    # import was removed.
    plt.figure(figsize=(10, 6))
    bars = plt.barh(aspects, avg_scores, color=colors)
    plt.axvline(x=0, color='black', linewidth=0.8)
    plt.xlabel("Durchschnittlicher Sentiment-Score")
    plt.title("Sentiment-Analyse pro Aspekt")
    # Annotate each bar with its numeric value just past the bar end.
    for bar, score in zip(bars, avg_scores):
        plt.text(bar.get_width() + 0.01, bar.get_y() + bar.get_height() / 2,
                 f"{score:.2f}", va='center')
    plt.tight_layout()
    plt.gca().invert_yaxis()
    output_path = output_dir / filename
    plt.savefig(output_path, dpi=300)
    plt.close()
    logger.info(f"Diagramm gespeichert unter: {output_path}")
# --- Entry Point ---
def main():
    """CLI entry point: parse arguments, run the analysis, plot the result."""
    parser = argparse.ArgumentParser(description="Quick-Win ABSA ohne SentiWS")
    parser.add_argument("--db-path", required=True, help="Pfad zur SQLite-Datenbank")
    parser.add_argument("--isbn", required=True, help="ISBN des Buchs")
    parser.add_argument("--gpu", action="store_true", help="GPU verwenden (device=0)")
    parser.add_argument("--languages", nargs="+", choices=["de", "en"], default=["de", "en"],
                        help="Sprachen der Reviews, z. B. --languages de oder --languages de en")
    cli_args = parser.parse_args()

    results = analyze_quickwin(
        Path(cli_args.db_path),
        cli_args.isbn,
        device=0 if cli_args.gpu else -1,
        languages=cli_args.languages,
    )
    # Guard clause: nothing to plot without aspect data.
    if not results:
        logger.info("Keine Aspekt-Daten zur Visualisierung verfügbar.")
        return
    visualize_aspects(results, Path("output"))