Till Fischer committed on
Commit
d96f744
·
1 Parent(s): ec6d3be

Update all changes

Browse files
Files changed (3) hide show
  1. analyze_aspects.py +28 -36
  2. app.py +3 -0
  3. download_nltk_resources.py +4 -0
analyze_aspects.py CHANGED
@@ -4,6 +4,9 @@
4
  #python /Users/fischer/Desktop/HanserMVP/scraping/analyze_aspects.py --isbn "9783446264199" --db-path /Users/fischer/Desktop/buch_datenbank.sqlite --languages de
5
  # python analyze_aspects.py --isbn "9783446264199" --db-path /Pfad/zur/sqlite.db --languages de
6
  # Fixing Punkt tokenizer bug
 
 
 
7
  import sqlite3
8
  import argparse
9
  import logging
@@ -12,39 +15,10 @@ import nltk
12
  from transformers import pipeline
13
  from collections import defaultdict
14
  import matplotlib.pyplot as plt
15
- import os
16
- nltk.download('punkt')
17
-
18
- def visualize_aspects(aspect_results: dict[str, list[float]], output_dir: Path, filename: str = "sentiment_aspekte.png"):
19
- output_dir.mkdir(parents=True, exist_ok=True)
20
-
21
- aspects = list(aspect_results.keys())
22
- avg_scores = [sum(scores) / len(scores) for scores in aspect_results.values()]
23
- colors = ['green' if score > 0.1 else 'red' if score < -0.1 else 'gray' for score in avg_scores]
24
-
25
- plt.figure(figsize=(10, 6))
26
- bars = plt.barh(aspects, avg_scores, color=colors)
27
- plt.axvline(x=0, color='black', linewidth=0.8)
28
- plt.xlabel("Durchschnittlicher Sentiment-Score")
29
- plt.title("Sentiment-Analyse pro Aspekt")
30
-
31
- for bar, score in zip(bars, avg_scores):
32
- plt.text(bar.get_width() + 0.01, bar.get_y() + bar.get_height() / 2,
33
- f"{score:.2f}", va='center')
34
-
35
- plt.tight_layout()
36
- plt.gca().invert_yaxis()
37
-
38
- output_path = output_dir / filename
39
- plt.savefig(output_path, dpi=300)
40
- plt.close()
41
-
42
- logger.info(f"Diagramm gespeichert unter: {output_path}")
43
-
44
 
45
- # NLTK punkt model for sentence tokenization
46
- nltk.download('punkt', download_dir='/home/user/nltk_data')
47
- from nltk.tokenize import sent_tokenize
48
 
49
  # Logging Configuration
50
  def configure_logging():
@@ -78,7 +52,6 @@ ASPECT_LABEL_MAP_EN = {
78
 
79
  ALL_LABELS = [label for labels in ASPECT_LABEL_MAP.values() for label in labels]
80
 
81
-
82
  # --- Datenbankzugriff ---
83
 
84
  def load_reviews(db_path: Path, isbn: str) -> list:
@@ -98,7 +71,6 @@ def load_reviews(db_path: Path, isbn: str) -> list:
98
  texts_to_analyze.append((review_id, text_en, 'en'))
99
  return texts_to_analyze
100
 
101
-
102
  # --- Analysefunktion ---
103
 
104
  def analyze_quickwin(db_path: Path, isbn: str, device: int = -1, languages: list[str] = ["de", "en"]) -> dict:
@@ -120,7 +92,7 @@ def analyze_quickwin(db_path: Path, isbn: str, device: int = -1, languages: list
120
  continue
121
 
122
  logger.info(f"Review ID {review_id} ({lang}) wird verarbeitet.")
123
-
124
  lang_map = {'de': 'german', 'en': 'english'}
125
  sentences = sent_tokenize(text, language=lang_map.get(lang, 'english'))
126
 
@@ -171,6 +143,26 @@ def analyze_quickwin(db_path: Path, isbn: str, device: int = -1, languages: list
171
  logger.info(f"Total aspects found: {total_aspects}")
172
  return aspect_results
173
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
174
 
175
  # --- Entry Point ---
176
 
@@ -194,4 +186,4 @@ def main():
194
  output_dir = Path("output")
195
  visualize_aspects(aspect_results, output_dir)
196
  else:
197
- logger.info("Keine Aspekt-Daten zur Visualisierung verfügbar.")
 
4
  #python /Users/fischer/Desktop/HanserMVP/scraping/analyze_aspects.py --isbn "9783446264199" --db-path /Users/fischer/Desktop/buch_datenbank.sqlite --languages de
5
  # python analyze_aspects.py --isbn "9783446264199" --db-path /Pfad/zur/sqlite.db --languages de
6
  # Fixing Punkt tokenizer bug
7
+ #!/usr/bin/env python3
8
+ # analyze_aspects.py
9
+
10
  import sqlite3
11
  import argparse
12
  import logging
 
15
  from transformers import pipeline
16
  from collections import defaultdict
17
  import matplotlib.pyplot as plt
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
 
19
+ # Download punkt tokenizer wie lokal
20
+ nltk.download('punkt')
21
+ from nltk import sent_tokenize
22
 
23
  # Logging Configuration
24
  def configure_logging():
 
52
 
53
  ALL_LABELS = [label for labels in ASPECT_LABEL_MAP.values() for label in labels]
54
 
 
55
  # --- Datenbankzugriff ---
56
 
57
  def load_reviews(db_path: Path, isbn: str) -> list:
 
71
  texts_to_analyze.append((review_id, text_en, 'en'))
72
  return texts_to_analyze
73
 
 
74
  # --- Analysefunktion ---
75
 
76
  def analyze_quickwin(db_path: Path, isbn: str, device: int = -1, languages: list[str] = ["de", "en"]) -> dict:
 
92
  continue
93
 
94
  logger.info(f"Review ID {review_id} ({lang}) wird verarbeitet.")
95
+
96
  lang_map = {'de': 'german', 'en': 'english'}
97
  sentences = sent_tokenize(text, language=lang_map.get(lang, 'english'))
98
 
 
143
  logger.info(f"Total aspects found: {total_aspects}")
144
  return aspect_results
145
 
146
+ def visualize_aspects(aspect_results: dict[str, list[float]], output_dir: Path, filename: str = "sentiment_aspekte.png"):
147
+ output_dir.mkdir(parents=True, exist_ok=True)
148
+ aspects = list(aspect_results.keys())
149
+ avg_scores = [sum(scores) / len(scores) for scores in aspect_results.values()]
150
+ colors = ['green' if score > 0.1 else 'red' if score < -0.1 else 'gray' for score in avg_scores]
151
+ import matplotlib.pyplot as plt
152
+ plt.figure(figsize=(10, 6))
153
+ bars = plt.barh(aspects, avg_scores, color=colors)
154
+ plt.axvline(x=0, color='black', linewidth=0.8)
155
+ plt.xlabel("Durchschnittlicher Sentiment-Score")
156
+ plt.title("Sentiment-Analyse pro Aspekt")
157
+ for bar, score in zip(bars, avg_scores):
158
+ plt.text(bar.get_width() + 0.01, bar.get_y() + bar.get_height() / 2,
159
+ f"{score:.2f}", va='center')
160
+ plt.tight_layout()
161
+ plt.gca().invert_yaxis()
162
+ output_path = output_dir / filename
163
+ plt.savefig(output_path, dpi=300)
164
+ plt.close()
165
+ logger.info(f"Diagramm gespeichert unter: {output_path}")
166
 
167
  # --- Entry Point ---
168
 
 
186
  output_dir = Path("output")
187
  visualize_aspects(aspect_results, output_dir)
188
  else:
189
+ logger.info("Keine Aspekt-Daten zur Visualisierung verfügbar.")
app.py CHANGED
@@ -5,6 +5,9 @@ from analyze_aspects import analyze_quickwin, visualize_aspects
5
  from pathlib import Path
6
  import tempfile
7
  import shutil
 
 
 
8
 
9
  def run_analysis(db_file, isbn, languages):
10
  if not isbn.strip():
 
5
  from pathlib import Path
6
  import tempfile
7
  import shutil
8
+ import os
9
+
10
+ os.system("python download_nltk_resources.py")
11
 
12
  def run_analysis(db_file, isbn, languages):
13
  if not isbn.strip():
download_nltk_resources.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ import nltk
2
+
3
+ nltk.download('punkt')
4
+ nltk.download('stopwords')