Till Fischer
committed on
Commit
·
cac53d2
1
Parent(s):
8aac46d
Fix PunktTokenizer für Hugging Face Space
Browse files - analyze_aspects.py +9 -4
analyze_aspects.py
CHANGED
@@ -3,7 +3,7 @@
|
|
3 |
|
4 |
#python /Users/fischer/Desktop/HanserMVP/scraping/analyze_aspects.py --isbn "9783446264199" --db-path /Users/fischer/Desktop/buch_datenbank.sqlite --languages de
|
5 |
# python analyze_aspects.py --isbn "9783446264199" --db-path /Pfad/zur/sqlite.db --languages de
|
6 |
-
|
7 |
import sqlite3
|
8 |
import argparse
|
9 |
import logging
|
@@ -13,7 +13,6 @@ from transformers import pipeline
|
|
13 |
from collections import defaultdict
|
14 |
import matplotlib.pyplot as plt
|
15 |
|
16 |
-
|
17 |
def visualize_aspects(aspect_results: dict[str, list[float]], output_dir: Path, filename: str = "sentiment_aspekte.png"):
|
18 |
output_dir.mkdir(parents=True, exist_ok=True)
|
19 |
|
@@ -42,7 +41,7 @@ def visualize_aspects(aspect_results: dict[str, list[float]], output_dir: Path,
|
|
42 |
|
43 |
|
44 |
# NLTK punkt model for sentence tokenization
|
45 |
-
nltk.download('punkt')
|
46 |
from nltk import sent_tokenize
|
47 |
|
48 |
# Logging Configuration
|
@@ -119,7 +118,13 @@ def analyze_quickwin(db_path: Path, isbn: str, device: int = -1, languages: list
|
|
119 |
continue
|
120 |
|
121 |
logger.info(f"Review ID {review_id} ({lang}) wird verarbeitet.")
|
122 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
123 |
|
124 |
if lang == 'de':
|
125 |
aspect_map = ASPECT_LABEL_MAP
|
|
|
3 |
|
4 |
#python /Users/fischer/Desktop/HanserMVP/scraping/analyze_aspects.py --isbn "9783446264199" --db-path /Users/fischer/Desktop/buch_datenbank.sqlite --languages de
|
5 |
# python analyze_aspects.py --isbn "9783446264199" --db-path /Pfad/zur/sqlite.db --languages de
|
6 |
+
# Fixing Punkt tokenizer bug
|
7 |
import sqlite3
|
8 |
import argparse
|
9 |
import logging
|
|
|
13 |
from collections import defaultdict
|
14 |
import matplotlib.pyplot as plt
|
15 |
|
|
|
16 |
def visualize_aspects(aspect_results: dict[str, list[float]], output_dir: Path, filename: str = "sentiment_aspekte.png"):
|
17 |
output_dir.mkdir(parents=True, exist_ok=True)
|
18 |
|
|
|
41 |
|
42 |
|
43 |
# NLTK punkt model for sentence tokenization
|
44 |
+
nltk.download('punkt', download_dir='/home/user/nltk_data')
|
45 |
from nltk import sent_tokenize
|
46 |
|
47 |
# Logging Configuration
|
|
|
118 |
continue
|
119 |
|
120 |
logger.info(f"Review ID {review_id} ({lang}) wird verarbeitet.")
|
121 |
+
import os
|
122 |
+
nltk.download('punkt')
|
123 |
+
nltk.data.path.append("/home/user/nltk_data")
|
124 |
+
os.environ["NLTK_DATA"] = "/home/user/nltk_data"
|
125 |
+
|
126 |
+
lang_map = {'de': 'german', 'en': 'english'}
|
127 |
+
sentences = sent_tokenize(text, language=lang_map.get(lang, 'english'))
|
128 |
|
129 |
if lang == 'de':
|
130 |
aspect_map = ASPECT_LABEL_MAP
|