Till Fischer committed on
Commit
cac53d2
·
1 Parent(s): 8aac46d

Fix PunktTokenizer für Hugging Face Space

Browse files
Files changed (1) hide show
  1. analyze_aspects.py +9 -4
analyze_aspects.py CHANGED
@@ -3,7 +3,7 @@
3
 
4
  #python /Users/fischer/Desktop/HanserMVP/scraping/analyze_aspects.py --isbn "9783446264199" --db-path /Users/fischer/Desktop/buch_datenbank.sqlite --languages de
5
  # python analyze_aspects.py --isbn "9783446264199" --db-path /Pfad/zur/sqlite.db --languages de
6
-
7
  import sqlite3
8
  import argparse
9
  import logging
@@ -13,7 +13,6 @@ from transformers import pipeline
13
  from collections import defaultdict
14
  import matplotlib.pyplot as plt
15
 
16
-
17
  def visualize_aspects(aspect_results: dict[str, list[float]], output_dir: Path, filename: str = "sentiment_aspekte.png"):
18
  output_dir.mkdir(parents=True, exist_ok=True)
19
 
@@ -42,7 +41,7 @@ def visualize_aspects(aspect_results: dict[str, list[float]], output_dir: Path,
42
 
43
 
44
  # NLTK punkt model for sentence tokenization
45
- nltk.download('punkt')
46
  from nltk import sent_tokenize
47
 
48
  # Logging Configuration
@@ -119,7 +118,13 @@ def analyze_quickwin(db_path: Path, isbn: str, device: int = -1, languages: list
119
  continue
120
 
121
  logger.info(f"Review ID {review_id} ({lang}) wird verarbeitet.")
122
- sentences = sent_tokenize(text, language='german' if lang == 'de' else 'english')
 
 
 
 
 
 
123
 
124
  if lang == 'de':
125
  aspect_map = ASPECT_LABEL_MAP
 
3
 
4
  #python /Users/fischer/Desktop/HanserMVP/scraping/analyze_aspects.py --isbn "9783446264199" --db-path /Users/fischer/Desktop/buch_datenbank.sqlite --languages de
5
  # python analyze_aspects.py --isbn "9783446264199" --db-path /Pfad/zur/sqlite.db --languages de
6
+ # Fixing Punkt tokenizer bug
7
  import sqlite3
8
  import argparse
9
  import logging
 
13
  from collections import defaultdict
14
  import matplotlib.pyplot as plt
15
 
 
16
  def visualize_aspects(aspect_results: dict[str, list[float]], output_dir: Path, filename: str = "sentiment_aspekte.png"):
17
  output_dir.mkdir(parents=True, exist_ok=True)
18
 
 
41
 
42
 
43
  # NLTK punkt model for sentence tokenization
44
+ nltk.download('punkt', download_dir='/home/user/nltk_data')
45
  from nltk import sent_tokenize
46
 
47
  # Logging Configuration
 
118
  continue
119
 
120
  logger.info(f"Review ID {review_id} ({lang}) wird verarbeitet.")
121
+ import os
122
+ nltk.download('punkt')
123
+ nltk.data.path.append("/home/user/nltk_data")
124
+ os.environ["NLTK_DATA"] = "/home/user/nltk_data"
125
+
126
+ lang_map = {'de': 'german', 'en': 'english'}
127
+ sentences = sent_tokenize(text, language=lang_map.get(lang, 'english'))
128
 
129
  if lang == 'de':
130
  aspect_map = ASPECT_LABEL_MAP