drewThomasson committed on
Commit
771c7ff
·
verified ·
1 Parent(s): 4920af4

added Arabic punctuation to sentence splitting

Browse files
Files changed (1) hide show
  1. app.py +3 -6
app.py CHANGED
@@ -537,12 +537,6 @@ def split_long_sentence(sentence, language='en', max_pauses=10):
537
  #Get the Max character length for the selected language -2 : with a default of 248 if no language is found
538
  max_length = (char_limits.get(language, 250)-2)
539
 
540
- ## Adjust the max_length and punctuation symbols based on language
541
- #if language == 'zh-cn':
542
- # punctuation = [',', '。', ';', '!', '?'] # Chinese-specific punctuation
543
- #else:
544
- # punctuation = [',', ';', '.'] # Default punctuation
545
-
546
  # Adjust the pause punctuation symbols based on language
547
  if language == 'zh-cn':
548
  punctuation = [',', '。', ';', '?', '!'] # Chinese-specific pause punctuation including sentence-ending marks
@@ -550,12 +544,15 @@ def split_long_sentence(sentence, language='en', max_pauses=10):
550
  punctuation = ['、', '。', ';', '?', '!'] # Japanese-specific pause punctuation
551
  elif language == 'ko':
552
  punctuation = [',', '。', ';', '?', '!'] # Korean-specific pause punctuation
 
 
553
  elif language == 'en':
554
  punctuation = [',', ';', '.'] # English-specific pause punctuation
555
  else:
556
  # Default pause punctuation for other languages (es, fr, de, it, pt, pl, cs, ru, nl, tr, hu)
557
  punctuation = [',', '.', ';', ':', '?', '!']
558
 
 
559
 
560
  parts = []
561
  while len(sentence) > max_length or sum(sentence.count(p) for p in punctuation) > max_pauses:
 
537
  #Get the Max character length for the selected language -2 : with a default of 248 if no language is found
538
  max_length = (char_limits.get(language, 250)-2)
539
 
 
 
 
 
 
 
540
  # Adjust the pause punctuation symbols based on language
541
  if language == 'zh-cn':
542
  punctuation = [',', '。', ';', '?', '!'] # Chinese-specific pause punctuation including sentence-ending marks
 
544
  punctuation = ['、', '。', ';', '?', '!'] # Japanese-specific pause punctuation
545
  elif language == 'ko':
546
  punctuation = [',', '。', ';', '?', '!'] # Korean-specific pause punctuation
547
+ elif language == 'ar':
548
+ punctuation = ['،', '؛', '؟', '!', '·', '؛', '.'] # Arabic-specific punctuation
549
  elif language == 'en':
550
  punctuation = [',', ';', '.'] # English-specific pause punctuation
551
  else:
552
  # Default pause punctuation for other languages (es, fr, de, it, pt, pl, cs, ru, nl, tr, hu)
553
  punctuation = [',', '.', ';', ':', '?', '!']
554
 
555
+
556
 
557
  parts = []
558
  while len(sentence) > max_length or sum(sentence.count(p) for p in punctuation) > max_pauses: