|
import logging |
|
import re |
|
|
|
|
|
import jieba |
|
|
|
# Raise jieba's log threshold to CRITICAL so lower-severity messages are not emitted.
jieba.setLogLevel(logging.CRITICAL)

from pathlib import Path

import fast_langdetect

# Replace fast_langdetect's module-level default detector with one whose
# cache directory is "pretrained_models/fast_langdetect", resolved two levels
# above the directory that contains this file.
_langdetect_cache_dir = Path(__file__).parent.parent.parent / "pretrained_models" / "fast_langdetect"
_langdetect_config = fast_langdetect.infer.LangDetectConfig(cache_dir=_langdetect_cache_dir)
fast_langdetect.infer._default_detector = fast_langdetect.infer.LangDetector(_langdetect_config)
|
|
|
|
|
from split_lang import LangSplitter |
|
|
|
|
|
def full_en(text):
    """Tell whether *text* should be treated as purely English.

    The text qualifies when it contains at least one ASCII letter and every
    character is an ASCII letter/digit, whitespace, printable ASCII, or falls
    in U+2000-U+206F, U+3000-U+303F or U+FF00-U+FFEF.

    Returns:
        True when the whole text matches, False otherwise (including "").
    """
    english_only = r"^(?=.*[A-Za-z])[A-Za-z0-9\s\u0020-\u007E\u2000-\u206F\u3000-\u303F\uFF00-\uFFEF]+$"
    hit = re.match(english_only, text)
    return hit is not None
|
|
|
|
|
def full_cjk(text):
    """Return *text* reduced to CJK ideographs plus a few neutral characters.

    A character is kept when it lies in one of the CJK Unified Ideographs
    ranges below, or when it is an ASCII digit, a character in U+3001-U+301C,
    one of ``! ? . …``, a space or ``/``. Everything else is dropped.

    Args:
        text: The string to filter.

    Returns:
        The filtered string; "" when nothing qualifies.
    """
    # Inclusive (first, last) code points. Extension A and B use the full
    # block ends (U+4DBF, U+2A6DF) so ideographs added to those blocks in
    # later Unicode versions are not silently discarded.
    cjk_ranges = (
        (0x4E00, 0x9FFF),
        (0x3400, 0x4DBF),
        (0x20000, 0x2A6DF),
        (0x2A700, 0x2B73F),
        (0x2B740, 0x2B81F),
        (0x2B820, 0x2CEAF),
        (0x2CEB0, 0x2EBEF),
        (0x30000, 0x3134A),
        (0x31350, 0x323AF),
        (0x2EBF0, 0x2EE5D),
    )

    # Compiled once per call rather than looked up for every character.
    neutral = re.compile(r"[0-9、-〜。!?.!?… /]+$")

    def keep(char):
        code_point = ord(char)
        if any(start <= code_point <= end for start, end in cjk_ranges):
            return True
        return neutral.match(char) is not None

    # join() avoids repeated string concatenation inside the loop.
    return "".join(char for char in text if keep(char))
|
|
|
|
|
def split_jako(tag_lang, item):
    """Carve kana or hangul runs out of a segment.

    Args:
        tag_lang: "ja" selects the hiragana/katakana pattern; any other value
            selects the hangul pattern. Matched runs are labelled with it.
        item: Dict with "lang" and "text" keys describing the segment.

    Returns:
        A list of {"lang", "text"} dicts in source order: matched runs carry
        ``tag_lang``, the text between them keeps ``item["lang"]``. Empty
        text yields an empty list.
    """
    # A run is one or more script characters, optionally continued through
    # digits/punctuation/spaces into further script characters.
    if tag_lang == "ja":
        script_run = r"([\u3041-\u3096\u3099\u309A\u30A1-\u30FA\u30FC]+(?:[0-9、-〜。!?.!?… ]+[\u3041-\u3096\u3099\u309A\u30A1-\u30FA\u30FC]*)*)"
    else:
        script_run = r"([\u1100-\u11FF\u3130-\u318F\uAC00-\uD7AF]+(?:[0-9、-〜。!?.!?… ]+[\u1100-\u11FF\u3130-\u318F\uAC00-\uD7AF]*)*)"

    pieces: list[dict] = []
    cursor = 0  # end offset of the previous match
    for found in re.finditer(script_run, item["text"]):
        begin, stop = found.span()
        if cursor < begin:
            # Text between two runs stays under the segment's own label.
            pieces.append({"lang": item["lang"], "text": item["text"][cursor:begin]})
        cursor = stop
        pieces.append({"lang": tag_lang, "text": found.group(0)})

    total = len(item["text"])
    if cursor < total:
        pieces.append({"lang": item["lang"], "text": item["text"][cursor:total]})

    return pieces
|
|
|
|
|
def merge_lang(lang_list, item):
    """Add *item* to *lang_list*, fusing it with the tail when labels match.

    When the last entry has the same "lang" as *item*, the item's text is
    appended to that entry in place; otherwise *item* itself is appended.

    Returns:
        The same ``lang_list`` object, mutated.
    """
    tail = lang_list[-1] if lang_list else None
    if tail is None or tail["lang"] != item["lang"]:
        lang_list.append(item)
        return lang_list

    tail["text"] += item["text"]
    return lang_list
|
|
|
|
|
class LangSegmenter:
    """Split mixed-language text into consecutive single-language segments."""

    # Passed to LangSplitter as its ``lang_map``. "zh-tw" maps to the
    # placeholder "x", which getTexts() resolves itself (to "zh" when the text
    # contains CJK ideographs, otherwise to a neighbouring segment's label).
    DEFAULT_LANG_MAP = {
        "zh": "zh",
        "yue": "zh",
        "wuu": "zh",
        "zh-cn": "zh",
        "zh-tw": "x",
        "ko": "ko",
        "ja": "ja",
        "en": "en",
    }

    @staticmethod
    def _relabel_unknown(piece):
        """Turn an "x" piece into a "zh" piece when full_cjk() keeps any text."""
        if piece["lang"] == "x":
            cjk_text = full_cjk(piece["text"])
            if cjk_text:
                # Only the characters full_cjk() keeps survive the relabel.
                return {"lang": "zh", "text": cjk_text}
        return piece

    @staticmethod
    def getTexts(text):
        """Segment *text* by language.

        Steps, per segment produced by ``LangSplitter.split_by_lang``:

        1. Segments accepted by ``full_en`` are labelled "en".
        2. Kana runs are split off as "ja" and hangul runs as "ko".
        3. Pieces labelled "x" become "zh" when ``full_cjk`` keeps any text.

        Adjacent pieces with the same label are merged. Any piece still
        labelled "x" afterwards takes the label of the preceding segment, or
        of the following one when it is first, or "zh" when it stands alone.

        Args:
            text: The string to segment.

        Returns:
            A list of ``{"lang": ..., "text": ...}`` dicts in text order.
        """
        lang_splitter = LangSplitter(lang_map=LangSegmenter.DEFAULT_LANG_MAP)
        substr = lang_splitter.split_by_lang(text=text)

        lang_list: list[dict] = []

        for item in substr:
            dict_item = {"lang": item.lang, "text": item.text}

            # Pure-English segments need no further splitting.
            if full_en(dict_item["text"]):
                dict_item["lang"] = "en"
                lang_list = merge_lang(lang_list, dict_item)
                continue

            # Pull Japanese kana runs out of segments not already tagged "ja".
            ja_list: list[dict] = []
            if dict_item["lang"] != "ja":
                ja_list = split_jako("ja", dict_item)
            if not ja_list:
                ja_list.append(dict_item)

            # Pull hangul runs out of every piece not already tagged "ko".
            temp_list: list[dict] = []
            for ko_item in ja_list:
                # Reset per piece: a stale list from the previous iteration
                # would otherwise be emitted again in place of a "ko" piece.
                ko_list: list[dict] = []
                if ko_item["lang"] != "ko":
                    ko_list = split_jako("ko", ko_item)

                if ko_list:
                    temp_list.extend(ko_list)
                else:
                    temp_list.append(ko_item)

            if len(temp_list) == 1:
                # Nothing was split off: keep the segment as detected.
                pieces = [dict_item]
            else:
                pieces = temp_list

            # Each piece is relabelled from its OWN text, so an "x" piece can
            # no longer re-emit the text of the whole segment.
            for piece in pieces:
                lang_list = merge_lang(lang_list, LangSegmenter._relabel_unknown(piece))

        # Second pass: resolve the remaining "x" placeholders from context.
        temp_list = lang_list
        lang_list = []
        for temp_item in temp_list:
            if temp_item["lang"] == "x":
                if lang_list:
                    temp_item["lang"] = lang_list[-1]["lang"]
                elif len(temp_list) > 1:
                    temp_item["lang"] = temp_list[1]["lang"]
                else:
                    temp_item["lang"] = "zh"

            lang_list = merge_lang(lang_list, temp_item)

        return lang_list
|
|
|
|
|
if __name__ == "__main__":
    # Manual smoke run: print the segmentation of two sample sentences.
    samples = (
        "MyGO?,你也喜欢まいご吗?",
        "ねえ、知ってる?最近、僕は天文学を勉強してるんだ。君の瞳が星空みたいにキラキラしてるからさ。",
    )
    for text in samples:
        print(LangSegmenter.getTexts(text))
|
|