from utils.common.data_record import read_json, write_json

import requests
import random
import hashlib
import tqdm
import time

# Reuse one HTTP connection for all API calls instead of reconnecting per request.
session = requests.Session()
def translate(sentence):
    # Translate one English sentence to Chinese via the Baidu Fanyi API.
    app_id = '20221004001369410'
    salt = str(random.randint(1000000000, 9999999999))
    key = 'XEsBS6babmp9wz5bcoEs'
    # Baidu's request signature: MD5 over appid + query + salt + secret key.
    sign = hashlib.md5(f'{app_id}{sentence}{salt}{key}'.encode('utf8')).hexdigest()
    response = session.get(
        'https://fanyi-api.baidu.com/api/trans/vip/translate',
        params={
            'q': sentence,
            'from': 'en',
            'to': 'zh',
            'appid': app_id,
            'salt': salt,
            'sign': sign
        }
    ).json()
    if 'trans_result' not in response:
        # The API returned an error payload instead of a translation; surface it.
        print(response)
        raise RuntimeError(f'Baidu translate API error: {response}')
    return response['trans_result'][0]['src'], response['trans_result'][0]['dst']
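
# A minimal retry sketch, not part of the original pipeline: translate()
# raises RuntimeError whenever the API returns an error payload (e.g. when
# requests come too fast), so a caller may want to back off and retry.
# On success, the response is shaped roughly like
#     {'from': 'en', 'to': 'zh', 'trans_result': [{'src': ..., 'dst': ...}]}.
def translate_with_retry(sentence, max_retries=3, backoff_secs=3.0):
    # Hypothetical helper (name and parameters are assumptions); unused below.
    for attempt in range(max_retries):
        try:
            return translate(sentence)
        except RuntimeError:
            if attempt == max_retries - 1:
                raise  # give up after the final attempt
            time.sleep(backoff_secs * (attempt + 1))  # linear backoff between tries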
def gen_label_from_sen_cls_json(sen_cls_json_path):
    # Generate a Chinese translation for every unique sentence in a
    # sentence-classification JSON file.
    texts = []
    anns = read_json(sen_cls_json_path)
    for v in anns.values():
        texts += [v['sentence']]
        assert '\n' not in texts[-1]  # the API is queried one sentence at a time
    texts = list(set(texts))  # deduplicate so each sentence is translated only once

    res_json = []
    for text in tqdm.tqdm(texts):
        time.sleep(1.2)  # throttle requests (the API rate-limits frequent calls)
        src_text, dst_text = translate(text)
        res_json += [{
            'src': src_text,
            'dst': dst_text
        }]
    write_json(sen_cls_json_path + '.translate_data', res_json, backup=False)
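
# A hedged sketch of the data layout this function assumes, inferred from the
# access pattern above rather than from a schema shipped with this repo:
# the input JSON is a dict whose values each carry a 'sentence' field,
#     {"0": {"sentence": "The battery life is great.", ...}, ...}
# and the '<input path>.translate_data' output is a list of src/dst pairs,
#     [{"src": "The battery life is great.", "dst": "<Chinese translation>"}, ...]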
if __name__ == '__main__':
    # res = translate('I am a doctor.\nHello world!')
    # print(res)

    import os

    # Map each of the 19 ASC domain datasets to its directory on disk.
    data_dir_paths = {
        **{k: f'/data/zql/datasets/nlp_asc_19_domains/dat/absa/Bing5Domains/asc/{k.split("-")[1]}'
           for k in ['HL5Domains-ApexAD2600Progressive', 'HL5Domains-CanonG3',
                     'HL5Domains-CreativeLabsNomadJukeboxZenXtra40GB',
                     'HL5Domains-NikonCoolpix4300', 'HL5Domains-Nokia6610']},
        **{k: f'/data/zql/datasets/nlp_asc_19_domains/dat/absa/Bing3Domains/asc/{k.split("-")[1]}'
           for k in ['Liu3Domains-Computer', 'Liu3Domains-Router', 'Liu3Domains-Speaker']},
        **{k: f'/data/zql/datasets/nlp_asc_19_domains/dat/absa/Bing9Domains/asc/{k.split("-")[1]}'
           for k in [f'Ding9Domains-{d}'
                     for d in os.listdir('/data/zql/datasets/nlp_asc_19_domains/dat/absa/Bing9Domains/asc')]},
        **{f'SemEval-{k[0].upper()}{k[1:]}': f'/data/zql/datasets/nlp_asc_19_domains/dat/absa/XuSemEval/asc/14/{k}'
           for k in ['laptop', 'rest']},
    }

    # Gather the train/dev/test split files of every dataset.
    json_paths = []
    for p in data_dir_paths.values():
        json_paths += [os.path.join(p, f'{split}.json') for split in ['train', 'dev', 'test']]
    assert all([os.path.exists(p) for p in json_paths])

    # print(len(json_paths))
    # exit()

    # [23:] presumably resumes after an earlier, partially completed run.
    for p in tqdm.tqdm(json_paths[23:]):
        print(p)
        gen_label_from_sen_cls_json(p)
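
    # A hedged post-run sanity check (assumes read_json can load what
    # write_json wrote above); left commented out to keep behavior unchanged:
    # for p in json_paths:
    #     print(p, len(read_json(p + '.translate_data')))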