"""Attach the translated ``caption_multi`` column to the synthetic caption table.

Reads the base table from ``data/ccs_synthetic_multi.feather``, replaces its
``caption_multi`` column with the values found in the feather files under
``data_translated`` (matched on the ``index`` column), keeps a fixed set of
columns, and writes the result to ``ccs_synthetic.feather``.
"""
import os

import pandas as pd

TRANSLATED_DIR = "data_translated"
HUGGINGFACE_PREFIX = "https://huggingface.co/"

# os.listdir returns entries in arbitrary order; sort so the concatenation
# order of the translated frames is the same on every run.
filenames = sorted(os.listdir(TRANSLATED_DIR))

df = pd.read_feather("data/ccs_synthetic_multi.feather")

# Stack every translated file into one frame with a fresh 0..n-1 row index.
df_list = [
    pd.read_feather(os.path.join(TRANSLATED_DIR, filename))
    for filename in filenames
]
df_multi = pd.concat(df_list, ignore_index=True)

# Discard the base table's caption_multi and take it from the translated
# files instead, matching rows on the "index" column. A left join keeps every
# row of the base table even when no translated row matches.
df = df.drop(columns="caption_multi")
df = df.merge(df_multi[["caption_multi", "index"]], how="left", on="index")

# Keep only the output columns, in this order.
df = df[
    [
        "caption",
        "caption_sv",
        "caption_multi",
        "url",
        "multi_language_code",
        "multi_language_name",
        "multi_target",
        "target_code",
        "opus_mt_url",
        "index",
    ]
]
df = df.rename(columns={"multi_target": "multiple_target_model"})

# Strip the site prefix from opus_mt_url. regex=False makes the prefix a
# literal string on every pandas version: before pandas 2.0 the default was
# regex=True, under which each "." in the prefix would match any character.
df["opus_mt_url"] = df["opus_mt_url"].str.replace(
    HUGGINGFACE_PREFIX, "", regex=False
)

df.to_feather("ccs_synthetic.feather")