Spaces:

TheWeeeed
/

chinese-qa-demo

Running

App Files Community

TheWeeeed committed 27 days ago

Commit

4c26b67

verified ·

1 Parent(s): 2bb107e

Update app.py

Browse files

Files changed (1) hide show

app.py +48 -28

app.py CHANGED Viewed

@@ -215,48 +215,68 @@ def prepare_features_for_qa_inference(examples, tokenizer, pad_on_right, max_seq
     final_batch = {}
     if not processed_features:
-        logger.warning(f"No features generated for example IDs: {examples.get('id', ['N/A'])}. Returning empty structure.")
-        # 確保返回的結構與 .map 期望的一致，即字典的鍵是列名，值是空列表
         for key_to_ensure in ['input_ids', 'attention_mask', 'token_type_ids', 'example_id', 'offset_mapping']:
             final_batch[key_to_ensure] = []
         return final_batch
-    # 1. 首先，將 processed_features (list of dicts) 轉換為 final_batch (dict of lists)
     for key in processed_features[0].keys(): # 假設所有特徵字典有相同的鍵
         final_batch[key] = [feature[key] for feature in processed_features]
-    # 2. 然後，對 final_batch 中需要轉換為張量的字段進行健壯性檢查和修正
     keys_to_fix_for_tensor_conversion = ["input_ids", "attention_mask", "token_type_ids"]
     pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0
-    cls_token_id = tokenizer.cls_token_id if tokenizer.cls_token_id is not None else 101
     sep_token_id = tokenizer.sep_token_id if tokenizer.sep_token_id is not None else 102
-    for key in keys_to_fix_for_tensor_conversion:
-        if key in final_batch:
-            # final_batch[key] 是一個列表的列表，例如 [[ids_for_feature1], [ids_for_feature2], ...]
-            corrected_list_of_lists = []
-            for i, single_feature_list in enumerate(final_batch[key]):
-                if single_feature_list is None:
-                    logger.warning(f"Feature list for '{key}' at index {i} is None. Replacing with default for max_seq_len {max_seq_len}.")
-                    if key == "input_ids":
                         default_seq = [cls_token_id, sep_token_id] + [pad_token_id] * (max_seq_len - 2)
-                        corrected_list_of_lists.append(default_seq[:max_seq_len])
-                    elif key == "attention_mask":
                         default_mask = [1, 1] + [0] * (max_seq_len - 2)
-                        corrected_list_of_lists.append(default_mask[:max_seq_len])
-                    elif key == "token_type_ids":
-                        corrected_list_of_lists.append([0] * max_seq_len)
-                elif not all(isinstance(x, int) for x in single_feature_list):
-                    logger.warning(f"Feature list for '{key}' at index {i} contains non-integers: {str(single_feature_list)[:50]}... Fixing Nones.")
-                    default_val = pad_token_id if key == "input_ids" else 0
-                    fixed_list = [default_val if not isinstance(x, int) else x for x in single_feature_list]
-                    corrected_list_of_lists.append(fixed_list)
                 else:
-                    corrected_list_of_lists.append(single_feature_list) # List is already good
-            final_batch[key] = corrected_list_of_lists
-    # 在返回前，可以再加一層打印，確認修正後的 final_batch 結構
-    # logger.debug(f"Returning final_batch from prepare_features: { {k: str(v)[:200] + '...' for k,v in final_batch.items()} }")
     return final_batch
 # postprocess_qa_predictions 函數也需要從 utils_qa.py 複製或導入

     final_batch = {}
     if not processed_features:
+        logger.warning(f"在 prepare_features_for_qa_inference 中，由於 tokenizer 沒有為 ID {examples.get('id', ['N/A'])[0]} 生成任何有效特徵 (processed_features 為空), 將返回空的特徵結構。")
+        # 確保所有期望的鍵都存在，並且值是空列表，以匹配 .map 的期望輸出結構
         for key_to_ensure in ['input_ids', 'attention_mask', 'token_type_ids', 'example_id', 'offset_mapping']:
             final_batch[key_to_ensure] = []
         return final_batch
+    # 1. 將 processed_features (list of dicts) 轉換為 final_batch (dict of lists)
     for key in processed_features[0].keys(): # 假設所有特徵字典有相同的鍵
         final_batch[key] = [feature[key] for feature in processed_features]
+    # 2. 對 final_batch 中需要轉換為張量的字段進行健壯性檢查和修正
     keys_to_fix_for_tensor_conversion = ["input_ids", "attention_mask", "token_type_ids"]
     pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0
+    cls_token_id = tokenizer.cls_token_id if tokenizer.cls_token_id is not None else 101
     sep_token_id = tokenizer.sep_token_id if tokenizer.sep_token_id is not None else 102
+    for key_to_fix in keys_to_fix_for_tensor_conversion:
+        if key_to_fix in final_batch:
+            # final_batch[key_to_fix] 應該是一個列表的列表，例如 [[ids_for_feature1], [ids_for_feature2], ...]
+            list_of_feature_sequences = final_batch[key_to_fix]
+            corrected_list_of_feature_sequences = []
+            for i, single_feature_sequence in enumerate(list_of_feature_sequences):
+                current_example_id = final_batch.get("example_id", [f"unknown_example_index_{i}"]*len(list_of_feature_sequences) )[i]
+                if single_feature_sequence is None:
+                    logger.warning(f"對於樣本 {current_example_id} 的特徵 {i}, 字段 '{key_to_fix}' 的整個序列是 None。將用默認安全序列替換。")
+                    if key_to_fix == "input_ids":
                         default_seq = [cls_token_id, sep_token_id] + [pad_token_id] * (max_seq_len - 2)
+                        corrected_list_of_feature_sequences.append(default_seq[:max_seq_len])
+                    elif key_to_fix == "attention_mask":
                         default_mask = [1, 1] + [0] * (max_seq_len - 2)
+                        corrected_list_of_feature_sequences.append(default_mask[:max_seq_len])
+                    elif key_to_fix == "token_type_ids":
+                        corrected_list_of_feature_sequences.append([0] * max_seq_len)
+                    else: # 不應該發生，因為我們只檢查這三個鍵
+                        corrected_list_of_feature_sequences.append([0] * max_seq_len) # 一個備用安全值
+                elif not all(isinstance(x, int) for x in single_feature_sequence):
+                    logger.warning(f"對於樣本 {current_example_id} 的特徵 {i}, 字段 '{key_to_fix}' 列表內部包含非整數值: {str(single_feature_sequence)[:50]}... 將嘗試修正 None 值。")
+                    default_val_for_element = pad_token_id if key_to_fix == "input_ids" else 0
+                    fixed_sequence = []
+                    for x_val in single_feature_sequence:
+                        if x_val is None: # 如果列表中的某個元素是 None
+                            fixed_sequence.append(default_val_for_element)
+                        elif not isinstance(x_val, int): # 如果不是整數也不是 None (異常情況)
+                            logger.error(f"嚴重錯誤：在 {key_to_fix} 中發現了既不是 int 也不是 None 的值: {x_val} (類型: {type(x_val)})。用默認值替換。")
+                            fixed_sequence.append(default_val_for_element)
+                        else:
+                            fixed_sequence.append(x_val)
+                    corrected_list_of_feature_sequences.append(fixed_sequence)
                 else:
+                    corrected_list_of_feature_sequences.append(single_feature_sequence) # 列表本身是好的
+            final_batch[key_to_fix] = corrected_list_of_feature_sequences
+    # (可選) 添加最終調試打印，檢查修正後的 final_batch
+    logger.info(f"DEBUG: Final batch being returned by prepare_features_for_qa_inference for example {examples.get('id', ['N/A'])[0]}:")
+    for key_to_log in ["input_ids", "attention_mask", "token_type_ids"]:
+        if key_to_log in final_batch:
+            logger.info(f"  {key_to_log}: {str(final_batch[key_to_log])[:200]}...") # 打印部分內容
     return final_batch
 # postprocess_qa_predictions 函數也需要從 utils_qa.py 複製或導入