Spaces:

doropiza
/

chatbot

Sleeping

App Files Files Community

doropiza committed on May 25

Commit

d4bc91d

1 Parent(s): f05af65

c

Browse files

Files changed (1) hide show

app.py +120 -31

app.py CHANGED Viewed

@@ -27,13 +27,24 @@ class ChatBot:
         self.current_model_name = "gemma-2b-it"
         self.model_path = self.available_models[self.current_model_name]
-        # 量子化設定
-        self.quantization_config = BitsAndBytesConfig(
-            load_in_4bit=True,
-            bnb_4bit_quant_type="nf4",
-            bnb_4bit_use_double_quant=True,
-            bnb_4bit_compute_dtype=torch.float16
-        )
         # モデル初期化
         self.tokenizer = None
@@ -56,49 +67,101 @@ class ChatBot:
             "repetition_penalty": 1.2
         }
     def load_model(self):
-        """モデルの読み込み"""
         try:
             logger.info(f"モデル {self.model_path} を読み込み中...")
             # 既存モデルのメモリ解放
-            if hasattr(self, 'model') and self.model is not None:
-                del self.model
-            if hasattr(self, 'tokenizer') and self.tokenizer is not None:
-                del self.tokenizer
-            # ガベージコレクション実行
-            gc.collect()
-            if torch.cuda.is_available():
-                torch.cuda.empty_cache()
             self.tokenizer = AutoTokenizer.from_pretrained(
                 self.model_path,
                 token=HUGGINGFACE_TOKEN,
                 trust_remote_code=True
             )
             self.model = AutoModelForCausalLM.from_pretrained(
                 self.model_path,
-                token=HUGGINGFACE_TOKEN,
-                quantization_config=self.quantization_config,
-                device_map="auto",
-                torch_dtype=torch.float16,
-                trust_remote_code=True
             )
-            # パディングトークン設定
-            if self.tokenizer.pad_token is None:
-                self.tokenizer.pad_token = self.tokenizer.eos_token
             self.model_loaded = True
             logger.info(f"モデル {self.model_path} の読み込み完了")
         except Exception as e:
             logger.error(f"モデル読み込みエラー: {e}")
             self.model_loaded = False
             self.tokenizer = None
             self.model = None
     def switch_model(self, model_name):
         """モデルの切り替え"""
@@ -161,7 +224,7 @@ class ChatBot:
     @spaces.GPU(duration=45)
     def generate_response(self, message, conversation_history=None):
-        """応答生成（GPU使用）"""
         if not self.model_loaded:
             return "申し訳ありませんが、現在AIモデルが利用できません。モデルを読み込み直してください。"
@@ -172,14 +235,21 @@ class ChatBot:
             # プロンプト作成
             prompt = self.create_prompt(message, conversation_history)
-            # トークン化
             inputs = self.tokenizer.encode(
                 prompt,
                 return_tensors='pt',
                 max_length=1024,
-                truncation=True
             )
             # 生成パラメータを動的に調整
             generation_kwargs = {
                 "inputs": inputs,
@@ -190,12 +260,24 @@ class ChatBot:
                 "repetition_penalty": self.generation_config["repetition_penalty"],
                 "pad_token_id": self.tokenizer.pad_token_id,
                 "eos_token_id": self.tokenizer.eos_token_id,
-                "use_cache": True
             }
-            # 生成
             with torch.no_grad():
-                outputs = self.model.generate(**generation_kwargs)
             # デコード
             response = self.tokenizer.decode(
@@ -215,6 +297,13 @@ class ChatBot:
         except Exception as e:
             logger.error(f"応答生成エラー: {e}")
             return f"エラーが発生しました: {str(e)}"
     def get_conversation(self, session_id="default"):

         self.current_model_name = "gemma-2b-it"
         self.model_path = self.available_models[self.current_model_name]
+        # デバイス設定
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        logger.info(f"使用デバイス: {self.device}")
+        # 量子化設定（改良版）
+        if torch.cuda.is_available():
+            self.quantization_config = BitsAndBytesConfig(
+                load_in_4bit=True,
+                bnb_4bit_quant_type="nf4",
+                bnb_4bit_use_double_quant=True,
+                bnb_4bit_compute_dtype=torch.float16,
+                bnb_4bit_quant_storage=torch.uint8,  # 追加
+                llm_int8_enable_fp32_cpu_offload=False  # 追加
+            )
+        else:
+            # CPU使用時は量子化を無効化
+            self.quantization_config = None
+            logger.info("CPUモードのため量子化を無効化")
         # モデル初期化
         self.tokenizer = None
             "repetition_penalty": 1.2
         }
+    def cleanup_memory(self):
+        """メモリクリーンアップ"""
+        if hasattr(self, 'model') and self.model is not None:
+            del self.model
+        if hasattr(self, 'tokenizer') and self.tokenizer is not None:
+            del self.tokenizer
+        # ガベージコレクション実行
+        gc.collect()
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+            # 追加: CUDA同期
+            torch.cuda.synchronize()
     def load_model(self):
+        """モデルの読み込み（改良版）"""
         try:
             logger.info(f"モデル {self.model_path} を読み込み中...")
             # 既存モデルのメモリ解放
+            self.cleanup_memory()
+            # トークナイザー読み込み
             self.tokenizer = AutoTokenizer.from_pretrained(
                 self.model_path,
                 token=HUGGINGFACE_TOKEN,
                 trust_remote_code=True
             )
+            # パディングトークン設定（事前に）
+            if self.tokenizer.pad_token is None:
+                self.tokenizer.pad_token = self.tokenizer.eos_token
+            # モデル読み込み設定
+            model_kwargs = {
+                "token": HUGGINGFACE_TOKEN,
+                "trust_remote_code": True,
+                "torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32,
+                "low_cpu_mem_usage": True,  # 追加
+                "use_flash_attention_2": False  # 安定性のため無効化
+            }
+            # 量子化設定の適用
+            if self.quantization_config is not None:
+                model_kwargs["quantization_config"] = self.quantization_config
+                model_kwargs["device_map"] = "auto"
+            else:
+                # CPU使用時
+                model_kwargs["torch_dtype"] = torch.float32
             self.model = AutoModelForCausalLM.from_pretrained(
                 self.model_path,
+                **model_kwargs
             )
+            # CPU使用時の明示的なデバイス移動
+            if not torch.cuda.is_available():
+                self.model = self.model.to(self.device)
+            # 量子化モデルの場合、明示的にCUDAに移動
+            elif self.quantization_config is not None:
+                try:
+                    # 量子化レイヤーの初期化
+                    if hasattr(self.model, 'cuda'):
+                        self.model.cuda()
+                    # 各レイヤーを確実にGPUに移動
+                    for name, module in self.model.named_modules():
+                        if hasattr(module, 'cuda') and not next(module.parameters(), torch.tensor(0)).is_cuda:
+                            try:
+                                module.cuda()
+                            except Exception as layer_e:
+                                logger.warning(f"レイヤー {name} のCUDA移動に失敗: {layer_e}")
+                except Exception as cuda_e:
+                    logger.warning(f"CUDA移動エラー: {cuda_e}")
+            # モデルを評価モードに設定
+            self.model.eval()
             self.model_loaded = True
             logger.info(f"モデル {self.model_path} の読み込み完了")
+            # メモリ使用量ログ
+            if torch.cuda.is_available():
+                memory_allocated = torch.cuda.memory_allocated() / 1024**3
+                logger.info(f"GPU メモリ使用量: {memory_allocated:.2f} GB")
         except Exception as e:
             logger.error(f"モデル読み込みエラー: {e}")
             self.model_loaded = False
             self.tokenizer = None
             self.model = None
+            # エラー時のメモリクリーンアップ
+            self.cleanup_memory()
     def switch_model(self, model_name):
         """モデルの切り替え"""
     @spaces.GPU(duration=45)
     def generate_response(self, message, conversation_history=None):
+        """応答生成（GPU使用・改良版）"""
         if not self.model_loaded:
             return "申し訳ありませんが、現在AIモデルが利用できません。モデルを読み込み直してください。"
             # プロンプト作成
             prompt = self.create_prompt(message, conversation_history)
+            # トークン化（デバイス指定を明示）
             inputs = self.tokenizer.encode(
                 prompt,
                 return_tensors='pt',
                 max_length=1024,
+                truncation=True,
+                padding=True  # 追加
             )
+            # 入力をモデルと同じデバイスに移動
+            if torch.cuda.is_available() and self.model.device.type == 'cuda':
+                inputs = inputs.to(self.model.device)
+            elif not torch.cuda.is_available():
+                inputs = inputs.to(self.device)
             # 生成パラメータを動的に調整
             generation_kwargs = {
                 "inputs": inputs,
                 "repetition_penalty": self.generation_config["repetition_penalty"],
                 "pad_token_id": self.tokenizer.pad_token_id,
                 "eos_token_id": self.tokenizer.eos_token_id,
+                "use_cache": True,
+                "attention_mask": torch.ones_like(inputs)  # 追加
             }
+            # 生成実行
             with torch.no_grad():
+                try:
+                    outputs = self.model.generate(**generation_kwargs)
+                except RuntimeError as runtime_error:
+                    if "FP4 quantization state not initialized" in str(runtime_error):
+                        logger.warning("量子化エラーを検出、モデルを再初期化します...")
+                        self.load_model()
+                        if self.model_loaded:
+                            outputs = self.model.generate(**generation_kwargs)
+                        else:
+                            raise runtime_error
+                    else:
+                        raise runtime_error
             # デコード
             response = self.tokenizer.decode(
         except Exception as e:
             logger.error(f"応答生成エラー: {e}")
+            # 特定のエラーに対するリトライ機構
+            if "FP4 quantization" in str(e) or "not initialized" in str(e):
+                logger.info("量子化エラーによるモデル再読み込みを実行...")
+                self.load_model()
+                if self.model_loaded:
+                    return "モデルを再読み込みしました。もう一度お試しください。"
             return f"エラーが発生しました: {str(e)}"
     def get_conversation(self, session_id="default"):