doropiza committed on
Commit c86f81a · 1 Parent(s): a038aaa
Files changed (1)
  1. app.py +120 -84
app.py CHANGED
@@ -1,96 +1,132 @@
- import gradio as gr
- import torch
- import os
- from transformers import AutoTokenizer, AutoModelForCausalLM

- # Get the access token from the environment variable
- HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
- if not HUGGINGFACE_TOKEN:
-     raise ValueError("HUGGINGFACE_TOKEN environment variable is not set")


- def _load_model():
-     if not torch.cuda.is_available():
-         raise RuntimeError("GPU is not available but required.")
-     print("GPU is available and model will be loaded.")
-     return "GPU ready"

- _load_model()

- # Initialize the model and tokenizer
- MODEL_NAME = "google/gemma-7b-it"
- tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=HUGGINGFACE_TOKEN)
- model = AutoModelForCausalLM.from_pretrained(
-     MODEL_NAME,
-     torch_dtype=torch.float16,
-     device_map="auto",
-     token=HUGGINGFACE_TOKEN
- )
-
- def generate_response(prompt):
-     # Prepare the prompt
-     inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

-     # Generate the response
-     with torch.no_grad():
-         outputs = model.generate(
-             **inputs,
-             max_new_tokens=512,
-             temperature=0.7,
-             top_p=0.9,
-             do_sample=True,
-             pad_token_id=tokenizer.eos_token_id
-         )

-     # Decode the response
-     response = tokenizer.decode(outputs[0], skip_special_tokens=True)
-     return response

- def respond(message, history):
-     # Build the chat history
-     chat_history = ""
-     for msg in history:
-         chat_history += f"{msg['role']}: {msg['content']}\n"

-     # Append the current message
-     prompt = f"{chat_history}Human: {message}\nAssistant:"

-     try:
-         response = generate_response(prompt)
-         # Remove the extra parts from the response
-         response = response.split("Assistant:")[-1].strip()
-         return response
-     except Exception as e:
-         return f"エラーが発生しました: {str(e)}"
-
- # Configure the Gradio interface
- iface = gr.ChatInterface(
-     fn=respond,
-     textbox=gr.Textbox(
-         placeholder="メッセージを入力してください...",
-         container=False,
-         scale=7,
-         lines=2
-     ),
-     chatbot=gr.Chatbot(
-         height=600,
-         show_copy_button=True,
-         show_share_button=True,
-         avatar_images=(None, None)
-     ),
-     title="Gemma Chat Assistant",
-     description="Google Gemmaモデルを使用したチャットアシスタントです。",
-     theme=gr.themes.Soft(),
-     examples=[
-         "こんにちは",
-         "自己紹介をしてください",
-         "Pythonについて教えてください"
-     ]
- )

  if __name__ == "__main__":
-     iface.launch(
-         share=True,
-         server_name="0.0.0.0",
-         server_port=7860
-     )
+ # import gradio as gr
+ # import torch
+ # import os
+ # from transformers import AutoTokenizer, AutoModelForCausalLM

+ # # Get the access token from the environment variable
+ # HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
+ # if not HUGGINGFACE_TOKEN:
+ #     raise ValueError("HUGGINGFACE_TOKEN environment variable is not set")


+ # def _load_model():
+ #     if not torch.cuda.is_available():
+ #         raise RuntimeError("GPU is not available but required.")
+ #     print("GPU is available and model will be loaded.")
+ #     return "GPU ready"

+ # _load_model()

+ # # Initialize the model and tokenizer
+ # MODEL_NAME = "google/gemma-7b-it"
+ # tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=HUGGINGFACE_TOKEN)
+ # model = AutoModelForCausalLM.from_pretrained(
+ #     MODEL_NAME,
+ #     torch_dtype=torch.float16,
+ #     device_map="auto",
+ #     token=HUGGINGFACE_TOKEN
+ # )
+
+ # def generate_response(prompt):
+ #     # Prepare the prompt
+ #     inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

+ #     # Generate the response
+ #     with torch.no_grad():
+ #         outputs = model.generate(
+ #             **inputs,
+ #             max_new_tokens=512,
+ #             temperature=0.7,
+ #             top_p=0.9,
+ #             do_sample=True,
+ #             pad_token_id=tokenizer.eos_token_id
+ #         )

+ #     # Decode the response
+ #     response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+ #     return response

+ # def respond(message, history):
+ #     # Build the chat history
+ #     chat_history = ""
+ #     for msg in history:
+ #         chat_history += f"{msg['role']}: {msg['content']}\n"

+ #     # Append the current message
+ #     prompt = f"{chat_history}Human: {message}\nAssistant:"

+ #     try:
+ #         response = generate_response(prompt)
+ #         # Remove the extra parts from the response
+ #         response = response.split("Assistant:")[-1].strip()
+ #         return response
+ #     except Exception as e:
+ #         return f"エラーが発生しました: {str(e)}"
+
+ # # Configure the Gradio interface
+ # iface = gr.ChatInterface(
+ #     fn=respond,
+ #     textbox=gr.Textbox(
+ #         placeholder="メッセージを入力してください...",
+ #         container=False,
+ #         scale=7,
+ #         lines=2
+ #     ),
+ #     chatbot=gr.Chatbot(
+ #         height=600,
+ #         show_copy_button=True,
+ #         show_share_button=True,
+ #         avatar_images=(None, None)
+ #     ),
+ #     title="Gemma Chat Assistant",
+ #     description="Google Gemmaモデルを使用したチャットアシスタントです。",
+ #     theme=gr.themes.Soft(),
+ #     examples=[
+ #         "こんにちは",
+ #         "自己紹介をしてください",
+ #         "Pythonについて教えてください"
+ #     ]
+ # )
+
+ # if __name__ == "__main__":
+ #     iface.launch(
+ #         share=True,
+ #         server_name="0.0.0.0",
+ #         server_port=7860
+ #     )
+
+
+ import os, torch, gradio as gr
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+
+ HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
+ MODEL_NAME = "google/gemma-7b-it"
+
+ model, tokenizer = None, None  # ← stays empty at module scope
+
+ def load_model():
+     """Request the GPU and load the model only on the first request."""
+     global model, tokenizer
+     if model is not None:
+         return
+     if not torch.cuda.is_available():
+         # On ZeroGPU this may report False here once and become True again a few seconds later
+         raise RuntimeError("GPU still not attached (ZeroGPU)。数秒後に再試行してください。")
+     tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=HUGGINGFACE_TOKEN)
+     model = AutoModelForCausalLM.from_pretrained(
+         MODEL_NAME,
+         device_map="auto",
+         torch_dtype=torch.float16,
+         token=HUGGINGFACE_TOKEN
+     )
+
+ def respond(message, history):
+     load_model()  # ← the GPU is claimed and the model loaded here, on first use
+     inputs = tokenizer(message, return_tensors="pt").to(model.device)
+     with torch.no_grad():
+         out = model.generate(**inputs, max_new_tokens=512, temperature=0.7, top_p=0.9)
+     return tokenizer.decode(out[0], skip_special_tokens=True)

+ iface = gr.ChatInterface(fn=respond, title="Gemma-ZeroGPU Demo")
  if __name__ == "__main__":
+     iface.launch()
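Note on the approach: the new code defers the GPU check to the first chat request, which still raises if ZeroGPU has not attached a device yet. On ZeroGPU Spaces the more common pattern is to let the spaces package schedule the GPU by decorating the inference function with @spaces.GPU. The sketch below only illustrates that alternative; it is not the committed code, it assumes the spaces package is available in the Space, and it reuses the model name and token handling from the diff above.

# Illustrative sketch (not part of the commit): ZeroGPU-style variant that uses
# the `spaces` decorator instead of polling torch.cuda.is_available() by hand.
# Assumes the `spaces` package is installed (it is preinstalled on ZeroGPU Spaces).
import os
import torch
import spaces
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_NAME = "google/gemma-7b-it"
HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")

# Load the weights once at startup (on CPU); ZeroGPU only attaches a GPU while a
# @spaces.GPU-decorated function is running, so the move to CUDA happens there.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=HUGGINGFACE_TOKEN)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16,
    token=HUGGINGFACE_TOKEN,
)

@spaces.GPU  # requests a GPU for the duration of this call on ZeroGPU
def respond(message, history):
    # CPU fallback is only for local smoke tests; fp16 generation on CPU may be
    # slow or unsupported depending on the torch build.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    inputs = tokenizer(message, return_tensors="pt").to(device)
    with torch.no_grad():
        out = model.generate(
            **inputs,
            max_new_tokens=512,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
        )
    return tokenizer.decode(out[0], skip_special_tokens=True)

iface = gr.ChatInterface(fn=respond, title="Gemma-ZeroGPU Demo")

if __name__ == "__main__":
    iface.launch()

Compared with the load_model() path in the diff, this keeps the weights resident in the Space and only borrows the GPU per request, so the first message should not need to be retried while the GPU attaches.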