Locutusque committed on
Commit 8eb13b2 · verified · 1 Parent(s): 3b00f9a

Update app.py

Files changed (1)
  1. app.py +110 -31
app.py CHANGED
@@ -2,50 +2,121 @@ import spaces
 import gradio as gr
 from transformers import pipeline, AutoTokenizer, TextIteratorStreamer, AutoModelForCausalLM
 import torch
-from threading import Thread
+from threading import Thread, Lock, Event
 import os
+import asyncio
+import time
+from datetime import datetime
+import gc
 
 # Global dictionary to store preloaded models and tokenizers
 LOADED_MODELS = {}
 LOADED_TOKENIZERS = {}
+# Lock for thread-safe model access
+MODEL_LOCK = Lock()
+# Event to signal shutdown
+SHUTDOWN_EVENT = Event()
+
+def clear_memory():
+    """Clear GPU and CPU memory"""
+    torch.cuda.empty_cache()
+    gc.collect()
+
+def load_single_model(model_name):
+    """Load a single model and tokenizer"""
+    try:
+        print(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] Loading {model_name}...")
+
+        # Load model to CPU with bfloat16 to save memory
+        model = AutoModelForCausalLM.from_pretrained(
+            model_name,
+            torch_dtype=torch.bfloat16,
+            trust_remote_code=True,
+            token=os.environ.get("token"),
+        )
+
+        # Load tokenizer
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_name,
+            trust_remote_code=True,
+            token=os.environ.get("token")
+        )
+        tokenizer.eos_token = "<|im_end|>"
+
+        print(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] Successfully loaded {model_name}")
+        return model, tokenizer
+    except Exception as e:
+        print(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] Failed to load {model_name}: {e}")
+        return None, None
 
 def preload_models(model_choices):
     """Preload all models to CPU at startup"""
-    print("Preloading models to CPU...")
-    for model_name in model_choices:
-        try:
-            print(f"Loading {model_name}...")
-            # Load model to CPU with bfloat16 to save memory
-            model = AutoModelForCausalLM.from_pretrained(
-                model_name,
-                torch_dtype=torch.bfloat16,
-                trust_remote_code=True,
-                token=os.environ.get("token"),
-            )
-
-            # Load tokenizer
-            tokenizer = AutoTokenizer.from_pretrained(
-                model_name,
-                trust_remote_code=True,
-                token=os.environ.get("token")
-            )
-            tokenizer.eos_token = "<|im_end|>"
-
-            LOADED_MODELS[model_name] = model
-            LOADED_TOKENIZERS[model_name] = tokenizer
-            print(f"Successfully loaded {model_name}")
-        except Exception as e:
-            print(f"Failed to load {model_name}: {e}")
+    print(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] Preloading models to CPU...")
+
+    with MODEL_LOCK:
+        for model_name in model_choices:
+            model, tokenizer = load_single_model(model_name)
+            if model is not None and tokenizer is not None:
+                LOADED_MODELS[model_name] = model
+                LOADED_TOKENIZERS[model_name] = tokenizer
+
+def reload_models_task(model_choices):
+    """Background task to reload models every 15 minutes"""
+    print(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] Starting model reload task...")
+
+    while not SHUTDOWN_EVENT.is_set():
+        # Wait for 15 minutes (900 seconds); wait() returns True as soon as
+        # the event is set, so shutdown interrupts the sleep immediately
+        if SHUTDOWN_EVENT.wait(900):
+            break
+
+        print(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] Starting periodic model reload...")
+
+        # Load fresh copies into temporary dictionaries first, outside the lock
+        new_models = {}
+        new_tokenizers = {}
+        for model_name in model_choices:
+            model, tokenizer = load_single_model(model_name)
+            if model is not None and tokenizer is not None:
+                new_models[model_name] = model
+                new_tokenizers[model_name] = tokenizer
+
+        # Replace old models with new ones atomically
+        with MODEL_LOCK:
+            # Drop old entries; iterate over a copy of the keys, since
+            # deleting from a dict while iterating it raises a RuntimeError
+            for model_name in list(LOADED_MODELS):
+                del LOADED_MODELS[model_name]
+                LOADED_TOKENIZERS.pop(model_name, None)
+
+            # Clear memory
+            clear_memory()
+
+            # Update with new models
+            LOADED_MODELS.update(new_models)
+            LOADED_TOKENIZERS.update(new_tokenizers)
+
+        print(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] Model reload completed")
 
 @spaces.GPU()
 def get_model_pipeline(model_name):
     """Move selected model to GPU and create pipeline"""
-    if model_name not in LOADED_MODELS:
-        raise ValueError(f"Model {model_name} not found in preloaded models")
-
-    # Move model to GPU
-    model = LOADED_MODELS[model_name]
-    tokenizer = LOADED_TOKENIZERS[model_name]
+    with MODEL_LOCK:
+        if model_name not in LOADED_MODELS:
+            raise ValueError(f"Model {model_name} not found in preloaded models")
+
+        # Get model and tokenizer references
+        model = LOADED_MODELS[model_name]
+        tokenizer = LOADED_TOKENIZERS[model_name]
 
     # Create pipeline with the GPU model
     pipe = pipeline(
@@ -134,6 +205,10 @@ model_choices = [
 # Preload all models to CPU at startup
 preload_models(model_choices)
 
+# Start the background reload task as a daemon thread
+reload_thread = Thread(target=reload_models_task, args=(model_choices,), daemon=True)
+reload_thread.start()
+
 # Create Gradio interface
 g = gr.ChatInterface(
     fn=generate,
@@ -160,4 +235,8 @@ g = gr.ChatInterface(
 )
 
 if __name__ == "__main__":
-    g.launch()
+    try:
+        g.launch()
+    finally:
+        # Signal the reload thread to stop when the app shuts down
+        SHUTDOWN_EVENT.set()
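
The core of the change is a double-buffered refresh: a daemon thread sleeps on an Event rather than time.sleep() so a shutdown signal interrupts the wait immediately, builds fresh objects outside the lock, then swaps them in under the lock so readers never observe a half-updated registry. A minimal self-contained sketch of that pattern follows; the loader, interval, and registry names are illustrative placeholders, not anything from this commit:

from threading import Thread, Lock, Event

REGISTRY = {}       # shared state that request handlers read
LOCK = Lock()       # guards REGISTRY
SHUTDOWN = Event()  # set once at shutdown to stop the thread

def load_fresh():
    """Placeholder loader; the Space loads HF models here."""
    return {"model": object()}

def refresh_task(interval=5.0):
    # Event.wait(timeout) returns False on timeout and True once the event
    # is set, so this one line both sleeps and checks for shutdown.
    while not SHUTDOWN.wait(interval):
        fresh = load_fresh()        # build replacements outside the lock
        with LOCK:                  # then swap atomically
            REGISTRY.clear()
            REGISTRY.update(fresh)

Thread(target=refresh_task, daemon=True).start()
try:
    pass  # serve requests here; read REGISTRY only while holding LOCK
finally:
    SHUTDOWN.set()  # wakes wait() at once; the loop then exits

The same wait() return value is what the commit relies on in "if SHUTDOWN_EVENT.wait(900): break". One trade-off worth noting: because the fresh models are loaded before the old ones are released, peak CPU memory roughly doubles during each reload.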
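
For context on two of the imports touched here: TextIteratorStreamer and Thread are the standard pair for streaming tokens out of a transformers pipeline like the one get_model_pipeline returns. The app's generate function lies outside this diff, so the following is only a sketch of the usual pattern, not the actual code:

from threading import Thread
from transformers import TextIteratorStreamer

def stream_reply(pipe, tokenizer, prompt, max_new_tokens=256):
    # The streamer yields decoded text pieces as the model emits tokens
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    # Run generation in a worker thread so this thread can consume the streamer
    worker = Thread(target=pipe, args=(prompt,),
                    kwargs={"streamer": streamer, "max_new_tokens": max_new_tokens})
    worker.start()
    partial = ""
    for piece in streamer:
        partial += piece
        yield partial  # a Gradio ChatInterface renders each partial string
    worker.join()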