Spaces:
Running
on
L40S
Running
on
L40S
import torch | |
from torch.func import functional_call | |
import queue | |
import threading | |
from typing import Dict, List, Any | |
import omegaconf | |
from pydantic import BaseModel, validator | |
from typing import Optional | |
from functools import wraps | |
def _callable_once(func):
    """Decorate a method so it may be invoked at most once per instance.

    The first call sets a per-instance flag attribute named
    ``_called_once_<method name>`` and forwards to ``func``; any later call
    on the same instance raises ``RuntimeError``.

    Args:
        func: The method to guard. It must take ``self`` as first argument.

    Returns:
        The guarding wrapper, carrying the metadata of ``func``.

    Raises:
        RuntimeError: From the wrapper, on the second and later calls.
    """
    method_called_flag = f"_called_once_{func.__name__}"

    # wraps() keeps __name__/__doc__ of the guarded method intact so that
    # introspection and error traces still point at the real method.
    @wraps(func)
    def wrapper(self, *args, **kwargs):
        if getattr(self, method_called_flag, False):
            raise RuntimeError(f"{func.__name__} can only be called once.")
        # The flag is set before the call, so a call that raises still
        # counts as "called".
        setattr(self, method_called_flag, True)
        return func(self, *args, **kwargs)

    return wrapper
class OffloadCleanCacheWrapperParam(BaseModel):
    """Settings describing one method that should be wrapped for cache cleaning."""

    # Object that owns the method to be wrapped.
    module: Any
    # Name of the method on `module`.
    method_name: str
    # Threshold expressed in GB (interpretation is up to the consumer).
    diff_mem_gb_thre: float
class OffloadParam(BaseModel):
    """Validated bundle of offload settings.

    The three ``*_param_dict`` methods slice the settings into keyword
    dictionaries: one for constructing a profiler, one for the layer-offload
    call and one for the optional cache-cleaning wrapper.
    """

    # Module whose layers are to be offloaded.
    offload_module: Any
    # CPU memory budget in GB.
    cpu_mem_gb: float
    pre_copy_step: Optional[int] = None
    clean_cache_after_forward: Optional[bool] = None
    # Given as a string such as 'torch.float16'; `parse_dtype` replaces it
    # with the matching torch.dtype object during validation.
    dtype: Optional[str] = None
    offload_layer_dict: Dict[str, int] = {}
    ignore_layer_list: List[str] = []
    clean_cache_wrapper: Optional[OffloadCleanCacheWrapperParam] = None
    debug: Optional[bool] = None

    # Registered as a field validator: without the registration the method is
    # never invoked and `dtype` would reach the offload code as a plain
    # string instead of a torch.dtype.
    @validator('dtype')
    def parse_dtype(cls, value):
        """Translate a dtype name into the corresponding ``torch.dtype``.

        Args:
            value: ``None`` or one of 'torch.float16', 'torch.float32',
                'torch.float64', 'torch.int64'.

        Returns:
            ``None`` when ``value`` is ``None``, otherwise the torch dtype.

        Raises:
            ValueError: If ``value`` is not one of the supported names.
        """
        if value is None:
            return None
        dtype_map = {
            'torch.float16': torch.float16,
            'torch.float32': torch.float32,
            'torch.float64': torch.float64,
            'torch.int64': torch.int64,
        }
        if value not in dtype_map:
            raise ValueError(f"Unsupported dtype: {value}")
        return dtype_map[value]

    def init_param_dict(self):
        """Return constructor keyword arguments; unset optional fields are omitted."""
        param_dict = {'cpu_mem_gb': self.cpu_mem_gb}
        for key in ('pre_copy_step', 'clean_cache_after_forward', 'debug'):
            field_value = getattr(self, key)
            if field_value is not None:
                param_dict[key] = field_value
        return param_dict

    def offload_layer_param_dict(self):
        """Return the keyword arguments describing which layers to offload."""
        return {
            'module': self.offload_module,
            'offload_layer_dict': self.offload_layer_dict,
            'ignore_layer_list': self.ignore_layer_list,
            'dtype': self.dtype,
        }

    def clean_cache_param_dict(self):
        """Return cache-wrapper keyword arguments, or ``{}`` when none is configured."""
        wrapper_cfg = self.clean_cache_wrapper
        if wrapper_cfg is None:
            return {}
        return {
            'module': wrapper_cfg.module,
            'method_name': wrapper_cfg.method_name,
            'diff_mem_gb_thre': wrapper_cfg.diff_mem_gb_thre,
        }

    def recursive_print(model, indent=0):
        """Print every field of ``model``, descending into nested pydantic models.

        Written without ``self``; within this class it is invoked as
        ``OffloadParam.recursive_print(model, indent)``.

        Args:
            model: A pydantic model instance.
            indent: Number of leading spaces for this nesting level.
        """
        pad = " " * indent
        inner_pad = " " * (indent + 2)
        for field_name in model.__fields__:
            field_value = getattr(model, field_name)
            print(pad + f"{field_name}:")
            if isinstance(field_value, BaseModel):
                print(inner_pad + f"--- Nested model: {field_value.__class__.__name__}")
                OffloadParam.recursive_print(field_value, indent + 4)
                continue
            print(inner_pad + f"class: {field_value.__class__.__name__}")
            # Values of torch modules are not printed, only their class name.
            if not isinstance(field_value, torch.nn.Module):
                print(inner_pad + f"value: {field_value}")

    def show(self):
        """Print all settings between two separator lines."""
        print("-"*20 + "[OffloadParam]" + "-"*20)
        OffloadParam.recursive_print(self)
        print("-"*40)
class OffloadParamParse:
    """Helpers that turn an OmegaConf config into an ``OffloadParam``.

    Both helpers are static: they can be called on the class (as the code in
    this class does) or on an instance.
    """

    def __init__(self):
        pass

    @staticmethod
    def _get_model(root_model: torch.nn.Module, model_dir: str):
        """Resolve a dotted attribute path such as ``self.a.b`` on ``root_model``.

        Args:
            root_model: Object the path is resolved against.
            model_dir: Dotted path that must start with ``self``.

        Returns:
            The attribute reached by following the path.

        Raises:
            AssertionError: If the path does not start with ``self`` or a
                segment is not an attribute of the object reached so far.
        """
        assert(model_dir.startswith("self")), f"model_dir {model_dir} must startswith `self`"
        model = root_model
        for layer in model_dir.split('.'):
            # Segments literally named "self" are skipped wherever they occur.
            if layer == "self":
                continue
            assert(hasattr(model, layer)), f"model not has layer [{layer}]!"
            model = getattr(model, layer)
        return model

    @staticmethod
    def parse_config(root_model: torch.nn.Module, cfg: omegaconf.DictConfig)->OffloadParam:
        """Build an ``OffloadParam`` from ``cfg``.

        ``offload_module``, ``cpu_mem_gb`` and ``dtype`` are required keys;
        every other key is optional and falls back to ``None`` / empty.

        Args:
            root_model: Model against which the module paths in ``cfg`` are
                resolved.
            cfg: Configuration node holding the offload settings.

        Returns:
            The populated ``OffloadParam``.

        Raises:
            AssertionError: If a required key is missing or a module path
                cannot be resolved.
        """
        assert(hasattr(cfg, "offload_module") and hasattr(cfg, "cpu_mem_gb") and hasattr(cfg, "dtype"))

        def _optional(key, default=None):
            # Same presence test as for the required keys above.
            return getattr(cfg, key) if hasattr(cfg, key) else default

        offload_module = OffloadParamParse._get_model(root_model, cfg.offload_module)

        # Containers are copied into plain dict/list objects so the resulting
        # parameter object does not hold config-node containers.
        offload_layer_dict = dict(cfg.offload_layer_dict.items()) \
            if hasattr(cfg, "offload_layer_dict") else {}
        ignore_layer_list = list(cfg.ignore_layer_list) \
            if hasattr(cfg, "ignore_layer_list") else []

        clean_cache_wrapper = None
        if hasattr(cfg, "clean_cache_wrapper"):
            clean_cache_cfg = cfg.clean_cache_wrapper
            clean_cache_wrapper = OffloadCleanCacheWrapperParam(
                module=OffloadParamParse._get_model(root_model, clean_cache_cfg.module),
                method_name=clean_cache_cfg.method_name,
                diff_mem_gb_thre=clean_cache_cfg.diff_mem_gb_thre)

        return OffloadParam(
            offload_module=offload_module,
            cpu_mem_gb=cfg.cpu_mem_gb,
            pre_copy_step=_optional("pre_copy_step"),
            clean_cache_after_forward=_optional("clean_cache_after_forward"),
            dtype=cfg.dtype,
            offload_layer_dict=offload_layer_dict,
            ignore_layer_list=ignore_layer_list,
            clean_cache_wrapper=clean_cache_wrapper,
            debug=_optional("debug")
        )
class LayerParamStruct:
    """Plain record holding a counter and a device-state slot for one layer."""

    def __init__(self):
        # Slot for the layer's state on the device; empty until assigned.
        self.device_state = None
        # Integer counter, starts at zero.
        self.count = 0
class OffloadProfiler:
    """Keep layer parameters in CPU memory and copy them to the GPU on demand.

    Layers selected by ``offload_layer`` get their parameters moved into CPU
    tensors and their ``forward`` replaced by a wrapper. The wrapper asks a
    background thread to copy the layer's ``state_dict`` to the CUDA device on
    a dedicated stream, waits for the copy, and runs the layer through
    ``functional_call`` with the device-side state. Once the execution order
    of the wrapped layers is known, copies for upcoming layers are queued
    ahead of time (``pre_copy_step`` layers ahead).
    """

    def __init__(self, device_index=0, cpu_mem_gb=-1, pre_copy_step=1, clean_cache_after_forward=False, debug=False):
        """Create the profiler and start its copy thread.

        Args:
            device_index: Index of the CUDA device that receives the copies.
            cpu_mem_gb: Budget (GB) of parameter bytes that may be moved to
                CPU memory; a negative value disables the limit.
            pre_copy_step: How many layers ahead of the running one are
                queued for copying.
            clean_cache_after_forward: If true, the CUDA cache may be emptied
                after each wrapped forward.
            debug: If true, print diagnostic messages.
        """
        self.clean_cache_after_forward = clean_cache_after_forward
        self.cpu_mem_gb = cpu_mem_gb
        self.cpu_mem_b_count = 0
        self.device_index = device_index
        self.execution_order = []
        self.execution_order_idx = {}

        # Probe whether pinned host memory is actually obtainable.
        self.pin_memory = False
        test_data = torch.rand(1, 1, device='cpu')
        pin_data = test_data.pin_memory()
        self.pin_memory = pin_data.is_pinned()
        print(f"pin:{self.pin_memory}")

        self.copy_stream = torch.cuda.Stream()
        self.copy_queue = queue.Queue()
        self.layer_param:Dict[str, LayerParamStruct] = {}
        self.model_map = {}
        self.stop_flag = False
        self.copy_condition = threading.Condition()
        self.queue_condition = threading.Condition()
        self.mem_line_b = 0
        self.cur_copy_idx = 0
        self.execute_over = False
        self.pre_copy_step = pre_copy_step
        # Ring of slots that keep the most recent device-side states
        # referenced by the copy thread.
        self.tmp_state_list = [None] * (pre_copy_step + 2)
        self.tmp_state_idx = 0
        self.debug = debug

        # Started last so the worker never observes a half-initialised object.
        self.copy_thread = threading.Thread(target=self._copy_thread_fun)
        self.copy_thread.daemon = True
        self.copy_thread.start()

    def stop(self):
        """Stop the copy thread, wait for it, and drop the held state.

        After this call ``layer_param``, ``model_map`` and ``copy_stream`` no
        longer exist on the instance.
        """
        self.stop_flag = True
        with self.queue_condition:
            self.queue_condition.notify()
        self.copy_thread.join()
        del self.layer_param
        del self.model_map
        del self.copy_stream

    def _copy_thread_fun(self):
        """Worker loop: copy the state of each queued layer to the device."""
        device = torch.device(f"cuda:{self.device_index}")
        while not self.stop_flag:
            layer_name = "--"
            with self.queue_condition:
                while self.copy_queue.qsize() == 0 and not self.stop_flag:
                    self.queue_condition.wait()
                if self.stop_flag:
                    break
                layer_name = self.copy_queue.get()
            with torch.cuda.stream(self.copy_stream):
                if layer_name not in self.model_map:
                    print(f"get model error! {layer_name}")
                    continue
                model = self.model_map[layer_name]
                self.tmp_state_list[self.tmp_state_idx] = {
                    k: v.to(device, non_blocking=False)
                    for k, v in model.state_dict().items()
                }
                self.copy_stream.synchronize()
                device_state = self.tmp_state_list[self.tmp_state_idx]
                self.tmp_state_idx = (self.tmp_state_idx + 1) % len(self.tmp_state_list)
                # Publish the copy and wake the forward wrapper waiting on it.
                with self.copy_condition:
                    if layer_name in self.layer_param:
                        self.layer_param[layer_name].count += 1
                    else:
                        self.layer_param[layer_name] = LayerParamStruct()
                        self.layer_param[layer_name].count = 1
                    self.layer_param[layer_name].device_state = device_state
                    self.copy_condition.notify()
        print("copy thread stop..")

    def _get_new_step_copy_begin_end(self, tag_name):
        """Return the ``(begin, end)`` index window of layers to queue next.

        Indices refer to ``execution_order``; ``end`` is exclusive and may
        exceed the list length (callers wrap it with a modulo). If the window
        is implausible (too wide or negative) the copy cursor is reset to the
        layer that is running and the window is recomputed.

        Args:
            tag_name: Tag of the layer whose forward is running; it must be
                present in ``execution_order_idx``.
        """
        pre_copy_step = self.pre_copy_step
        pre_copy_step = min(pre_copy_step, len(self.execution_order) // 2)
        cur_exe_idx = self.execution_order_idx[tag_name]
        copy_begin = self.cur_copy_idx
        copy_end = cur_exe_idx + pre_copy_step + 1
        if copy_end - copy_begin > len(self.execution_order):
            copy_end %= len(self.execution_order)
        if copy_end - copy_begin > pre_copy_step + 1 or copy_end - copy_begin < 0:
            # jump
            self.cur_copy_idx = cur_exe_idx
            copy_begin, copy_end = self._get_new_step_copy_begin_end(tag_name=tag_name)
        return copy_begin, copy_end

    def _refresh_mem_line(self, diff_mem_b_thre, prefix="", suffix=""):
        """Empty the CUDA cache when reserved memory exceeds the current line.

        After emptying, the line is moved: halfway between the new and old
        reserved size if more than ``diff_mem_b_thre`` bytes were released,
        otherwise just above the old reserved size.

        Args:
            diff_mem_b_thre: Released-bytes threshold deciding the new line.
            prefix: Text put in front of the debug message.
            suffix: Text appended to the debug message.
        """
        reserved = torch.cuda.memory_reserved()
        if reserved <= self.mem_line_b:
            return
        torch.cuda.empty_cache()
        cur_reserved = torch.cuda.memory_reserved()
        diff_mem = reserved - cur_reserved
        if diff_mem > diff_mem_b_thre:
            self.mem_line_b = cur_reserved + (reserved - cur_reserved) / 2 + 10
        else:
            self.mem_line_b = reserved + 10
        if self.debug:
            print(f"{prefix}mem line update, clean cache:{reserved/1024/1024}, cur mem: {cur_reserved/1024/1024} new limit: {self.mem_line_b / 1024 / 1024}{suffix}")

    def make_forward_wrapper(self, module, tag_name, ignore_layer_list=None):
        """Move ``module``'s parameters to CPU memory and wrap its ``forward``.

        Parameters whose qualified name (``tag_name`` + parameter name)
        starts with an entry of ``ignore_layer_list`` are left where they
        are. Offloading stops once the CPU budget is used up. ``forward`` is
        only replaced if at least one parameter was offloaded.

        Args:
            module: The layer to offload.
            tag_name: Qualified name identifying the layer.
            ignore_layer_list: Name prefixes of parameters to keep in place.

        Returns:
            The same ``module`` object.
        """
        ignore_prefixes = tuple(ignore_layer_list or ())
        original_forward = module.forward
        layer_param_size = sum(
            param.data.numel() * param.data.element_size()
            for _, param in module.named_parameters()
        ) / 1024 / 1024  # MB
        taget_cpu_mem_b = self.cpu_mem_gb * 1024 * 1024 * 1024
        offload = False
        for name, param in module.named_parameters():
            p_name = f"{tag_name}.{name}" if tag_name else name
            # Skip the parameter itself (not merely the prefix being tested)
            # when it is on the ignore list.
            if ignore_prefixes and p_name.startswith(ignore_prefixes):
                if self.debug:
                    print(f"ignore layer param: {p_name}")
                continue
            if taget_cpu_mem_b >= 0 and self.cpu_mem_b_count >= taget_cpu_mem_b:
                break
            cpu_data = torch.empty_strided(size=param.data.size(),
                                           stride=param.data.stride(),
                                           dtype=param.data.dtype,
                                           layout=param.data.layout,
                                           device='cpu',
                                           pin_memory=self.pin_memory)
            cpu_data.copy_(param.data)
            param.data = cpu_data
            self.cpu_mem_b_count += param.data.numel() * param.data.element_size()
            offload = True
        if self.debug:
            print(f"layer: {tag_name}, type: {module.__class__.__name__}, size(MB): {layer_param_size}, offload: {offload}, sum_offload_size(MB): {self.cpu_mem_b_count/1024/1024}")
        if offload:
            copy_condition = self.copy_condition
            queue_condition = self.queue_condition
            copy_queue = self.copy_queue

            def forward_wrapper(*args, **kwargs):
                # functional_call invokes module.forward, so the original
                # must be in place while the layer runs.
                module.forward = original_forward
                try:
                    if tag_name not in self.execution_order_idx:
                        # First execution: record the order and request a copy.
                        self.model_map[tag_name] = module
                        self.execution_order.append(tag_name)
                        self.execution_order_idx[tag_name] = len(self.execution_order) - 1
                        copy_queue.put(tag_name)
                        with queue_condition:
                            queue_condition.notify()
                    else:
                        copy_begin, copy_end = self._get_new_step_copy_begin_end(tag_name=tag_name)
                        if copy_end > copy_begin:
                            order_len = len(self.execution_order)
                            for idx in range(copy_begin, copy_end):
                                copy_queue.put(self.execution_order[idx % order_len])
                            with queue_condition:
                                queue_condition.notify()
                            self.cur_copy_idx = copy_end % len(self.execution_order)
                    with self.copy_condition:
                        while tag_name not in self.layer_param:
                            copy_condition.wait()
                        run_state = self.layer_param[tag_name].device_state
                        self.layer_param[tag_name].count -= 1
                    module.eval()
                    with torch.no_grad():
                        output = functional_call(module, run_state, args=args, kwargs=kwargs)
                    with self.copy_condition:
                        if self.layer_param[tag_name].count == 0:
                            del self.layer_param[tag_name]
                    if self.clean_cache_after_forward:
                        self._refresh_mem_line(
                            1 * (1024 ** 3),
                            prefix="child ",
                            suffix=f", child name: {tag_name}",
                        )
                finally:
                    # Re-arm the wrapper even if the layer raised; otherwise
                    # later calls would run on the CPU-resident parameters.
                    module.forward = forward_wrapper
                return output

            module.forward = forward_wrapper
        torch.cuda.empty_cache()
        return module

    def reset_empty_cache_mem_line(self):
        """Reset the cache-cleaning line to zero and empty the CUDA cache."""
        self.mem_line_b = 0
        torch.cuda.empty_cache()

    def clean_cache_wrapper(self, module, method_name='', diff_mem_gb_thre=1):
        """Wrap ``module.<method_name>`` so the CUDA cache is checked after each call.

        Args:
            module: Object owning the method.
            method_name: Name of the method to wrap.
            diff_mem_gb_thre: Released-memory threshold in GB used when
                moving the cache-cleaning line.

        Returns:
            ``module``; unchanged (with a message printed) if it has no
            callable attribute named ``method_name``.
        """
        if not hasattr(module, method_name) or not callable(getattr(module, method_name)):
            print(f"no this method {method_name}")
            return module
        original_fun = getattr(module, method_name)
        diff_mem_b_thre = diff_mem_gb_thre * (1024 ** 3)
        self.reset_empty_cache_mem_line()

        def clean_wrapper(*args, **kwargs):
            # The original is reinstated for the duration of the call.
            setattr(module, method_name, original_fun)
            try:
                output = original_fun(*args, **kwargs)
                self._refresh_mem_line(diff_mem_b_thre)
            finally:
                # Keep the wrapper installed even when the call raised.
                setattr(module, method_name, clean_wrapper)
            return output

        setattr(module, method_name, clean_wrapper)
        return module

    def offload_layer(self, module, offload_layer_dict=None, ignore_layer_list=None, dtype:torch.dtype = None):
        """Offload the selected layers of ``module``; see ``_offload_layer``.

        Returns:
            The same ``module`` object.
        """
        # NOTE(review): the docstring of _offload_layer states a model can
        # only be offloaded once, but nothing here enforces it — confirm
        # whether a call-once guard is intended.
        return self._offload_layer(
            module=module,
            tag="",
            offload_layer_dict=offload_layer_dict,
            ignore_layer_list=ignore_layer_list,
            dtype=dtype
        )

    def _offload_layer(self, module, tag="", offload_layer_dict=None, ignore_layer_list=None, dtype:torch.dtype = None):
        """
        Offload specific layers of a PyTorch model to a specified depth.
        A model can only be offloaded once.
        Args:
            module (torch.nn.Module):
                The PyTorch model containing the layers to offload. This is the model that will be modified in place.
            tag (str, optional):
                A string identifier for the model.
                Default is an empty string.
            offload_layer_dict (dict, optional):
                A dictionary where keys are layer names and values represent the depth at which the offloading should occur.
                For example,
                ```offload_layer_dict = {'cfm_wrapper': 5, 'hubert': 4}``` means that the `cfm_wrapper` layer should
                be offloaded at depth 5, and the `hubert` layer should be offloaded at depth 4.
                Default is an empty dictionary.
            ignore_layer_list (list, optional):
                A list of layer names or parameter identifiers to be ignored during the offloading process.
                Layers in this list will not be offloaded, even if they are present in the `offload_layer_dict`.
                For example,
                ```ignore_layer_list = ['cfm_wrapper.estimator.h', 'cfm_wrapper.estimator.adaln_single']```
                means that layers starting with `cfm_wrapper.estimator.h` or 'cfm_wrapper.estimator.adaln_single' will not be offload.
                Default is an empty list.
            dtype (torch.dtype, optional):
                The data type (e.g., `torch.float16`, `torch.float32`) to which the offloaded layers should be converted.
                If `None`, the data type of the layers will remain unchanged. Default is `None`.
        Returns:
            The same `module` object, modified in place.
        """
        if offload_layer_dict is None:
            offload_layer_dict = {}
        if ignore_layer_list is None:
            ignore_layer_list = []
        device = torch.device(f"cuda:{self.device_index}")

        def _move(tensor):
            # NOTE(review): unlike nn.Module.to(dtype), this casts every
            # tensor, including non-floating ones — confirm that is intended.
            tensor.data = tensor.data.to(device)
            if dtype is not None:
                tensor.data = tensor.data.to(dtype)

        # Tensors owned directly by this module: parameters, buffers and
        # public plain-tensor attributes.
        for p in module._parameters.values():
            if p is not None:
                _move(p)
        for b in module._buffers.values():
            if b is not None:
                _move(b)
        for attr_name, attr in module.__dict__.items():
            if isinstance(attr, torch.Tensor) and not attr_name.startswith('_'):
                _move(attr)

        for name, child in module.named_children():
            current_tag = f"{tag}.{name}" if tag else name
            child = child.to(device)
            if dtype is not None:
                child = child.to(dtype)
            torch.cuda.empty_cache()
            setattr(module, name, child)

            # Only sub-trees whose top-level name is listed are offloaded.
            pre_name = current_tag.split('.')[0]
            if pre_name not in offload_layer_dict:
                param_size = sum(
                    p.data.numel() * p.data.element_size() for p in child.parameters()
                ) / 1024 / 1024
                if self.debug:
                    print(f"not offload layer {current_tag}, size: {param_size}MB")
                continue

            # Descend until the configured depth is reached or a leaf is hit.
            layer_count = current_tag.count('.') + 1
            layer_deep = offload_layer_dict[pre_name]
            has_children = any(child.named_children()) and layer_count < layer_deep
            if has_children:
                self._offload_layer(module=child,
                                    tag=current_tag,
                                    offload_layer_dict=offload_layer_dict,
                                    ignore_layer_list=ignore_layer_list,
                                    dtype=dtype)
                continue

            ignore = False
            for i_layer in ignore_layer_list:
                if current_tag.startswith(i_layer):
                    ignore = True
                    if self.debug:
                        print(f"ignore layer offload: {current_tag}")
                    break
            if hasattr(child, "forward") and not ignore:
                self.make_forward_wrapper(
                    child, current_tag, ignore_layer_list=ignore_layer_list
                )
        return module

    def get_execution_order(self):
        """Return the list of layer tags in the order they were first executed."""
        return self.execution_order