Commit 76cd760 by alex
Parent(s): fd0980c
further optimisation
Files changed:
- generate.py (+2 -2)
- wan/animate.py (+2 -0)
- wan/modules/animate/model_animate.py (+4 -5)
- wan/modules/animate/preprocess/process_pipepline.py (+101 -47)
- wan/modules/model.py (+8 -11)
generate.py

@@ -114,7 +114,7 @@ def _parse_args():
     args.size = "720*1280"
     args.frame_num = None
     args.ckpt_dir = "./Wan2.2-Animate-14B/"
-    args.offload_model = True
+    args.offload_model = False
     args.ulysses_size = 1
     args.t5_fsdp = False
     args.t5_cpu = False
@@ -130,7 +130,7 @@ def _parse_args():
     args.sample_steps = None
     args.sample_shift = None
     args.sample_guide_scale = None
-    args.convert_model_dtype = False
+    args.convert_model_dtype = True
 
     # animate
     args.refert_num = 1
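
A minimal sketch, not taken from this repo, of what the two toggled defaults trade off: offload_model=False keeps the diffusion model resident on the GPU between denoising steps, and convert_model_dtype=True casts its weights to a lower-precision dtype up front. The builder function and flag handling below are illustrative assumptions, not the actual Wan2.2 loader.

```python
import torch

def load_noise_model(build_fn, device="cuda",
                     offload_model=False, convert_model_dtype=True):
    """Illustrative only: how flags like the two above typically trade VRAM for speed."""
    model = build_fn()                    # hypothetical constructor for the denoising model
    if convert_model_dtype:
        model = model.to(torch.bfloat16)  # smaller weights, faster matmuls
    if not offload_model:
        model = model.to(device)          # stay resident; no per-step CPU<->GPU copies
    return model
```
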
wan/animate.py

@@ -131,6 +131,8 @@ class WanAnimate:
             checkpoint_dir=checkpoint_dir,
             config=config
         )
+
+        # self.noise_model = torch.compile(self.noise_model)
 
         if use_sp:
             self.sp_size = get_world_size()
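
The only change here is a commented-out torch.compile call. As a hedged sketch, if one wanted to experiment with it, the call could be gated behind a flag so the compilation warm-up is only paid when requested; maybe_compile and its arguments are hypothetical helpers, not part of WanAnimate.

```python
import torch

def maybe_compile(model, enabled: bool = False):
    # torch.compile can reduce steady-state latency, but the first call pays a
    # compilation warm-up, which is presumably why the line above stays commented out.
    if enabled and hasattr(torch, "compile"):
        model = torch.compile(model, mode="reduce-overhead")
    return model

# e.g.: self.noise_model = maybe_compile(self.noise_model, enabled=False)
```
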
wan/modules/animate/model_animate.py

@@ -313,11 +313,12 @@ class WanAnimateModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
         # buffers (don't use register_buffer otherwise dtype will be changed in to())
         assert (dim % num_heads) == 0 and (dim // num_heads) % 2 == 0
         d = dim // num_heads
-        self.freqs = torch.cat([
+        _freqs = torch.cat([
             rope_params(1024, d - 4 * (d // 6)),
             rope_params(1024, 2 * (d // 6)),
             rope_params(1024, 2 * (d // 6))
         ], dim=1)
+        self.register_buffer("freqs", _freqs, persistent=False)
 
         self.img_emb = MLPProj(1280, dim)
 
@@ -381,9 +382,7 @@ class WanAnimateModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
             face_pixel_values=None
     ):
         # params
-        device = self.patch_embedding.weight.device
-        if self.freqs.device != device:
-            self.freqs = self.freqs.to(device)
+        freqs = self.freqs
 
         if y is not None:
             x = [torch.cat([u, v], dim=0) for u, v in zip(x, y)]
 
@@ -428,7 +427,7 @@ class WanAnimateModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
                 e=e0,
                 seq_lens=seq_lens,
                 grid_sizes=grid_sizes,
-                freqs=self.freqs,
+                freqs=freqs,
                 context=context,
                 context_lens=context_lens)
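
The change above replaces a plain tensor attribute plus a per-forward device check with a non-persistent buffer that follows the module's device. A minimal standalone sketch of that pattern, with an invented module and table sizes:

```python
import torch
import torch.nn as nn

class RopeTable(nn.Module):
    def __init__(self, dim=64, max_len=1024):
        super().__init__()
        t = torch.arange(max_len, dtype=torch.float32)
        inv = 1.0 / (10000 ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
        # A buffer moves with .to()/.cuda() on the module, so forward() needs no
        # "if self.freqs.device != device" check; persistent=False keeps the
        # precomputed table out of state_dict checkpoints.
        self.register_buffer("freqs", torch.outer(t, inv), persistent=False)

    def forward(self, positions):
        return self.freqs[positions]

m = RopeTable().to("cuda" if torch.cuda.is_available() else "cpu")
print(m.freqs.device)  # follows the module's device automatically
```
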
wan/modules/animate/preprocess/process_pipepline.py

@@ -19,9 +19,9 @@ from utils import resize_by_area, get_frame_indices, padding_resize, get_face_bboxes
 from human_visualization import draw_aapose_by_meta_new
 from retarget_pose import get_retarget_pose
 import sam2.modeling.sam.transformer as transformer
-transformer.USE_FLASH_ATTN = False
+transformer.USE_FLASH_ATTN = True
 transformer.MATH_KERNEL_ON = True
-transformer.OLD_GPU = True
+transformer.OLD_GPU = False
 from sam_utils import build_sam2_video_predictor
 
 
@@ -401,52 +401,53 @@ class ProcessPipeline():
             key_frame_boxes.append(np.array([x1, y1, x2, y2], dtype=np.float32))
 
         # init SAM2 for this chunk
-        used_box = False
-        try:
-            # If your predictor exposes a box API, this is ideal.
-            _ = self.predictor.add_new_box(
-                inference_state=inference_state,
-                frame_idx=ann_frame_idx,
-                obj_id=ann_obj_id,
-                box=box_xyxy[None, :]  # shape (1, 4)
-            )
-            used_box = True
-        except Exception:
-            used_box = False
+        with torch.no_grad(), torch.cuda.amp.autocast(dtype=torch.float16):
+            inference_state = self.predictor.init_state_v2(frames=each_frames)
+            self.predictor.reset_state(inference_state)
+            ann_obj_id = 1
+
+            # seed with box prompts (preferred), else fall back to points
+            for ann_frame_idx, box_xyxy in zip(key_frame_index_list, key_frame_boxes):
+                used_box = False
+                try:
+                    # If your predictor exposes a box API, this is ideal.
+                    _ = self.predictor.add_new_box(
+                        inference_state=inference_state,
+                        frame_idx=ann_frame_idx,
+                        obj_id=ann_obj_id,
+                        box=box_xyxy[None, :]  # shape (1, 4)
+                    )
+                    used_box = True
+                except Exception:
+                    used_box = False
+
+                if not used_box:
+                    # Fallback: sample a few positive points inside the box
+                    x1, y1, x2, y2 = box_xyxy.astype(int)
+                    cx, cy = (x1 + x2) // 2, (y1 + y2) // 2
+                    pts = np.array([
+                        [cx, cy],
+                        [x1 + (x2 - x1) // 4, cy],
+                        [x2 - (x2 - x1) // 4, cy],
+                        [cx, y1 + (y2 - y1) // 4],
+                        [cx, y2 - (y2 - y1) // 4],
+                    ], dtype=np.int32)
+                    labels = np.ones(len(pts), dtype=np.int32)  # 1 = positive
+                    _ = self.predictor.add_new_points(
+                        inference_state=inference_state,
+                        frame_idx=ann_frame_idx,
+                        obj_id=ann_obj_id,
+                        points=pts,
+                        labels=labels,
+                    )
+
+            # propagate across the chunk
+            video_segments = {}
+            for out_frame_idx, out_obj_ids, out_mask_logits in self.predictor.propagate_in_video(inference_state):
+                video_segments[out_frame_idx] = {
+                    out_obj_id: (out_mask_logits[i] > 0.0).cpu().numpy()
+                    for i, out_obj_id in enumerate(out_obj_ids)
+                }
 
         # collect masks (single object id)
         for out_frame_idx in range(len(video_segments)):
 
@@ -475,7 +476,7 @@ class ProcessPipeline():
             continue
 
         # choose a few key frames to seed the object
-        key_frame_num =
+        key_frame_num = 1
         key_frame_step = max(1, len(kp2ds) // key_frame_num)
         key_frame_index_list = list(range(0, len(kp2ds), key_frame_step))[:key_frame_num]
 
@@ -529,3 +530,56 @@ class ProcessPipeline():
 
         return all_mask
 
+    def get_face_bbox_masks(self, frames, kp2ds_all, scale=1.3, feather_px=0, keep_soft=False):
+        """
+        Create a per-frame mask that's simply the face bounding box.
+        - scale: bbox scale factor used by get_face_bboxes
+        - feather_px: optional Gaussian blur in pixels to feather edges
+        - keep_soft: if True, keep float [0,1] soft mask (after blur); else binarize to {0,1}
+        """
+        H, W = frames[0].shape[:2]
+
+        def _clip_box(x1, y1, x2, y2):
+            x1 = max(0, min(int(x1), W - 1))
+            x2 = max(0, min(int(x2), W - 1))
+            y1 = max(0, min(int(y1), H - 1))
+            y2 = max(0, min(int(y2), H - 1))
+            if x2 <= x1: x2 = min(W - 1, x1 + 1)
+            if y2 <= y1: y2 = min(H - 1, y1 + 1)
+            return x1, y1, x2, y2
+
+        masks = []
+        last_box = None
+        for meta in kp2ds_all:
+            # get_face_bboxes returns (x1, x2, y1, y2)
+            try:
+                x1, x2, y1, y2 = get_face_bboxes(
+                    meta['keypoints_face'][:, :2],
+                    scale=scale,
+                    image_shape=(H, W)
+                )
+                x1, y1, x2, y2 = _clip_box(x1, y1, x2, y2)
+                last_box = (x1, y1, x2, y2)
+            except Exception:
+                # fallback: reuse last seen box to avoid holes
+                if last_box is None:
+                    # no detection yet: push empty mask
+                    masks.append(np.zeros((H, W), dtype=np.uint8))
+                    continue
+                x1, y1, x2, y2 = last_box
+
+            m = np.zeros((H, W), dtype=np.float32)
+            m[y1:y2, x1:x2] = 1.0
+
+            if feather_px and feather_px > 0:
+                # kernel size must be odd and >= 3
+                k = max(3, int(feather_px) | 1)
+                m = cv2.GaussianBlur(m, (k, k), 0)
+
+            if keep_soft:
+                masks.append(m)  # float [0,1]
+            else:
+                masks.append((m >= 0.5).astype(np.uint8))  # hard {0,1}
+
+        return masks
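
In the new get_face_bbox_masks helper, the feathering step is the only non-obvious part. Below is a self-contained sketch of that step alone; the frame size and face box are arbitrary values chosen for illustration.

```python
import cv2
import numpy as np

H, W = 480, 640
x1, y1, x2, y2 = 200, 120, 440, 360     # example face box (x1, y1, x2, y2)

m = np.zeros((H, W), dtype=np.float32)
m[y1:y2, x1:x2] = 1.0                   # hard box mask

feather_px = 15
k = max(3, int(feather_px) | 1)         # Gaussian kernel must be odd and >= 3
soft = cv2.GaussianBlur(m, (k, k), 0)   # soft mask in [0, 1], as when keep_soft=True
hard = (soft >= 0.5).astype(np.uint8)   # re-binarized, as when keep_soft=False
```
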
wan/modules/model.py

@@ -30,7 +30,7 @@ def rope_params(max_seq_len, dim, theta=10000):
     freqs = torch.outer(
         torch.arange(max_seq_len),
         1.0 / torch.pow(theta,
-                        torch.arange(0, dim, 2).to(torch.float64).div(dim)))
+                        torch.arange(0, dim, 2).to(torch.float32).div(dim)))
     freqs = torch.polar(torch.ones_like(freqs), freqs)
     return freqs
 
@@ -41,14 +41,14 @@ def rope_apply(x, grid_sizes, freqs):
 
     # split freqs
     freqs = freqs.split([c - 2 * (c // 3), c // 3, c // 3], dim=1)
 
     # loop over samples
     output = []
     for i, (f, h, w) in enumerate(grid_sizes.tolist()):
         seq_len = f * h * w
 
         # precompute multipliers
-        x_i = torch.view_as_complex(x[i, :seq_len].to(torch.float64).reshape(
+        x_i = torch.view_as_complex(x[i, :seq_len].to(torch.float32).reshape(
             seq_len, n, -1, 2))
         freqs_i = torch.cat([
             freqs[0][:f].view(f, 1, 1, -1).expand(f, h, w, -1),
 
@@ -65,7 +65,6 @@ def rope_apply(x, grid_sizes, freqs):
         output.append(x_i)
     return torch.stack(output).float()
 
-
 class WanRMSNorm(nn.Module):
 
     def __init__(self, dim, eps=1e-5):
 
@@ -397,12 +396,12 @@ class WanModel(ModelMixin, ConfigMixin):
         # buffers (don't use register_buffer otherwise dtype will be changed in to())
         assert (dim % num_heads) == 0 and (dim // num_heads) % 2 == 0
         d = dim // num_heads
-        self.freqs = torch.cat([
+        _freqs = torch.cat([
             rope_params(1024, d - 4 * (d // 6)),
             rope_params(1024, 2 * (d // 6)),
             rope_params(1024, 2 * (d // 6))
-        ],
-                               dim=1)
+        ], dim=1)
+        self.register_buffer("freqs", _freqs, persistent=False)
 
         # initialize weights
         self.init_weights()
 
@@ -437,9 +436,7 @@ class WanModel(ModelMixin, ConfigMixin):
         if self.model_type == 'i2v':
             assert y is not None
         # params
-        device = self.patch_embedding.weight.device
-        if self.freqs.device != device:
-            self.freqs = self.freqs.to(device)
+        freqs = self.freqs
 
         if y is not None:
             x = [torch.cat([u, v], dim=0) for u, v in zip(x, y)]
 
@@ -482,7 +479,7 @@ class WanModel(ModelMixin, ConfigMixin):
                 e=e0,
                 seq_lens=seq_lens,
                 grid_sizes=grid_sizes,
-                freqs=self.freqs,
+                freqs=freqs,
                 context=context,
                 context_lens=context_lens)
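
The dtype edits in rope_params and rope_apply move the RoPE math from float64 to float32. A small self-contained sketch of the float32 variant; the 1024 length and the d - 4 * (d // 6) split mirror the calls above, while the concrete head dimension is an assumption for illustration.

```python
import torch

def rope_params_f32(max_seq_len, dim, theta=10000):
    # Same call pattern as above, but the angle table is built in float32,
    # so torch.polar returns complex64 instead of complex128.
    freqs = torch.outer(
        torch.arange(max_seq_len),
        1.0 / torch.pow(theta, torch.arange(0, dim, 2).to(torch.float32).div(dim)))
    return torch.polar(torch.ones_like(freqs), freqs)

d = 128                                  # illustrative per-head dimension
table = rope_params_f32(1024, d - 4 * (d // 6))
print(table.dtype, table.shape)          # torch.complex64 torch.Size([1024, 22])
```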