vivienfanghua committed
Commit 5e60fc9 · verified · 1 Parent(s): df1d959

Delete wan

This view is limited to 50 files because it contains too many changes; see the raw diff for the full change set.
Files changed (50):
  1. wan/__init__.py +0 -5
  2. wan/__pycache__/__init__.cpython-310.pyc +0 -0
  3. wan/__pycache__/image2video.cpython-310.pyc +0 -0
  4. wan/__pycache__/text2video.cpython-310.pyc +0 -0
  5. wan/__pycache__/textimage2video.cpython-310.pyc +0 -0
  6. wan/configs/__init__.py +0 -39
  7. wan/configs/__pycache__/__init__.cpython-310.pyc +0 -0
  8. wan/configs/__pycache__/shared_config.cpython-310.pyc +0 -0
  9. wan/configs/__pycache__/wan_i2v_A14B.cpython-310.pyc +0 -0
  10. wan/configs/__pycache__/wan_t2v_A14B.cpython-310.pyc +0 -0
  11. wan/configs/__pycache__/wan_ti2v_5B.cpython-310.pyc +0 -0
  12. wan/configs/shared_config.py +0 -20
  13. wan/configs/wan_i2v_A14B.py +0 -37
  14. wan/configs/wan_t2v_A14B.py +0 -37
  15. wan/configs/wan_ti2v_5B.py +0 -36
  16. wan/distributed/__init__.py +0 -1
  17. wan/distributed/__pycache__/__init__.cpython-310.pyc +0 -0
  18. wan/distributed/__pycache__/fsdp.cpython-310.pyc +0 -0
  19. wan/distributed/__pycache__/sequence_parallel.cpython-310.pyc +0 -0
  20. wan/distributed/__pycache__/ulysses.cpython-310.pyc +0 -0
  21. wan/distributed/__pycache__/util.cpython-310.pyc +0 -0
  22. wan/distributed/fsdp.py +0 -43
  23. wan/distributed/sequence_parallel.py +0 -176
  24. wan/distributed/ulysses.py +0 -47
  25. wan/distributed/util.py +0 -51
  26. wan/image2video.py +0 -431
  27. wan/modules/__init__.py +0 -19
  28. wan/modules/__pycache__/__init__.cpython-310.pyc +0 -0
  29. wan/modules/__pycache__/attention.cpython-310.pyc +0 -0
  30. wan/modules/__pycache__/model.cpython-310.pyc +0 -0
  31. wan/modules/__pycache__/t5.cpython-310.pyc +0 -0
  32. wan/modules/__pycache__/tokenizers.cpython-310.pyc +0 -0
  33. wan/modules/__pycache__/vae2_1.cpython-310.pyc +0 -0
  34. wan/modules/__pycache__/vae2_2.cpython-310.pyc +0 -0
  35. wan/modules/attention.py +0 -179
  36. wan/modules/model.py +0 -546
  37. wan/modules/t5.py +0 -513
  38. wan/modules/tokenizers.py +0 -82
  39. wan/modules/vae2_1.py +0 -663
  40. wan/modules/vae2_2.py +0 -1051
  41. wan/text2video.py +0 -378
  42. wan/textimage2video.py +0 -619
  43. wan/utils/__init__.py +0 -12
  44. wan/utils/__pycache__/__init__.cpython-310.pyc +0 -0
  45. wan/utils/__pycache__/fm_solvers.cpython-310.pyc +0 -0
  46. wan/utils/__pycache__/fm_solvers_unipc.cpython-310.pyc +0 -0
  47. wan/utils/__pycache__/utils.cpython-310.pyc +0 -0
  48. wan/utils/fm_solvers.py +0 -859
  49. wan/utils/fm_solvers_unipc.py +0 -802
  50. wan/utils/prompt_extend.py +0 -542
wan/__init__.py DELETED
@@ -1,5 +0,0 @@
1
- # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
2
- from . import configs, distributed, modules
3
- from .image2video import WanI2V
4
- from .text2video import WanT2V
5
- from .textimage2video import WanTI2V
 
wan/__pycache__/__init__.cpython-310.pyc DELETED
Binary file (333 Bytes)
 
wan/__pycache__/image2video.cpython-310.pyc DELETED
Binary file (12.3 kB)
 
wan/__pycache__/text2video.cpython-310.pyc DELETED
Binary file (11.1 kB)
 
wan/__pycache__/textimage2video.cpython-310.pyc DELETED
Binary file (17.5 kB)
 
wan/configs/__init__.py DELETED
@@ -1,39 +0,0 @@
1
- # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
2
- import copy
3
- import os
4
-
5
- os.environ['TOKENIZERS_PARALLELISM'] = 'false'
6
-
7
- from .wan_i2v_A14B import i2v_A14B
8
- from .wan_t2v_A14B import t2v_A14B
9
- from .wan_ti2v_5B import ti2v_5B
10
-
11
- WAN_CONFIGS = {
12
- 't2v-A14B': t2v_A14B,
13
- 'i2v-A14B': i2v_A14B,
14
- 'ti2v-5B': ti2v_5B,
15
- }
16
-
17
- SIZE_CONFIGS = {
18
- '720*1280': (720, 1280),
19
- '1280*720': (1280, 720),
20
- '480*832': (480, 832),
21
- '832*480': (832, 480),
22
- '704*1280': (704, 1280),
23
- '1280*704': (1280, 704)
24
- }
25
-
26
- MAX_AREA_CONFIGS = {
27
- '720*1280': 720 * 1280,
28
- '1280*720': 1280 * 720,
29
- '480*832': 480 * 832,
30
- '832*480': 832 * 480,
31
- '704*1280': 704 * 1280,
32
- '1280*704': 1280 * 704,
33
- }
34
-
35
- SUPPORTED_SIZES = {
36
- 't2v-A14B': ('720*1280', '1280*720', '480*832', '832*480'),
37
- 'i2v-A14B': ('720*1280', '1280*720', '480*832', '832*480'),
38
- 'ti2v-5B': ('704*1280', '1280*704'),
39
- }
 
wan/configs/__pycache__/__init__.cpython-310.pyc DELETED
Binary file (737 Bytes)
 
wan/configs/__pycache__/shared_config.cpython-310.pyc DELETED
Binary file (848 Bytes)
 
wan/configs/__pycache__/wan_i2v_A14B.cpython-310.pyc DELETED
Binary file (968 Bytes)
 
wan/configs/__pycache__/wan_t2v_A14B.cpython-310.pyc DELETED
Binary file (955 Bytes)
 
wan/configs/__pycache__/wan_ti2v_5B.cpython-310.pyc DELETED
Binary file (868 Bytes)
 
wan/configs/shared_config.py DELETED
@@ -1,20 +0,0 @@
1
- # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
2
- import torch
3
- from easydict import EasyDict
4
-
5
- #------------------------ Wan shared config ------------------------#
6
- wan_shared_cfg = EasyDict()
7
-
8
- # t5
9
- wan_shared_cfg.t5_model = 'umt5_xxl'
10
- wan_shared_cfg.t5_dtype = torch.bfloat16
11
- wan_shared_cfg.text_len = 512
12
-
13
- # transformer
14
- wan_shared_cfg.param_dtype = torch.bfloat16
15
-
16
- # inference
17
- wan_shared_cfg.num_train_timesteps = 1000
18
- wan_shared_cfg.sample_fps = 16
19
- wan_shared_cfg.sample_neg_prompt = '色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走'
20
- wan_shared_cfg.frame_num = 81
 
wan/configs/wan_i2v_A14B.py DELETED
@@ -1,37 +0,0 @@
1
- # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
2
- import torch
3
- from easydict import EasyDict
4
-
5
- from .shared_config import wan_shared_cfg
6
-
7
- #------------------------ Wan I2V A14B ------------------------#
8
-
9
- i2v_A14B = EasyDict(__name__='Config: Wan I2V A14B')
10
- i2v_A14B.update(wan_shared_cfg)
11
-
12
- i2v_A14B.t5_checkpoint = 'models_t5_umt5-xxl-enc-bf16.pth'
13
- i2v_A14B.t5_tokenizer = 'google/umt5-xxl'
14
-
15
- # vae
16
- i2v_A14B.vae_checkpoint = 'Wan2.1_VAE.pth'
17
- i2v_A14B.vae_stride = (4, 8, 8)
18
-
19
- # transformer
20
- i2v_A14B.patch_size = (1, 2, 2)
21
- i2v_A14B.dim = 5120
22
- i2v_A14B.ffn_dim = 13824
23
- i2v_A14B.freq_dim = 256
24
- i2v_A14B.num_heads = 40
25
- i2v_A14B.num_layers = 40
26
- i2v_A14B.window_size = (-1, -1)
27
- i2v_A14B.qk_norm = True
28
- i2v_A14B.cross_attn_norm = True
29
- i2v_A14B.eps = 1e-6
30
- i2v_A14B.low_noise_checkpoint = 'low_noise_model'
31
- i2v_A14B.high_noise_checkpoint = 'high_noise_model'
32
-
33
- # inference
34
- i2v_A14B.sample_shift = 5.0
35
- i2v_A14B.sample_steps = 40
36
- i2v_A14B.boundary = 0.900
37
- i2v_A14B.sample_guide_scale = (3.5, 3.5) # low noise, high noise
 
wan/configs/wan_t2v_A14B.py DELETED
@@ -1,37 +0,0 @@
1
- # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
2
- from easydict import EasyDict
3
-
4
- from .shared_config import wan_shared_cfg
5
-
6
- #------------------------ Wan T2V A14B ------------------------#
7
-
8
- t2v_A14B = EasyDict(__name__='Config: Wan T2V A14B')
9
- t2v_A14B.update(wan_shared_cfg)
10
-
11
- # t5
12
- t2v_A14B.t5_checkpoint = 'models_t5_umt5-xxl-enc-bf16.pth'
13
- t2v_A14B.t5_tokenizer = 'google/umt5-xxl'
14
-
15
- # vae
16
- t2v_A14B.vae_checkpoint = 'Wan2.1_VAE.pth'
17
- t2v_A14B.vae_stride = (4, 8, 8)
18
-
19
- # transformer
20
- t2v_A14B.patch_size = (1, 2, 2)
21
- t2v_A14B.dim = 5120
22
- t2v_A14B.ffn_dim = 13824
23
- t2v_A14B.freq_dim = 256
24
- t2v_A14B.num_heads = 40
25
- t2v_A14B.num_layers = 40
26
- t2v_A14B.window_size = (-1, -1)
27
- t2v_A14B.qk_norm = True
28
- t2v_A14B.cross_attn_norm = True
29
- t2v_A14B.eps = 1e-6
30
- t2v_A14B.low_noise_checkpoint = 'low_noise_model'
31
- t2v_A14B.high_noise_checkpoint = 'high_noise_model'
32
-
33
- # inference
34
- t2v_A14B.sample_shift = 12.0
35
- t2v_A14B.sample_steps = 40
36
- t2v_A14B.boundary = 0.875
37
- t2v_A14B.sample_guide_scale = (3.0, 4.0) # low noise, high noise
 
wan/configs/wan_ti2v_5B.py DELETED
@@ -1,36 +0,0 @@
1
- # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
2
- from easydict import EasyDict
3
-
4
- from .shared_config import wan_shared_cfg
5
-
6
- #------------------------ Wan TI2V 5B ------------------------#
7
-
8
- ti2v_5B = EasyDict(__name__='Config: Wan TI2V 5B')
9
- ti2v_5B.update(wan_shared_cfg)
10
-
11
- # t5
12
- ti2v_5B.t5_checkpoint = 'models_t5_umt5-xxl-enc-bf16.pth'
13
- ti2v_5B.t5_tokenizer = 'google/umt5-xxl'
14
-
15
- # vae
16
- ti2v_5B.vae_checkpoint = 'Wan2.2_VAE.pth'
17
- ti2v_5B.vae_stride = (4, 16, 16)
18
-
19
- # transformer
20
- ti2v_5B.patch_size = (1, 2, 2)
21
- ti2v_5B.dim = 3072
22
- ti2v_5B.ffn_dim = 14336
23
- ti2v_5B.freq_dim = 256
24
- ti2v_5B.num_heads = 24
25
- ti2v_5B.num_layers = 30
26
- ti2v_5B.window_size = (-1, -1)
27
- ti2v_5B.qk_norm = True
28
- ti2v_5B.cross_attn_norm = True
29
- ti2v_5B.eps = 1e-6
30
-
31
- # inference
32
- ti2v_5B.sample_fps = 12
33
- ti2v_5B.sample_shift = 5.0
34
- ti2v_5B.sample_steps = 50
35
- ti2v_5B.sample_guide_scale = 5.0
36
- ti2v_5B.frame_num = 121
 
wan/distributed/__init__.py DELETED
@@ -1 +0,0 @@
1
- # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
 
wan/distributed/__pycache__/__init__.cpython-310.pyc DELETED
Binary file (143 Bytes)
 
wan/distributed/__pycache__/fsdp.cpython-310.pyc DELETED
Binary file (1.36 kB)
 
wan/distributed/__pycache__/sequence_parallel.cpython-310.pyc DELETED
Binary file (5.24 kB)
 
wan/distributed/__pycache__/ulysses.cpython-310.pyc DELETED
Binary file (1.23 kB)
 
wan/distributed/__pycache__/util.cpython-310.pyc DELETED
Binary file (1.93 kB)
 
wan/distributed/fsdp.py DELETED
@@ -1,43 +0,0 @@
1
- # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
2
- import gc
3
- from functools import partial
4
-
5
- import torch
6
- from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
7
- from torch.distributed.fsdp import MixedPrecision, ShardingStrategy
8
- from torch.distributed.fsdp.wrap import lambda_auto_wrap_policy
9
- from torch.distributed.utils import _free_storage
10
-
11
-
12
- def shard_model(
13
- model,
14
- device_id,
15
- param_dtype=torch.bfloat16,
16
- reduce_dtype=torch.float32,
17
- buffer_dtype=torch.float32,
18
- process_group=None,
19
- sharding_strategy=ShardingStrategy.FULL_SHARD,
20
- sync_module_states=True,
21
- ):
22
- model = FSDP(
23
- module=model,
24
- process_group=process_group,
25
- sharding_strategy=sharding_strategy,
26
- auto_wrap_policy=partial(
27
- lambda_auto_wrap_policy, lambda_fn=lambda m: m in model.blocks),
28
- mixed_precision=MixedPrecision(
29
- param_dtype=param_dtype,
30
- reduce_dtype=reduce_dtype,
31
- buffer_dtype=buffer_dtype),
32
- device_id=device_id,
33
- sync_module_states=sync_module_states)
34
- return model
35
-
36
-
37
- def free_model(model):
38
- for m in model.modules():
39
- if isinstance(m, FSDP):
40
- _free_storage(m._handle.flat_param.data)
41
- del model
42
- gc.collect()
43
- torch.cuda.empty_cache()
 
wan/distributed/sequence_parallel.py DELETED
@@ -1,176 +0,0 @@
1
- # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
2
- import torch
3
- import torch.cuda.amp as amp
4
-
5
- from ..modules.model import sinusoidal_embedding_1d
6
- from .ulysses import distributed_attention
7
- from .util import gather_forward, get_rank, get_world_size
8
-
9
-
10
- def pad_freqs(original_tensor, target_len):
11
- seq_len, s1, s2 = original_tensor.shape
12
- pad_size = target_len - seq_len
13
- padding_tensor = torch.ones(
14
- pad_size,
15
- s1,
16
- s2,
17
- dtype=original_tensor.dtype,
18
- device=original_tensor.device)
19
- padded_tensor = torch.cat([original_tensor, padding_tensor], dim=0)
20
- return padded_tensor
21
-
22
-
23
- @torch.amp.autocast('cuda', enabled=False)
24
- def rope_apply(x, grid_sizes, freqs):
25
- """
26
- x: [B, L, N, C].
27
- grid_sizes: [B, 3].
28
- freqs: [M, C // 2].
29
- """
30
- s, n, c = x.size(1), x.size(2), x.size(3) // 2
31
- # split freqs
32
- freqs = freqs.split([c - 2 * (c // 3), c // 3, c // 3], dim=1)
33
-
34
- # loop over samples
35
- output = []
36
- for i, (f, h, w) in enumerate(grid_sizes.tolist()):
37
- seq_len = f * h * w
38
-
39
- # precompute multipliers
40
- x_i = torch.view_as_complex(x[i, :s].to(torch.float64).reshape(
41
- s, n, -1, 2))
42
- freqs_i = torch.cat([
43
- freqs[0][:f].view(f, 1, 1, -1).expand(f, h, w, -1),
44
- freqs[1][:h].view(1, h, 1, -1).expand(f, h, w, -1),
45
- freqs[2][:w].view(1, 1, w, -1).expand(f, h, w, -1)
46
- ],
47
- dim=-1).reshape(seq_len, 1, -1)
48
-
49
- # apply rotary embedding
50
- sp_size = get_world_size()
51
- sp_rank = get_rank()
52
- freqs_i = pad_freqs(freqs_i, s * sp_size)
53
- s_per_rank = s
54
- freqs_i_rank = freqs_i[(sp_rank * s_per_rank):((sp_rank + 1) *
55
- s_per_rank), :, :]
56
- x_i = torch.view_as_real(x_i * freqs_i_rank).flatten(2)
57
- x_i = torch.cat([x_i, x[i, s:]])
58
-
59
- # append to collection
60
- output.append(x_i)
61
- return torch.stack(output).float()
62
-
63
-
64
- def sp_dit_forward(
65
- self,
66
- x,
67
- t,
68
- context,
69
- seq_len,
70
- y=None,
71
- ):
72
- """
73
- x: A list of videos each with shape [C, T, H, W].
74
- t: [B].
75
- context: A list of text embeddings each with shape [L, C].
76
- """
77
- if self.model_type == 'i2v':
78
- assert y is not None
79
- # params
80
- device = self.patch_embedding.weight.device
81
- if self.freqs.device != device:
82
- self.freqs = self.freqs.to(device)
83
-
84
- if y is not None:
85
- x = [torch.cat([u, v], dim=0) for u, v in zip(x, y)]
86
-
87
- # embeddings
88
- x = [self.patch_embedding(u.unsqueeze(0)) for u in x]
89
- grid_sizes = torch.stack(
90
- [torch.tensor(u.shape[2:], dtype=torch.long) for u in x])
91
- x = [u.flatten(2).transpose(1, 2) for u in x]
92
- seq_lens = torch.tensor([u.size(1) for u in x], dtype=torch.long)
93
- assert seq_lens.max() <= seq_len
94
- x = torch.cat([
95
- torch.cat([u, u.new_zeros(1, seq_len - u.size(1), u.size(2))], dim=1)
96
- for u in x
97
- ])
98
-
99
- # time embeddings
100
- if t.dim() == 1:
101
- t = t.expand(t.size(0), seq_len)
102
- with torch.amp.autocast('cuda', dtype=torch.float32):
103
- bt = t.size(0)
104
- t = t.flatten()
105
- e = self.time_embedding(
106
- sinusoidal_embedding_1d(self.freq_dim,
107
- t).unflatten(0, (bt, seq_len)).float())
108
- e0 = self.time_projection(e).unflatten(2, (6, self.dim))
109
- assert e.dtype == torch.float32 and e0.dtype == torch.float32
110
-
111
- # context
112
- context_lens = None
113
- context = self.text_embedding(
114
- torch.stack([
115
- torch.cat([u, u.new_zeros(self.text_len - u.size(0), u.size(1))])
116
- for u in context
117
- ]))
118
-
119
- # Context Parallel
120
- x = torch.chunk(x, get_world_size(), dim=1)[get_rank()]
121
- e = torch.chunk(e, get_world_size(), dim=1)[get_rank()]
122
- e0 = torch.chunk(e0, get_world_size(), dim=1)[get_rank()]
123
-
124
- # arguments
125
- kwargs = dict(
126
- e=e0,
127
- seq_lens=seq_lens,
128
- grid_sizes=grid_sizes,
129
- freqs=self.freqs,
130
- context=context,
131
- context_lens=context_lens)
132
-
133
- for block in self.blocks:
134
- x = block(x, **kwargs)
135
-
136
- # head
137
- x = self.head(x, e)
138
-
139
- # Context Parallel
140
- x = gather_forward(x, dim=1)
141
-
142
- # unpatchify
143
- x = self.unpatchify(x, grid_sizes)
144
- return [u.float() for u in x]
145
-
146
-
147
- def sp_attn_forward(self, x, seq_lens, grid_sizes, freqs, dtype=torch.bfloat16):
148
- b, s, n, d = *x.shape[:2], self.num_heads, self.head_dim
149
- half_dtypes = (torch.float16, torch.bfloat16)
150
-
151
- def half(x):
152
- return x if x.dtype in half_dtypes else x.to(dtype)
153
-
154
- # query, key, value function
155
- def qkv_fn(x):
156
- q = self.norm_q(self.q(x)).view(b, s, n, d)
157
- k = self.norm_k(self.k(x)).view(b, s, n, d)
158
- v = self.v(x).view(b, s, n, d)
159
- return q, k, v
160
-
161
- q, k, v = qkv_fn(x)
162
- q = rope_apply(q, grid_sizes, freqs)
163
- k = rope_apply(k, grid_sizes, freqs)
164
-
165
- x = distributed_attention(
166
- half(q),
167
- half(k),
168
- half(v),
169
- seq_lens,
170
- window_size=self.window_size,
171
- )
172
-
173
- # output
174
- x = x.flatten(2)
175
- x = self.o(x)
176
- return x
 
wan/distributed/ulysses.py DELETED
@@ -1,47 +0,0 @@
1
- # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
2
- import torch
3
- import torch.distributed as dist
4
-
5
- from ..modules.attention import flash_attention
6
- from .util import all_to_all
7
-
8
-
9
- def distributed_attention(
10
- q,
11
- k,
12
- v,
13
- seq_lens,
14
- window_size=(-1, -1),
15
- ):
16
- """
17
- Performs distributed attention based on DeepSpeed Ulysses attention mechanism.
18
- please refer to https://arxiv.org/pdf/2309.14509
19
-
20
- Args:
21
- q: [B, Lq // p, Nq, C1].
22
- k: [B, Lk // p, Nk, C1].
23
- v: [B, Lk // p, Nk, C2]. Nq must be divisible by Nk.
24
- seq_lens: [B], length of each sequence in batch
25
- window_size: (left right). If not (-1, -1), apply sliding window local attention.
26
- """
27
- if not dist.is_initialized():
28
- raise ValueError("distributed group should be initialized.")
29
- b = q.shape[0]
30
-
31
- # gather q/k/v sequence
32
- q = all_to_all(q, scatter_dim=2, gather_dim=1)
33
- k = all_to_all(k, scatter_dim=2, gather_dim=1)
34
- v = all_to_all(v, scatter_dim=2, gather_dim=1)
35
-
36
- # apply attention
37
- x = flash_attention(
38
- q,
39
- k,
40
- v,
41
- k_lens=seq_lens,
42
- window_size=window_size,
43
- )
44
-
45
- # scatter q/k/v sequence
46
- x = all_to_all(x, scatter_dim=1, gather_dim=2)
47
- return x
 
wan/distributed/util.py DELETED
@@ -1,51 +0,0 @@
1
- # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
2
- import torch
3
- import torch.distributed as dist
4
-
5
-
6
- def init_distributed_group():
7
- """r initialize sequence parallel group.
8
- """
9
- if not dist.is_initialized():
10
- dist.init_process_group(backend='nccl')
11
-
12
-
13
- def get_rank():
14
- return dist.get_rank()
15
-
16
-
17
- def get_world_size():
18
- return dist.get_world_size()
19
-
20
-
21
- def all_to_all(x, scatter_dim, gather_dim, group=None, **kwargs):
22
- """
23
- `scatter` along one dimension and `gather` along another.
24
- """
25
- world_size = get_world_size()
26
- if world_size > 1:
27
- inputs = [u.contiguous() for u in x.chunk(world_size, dim=scatter_dim)]
28
- outputs = [torch.empty_like(u) for u in inputs]
29
- dist.all_to_all(outputs, inputs, group=group, **kwargs)
30
- x = torch.cat(outputs, dim=gather_dim).contiguous()
31
- return x
32
-
33
-
34
- def all_gather(tensor):
35
- world_size = dist.get_world_size()
36
- if world_size == 1:
37
- return [tensor]
38
- tensor_list = [torch.empty_like(tensor) for _ in range(world_size)]
39
- torch.distributed.all_gather(tensor_list, tensor)
40
- return tensor_list
41
-
42
-
43
- def gather_forward(input, dim):
44
- # skip if world_size == 1
45
- world_size = dist.get_world_size()
46
- if world_size == 1:
47
- return input
48
-
49
- # gather sequence
50
- output = all_gather(input)
51
- return torch.cat(output, dim=dim).contiguous()
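
The scatter/gather shape bookkeeping done by all_to_all above is the core of the Ulysses attention in ulysses.py: each rank trades a slice of the head dimension for the full sequence. A single-process illustration (not part of the commit) that simulates the exchange with chunk/cat under a hypothetical world size:

    import torch

    world_size = 4                       # hypothetical sequence-parallel group size
    B, L, N, C = 2, 32, 8, 64            # full sequence length L, N attention heads
    # every rank starts with a sequence shard of shape [B, L // p, N, C]
    shards = list(torch.randn(B, L, N, C).chunk(world_size, dim=1))

    def simulated_all_to_all(shards, scatter_dim, gather_dim):
        # rank r sends chunk j of scatter_dim to rank j, receives chunk r from every
        # rank, and concatenates what it received along gather_dim
        pieces = [s.chunk(world_size, dim=scatter_dim) for s in shards]
        return [torch.cat([pieces[src][dst] for src in range(world_size)], dim=gather_dim)
                for dst in range(world_size)]

    out = simulated_all_to_all(shards, scatter_dim=2, gather_dim=1)
    print(shards[0].shape, '->', out[0].shape)   # [2, 8, 8, 64] -> [2, 32, 2, 64]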
 
wan/image2video.py DELETED
@@ -1,431 +0,0 @@
1
- # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
2
- import gc
3
- import logging
4
- import math
5
- import os
6
- import random
7
- import sys
8
- import types
9
- from contextlib import contextmanager
10
- from functools import partial
11
-
12
- import numpy as np
13
- import torch
14
- import torch.cuda.amp as amp
15
- import torch.distributed as dist
16
- import torchvision.transforms.functional as TF
17
- from tqdm import tqdm
18
-
19
- from .distributed.fsdp import shard_model
20
- from .distributed.sequence_parallel import sp_attn_forward, sp_dit_forward
21
- from .distributed.util import get_world_size
22
- from .modules.model import WanModel
23
- from .modules.t5 import T5EncoderModel
24
- from .modules.vae2_1 import Wan2_1_VAE
25
- from .utils.fm_solvers import (
26
- FlowDPMSolverMultistepScheduler,
27
- get_sampling_sigmas,
28
- retrieve_timesteps,
29
- )
30
- from .utils.fm_solvers_unipc import FlowUniPCMultistepScheduler
31
-
32
-
33
- class WanI2V:
34
-
35
- def __init__(
36
- self,
37
- config,
38
- checkpoint_dir,
39
- device_id=0,
40
- rank=0,
41
- t5_fsdp=False,
42
- dit_fsdp=False,
43
- use_sp=False,
44
- t5_cpu=False,
45
- init_on_cpu=True,
46
- convert_model_dtype=False,
47
- ):
48
- r"""
49
- Initializes the image-to-video generation model components.
50
-
51
- Args:
52
- config (EasyDict):
53
- Object containing model parameters initialized from config.py
54
- checkpoint_dir (`str`):
55
- Path to directory containing model checkpoints
56
- device_id (`int`, *optional*, defaults to 0):
57
- Id of target GPU device
58
- rank (`int`, *optional*, defaults to 0):
59
- Process rank for distributed training
60
- t5_fsdp (`bool`, *optional*, defaults to False):
61
- Enable FSDP sharding for T5 model
62
- dit_fsdp (`bool`, *optional*, defaults to False):
63
- Enable FSDP sharding for DiT model
64
- use_sp (`bool`, *optional*, defaults to False):
65
- Enable distribution strategy of sequence parallel.
66
- t5_cpu (`bool`, *optional*, defaults to False):
67
- Whether to place T5 model on CPU. Only works without t5_fsdp.
68
- init_on_cpu (`bool`, *optional*, defaults to True):
69
- Enable initializing Transformer Model on CPU. Only works without FSDP or USP.
70
- convert_model_dtype (`bool`, *optional*, defaults to False):
71
- Convert DiT model parameters dtype to 'config.param_dtype'.
72
- Only works without FSDP.
73
- """
74
- self.device = torch.device(f"cuda:{device_id}")
75
- self.config = config
76
- self.rank = rank
77
- self.t5_cpu = t5_cpu
78
- self.init_on_cpu = init_on_cpu
79
-
80
- self.num_train_timesteps = config.num_train_timesteps
81
- self.boundary = config.boundary
82
- self.param_dtype = config.param_dtype
83
-
84
- if t5_fsdp or dit_fsdp or use_sp:
85
- self.init_on_cpu = False
86
-
87
- shard_fn = partial(shard_model, device_id=device_id)
88
- self.text_encoder = T5EncoderModel(
89
- text_len=config.text_len,
90
- dtype=config.t5_dtype,
91
- device=torch.device('cpu'),
92
- checkpoint_path=os.path.join(checkpoint_dir, config.t5_checkpoint),
93
- tokenizer_path=os.path.join(checkpoint_dir, config.t5_tokenizer),
94
- shard_fn=shard_fn if t5_fsdp else None,
95
- )
96
-
97
- self.vae_stride = config.vae_stride
98
- self.patch_size = config.patch_size
99
- self.vae = Wan2_1_VAE(
100
- vae_pth=os.path.join(checkpoint_dir, config.vae_checkpoint),
101
- device=self.device)
102
-
103
- logging.info(f"Creating WanModel from {checkpoint_dir}")
104
- self.low_noise_model = WanModel.from_pretrained(
105
- checkpoint_dir, subfolder=config.low_noise_checkpoint)
106
- self.low_noise_model = self._configure_model(
107
- model=self.low_noise_model,
108
- use_sp=use_sp,
109
- dit_fsdp=dit_fsdp,
110
- shard_fn=shard_fn,
111
- convert_model_dtype=convert_model_dtype)
112
-
113
- self.high_noise_model = WanModel.from_pretrained(
114
- checkpoint_dir, subfolder=config.high_noise_checkpoint)
115
- self.high_noise_model = self._configure_model(
116
- model=self.high_noise_model,
117
- use_sp=use_sp,
118
- dit_fsdp=dit_fsdp,
119
- shard_fn=shard_fn,
120
- convert_model_dtype=convert_model_dtype)
121
- if use_sp:
122
- self.sp_size = get_world_size()
123
- else:
124
- self.sp_size = 1
125
-
126
- self.sample_neg_prompt = config.sample_neg_prompt
127
-
128
- def _configure_model(self, model, use_sp, dit_fsdp, shard_fn,
129
- convert_model_dtype):
130
- """
131
- Configures a model object. This includes setting evaluation modes,
132
- applying distributed parallel strategy, and handling device placement.
133
-
134
- Args:
135
- model (torch.nn.Module):
136
- The model instance to configure.
137
- use_sp (`bool`):
138
- Enable distribution strategy of sequence parallel.
139
- dit_fsdp (`bool`):
140
- Enable FSDP sharding for DiT model.
141
- shard_fn (callable):
142
- The function to apply FSDP sharding.
143
- convert_model_dtype (`bool`):
144
- Convert DiT model parameters dtype to 'config.param_dtype'.
145
- Only works without FSDP.
146
-
147
- Returns:
148
- torch.nn.Module:
149
- The configured model.
150
- """
151
- model.eval().requires_grad_(False)
152
-
153
- if use_sp:
154
- for block in model.blocks:
155
- block.self_attn.forward = types.MethodType(
156
- sp_attn_forward, block.self_attn)
157
- model.forward = types.MethodType(sp_dit_forward, model)
158
-
159
- if dist.is_initialized():
160
- dist.barrier()
161
-
162
- if dit_fsdp:
163
- model = shard_fn(model)
164
- else:
165
- if convert_model_dtype:
166
- model.to(self.param_dtype)
167
- if not self.init_on_cpu:
168
- model.to(self.device)
169
-
170
- return model
171
-
172
- def _prepare_model_for_timestep(self, t, boundary, offload_model):
173
- r"""
174
- Prepares and returns the required model for the current timestep.
175
-
176
- Args:
177
- t (torch.Tensor):
178
- current timestep.
179
- boundary (`int`):
180
- The timestep threshold. If `t` is at or above this value,
181
- the `high_noise_model` is considered as the required model.
182
- offload_model (`bool`):
183
- A flag intended to control the offloading behavior.
184
-
185
- Returns:
186
- torch.nn.Module:
187
- The active model on the target device for the current timestep.
188
- """
189
- if t.item() >= boundary:
190
- required_model_name = 'high_noise_model'
191
- offload_model_name = 'low_noise_model'
192
- else:
193
- required_model_name = 'low_noise_model'
194
- offload_model_name = 'high_noise_model'
195
- if offload_model or self.init_on_cpu:
196
- if next(getattr(
197
- self,
198
- offload_model_name).parameters()).device.type == 'cuda':
199
- getattr(self, offload_model_name).to('cpu')
200
- if next(getattr(
201
- self,
202
- required_model_name).parameters()).device.type == 'cpu':
203
- getattr(self, required_model_name).to(self.device)
204
- return getattr(self, required_model_name)
205
-
206
- def generate(self,
207
- input_prompt,
208
- img,
209
- max_area=720 * 1280,
210
- frame_num=81,
211
- shift=5.0,
212
- sample_solver='unipc',
213
- sampling_steps=40,
214
- guide_scale=5.0,
215
- n_prompt="",
216
- seed=-1,
217
- offload_model=True):
218
- r"""
219
- Generates video frames from input image and text prompt using diffusion process.
220
-
221
- Args:
222
- input_prompt (`str`):
223
- Text prompt for content generation.
224
- img (PIL.Image.Image):
225
- Input image tensor. Shape: [3, H, W]
226
- max_area (`int`, *optional*, defaults to 720*1280):
227
- Maximum pixel area for latent space calculation. Controls video resolution scaling
228
- frame_num (`int`, *optional*, defaults to 81):
229
- How many frames to sample from a video. The number should be 4n+1
230
- shift (`float`, *optional*, defaults to 5.0):
231
- Noise schedule shift parameter. Affects temporal dynamics
232
- [NOTE]: If you want to generate a 480p video, it is recommended to set the shift value to 3.0.
233
- sample_solver (`str`, *optional*, defaults to 'unipc'):
234
- Solver used to sample the video.
235
- sampling_steps (`int`, *optional*, defaults to 40):
236
- Number of diffusion sampling steps. Higher values improve quality but slow generation
237
- guide_scale (`float` or tuple[`float`], *optional*, defaults 5.0):
238
- Classifier-free guidance scale. Controls prompt adherence vs. creativity.
239
- If tuple, the first guide_scale will be used for low noise model and
240
- the second guide_scale will be used for high noise model.
241
- n_prompt (`str`, *optional*, defaults to ""):
242
- Negative prompt for content exclusion. If not given, use `config.sample_neg_prompt`
243
- seed (`int`, *optional*, defaults to -1):
244
- Random seed for noise generation. If -1, use random seed
245
- offload_model (`bool`, *optional*, defaults to True):
246
- If True, offloads models to CPU during generation to save VRAM
247
-
248
- Returns:
249
- torch.Tensor:
250
- Generated video frames tensor. Dimensions: (C, N H, W) where:
251
- - C: Color channels (3 for RGB)
252
- - N: Number of frames (81)
253
- - H: Frame height (from max_area)
254
- - W: Frame width from max_area)
255
- """
256
- # preprocess
257
- guide_scale = (guide_scale, guide_scale) if isinstance(
258
- guide_scale, float) else guide_scale
259
- img = TF.to_tensor(img).sub_(0.5).div_(0.5).to(self.device)
260
-
261
- F = frame_num
262
- h, w = img.shape[1:]
263
- aspect_ratio = h / w
264
- lat_h = round(
265
- np.sqrt(max_area * aspect_ratio) // self.vae_stride[1] //
266
- self.patch_size[1] * self.patch_size[1])
267
- lat_w = round(
268
- np.sqrt(max_area / aspect_ratio) // self.vae_stride[2] //
269
- self.patch_size[2] * self.patch_size[2])
270
- h = lat_h * self.vae_stride[1]
271
- w = lat_w * self.vae_stride[2]
272
-
273
- max_seq_len = ((F - 1) // self.vae_stride[0] + 1) * lat_h * lat_w // (
274
- self.patch_size[1] * self.patch_size[2])
275
- max_seq_len = int(math.ceil(max_seq_len / self.sp_size)) * self.sp_size
276
-
277
- seed = seed if seed >= 0 else random.randint(0, sys.maxsize)
278
- seed_g = torch.Generator(device=self.device)
279
- seed_g.manual_seed(seed)
280
- noise = torch.randn(
281
- 16,
282
- 21,
283
- lat_h,
284
- lat_w,
285
- dtype=torch.float32,
286
- generator=seed_g,
287
- device=self.device)
288
-
289
- msk = torch.ones(1, 81, lat_h, lat_w, device=self.device)
290
- msk[:, 1:] = 0
291
- msk = torch.concat([
292
- torch.repeat_interleave(msk[:, 0:1], repeats=4, dim=1), msk[:, 1:]
293
- ],
294
- dim=1)
295
- msk = msk.view(1, msk.shape[1] // 4, 4, lat_h, lat_w)
296
- msk = msk.transpose(1, 2)[0]
297
-
298
- if n_prompt == "":
299
- n_prompt = self.sample_neg_prompt
300
-
301
- # preprocess
302
- if not self.t5_cpu:
303
- self.text_encoder.model.to(self.device)
304
- context = self.text_encoder([input_prompt], self.device)
305
- context_null = self.text_encoder([n_prompt], self.device)
306
- if offload_model:
307
- self.text_encoder.model.cpu()
308
- else:
309
- context = self.text_encoder([input_prompt], torch.device('cpu'))
310
- context_null = self.text_encoder([n_prompt], torch.device('cpu'))
311
- context = [t.to(self.device) for t in context]
312
- context_null = [t.to(self.device) for t in context_null]
313
-
314
- y = self.vae.encode([
315
- torch.concat([
316
- torch.nn.functional.interpolate(
317
- img[None].cpu(), size=(h, w), mode='bicubic').transpose(
318
- 0, 1),
319
- torch.zeros(3, 80, h, w)
320
- ],
321
- dim=1).to(self.device)
322
- ])[0]
323
- y = torch.concat([msk, y])
324
-
325
- @contextmanager
326
- def noop_no_sync():
327
- yield
328
-
329
- no_sync_low_noise = getattr(self.low_noise_model, 'no_sync',
330
- noop_no_sync)
331
- no_sync_high_noise = getattr(self.high_noise_model, 'no_sync',
332
- noop_no_sync)
333
-
334
- # evaluation mode
335
- with (
336
- torch.amp.autocast('cuda', dtype=self.param_dtype),
337
- torch.no_grad(),
338
- no_sync_low_noise(),
339
- no_sync_high_noise(),
340
- ):
341
- boundary = self.boundary * self.num_train_timesteps
342
-
343
- if sample_solver == 'unipc':
344
- sample_scheduler = FlowUniPCMultistepScheduler(
345
- num_train_timesteps=self.num_train_timesteps,
346
- shift=1,
347
- use_dynamic_shifting=False)
348
- sample_scheduler.set_timesteps(
349
- sampling_steps, device=self.device, shift=shift)
350
- timesteps = sample_scheduler.timesteps
351
- elif sample_solver == 'dpm++':
352
- sample_scheduler = FlowDPMSolverMultistepScheduler(
353
- num_train_timesteps=self.num_train_timesteps,
354
- shift=1,
355
- use_dynamic_shifting=False)
356
- sampling_sigmas = get_sampling_sigmas(sampling_steps, shift)
357
- timesteps, _ = retrieve_timesteps(
358
- sample_scheduler,
359
- device=self.device,
360
- sigmas=sampling_sigmas)
361
- else:
362
- raise NotImplementedError("Unsupported solver.")
363
-
364
- # sample videos
365
- latent = noise
366
-
367
- arg_c = {
368
- 'context': [context[0]],
369
- 'seq_len': max_seq_len,
370
- 'y': [y],
371
- }
372
-
373
- arg_null = {
374
- 'context': context_null,
375
- 'seq_len': max_seq_len,
376
- 'y': [y],
377
- }
378
-
379
- if offload_model:
380
- torch.cuda.empty_cache()
381
-
382
- for _, t in enumerate(tqdm(timesteps)):
383
- latent_model_input = [latent.to(self.device)]
384
- timestep = [t]
385
-
386
- timestep = torch.stack(timestep).to(self.device)
387
-
388
- model = self._prepare_model_for_timestep(
389
- t, boundary, offload_model)
390
- sample_guide_scale = guide_scale[1] if t.item(
391
- ) >= boundary else guide_scale[0]
392
-
393
- noise_pred_cond = model(
394
- latent_model_input, t=timestep, **arg_c)[0]
395
- if offload_model:
396
- torch.cuda.empty_cache()
397
- noise_pred_uncond = model(
398
- latent_model_input, t=timestep, **arg_null)[0]
399
- if offload_model:
400
- torch.cuda.empty_cache()
401
- noise_pred = noise_pred_uncond + sample_guide_scale * (
402
- noise_pred_cond - noise_pred_uncond)
403
-
404
- temp_x0 = sample_scheduler.step(
405
- noise_pred.unsqueeze(0),
406
- t,
407
- latent.unsqueeze(0),
408
- return_dict=False,
409
- generator=seed_g)[0]
410
- latent = temp_x0.squeeze(0)
411
-
412
- x0 = [latent]
413
- del latent_model_input, timestep
414
-
415
- if offload_model:
416
- self.low_noise_model.cpu()
417
- self.high_noise_model.cpu()
418
- torch.cuda.empty_cache()
419
-
420
- if self.rank == 0:
421
- videos = self.vae.decode(x0)
422
-
423
- del noise, latent, x0
424
- del sample_scheduler
425
- if offload_model:
426
- gc.collect()
427
- torch.cuda.synchronize()
428
- if dist.is_initialized():
429
- dist.barrier()
430
-
431
- return videos[0] if self.rank == 0 else None
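
A small illustration (not part of the commit) of the expert-switching rule implemented by _prepare_model_for_timestep above: timesteps at or above boundary * num_train_timesteps are handled by the high-noise model, the rest by the low-noise model, and generate() picks the matching entry of guide_scale. Values are taken from the i2v_A14B config shown earlier:

    num_train_timesteps = 1000
    boundary = 0.900                 # i2v_A14B.boundary
    guide_scale = (3.5, 3.5)         # (low-noise scale, high-noise scale)

    threshold = boundary * num_train_timesteps
    for t in (999, 950, 900, 899, 500, 10):
        use_high = t >= threshold
        scale = guide_scale[1] if use_high else guide_scale[0]
        print(t, 'high_noise_model' if use_high else 'low_noise_model', scale)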
 
wan/modules/__init__.py DELETED
@@ -1,19 +0,0 @@
1
- # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
2
- from .attention import flash_attention
3
- from .model import WanModel
4
- from .t5 import T5Decoder, T5Encoder, T5EncoderModel, T5Model
5
- from .tokenizers import HuggingfaceTokenizer
6
- from .vae2_1 import Wan2_1_VAE
7
- from .vae2_2 import Wan2_2_VAE
8
-
9
- __all__ = [
10
- 'Wan2_1_VAE',
11
- 'Wan2_2_VAE',
12
- 'WanModel',
13
- 'T5Model',
14
- 'T5Encoder',
15
- 'T5Decoder',
16
- 'T5EncoderModel',
17
- 'HuggingfaceTokenizer',
18
- 'flash_attention',
19
- ]
 
wan/modules/__pycache__/__init__.cpython-310.pyc DELETED
Binary file (528 Bytes)
 
wan/modules/__pycache__/attention.cpython-310.pyc DELETED
Binary file (3.95 kB)
 
wan/modules/__pycache__/model.cpython-310.pyc DELETED
Binary file (16.9 kB)
 
wan/modules/__pycache__/t5.cpython-310.pyc DELETED
Binary file (12.9 kB)
 
wan/modules/__pycache__/tokenizers.cpython-310.pyc DELETED
Binary file (2.55 kB)
 
wan/modules/__pycache__/vae2_1.cpython-310.pyc DELETED
Binary file (16.9 kB)
 
wan/modules/__pycache__/vae2_2.cpython-310.pyc DELETED
Binary file (22.1 kB)
 
wan/modules/attention.py DELETED
@@ -1,179 +0,0 @@
1
- # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
2
- import torch
3
-
4
- try:
5
- import flash_attn_interface
6
- FLASH_ATTN_3_AVAILABLE = True
7
- except ModuleNotFoundError:
8
- FLASH_ATTN_3_AVAILABLE = False
9
-
10
- try:
11
- import flash_attn
12
- FLASH_ATTN_2_AVAILABLE = True
13
- except ModuleNotFoundError:
14
- FLASH_ATTN_2_AVAILABLE = False
15
-
16
- import warnings
17
-
18
- __all__ = [
19
- 'flash_attention',
20
- 'attention',
21
- ]
22
-
23
-
24
- def flash_attention(
25
- q,
26
- k,
27
- v,
28
- q_lens=None,
29
- k_lens=None,
30
- dropout_p=0.,
31
- softmax_scale=None,
32
- q_scale=None,
33
- causal=False,
34
- window_size=(-1, -1),
35
- deterministic=False,
36
- dtype=torch.bfloat16,
37
- version=None,
38
- ):
39
- """
40
- q: [B, Lq, Nq, C1].
41
- k: [B, Lk, Nk, C1].
42
- v: [B, Lk, Nk, C2]. Nq must be divisible by Nk.
43
- q_lens: [B].
44
- k_lens: [B].
45
- dropout_p: float. Dropout probability.
46
- softmax_scale: float. The scaling of QK^T before applying softmax.
47
- causal: bool. Whether to apply causal attention mask.
48
- window_size: (left right). If not (-1, -1), apply sliding window local attention.
49
- deterministic: bool. If True, slightly slower and uses more memory.
50
- dtype: torch.dtype. Apply when dtype of q/k/v is not float16/bfloat16.
51
- """
52
- half_dtypes = (torch.float16, torch.bfloat16)
53
- assert dtype in half_dtypes
54
- assert q.device.type == 'cuda' and q.size(-1) <= 256
55
-
56
- # params
57
- b, lq, lk, out_dtype = q.size(0), q.size(1), k.size(1), q.dtype
58
-
59
- def half(x):
60
- return x if x.dtype in half_dtypes else x.to(dtype)
61
-
62
- # preprocess query
63
- if q_lens is None:
64
- q = half(q.flatten(0, 1))
65
- q_lens = torch.tensor(
66
- [lq] * b, dtype=torch.int32).to(
67
- device=q.device, non_blocking=True)
68
- else:
69
- q = half(torch.cat([u[:v] for u, v in zip(q, q_lens)]))
70
-
71
- # preprocess key, value
72
- if k_lens is None:
73
- k = half(k.flatten(0, 1))
74
- v = half(v.flatten(0, 1))
75
- k_lens = torch.tensor(
76
- [lk] * b, dtype=torch.int32).to(
77
- device=k.device, non_blocking=True)
78
- else:
79
- k = half(torch.cat([u[:v] for u, v in zip(k, k_lens)]))
80
- v = half(torch.cat([u[:v] for u, v in zip(v, k_lens)]))
81
-
82
- q = q.to(v.dtype)
83
- k = k.to(v.dtype)
84
-
85
- if q_scale is not None:
86
- q = q * q_scale
87
-
88
- if version is not None and version == 3 and not FLASH_ATTN_3_AVAILABLE:
89
- warnings.warn(
90
- 'Flash attention 3 is not available, use flash attention 2 instead.'
91
- )
92
-
93
- # apply attention
94
- if (version is None or version == 3) and FLASH_ATTN_3_AVAILABLE:
95
- # Note: dropout_p, window_size are not supported in FA3 now.
96
- x = flash_attn_interface.flash_attn_varlen_func(
97
- q=q,
98
- k=k,
99
- v=v,
100
- cu_seqlens_q=torch.cat([q_lens.new_zeros([1]), q_lens]).cumsum(
101
- 0, dtype=torch.int32).to(q.device, non_blocking=True),
102
- cu_seqlens_k=torch.cat([k_lens.new_zeros([1]), k_lens]).cumsum(
103
- 0, dtype=torch.int32).to(q.device, non_blocking=True),
104
- seqused_q=None,
105
- seqused_k=None,
106
- max_seqlen_q=lq,
107
- max_seqlen_k=lk,
108
- softmax_scale=softmax_scale,
109
- causal=causal,
110
- deterministic=deterministic)[0].unflatten(0, (b, lq))
111
- else:
112
- assert FLASH_ATTN_2_AVAILABLE
113
- x = flash_attn.flash_attn_varlen_func(
114
- q=q,
115
- k=k,
116
- v=v,
117
- cu_seqlens_q=torch.cat([q_lens.new_zeros([1]), q_lens]).cumsum(
118
- 0, dtype=torch.int32).to(q.device, non_blocking=True),
119
- cu_seqlens_k=torch.cat([k_lens.new_zeros([1]), k_lens]).cumsum(
120
- 0, dtype=torch.int32).to(q.device, non_blocking=True),
121
- max_seqlen_q=lq,
122
- max_seqlen_k=lk,
123
- dropout_p=dropout_p,
124
- softmax_scale=softmax_scale,
125
- causal=causal,
126
- window_size=window_size,
127
- deterministic=deterministic).unflatten(0, (b, lq))
128
-
129
- # output
130
- return x.type(out_dtype)
131
-
132
-
133
- def attention(
134
- q,
135
- k,
136
- v,
137
- q_lens=None,
138
- k_lens=None,
139
- dropout_p=0.,
140
- softmax_scale=None,
141
- q_scale=None,
142
- causal=False,
143
- window_size=(-1, -1),
144
- deterministic=False,
145
- dtype=torch.bfloat16,
146
- fa_version=None,
147
- ):
148
- if FLASH_ATTN_2_AVAILABLE or FLASH_ATTN_3_AVAILABLE:
149
- return flash_attention(
150
- q=q,
151
- k=k,
152
- v=v,
153
- q_lens=q_lens,
154
- k_lens=k_lens,
155
- dropout_p=dropout_p,
156
- softmax_scale=softmax_scale,
157
- q_scale=q_scale,
158
- causal=causal,
159
- window_size=window_size,
160
- deterministic=deterministic,
161
- dtype=dtype,
162
- version=fa_version,
163
- )
164
- else:
165
- if q_lens is not None or k_lens is not None:
166
- warnings.warn(
167
- 'Padding mask is disabled when using scaled_dot_product_attention. It can have a significant impact on performance.'
168
- )
169
- attn_mask = None
170
-
171
- q = q.transpose(1, 2).to(dtype)
172
- k = k.transpose(1, 2).to(dtype)
173
- v = v.transpose(1, 2).to(dtype)
174
-
175
- out = torch.nn.functional.scaled_dot_product_attention(
176
- q, k, v, attn_mask=attn_mask, is_causal=causal, dropout_p=dropout_p)
177
-
178
- out = out.transpose(1, 2).contiguous()
179
- return out
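
When neither flash-attention 2 nor 3 is installed, attention() above falls back to PyTorch SDPA. A minimal sketch (not part of the commit) of that fallback path and the layout convention it assumes, with illustrative shapes:

    import torch
    import torch.nn.functional as F

    B, L, N, D = 1, 16, 8, 64                    # batch, sequence, heads, head_dim
    q = torch.randn(B, L, N, D)
    k = torch.randn(B, L, N, D)
    v = torch.randn(B, L, N, D)

    # SDPA expects [B, heads, L, head_dim], so transpose in and back out,
    # mirroring the non-flash branch of attention()
    out = F.scaled_dot_product_attention(
        q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2),
        attn_mask=None, dropout_p=0.0, is_causal=False)
    out = out.transpose(1, 2).contiguous()       # back to [B, L, N, head_dim]
    print(out.shape)                             # torch.Size([1, 16, 8, 64])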
 
wan/modules/model.py DELETED
@@ -1,546 +0,0 @@
1
- # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
2
- import math
3
-
4
- import torch
5
- import torch.nn as nn
6
- from diffusers.configuration_utils import ConfigMixin, register_to_config
7
- from diffusers.models.modeling_utils import ModelMixin
8
-
9
- from .attention import flash_attention
10
-
11
- __all__ = ['WanModel']
12
-
13
-
14
- def sinusoidal_embedding_1d(dim, position):
15
- # preprocess
16
- assert dim % 2 == 0
17
- half = dim // 2
18
- position = position.type(torch.float64)
19
-
20
- # calculation
21
- sinusoid = torch.outer(
22
- position, torch.pow(10000, -torch.arange(half).to(position).div(half)))
23
- x = torch.cat([torch.cos(sinusoid), torch.sin(sinusoid)], dim=1)
24
- return x
25
-
26
-
27
- @torch.amp.autocast('cuda', enabled=False)
28
- def rope_params(max_seq_len, dim, theta=10000):
29
- assert dim % 2 == 0
30
- freqs = torch.outer(
31
- torch.arange(max_seq_len),
32
- 1.0 / torch.pow(theta,
33
- torch.arange(0, dim, 2).to(torch.float64).div(dim)))
34
- freqs = torch.polar(torch.ones_like(freqs), freqs)
35
- return freqs
36
-
37
-
38
- @torch.amp.autocast('cuda', enabled=False)
39
- def rope_apply(x, grid_sizes, freqs):
40
- n, c = x.size(2), x.size(3) // 2
41
-
42
- # split freqs
43
- freqs = freqs.split([c - 2 * (c // 3), c // 3, c // 3], dim=1)
44
-
45
- # loop over samples
46
- output = []
47
- for i, (f, h, w) in enumerate(grid_sizes.tolist()):
48
- seq_len = f * h * w
49
-
50
- # precompute multipliers
51
- x_i = torch.view_as_complex(x[i, :seq_len].to(torch.float64).reshape(
52
- seq_len, n, -1, 2))
53
- freqs_i = torch.cat([
54
- freqs[0][:f].view(f, 1, 1, -1).expand(f, h, w, -1),
55
- freqs[1][:h].view(1, h, 1, -1).expand(f, h, w, -1),
56
- freqs[2][:w].view(1, 1, w, -1).expand(f, h, w, -1)
57
- ],
58
- dim=-1).reshape(seq_len, 1, -1)
59
-
60
- # apply rotary embedding
61
- x_i = torch.view_as_real(x_i * freqs_i).flatten(2)
62
- x_i = torch.cat([x_i, x[i, seq_len:]])
63
-
64
- # append to collection
65
- output.append(x_i)
66
- return torch.stack(output).float()
67
-
68
-
69
- class WanRMSNorm(nn.Module):
70
-
71
- def __init__(self, dim, eps=1e-5):
72
- super().__init__()
73
- self.dim = dim
74
- self.eps = eps
75
- self.weight = nn.Parameter(torch.ones(dim))
76
-
77
- def forward(self, x):
78
- r"""
79
- Args:
80
- x(Tensor): Shape [B, L, C]
81
- """
82
- return self._norm(x.float()).type_as(x) * self.weight
83
-
84
- def _norm(self, x):
85
- return x * torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + self.eps)
86
-
87
-
88
- class WanLayerNorm(nn.LayerNorm):
89
-
90
- def __init__(self, dim, eps=1e-6, elementwise_affine=False):
91
- super().__init__(dim, elementwise_affine=elementwise_affine, eps=eps)
92
-
93
- def forward(self, x):
94
- r"""
95
- Args:
96
- x(Tensor): Shape [B, L, C]
97
- """
98
- return super().forward(x.float()).type_as(x)
99
-
100
-
101
- class WanSelfAttention(nn.Module):
102
-
103
- def __init__(self,
104
- dim,
105
- num_heads,
106
- window_size=(-1, -1),
107
- qk_norm=True,
108
- eps=1e-6):
109
- assert dim % num_heads == 0
110
- super().__init__()
111
- self.dim = dim
112
- self.num_heads = num_heads
113
- self.head_dim = dim // num_heads
114
- self.window_size = window_size
115
- self.qk_norm = qk_norm
116
- self.eps = eps
117
-
118
- # layers
119
- self.q = nn.Linear(dim, dim)
120
- self.k = nn.Linear(dim, dim)
121
- self.v = nn.Linear(dim, dim)
122
- self.o = nn.Linear(dim, dim)
123
- self.norm_q = WanRMSNorm(dim, eps=eps) if qk_norm else nn.Identity()
124
- self.norm_k = WanRMSNorm(dim, eps=eps) if qk_norm else nn.Identity()
125
-
126
- def forward(self, x, seq_lens, grid_sizes, freqs):
127
- r"""
128
- Args:
129
- x(Tensor): Shape [B, L, num_heads, C / num_heads]
130
- seq_lens(Tensor): Shape [B]
131
- grid_sizes(Tensor): Shape [B, 3], the second dimension contains (F, H, W)
132
- freqs(Tensor): Rope freqs, shape [1024, C / num_heads / 2]
133
- """
134
- b, s, n, d = *x.shape[:2], self.num_heads, self.head_dim
135
-
136
- # query, key, value function
137
- def qkv_fn(x):
138
- q = self.norm_q(self.q(x)).view(b, s, n, d)
139
- k = self.norm_k(self.k(x)).view(b, s, n, d)
140
- v = self.v(x).view(b, s, n, d)
141
- return q, k, v
142
-
143
- q, k, v = qkv_fn(x)
144
-
145
- x = flash_attention(
146
- q=rope_apply(q, grid_sizes, freqs),
147
- k=rope_apply(k, grid_sizes, freqs),
148
- v=v,
149
- k_lens=seq_lens,
150
- window_size=self.window_size)
151
-
152
- # output
153
- x = x.flatten(2)
154
- x = self.o(x)
155
- return x
156
-
157
-
158
- class WanCrossAttention(WanSelfAttention):
159
-
160
- def forward(self, x, context, context_lens):
161
- r"""
162
- Args:
163
- x(Tensor): Shape [B, L1, C]
164
- context(Tensor): Shape [B, L2, C]
165
- context_lens(Tensor): Shape [B]
166
- """
167
- b, n, d = x.size(0), self.num_heads, self.head_dim
168
-
169
- # compute query, key, value
170
- q = self.norm_q(self.q(x)).view(b, -1, n, d)
171
- k = self.norm_k(self.k(context)).view(b, -1, n, d)
172
- v = self.v(context).view(b, -1, n, d)
173
-
174
- # compute attention
175
- x = flash_attention(q, k, v, k_lens=context_lens)
176
-
177
- # output
178
- x = x.flatten(2)
179
- x = self.o(x)
180
- return x
181
-
182
-
183
- class WanAttentionBlock(nn.Module):
184
-
185
- def __init__(self,
186
- dim,
187
- ffn_dim,
188
- num_heads,
189
- window_size=(-1, -1),
190
- qk_norm=True,
191
- cross_attn_norm=False,
192
- eps=1e-6):
193
- super().__init__()
194
- self.dim = dim
195
- self.ffn_dim = ffn_dim
196
- self.num_heads = num_heads
197
- self.window_size = window_size
198
- self.qk_norm = qk_norm
199
- self.cross_attn_norm = cross_attn_norm
200
- self.eps = eps
201
-
202
- # layers
203
- self.norm1 = WanLayerNorm(dim, eps)
204
- self.self_attn = WanSelfAttention(dim, num_heads, window_size, qk_norm,
205
- eps)
206
- self.norm3 = WanLayerNorm(
207
- dim, eps,
208
- elementwise_affine=True) if cross_attn_norm else nn.Identity()
209
- self.cross_attn = WanCrossAttention(dim, num_heads, (-1, -1), qk_norm,
210
- eps)
211
- self.norm2 = WanLayerNorm(dim, eps)
212
- self.ffn = nn.Sequential(
213
- nn.Linear(dim, ffn_dim), nn.GELU(approximate='tanh'),
214
- nn.Linear(ffn_dim, dim))
215
-
216
- # modulation
217
- self.modulation = nn.Parameter(torch.randn(1, 6, dim) / dim**0.5)
218
-
219
- def forward(
220
- self,
221
- x,
222
- e,
223
- seq_lens,
224
- grid_sizes,
225
- freqs,
226
- context,
227
- context_lens,
228
- ):
229
- r"""
230
- Args:
231
- x(Tensor): Shape [B, L, C]
232
- e(Tensor): Shape [B, L1, 6, C]
233
- seq_lens(Tensor): Shape [B], length of each sequence in batch
234
- grid_sizes(Tensor): Shape [B, 3], the second dimension contains (F, H, W)
235
- freqs(Tensor): Rope freqs, shape [1024, C / num_heads / 2]
236
- """
237
- assert e.dtype == torch.float32
238
- with torch.amp.autocast('cuda', dtype=torch.float32):
239
- e = (self.modulation.unsqueeze(0) + e).chunk(6, dim=2)
240
- assert e[0].dtype == torch.float32
241
-
242
- # self-attention
243
- y = self.self_attn(
244
- self.norm1(x).float() * (1 + e[1].squeeze(2)) + e[0].squeeze(2),
245
- seq_lens, grid_sizes, freqs)
246
- with torch.amp.autocast('cuda', dtype=torch.float32):
247
- x = x + y * e[2].squeeze(2)
248
-
249
- # cross-attention & ffn function
250
- def cross_attn_ffn(x, context, context_lens, e):
251
- x = x + self.cross_attn(self.norm3(x), context, context_lens)
252
- y = self.ffn(
253
- self.norm2(x).float() * (1 + e[4].squeeze(2)) + e[3].squeeze(2))
254
- with torch.amp.autocast('cuda', dtype=torch.float32):
255
- x = x + y * e[5].squeeze(2)
256
- return x
257
-
258
- x = cross_attn_ffn(x, context, context_lens, e)
259
- return x
260
-
261
-
262
- class Head(nn.Module):
263
-
264
- def __init__(self, dim, out_dim, patch_size, eps=1e-6):
265
- super().__init__()
266
- self.dim = dim
267
- self.out_dim = out_dim
268
- self.patch_size = patch_size
269
- self.eps = eps
270
-
271
- # layers
272
- out_dim = math.prod(patch_size) * out_dim
273
- self.norm = WanLayerNorm(dim, eps)
274
- self.head = nn.Linear(dim, out_dim)
275
-
276
- # modulation
277
- self.modulation = nn.Parameter(torch.randn(1, 2, dim) / dim**0.5)
278
-
279
- def forward(self, x, e):
280
- r"""
281
- Args:
282
- x(Tensor): Shape [B, L1, C]
283
- e(Tensor): Shape [B, L1, C]
284
- """
285
- assert e.dtype == torch.float32
286
- with torch.amp.autocast('cuda', dtype=torch.float32):
287
- e = (self.modulation.unsqueeze(0) + e.unsqueeze(2)).chunk(2, dim=2)
288
- x = (
289
- self.head(
290
- self.norm(x) * (1 + e[1].squeeze(2)) + e[0].squeeze(2)))
291
- return x
292
-
293
-
294
- class WanModel(ModelMixin, ConfigMixin):
295
- r"""
296
- Wan diffusion backbone supporting both text-to-video and image-to-video.
297
- """
298
-
299
- ignore_for_config = [
300
- 'patch_size', 'cross_attn_norm', 'qk_norm', 'text_dim', 'window_size'
301
- ]
302
- _no_split_modules = ['WanAttentionBlock']
303
-
304
- @register_to_config
305
- def __init__(self,
306
- model_type='t2v',
307
- patch_size=(1, 2, 2),
308
- text_len=512,
309
- in_dim=16,
310
- dim=2048,
311
- ffn_dim=8192,
312
- freq_dim=256,
313
- text_dim=4096,
314
- out_dim=16,
315
- num_heads=16,
316
- num_layers=32,
317
- window_size=(-1, -1),
318
- qk_norm=True,
319
- cross_attn_norm=True,
320
- eps=1e-6):
321
- r"""
322
- Initialize the diffusion model backbone.
323
-
324
- Args:
325
- model_type (`str`, *optional*, defaults to 't2v'):
326
- Model variant - 't2v' (text-to-video) or 'i2v' (image-to-video)
327
- patch_size (`tuple`, *optional*, defaults to (1, 2, 2)):
328
- 3D patch dimensions for video embedding (t_patch, h_patch, w_patch)
329
- text_len (`int`, *optional*, defaults to 512):
330
- Fixed length for text embeddings
331
- in_dim (`int`, *optional*, defaults to 16):
332
- Input video channels (C_in)
333
- dim (`int`, *optional*, defaults to 2048):
334
- Hidden dimension of the transformer
335
- ffn_dim (`int`, *optional*, defaults to 8192):
336
- Intermediate dimension in feed-forward network
337
- freq_dim (`int`, *optional*, defaults to 256):
338
- Dimension for sinusoidal time embeddings
339
- text_dim (`int`, *optional*, defaults to 4096):
340
- Input dimension for text embeddings
341
- out_dim (`int`, *optional*, defaults to 16):
342
- Output video channels (C_out)
343
- num_heads (`int`, *optional*, defaults to 16):
344
- Number of attention heads
345
- num_layers (`int`, *optional*, defaults to 32):
346
- Number of transformer blocks
347
- window_size (`tuple`, *optional*, defaults to (-1, -1)):
348
- Window size for local attention (-1 indicates global attention)
349
- qk_norm (`bool`, *optional*, defaults to True):
350
- Enable query/key normalization
351
- cross_attn_norm (`bool`, *optional*, defaults to False):
352
- Enable cross-attention normalization
353
- eps (`float`, *optional*, defaults to 1e-6):
354
- Epsilon value for normalization layers
355
- """
356
-
357
- super().__init__()
358
-
359
- assert model_type in ['t2v', 'i2v', 'ti2v']
360
- self.model_type = model_type
361
-
362
- self.patch_size = patch_size
363
- self.text_len = text_len
364
- self.in_dim = in_dim
365
- self.dim = dim
366
- self.ffn_dim = ffn_dim
367
- self.freq_dim = freq_dim
368
- self.text_dim = text_dim
369
- self.out_dim = out_dim
370
- self.num_heads = num_heads
371
- self.num_layers = num_layers
372
- self.window_size = window_size
373
- self.qk_norm = qk_norm
374
- self.cross_attn_norm = cross_attn_norm
375
- self.eps = eps
376
-
377
- # embeddings
378
- self.patch_embedding = nn.Conv3d(
379
- in_dim, dim, kernel_size=patch_size, stride=patch_size)
380
- self.text_embedding = nn.Sequential(
381
- nn.Linear(text_dim, dim), nn.GELU(approximate='tanh'),
382
- nn.Linear(dim, dim))
383
-
384
- self.time_embedding = nn.Sequential(
385
- nn.Linear(freq_dim, dim), nn.SiLU(), nn.Linear(dim, dim))
386
- self.time_projection = nn.Sequential(nn.SiLU(), nn.Linear(dim, dim * 6))
387
-
388
- # blocks
389
- self.blocks = nn.ModuleList([
390
- WanAttentionBlock(dim, ffn_dim, num_heads, window_size, qk_norm,
391
- cross_attn_norm, eps) for _ in range(num_layers)
392
- ])
393
-
394
- # head
395
- self.head = Head(dim, out_dim, patch_size, eps)
396
-
397
- # buffers (don't use register_buffer otherwise dtype will be changed in to())
398
- assert (dim % num_heads) == 0 and (dim // num_heads) % 2 == 0
399
- d = dim // num_heads
400
- self.freqs = torch.cat([
401
- rope_params(1024, d - 4 * (d // 6)),
402
- rope_params(1024, 2 * (d // 6)),
403
- rope_params(1024, 2 * (d // 6))
404
- ],
405
- dim=1)
406
-
407
- # initialize weights
408
- self.init_weights()
409
-
410
- def forward(
411
- self,
412
- x,
413
- t,
414
- context,
415
- seq_len,
416
- y=None,
417
- ):
418
- r"""
419
- Forward pass through the diffusion model
420
-
421
- Args:
422
- x (List[Tensor]):
423
- List of input video tensors, each with shape [C_in, F, H, W]
424
- t (Tensor):
425
- Diffusion timesteps tensor of shape [B]
426
- context (List[Tensor]):
427
- List of text embeddings each with shape [L, C]
428
- seq_len (`int`):
429
- Maximum sequence length for positional encoding
430
- y (List[Tensor], *optional*):
431
- Conditional video inputs for image-to-video mode, same shape as x
432
-
433
- Returns:
434
- List[Tensor]:
435
- List of denoised video tensors with original input shapes [C_out, F, H / 8, W / 8]
436
- """
437
- if self.model_type == 'i2v':
438
- assert y is not None
439
- # params
440
- device = self.patch_embedding.weight.device
441
- if self.freqs.device != device:
442
- self.freqs = self.freqs.to(device)
443
-
444
- if y is not None:
445
- x = [torch.cat([u, v], dim=0) for u, v in zip(x, y)]
446
-
447
- # embeddings
448
- x = [self.patch_embedding(u.unsqueeze(0)) for u in x]
449
- grid_sizes = torch.stack(
450
- [torch.tensor(u.shape[2:], dtype=torch.long) for u in x])
451
- x = [u.flatten(2).transpose(1, 2) for u in x]
452
- seq_lens = torch.tensor([u.size(1) for u in x], dtype=torch.long)
453
- assert seq_lens.max() <= seq_len
454
- x = torch.cat([
455
- torch.cat([u, u.new_zeros(1, seq_len - u.size(1), u.size(2))],
456
- dim=1) for u in x
457
- ])
458
-
459
- # time embeddings
460
- if t.dim() == 1:
461
- t = t.expand(t.size(0), seq_len)
462
- with torch.amp.autocast('cuda', dtype=torch.float32):
463
- bt = t.size(0)
464
- t = t.flatten()
465
- e = self.time_embedding(
466
- sinusoidal_embedding_1d(self.freq_dim,
467
- t).unflatten(0, (bt, seq_len)).float())
468
- e0 = self.time_projection(e).unflatten(2, (6, self.dim))
469
- assert e.dtype == torch.float32 and e0.dtype == torch.float32
470
-
471
- # context
472
- context_lens = None
473
- context = self.text_embedding(
474
- torch.stack([
475
- torch.cat(
476
- [u, u.new_zeros(self.text_len - u.size(0), u.size(1))])
477
- for u in context
478
- ]))
479
-
480
- # arguments
481
- kwargs = dict(
482
- e=e0,
483
- seq_lens=seq_lens,
484
- grid_sizes=grid_sizes,
485
- freqs=self.freqs,
486
- context=context,
487
- context_lens=context_lens)
488
-
489
- for block in self.blocks:
490
- x = block(x, **kwargs)
491
-
492
- # head
493
- x = self.head(x, e)
494
-
495
- # unpatchify
496
- x = self.unpatchify(x, grid_sizes)
497
- return [u.float() for u in x]
498
-
499
- def unpatchify(self, x, grid_sizes):
500
- r"""
501
- Reconstruct video tensors from patch embeddings.
502
-
503
- Args:
504
- x (List[Tensor]):
505
- List of patchified features, each with shape [L, C_out * prod(patch_size)]
506
- grid_sizes (Tensor):
507
- Original spatial-temporal grid dimensions before patching,
508
- shape [B, 3] (3 dimensions correspond to F_patches, H_patches, W_patches)
509
-
510
- Returns:
511
- List[Tensor]:
512
- Reconstructed video tensors with shape [C_out, F, H / 8, W / 8]
513
- """
514
-
515
- c = self.out_dim
516
- out = []
517
- for u, v in zip(x, grid_sizes.tolist()):
518
- u = u[:math.prod(v)].view(*v, *self.patch_size, c)
519
- u = torch.einsum('fhwpqrc->cfphqwr', u)
520
- u = u.reshape(c, *[i * j for i, j in zip(v, self.patch_size)])
521
- out.append(u)
522
- return out
523
-
524
- def init_weights(self):
525
- r"""
526
- Initialize model parameters using Xavier initialization.
527
- """
528
-
529
- # basic init
530
- for m in self.modules():
531
- if isinstance(m, nn.Linear):
532
- nn.init.xavier_uniform_(m.weight)
533
- if m.bias is not None:
534
- nn.init.zeros_(m.bias)
535
-
536
- # init embeddings
537
- nn.init.xavier_uniform_(self.patch_embedding.weight.flatten(1))
538
- for m in self.text_embedding.modules():
539
- if isinstance(m, nn.Linear):
540
- nn.init.normal_(m.weight, std=.02)
541
- for m in self.time_embedding.modules():
542
- if isinstance(m, nn.Linear):
543
- nn.init.normal_(m.weight, std=.02)
544
-
545
- # init output layer
546
- nn.init.zeros_(self.head.head.weight)
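The `self.freqs` buffer assembled near the top of this hunk splits each attention head's dimension into three rotary bands, one per video axis (frames, height, width). A quick arithmetic check of that split, using illustrative `dim`/`num_heads` values rather than any shipped config:

```python
# Worked check of the per-head RoPE split in WanModel.__init__ above.
# dim and num_heads are illustrative, not necessarily the repository defaults.
dim, num_heads = 5120, 40
assert dim % num_heads == 0 and (dim // num_heads) % 2 == 0

d = dim // num_heads           # per-head dimension: 128
band_t = d - 4 * (d // 6)      # 44 -> rotary band for the frame axis
band_h = 2 * (d // 6)          # 42 -> rotary band for the height axis
band_w = 2 * (d // 6)          # 42 -> rotary band for the width axis
assert band_t + band_h + band_w == d
```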
wan/modules/t5.py DELETED
@@ -1,513 +0,0 @@
1
- # Modified from transformers.models.t5.modeling_t5
2
- # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
3
- import logging
4
- import math
5
-
6
- import torch
7
- import torch.nn as nn
8
- import torch.nn.functional as F
9
-
10
- from .tokenizers import HuggingfaceTokenizer
11
-
12
- __all__ = [
13
- 'T5Model',
14
- 'T5Encoder',
15
- 'T5Decoder',
16
- 'T5EncoderModel',
17
- ]
18
-
19
-
20
- def fp16_clamp(x):
21
- if x.dtype == torch.float16 and torch.isinf(x).any():
22
- clamp = torch.finfo(x.dtype).max - 1000
23
- x = torch.clamp(x, min=-clamp, max=clamp)
24
- return x
25
-
26
-
27
- def init_weights(m):
28
- if isinstance(m, T5LayerNorm):
29
- nn.init.ones_(m.weight)
30
- elif isinstance(m, T5Model):
31
- nn.init.normal_(m.token_embedding.weight, std=1.0)
32
- elif isinstance(m, T5FeedForward):
33
- nn.init.normal_(m.gate[0].weight, std=m.dim**-0.5)
34
- nn.init.normal_(m.fc1.weight, std=m.dim**-0.5)
35
- nn.init.normal_(m.fc2.weight, std=m.dim_ffn**-0.5)
36
- elif isinstance(m, T5Attention):
37
- nn.init.normal_(m.q.weight, std=(m.dim * m.dim_attn)**-0.5)
38
- nn.init.normal_(m.k.weight, std=m.dim**-0.5)
39
- nn.init.normal_(m.v.weight, std=m.dim**-0.5)
40
- nn.init.normal_(m.o.weight, std=(m.num_heads * m.dim_attn)**-0.5)
41
- elif isinstance(m, T5RelativeEmbedding):
42
- nn.init.normal_(
43
- m.embedding.weight, std=(2 * m.num_buckets * m.num_heads)**-0.5)
44
-
45
-
46
- class GELU(nn.Module):
47
-
48
- def forward(self, x):
49
- return 0.5 * x * (1.0 + torch.tanh(
50
- math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))))
51
-
52
-
53
- class T5LayerNorm(nn.Module):
54
-
55
- def __init__(self, dim, eps=1e-6):
56
- super(T5LayerNorm, self).__init__()
57
- self.dim = dim
58
- self.eps = eps
59
- self.weight = nn.Parameter(torch.ones(dim))
60
-
61
- def forward(self, x):
62
- x = x * torch.rsqrt(x.float().pow(2).mean(dim=-1, keepdim=True) +
63
- self.eps)
64
- if self.weight.dtype in [torch.float16, torch.bfloat16]:
65
- x = x.type_as(self.weight)
66
- return self.weight * x
67
-
68
-
69
- class T5Attention(nn.Module):
70
-
71
- def __init__(self, dim, dim_attn, num_heads, dropout=0.1):
72
- assert dim_attn % num_heads == 0
73
- super(T5Attention, self).__init__()
74
- self.dim = dim
75
- self.dim_attn = dim_attn
76
- self.num_heads = num_heads
77
- self.head_dim = dim_attn // num_heads
78
-
79
- # layers
80
- self.q = nn.Linear(dim, dim_attn, bias=False)
81
- self.k = nn.Linear(dim, dim_attn, bias=False)
82
- self.v = nn.Linear(dim, dim_attn, bias=False)
83
- self.o = nn.Linear(dim_attn, dim, bias=False)
84
- self.dropout = nn.Dropout(dropout)
85
-
86
- def forward(self, x, context=None, mask=None, pos_bias=None):
87
- """
88
- x: [B, L1, C].
89
- context: [B, L2, C] or None.
90
- mask: [B, L2] or [B, L1, L2] or None.
91
- """
92
- # check inputs
93
- context = x if context is None else context
94
- b, n, c = x.size(0), self.num_heads, self.head_dim
95
-
96
- # compute query, key, value
97
- q = self.q(x).view(b, -1, n, c)
98
- k = self.k(context).view(b, -1, n, c)
99
- v = self.v(context).view(b, -1, n, c)
100
-
101
- # attention bias
102
- attn_bias = x.new_zeros(b, n, q.size(1), k.size(1))
103
- if pos_bias is not None:
104
- attn_bias += pos_bias
105
- if mask is not None:
106
- assert mask.ndim in [2, 3]
107
- mask = mask.view(b, 1, 1,
108
- -1) if mask.ndim == 2 else mask.unsqueeze(1)
109
- attn_bias.masked_fill_(mask == 0, torch.finfo(x.dtype).min)
110
-
111
- # compute attention (T5 does not use scaling)
112
- attn = torch.einsum('binc,bjnc->bnij', q, k) + attn_bias
113
- attn = F.softmax(attn.float(), dim=-1).type_as(attn)
114
- x = torch.einsum('bnij,bjnc->binc', attn, v)
115
-
116
- # output
117
- x = x.reshape(b, -1, n * c)
118
- x = self.o(x)
119
- x = self.dropout(x)
120
- return x
121
-
122
-
123
- class T5FeedForward(nn.Module):
124
-
125
- def __init__(self, dim, dim_ffn, dropout=0.1):
126
- super(T5FeedForward, self).__init__()
127
- self.dim = dim
128
- self.dim_ffn = dim_ffn
129
-
130
- # layers
131
- self.gate = nn.Sequential(nn.Linear(dim, dim_ffn, bias=False), GELU())
132
- self.fc1 = nn.Linear(dim, dim_ffn, bias=False)
133
- self.fc2 = nn.Linear(dim_ffn, dim, bias=False)
134
- self.dropout = nn.Dropout(dropout)
135
-
136
- def forward(self, x):
137
- x = self.fc1(x) * self.gate(x)
138
- x = self.dropout(x)
139
- x = self.fc2(x)
140
- x = self.dropout(x)
141
- return x
142
-
143
-
144
- class T5SelfAttention(nn.Module):
145
-
146
- def __init__(self,
147
- dim,
148
- dim_attn,
149
- dim_ffn,
150
- num_heads,
151
- num_buckets,
152
- shared_pos=True,
153
- dropout=0.1):
154
- super(T5SelfAttention, self).__init__()
155
- self.dim = dim
156
- self.dim_attn = dim_attn
157
- self.dim_ffn = dim_ffn
158
- self.num_heads = num_heads
159
- self.num_buckets = num_buckets
160
- self.shared_pos = shared_pos
161
-
162
- # layers
163
- self.norm1 = T5LayerNorm(dim)
164
- self.attn = T5Attention(dim, dim_attn, num_heads, dropout)
165
- self.norm2 = T5LayerNorm(dim)
166
- self.ffn = T5FeedForward(dim, dim_ffn, dropout)
167
- self.pos_embedding = None if shared_pos else T5RelativeEmbedding(
168
- num_buckets, num_heads, bidirectional=True)
169
-
170
- def forward(self, x, mask=None, pos_bias=None):
171
- e = pos_bias if self.shared_pos else self.pos_embedding(
172
- x.size(1), x.size(1))
173
- x = fp16_clamp(x + self.attn(self.norm1(x), mask=mask, pos_bias=e))
174
- x = fp16_clamp(x + self.ffn(self.norm2(x)))
175
- return x
176
-
177
-
178
- class T5CrossAttention(nn.Module):
179
-
180
- def __init__(self,
181
- dim,
182
- dim_attn,
183
- dim_ffn,
184
- num_heads,
185
- num_buckets,
186
- shared_pos=True,
187
- dropout=0.1):
188
- super(T5CrossAttention, self).__init__()
189
- self.dim = dim
190
- self.dim_attn = dim_attn
191
- self.dim_ffn = dim_ffn
192
- self.num_heads = num_heads
193
- self.num_buckets = num_buckets
194
- self.shared_pos = shared_pos
195
-
196
- # layers
197
- self.norm1 = T5LayerNorm(dim)
198
- self.self_attn = T5Attention(dim, dim_attn, num_heads, dropout)
199
- self.norm2 = T5LayerNorm(dim)
200
- self.cross_attn = T5Attention(dim, dim_attn, num_heads, dropout)
201
- self.norm3 = T5LayerNorm(dim)
202
- self.ffn = T5FeedForward(dim, dim_ffn, dropout)
203
- self.pos_embedding = None if shared_pos else T5RelativeEmbedding(
204
- num_buckets, num_heads, bidirectional=False)
205
-
206
- def forward(self,
207
- x,
208
- mask=None,
209
- encoder_states=None,
210
- encoder_mask=None,
211
- pos_bias=None):
212
- e = pos_bias if self.shared_pos else self.pos_embedding(
213
- x.size(1), x.size(1))
214
- x = fp16_clamp(x + self.self_attn(self.norm1(x), mask=mask, pos_bias=e))
215
- x = fp16_clamp(x + self.cross_attn(
216
- self.norm2(x), context=encoder_states, mask=encoder_mask))
217
- x = fp16_clamp(x + self.ffn(self.norm3(x)))
218
- return x
219
-
220
-
221
- class T5RelativeEmbedding(nn.Module):
222
-
223
- def __init__(self, num_buckets, num_heads, bidirectional, max_dist=128):
224
- super(T5RelativeEmbedding, self).__init__()
225
- self.num_buckets = num_buckets
226
- self.num_heads = num_heads
227
- self.bidirectional = bidirectional
228
- self.max_dist = max_dist
229
-
230
- # layers
231
- self.embedding = nn.Embedding(num_buckets, num_heads)
232
-
233
- def forward(self, lq, lk):
234
- device = self.embedding.weight.device
235
- # rel_pos = torch.arange(lk).unsqueeze(0).to(device) - \
236
- # torch.arange(lq).unsqueeze(1).to(device)
237
- rel_pos = torch.arange(lk, device=device).unsqueeze(0) - \
238
- torch.arange(lq, device=device).unsqueeze(1)
239
- rel_pos = self._relative_position_bucket(rel_pos)
240
- rel_pos_embeds = self.embedding(rel_pos)
241
- rel_pos_embeds = rel_pos_embeds.permute(2, 0, 1).unsqueeze(
242
- 0) # [1, N, Lq, Lk]
243
- return rel_pos_embeds.contiguous()
244
-
245
- def _relative_position_bucket(self, rel_pos):
246
- # preprocess
247
- if self.bidirectional:
248
- num_buckets = self.num_buckets // 2
249
- rel_buckets = (rel_pos > 0).long() * num_buckets
250
- rel_pos = torch.abs(rel_pos)
251
- else:
252
- num_buckets = self.num_buckets
253
- rel_buckets = 0
254
- rel_pos = -torch.min(rel_pos, torch.zeros_like(rel_pos))
255
-
256
- # embeddings for small and large positions
257
- max_exact = num_buckets // 2
258
- rel_pos_large = max_exact + (torch.log(rel_pos.float() / max_exact) /
259
- math.log(self.max_dist / max_exact) *
260
- (num_buckets - max_exact)).long()
261
- rel_pos_large = torch.min(
262
- rel_pos_large, torch.full_like(rel_pos_large, num_buckets - 1))
263
- rel_buckets += torch.where(rel_pos < max_exact, rel_pos, rel_pos_large)
264
- return rel_buckets
265
-
266
-
267
- class T5Encoder(nn.Module):
268
-
269
- def __init__(self,
270
- vocab,
271
- dim,
272
- dim_attn,
273
- dim_ffn,
274
- num_heads,
275
- num_layers,
276
- num_buckets,
277
- shared_pos=True,
278
- dropout=0.1):
279
- super(T5Encoder, self).__init__()
280
- self.dim = dim
281
- self.dim_attn = dim_attn
282
- self.dim_ffn = dim_ffn
283
- self.num_heads = num_heads
284
- self.num_layers = num_layers
285
- self.num_buckets = num_buckets
286
- self.shared_pos = shared_pos
287
-
288
- # layers
289
- self.token_embedding = vocab if isinstance(vocab, nn.Embedding) \
290
- else nn.Embedding(vocab, dim)
291
- self.pos_embedding = T5RelativeEmbedding(
292
- num_buckets, num_heads, bidirectional=True) if shared_pos else None
293
- self.dropout = nn.Dropout(dropout)
294
- self.blocks = nn.ModuleList([
295
- T5SelfAttention(dim, dim_attn, dim_ffn, num_heads, num_buckets,
296
- shared_pos, dropout) for _ in range(num_layers)
297
- ])
298
- self.norm = T5LayerNorm(dim)
299
-
300
- # initialize weights
301
- self.apply(init_weights)
302
-
303
- def forward(self, ids, mask=None):
304
- x = self.token_embedding(ids)
305
- x = self.dropout(x)
306
- e = self.pos_embedding(x.size(1),
307
- x.size(1)) if self.shared_pos else None
308
- for block in self.blocks:
309
- x = block(x, mask, pos_bias=e)
310
- x = self.norm(x)
311
- x = self.dropout(x)
312
- return x
313
-
314
-
315
- class T5Decoder(nn.Module):
316
-
317
- def __init__(self,
318
- vocab,
319
- dim,
320
- dim_attn,
321
- dim_ffn,
322
- num_heads,
323
- num_layers,
324
- num_buckets,
325
- shared_pos=True,
326
- dropout=0.1):
327
- super(T5Decoder, self).__init__()
328
- self.dim = dim
329
- self.dim_attn = dim_attn
330
- self.dim_ffn = dim_ffn
331
- self.num_heads = num_heads
332
- self.num_layers = num_layers
333
- self.num_buckets = num_buckets
334
- self.shared_pos = shared_pos
335
-
336
- # layers
337
- self.token_embedding = vocab if isinstance(vocab, nn.Embedding) \
338
- else nn.Embedding(vocab, dim)
339
- self.pos_embedding = T5RelativeEmbedding(
340
- num_buckets, num_heads, bidirectional=False) if shared_pos else None
341
- self.dropout = nn.Dropout(dropout)
342
- self.blocks = nn.ModuleList([
343
- T5CrossAttention(dim, dim_attn, dim_ffn, num_heads, num_buckets,
344
- shared_pos, dropout) for _ in range(num_layers)
345
- ])
346
- self.norm = T5LayerNorm(dim)
347
-
348
- # initialize weights
349
- self.apply(init_weights)
350
-
351
- def forward(self, ids, mask=None, encoder_states=None, encoder_mask=None):
352
- b, s = ids.size()
353
-
354
- # causal mask
355
- if mask is None:
356
- mask = torch.tril(torch.ones(1, s, s).to(ids.device))
357
- elif mask.ndim == 2:
358
- mask = torch.tril(mask.unsqueeze(1).expand(-1, s, -1))
359
-
360
- # layers
361
- x = self.token_embedding(ids)
362
- x = self.dropout(x)
363
- e = self.pos_embedding(x.size(1),
364
- x.size(1)) if self.shared_pos else None
365
- for block in self.blocks:
366
- x = block(x, mask, encoder_states, encoder_mask, pos_bias=e)
367
- x = self.norm(x)
368
- x = self.dropout(x)
369
- return x
370
-
371
-
372
- class T5Model(nn.Module):
373
-
374
- def __init__(self,
375
- vocab_size,
376
- dim,
377
- dim_attn,
378
- dim_ffn,
379
- num_heads,
380
- encoder_layers,
381
- decoder_layers,
382
- num_buckets,
383
- shared_pos=True,
384
- dropout=0.1):
385
- super(T5Model, self).__init__()
386
- self.vocab_size = vocab_size
387
- self.dim = dim
388
- self.dim_attn = dim_attn
389
- self.dim_ffn = dim_ffn
390
- self.num_heads = num_heads
391
- self.encoder_layers = encoder_layers
392
- self.decoder_layers = decoder_layers
393
- self.num_buckets = num_buckets
394
-
395
- # layers
396
- self.token_embedding = nn.Embedding(vocab_size, dim)
397
- self.encoder = T5Encoder(self.token_embedding, dim, dim_attn, dim_ffn,
398
- num_heads, encoder_layers, num_buckets,
399
- shared_pos, dropout)
400
- self.decoder = T5Decoder(self.token_embedding, dim, dim_attn, dim_ffn,
401
- num_heads, decoder_layers, num_buckets,
402
- shared_pos, dropout)
403
- self.head = nn.Linear(dim, vocab_size, bias=False)
404
-
405
- # initialize weights
406
- self.apply(init_weights)
407
-
408
- def forward(self, encoder_ids, encoder_mask, decoder_ids, decoder_mask):
409
- x = self.encoder(encoder_ids, encoder_mask)
410
- x = self.decoder(decoder_ids, decoder_mask, x, encoder_mask)
411
- x = self.head(x)
412
- return x
413
-
414
-
415
- def _t5(name,
416
- encoder_only=False,
417
- decoder_only=False,
418
- return_tokenizer=False,
419
- tokenizer_kwargs={},
420
- dtype=torch.float32,
421
- device='cpu',
422
- **kwargs):
423
- # sanity check
424
- assert not (encoder_only and decoder_only)
425
-
426
- # params
427
- if encoder_only:
428
- model_cls = T5Encoder
429
- kwargs['vocab'] = kwargs.pop('vocab_size')
430
- kwargs['num_layers'] = kwargs.pop('encoder_layers')
431
- _ = kwargs.pop('decoder_layers')
432
- elif decoder_only:
433
- model_cls = T5Decoder
434
- kwargs['vocab'] = kwargs.pop('vocab_size')
435
- kwargs['num_layers'] = kwargs.pop('decoder_layers')
436
- _ = kwargs.pop('encoder_layers')
437
- else:
438
- model_cls = T5Model
439
-
440
- # init model
441
- with torch.device(device):
442
- model = model_cls(**kwargs)
443
-
444
- # set device
445
- model = model.to(dtype=dtype, device=device)
446
-
447
- # init tokenizer
448
- if return_tokenizer:
449
- from .tokenizers import HuggingfaceTokenizer
450
- tokenizer = HuggingfaceTokenizer(f'google/{name}', **tokenizer_kwargs)
451
- return model, tokenizer
452
- else:
453
- return model
454
-
455
-
456
- def umt5_xxl(**kwargs):
457
- cfg = dict(
458
- vocab_size=256384,
459
- dim=4096,
460
- dim_attn=4096,
461
- dim_ffn=10240,
462
- num_heads=64,
463
- encoder_layers=24,
464
- decoder_layers=24,
465
- num_buckets=32,
466
- shared_pos=False,
467
- dropout=0.1)
468
- cfg.update(**kwargs)
469
- return _t5('umt5-xxl', **cfg)
470
-
471
-
472
- class T5EncoderModel:
473
-
474
- def __init__(
475
- self,
476
- text_len,
477
- dtype=torch.bfloat16,
478
- device=torch.cuda.current_device(),
479
- checkpoint_path=None,
480
- tokenizer_path=None,
481
- shard_fn=None,
482
- ):
483
- self.text_len = text_len
484
- self.dtype = dtype
485
- self.device = device
486
- self.checkpoint_path = checkpoint_path
487
- self.tokenizer_path = tokenizer_path
488
-
489
- # init model
490
- model = umt5_xxl(
491
- encoder_only=True,
492
- return_tokenizer=False,
493
- dtype=dtype,
494
- device=device).eval().requires_grad_(False)
495
- logging.info(f'loading {checkpoint_path}')
496
- model.load_state_dict(torch.load(checkpoint_path, map_location='cpu'))
497
- self.model = model
498
- if shard_fn is not None:
499
- self.model = shard_fn(self.model, sync_module_states=False)
500
- else:
501
- self.model.to(self.device)
502
- # init tokenizer
503
- self.tokenizer = HuggingfaceTokenizer(
504
- name=tokenizer_path, seq_len=text_len, clean='whitespace')
505
-
506
- def __call__(self, texts, device):
507
- ids, mask = self.tokenizer(
508
- texts, return_mask=True, add_special_tokens=True)
509
- ids = ids.to(device)
510
- mask = mask.to(device)
511
- seq_lens = mask.gt(0).sum(dim=1).long()
512
- context = self.model(ids, mask)
513
- return [u[:v] for u, v in zip(context, seq_lens)]
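For reference, a minimal usage sketch of the `T5EncoderModel` wrapper defined above. It assumes a checkout from before this commit and locally available UMT5-XXL encoder weights; both paths below are placeholders, not guaranteed filenames.

```python
# Minimal sketch: encode prompts with the T5EncoderModel wrapper above.
# Assumes wan/modules/t5.py still exists (pre-commit checkout) and that the
# checkpoint/tokenizer paths below point at locally downloaded UMT5-XXL assets.
import torch
from wan.modules.t5 import T5EncoderModel

device = torch.device('cuda')
text_encoder = T5EncoderModel(
    text_len=512,
    dtype=torch.bfloat16,
    device=device,
    checkpoint_path='models_t5_umt5-xxl-enc-bf16.pth',  # placeholder weight file
    tokenizer_path='google/umt5-xxl',                    # placeholder tokenizer id
)

# Returns one [num_tokens, 4096] embedding per prompt, trimmed to its true length.
context = text_encoder(['a corgi skateboarding through a neon-lit street'], device)
print(context[0].shape)
```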
wan/modules/tokenizers.py DELETED
@@ -1,82 +0,0 @@
1
- # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
2
- import html
3
- import string
4
-
5
- import ftfy
6
- import regex as re
7
- from transformers import AutoTokenizer
8
-
9
- __all__ = ['HuggingfaceTokenizer']
10
-
11
-
12
- def basic_clean(text):
13
- text = ftfy.fix_text(text)
14
- text = html.unescape(html.unescape(text))
15
- return text.strip()
16
-
17
-
18
- def whitespace_clean(text):
19
- text = re.sub(r'\s+', ' ', text)
20
- text = text.strip()
21
- return text
22
-
23
-
24
- def canonicalize(text, keep_punctuation_exact_string=None):
25
- text = text.replace('_', ' ')
26
- if keep_punctuation_exact_string:
27
- text = keep_punctuation_exact_string.join(
28
- part.translate(str.maketrans('', '', string.punctuation))
29
- for part in text.split(keep_punctuation_exact_string))
30
- else:
31
- text = text.translate(str.maketrans('', '', string.punctuation))
32
- text = text.lower()
33
- text = re.sub(r'\s+', ' ', text)
34
- return text.strip()
35
-
36
-
37
- class HuggingfaceTokenizer:
38
-
39
- def __init__(self, name, seq_len=None, clean=None, **kwargs):
40
- assert clean in (None, 'whitespace', 'lower', 'canonicalize')
41
- self.name = name
42
- self.seq_len = seq_len
43
- self.clean = clean
44
-
45
- # init tokenizer
46
- self.tokenizer = AutoTokenizer.from_pretrained(name, **kwargs)
47
- self.vocab_size = self.tokenizer.vocab_size
48
-
49
- def __call__(self, sequence, **kwargs):
50
- return_mask = kwargs.pop('return_mask', False)
51
-
52
- # arguments
53
- _kwargs = {'return_tensors': 'pt'}
54
- if self.seq_len is not None:
55
- _kwargs.update({
56
- 'padding': 'max_length',
57
- 'truncation': True,
58
- 'max_length': self.seq_len
59
- })
60
- _kwargs.update(**kwargs)
61
-
62
- # tokenization
63
- if isinstance(sequence, str):
64
- sequence = [sequence]
65
- if self.clean:
66
- sequence = [self._clean(u) for u in sequence]
67
- ids = self.tokenizer(sequence, **_kwargs)
68
-
69
- # output
70
- if return_mask:
71
- return ids.input_ids, ids.attention_mask
72
- else:
73
- return ids.input_ids
74
-
75
- def _clean(self, text):
76
- if self.clean == 'whitespace':
77
- text = whitespace_clean(basic_clean(text))
78
- elif self.clean == 'lower':
79
- text = whitespace_clean(basic_clean(text)).lower()
80
- elif self.clean == 'canonicalize':
81
- text = canonicalize(basic_clean(text))
82
- return text
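A short illustration of the `HuggingfaceTokenizer` wrapper above. The `google/umt5-xxl` name mirrors the tokenizer the T5 encoder in this repo pairs with, but any Hugging Face tokenizer id would work the same way:

```python
# Illustrative use of HuggingfaceTokenizer: pad/truncate to a fixed length and
# return an attention mask alongside the token ids.
from wan.modules.tokenizers import HuggingfaceTokenizer

tok = HuggingfaceTokenizer('google/umt5-xxl', seq_len=512, clean='whitespace')
ids, mask = tok(['A   prompt\twith  messy   whitespace'], return_mask=True)
print(ids.shape)        # torch.Size([1, 512]) -- padded/truncated to seq_len
print(mask.sum(dim=1))  # count of real (non-pad) tokens per prompt
```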
wan/modules/vae2_1.py DELETED
@@ -1,663 +0,0 @@
1
- # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
2
- import logging
3
-
4
- import torch
5
- import torch.cuda.amp as amp
6
- import torch.nn as nn
7
- import torch.nn.functional as F
8
- from einops import rearrange
9
-
10
- __all__ = [
11
- 'Wan2_1_VAE',
12
- ]
13
-
14
- CACHE_T = 2
15
-
16
-
17
- class CausalConv3d(nn.Conv3d):
18
- """
19
- Causal 3d convolution.
20
- """
21
-
22
- def __init__(self, *args, **kwargs):
23
- super().__init__(*args, **kwargs)
24
- self._padding = (self.padding[2], self.padding[2], self.padding[1],
25
- self.padding[1], 2 * self.padding[0], 0)
26
- self.padding = (0, 0, 0)
27
-
28
- def forward(self, x, cache_x=None):
29
- padding = list(self._padding)
30
- if cache_x is not None and self._padding[4] > 0:
31
- cache_x = cache_x.to(x.device)
32
- x = torch.cat([cache_x, x], dim=2)
33
- padding[4] -= cache_x.shape[2]
34
- x = F.pad(x, padding)
35
-
36
- return super().forward(x)
37
-
38
-
39
- class RMS_norm(nn.Module):
40
-
41
- def __init__(self, dim, channel_first=True, images=True, bias=False):
42
- super().__init__()
43
- broadcastable_dims = (1, 1, 1) if not images else (1, 1)
44
- shape = (dim, *broadcastable_dims) if channel_first else (dim,)
45
-
46
- self.channel_first = channel_first
47
- self.scale = dim**0.5
48
- self.gamma = nn.Parameter(torch.ones(shape))
49
- self.bias = nn.Parameter(torch.zeros(shape)) if bias else 0.
50
-
51
- def forward(self, x):
52
- return F.normalize(
53
- x, dim=(1 if self.channel_first else
54
- -1)) * self.scale * self.gamma + self.bias
55
-
56
-
57
- class Upsample(nn.Upsample):
58
-
59
- def forward(self, x):
60
- """
61
- Fix bfloat16 support for nearest neighbor interpolation.
62
- """
63
- return super().forward(x.float()).type_as(x)
64
-
65
-
66
- class Resample(nn.Module):
67
-
68
- def __init__(self, dim, mode):
69
- assert mode in ('none', 'upsample2d', 'upsample3d', 'downsample2d',
70
- 'downsample3d')
71
- super().__init__()
72
- self.dim = dim
73
- self.mode = mode
74
-
75
- # layers
76
- if mode == 'upsample2d':
77
- self.resample = nn.Sequential(
78
- Upsample(scale_factor=(2., 2.), mode='nearest-exact'),
79
- nn.Conv2d(dim, dim // 2, 3, padding=1))
80
- elif mode == 'upsample3d':
81
- self.resample = nn.Sequential(
82
- Upsample(scale_factor=(2., 2.), mode='nearest-exact'),
83
- nn.Conv2d(dim, dim // 2, 3, padding=1))
84
- self.time_conv = CausalConv3d(
85
- dim, dim * 2, (3, 1, 1), padding=(1, 0, 0))
86
-
87
- elif mode == 'downsample2d':
88
- self.resample = nn.Sequential(
89
- nn.ZeroPad2d((0, 1, 0, 1)),
90
- nn.Conv2d(dim, dim, 3, stride=(2, 2)))
91
- elif mode == 'downsample3d':
92
- self.resample = nn.Sequential(
93
- nn.ZeroPad2d((0, 1, 0, 1)),
94
- nn.Conv2d(dim, dim, 3, stride=(2, 2)))
95
- self.time_conv = CausalConv3d(
96
- dim, dim, (3, 1, 1), stride=(2, 1, 1), padding=(0, 0, 0))
97
-
98
- else:
99
- self.resample = nn.Identity()
100
-
101
- def forward(self, x, feat_cache=None, feat_idx=[0]):
102
- b, c, t, h, w = x.size()
103
- if self.mode == 'upsample3d':
104
- if feat_cache is not None:
105
- idx = feat_idx[0]
106
- if feat_cache[idx] is None:
107
- feat_cache[idx] = 'Rep'
108
- feat_idx[0] += 1
109
- else:
110
-
111
- cache_x = x[:, :, -CACHE_T:, :, :].clone()
112
- if cache_x.shape[2] < 2 and feat_cache[
113
- idx] is not None and feat_cache[idx] != 'Rep':
114
- # cache last frame of last two chunk
115
- cache_x = torch.cat([
116
- feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
117
- cache_x.device), cache_x
118
- ],
119
- dim=2)
120
- if cache_x.shape[2] < 2 and feat_cache[
121
- idx] is not None and feat_cache[idx] == 'Rep':
122
- cache_x = torch.cat([
123
- torch.zeros_like(cache_x).to(cache_x.device),
124
- cache_x
125
- ],
126
- dim=2)
127
- if feat_cache[idx] == 'Rep':
128
- x = self.time_conv(x)
129
- else:
130
- x = self.time_conv(x, feat_cache[idx])
131
- feat_cache[idx] = cache_x
132
- feat_idx[0] += 1
133
-
134
- x = x.reshape(b, 2, c, t, h, w)
135
- x = torch.stack((x[:, 0, :, :, :, :], x[:, 1, :, :, :, :]),
136
- 3)
137
- x = x.reshape(b, c, t * 2, h, w)
138
- t = x.shape[2]
139
- x = rearrange(x, 'b c t h w -> (b t) c h w')
140
- x = self.resample(x)
141
- x = rearrange(x, '(b t) c h w -> b c t h w', t=t)
142
-
143
- if self.mode == 'downsample3d':
144
- if feat_cache is not None:
145
- idx = feat_idx[0]
146
- if feat_cache[idx] is None:
147
- feat_cache[idx] = x.clone()
148
- feat_idx[0] += 1
149
- else:
150
-
151
- cache_x = x[:, :, -1:, :, :].clone()
152
- # if cache_x.shape[2] < 2 and feat_cache[idx] is not None and feat_cache[idx]!='Rep':
153
- # # cache last frame of last two chunk
154
- # cache_x = torch.cat([feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(cache_x.device), cache_x], dim=2)
155
-
156
- x = self.time_conv(
157
- torch.cat([feat_cache[idx][:, :, -1:, :, :], x], 2))
158
- feat_cache[idx] = cache_x
159
- feat_idx[0] += 1
160
- return x
161
-
162
- def init_weight(self, conv):
163
- conv_weight = conv.weight
164
- nn.init.zeros_(conv_weight)
165
- c1, c2, t, h, w = conv_weight.size()
166
- one_matrix = torch.eye(c1, c2)
167
- init_matrix = one_matrix
168
- nn.init.zeros_(conv_weight)
169
- #conv_weight.data[:,:,-1,1,1] = init_matrix * 0.5
170
- conv_weight.data[:, :, 1, 0, 0] = init_matrix #* 0.5
171
- conv.weight.data.copy_(conv_weight)
172
- nn.init.zeros_(conv.bias.data)
173
-
174
- def init_weight2(self, conv):
175
- conv_weight = conv.weight.data
176
- nn.init.zeros_(conv_weight)
177
- c1, c2, t, h, w = conv_weight.size()
178
- init_matrix = torch.eye(c1 // 2, c2)
179
- #init_matrix = repeat(init_matrix, 'o ... -> (o 2) ...').permute(1,0,2).contiguous().reshape(c1,c2)
180
- conv_weight[:c1 // 2, :, -1, 0, 0] = init_matrix
181
- conv_weight[c1 // 2:, :, -1, 0, 0] = init_matrix
182
- conv.weight.data.copy_(conv_weight)
183
- nn.init.zeros_(conv.bias.data)
184
-
185
-
186
- class ResidualBlock(nn.Module):
187
-
188
- def __init__(self, in_dim, out_dim, dropout=0.0):
189
- super().__init__()
190
- self.in_dim = in_dim
191
- self.out_dim = out_dim
192
-
193
- # layers
194
- self.residual = nn.Sequential(
195
- RMS_norm(in_dim, images=False), nn.SiLU(),
196
- CausalConv3d(in_dim, out_dim, 3, padding=1),
197
- RMS_norm(out_dim, images=False), nn.SiLU(), nn.Dropout(dropout),
198
- CausalConv3d(out_dim, out_dim, 3, padding=1))
199
- self.shortcut = CausalConv3d(in_dim, out_dim, 1) \
200
- if in_dim != out_dim else nn.Identity()
201
-
202
- def forward(self, x, feat_cache=None, feat_idx=[0]):
203
- h = self.shortcut(x)
204
- for layer in self.residual:
205
- if isinstance(layer, CausalConv3d) and feat_cache is not None:
206
- idx = feat_idx[0]
207
- cache_x = x[:, :, -CACHE_T:, :, :].clone()
208
- if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
209
- # cache last frame of last two chunk
210
- cache_x = torch.cat([
211
- feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
212
- cache_x.device), cache_x
213
- ],
214
- dim=2)
215
- x = layer(x, feat_cache[idx])
216
- feat_cache[idx] = cache_x
217
- feat_idx[0] += 1
218
- else:
219
- x = layer(x)
220
- return x + h
221
-
222
-
223
- class AttentionBlock(nn.Module):
224
- """
225
- Causal self-attention with a single head.
226
- """
227
-
228
- def __init__(self, dim):
229
- super().__init__()
230
- self.dim = dim
231
-
232
- # layers
233
- self.norm = RMS_norm(dim)
234
- self.to_qkv = nn.Conv2d(dim, dim * 3, 1)
235
- self.proj = nn.Conv2d(dim, dim, 1)
236
-
237
- # zero out the last layer params
238
- nn.init.zeros_(self.proj.weight)
239
-
240
- def forward(self, x):
241
- identity = x
242
- b, c, t, h, w = x.size()
243
- x = rearrange(x, 'b c t h w -> (b t) c h w')
244
- x = self.norm(x)
245
- # compute query, key, value
246
- q, k, v = self.to_qkv(x).reshape(b * t, 1, c * 3,
247
- -1).permute(0, 1, 3,
248
- 2).contiguous().chunk(
249
- 3, dim=-1)
250
-
251
- # apply attention
252
- x = F.scaled_dot_product_attention(
253
- q,
254
- k,
255
- v,
256
- )
257
- x = x.squeeze(1).permute(0, 2, 1).reshape(b * t, c, h, w)
258
-
259
- # output
260
- x = self.proj(x)
261
- x = rearrange(x, '(b t) c h w-> b c t h w', t=t)
262
- return x + identity
263
-
264
-
265
- class Encoder3d(nn.Module):
266
-
267
- def __init__(self,
268
- dim=128,
269
- z_dim=4,
270
- dim_mult=[1, 2, 4, 4],
271
- num_res_blocks=2,
272
- attn_scales=[],
273
- temperal_downsample=[True, True, False],
274
- dropout=0.0):
275
- super().__init__()
276
- self.dim = dim
277
- self.z_dim = z_dim
278
- self.dim_mult = dim_mult
279
- self.num_res_blocks = num_res_blocks
280
- self.attn_scales = attn_scales
281
- self.temperal_downsample = temperal_downsample
282
-
283
- # dimensions
284
- dims = [dim * u for u in [1] + dim_mult]
285
- scale = 1.0
286
-
287
- # init block
288
- self.conv1 = CausalConv3d(3, dims[0], 3, padding=1)
289
-
290
- # downsample blocks
291
- downsamples = []
292
- for i, (in_dim, out_dim) in enumerate(zip(dims[:-1], dims[1:])):
293
- # residual (+attention) blocks
294
- for _ in range(num_res_blocks):
295
- downsamples.append(ResidualBlock(in_dim, out_dim, dropout))
296
- if scale in attn_scales:
297
- downsamples.append(AttentionBlock(out_dim))
298
- in_dim = out_dim
299
-
300
- # downsample block
301
- if i != len(dim_mult) - 1:
302
- mode = 'downsample3d' if temperal_downsample[
303
- i] else 'downsample2d'
304
- downsamples.append(Resample(out_dim, mode=mode))
305
- scale /= 2.0
306
- self.downsamples = nn.Sequential(*downsamples)
307
-
308
- # middle blocks
309
- self.middle = nn.Sequential(
310
- ResidualBlock(out_dim, out_dim, dropout), AttentionBlock(out_dim),
311
- ResidualBlock(out_dim, out_dim, dropout))
312
-
313
- # output blocks
314
- self.head = nn.Sequential(
315
- RMS_norm(out_dim, images=False), nn.SiLU(),
316
- CausalConv3d(out_dim, z_dim, 3, padding=1))
317
-
318
- def forward(self, x, feat_cache=None, feat_idx=[0]):
319
- if feat_cache is not None:
320
- idx = feat_idx[0]
321
- cache_x = x[:, :, -CACHE_T:, :, :].clone()
322
- if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
323
- # cache last frame of last two chunk
324
- cache_x = torch.cat([
325
- feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
326
- cache_x.device), cache_x
327
- ],
328
- dim=2)
329
- x = self.conv1(x, feat_cache[idx])
330
- feat_cache[idx] = cache_x
331
- feat_idx[0] += 1
332
- else:
333
- x = self.conv1(x)
334
-
335
- ## downsamples
336
- for layer in self.downsamples:
337
- if feat_cache is not None:
338
- x = layer(x, feat_cache, feat_idx)
339
- else:
340
- x = layer(x)
341
-
342
- ## middle
343
- for layer in self.middle:
344
- if isinstance(layer, ResidualBlock) and feat_cache is not None:
345
- x = layer(x, feat_cache, feat_idx)
346
- else:
347
- x = layer(x)
348
-
349
- ## head
350
- for layer in self.head:
351
- if isinstance(layer, CausalConv3d) and feat_cache is not None:
352
- idx = feat_idx[0]
353
- cache_x = x[:, :, -CACHE_T:, :, :].clone()
354
- if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
355
- # cache last frame of last two chunk
356
- cache_x = torch.cat([
357
- feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
358
- cache_x.device), cache_x
359
- ],
360
- dim=2)
361
- x = layer(x, feat_cache[idx])
362
- feat_cache[idx] = cache_x
363
- feat_idx[0] += 1
364
- else:
365
- x = layer(x)
366
- return x
367
-
368
-
369
- class Decoder3d(nn.Module):
370
-
371
- def __init__(self,
372
- dim=128,
373
- z_dim=4,
374
- dim_mult=[1, 2, 4, 4],
375
- num_res_blocks=2,
376
- attn_scales=[],
377
- temperal_upsample=[False, True, True],
378
- dropout=0.0):
379
- super().__init__()
380
- self.dim = dim
381
- self.z_dim = z_dim
382
- self.dim_mult = dim_mult
383
- self.num_res_blocks = num_res_blocks
384
- self.attn_scales = attn_scales
385
- self.temperal_upsample = temperal_upsample
386
-
387
- # dimensions
388
- dims = [dim * u for u in [dim_mult[-1]] + dim_mult[::-1]]
389
- scale = 1.0 / 2**(len(dim_mult) - 2)
390
-
391
- # init block
392
- self.conv1 = CausalConv3d(z_dim, dims[0], 3, padding=1)
393
-
394
- # middle blocks
395
- self.middle = nn.Sequential(
396
- ResidualBlock(dims[0], dims[0], dropout), AttentionBlock(dims[0]),
397
- ResidualBlock(dims[0], dims[0], dropout))
398
-
399
- # upsample blocks
400
- upsamples = []
401
- for i, (in_dim, out_dim) in enumerate(zip(dims[:-1], dims[1:])):
402
- # residual (+attention) blocks
403
- if i == 1 or i == 2 or i == 3:
404
- in_dim = in_dim // 2
405
- for _ in range(num_res_blocks + 1):
406
- upsamples.append(ResidualBlock(in_dim, out_dim, dropout))
407
- if scale in attn_scales:
408
- upsamples.append(AttentionBlock(out_dim))
409
- in_dim = out_dim
410
-
411
- # upsample block
412
- if i != len(dim_mult) - 1:
413
- mode = 'upsample3d' if temperal_upsample[i] else 'upsample2d'
414
- upsamples.append(Resample(out_dim, mode=mode))
415
- scale *= 2.0
416
- self.upsamples = nn.Sequential(*upsamples)
417
-
418
- # output blocks
419
- self.head = nn.Sequential(
420
- RMS_norm(out_dim, images=False), nn.SiLU(),
421
- CausalConv3d(out_dim, 3, 3, padding=1))
422
-
423
- def forward(self, x, feat_cache=None, feat_idx=[0]):
424
- ## conv1
425
- if feat_cache is not None:
426
- idx = feat_idx[0]
427
- cache_x = x[:, :, -CACHE_T:, :, :].clone()
428
- if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
429
- # cache last frame of last two chunk
430
- cache_x = torch.cat([
431
- feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
432
- cache_x.device), cache_x
433
- ],
434
- dim=2)
435
- x = self.conv1(x, feat_cache[idx])
436
- feat_cache[idx] = cache_x
437
- feat_idx[0] += 1
438
- else:
439
- x = self.conv1(x)
440
-
441
- ## middle
442
- for layer in self.middle:
443
- if isinstance(layer, ResidualBlock) and feat_cache is not None:
444
- x = layer(x, feat_cache, feat_idx)
445
- else:
446
- x = layer(x)
447
-
448
- ## upsamples
449
- for layer in self.upsamples:
450
- if feat_cache is not None:
451
- x = layer(x, feat_cache, feat_idx)
452
- else:
453
- x = layer(x)
454
-
455
- ## head
456
- for layer in self.head:
457
- if isinstance(layer, CausalConv3d) and feat_cache is not None:
458
- idx = feat_idx[0]
459
- cache_x = x[:, :, -CACHE_T:, :, :].clone()
460
- if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
461
- # cache last frame of last two chunk
462
- cache_x = torch.cat([
463
- feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
464
- cache_x.device), cache_x
465
- ],
466
- dim=2)
467
- x = layer(x, feat_cache[idx])
468
- feat_cache[idx] = cache_x
469
- feat_idx[0] += 1
470
- else:
471
- x = layer(x)
472
- return x
473
-
474
-
475
- def count_conv3d(model):
476
- count = 0
477
- for m in model.modules():
478
- if isinstance(m, CausalConv3d):
479
- count += 1
480
- return count
481
-
482
-
483
- class WanVAE_(nn.Module):
484
-
485
- def __init__(self,
486
- dim=128,
487
- z_dim=4,
488
- dim_mult=[1, 2, 4, 4],
489
- num_res_blocks=2,
490
- attn_scales=[],
491
- temperal_downsample=[True, True, False],
492
- dropout=0.0):
493
- super().__init__()
494
- self.dim = dim
495
- self.z_dim = z_dim
496
- self.dim_mult = dim_mult
497
- self.num_res_blocks = num_res_blocks
498
- self.attn_scales = attn_scales
499
- self.temperal_downsample = temperal_downsample
500
- self.temperal_upsample = temperal_downsample[::-1]
501
-
502
- # modules
503
- self.encoder = Encoder3d(dim, z_dim * 2, dim_mult, num_res_blocks,
504
- attn_scales, self.temperal_downsample, dropout)
505
- self.conv1 = CausalConv3d(z_dim * 2, z_dim * 2, 1)
506
- self.conv2 = CausalConv3d(z_dim, z_dim, 1)
507
- self.decoder = Decoder3d(dim, z_dim, dim_mult, num_res_blocks,
508
- attn_scales, self.temperal_upsample, dropout)
509
-
510
- def forward(self, x):
511
- mu, log_var = self.encode(x)
512
- z = self.reparameterize(mu, log_var)
513
- x_recon = self.decode(z)
514
- return x_recon, mu, log_var
515
-
516
- def encode(self, x, scale):
517
- self.clear_cache()
518
- ## cache
519
- t = x.shape[2]
520
- iter_ = 1 + (t - 1) // 4
521
- ## Split the input x for encoding into temporal chunks of 1, 4, 4, 4, ...
522
- for i in range(iter_):
523
- self._enc_conv_idx = [0]
524
- if i == 0:
525
- out = self.encoder(
526
- x[:, :, :1, :, :],
527
- feat_cache=self._enc_feat_map,
528
- feat_idx=self._enc_conv_idx)
529
- else:
530
- out_ = self.encoder(
531
- x[:, :, 1 + 4 * (i - 1):1 + 4 * i, :, :],
532
- feat_cache=self._enc_feat_map,
533
- feat_idx=self._enc_conv_idx)
534
- out = torch.cat([out, out_], 2)
535
- mu, log_var = self.conv1(out).chunk(2, dim=1)
536
- if isinstance(scale[0], torch.Tensor):
537
- mu = (mu - scale[0].view(1, self.z_dim, 1, 1, 1)) * scale[1].view(
538
- 1, self.z_dim, 1, 1, 1)
539
- else:
540
- mu = (mu - scale[0]) * scale[1]
541
- self.clear_cache()
542
- return mu
543
-
544
- def decode(self, z, scale):
545
- self.clear_cache()
546
- # z: [b,c,t,h,w]
547
- if isinstance(scale[0], torch.Tensor):
548
- z = z / scale[1].view(1, self.z_dim, 1, 1, 1) + scale[0].view(
549
- 1, self.z_dim, 1, 1, 1)
550
- else:
551
- z = z / scale[1] + scale[0]
552
- iter_ = z.shape[2]
553
- x = self.conv2(z)
554
- for i in range(iter_):
555
- self._conv_idx = [0]
556
- if i == 0:
557
- out = self.decoder(
558
- x[:, :, i:i + 1, :, :],
559
- feat_cache=self._feat_map,
560
- feat_idx=self._conv_idx)
561
- else:
562
- out_ = self.decoder(
563
- x[:, :, i:i + 1, :, :],
564
- feat_cache=self._feat_map,
565
- feat_idx=self._conv_idx)
566
- out = torch.cat([out, out_], 2)
567
- self.clear_cache()
568
- return out
569
-
570
- def reparameterize(self, mu, log_var):
571
- std = torch.exp(0.5 * log_var)
572
- eps = torch.randn_like(std)
573
- return eps * std + mu
574
-
575
- def sample(self, imgs, deterministic=False):
576
- mu, log_var = self.encode(imgs)
577
- if deterministic:
578
- return mu
579
- std = torch.exp(0.5 * log_var.clamp(-30.0, 20.0))
580
- return mu + std * torch.randn_like(std)
581
-
582
- def clear_cache(self):
583
- self._conv_num = count_conv3d(self.decoder)
584
- self._conv_idx = [0]
585
- self._feat_map = [None] * self._conv_num
586
- #cache encode
587
- self._enc_conv_num = count_conv3d(self.encoder)
588
- self._enc_conv_idx = [0]
589
- self._enc_feat_map = [None] * self._enc_conv_num
590
-
591
-
592
- def _video_vae(pretrained_path=None, z_dim=None, device='cpu', **kwargs):
593
- """
594
- Autoencoder3d adapted from Stable Diffusion 1.x, 2.x and XL.
595
- """
596
- # params
597
- cfg = dict(
598
- dim=96,
599
- z_dim=z_dim,
600
- dim_mult=[1, 2, 4, 4],
601
- num_res_blocks=2,
602
- attn_scales=[],
603
- temperal_downsample=[False, True, True],
604
- dropout=0.0)
605
- cfg.update(**kwargs)
606
-
607
- # init model
608
- with torch.device('meta'):
609
- model = WanVAE_(**cfg)
610
-
611
- # load checkpoint
612
- logging.info(f'loading {pretrained_path}')
613
- model.load_state_dict(
614
- torch.load(pretrained_path, map_location=device), assign=True)
615
-
616
- return model
617
-
618
-
619
- class Wan2_1_VAE:
620
-
621
- def __init__(self,
622
- z_dim=16,
623
- vae_pth='cache/vae_step_411000.pth',
624
- dtype=torch.float,
625
- device="cuda"):
626
- self.dtype = dtype
627
- self.device = device
628
-
629
- mean = [
630
- -0.7571, -0.7089, -0.9113, 0.1075, -0.1745, 0.9653, -0.1517, 1.5508,
631
- 0.4134, -0.0715, 0.5517, -0.3632, -0.1922, -0.9497, 0.2503, -0.2921
632
- ]
633
- std = [
634
- 2.8184, 1.4541, 2.3275, 2.6558, 1.2196, 1.7708, 2.6052, 2.0743,
635
- 3.2687, 2.1526, 2.8652, 1.5579, 1.6382, 1.1253, 2.8251, 1.9160
636
- ]
637
- self.mean = torch.tensor(mean, dtype=dtype, device=device)
638
- self.std = torch.tensor(std, dtype=dtype, device=device)
639
- self.scale = [self.mean, 1.0 / self.std]
640
-
641
- # init model
642
- self.model = _video_vae(
643
- pretrained_path=vae_pth,
644
- z_dim=z_dim,
645
- ).eval().requires_grad_(False).to(device)
646
-
647
- def encode(self, videos):
648
- """
649
- videos: A list of videos each with shape [C, T, H, W].
650
- """
651
- with amp.autocast(dtype=self.dtype):
652
- return [
653
- self.model.encode(u.unsqueeze(0), self.scale).float().squeeze(0)
654
- for u in videos
655
- ]
656
-
657
- def decode(self, zs):
658
- with amp.autocast(dtype=self.dtype):
659
- return [
660
- self.model.decode(u.unsqueeze(0),
661
- self.scale).float().clamp_(-1, 1).squeeze(0)
662
- for u in zs
663
- ]
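A round-trip sketch for the `Wan2_1_VAE` wrapper above, again assuming a pre-commit checkout; the weight path is a placeholder. Frame counts follow the `1 + 4k` temporal chunking used by `encode()`, and spatial sizes shrink by 8x in the latent:

```python
# Encode/decode round trip with the Wan2_1_VAE wrapper above (sketch only).
import torch
from wan.modules.vae2_1 import Wan2_1_VAE

vae = Wan2_1_VAE(z_dim=16, vae_pth='Wan2.1_VAE.pth', device='cuda')  # placeholder path
video = torch.rand(3, 17, 256, 256, device='cuda') * 2 - 1  # [C, T, H, W] in [-1, 1], T = 1 + 4k
latent = vae.encode([video])[0]   # [16, 5, 32, 32]: T -> 1 + (T - 1) / 4, H and W -> / 8
recon = vae.decode([latent])[0]   # [3, 17, 256, 256], clamped to [-1, 1]
```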
wan/modules/vae2_2.py DELETED
@@ -1,1051 +0,0 @@
1
- # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
2
- import logging
3
-
4
- import torch
5
- import torch.cuda.amp as amp
6
- import torch.nn as nn
7
- import torch.nn.functional as F
8
- from einops import rearrange
9
-
10
- __all__ = [
11
- "Wan2_2_VAE",
12
- ]
13
-
14
- CACHE_T = 2
15
-
16
-
17
- class CausalConv3d(nn.Conv3d):
18
- """
19
- Causal 3d convolution.
20
- """
21
-
22
- def __init__(self, *args, **kwargs):
23
- super().__init__(*args, **kwargs)
24
- self._padding = (
25
- self.padding[2],
26
- self.padding[2],
27
- self.padding[1],
28
- self.padding[1],
29
- 2 * self.padding[0],
30
- 0,
31
- )
32
- self.padding = (0, 0, 0)
33
-
34
- def forward(self, x, cache_x=None):
35
- padding = list(self._padding)
36
- if cache_x is not None and self._padding[4] > 0:
37
- cache_x = cache_x.to(x.device)
38
- x = torch.cat([cache_x, x], dim=2)
39
- padding[4] -= cache_x.shape[2]
40
- x = F.pad(x, padding)
41
-
42
- return super().forward(x)
43
-
44
-
45
- class RMS_norm(nn.Module):
46
-
47
- def __init__(self, dim, channel_first=True, images=True, bias=False):
48
- super().__init__()
49
- broadcastable_dims = (1, 1, 1) if not images else (1, 1)
50
- shape = (dim, *broadcastable_dims) if channel_first else (dim,)
51
-
52
- self.channel_first = channel_first
53
- self.scale = dim**0.5
54
- self.gamma = nn.Parameter(torch.ones(shape))
55
- self.bias = nn.Parameter(torch.zeros(shape)) if bias else 0.0
56
-
57
- def forward(self, x):
58
- return (F.normalize(x, dim=(1 if self.channel_first else -1)) *
59
- self.scale * self.gamma + self.bias)
60
-
61
-
62
- class Upsample(nn.Upsample):
63
-
64
- def forward(self, x):
65
- """
66
- Fix bfloat16 support for nearest neighbor interpolation.
67
- """
68
- return super().forward(x.float()).type_as(x)
69
-
70
-
71
- class Resample(nn.Module):
72
-
73
- def __init__(self, dim, mode):
74
- assert mode in (
75
- "none",
76
- "upsample2d",
77
- "upsample3d",
78
- "downsample2d",
79
- "downsample3d",
80
- )
81
- super().__init__()
82
- self.dim = dim
83
- self.mode = mode
84
-
85
- # layers
86
- if mode == "upsample2d":
87
- self.resample = nn.Sequential(
88
- Upsample(scale_factor=(2.0, 2.0), mode="nearest-exact"),
89
- nn.Conv2d(dim, dim, 3, padding=1),
90
- )
91
- elif mode == "upsample3d":
92
- self.resample = nn.Sequential(
93
- Upsample(scale_factor=(2.0, 2.0), mode="nearest-exact"),
94
- nn.Conv2d(dim, dim, 3, padding=1),
95
- # nn.Conv2d(dim, dim//2, 3, padding=1)
96
- )
97
- self.time_conv = CausalConv3d(
98
- dim, dim * 2, (3, 1, 1), padding=(1, 0, 0))
99
- elif mode == "downsample2d":
100
- self.resample = nn.Sequential(
101
- nn.ZeroPad2d((0, 1, 0, 1)),
102
- nn.Conv2d(dim, dim, 3, stride=(2, 2)))
103
- elif mode == "downsample3d":
104
- self.resample = nn.Sequential(
105
- nn.ZeroPad2d((0, 1, 0, 1)),
106
- nn.Conv2d(dim, dim, 3, stride=(2, 2)))
107
- self.time_conv = CausalConv3d(
108
- dim, dim, (3, 1, 1), stride=(2, 1, 1), padding=(0, 0, 0))
109
- else:
110
- self.resample = nn.Identity()
111
-
112
- def forward(self, x, feat_cache=None, feat_idx=[0]):
113
- b, c, t, h, w = x.size()
114
- if self.mode == "upsample3d":
115
- if feat_cache is not None:
116
- idx = feat_idx[0]
117
- if feat_cache[idx] is None:
118
- feat_cache[idx] = "Rep"
119
- feat_idx[0] += 1
120
- else:
121
- cache_x = x[:, :, -CACHE_T:, :, :].clone()
122
- if (cache_x.shape[2] < 2 and feat_cache[idx] is not None and
123
- feat_cache[idx] != "Rep"):
124
- # cache last frame of last two chunk
125
- cache_x = torch.cat(
126
- [
127
- feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
128
- cache_x.device),
129
- cache_x,
130
- ],
131
- dim=2,
132
- )
133
- if (cache_x.shape[2] < 2 and feat_cache[idx] is not None and
134
- feat_cache[idx] == "Rep"):
135
- cache_x = torch.cat(
136
- [
137
- torch.zeros_like(cache_x).to(cache_x.device),
138
- cache_x
139
- ],
140
- dim=2,
141
- )
142
- if feat_cache[idx] == "Rep":
143
- x = self.time_conv(x)
144
- else:
145
- x = self.time_conv(x, feat_cache[idx])
146
- feat_cache[idx] = cache_x
147
- feat_idx[0] += 1
148
- x = x.reshape(b, 2, c, t, h, w)
149
- x = torch.stack((x[:, 0, :, :, :, :], x[:, 1, :, :, :, :]),
150
- 3)
151
- x = x.reshape(b, c, t * 2, h, w)
152
- t = x.shape[2]
153
- x = rearrange(x, "b c t h w -> (b t) c h w")
154
- x = self.resample(x)
155
- x = rearrange(x, "(b t) c h w -> b c t h w", t=t)
156
-
157
- if self.mode == "downsample3d":
158
- if feat_cache is not None:
159
- idx = feat_idx[0]
160
- if feat_cache[idx] is None:
161
- feat_cache[idx] = x.clone()
162
- feat_idx[0] += 1
163
- else:
164
- cache_x = x[:, :, -1:, :, :].clone()
165
- x = self.time_conv(
166
- torch.cat([feat_cache[idx][:, :, -1:, :, :], x], 2))
167
- feat_cache[idx] = cache_x
168
- feat_idx[0] += 1
169
- return x
170
-
171
- def init_weight(self, conv):
172
- conv_weight = conv.weight.detach().clone()
173
- nn.init.zeros_(conv_weight)
174
- c1, c2, t, h, w = conv_weight.size()
175
- one_matrix = torch.eye(c1, c2)
176
- init_matrix = one_matrix
177
- nn.init.zeros_(conv_weight)
178
- conv_weight.data[:, :, 1, 0, 0] = init_matrix # * 0.5
179
- conv.weight = nn.Parameter(conv_weight)
180
- nn.init.zeros_(conv.bias.data)
181
-
182
- def init_weight2(self, conv):
183
- conv_weight = conv.weight.data.detach().clone()
184
- nn.init.zeros_(conv_weight)
185
- c1, c2, t, h, w = conv_weight.size()
186
- init_matrix = torch.eye(c1 // 2, c2)
187
- conv_weight[:c1 // 2, :, -1, 0, 0] = init_matrix
188
- conv_weight[c1 // 2:, :, -1, 0, 0] = init_matrix
189
- conv.weight = nn.Parameter(conv_weight)
190
- nn.init.zeros_(conv.bias.data)
191
-
192
-
193
- class ResidualBlock(nn.Module):
194
-
195
- def __init__(self, in_dim, out_dim, dropout=0.0):
196
- super().__init__()
197
- self.in_dim = in_dim
198
- self.out_dim = out_dim
199
-
200
- # layers
201
- self.residual = nn.Sequential(
202
- RMS_norm(in_dim, images=False),
203
- nn.SiLU(),
204
- CausalConv3d(in_dim, out_dim, 3, padding=1),
205
- RMS_norm(out_dim, images=False),
206
- nn.SiLU(),
207
- nn.Dropout(dropout),
208
- CausalConv3d(out_dim, out_dim, 3, padding=1),
209
- )
210
- self.shortcut = (
211
- CausalConv3d(in_dim, out_dim, 1)
212
- if in_dim != out_dim else nn.Identity())
213
-
214
- def forward(self, x, feat_cache=None, feat_idx=[0]):
215
- h = self.shortcut(x)
216
- for layer in self.residual:
217
- if isinstance(layer, CausalConv3d) and feat_cache is not None:
218
- idx = feat_idx[0]
219
- cache_x = x[:, :, -CACHE_T:, :, :].clone()
220
- if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
221
- # cache last frame of last two chunk
222
- cache_x = torch.cat(
223
- [
224
- feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
225
- cache_x.device),
226
- cache_x,
227
- ],
228
- dim=2,
229
- )
230
- x = layer(x, feat_cache[idx])
231
- feat_cache[idx] = cache_x
232
- feat_idx[0] += 1
233
- else:
234
- x = layer(x)
235
- return x + h
236
-
237
-
238
- class AttentionBlock(nn.Module):
239
- """
240
- Causal self-attention with a single head.
241
- """
242
-
243
- def __init__(self, dim):
244
- super().__init__()
245
- self.dim = dim
246
-
247
- # layers
248
- self.norm = RMS_norm(dim)
249
- self.to_qkv = nn.Conv2d(dim, dim * 3, 1)
250
- self.proj = nn.Conv2d(dim, dim, 1)
251
-
252
- # zero out the last layer params
253
- nn.init.zeros_(self.proj.weight)
254
-
255
- def forward(self, x):
256
- identity = x
257
- b, c, t, h, w = x.size()
258
- x = rearrange(x, "b c t h w -> (b t) c h w")
259
- x = self.norm(x)
260
- # compute query, key, value
261
- q, k, v = (
262
- self.to_qkv(x).reshape(b * t, 1, c * 3,
263
- -1).permute(0, 1, 3,
264
- 2).contiguous().chunk(3, dim=-1))
265
-
266
- # apply attention
267
- x = F.scaled_dot_product_attention(
268
- q,
269
- k,
270
- v,
271
- )
272
- x = x.squeeze(1).permute(0, 2, 1).reshape(b * t, c, h, w)
273
-
274
- # output
275
- x = self.proj(x)
276
- x = rearrange(x, "(b t) c h w-> b c t h w", t=t)
277
- return x + identity
278
-
279
-
280
- def patchify(x, patch_size):
281
- if patch_size == 1:
282
- return x
283
- if x.dim() == 4:
284
- x = rearrange(
285
- x, "b c (h q) (w r) -> b (c r q) h w", q=patch_size, r=patch_size)
286
- elif x.dim() == 5:
287
- x = rearrange(
288
- x,
289
- "b c f (h q) (w r) -> b (c r q) f h w",
290
- q=patch_size,
291
- r=patch_size,
292
- )
293
- else:
294
- raise ValueError(f"Invalid input shape: {x.shape}")
295
-
296
- return x
297
-
298
-
299
- def unpatchify(x, patch_size):
300
- if patch_size == 1:
301
- return x
302
-
303
- if x.dim() == 4:
304
- x = rearrange(
305
- x, "b (c r q) h w -> b c (h q) (w r)", q=patch_size, r=patch_size)
306
- elif x.dim() == 5:
307
- x = rearrange(
308
- x,
309
- "b (c r q) f h w -> b c f (h q) (w r)",
310
- q=patch_size,
311
- r=patch_size,
312
- )
313
- return x
314
-
315
-
316
- class AvgDown3D(nn.Module):
317
-
318
- def __init__(
319
- self,
320
- in_channels,
321
- out_channels,
322
- factor_t,
323
- factor_s=1,
324
- ):
325
- super().__init__()
326
- self.in_channels = in_channels
327
- self.out_channels = out_channels
328
- self.factor_t = factor_t
329
- self.factor_s = factor_s
330
- self.factor = self.factor_t * self.factor_s * self.factor_s
331
-
332
- assert in_channels * self.factor % out_channels == 0
333
- self.group_size = in_channels * self.factor // out_channels
334
-
335
- def forward(self, x: torch.Tensor) -> torch.Tensor:
336
- pad_t = (self.factor_t - x.shape[2] % self.factor_t) % self.factor_t
337
- pad = (0, 0, 0, 0, pad_t, 0)
338
- x = F.pad(x, pad)
339
- B, C, T, H, W = x.shape
340
- x = x.view(
341
- B,
342
- C,
343
- T // self.factor_t,
344
- self.factor_t,
345
- H // self.factor_s,
346
- self.factor_s,
347
- W // self.factor_s,
348
- self.factor_s,
349
- )
350
- x = x.permute(0, 1, 3, 5, 7, 2, 4, 6).contiguous()
351
- x = x.view(
352
- B,
353
- C * self.factor,
354
- T // self.factor_t,
355
- H // self.factor_s,
356
- W // self.factor_s,
357
- )
358
- x = x.view(
359
- B,
360
- self.out_channels,
361
- self.group_size,
362
- T // self.factor_t,
363
- H // self.factor_s,
364
- W // self.factor_s,
365
- )
366
- x = x.mean(dim=2)
367
- return x
368
-
369
-
370
- class DupUp3D(nn.Module):
371
-
372
- def __init__(
373
- self,
374
- in_channels: int,
375
- out_channels: int,
376
- factor_t,
377
- factor_s=1,
378
- ):
379
- super().__init__()
380
- self.in_channels = in_channels
381
- self.out_channels = out_channels
382
-
383
- self.factor_t = factor_t
384
- self.factor_s = factor_s
385
- self.factor = self.factor_t * self.factor_s * self.factor_s
386
-
387
- assert out_channels * self.factor % in_channels == 0
388
- self.repeats = out_channels * self.factor // in_channels
389
-
390
- def forward(self, x: torch.Tensor, first_chunk=False) -> torch.Tensor:
391
- x = x.repeat_interleave(self.repeats, dim=1)
392
- x = x.view(
393
- x.size(0),
394
- self.out_channels,
395
- self.factor_t,
396
- self.factor_s,
397
- self.factor_s,
398
- x.size(2),
399
- x.size(3),
400
- x.size(4),
401
- )
402
- x = x.permute(0, 1, 5, 2, 6, 3, 7, 4).contiguous()
403
- x = x.view(
404
- x.size(0),
405
- self.out_channels,
406
- x.size(2) * self.factor_t,
407
- x.size(4) * self.factor_s,
408
- x.size(6) * self.factor_s,
409
- )
410
- if first_chunk:
411
- x = x[:, :, self.factor_t - 1:, :, :]
412
- return x
413
-
414
-
415
- class Down_ResidualBlock(nn.Module):
416
-
417
- def __init__(self,
418
- in_dim,
419
- out_dim,
420
- dropout,
421
- mult,
422
- temperal_downsample=False,
423
- down_flag=False):
424
- super().__init__()
425
-
426
- # Shortcut path with downsample
427
- self.avg_shortcut = AvgDown3D(
428
- in_dim,
429
- out_dim,
430
- factor_t=2 if temperal_downsample else 1,
431
- factor_s=2 if down_flag else 1,
432
- )
433
-
434
- # Main path with residual blocks and downsample
435
- downsamples = []
436
- for _ in range(mult):
437
- downsamples.append(ResidualBlock(in_dim, out_dim, dropout))
438
- in_dim = out_dim
439
-
440
- # Add the final downsample block
441
- if down_flag:
442
- mode = "downsample3d" if temperal_downsample else "downsample2d"
443
- downsamples.append(Resample(out_dim, mode=mode))
444
-
445
- self.downsamples = nn.Sequential(*downsamples)
446
-
447
- def forward(self, x, feat_cache=None, feat_idx=[0]):
448
- x_copy = x.clone()
449
- for module in self.downsamples:
450
- x = module(x, feat_cache, feat_idx)
451
-
452
- return x + self.avg_shortcut(x_copy)
453
-
454
-
455
- class Up_ResidualBlock(nn.Module):
456
-
457
- def __init__(self,
458
- in_dim,
459
- out_dim,
460
- dropout,
461
- mult,
462
- temperal_upsample=False,
463
- up_flag=False):
464
- super().__init__()
465
- # Shortcut path with upsample
466
- if up_flag:
467
- self.avg_shortcut = DupUp3D(
468
- in_dim,
469
- out_dim,
470
- factor_t=2 if temperal_upsample else 1,
471
- factor_s=2 if up_flag else 1,
472
- )
473
- else:
474
- self.avg_shortcut = None
475
-
476
- # Main path with residual blocks and upsample
477
- upsamples = []
478
- for _ in range(mult):
479
- upsamples.append(ResidualBlock(in_dim, out_dim, dropout))
480
- in_dim = out_dim
481
-
482
- # Add the final upsample block
483
- if up_flag:
484
- mode = "upsample3d" if temperal_upsample else "upsample2d"
485
- upsamples.append(Resample(out_dim, mode=mode))
486
-
487
- self.upsamples = nn.Sequential(*upsamples)
488
-
489
- def forward(self, x, feat_cache=None, feat_idx=[0], first_chunk=False):
490
- x_main = x.clone()
491
- for module in self.upsamples:
492
- x_main = module(x_main, feat_cache, feat_idx)
493
- if self.avg_shortcut is not None:
494
- x_shortcut = self.avg_shortcut(x, first_chunk)
495
- return x_main + x_shortcut
496
- else:
497
- return x_main
498
-
499
-
500
- class Encoder3d(nn.Module):
501
-
502
- def __init__(
503
- self,
504
- dim=128,
505
- z_dim=4,
506
- dim_mult=[1, 2, 4, 4],
507
- num_res_blocks=2,
508
- attn_scales=[],
509
- temperal_downsample=[True, True, False],
510
- dropout=0.0,
511
- ):
512
- super().__init__()
513
- self.dim = dim
514
- self.z_dim = z_dim
515
- self.dim_mult = dim_mult
516
- self.num_res_blocks = num_res_blocks
517
- self.attn_scales = attn_scales
518
- self.temperal_downsample = temperal_downsample
519
-
520
- # dimensions
521
- dims = [dim * u for u in [1] + dim_mult]
522
- scale = 1.0
523
-
524
- # init block
525
- self.conv1 = CausalConv3d(12, dims[0], 3, padding=1)
526
-
527
- # downsample blocks
528
- downsamples = []
529
- for i, (in_dim, out_dim) in enumerate(zip(dims[:-1], dims[1:])):
530
- t_down_flag = (
531
- temperal_downsample[i]
532
- if i < len(temperal_downsample) else False)
533
- downsamples.append(
534
- Down_ResidualBlock(
535
- in_dim=in_dim,
536
- out_dim=out_dim,
537
- dropout=dropout,
538
- mult=num_res_blocks,
539
- temperal_downsample=t_down_flag,
540
- down_flag=i != len(dim_mult) - 1,
541
- ))
542
- scale /= 2.0
543
- self.downsamples = nn.Sequential(*downsamples)
544
-
545
- # middle blocks
546
- self.middle = nn.Sequential(
547
- ResidualBlock(out_dim, out_dim, dropout),
548
- AttentionBlock(out_dim),
549
- ResidualBlock(out_dim, out_dim, dropout),
550
- )
551
-
552
- # # output blocks
553
- self.head = nn.Sequential(
554
- RMS_norm(out_dim, images=False),
555
- nn.SiLU(),
556
- CausalConv3d(out_dim, z_dim, 3, padding=1),
557
- )
558
-
559
- def forward(self, x, feat_cache=None, feat_idx=[0]):
560
-
561
- if feat_cache is not None:
562
- idx = feat_idx[0]
563
- cache_x = x[:, :, -CACHE_T:, :, :].clone()
564
- if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
565
- cache_x = torch.cat(
566
- [
567
- feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
568
- cache_x.device),
569
- cache_x,
570
- ],
571
- dim=2,
572
- )
573
- x = self.conv1(x, feat_cache[idx])
574
- feat_cache[idx] = cache_x
575
- feat_idx[0] += 1
576
- else:
577
- x = self.conv1(x)
578
-
579
- ## downsamples
580
- for layer in self.downsamples:
581
- if feat_cache is not None:
582
- x = layer(x, feat_cache, feat_idx)
583
- else:
584
- x = layer(x)
585
-
586
- ## middle
587
- for layer in self.middle:
588
- if isinstance(layer, ResidualBlock) and feat_cache is not None:
589
- x = layer(x, feat_cache, feat_idx)
590
- else:
591
- x = layer(x)
592
-
593
- ## head
594
- for layer in self.head:
595
- if isinstance(layer, CausalConv3d) and feat_cache is not None:
596
- idx = feat_idx[0]
597
- cache_x = x[:, :, -CACHE_T:, :, :].clone()
598
- if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
599
- cache_x = torch.cat(
600
- [
601
- feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
602
- cache_x.device),
603
- cache_x,
604
- ],
605
- dim=2,
606
- )
607
- x = layer(x, feat_cache[idx])
608
- feat_cache[idx] = cache_x
609
- feat_idx[0] += 1
610
- else:
611
- x = layer(x)
612
-
613
- return x
614
-
615
-
616
- class Decoder3d(nn.Module):
617
-
618
- def __init__(
619
- self,
620
- dim=128,
621
- z_dim=4,
622
- dim_mult=[1, 2, 4, 4],
623
- num_res_blocks=2,
624
- attn_scales=[],
625
- temperal_upsample=[False, True, True],
626
- dropout=0.0,
627
- ):
628
- super().__init__()
629
- self.dim = dim
630
- self.z_dim = z_dim
631
- self.dim_mult = dim_mult
632
- self.num_res_blocks = num_res_blocks
633
- self.attn_scales = attn_scales
634
- self.temperal_upsample = temperal_upsample
635
-
636
- # dimensions
637
- dims = [dim * u for u in [dim_mult[-1]] + dim_mult[::-1]]
638
- scale = 1.0 / 2**(len(dim_mult) - 2)
639
- # init block
640
- self.conv1 = CausalConv3d(z_dim, dims[0], 3, padding=1)
641
-
642
- # middle blocks
643
- self.middle = nn.Sequential(
644
- ResidualBlock(dims[0], dims[0], dropout),
645
- AttentionBlock(dims[0]),
646
- ResidualBlock(dims[0], dims[0], dropout),
647
- )
648
-
649
- # upsample blocks
650
- upsamples = []
651
- for i, (in_dim, out_dim) in enumerate(zip(dims[:-1], dims[1:])):
652
- t_up_flag = temperal_upsample[i] if i < len(
653
- temperal_upsample) else False
654
- upsamples.append(
655
- Up_ResidualBlock(
656
- in_dim=in_dim,
657
- out_dim=out_dim,
658
- dropout=dropout,
659
- mult=num_res_blocks + 1,
660
- temperal_upsample=t_up_flag,
661
- up_flag=i != len(dim_mult) - 1,
662
- ))
663
- self.upsamples = nn.Sequential(*upsamples)
664
-
665
- # output blocks
666
- self.head = nn.Sequential(
667
- RMS_norm(out_dim, images=False),
668
- nn.SiLU(),
669
- CausalConv3d(out_dim, 12, 3, padding=1),
670
- )
671
-
672
- def forward(self, x, feat_cache=None, feat_idx=[0], first_chunk=False):
673
- if feat_cache is not None:
674
- idx = feat_idx[0]
675
- cache_x = x[:, :, -CACHE_T:, :, :].clone()
676
- if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
677
- cache_x = torch.cat(
678
- [
679
- feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
680
- cache_x.device),
681
- cache_x,
682
- ],
683
- dim=2,
684
- )
685
- x = self.conv1(x, feat_cache[idx])
686
- feat_cache[idx] = cache_x
687
- feat_idx[0] += 1
688
- else:
689
- x = self.conv1(x)
690
-
691
- for layer in self.middle:
692
- if isinstance(layer, ResidualBlock) and feat_cache is not None:
693
- x = layer(x, feat_cache, feat_idx)
694
- else:
695
- x = layer(x)
696
-
697
- ## upsamples
698
- for layer in self.upsamples:
699
- if feat_cache is not None:
700
- x = layer(x, feat_cache, feat_idx, first_chunk)
701
- else:
702
- x = layer(x)
703
-
704
- ## head
705
- for layer in self.head:
706
- if isinstance(layer, CausalConv3d) and feat_cache is not None:
707
- idx = feat_idx[0]
708
- cache_x = x[:, :, -CACHE_T:, :, :].clone()
709
- if cache_x.shape[2] < 2 and feat_cache[idx] is not None:
710
- cache_x = torch.cat(
711
- [
712
- feat_cache[idx][:, :, -1, :, :].unsqueeze(2).to(
713
- cache_x.device),
714
- cache_x,
715
- ],
716
- dim=2,
717
- )
718
- x = layer(x, feat_cache[idx])
719
- feat_cache[idx] = cache_x
720
- feat_idx[0] += 1
721
- else:
722
- x = layer(x)
723
- return x
724
-
725
-
726
- def count_conv3d(model):
727
- count = 0
728
- for m in model.modules():
729
- if isinstance(m, CausalConv3d):
730
- count += 1
731
- return count
732
-
733
-
734
- class WanVAE_(nn.Module):
735
-
736
- def __init__(
737
- self,
738
- dim=160,
739
- dec_dim=256,
740
- z_dim=16,
741
- dim_mult=[1, 2, 4, 4],
742
- num_res_blocks=2,
743
- attn_scales=[],
744
- temperal_downsample=[True, True, False],
745
- dropout=0.0,
746
- ):
747
- super().__init__()
748
- self.dim = dim
749
- self.z_dim = z_dim
750
- self.dim_mult = dim_mult
751
- self.num_res_blocks = num_res_blocks
752
- self.attn_scales = attn_scales
753
- self.temperal_downsample = temperal_downsample
754
- self.temperal_upsample = temperal_downsample[::-1]
755
-
756
- # modules
757
- self.encoder = Encoder3d(
758
- dim,
759
- z_dim * 2,
760
- dim_mult,
761
- num_res_blocks,
762
- attn_scales,
763
- self.temperal_downsample,
764
- dropout,
765
- )
766
- self.conv1 = CausalConv3d(z_dim * 2, z_dim * 2, 1)
767
- self.conv2 = CausalConv3d(z_dim, z_dim, 1)
768
- self.decoder = Decoder3d(
769
- dec_dim,
770
- z_dim,
771
- dim_mult,
772
- num_res_blocks,
773
- attn_scales,
774
- self.temperal_upsample,
775
- dropout,
776
- )
777
-
778
- def forward(self, x, scale=[0, 1]):
779
- mu = self.encode(x, scale)
780
- x_recon = self.decode(mu, scale)
781
- return x_recon, mu
782
-
783
- def encode(self, x, scale):
784
- self.clear_cache()
785
- x = patchify(x, patch_size=2)
786
- t = x.shape[2]
787
- iter_ = 1 + (t - 1) // 4
788
- for i in range(iter_):
789
- self._enc_conv_idx = [0]
790
- if i == 0:
791
- out = self.encoder(
792
- x[:, :, :1, :, :],
793
- feat_cache=self._enc_feat_map,
794
- feat_idx=self._enc_conv_idx,
795
- )
796
- else:
797
- out_ = self.encoder(
798
- x[:, :, 1 + 4 * (i - 1):1 + 4 * i, :, :],
799
- feat_cache=self._enc_feat_map,
800
- feat_idx=self._enc_conv_idx,
801
- )
802
- out = torch.cat([out, out_], 2)
803
- mu, log_var = self.conv1(out).chunk(2, dim=1)
804
- if isinstance(scale[0], torch.Tensor):
805
- mu = (mu - scale[0].view(1, self.z_dim, 1, 1, 1)) * scale[1].view(
806
- 1, self.z_dim, 1, 1, 1)
807
- else:
808
- mu = (mu - scale[0]) * scale[1]
809
- self.clear_cache()
810
- return mu
811
-
812
- def decode(self, z, scale):
813
- self.clear_cache()
814
- if isinstance(scale[0], torch.Tensor):
815
- z = z / scale[1].view(1, self.z_dim, 1, 1, 1) + scale[0].view(
816
- 1, self.z_dim, 1, 1, 1)
817
- else:
818
- z = z / scale[1] + scale[0]
819
- iter_ = z.shape[2]
820
- x = self.conv2(z)
821
- for i in range(iter_):
822
- self._conv_idx = [0]
823
- if i == 0:
824
- out = self.decoder(
825
- x[:, :, i:i + 1, :, :],
826
- feat_cache=self._feat_map,
827
- feat_idx=self._conv_idx,
828
- first_chunk=True,
829
- )
830
- else:
831
- out_ = self.decoder(
832
- x[:, :, i:i + 1, :, :],
833
- feat_cache=self._feat_map,
834
- feat_idx=self._conv_idx,
835
- )
836
- out = torch.cat([out, out_], 2)
837
- out = unpatchify(out, patch_size=2)
838
- self.clear_cache()
839
- return out
840
-
841
- def reparameterize(self, mu, log_var):
842
- std = torch.exp(0.5 * log_var)
843
- eps = torch.randn_like(std)
844
- return eps * std + mu
845
-
846
- def sample(self, imgs, deterministic=False):
847
- mu, log_var = self.encode(imgs)
848
- if deterministic:
849
- return mu
850
- std = torch.exp(0.5 * log_var.clamp(-30.0, 20.0))
851
- return mu + std * torch.randn_like(std)
852
-
853
- def clear_cache(self):
854
- self._conv_num = count_conv3d(self.decoder)
855
- self._conv_idx = [0]
856
- self._feat_map = [None] * self._conv_num
857
- # cache encode
858
- self._enc_conv_num = count_conv3d(self.encoder)
859
- self._enc_conv_idx = [0]
860
- self._enc_feat_map = [None] * self._enc_conv_num
861
-
862
-
863
- def _video_vae(pretrained_path=None, z_dim=16, dim=160, device="cpu", **kwargs):
864
- # params
865
- cfg = dict(
866
- dim=dim,
867
- z_dim=z_dim,
868
- dim_mult=[1, 2, 4, 4],
869
- num_res_blocks=2,
870
- attn_scales=[],
871
- temperal_downsample=[True, True, True],
872
- dropout=0.0,
873
- )
874
- cfg.update(**kwargs)
875
-
876
- # init model
877
- with torch.device("meta"):
878
- model = WanVAE_(**cfg)
879
-
880
- # load checkpoint
881
- logging.info(f"loading {pretrained_path}")
882
- model.load_state_dict(
883
- torch.load(pretrained_path, map_location=device), assign=True)
884
-
885
- return model
886
-
887
-
888
- class Wan2_2_VAE:
889
-
890
- def __init__(
891
- self,
892
- z_dim=48,
893
- c_dim=160,
894
- vae_pth=None,
895
- dim_mult=[1, 2, 4, 4],
896
- temperal_downsample=[False, True, True],
897
- dtype=torch.float,
898
- device="cuda",
899
- ):
900
-
901
- self.dtype = dtype
902
- self.device = device
903
-
904
- mean = torch.tensor(
905
- [
906
- -0.2289,
907
- -0.0052,
908
- -0.1323,
909
- -0.2339,
910
- -0.2799,
911
- 0.0174,
912
- 0.1838,
913
- 0.1557,
914
- -0.1382,
915
- 0.0542,
916
- 0.2813,
917
- 0.0891,
918
- 0.1570,
919
- -0.0098,
920
- 0.0375,
921
- -0.1825,
922
- -0.2246,
923
- -0.1207,
924
- -0.0698,
925
- 0.5109,
926
- 0.2665,
927
- -0.2108,
928
- -0.2158,
929
- 0.2502,
930
- -0.2055,
931
- -0.0322,
932
- 0.1109,
933
- 0.1567,
934
- -0.0729,
935
- 0.0899,
936
- -0.2799,
937
- -0.1230,
938
- -0.0313,
939
- -0.1649,
940
- 0.0117,
941
- 0.0723,
942
- -0.2839,
943
- -0.2083,
944
- -0.0520,
945
- 0.3748,
946
- 0.0152,
947
- 0.1957,
948
- 0.1433,
949
- -0.2944,
950
- 0.3573,
951
- -0.0548,
952
- -0.1681,
953
- -0.0667,
954
- ],
955
- dtype=dtype,
956
- device=device,
957
- )
958
- std = torch.tensor(
959
- [
960
- 0.4765,
961
- 1.0364,
962
- 0.4514,
963
- 1.1677,
964
- 0.5313,
965
- 0.4990,
966
- 0.4818,
967
- 0.5013,
968
- 0.8158,
969
- 1.0344,
970
- 0.5894,
971
- 1.0901,
972
- 0.6885,
973
- 0.6165,
974
- 0.8454,
975
- 0.4978,
976
- 0.5759,
977
- 0.3523,
978
- 0.7135,
979
- 0.6804,
980
- 0.5833,
981
- 1.4146,
982
- 0.8986,
983
- 0.5659,
984
- 0.7069,
985
- 0.5338,
986
- 0.4889,
987
- 0.4917,
988
- 0.4069,
989
- 0.4999,
990
- 0.6866,
991
- 0.4093,
992
- 0.5709,
993
- 0.6065,
994
- 0.6415,
995
- 0.4944,
996
- 0.5726,
997
- 1.2042,
998
- 0.5458,
999
- 1.6887,
1000
- 0.3971,
1001
- 1.0600,
1002
- 0.3943,
1003
- 0.5537,
1004
- 0.5444,
1005
- 0.4089,
1006
- 0.7468,
1007
- 0.7744,
1008
- ],
1009
- dtype=dtype,
1010
- device=device,
1011
- )
1012
- self.scale = [mean, 1.0 / std]
1013
-
1014
- # init model
1015
- self.model = (
1016
- _video_vae(
1017
- pretrained_path=vae_pth,
1018
- z_dim=z_dim,
1019
- dim=c_dim,
1020
- dim_mult=dim_mult,
1021
- temperal_downsample=temperal_downsample,
1022
- ).eval().requires_grad_(False).to(device))
1023
-
1024
- def encode(self, videos):
1025
- try:
1026
- if not isinstance(videos, list):
1027
- raise TypeError("videos should be a list")
1028
- with amp.autocast(dtype=self.dtype):
1029
- return [
1030
- self.model.encode(u.unsqueeze(0),
1031
- self.scale).float().squeeze(0)
1032
- for u in videos
1033
- ]
1034
- except TypeError as e:
1035
- logging.info(e)
1036
- return None
1037
-
1038
- def decode(self, zs):
1039
- try:
1040
- if not isinstance(zs, list):
1041
- raise TypeError("zs should be a list")
1042
- with amp.autocast(dtype=self.dtype):
1043
- return [
1044
- self.model.decode(u.unsqueeze(0),
1045
- self.scale).float().clamp_(-1,
1046
- 1).squeeze(0)
1047
- for u in zs
1048
- ]
1049
- except TypeError as e:
1050
- logging.info(e)
1051
- return None
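For reference, the Wan2_2_VAE wrapper removed above takes and returns lists of (C, T, H, W) tensors in [-1, 1]. A minimal usage sketch, assuming a locally available checkpoint (the filename below is a placeholder, not taken from this commit):

import torch
from wan.modules.vae2_2 import Wan2_2_VAE

# "Wan2.2_VAE.pth" is a placeholder path, not taken from this commit.
vae = Wan2_2_VAE(vae_pth="Wan2.2_VAE.pth", dtype=torch.float, device="cuda")

clip = torch.rand(3, 17, 256, 256, device="cuda") * 2 - 1  # (C, T, H, W), T = 4n + 1
latents = vae.encode([clip])   # list with one normalized latent tensor
recon = vae.decode(latents)    # list with one reconstruction clamped to [-1, 1]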
wan/text2video.py DELETED
@@ -1,378 +0,0 @@
1
- # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
2
- import gc
3
- import logging
4
- import math
5
- import os
6
- import random
7
- import sys
8
- import types
9
- from contextlib import contextmanager
10
- from functools import partial
11
-
12
- import torch
13
- import torch.cuda.amp as amp
14
- import torch.distributed as dist
15
- from tqdm import tqdm
16
-
17
- from .distributed.fsdp import shard_model
18
- from .distributed.sequence_parallel import sp_attn_forward, sp_dit_forward
19
- from .distributed.util import get_world_size
20
- from .modules.model import WanModel
21
- from .modules.t5 import T5EncoderModel
22
- from .modules.vae2_1 import Wan2_1_VAE
23
- from .utils.fm_solvers import (
24
- FlowDPMSolverMultistepScheduler,
25
- get_sampling_sigmas,
26
- retrieve_timesteps,
27
- )
28
- from .utils.fm_solvers_unipc import FlowUniPCMultistepScheduler
29
-
30
-
31
- class WanT2V:
32
-
33
- def __init__(
34
- self,
35
- config,
36
- checkpoint_dir,
37
- device_id=0,
38
- rank=0,
39
- t5_fsdp=False,
40
- dit_fsdp=False,
41
- use_sp=False,
42
- t5_cpu=False,
43
- init_on_cpu=True,
44
- convert_model_dtype=False,
45
- ):
46
- r"""
47
- Initializes the Wan text-to-video generation model components.
48
-
49
- Args:
50
- config (EasyDict):
51
- Object containing model parameters initialized from config.py
52
- checkpoint_dir (`str`):
53
- Path to directory containing model checkpoints
54
- device_id (`int`, *optional*, defaults to 0):
55
- Id of target GPU device
56
- rank (`int`, *optional*, defaults to 0):
57
- Process rank for distributed training
58
- t5_fsdp (`bool`, *optional*, defaults to False):
59
- Enable FSDP sharding for T5 model
60
- dit_fsdp (`bool`, *optional*, defaults to False):
61
- Enable FSDP sharding for DiT model
62
- use_sp (`bool`, *optional*, defaults to False):
63
- Enable distribution strategy of sequence parallel.
64
- t5_cpu (`bool`, *optional*, defaults to False):
65
- Whether to place T5 model on CPU. Only works without t5_fsdp.
66
- init_on_cpu (`bool`, *optional*, defaults to True):
67
- Enable initializing Transformer Model on CPU. Only works without FSDP or USP.
68
- convert_model_dtype (`bool`, *optional*, defaults to False):
69
- Convert DiT model parameters dtype to 'config.param_dtype'.
70
- Only works without FSDP.
71
- """
72
- self.device = torch.device(f"cuda:{device_id}")
73
- self.config = config
74
- self.rank = rank
75
- self.t5_cpu = t5_cpu
76
- self.init_on_cpu = init_on_cpu
77
-
78
- self.num_train_timesteps = config.num_train_timesteps
79
- self.boundary = config.boundary
80
- self.param_dtype = config.param_dtype
81
-
82
- if t5_fsdp or dit_fsdp or use_sp:
83
- self.init_on_cpu = False
84
-
85
- shard_fn = partial(shard_model, device_id=device_id)
86
- self.text_encoder = T5EncoderModel(
87
- text_len=config.text_len,
88
- dtype=config.t5_dtype,
89
- device=torch.device('cpu'),
90
- checkpoint_path=os.path.join(checkpoint_dir, config.t5_checkpoint),
91
- tokenizer_path=os.path.join(checkpoint_dir, config.t5_tokenizer),
92
- shard_fn=shard_fn if t5_fsdp else None)
93
-
94
- self.vae_stride = config.vae_stride
95
- self.patch_size = config.patch_size
96
- self.vae = Wan2_1_VAE(
97
- vae_pth=os.path.join(checkpoint_dir, config.vae_checkpoint),
98
- device=self.device)
99
-
100
- logging.info(f"Creating WanModel from {checkpoint_dir}")
101
- self.low_noise_model = WanModel.from_pretrained(
102
- checkpoint_dir, subfolder=config.low_noise_checkpoint)
103
- self.low_noise_model = self._configure_model(
104
- model=self.low_noise_model,
105
- use_sp=use_sp,
106
- dit_fsdp=dit_fsdp,
107
- shard_fn=shard_fn,
108
- convert_model_dtype=convert_model_dtype)
109
-
110
- self.high_noise_model = WanModel.from_pretrained(
111
- checkpoint_dir, subfolder=config.high_noise_checkpoint)
112
- self.high_noise_model = self._configure_model(
113
- model=self.high_noise_model,
114
- use_sp=use_sp,
115
- dit_fsdp=dit_fsdp,
116
- shard_fn=shard_fn,
117
- convert_model_dtype=convert_model_dtype)
118
- if use_sp:
119
- self.sp_size = get_world_size()
120
- else:
121
- self.sp_size = 1
122
-
123
- self.sample_neg_prompt = config.sample_neg_prompt
124
-
125
- def _configure_model(self, model, use_sp, dit_fsdp, shard_fn,
126
- convert_model_dtype):
127
- """
128
- Configures a model object. This includes setting evaluation modes,
129
- applying distributed parallel strategy, and handling device placement.
130
-
131
- Args:
132
- model (torch.nn.Module):
133
- The model instance to configure.
134
- use_sp (`bool`):
135
- Enable distribution strategy of sequence parallel.
136
- dit_fsdp (`bool`):
137
- Enable FSDP sharding for DiT model.
138
- shard_fn (callable):
139
- The function to apply FSDP sharding.
140
- convert_model_dtype (`bool`):
141
- Convert DiT model parameters dtype to 'config.param_dtype'.
142
- Only works without FSDP.
143
-
144
- Returns:
145
- torch.nn.Module:
146
- The configured model.
147
- """
148
- model.eval().requires_grad_(False)
149
-
150
- if use_sp:
151
- for block in model.blocks:
152
- block.self_attn.forward = types.MethodType(
153
- sp_attn_forward, block.self_attn)
154
- model.forward = types.MethodType(sp_dit_forward, model)
155
-
156
- if dist.is_initialized():
157
- dist.barrier()
158
-
159
- if dit_fsdp:
160
- model = shard_fn(model)
161
- else:
162
- if convert_model_dtype:
163
- model.to(self.param_dtype)
164
- if not self.init_on_cpu:
165
- model.to(self.device)
166
-
167
- return model
168
-
169
- def _prepare_model_for_timestep(self, t, boundary, offload_model):
170
- r"""
171
- Prepares and returns the required model for the current timestep.
172
-
173
- Args:
174
- t (torch.Tensor):
175
- current timestep.
176
- boundary (`int`):
177
- The timestep threshold. If `t` is at or above this value,
178
- the `high_noise_model` is considered as the required model.
179
- offload_model (`bool`):
180
- A flag intended to control the offloading behavior.
181
-
182
- Returns:
183
- torch.nn.Module:
184
- The active model on the target device for the current timestep.
185
- """
186
- if t.item() >= boundary:
187
- required_model_name = 'high_noise_model'
188
- offload_model_name = 'low_noise_model'
189
- else:
190
- required_model_name = 'low_noise_model'
191
- offload_model_name = 'high_noise_model'
192
- if offload_model or self.init_on_cpu:
193
- if next(getattr(
194
- self,
195
- offload_model_name).parameters()).device.type == 'cuda':
196
- getattr(self, offload_model_name).to('cpu')
197
- if next(getattr(
198
- self,
199
- required_model_name).parameters()).device.type == 'cpu':
200
- getattr(self, required_model_name).to(self.device)
201
- return getattr(self, required_model_name)
202
-
203
- def generate(self,
204
- input_prompt,
205
- size=(1280, 720),
206
- frame_num=81,
207
- shift=5.0,
208
- sample_solver='unipc',
209
- sampling_steps=50,
210
- guide_scale=5.0,
211
- n_prompt="",
212
- seed=-1,
213
- offload_model=True):
214
- r"""
215
- Generates video frames from text prompt using diffusion process.
216
-
217
- Args:
218
- input_prompt (`str`):
219
- Text prompt for content generation
220
- size (`tuple[int]`, *optional*, defaults to (1280,720)):
221
- Controls video resolution, (width,height).
222
- frame_num (`int`, *optional*, defaults to 81):
223
- How many frames to sample from a video. The number should be 4n+1
224
- shift (`float`, *optional*, defaults to 5.0):
225
- Noise schedule shift parameter. Affects temporal dynamics
226
- sample_solver (`str`, *optional*, defaults to 'unipc'):
227
- Solver used to sample the video.
228
- sampling_steps (`int`, *optional*, defaults to 50):
229
- Number of diffusion sampling steps. Higher values improve quality but slow generation
230
- guide_scale (`float` or tuple[`float`], *optional*, defaults 5.0):
231
- Classifier-free guidance scale. Controls prompt adherence vs. creativity.
232
- If tuple, the first guide_scale will be used for low noise model and
233
- the second guide_scale will be used for high noise model.
234
- n_prompt (`str`, *optional*, defaults to ""):
235
- Negative prompt for content exclusion. If not given, use `config.sample_neg_prompt`
236
- seed (`int`, *optional*, defaults to -1):
237
- Random seed for noise generation. If -1, use random seed.
238
- offload_model (`bool`, *optional*, defaults to True):
239
- If True, offloads models to CPU during generation to save VRAM
240
-
241
- Returns:
242
- torch.Tensor:
243
- Generated video frames tensor. Dimensions: (C, N H, W) where:
244
- - C: Color channels (3 for RGB)
245
- - N: Number of frames (81)
246
- - H: Frame height (from size)
247
- - W: Frame width from size)
248
- """
249
- # preprocess
250
- guide_scale = (guide_scale, guide_scale) if isinstance(
251
- guide_scale, float) else guide_scale
252
- F = frame_num
253
- target_shape = (self.vae.model.z_dim, (F - 1) // self.vae_stride[0] + 1,
254
- size[1] // self.vae_stride[1],
255
- size[0] // self.vae_stride[2])
256
-
257
- seq_len = math.ceil((target_shape[2] * target_shape[3]) /
258
- (self.patch_size[1] * self.patch_size[2]) *
259
- target_shape[1] / self.sp_size) * self.sp_size
260
-
261
- if n_prompt == "":
262
- n_prompt = self.sample_neg_prompt
263
- seed = seed if seed >= 0 else random.randint(0, sys.maxsize)
264
- seed_g = torch.Generator(device=self.device)
265
- seed_g.manual_seed(seed)
266
-
267
- if not self.t5_cpu:
268
- self.text_encoder.model.to(self.device)
269
- context = self.text_encoder([input_prompt], self.device)
270
- context_null = self.text_encoder([n_prompt], self.device)
271
- if offload_model:
272
- self.text_encoder.model.cpu()
273
- else:
274
- context = self.text_encoder([input_prompt], torch.device('cpu'))
275
- context_null = self.text_encoder([n_prompt], torch.device('cpu'))
276
- context = [t.to(self.device) for t in context]
277
- context_null = [t.to(self.device) for t in context_null]
278
-
279
- noise = [
280
- torch.randn(
281
- target_shape[0],
282
- target_shape[1],
283
- target_shape[2],
284
- target_shape[3],
285
- dtype=torch.float32,
286
- device=self.device,
287
- generator=seed_g)
288
- ]
289
-
290
- @contextmanager
291
- def noop_no_sync():
292
- yield
293
-
294
- no_sync_low_noise = getattr(self.low_noise_model, 'no_sync',
295
- noop_no_sync)
296
- no_sync_high_noise = getattr(self.high_noise_model, 'no_sync',
297
- noop_no_sync)
298
-
299
- # evaluation mode
300
- with (
301
- torch.amp.autocast('cuda', dtype=self.param_dtype),
302
- torch.no_grad(),
303
- no_sync_low_noise(),
304
- no_sync_high_noise(),
305
- ):
306
- boundary = self.boundary * self.num_train_timesteps
307
-
308
- if sample_solver == 'unipc':
309
- sample_scheduler = FlowUniPCMultistepScheduler(
310
- num_train_timesteps=self.num_train_timesteps,
311
- shift=1,
312
- use_dynamic_shifting=False)
313
- sample_scheduler.set_timesteps(
314
- sampling_steps, device=self.device, shift=shift)
315
- timesteps = sample_scheduler.timesteps
316
- elif sample_solver == 'dpm++':
317
- sample_scheduler = FlowDPMSolverMultistepScheduler(
318
- num_train_timesteps=self.num_train_timesteps,
319
- shift=1,
320
- use_dynamic_shifting=False)
321
- sampling_sigmas = get_sampling_sigmas(sampling_steps, shift)
322
- timesteps, _ = retrieve_timesteps(
323
- sample_scheduler,
324
- device=self.device,
325
- sigmas=sampling_sigmas)
326
- else:
327
- raise NotImplementedError("Unsupported solver.")
328
-
329
- # sample videos
330
- latents = noise
331
-
332
- arg_c = {'context': context, 'seq_len': seq_len}
333
- arg_null = {'context': context_null, 'seq_len': seq_len}
334
-
335
- for _, t in enumerate(tqdm(timesteps)):
336
- latent_model_input = latents
337
- timestep = [t]
338
-
339
- timestep = torch.stack(timestep)
340
-
341
- model = self._prepare_model_for_timestep(
342
- t, boundary, offload_model)
343
- sample_guide_scale = guide_scale[1] if t.item(
344
- ) >= boundary else guide_scale[0]
345
-
346
- noise_pred_cond = model(
347
- latent_model_input, t=timestep, **arg_c)[0]
348
- noise_pred_uncond = model(
349
- latent_model_input, t=timestep, **arg_null)[0]
350
-
351
- noise_pred = noise_pred_uncond + sample_guide_scale * (
352
- noise_pred_cond - noise_pred_uncond)
353
-
354
- temp_x0 = sample_scheduler.step(
355
- noise_pred.unsqueeze(0),
356
- t,
357
- latents[0].unsqueeze(0),
358
- return_dict=False,
359
- generator=seed_g)[0]
360
- latents = [temp_x0.squeeze(0)]
361
-
362
- x0 = latents
363
- if offload_model:
364
- self.low_noise_model.cpu()
365
- self.high_noise_model.cpu()
366
- torch.cuda.empty_cache()
367
- if self.rank == 0:
368
- videos = self.vae.decode(x0)
369
-
370
- del noise, latents
371
- del sample_scheduler
372
- if offload_model:
373
- gc.collect()
374
- torch.cuda.synchronize()
375
- if dist.is_initialized():
376
- dist.barrier()
377
-
378
- return videos[0] if self.rank == 0 else None
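The docstrings above fully describe the public entry point of the deleted WanT2V pipeline. A hedged usage sketch (the checkpoint directory is a placeholder; t2v_A14B is the config re-exported by wan.configs, and the guidance scales are example values only):

import wan
from wan.configs import t2v_A14B

# "./Wan2.2-T2V-A14B" is a placeholder checkpoint directory.
pipe = wan.WanT2V(
    config=t2v_A14B,
    checkpoint_dir="./Wan2.2-T2V-A14B",
    device_id=0,
    convert_model_dtype=True,
)
video = pipe.generate(
    "A red panda climbing a snow-covered pine tree",
    size=(1280, 720),
    frame_num=81,              # must be 4n + 1
    sampling_steps=50,
    guide_scale=(3.0, 4.0),    # example (low-noise, high-noise) scales
    seed=42,
)
# rank 0 receives a (C, N, H, W) tensor in [-1, 1]; other ranks receive None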
wan/textimage2video.py DELETED
@@ -1,619 +0,0 @@
1
- # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
2
- import gc
3
- import logging
4
- import math
5
- import os
6
- import random
7
- import sys
8
- import types
9
- from contextlib import contextmanager
10
- from functools import partial
11
-
12
- import torch
13
- import torch.cuda.amp as amp
14
- import torch.distributed as dist
15
- import torchvision.transforms.functional as TF
16
- from PIL import Image
17
- from tqdm import tqdm
18
-
19
- from .distributed.fsdp import shard_model
20
- from .distributed.sequence_parallel import sp_attn_forward, sp_dit_forward
21
- from .distributed.util import get_world_size
22
- from .modules.model import WanModel
23
- from .modules.t5 import T5EncoderModel
24
- from .modules.vae2_2 import Wan2_2_VAE
25
- from .utils.fm_solvers import (
26
- FlowDPMSolverMultistepScheduler,
27
- get_sampling_sigmas,
28
- retrieve_timesteps,
29
- )
30
- from .utils.fm_solvers_unipc import FlowUniPCMultistepScheduler
31
- from .utils.utils import best_output_size, masks_like
32
-
33
-
34
- class WanTI2V:
35
-
36
- def __init__(
37
- self,
38
- config,
39
- checkpoint_dir,
40
- device_id=0,
41
- rank=0,
42
- t5_fsdp=False,
43
- dit_fsdp=False,
44
- use_sp=False,
45
- t5_cpu=False,
46
- init_on_cpu=True,
47
- convert_model_dtype=False,
48
- ):
49
- r"""
50
- Initializes the Wan text-to-video generation model components.
51
-
52
- Args:
53
- config (EasyDict):
54
- Object containing model parameters initialized from config.py
55
- checkpoint_dir (`str`):
56
- Path to directory containing model checkpoints
57
- device_id (`int`, *optional*, defaults to 0):
58
- Id of target GPU device
59
- rank (`int`, *optional*, defaults to 0):
60
- Process rank for distributed training
61
- t5_fsdp (`bool`, *optional*, defaults to False):
62
- Enable FSDP sharding for T5 model
63
- dit_fsdp (`bool`, *optional*, defaults to False):
64
- Enable FSDP sharding for DiT model
65
- use_sp (`bool`, *optional*, defaults to False):
66
- Enable distribution strategy of sequence parallel.
67
- t5_cpu (`bool`, *optional*, defaults to False):
68
- Whether to place T5 model on CPU. Only works without t5_fsdp.
69
- init_on_cpu (`bool`, *optional*, defaults to True):
70
- Enable initializing Transformer Model on CPU. Only works without FSDP or USP.
71
- convert_model_dtype (`bool`, *optional*, defaults to False):
72
- Convert DiT model parameters dtype to 'config.param_dtype'.
73
- Only works without FSDP.
74
- """
75
- self.device = torch.device(f"cuda:{device_id}")
76
- self.config = config
77
- self.rank = rank
78
- self.t5_cpu = t5_cpu
79
- self.init_on_cpu = init_on_cpu
80
-
81
- self.num_train_timesteps = config.num_train_timesteps
82
- self.param_dtype = config.param_dtype
83
-
84
- if t5_fsdp or dit_fsdp or use_sp:
85
- self.init_on_cpu = False
86
-
87
- shard_fn = partial(shard_model, device_id=device_id)
88
- self.text_encoder = T5EncoderModel(
89
- text_len=config.text_len,
90
- dtype=config.t5_dtype,
91
- device=torch.device('cpu'),
92
- checkpoint_path=os.path.join(checkpoint_dir, config.t5_checkpoint),
93
- tokenizer_path=os.path.join(checkpoint_dir, config.t5_tokenizer),
94
- shard_fn=shard_fn if t5_fsdp else None)
95
-
96
- self.vae_stride = config.vae_stride
97
- self.patch_size = config.patch_size
98
- self.vae = Wan2_2_VAE(
99
- vae_pth=os.path.join(checkpoint_dir, config.vae_checkpoint),
100
- device=self.device)
101
-
102
- logging.info(f"Creating WanModel from {checkpoint_dir}")
103
- self.model = WanModel.from_pretrained(checkpoint_dir)
104
- self.model = self._configure_model(
105
- model=self.model,
106
- use_sp=use_sp,
107
- dit_fsdp=dit_fsdp,
108
- shard_fn=shard_fn,
109
- convert_model_dtype=convert_model_dtype)
110
-
111
- if use_sp:
112
- self.sp_size = get_world_size()
113
- else:
114
- self.sp_size = 1
115
-
116
- self.sample_neg_prompt = config.sample_neg_prompt
117
-
118
- def _configure_model(self, model, use_sp, dit_fsdp, shard_fn,
119
- convert_model_dtype):
120
- """
121
- Configures a model object. This includes setting evaluation modes,
122
- applying distributed parallel strategy, and handling device placement.
123
-
124
- Args:
125
- model (torch.nn.Module):
126
- The model instance to configure.
127
- use_sp (`bool`):
128
- Enable distribution strategy of sequence parallel.
129
- dit_fsdp (`bool`):
130
- Enable FSDP sharding for DiT model.
131
- shard_fn (callable):
132
- The function to apply FSDP sharding.
133
- convert_model_dtype (`bool`):
134
- Convert DiT model parameters dtype to 'config.param_dtype'.
135
- Only works without FSDP.
136
-
137
- Returns:
138
- torch.nn.Module:
139
- The configured model.
140
- """
141
- model.eval().requires_grad_(False)
142
-
143
- if use_sp:
144
- for block in model.blocks:
145
- block.self_attn.forward = types.MethodType(
146
- sp_attn_forward, block.self_attn)
147
- model.forward = types.MethodType(sp_dit_forward, model)
148
-
149
- if dist.is_initialized():
150
- dist.barrier()
151
-
152
- if dit_fsdp:
153
- model = shard_fn(model)
154
- else:
155
- if convert_model_dtype:
156
- model.to(self.param_dtype)
157
- if not self.init_on_cpu:
158
- model.to(self.device)
159
-
160
- return model
161
-
162
- def generate(self,
163
- input_prompt,
164
- img=None,
165
- size=(1280, 704),
166
- max_area=704 * 1280,
167
- frame_num=81,
168
- shift=5.0,
169
- sample_solver='unipc',
170
- sampling_steps=50,
171
- guide_scale=5.0,
172
- n_prompt="",
173
- seed=-1,
174
- offload_model=True):
175
- r"""
176
- Generates video frames from text prompt using diffusion process.
177
-
178
- Args:
179
- input_prompt (`str`):
180
- Text prompt for content generation
181
- img (PIL.Image.Image):
182
- Input image tensor. Shape: [3, H, W]
183
- size (`tuple[int]`, *optional*, defaults to (1280,704)):
184
- Controls video resolution, (width,height).
185
- max_area (`int`, *optional*, defaults to 704*1280):
186
- Maximum pixel area for latent space calculation. Controls video resolution scaling
187
- frame_num (`int`, *optional*, defaults to 81):
188
- How many frames to sample from a video. The number should be 4n+1
189
- shift (`float`, *optional*, defaults to 5.0):
190
- Noise schedule shift parameter. Affects temporal dynamics
191
- sample_solver (`str`, *optional*, defaults to 'unipc'):
192
- Solver used to sample the video.
193
- sampling_steps (`int`, *optional*, defaults to 50):
194
- Number of diffusion sampling steps. Higher values improve quality but slow generation
195
- guide_scale (`float`, *optional*, defaults 5.0):
196
- Classifier-free guidance scale. Controls prompt adherence vs. creativity.
197
- n_prompt (`str`, *optional*, defaults to ""):
198
- Negative prompt for content exclusion. If not given, use `config.sample_neg_prompt`
199
- seed (`int`, *optional*, defaults to -1):
200
- Random seed for noise generation. If -1, use random seed.
201
- offload_model (`bool`, *optional*, defaults to True):
202
- If True, offloads models to CPU during generation to save VRAM
203
-
204
- Returns:
205
- torch.Tensor:
206
- Generated video frames tensor. Dimensions: (C, N H, W) where:
207
- - C: Color channels (3 for RGB)
208
- - N: Number of frames (81)
209
- - H: Frame height (from size)
210
- - W: Frame width from size)
211
- """
212
- # i2v
213
- if img is not None:
214
- return self.i2v(
215
- input_prompt=input_prompt,
216
- img=img,
217
- max_area=max_area,
218
- frame_num=frame_num,
219
- shift=shift,
220
- sample_solver=sample_solver,
221
- sampling_steps=sampling_steps,
222
- guide_scale=guide_scale,
223
- n_prompt=n_prompt,
224
- seed=seed,
225
- offload_model=offload_model)
226
- # t2v
227
- return self.t2v(
228
- input_prompt=input_prompt,
229
- size=size,
230
- frame_num=frame_num,
231
- shift=shift,
232
- sample_solver=sample_solver,
233
- sampling_steps=sampling_steps,
234
- guide_scale=guide_scale,
235
- n_prompt=n_prompt,
236
- seed=seed,
237
- offload_model=offload_model)
238
-
239
- def t2v(self,
240
- input_prompt,
241
- size=(1280, 704),
242
- frame_num=121,
243
- shift=5.0,
244
- sample_solver='unipc',
245
- sampling_steps=50,
246
- guide_scale=5.0,
247
- n_prompt="",
248
- seed=-1,
249
- offload_model=True):
250
- r"""
251
- Generates video frames from text prompt using diffusion process.
252
-
253
- Args:
254
- input_prompt (`str`):
255
- Text prompt for content generation
256
- size (`tuple[int]`, *optional*, defaults to (1280,704)):
257
- Controls video resolution, (width,height).
258
- frame_num (`int`, *optional*, defaults to 121):
259
- How many frames to sample from a video. The number should be 4n+1
260
- shift (`float`, *optional*, defaults to 5.0):
261
- Noise schedule shift parameter. Affects temporal dynamics
262
- sample_solver (`str`, *optional*, defaults to 'unipc'):
263
- Solver used to sample the video.
264
- sampling_steps (`int`, *optional*, defaults to 50):
265
- Number of diffusion sampling steps. Higher values improve quality but slow generation
266
- guide_scale (`float`, *optional*, defaults 5.0):
267
- Classifier-free guidance scale. Controls prompt adherence vs. creativity.
268
- n_prompt (`str`, *optional*, defaults to ""):
269
- Negative prompt for content exclusion. If not given, use `config.sample_neg_prompt`
270
- seed (`int`, *optional*, defaults to -1):
271
- Random seed for noise generation. If -1, use random seed.
272
- offload_model (`bool`, *optional*, defaults to True):
273
- If True, offloads models to CPU during generation to save VRAM
274
-
275
- Returns:
276
- torch.Tensor:
277
- Generated video frames tensor. Dimensions: (C, N H, W) where:
278
- - C: Color channels (3 for RGB)
279
- - N: Number of frames (81)
280
- - H: Frame height (from size)
281
- - W: Frame width from size)
282
- """
283
- # preprocess
284
- F = frame_num
285
- target_shape = (self.vae.model.z_dim, (F - 1) // self.vae_stride[0] + 1,
286
- size[1] // self.vae_stride[1],
287
- size[0] // self.vae_stride[2])
288
-
289
- seq_len = math.ceil((target_shape[2] * target_shape[3]) /
290
- (self.patch_size[1] * self.patch_size[2]) *
291
- target_shape[1] / self.sp_size) * self.sp_size
292
-
293
- if n_prompt == "":
294
- n_prompt = self.sample_neg_prompt
295
- seed = seed if seed >= 0 else random.randint(0, sys.maxsize)
296
- seed_g = torch.Generator(device=self.device)
297
- seed_g.manual_seed(seed)
298
-
299
- if not self.t5_cpu:
300
- self.text_encoder.model.to(self.device)
301
- context = self.text_encoder([input_prompt], self.device)
302
- context_null = self.text_encoder([n_prompt], self.device)
303
- if offload_model:
304
- self.text_encoder.model.cpu()
305
- else:
306
- context = self.text_encoder([input_prompt], torch.device('cpu'))
307
- context_null = self.text_encoder([n_prompt], torch.device('cpu'))
308
- context = [t.to(self.device) for t in context]
309
- context_null = [t.to(self.device) for t in context_null]
310
-
311
- noise = [
312
- torch.randn(
313
- target_shape[0],
314
- target_shape[1],
315
- target_shape[2],
316
- target_shape[3],
317
- dtype=torch.float32,
318
- device=self.device,
319
- generator=seed_g)
320
- ]
321
-
322
- @contextmanager
323
- def noop_no_sync():
324
- yield
325
-
326
- no_sync = getattr(self.model, 'no_sync', noop_no_sync)
327
-
328
- # evaluation mode
329
- with (
330
- torch.amp.autocast('cuda', dtype=self.param_dtype),
331
- torch.no_grad(),
332
- no_sync(),
333
- ):
334
-
335
- if sample_solver == 'unipc':
336
- sample_scheduler = FlowUniPCMultistepScheduler(
337
- num_train_timesteps=self.num_train_timesteps,
338
- shift=1,
339
- use_dynamic_shifting=False)
340
- sample_scheduler.set_timesteps(
341
- sampling_steps, device=self.device, shift=shift)
342
- timesteps = sample_scheduler.timesteps
343
- elif sample_solver == 'dpm++':
344
- sample_scheduler = FlowDPMSolverMultistepScheduler(
345
- num_train_timesteps=self.num_train_timesteps,
346
- shift=1,
347
- use_dynamic_shifting=False)
348
- sampling_sigmas = get_sampling_sigmas(sampling_steps, shift)
349
- timesteps, _ = retrieve_timesteps(
350
- sample_scheduler,
351
- device=self.device,
352
- sigmas=sampling_sigmas)
353
- else:
354
- raise NotImplementedError("Unsupported solver.")
355
-
356
- # sample videos
357
- latents = noise
358
- mask1, mask2 = masks_like(noise, zero=False)
359
-
360
- arg_c = {'context': context, 'seq_len': seq_len}
361
- arg_null = {'context': context_null, 'seq_len': seq_len}
362
-
363
- if offload_model or self.init_on_cpu:
364
- self.model.to(self.device)
365
- torch.cuda.empty_cache()
366
-
367
- for _, t in enumerate(tqdm(timesteps)):
368
- latent_model_input = latents
369
- timestep = [t]
370
-
371
- timestep = torch.stack(timestep)
372
-
373
- temp_ts = (mask2[0][0][:, ::2, ::2] * timestep).flatten()
374
- temp_ts = torch.cat([
375
- temp_ts,
376
- temp_ts.new_ones(seq_len - temp_ts.size(0)) * timestep
377
- ])
378
- timestep = temp_ts.unsqueeze(0)
379
-
380
- noise_pred_cond = self.model(
381
- latent_model_input, t=timestep, **arg_c)[0]
382
- noise_pred_uncond = self.model(
383
- latent_model_input, t=timestep, **arg_null)[0]
384
-
385
- noise_pred = noise_pred_uncond + guide_scale * (
386
- noise_pred_cond - noise_pred_uncond)
387
-
388
- temp_x0 = sample_scheduler.step(
389
- noise_pred.unsqueeze(0),
390
- t,
391
- latents[0].unsqueeze(0),
392
- return_dict=False,
393
- generator=seed_g)[0]
394
- latents = [temp_x0.squeeze(0)]
395
- x0 = latents
396
- if offload_model:
397
- self.model.cpu()
398
- torch.cuda.synchronize()
399
- torch.cuda.empty_cache()
400
- if self.rank == 0:
401
- videos = self.vae.decode(x0)
402
-
403
- del noise, latents
404
- del sample_scheduler
405
- if offload_model:
406
- gc.collect()
407
- torch.cuda.synchronize()
408
- if dist.is_initialized():
409
- dist.barrier()
410
-
411
- return videos[0] if self.rank == 0 else None
412
-
413
- def i2v(self,
414
- input_prompt,
415
- img,
416
- max_area=704 * 1280,
417
- frame_num=121,
418
- shift=5.0,
419
- sample_solver='unipc',
420
- sampling_steps=40,
421
- guide_scale=5.0,
422
- n_prompt="",
423
- seed=-1,
424
- offload_model=True):
425
- r"""
426
- Generates video frames from input image and text prompt using diffusion process.
427
-
428
- Args:
429
- input_prompt (`str`):
430
- Text prompt for content generation.
431
- img (PIL.Image.Image):
432
- Input image tensor. Shape: [3, H, W]
433
- max_area (`int`, *optional*, defaults to 704*1280):
434
- Maximum pixel area for latent space calculation. Controls video resolution scaling
435
- frame_num (`int`, *optional*, defaults to 121):
436
- How many frames to sample from a video. The number should be 4n+1
437
- shift (`float`, *optional*, defaults to 5.0):
438
- Noise schedule shift parameter. Affects temporal dynamics
439
- [NOTE]: If you want to generate a 480p video, it is recommended to set the shift value to 3.0.
440
- sample_solver (`str`, *optional*, defaults to 'unipc'):
441
- Solver used to sample the video.
442
- sampling_steps (`int`, *optional*, defaults to 40):
443
- Number of diffusion sampling steps. Higher values improve quality but slow generation
444
- guide_scale (`float`, *optional*, defaults 5.0):
445
- Classifier-free guidance scale. Controls prompt adherence vs. creativity.
446
- n_prompt (`str`, *optional*, defaults to ""):
447
- Negative prompt for content exclusion. If not given, use `config.sample_neg_prompt`
448
- seed (`int`, *optional*, defaults to -1):
449
- Random seed for noise generation. If -1, use random seed
450
- offload_model (`bool`, *optional*, defaults to True):
451
- If True, offloads models to CPU during generation to save VRAM
452
-
453
- Returns:
454
- torch.Tensor:
455
- Generated video frames tensor. Dimensions: (C, N H, W) where:
456
- - C: Color channels (3 for RGB)
457
- - N: Number of frames (121)
458
- - H: Frame height (from max_area)
459
- - W: Frame width (from max_area)
460
- """
461
- # preprocess
462
- ih, iw = img.height, img.width
463
- dh, dw = self.patch_size[1] * self.vae_stride[1], self.patch_size[
464
- 2] * self.vae_stride[2]
465
- ow, oh = best_output_size(iw, ih, dw, dh, max_area)
466
-
467
- scale = max(ow / iw, oh / ih)
468
- img = img.resize((round(iw * scale), round(ih * scale)), Image.LANCZOS)
469
-
470
- # center-crop
471
- x1 = (img.width - ow) // 2
472
- y1 = (img.height - oh) // 2
473
- img = img.crop((x1, y1, x1 + ow, y1 + oh))
474
- assert img.width == ow and img.height == oh
475
-
476
- # to tensor
477
- img = TF.to_tensor(img).sub_(0.5).div_(0.5).to(self.device).unsqueeze(1)
478
-
479
- F = frame_num
480
- seq_len = ((F - 1) // self.vae_stride[0] + 1) * (
481
- oh // self.vae_stride[1]) * (ow // self.vae_stride[2]) // (
482
- self.patch_size[1] * self.patch_size[2])
483
- seq_len = int(math.ceil(seq_len / self.sp_size)) * self.sp_size
484
-
485
- seed = seed if seed >= 0 else random.randint(0, sys.maxsize)
486
- seed_g = torch.Generator(device=self.device)
487
- seed_g.manual_seed(seed)
488
- noise = torch.randn(
489
- self.vae.model.z_dim, (F - 1) // self.vae_stride[0] + 1,
490
- oh // self.vae_stride[1],
491
- ow // self.vae_stride[2],
492
- dtype=torch.float32,
493
- generator=seed_g,
494
- device=self.device)
495
-
496
- if n_prompt == "":
497
- n_prompt = self.sample_neg_prompt
498
-
499
- # preprocess
500
- if not self.t5_cpu:
501
- self.text_encoder.model.to(self.device)
502
- context = self.text_encoder([input_prompt], self.device)
503
- context_null = self.text_encoder([n_prompt], self.device)
504
- if offload_model:
505
- self.text_encoder.model.cpu()
506
- else:
507
- context = self.text_encoder([input_prompt], torch.device('cpu'))
508
- context_null = self.text_encoder([n_prompt], torch.device('cpu'))
509
- context = [t.to(self.device) for t in context]
510
- context_null = [t.to(self.device) for t in context_null]
511
-
512
- z = self.vae.encode([img])
513
-
514
- @contextmanager
515
- def noop_no_sync():
516
- yield
517
-
518
- no_sync = getattr(self.model, 'no_sync', noop_no_sync)
519
-
520
- # evaluation mode
521
- with (
522
- torch.amp.autocast('cuda', dtype=self.param_dtype),
523
- torch.no_grad(),
524
- no_sync(),
525
- ):
526
-
527
- if sample_solver == 'unipc':
528
- sample_scheduler = FlowUniPCMultistepScheduler(
529
- num_train_timesteps=self.num_train_timesteps,
530
- shift=1,
531
- use_dynamic_shifting=False)
532
- sample_scheduler.set_timesteps(
533
- sampling_steps, device=self.device, shift=shift)
534
- timesteps = sample_scheduler.timesteps
535
- elif sample_solver == 'dpm++':
536
- sample_scheduler = FlowDPMSolverMultistepScheduler(
537
- num_train_timesteps=self.num_train_timesteps,
538
- shift=1,
539
- use_dynamic_shifting=False)
540
- sampling_sigmas = get_sampling_sigmas(sampling_steps, shift)
541
- timesteps, _ = retrieve_timesteps(
542
- sample_scheduler,
543
- device=self.device,
544
- sigmas=sampling_sigmas)
545
- else:
546
- raise NotImplementedError("Unsupported solver.")
547
-
548
- # sample videos
549
- latent = noise
550
- mask1, mask2 = masks_like([noise], zero=True)
551
- latent = (1. - mask2[0]) * z[0] + mask2[0] * latent
552
-
553
- arg_c = {
554
- 'context': [context[0]],
555
- 'seq_len': seq_len,
556
- }
557
-
558
- arg_null = {
559
- 'context': context_null,
560
- 'seq_len': seq_len,
561
- }
562
-
563
- if offload_model or self.init_on_cpu:
564
- self.model.to(self.device)
565
- torch.cuda.empty_cache()
566
-
567
- for _, t in enumerate(tqdm(timesteps)):
568
- latent_model_input = [latent.to(self.device)]
569
- timestep = [t]
570
-
571
- timestep = torch.stack(timestep).to(self.device)
572
-
573
- temp_ts = (mask2[0][0][:, ::2, ::2] * timestep).flatten()
574
- temp_ts = torch.cat([
575
- temp_ts,
576
- temp_ts.new_ones(seq_len - temp_ts.size(0)) * timestep
577
- ])
578
- timestep = temp_ts.unsqueeze(0)
579
-
580
- noise_pred_cond = self.model(
581
- latent_model_input, t=timestep, **arg_c)[0]
582
- if offload_model:
583
- torch.cuda.empty_cache()
584
- noise_pred_uncond = self.model(
585
- latent_model_input, t=timestep, **arg_null)[0]
586
- if offload_model:
587
- torch.cuda.empty_cache()
588
- noise_pred = noise_pred_uncond + guide_scale * (
589
- noise_pred_cond - noise_pred_uncond)
590
-
591
- temp_x0 = sample_scheduler.step(
592
- noise_pred.unsqueeze(0),
593
- t,
594
- latent.unsqueeze(0),
595
- return_dict=False,
596
- generator=seed_g)[0]
597
- latent = temp_x0.squeeze(0)
598
- latent = (1. - mask2[0]) * z[0] + mask2[0] * latent
599
-
600
- x0 = [latent]
601
- del latent_model_input, timestep
602
-
603
- if offload_model:
604
- self.model.cpu()
605
- torch.cuda.synchronize()
606
- torch.cuda.empty_cache()
607
-
608
- if self.rank == 0:
609
- videos = self.vae.decode(x0)
610
-
611
- del noise, latent, x0
612
- del sample_scheduler
613
- if offload_model:
614
- gc.collect()
615
- torch.cuda.synchronize()
616
- if dist.is_initialized():
617
- dist.barrier()
618
-
619
- return videos[0] if self.rank == 0 else None
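WanTI2V.generate() above dispatches to i2v() when an image is supplied and to t2v() otherwise. A hedged sketch of the image-conditioned path (file paths are placeholders; ti2v_5B is assumed to be the config object defined in the deleted wan/configs/wan_ti2v_5B.py):

from PIL import Image

import wan
from wan.configs import ti2v_5B

# Both paths below are placeholders, not taken from this commit.
pipe = wan.WanTI2V(
    config=ti2v_5B,
    checkpoint_dir="./Wan2.2-TI2V-5B",
    convert_model_dtype=True,
)
video = pipe.generate(
    "The cat slowly turns its head toward the camera",
    img=Image.open("cat.jpg").convert("RGB"),
    max_area=704 * 1280,
    frame_num=121,     # 4n + 1
    shift=5.0,         # the docstring recommends 3.0 for 480p outputs
    sampling_steps=40,
    seed=0,
)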
wan/utils/__init__.py DELETED
@@ -1,12 +0,0 @@
- # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
- from .fm_solvers import (
-     FlowDPMSolverMultistepScheduler,
-     get_sampling_sigmas,
-     retrieve_timesteps,
- )
- from .fm_solvers_unipc import FlowUniPCMultistepScheduler
-
- __all__ = [
-     'HuggingfaceTokenizer', 'get_sampling_sigmas', 'retrieve_timesteps',
-     'FlowDPMSolverMultistepScheduler', 'FlowUniPCMultistepScheduler'
- ]
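These re-exports are what the deleted pipelines pull in for sampling. A short sketch of the UniPC path as it is wired up in text2video.py above (num_train_timesteps=1000 is the scheduler's own default, and shift=5.0 mirrors the generate() default):

import torch
from wan.utils import FlowUniPCMultistepScheduler

scheduler = FlowUniPCMultistepScheduler(
    num_train_timesteps=1000, shift=1, use_dynamic_shifting=False)
scheduler.set_timesteps(50, device=torch.device("cuda"), shift=5.0)
timesteps = scheduler.timesteps   # descending flow-matching timesteps for 50 steps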
wan/utils/__pycache__/__init__.cpython-310.pyc DELETED
Binary file (393 Bytes)
 
wan/utils/__pycache__/fm_solvers.cpython-310.pyc DELETED
Binary file (26.1 kB)
 
wan/utils/__pycache__/fm_solvers_unipc.cpython-310.pyc DELETED
Binary file (22.2 kB)
 
wan/utils/__pycache__/utils.cpython-310.pyc DELETED
Binary file (4.31 kB)
 
wan/utils/fm_solvers.py DELETED
@@ -1,859 +0,0 @@
- # Copied from https://github.com/huggingface/diffusers/blob/main/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py
- # Convert dpm solver for flow matching
- # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
-
- import inspect
- import math
- from typing import List, Optional, Tuple, Union
-
- import numpy as np
- import torch
- from diffusers.configuration_utils import ConfigMixin, register_to_config
- from diffusers.schedulers.scheduling_utils import (
-     KarrasDiffusionSchedulers,
-     SchedulerMixin,
-     SchedulerOutput,
- )
- from diffusers.utils import deprecate, is_scipy_available
- from diffusers.utils.torch_utils import randn_tensor
-
- if is_scipy_available():
-     pass
-
-
- def get_sampling_sigmas(sampling_steps, shift):
-     sigma = np.linspace(1, 0, sampling_steps + 1)[:sampling_steps]
-     sigma = (shift * sigma / (1 + (shift - 1) * sigma))
-
-     return sigma
-
-
- def retrieve_timesteps(
-     scheduler,
-     num_inference_steps=None,
-     device=None,
-     timesteps=None,
-     sigmas=None,
-     **kwargs,
- ):
-     if timesteps is not None and sigmas is not None:
-         raise ValueError(
-             "Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values"
-         )
-     if timesteps is not None:
-         accepts_timesteps = "timesteps" in set(
-             inspect.signature(scheduler.set_timesteps).parameters.keys())
-         if not accepts_timesteps:
-             raise ValueError(
-                 f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
-                 f" timestep schedules. Please check whether you are using the correct scheduler."
-             )
-         scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
-         timesteps = scheduler.timesteps
-         num_inference_steps = len(timesteps)
-     elif sigmas is not None:
-         accept_sigmas = "sigmas" in set(
-             inspect.signature(scheduler.set_timesteps).parameters.keys())
-         if not accept_sigmas:
-             raise ValueError(
-                 f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
-                 f" sigmas schedules. Please check whether you are using the correct scheduler."
-             )
-         scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
-         timesteps = scheduler.timesteps
-         num_inference_steps = len(timesteps)
-     else:
-         scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
-         timesteps = scheduler.timesteps
-     return timesteps, num_inference_steps
-
-
- class FlowDPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
-     """
-     `FlowDPMSolverMultistepScheduler` is a fast dedicated high-order solver for diffusion ODEs.
-     This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic
-     methods the library implements for all schedulers such as loading and saving.
-     Args:
-         num_train_timesteps (`int`, defaults to 1000):
-             The number of diffusion steps to train the model. This determines the resolution of the diffusion process.
-         solver_order (`int`, defaults to 2):
-             The DPMSolver order which can be `1`, `2`, or `3`. It is recommended to use `solver_order=2` for guided
-             sampling, and `solver_order=3` for unconditional sampling. This affects the number of model outputs stored
-             and used in multistep updates.
-         prediction_type (`str`, defaults to "flow_prediction"):
-             Prediction type of the scheduler function; must be `flow_prediction` for this scheduler, which predicts
-             the flow of the diffusion process.
-         shift (`float`, *optional*, defaults to 1.0):
-             A factor used to adjust the sigmas in the noise schedule. It modifies the step sizes during the sampling
-             process.
-         use_dynamic_shifting (`bool`, defaults to `False`):
-             Whether to apply dynamic shifting to the timesteps based on image resolution. If `True`, the shifting is
-             applied on the fly.
-         thresholding (`bool`, defaults to `False`):
-             Whether to use the "dynamic thresholding" method. This method adjusts the predicted sample to prevent
-             saturation and improve photorealism.
-         dynamic_thresholding_ratio (`float`, defaults to 0.995):
-             The ratio for the dynamic thresholding method. Valid only when `thresholding=True`.
-         sample_max_value (`float`, defaults to 1.0):
-             The threshold value for dynamic thresholding. Valid only when `thresholding=True` and
-             `algorithm_type="dpmsolver++"`.
-         algorithm_type (`str`, defaults to `dpmsolver++`):
-             Algorithm type for the solver; can be `dpmsolver`, `dpmsolver++`, `sde-dpmsolver` or `sde-dpmsolver++`. The
-             `dpmsolver` type implements the algorithms in the [DPMSolver](https://huggingface.co/papers/2206.00927)
-             paper, and the `dpmsolver++` type implements the algorithms in the
-             [DPMSolver++](https://huggingface.co/papers/2211.01095) paper. It is recommended to use `dpmsolver++` or
-             `sde-dpmsolver++` with `solver_order=2` for guided sampling like in Stable Diffusion.
-         solver_type (`str`, defaults to `midpoint`):
-             Solver type for the second-order solver; can be `midpoint` or `heun`. The solver type slightly affects the
-             sample quality, especially for a small number of steps. It is recommended to use `midpoint` solvers.
-         lower_order_final (`bool`, defaults to `True`):
-             Whether to use lower-order solvers in the final steps. Only valid for < 15 inference steps. This can
-             stabilize the sampling of DPMSolver for steps < 15, especially for steps <= 10.
-         euler_at_final (`bool`, defaults to `False`):
-             Whether to use Euler's method in the final step. It is a trade-off between numerical stability and detail
-             richness. This can stabilize the sampling of the SDE variant of DPMSolver for small number of inference
-             steps, but sometimes may result in blurring.
-         final_sigmas_type (`str`, *optional*, defaults to "zero"):
-             The final `sigma` value for the noise schedule during the sampling process. If `"sigma_min"`, the final
-             sigma is the same as the last sigma in the training schedule. If `zero`, the final sigma is set to 0.
-         lambda_min_clipped (`float`, defaults to `-inf`):
-             Clipping threshold for the minimum value of `lambda(t)` for numerical stability. This is critical for the
-             cosine (`squaredcos_cap_v2`) noise schedule.
-         variance_type (`str`, *optional*):
-             Set to "learned" or "learned_range" for diffusion models that predict variance. If set, the model's output
-             contains the predicted Gaussian variance.
-     """
-
-     _compatibles = [e.name for e in KarrasDiffusionSchedulers]
-     order = 1
-
-     @register_to_config
-     def __init__(
-         self,
-         num_train_timesteps: int = 1000,
-         solver_order: int = 2,
-         prediction_type: str = "flow_prediction",
-         shift: Optional[float] = 1.0,
-         use_dynamic_shifting=False,
-         thresholding: bool = False,
-         dynamic_thresholding_ratio: float = 0.995,
-         sample_max_value: float = 1.0,
-         algorithm_type: str = "dpmsolver++",
-         solver_type: str = "midpoint",
-         lower_order_final: bool = True,
-         euler_at_final: bool = False,
-         final_sigmas_type: Optional[str] = "zero",  # "zero", "sigma_min"
-         lambda_min_clipped: float = -float("inf"),
-         variance_type: Optional[str] = None,
-         invert_sigmas: bool = False,
-     ):
-         if algorithm_type in ["dpmsolver", "sde-dpmsolver"]:
-             deprecation_message = f"algorithm_type {algorithm_type} is deprecated and will be removed in a future version. Choose from `dpmsolver++` or `sde-dpmsolver++` instead"
152
- deprecate("algorithm_types dpmsolver and sde-dpmsolver", "1.0.0",
153
- deprecation_message)
154
-
155
- # settings for DPM-Solver
156
- if algorithm_type not in [
157
- "dpmsolver", "dpmsolver++", "sde-dpmsolver", "sde-dpmsolver++"
158
- ]:
159
- if algorithm_type == "deis":
160
- self.register_to_config(algorithm_type="dpmsolver++")
161
- else:
162
- raise NotImplementedError(
163
- f"{algorithm_type} is not implemented for {self.__class__}")
164
-
165
- if solver_type not in ["midpoint", "heun"]:
166
- if solver_type in ["logrho", "bh1", "bh2"]:
167
- self.register_to_config(solver_type="midpoint")
168
- else:
169
- raise NotImplementedError(
170
- f"{solver_type} is not implemented for {self.__class__}")
171
-
172
- if algorithm_type not in ["dpmsolver++", "sde-dpmsolver++"
173
- ] and final_sigmas_type == "zero":
174
- raise ValueError(
175
- f"`final_sigmas_type` {final_sigmas_type} is not supported for `algorithm_type` {algorithm_type}. Please choose `sigma_min` instead."
176
- )
177
-
178
- # setable values
179
- self.num_inference_steps = None
180
- alphas = np.linspace(1, 1 / num_train_timesteps,
181
- num_train_timesteps)[::-1].copy()
182
- sigmas = 1.0 - alphas
183
- sigmas = torch.from_numpy(sigmas).to(dtype=torch.float32)
184
-
185
- if not use_dynamic_shifting:
186
- # when use_dynamic_shifting is True, we apply the timestep shifting on the fly based on the image resolution
187
- sigmas = shift * sigmas / (1 +
188
- (shift - 1) * sigmas) # pyright: ignore
189
-
190
- self.sigmas = sigmas
191
- self.timesteps = sigmas * num_train_timesteps
192
-
193
- self.model_outputs = [None] * solver_order
194
- self.lower_order_nums = 0
195
- self._step_index = None
196
- self._begin_index = None
197
-
198
- # self.sigmas = self.sigmas.to(
199
- # "cpu") # to avoid too much CPU/GPU communication
200
- self.sigma_min = self.sigmas[-1].item()
201
- self.sigma_max = self.sigmas[0].item()
202
-
203
- @property
204
- def step_index(self):
205
- """
206
-         The index counter for the current timestep. It increases by 1 after each scheduler step.
207
- """
208
- return self._step_index
209
-
210
- @property
211
- def begin_index(self):
212
- """
213
- The index for the first timestep. It should be set from pipeline with `set_begin_index` method.
214
- """
215
- return self._begin_index
216
-
217
- # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index
218
- def set_begin_index(self, begin_index: int = 0):
219
- """
220
- Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
221
- Args:
222
- begin_index (`int`):
223
- The begin index for the scheduler.
224
- """
225
- self._begin_index = begin_index
226
-
227
- # Modified from diffusers.schedulers.scheduling_flow_match_euler_discrete.FlowMatchEulerDiscreteScheduler.set_timesteps
228
- def set_timesteps(
229
- self,
230
- num_inference_steps: Union[int, None] = None,
231
- device: Union[str, torch.device] = None,
232
- sigmas: Optional[List[float]] = None,
233
- mu: Optional[Union[float, None]] = None,
234
- shift: Optional[Union[float, None]] = None,
235
- ):
236
- """
237
- Sets the discrete timesteps used for the diffusion chain (to be run before inference).
238
- Args:
239
- num_inference_steps (`int`):
240
-                 The number of discrete timesteps to generate for the sampling schedule.
241
- device (`str` or `torch.device`, *optional*):
242
-                 The device to which the timesteps should be moved. If `None`, the timesteps are not moved.
243
- """
244
-
245
- if self.config.use_dynamic_shifting and mu is None:
246
- raise ValueError(
247
-                 "you have to pass a value for `mu` when `use_dynamic_shifting` is set to `True`"
248
- )
249
-
250
- if sigmas is None:
251
- sigmas = np.linspace(self.sigma_max, self.sigma_min,
252
- num_inference_steps +
253
- 1).copy()[:-1] # pyright: ignore
254
-
255
- if self.config.use_dynamic_shifting:
256
- sigmas = self.time_shift(mu, 1.0, sigmas) # pyright: ignore
257
- else:
258
- if shift is None:
259
- shift = self.config.shift
260
- sigmas = shift * sigmas / (1 +
261
- (shift - 1) * sigmas) # pyright: ignore
262
-
263
- if self.config.final_sigmas_type == "sigma_min":
264
- sigma_last = ((1 - self.alphas_cumprod[0]) /
265
- self.alphas_cumprod[0])**0.5
266
- elif self.config.final_sigmas_type == "zero":
267
- sigma_last = 0
268
- else:
269
- raise ValueError(
270
- f"`final_sigmas_type` must be one of 'zero', or 'sigma_min', but got {self.config.final_sigmas_type}"
271
- )
272
-
273
- timesteps = sigmas * self.config.num_train_timesteps
274
- sigmas = np.concatenate([sigmas, [sigma_last]
275
- ]).astype(np.float32) # pyright: ignore
276
-
277
- self.sigmas = torch.from_numpy(sigmas)
278
- self.timesteps = torch.from_numpy(timesteps).to(
279
- device=device, dtype=torch.int64)
280
-
281
- self.num_inference_steps = len(timesteps)
282
-
283
- self.model_outputs = [
284
- None,
285
- ] * self.config.solver_order
286
- self.lower_order_nums = 0
287
-
288
- self._step_index = None
289
- self._begin_index = None
290
- # self.sigmas = self.sigmas.to(
291
- # "cpu") # to avoid too much CPU/GPU communication
292
-
293
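When `use_dynamic_shifting=True`, the static shift above is replaced by the exponential `time_shift` defined further down, driven by a resolution-dependent `mu`. A small standalone sketch (not part of the deleted file; values rounded), reproducing that formula:

    import math

    # time_shift(mu, 1.0, t) = e^mu / (e^mu + (1/t - 1)); larger mu pushes the
    # schedule toward high-noise sigmas (values closer to 1).
    def time_shift(mu, sigma_exp, t):
        return math.exp(mu) / (math.exp(mu) + (1 / t - 1) ** sigma_exp)

    for t in (0.8, 0.5, 0.2):
        print(round(time_shift(1.0, 1.0, t), 3))   # 0.916, 0.731, 0.405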
- # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
294
- def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
295
- """
296
- "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
297
- prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
298
- s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
299
- pixels from saturation at each step. We find that dynamic thresholding results in significantly better
300
- photorealism as well as better image-text alignment, especially when using very large guidance weights."
301
- https://arxiv.org/abs/2205.11487
302
- """
303
- dtype = sample.dtype
304
- batch_size, channels, *remaining_dims = sample.shape
305
-
306
- if dtype not in (torch.float32, torch.float64):
307
- sample = sample.float(
308
- ) # upcast for quantile calculation, and clamp not implemented for cpu half
309
-
310
- # Flatten sample for doing quantile calculation along each image
311
- sample = sample.reshape(batch_size, channels * np.prod(remaining_dims))
312
-
313
- abs_sample = sample.abs() # "a certain percentile absolute pixel value"
314
-
315
- s = torch.quantile(
316
- abs_sample, self.config.dynamic_thresholding_ratio, dim=1)
317
- s = torch.clamp(
318
- s, min=1, max=self.config.sample_max_value
319
- ) # When clamped to min=1, equivalent to standard clipping to [-1, 1]
320
- s = s.unsqueeze(
321
- 1) # (batch_size, 1) because clamp will broadcast along dim=0
322
- sample = torch.clamp(
323
- sample, -s, s
324
- ) / s # "we threshold xt0 to the range [-s, s] and then divide by s"
325
-
326
- sample = sample.reshape(batch_size, channels, *remaining_dims)
327
- sample = sample.to(dtype)
328
-
329
- return sample
330
-
331
- # Copied from diffusers.schedulers.scheduling_flow_match_euler_discrete.FlowMatchEulerDiscreteScheduler._sigma_to_t
332
- def _sigma_to_t(self, sigma):
333
- return sigma * self.config.num_train_timesteps
334
-
335
- def _sigma_to_alpha_sigma_t(self, sigma):
336
- return 1 - sigma, sigma
337
-
338
- # Copied from diffusers.schedulers.scheduling_flow_match_euler_discrete.set_timesteps
339
- def time_shift(self, mu: float, sigma: float, t: torch.Tensor):
340
- return math.exp(mu) / (math.exp(mu) + (1 / t - 1)**sigma)
341
-
342
- # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.convert_model_output
343
- def convert_model_output(
344
- self,
345
- model_output: torch.Tensor,
346
- *args,
347
- sample: torch.Tensor = None,
348
- **kwargs,
349
- ) -> torch.Tensor:
350
- """
351
- Convert the model output to the corresponding type the DPMSolver/DPMSolver++ algorithm needs. DPM-Solver is
352
- designed to discretize an integral of the noise prediction model, and DPM-Solver++ is designed to discretize an
353
- integral of the data prediction model.
354
- <Tip>
355
- The algorithm and model type are decoupled. You can use either DPMSolver or DPMSolver++ for both noise
356
- prediction and data prediction models.
357
- </Tip>
358
- Args:
359
- model_output (`torch.Tensor`):
360
- The direct output from the learned diffusion model.
361
- sample (`torch.Tensor`):
362
- A current instance of a sample created by the diffusion process.
363
- Returns:
364
- `torch.Tensor`:
365
- The converted model output.
366
- """
367
- timestep = args[0] if len(args) > 0 else kwargs.pop("timestep", None)
368
- if sample is None:
369
- if len(args) > 1:
370
- sample = args[1]
371
- else:
372
- raise ValueError(
373
-                     "missing `sample` as a required keyword argument")
374
- if timestep is not None:
375
- deprecate(
376
- "timesteps",
377
- "1.0.0",
378
- "Passing `timesteps` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
379
- )
380
-
381
- # DPM-Solver++ needs to solve an integral of the data prediction model.
382
- if self.config.algorithm_type in ["dpmsolver++", "sde-dpmsolver++"]:
383
- if self.config.prediction_type == "flow_prediction":
384
- sigma_t = self.sigmas[self.step_index]
385
- x0_pred = sample - sigma_t * model_output
386
- else:
387
- raise ValueError(
388
- f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`,"
389
- " `v_prediction`, or `flow_prediction` for the FlowDPMSolverMultistepScheduler."
390
- )
391
-
392
- if self.config.thresholding:
393
- x0_pred = self._threshold_sample(x0_pred)
394
-
395
- return x0_pred
396
-
397
- # DPM-Solver needs to solve an integral of the noise prediction model.
398
- elif self.config.algorithm_type in ["dpmsolver", "sde-dpmsolver"]:
399
- if self.config.prediction_type == "flow_prediction":
400
- sigma_t = self.sigmas[self.step_index]
401
- epsilon = sample - (1 - sigma_t) * model_output
402
- else:
403
- raise ValueError(
404
- f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`,"
405
- " `v_prediction` or `flow_prediction` for the FlowDPMSolverMultistepScheduler."
406
- )
407
-
408
- if self.config.thresholding:
409
- sigma_t = self.sigmas[self.step_index]
410
- x0_pred = sample - sigma_t * model_output
411
- x0_pred = self._threshold_sample(x0_pred)
412
- epsilon = model_output + x0_pred
413
-
414
- return epsilon
415
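A quick algebraic check of the data-prediction branch above (illustrative only, not from the deleted file): under the flow-matching convention used here, x_t = (1 - sigma) * x0 + sigma * eps and the network predicts the velocity v = eps - x0, so subtracting sigma * v recovers x0 exactly.

    import torch

    x0, eps, sigma = torch.randn(4), torch.randn(4), 0.3
    x_t = (1 - sigma) * x0 + sigma * eps   # forward interpolation (cf. add_noise below)
    v = eps - x0                           # the "flow_prediction" target
    assert torch.allclose(x_t - sigma * v, x0)   # matches x0_pred = sample - sigma_t * model_output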
-
416
- # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.dpm_solver_first_order_update
417
- def dpm_solver_first_order_update(
418
- self,
419
- model_output: torch.Tensor,
420
- *args,
421
- sample: torch.Tensor = None,
422
- noise: Optional[torch.Tensor] = None,
423
- **kwargs,
424
- ) -> torch.Tensor:
425
- """
426
- One step for the first-order DPMSolver (equivalent to DDIM).
427
- Args:
428
- model_output (`torch.Tensor`):
429
- The direct output from the learned diffusion model.
430
- sample (`torch.Tensor`):
431
- A current instance of a sample created by the diffusion process.
432
- Returns:
433
- `torch.Tensor`:
434
- The sample tensor at the previous timestep.
435
- """
436
- timestep = args[0] if len(args) > 0 else kwargs.pop("timestep", None)
437
- prev_timestep = args[1] if len(args) > 1 else kwargs.pop(
438
- "prev_timestep", None)
439
- if sample is None:
440
- if len(args) > 2:
441
- sample = args[2]
442
- else:
443
- raise ValueError(
444
-                     "missing `sample` as a required keyword argument")
445
- if timestep is not None:
446
- deprecate(
447
- "timesteps",
448
- "1.0.0",
449
- "Passing `timesteps` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
450
- )
451
-
452
- if prev_timestep is not None:
453
- deprecate(
454
- "prev_timestep",
455
- "1.0.0",
456
- "Passing `prev_timestep` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
457
- )
458
-
459
- sigma_t, sigma_s = self.sigmas[self.step_index + 1], self.sigmas[
460
- self.step_index] # pyright: ignore
461
- alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t)
462
- alpha_s, sigma_s = self._sigma_to_alpha_sigma_t(sigma_s)
463
- lambda_t = torch.log(alpha_t) - torch.log(sigma_t)
464
- lambda_s = torch.log(alpha_s) - torch.log(sigma_s)
465
-
466
- h = lambda_t - lambda_s
467
- if self.config.algorithm_type == "dpmsolver++":
468
- x_t = (sigma_t /
469
- sigma_s) * sample - (alpha_t *
470
- (torch.exp(-h) - 1.0)) * model_output
471
- elif self.config.algorithm_type == "dpmsolver":
472
- x_t = (alpha_t /
473
- alpha_s) * sample - (sigma_t *
474
- (torch.exp(h) - 1.0)) * model_output
475
- elif self.config.algorithm_type == "sde-dpmsolver++":
476
- assert noise is not None
477
- x_t = ((sigma_t / sigma_s * torch.exp(-h)) * sample +
478
- (alpha_t * (1 - torch.exp(-2.0 * h))) * model_output +
479
- sigma_t * torch.sqrt(1.0 - torch.exp(-2 * h)) * noise)
480
- elif self.config.algorithm_type == "sde-dpmsolver":
481
- assert noise is not None
482
- x_t = ((alpha_t / alpha_s) * sample - 2.0 *
483
- (sigma_t * (torch.exp(h) - 1.0)) * model_output +
484
- sigma_t * torch.sqrt(torch.exp(2 * h) - 1.0) * noise)
485
- return x_t # pyright: ignore
486
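Illustrative sanity check (not from the deleted file): with alpha = 1 - sigma as above, the `dpmsolver++` first-order update reproduces the flow-matching marginal exactly whenever the x0 prediction is exact, regardless of step size, which is why it behaves like a DDIM step.

    import math
    import torch

    x0, eps = torch.randn(4), torch.randn(4)
    sigma_s, sigma_t = 0.8, 0.5
    alpha_s, alpha_t = 1 - sigma_s, 1 - sigma_t
    h = math.log(alpha_t / sigma_t) - math.log(alpha_s / sigma_s)   # lambda_t - lambda_s
    x_s = alpha_s * x0 + sigma_s * eps                              # exact sample at sigma_s
    x_t = (sigma_t / sigma_s) * x_s - alpha_t * (math.exp(-h) - 1.0) * x0
    assert torch.allclose(x_t, alpha_t * x0 + sigma_t * eps, atol=1e-6)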
-
487
- # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.multistep_dpm_solver_second_order_update
488
- def multistep_dpm_solver_second_order_update(
489
- self,
490
- model_output_list: List[torch.Tensor],
491
- *args,
492
- sample: torch.Tensor = None,
493
- noise: Optional[torch.Tensor] = None,
494
- **kwargs,
495
- ) -> torch.Tensor:
496
- """
497
- One step for the second-order multistep DPMSolver.
498
- Args:
499
- model_output_list (`List[torch.Tensor]`):
500
-                 The direct outputs from the learned diffusion model at the current and previous timesteps.
501
- sample (`torch.Tensor`):
502
- A current instance of a sample created by the diffusion process.
503
- Returns:
504
- `torch.Tensor`:
505
- The sample tensor at the previous timestep.
506
- """
507
- timestep_list = args[0] if len(args) > 0 else kwargs.pop(
508
- "timestep_list", None)
509
- prev_timestep = args[1] if len(args) > 1 else kwargs.pop(
510
- "prev_timestep", None)
511
- if sample is None:
512
- if len(args) > 2:
513
- sample = args[2]
514
- else:
515
- raise ValueError(
516
-                     "missing `sample` as a required keyword argument")
517
- if timestep_list is not None:
518
- deprecate(
519
- "timestep_list",
520
- "1.0.0",
521
- "Passing `timestep_list` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
522
- )
523
-
524
- if prev_timestep is not None:
525
- deprecate(
526
- "prev_timestep",
527
- "1.0.0",
528
- "Passing `prev_timestep` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
529
- )
530
-
531
- sigma_t, sigma_s0, sigma_s1 = (
532
- self.sigmas[self.step_index + 1], # pyright: ignore
533
- self.sigmas[self.step_index],
534
- self.sigmas[self.step_index - 1], # pyright: ignore
535
- )
536
-
537
- alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t)
538
- alpha_s0, sigma_s0 = self._sigma_to_alpha_sigma_t(sigma_s0)
539
- alpha_s1, sigma_s1 = self._sigma_to_alpha_sigma_t(sigma_s1)
540
-
541
- lambda_t = torch.log(alpha_t) - torch.log(sigma_t)
542
- lambda_s0 = torch.log(alpha_s0) - torch.log(sigma_s0)
543
- lambda_s1 = torch.log(alpha_s1) - torch.log(sigma_s1)
544
-
545
- m0, m1 = model_output_list[-1], model_output_list[-2]
546
-
547
- h, h_0 = lambda_t - lambda_s0, lambda_s0 - lambda_s1
548
- r0 = h_0 / h
549
- D0, D1 = m0, (1.0 / r0) * (m0 - m1)
550
- if self.config.algorithm_type == "dpmsolver++":
551
- # See https://arxiv.org/abs/2211.01095 for detailed derivations
552
- if self.config.solver_type == "midpoint":
553
- x_t = ((sigma_t / sigma_s0) * sample -
554
- (alpha_t * (torch.exp(-h) - 1.0)) * D0 - 0.5 *
555
- (alpha_t * (torch.exp(-h) - 1.0)) * D1)
556
- elif self.config.solver_type == "heun":
557
- x_t = ((sigma_t / sigma_s0) * sample -
558
- (alpha_t * (torch.exp(-h) - 1.0)) * D0 +
559
- (alpha_t * ((torch.exp(-h) - 1.0) / h + 1.0)) * D1)
560
- elif self.config.algorithm_type == "dpmsolver":
561
- # See https://arxiv.org/abs/2206.00927 for detailed derivations
562
- if self.config.solver_type == "midpoint":
563
- x_t = ((alpha_t / alpha_s0) * sample -
564
- (sigma_t * (torch.exp(h) - 1.0)) * D0 - 0.5 *
565
- (sigma_t * (torch.exp(h) - 1.0)) * D1)
566
- elif self.config.solver_type == "heun":
567
- x_t = ((alpha_t / alpha_s0) * sample -
568
- (sigma_t * (torch.exp(h) - 1.0)) * D0 -
569
- (sigma_t * ((torch.exp(h) - 1.0) / h - 1.0)) * D1)
570
- elif self.config.algorithm_type == "sde-dpmsolver++":
571
- assert noise is not None
572
- if self.config.solver_type == "midpoint":
573
- x_t = ((sigma_t / sigma_s0 * torch.exp(-h)) * sample +
574
- (alpha_t * (1 - torch.exp(-2.0 * h))) * D0 + 0.5 *
575
- (alpha_t * (1 - torch.exp(-2.0 * h))) * D1 +
576
- sigma_t * torch.sqrt(1.0 - torch.exp(-2 * h)) * noise)
577
- elif self.config.solver_type == "heun":
578
- x_t = ((sigma_t / sigma_s0 * torch.exp(-h)) * sample +
579
- (alpha_t * (1 - torch.exp(-2.0 * h))) * D0 +
580
- (alpha_t * ((1.0 - torch.exp(-2.0 * h)) /
581
- (-2.0 * h) + 1.0)) * D1 +
582
- sigma_t * torch.sqrt(1.0 - torch.exp(-2 * h)) * noise)
583
- elif self.config.algorithm_type == "sde-dpmsolver":
584
- assert noise is not None
585
- if self.config.solver_type == "midpoint":
586
- x_t = ((alpha_t / alpha_s0) * sample - 2.0 *
587
- (sigma_t * (torch.exp(h) - 1.0)) * D0 -
588
- (sigma_t * (torch.exp(h) - 1.0)) * D1 +
589
- sigma_t * torch.sqrt(torch.exp(2 * h) - 1.0) * noise)
590
- elif self.config.solver_type == "heun":
591
- x_t = ((alpha_t / alpha_s0) * sample - 2.0 *
592
- (sigma_t * (torch.exp(h) - 1.0)) * D0 - 2.0 *
593
- (sigma_t * ((torch.exp(h) - 1.0) / h - 1.0)) * D1 +
594
- sigma_t * torch.sqrt(torch.exp(2 * h) - 1.0) * noise)
595
- return x_t # pyright: ignore
596
-
597
- # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.multistep_dpm_solver_third_order_update
598
- def multistep_dpm_solver_third_order_update(
599
- self,
600
- model_output_list: List[torch.Tensor],
601
- *args,
602
- sample: torch.Tensor = None,
603
- **kwargs,
604
- ) -> torch.Tensor:
605
- """
606
- One step for the third-order multistep DPMSolver.
607
- Args:
608
- model_output_list (`List[torch.Tensor]`):
609
-                 The direct outputs from the learned diffusion model at the current and previous timesteps.
610
- sample (`torch.Tensor`):
611
-                 A current instance of a sample created by the diffusion process.
612
- Returns:
613
- `torch.Tensor`:
614
- The sample tensor at the previous timestep.
615
- """
616
-
617
- timestep_list = args[0] if len(args) > 0 else kwargs.pop(
618
- "timestep_list", None)
619
- prev_timestep = args[1] if len(args) > 1 else kwargs.pop(
620
- "prev_timestep", None)
621
- if sample is None:
622
- if len(args) > 2:
623
- sample = args[2]
624
- else:
625
- raise ValueError(
626
-                     "missing `sample` as a required keyword argument")
627
- if timestep_list is not None:
628
- deprecate(
629
- "timestep_list",
630
- "1.0.0",
631
- "Passing `timestep_list` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
632
- )
633
-
634
- if prev_timestep is not None:
635
- deprecate(
636
- "prev_timestep",
637
- "1.0.0",
638
- "Passing `prev_timestep` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
639
- )
640
-
641
- sigma_t, sigma_s0, sigma_s1, sigma_s2 = (
642
- self.sigmas[self.step_index + 1], # pyright: ignore
643
- self.sigmas[self.step_index],
644
- self.sigmas[self.step_index - 1], # pyright: ignore
645
- self.sigmas[self.step_index - 2], # pyright: ignore
646
- )
647
-
648
- alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t)
649
- alpha_s0, sigma_s0 = self._sigma_to_alpha_sigma_t(sigma_s0)
650
- alpha_s1, sigma_s1 = self._sigma_to_alpha_sigma_t(sigma_s1)
651
- alpha_s2, sigma_s2 = self._sigma_to_alpha_sigma_t(sigma_s2)
652
-
653
- lambda_t = torch.log(alpha_t) - torch.log(sigma_t)
654
- lambda_s0 = torch.log(alpha_s0) - torch.log(sigma_s0)
655
- lambda_s1 = torch.log(alpha_s1) - torch.log(sigma_s1)
656
- lambda_s2 = torch.log(alpha_s2) - torch.log(sigma_s2)
657
-
658
- m0, m1, m2 = model_output_list[-1], model_output_list[
659
- -2], model_output_list[-3]
660
-
661
- h, h_0, h_1 = lambda_t - lambda_s0, lambda_s0 - lambda_s1, lambda_s1 - lambda_s2
662
- r0, r1 = h_0 / h, h_1 / h
663
- D0 = m0
664
- D1_0, D1_1 = (1.0 / r0) * (m0 - m1), (1.0 / r1) * (m1 - m2)
665
- D1 = D1_0 + (r0 / (r0 + r1)) * (D1_0 - D1_1)
666
- D2 = (1.0 / (r0 + r1)) * (D1_0 - D1_1)
667
- if self.config.algorithm_type == "dpmsolver++":
668
- # See https://arxiv.org/abs/2206.00927 for detailed derivations
669
- x_t = ((sigma_t / sigma_s0) * sample -
670
- (alpha_t * (torch.exp(-h) - 1.0)) * D0 +
671
- (alpha_t * ((torch.exp(-h) - 1.0) / h + 1.0)) * D1 -
672
- (alpha_t * ((torch.exp(-h) - 1.0 + h) / h**2 - 0.5)) * D2)
673
- elif self.config.algorithm_type == "dpmsolver":
674
- # See https://arxiv.org/abs/2206.00927 for detailed derivations
675
- x_t = ((alpha_t / alpha_s0) * sample - (sigma_t *
676
- (torch.exp(h) - 1.0)) * D0 -
677
- (sigma_t * ((torch.exp(h) - 1.0) / h - 1.0)) * D1 -
678
- (sigma_t * ((torch.exp(h) - 1.0 - h) / h**2 - 0.5)) * D2)
679
- return x_t # pyright: ignore
680
-
681
- def index_for_timestep(self, timestep, schedule_timesteps=None):
682
- if schedule_timesteps is None:
683
- schedule_timesteps = self.timesteps
684
-
685
- indices = (schedule_timesteps == timestep).nonzero()
686
-
687
- # The sigma index that is taken for the **very** first `step`
688
- # is always the second index (or the last index if there is only 1)
689
- # This way we can ensure we don't accidentally skip a sigma in
690
- # case we start in the middle of the denoising schedule (e.g. for image-to-image)
691
- pos = 1 if len(indices) > 1 else 0
692
-
693
- return indices[pos].item()
694
-
695
- def _init_step_index(self, timestep):
696
- """
697
- Initialize the step_index counter for the scheduler.
698
- """
699
-
700
- if self.begin_index is None:
701
- if isinstance(timestep, torch.Tensor):
702
- timestep = timestep.to(self.timesteps.device)
703
- self._step_index = self.index_for_timestep(timestep)
704
- else:
705
- self._step_index = self._begin_index
706
-
707
- # Modified from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.step
708
- def step(
709
- self,
710
- model_output: torch.Tensor,
711
- timestep: Union[int, torch.Tensor],
712
- sample: torch.Tensor,
713
- generator=None,
714
- variance_noise: Optional[torch.Tensor] = None,
715
- return_dict: bool = True,
716
- ) -> Union[SchedulerOutput, Tuple]:
717
- """
718
- Predict the sample from the previous timestep by reversing the SDE. This function propagates the sample with
719
- the multistep DPMSolver.
720
- Args:
721
- model_output (`torch.Tensor`):
722
- The direct output from learned diffusion model.
723
- timestep (`int`):
724
- The current discrete timestep in the diffusion chain.
725
- sample (`torch.Tensor`):
726
- A current instance of a sample created by the diffusion process.
727
- generator (`torch.Generator`, *optional*):
728
- A random number generator.
729
- variance_noise (`torch.Tensor`):
730
- Alternative to generating noise with `generator` by directly providing the noise for the variance
731
- itself. Useful for methods such as [`LEdits++`].
732
- return_dict (`bool`):
733
- Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`.
734
- Returns:
735
- [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`:
736
- If return_dict is `True`, [`~schedulers.scheduling_utils.SchedulerOutput`] is returned, otherwise a
737
- tuple is returned where the first element is the sample tensor.
738
- """
739
- if self.num_inference_steps is None:
740
- raise ValueError(
741
- "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
742
- )
743
-
744
- if self.step_index is None:
745
- self._init_step_index(timestep)
746
-
747
- # Improve numerical stability for small number of steps
748
- lower_order_final = (self.step_index == len(self.timesteps) - 1) and (
749
- self.config.euler_at_final or
750
- (self.config.lower_order_final and len(self.timesteps) < 15) or
751
- self.config.final_sigmas_type == "zero")
752
- lower_order_second = ((self.step_index == len(self.timesteps) - 2) and
753
- self.config.lower_order_final and
754
- len(self.timesteps) < 15)
755
-
756
- model_output = self.convert_model_output(model_output, sample=sample)
757
- for i in range(self.config.solver_order - 1):
758
- self.model_outputs[i] = self.model_outputs[i + 1]
759
- self.model_outputs[-1] = model_output
760
-
761
- # Upcast to avoid precision issues when computing prev_sample
762
- sample = sample.to(torch.float32)
763
- if self.config.algorithm_type in ["sde-dpmsolver", "sde-dpmsolver++"
764
- ] and variance_noise is None:
765
- noise = randn_tensor(
766
- model_output.shape,
767
- generator=generator,
768
- device=model_output.device,
769
- dtype=torch.float32)
770
- elif self.config.algorithm_type in ["sde-dpmsolver", "sde-dpmsolver++"]:
771
- noise = variance_noise.to(
772
- device=model_output.device,
773
- dtype=torch.float32) # pyright: ignore
774
- else:
775
- noise = None
776
-
777
- if self.config.solver_order == 1 or self.lower_order_nums < 1 or lower_order_final:
778
- prev_sample = self.dpm_solver_first_order_update(
779
- model_output, sample=sample, noise=noise)
780
- elif self.config.solver_order == 2 or self.lower_order_nums < 2 or lower_order_second:
781
- prev_sample = self.multistep_dpm_solver_second_order_update(
782
- self.model_outputs, sample=sample, noise=noise)
783
- else:
784
- prev_sample = self.multistep_dpm_solver_third_order_update(
785
- self.model_outputs, sample=sample)
786
-
787
- if self.lower_order_nums < self.config.solver_order:
788
- self.lower_order_nums += 1
789
-
790
- # Cast sample back to expected dtype
791
- prev_sample = prev_sample.to(model_output.dtype)
792
-
793
- # upon completion increase step index by one
794
- self._step_index += 1 # pyright: ignore
795
-
796
- if not return_dict:
797
- return (prev_sample,)
798
-
799
- return SchedulerOutput(prev_sample=prev_sample)
800
-
801
- # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.scale_model_input
802
- def scale_model_input(self, sample: torch.Tensor, *args,
803
- **kwargs) -> torch.Tensor:
804
- """
805
- Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
806
- current timestep.
807
- Args:
808
- sample (`torch.Tensor`):
809
- The input sample.
810
- Returns:
811
- `torch.Tensor`:
812
- A scaled input sample.
813
- """
814
- return sample
815
-
816
-     # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.add_noise
817
- def add_noise(
818
- self,
819
- original_samples: torch.Tensor,
820
- noise: torch.Tensor,
821
- timesteps: torch.IntTensor,
822
- ) -> torch.Tensor:
823
- # Make sure sigmas and timesteps have the same device and dtype as original_samples
824
- sigmas = self.sigmas.to(
825
- device=original_samples.device, dtype=original_samples.dtype)
826
- if original_samples.device.type == "mps" and torch.is_floating_point(
827
- timesteps):
828
- # mps does not support float64
829
- schedule_timesteps = self.timesteps.to(
830
- original_samples.device, dtype=torch.float32)
831
- timesteps = timesteps.to(
832
- original_samples.device, dtype=torch.float32)
833
- else:
834
- schedule_timesteps = self.timesteps.to(original_samples.device)
835
- timesteps = timesteps.to(original_samples.device)
836
-
837
- # begin_index is None when the scheduler is used for training or pipeline does not implement set_begin_index
838
- if self.begin_index is None:
839
- step_indices = [
840
- self.index_for_timestep(t, schedule_timesteps)
841
- for t in timesteps
842
- ]
843
- elif self.step_index is not None:
844
- # add_noise is called after first denoising step (for inpainting)
845
- step_indices = [self.step_index] * timesteps.shape[0]
846
- else:
847
- # add noise is called before first denoising step to create initial latent(img2img)
848
- step_indices = [self.begin_index] * timesteps.shape[0]
849
-
850
- sigma = sigmas[step_indices].flatten()
851
- while len(sigma.shape) < len(original_samples.shape):
852
- sigma = sigma.unsqueeze(-1)
853
-
854
- alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma)
855
- noisy_samples = alpha_t * original_samples + sigma_t * noise
856
- return noisy_samples
857
-
858
- def __len__(self):
859
-         return self.config.num_train_timesteps
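For orientation, a minimal sketch of how the pieces deleted above fit together in a sampling loop. The denoiser below is only a placeholder (not the Wan transformer), the tensor shape is a toy value, and the import assumes the module as it existed before this deletion; shift is applied once, through `get_sampling_sigmas`, so the scheduler itself keeps `shift=1`.

    import torch
    from wan.utils.fm_solvers import (FlowDPMSolverMultistepScheduler,
                                      get_sampling_sigmas, retrieve_timesteps)

    def denoiser(latents, t):
        # placeholder for the real flow-prediction model
        return torch.zeros_like(latents)

    scheduler = FlowDPMSolverMultistepScheduler(
        num_train_timesteps=1000, shift=1, use_dynamic_shifting=False)
    sampling_sigmas = get_sampling_sigmas(sampling_steps=40, shift=5.0)
    timesteps, _ = retrieve_timesteps(scheduler, device="cpu", sigmas=sampling_sigmas)

    latents = torch.randn(1, 16, 1, 8, 8)          # toy latent tensor
    for t in timesteps:
        flow_pred = denoiser(latents, t)
        latents = scheduler.step(flow_pred, t, latents).prev_sample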
wan/utils/fm_solvers_unipc.py DELETED
@@ -1,802 +0,0 @@
1
- # Copied from https://github.com/huggingface/diffusers/blob/v0.31.0/src/diffusers/schedulers/scheduling_unipc_multistep.py
2
- # Convert unipc for flow matching
3
- # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
4
-
5
- import math
6
- from typing import List, Optional, Tuple, Union
7
-
8
- import numpy as np
9
- import torch
10
- from diffusers.configuration_utils import ConfigMixin, register_to_config
11
- from diffusers.schedulers.scheduling_utils import (
12
- KarrasDiffusionSchedulers,
13
- SchedulerMixin,
14
- SchedulerOutput,
15
- )
16
- from diffusers.utils import deprecate, is_scipy_available
17
-
18
- if is_scipy_available():
19
- import scipy.stats
20
-
21
-
22
- class FlowUniPCMultistepScheduler(SchedulerMixin, ConfigMixin):
23
- """
24
- `UniPCMultistepScheduler` is a training-free framework designed for the fast sampling of diffusion models.
25
-
26
- This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic
27
- methods the library implements for all schedulers such as loading and saving.
28
-
29
- Args:
30
- num_train_timesteps (`int`, defaults to 1000):
31
- The number of diffusion steps to train the model.
32
- solver_order (`int`, default `2`):
33
- The UniPC order which can be any positive integer. The effective order of accuracy is `solver_order + 1`
34
- due to the UniC. It is recommended to use `solver_order=2` for guided sampling, and `solver_order=3` for
35
- unconditional sampling.
36
- prediction_type (`str`, defaults to "flow_prediction"):
37
- Prediction type of the scheduler function; must be `flow_prediction` for this scheduler, which predicts
38
- the flow of the diffusion process.
39
- thresholding (`bool`, defaults to `False`):
40
- Whether to use the "dynamic thresholding" method. This is unsuitable for latent-space diffusion models such
41
- as Stable Diffusion.
42
- dynamic_thresholding_ratio (`float`, defaults to 0.995):
43
- The ratio for the dynamic thresholding method. Valid only when `thresholding=True`.
44
- sample_max_value (`float`, defaults to 1.0):
45
- The threshold value for dynamic thresholding. Valid only when `thresholding=True` and `predict_x0=True`.
46
- predict_x0 (`bool`, defaults to `True`):
47
- Whether to use the updating algorithm on the predicted x0.
48
- solver_type (`str`, default `bh2`):
49
- Solver type for UniPC. It is recommended to use `bh1` for unconditional sampling when steps < 10, and `bh2`
50
- otherwise.
51
- lower_order_final (`bool`, default `True`):
52
- Whether to use lower-order solvers in the final steps. Only valid for < 15 inference steps. This can
53
- stabilize the sampling of DPMSolver for steps < 15, especially for steps <= 10.
54
- disable_corrector (`list`, default `[]`):
55
- Decides which step to disable the corrector to mitigate the misalignment between `epsilon_theta(x_t, c)`
56
- and `epsilon_theta(x_t^c, c)` which can influence convergence for a large guidance scale. Corrector is
57
- usually disabled during the first few steps.
58
- solver_p (`SchedulerMixin`, default `None`):
59
-             Another scheduler that, if specified, replaces the UniP predictor so the algorithm becomes `solver_p + UniC`.
60
- use_karras_sigmas (`bool`, *optional*, defaults to `False`):
61
- Whether to use Karras sigmas for step sizes in the noise schedule during the sampling process. If `True`,
62
- the sigmas are determined according to a sequence of noise levels {σi}.
63
- use_exponential_sigmas (`bool`, *optional*, defaults to `False`):
64
- Whether to use exponential sigmas for step sizes in the noise schedule during the sampling process.
65
- timestep_spacing (`str`, defaults to `"linspace"`):
66
- The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
67
- Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
68
- steps_offset (`int`, defaults to 0):
69
- An offset added to the inference steps, as required by some model families.
70
- final_sigmas_type (`str`, defaults to `"zero"`):
71
- The final `sigma` value for the noise schedule during the sampling process. If `"sigma_min"`, the final
72
- sigma is the same as the last sigma in the training schedule. If `zero`, the final sigma is set to 0.
73
- """
74
-
75
- _compatibles = [e.name for e in KarrasDiffusionSchedulers]
76
- order = 1
77
-
78
- @register_to_config
79
- def __init__(
80
- self,
81
- num_train_timesteps: int = 1000,
82
- solver_order: int = 2,
83
- prediction_type: str = "flow_prediction",
84
- shift: Optional[float] = 1.0,
85
- use_dynamic_shifting=False,
86
- thresholding: bool = False,
87
- dynamic_thresholding_ratio: float = 0.995,
88
- sample_max_value: float = 1.0,
89
- predict_x0: bool = True,
90
- solver_type: str = "bh2",
91
- lower_order_final: bool = True,
92
- disable_corrector: List[int] = [],
93
- solver_p: SchedulerMixin = None,
94
- timestep_spacing: str = "linspace",
95
- steps_offset: int = 0,
96
- final_sigmas_type: Optional[str] = "zero", # "zero", "sigma_min"
97
- ):
98
-
99
- if solver_type not in ["bh1", "bh2"]:
100
- if solver_type in ["midpoint", "heun", "logrho"]:
101
- self.register_to_config(solver_type="bh2")
102
- else:
103
- raise NotImplementedError(
104
- f"{solver_type} is not implemented for {self.__class__}")
105
-
106
- self.predict_x0 = predict_x0
107
- # setable values
108
- self.num_inference_steps = None
109
- alphas = np.linspace(1, 1 / num_train_timesteps,
110
- num_train_timesteps)[::-1].copy()
111
- sigmas = 1.0 - alphas
112
- sigmas = torch.from_numpy(sigmas).to(dtype=torch.float32)
113
-
114
- if not use_dynamic_shifting:
115
- # when use_dynamic_shifting is True, we apply the timestep shifting on the fly based on the image resolution
116
- sigmas = shift * sigmas / (1 +
117
- (shift - 1) * sigmas) # pyright: ignore
118
-
119
- self.sigmas = sigmas
120
- self.timesteps = sigmas * num_train_timesteps
121
-
122
- self.model_outputs = [None] * solver_order
123
- self.timestep_list = [None] * solver_order
124
- self.lower_order_nums = 0
125
- self.disable_corrector = disable_corrector
126
- self.solver_p = solver_p
127
- self.last_sample = None
128
- self._step_index = None
129
- self._begin_index = None
130
-
131
- self.sigmas = self.sigmas.to(
132
- "cpu") # to avoid too much CPU/GPU communication
133
- self.sigma_min = self.sigmas[-1].item()
134
- self.sigma_max = self.sigmas[0].item()
135
-
136
- @property
137
- def step_index(self):
138
- """
139
-         The index counter for the current timestep. It increases by 1 after each scheduler step.
140
- """
141
- return self._step_index
142
-
143
- @property
144
- def begin_index(self):
145
- """
146
- The index for the first timestep. It should be set from pipeline with `set_begin_index` method.
147
- """
148
- return self._begin_index
149
-
150
- # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index
151
- def set_begin_index(self, begin_index: int = 0):
152
- """
153
- Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
154
-
155
- Args:
156
- begin_index (`int`):
157
- The begin index for the scheduler.
158
- """
159
- self._begin_index = begin_index
160
-
161
- # Modified from diffusers.schedulers.scheduling_flow_match_euler_discrete.FlowMatchEulerDiscreteScheduler.set_timesteps
162
- def set_timesteps(
163
- self,
164
- num_inference_steps: Union[int, None] = None,
165
- device: Union[str, torch.device] = None,
166
- sigmas: Optional[List[float]] = None,
167
- mu: Optional[Union[float, None]] = None,
168
- shift: Optional[Union[float, None]] = None,
169
- ):
170
- """
171
- Sets the discrete timesteps used for the diffusion chain (to be run before inference).
172
- Args:
173
- num_inference_steps (`int`):
174
-                 The number of discrete timesteps to generate for the sampling schedule.
175
- device (`str` or `torch.device`, *optional*):
176
-                 The device to which the timesteps should be moved. If `None`, the timesteps are not moved.
177
- """
178
-
179
- if self.config.use_dynamic_shifting and mu is None:
180
- raise ValueError(
181
-                 "you have to pass a value for `mu` when `use_dynamic_shifting` is set to `True`"
182
- )
183
-
184
- if sigmas is None:
185
- sigmas = np.linspace(self.sigma_max, self.sigma_min,
186
- num_inference_steps +
187
- 1).copy()[:-1] # pyright: ignore
188
-
189
- if self.config.use_dynamic_shifting:
190
- sigmas = self.time_shift(mu, 1.0, sigmas) # pyright: ignore
191
- else:
192
- if shift is None:
193
- shift = self.config.shift
194
- sigmas = shift * sigmas / (1 +
195
- (shift - 1) * sigmas) # pyright: ignore
196
-
197
- if self.config.final_sigmas_type == "sigma_min":
198
- sigma_last = ((1 - self.alphas_cumprod[0]) /
199
- self.alphas_cumprod[0])**0.5
200
- elif self.config.final_sigmas_type == "zero":
201
- sigma_last = 0
202
- else:
203
- raise ValueError(
204
- f"`final_sigmas_type` must be one of 'zero', or 'sigma_min', but got {self.config.final_sigmas_type}"
205
- )
206
-
207
- timesteps = sigmas * self.config.num_train_timesteps
208
- sigmas = np.concatenate([sigmas, [sigma_last]
209
- ]).astype(np.float32) # pyright: ignore
210
-
211
- self.sigmas = torch.from_numpy(sigmas)
212
- self.timesteps = torch.from_numpy(timesteps).to(
213
- device=device, dtype=torch.int64)
214
-
215
- self.num_inference_steps = len(timesteps)
216
-
217
- self.model_outputs = [
218
- None,
219
- ] * self.config.solver_order
220
- self.lower_order_nums = 0
221
- self.last_sample = None
222
- if self.solver_p:
223
- self.solver_p.set_timesteps(self.num_inference_steps, device=device)
224
-
225
- # add an index counter for schedulers that allow duplicated timesteps
226
- self._step_index = None
227
- self._begin_index = None
228
- self.sigmas = self.sigmas.to(
229
- "cpu") # to avoid too much CPU/GPU communication
230
-
231
- # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
232
- def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
233
- """
234
- "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
235
- prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
236
- s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
237
- pixels from saturation at each step. We find that dynamic thresholding results in significantly better
238
- photorealism as well as better image-text alignment, especially when using very large guidance weights."
239
-
240
- https://arxiv.org/abs/2205.11487
241
- """
242
- dtype = sample.dtype
243
- batch_size, channels, *remaining_dims = sample.shape
244
-
245
- if dtype not in (torch.float32, torch.float64):
246
- sample = sample.float(
247
- ) # upcast for quantile calculation, and clamp not implemented for cpu half
248
-
249
- # Flatten sample for doing quantile calculation along each image
250
- sample = sample.reshape(batch_size, channels * np.prod(remaining_dims))
251
-
252
- abs_sample = sample.abs() # "a certain percentile absolute pixel value"
253
-
254
- s = torch.quantile(
255
- abs_sample, self.config.dynamic_thresholding_ratio, dim=1)
256
- s = torch.clamp(
257
- s, min=1, max=self.config.sample_max_value
258
- ) # When clamped to min=1, equivalent to standard clipping to [-1, 1]
259
- s = s.unsqueeze(
260
- 1) # (batch_size, 1) because clamp will broadcast along dim=0
261
- sample = torch.clamp(
262
- sample, -s, s
263
- ) / s # "we threshold xt0 to the range [-s, s] and then divide by s"
264
-
265
- sample = sample.reshape(batch_size, channels, *remaining_dims)
266
- sample = sample.to(dtype)
267
-
268
- return sample
269
-
270
- # Copied from diffusers.schedulers.scheduling_flow_match_euler_discrete.FlowMatchEulerDiscreteScheduler._sigma_to_t
271
- def _sigma_to_t(self, sigma):
272
- return sigma * self.config.num_train_timesteps
273
-
274
- def _sigma_to_alpha_sigma_t(self, sigma):
275
- return 1 - sigma, sigma
276
-
277
- # Copied from diffusers.schedulers.scheduling_flow_match_euler_discrete.set_timesteps
278
- def time_shift(self, mu: float, sigma: float, t: torch.Tensor):
279
- return math.exp(mu) / (math.exp(mu) + (1 / t - 1)**sigma)
280
-
281
- def convert_model_output(
282
- self,
283
- model_output: torch.Tensor,
284
- *args,
285
- sample: torch.Tensor = None,
286
- **kwargs,
287
- ) -> torch.Tensor:
288
- r"""
289
- Convert the model output to the corresponding type the UniPC algorithm needs.
290
-
291
- Args:
292
- model_output (`torch.Tensor`):
293
- The direct output from the learned diffusion model.
294
- timestep (`int`):
295
- The current discrete timestep in the diffusion chain.
296
- sample (`torch.Tensor`):
297
- A current instance of a sample created by the diffusion process.
298
-
299
- Returns:
300
- `torch.Tensor`:
301
- The converted model output.
302
- """
303
- timestep = args[0] if len(args) > 0 else kwargs.pop("timestep", None)
304
- if sample is None:
305
- if len(args) > 1:
306
- sample = args[1]
307
- else:
308
- raise ValueError(
309
-                     "missing `sample` as a required keyword argument")
310
- if timestep is not None:
311
- deprecate(
312
- "timesteps",
313
- "1.0.0",
314
- "Passing `timesteps` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
315
- )
316
-
317
- sigma = self.sigmas[self.step_index]
318
- alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma)
319
-
320
- if self.predict_x0:
321
- if self.config.prediction_type == "flow_prediction":
322
- sigma_t = self.sigmas[self.step_index]
323
- x0_pred = sample - sigma_t * model_output
324
- else:
325
- raise ValueError(
326
- f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`,"
327
- " `v_prediction` or `flow_prediction` for the UniPCMultistepScheduler."
328
- )
329
-
330
- if self.config.thresholding:
331
- x0_pred = self._threshold_sample(x0_pred)
332
-
333
- return x0_pred
334
- else:
335
- if self.config.prediction_type == "flow_prediction":
336
- sigma_t = self.sigmas[self.step_index]
337
- epsilon = sample - (1 - sigma_t) * model_output
338
- else:
339
- raise ValueError(
340
- f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`,"
341
- " `v_prediction` or `flow_prediction` for the UniPCMultistepScheduler."
342
- )
343
-
344
- if self.config.thresholding:
345
- sigma_t = self.sigmas[self.step_index]
346
- x0_pred = sample - sigma_t * model_output
347
- x0_pred = self._threshold_sample(x0_pred)
348
- epsilon = model_output + x0_pred
349
-
350
- return epsilon
351
-
352
- def multistep_uni_p_bh_update(
353
- self,
354
- model_output: torch.Tensor,
355
- *args,
356
- sample: torch.Tensor = None,
357
- order: int = None, # pyright: ignore
358
- **kwargs,
359
- ) -> torch.Tensor:
360
- """
361
-         One step for the UniP (B(h) version). Alternatively, `self.solver_p` is used if it is specified.
362
-
363
- Args:
364
- model_output (`torch.Tensor`):
365
- The direct output from the learned diffusion model at the current timestep.
366
- prev_timestep (`int`):
367
- The previous discrete timestep in the diffusion chain.
368
- sample (`torch.Tensor`):
369
- A current instance of a sample created by the diffusion process.
370
- order (`int`):
371
- The order of UniP at this timestep (corresponds to the *p* in UniPC-p).
372
-
373
- Returns:
374
- `torch.Tensor`:
375
- The sample tensor at the previous timestep.
376
- """
377
- prev_timestep = args[0] if len(args) > 0 else kwargs.pop(
378
- "prev_timestep", None)
379
- if sample is None:
380
- if len(args) > 1:
381
- sample = args[1]
382
- else:
383
- raise ValueError(
384
-                     "missing `sample` as a required keyword argument")
385
- if order is None:
386
- if len(args) > 2:
387
- order = args[2]
388
- else:
389
- raise ValueError(
390
-                     "missing `order` as a required keyword argument")
391
- if prev_timestep is not None:
392
- deprecate(
393
- "prev_timestep",
394
- "1.0.0",
395
- "Passing `prev_timestep` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
396
- )
397
- model_output_list = self.model_outputs
398
-
399
- s0 = self.timestep_list[-1]
400
- m0 = model_output_list[-1]
401
- x = sample
402
-
403
- if self.solver_p:
404
- x_t = self.solver_p.step(model_output, s0, x).prev_sample
405
- return x_t
406
-
407
- sigma_t, sigma_s0 = self.sigmas[self.step_index + 1], self.sigmas[
408
- self.step_index] # pyright: ignore
409
- alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t)
410
- alpha_s0, sigma_s0 = self._sigma_to_alpha_sigma_t(sigma_s0)
411
-
412
- lambda_t = torch.log(alpha_t) - torch.log(sigma_t)
413
- lambda_s0 = torch.log(alpha_s0) - torch.log(sigma_s0)
414
-
415
- h = lambda_t - lambda_s0
416
- device = sample.device
417
-
418
- rks = []
419
- D1s = []
420
- for i in range(1, order):
421
- si = self.step_index - i # pyright: ignore
422
- mi = model_output_list[-(i + 1)]
423
- alpha_si, sigma_si = self._sigma_to_alpha_sigma_t(self.sigmas[si])
424
- lambda_si = torch.log(alpha_si) - torch.log(sigma_si)
425
- rk = (lambda_si - lambda_s0) / h
426
- rks.append(rk)
427
- D1s.append((mi - m0) / rk) # pyright: ignore
428
-
429
- rks.append(1.0)
430
- rks = torch.tensor(rks, device=device)
431
-
432
- R = []
433
- b = []
434
-
435
- hh = -h if self.predict_x0 else h
436
- h_phi_1 = torch.expm1(hh) # h\phi_1(h) = e^h - 1
437
- h_phi_k = h_phi_1 / hh - 1
438
-
439
- factorial_i = 1
440
-
441
- if self.config.solver_type == "bh1":
442
- B_h = hh
443
- elif self.config.solver_type == "bh2":
444
- B_h = torch.expm1(hh)
445
- else:
446
- raise NotImplementedError()
447
-
448
- for i in range(1, order + 1):
449
- R.append(torch.pow(rks, i - 1))
450
- b.append(h_phi_k * factorial_i / B_h)
451
- factorial_i *= i + 1
452
- h_phi_k = h_phi_k / hh - 1 / factorial_i
453
-
454
- R = torch.stack(R)
455
- b = torch.tensor(b, device=device)
456
-
457
- if len(D1s) > 0:
458
- D1s = torch.stack(D1s, dim=1) # (B, K)
459
- # for order 2, we use a simplified version
460
- if order == 2:
461
- rhos_p = torch.tensor([0.5], dtype=x.dtype, device=device)
462
- else:
463
- rhos_p = torch.linalg.solve(R[:-1, :-1],
464
- b[:-1]).to(device).to(x.dtype)
465
- else:
466
- D1s = None
467
-
468
- if self.predict_x0:
469
- x_t_ = sigma_t / sigma_s0 * x - alpha_t * h_phi_1 * m0
470
- if D1s is not None:
471
- pred_res = torch.einsum("k,bkc...->bc...", rhos_p,
472
- D1s) # pyright: ignore
473
- else:
474
- pred_res = 0
475
- x_t = x_t_ - alpha_t * B_h * pred_res
476
- else:
477
- x_t_ = alpha_t / alpha_s0 * x - sigma_t * h_phi_1 * m0
478
- if D1s is not None:
479
- pred_res = torch.einsum("k,bkc...->bc...", rhos_p,
480
- D1s) # pyright: ignore
481
- else:
482
- pred_res = 0
483
- x_t = x_t_ - sigma_t * B_h * pred_res
484
-
485
- x_t = x_t.to(x.dtype)
486
- return x_t
487
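For readers following the linear algebra above: the predictor (and the corrector below) assemble a small Vandermonde-style system in the step-size ratios r_k and solve it for the combination weights. A standalone numeric sketch (not part of the deleted file), shown for order 2 with `solver_type="bh2"` and `predict_x0=True`; the ratio and step values are arbitrary examples.

    import torch

    h = torch.tensor(0.3)              # lambda_t - lambda_s0 for the current step
    rks = torch.tensor([0.8, 1.0])     # step-size ratios r_k; the last entry is always 1.0
    hh = -h                            # predict_x0=True flips the sign of h
    h_phi_1 = torch.expm1(hh)          # h * phi_1(h) = e^h - 1, as in the comment above
    h_phi_k = h_phi_1 / hh - 1
    B_h = torch.expm1(hh)              # "bh2" solver type

    R, b, factorial_i = [], [], 1
    for i in range(1, len(rks) + 1):
        R.append(torch.pow(rks, i - 1))
        b.append(h_phi_k * factorial_i / B_h)
        factorial_i *= i + 1
        h_phi_k = h_phi_k / hh - 1 / factorial_i

    R, b = torch.stack(R), torch.stack(b)
    rhos_c = torch.linalg.solve(R, b)  # corrector weights; the predictor solves R[:-1, :-1], b[:-1]
    print(rhos_c)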
-
488
- def multistep_uni_c_bh_update(
489
- self,
490
- this_model_output: torch.Tensor,
491
- *args,
492
- last_sample: torch.Tensor = None,
493
- this_sample: torch.Tensor = None,
494
- order: int = None, # pyright: ignore
495
- **kwargs,
496
- ) -> torch.Tensor:
497
- """
498
- One step for the UniC (B(h) version).
499
-
500
- Args:
501
- this_model_output (`torch.Tensor`):
502
- The model outputs at `x_t`.
503
- this_timestep (`int`):
504
- The current timestep `t`.
505
- last_sample (`torch.Tensor`):
506
- The generated sample before the last predictor `x_{t-1}`.
507
- this_sample (`torch.Tensor`):
508
- The generated sample after the last predictor `x_{t}`.
509
- order (`int`):
510
- The `p` of UniC-p at this step. The effective order of accuracy should be `order + 1`.
511
-
512
- Returns:
513
- `torch.Tensor`:
514
- The corrected sample tensor at the current timestep.
515
- """
516
- this_timestep = args[0] if len(args) > 0 else kwargs.pop(
517
- "this_timestep", None)
518
- if last_sample is None:
519
- if len(args) > 1:
520
- last_sample = args[1]
521
- else:
522
- raise ValueError(
523
-                     "missing `last_sample` as a required keyword argument")
524
- if this_sample is None:
525
- if len(args) > 2:
526
- this_sample = args[2]
527
- else:
528
- raise ValueError(
529
-                     "missing `this_sample` as a required keyword argument")
530
- if order is None:
531
- if len(args) > 3:
532
- order = args[3]
533
- else:
534
- raise ValueError(
535
-                     "missing `order` as a required keyword argument")
536
- if this_timestep is not None:
537
- deprecate(
538
- "this_timestep",
539
- "1.0.0",
540
- "Passing `this_timestep` is deprecated and has no effect as model output conversion is now handled via an internal counter `self.step_index`",
541
- )
542
-
543
- model_output_list = self.model_outputs
544
-
545
- m0 = model_output_list[-1]
546
- x = last_sample
547
- x_t = this_sample
548
- model_t = this_model_output
549
-
550
- sigma_t, sigma_s0 = self.sigmas[self.step_index], self.sigmas[
551
- self.step_index - 1] # pyright: ignore
552
- alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t)
553
- alpha_s0, sigma_s0 = self._sigma_to_alpha_sigma_t(sigma_s0)
554
-
555
- lambda_t = torch.log(alpha_t) - torch.log(sigma_t)
556
- lambda_s0 = torch.log(alpha_s0) - torch.log(sigma_s0)
557
-
558
- h = lambda_t - lambda_s0
559
- device = this_sample.device
560
-
561
- rks = []
562
- D1s = []
563
- for i in range(1, order):
564
- si = self.step_index - (i + 1) # pyright: ignore
565
- mi = model_output_list[-(i + 1)]
566
- alpha_si, sigma_si = self._sigma_to_alpha_sigma_t(self.sigmas[si])
567
- lambda_si = torch.log(alpha_si) - torch.log(sigma_si)
568
- rk = (lambda_si - lambda_s0) / h
569
- rks.append(rk)
570
- D1s.append((mi - m0) / rk) # pyright: ignore
571
-
572
- rks.append(1.0)
573
- rks = torch.tensor(rks, device=device)
574
-
575
- R = []
576
- b = []
577
-
578
- hh = -h if self.predict_x0 else h
579
- h_phi_1 = torch.expm1(hh) # h\phi_1(h) = e^h - 1
580
- h_phi_k = h_phi_1 / hh - 1
581
-
582
- factorial_i = 1
583
-
584
- if self.config.solver_type == "bh1":
585
- B_h = hh
586
- elif self.config.solver_type == "bh2":
587
- B_h = torch.expm1(hh)
588
- else:
589
- raise NotImplementedError()
590
-
591
- for i in range(1, order + 1):
592
- R.append(torch.pow(rks, i - 1))
593
- b.append(h_phi_k * factorial_i / B_h)
594
- factorial_i *= i + 1
595
- h_phi_k = h_phi_k / hh - 1 / factorial_i
596
-
597
- R = torch.stack(R)
598
- b = torch.tensor(b, device=device)
599
-
600
- if len(D1s) > 0:
601
- D1s = torch.stack(D1s, dim=1)
602
- else:
603
- D1s = None
604
-
605
- # for order 1, we use a simplified version
606
- if order == 1:
607
- rhos_c = torch.tensor([0.5], dtype=x.dtype, device=device)
608
- else:
609
- rhos_c = torch.linalg.solve(R, b).to(device).to(x.dtype)
610
-
611
- if self.predict_x0:
612
- x_t_ = sigma_t / sigma_s0 * x - alpha_t * h_phi_1 * m0
613
- if D1s is not None:
614
- corr_res = torch.einsum("k,bkc...->bc...", rhos_c[:-1], D1s)
615
- else:
616
- corr_res = 0
617
- D1_t = model_t - m0
618
- x_t = x_t_ - alpha_t * B_h * (corr_res + rhos_c[-1] * D1_t)
619
- else:
620
- x_t_ = alpha_t / alpha_s0 * x - sigma_t * h_phi_1 * m0
621
- if D1s is not None:
622
- corr_res = torch.einsum("k,bkc...->bc...", rhos_c[:-1], D1s)
623
- else:
624
- corr_res = 0
625
- D1_t = model_t - m0
626
- x_t = x_t_ - sigma_t * B_h * (corr_res + rhos_c[-1] * D1_t)
627
- x_t = x_t.to(x.dtype)
628
- return x_t
629
-
630
- def index_for_timestep(self, timestep, schedule_timesteps=None):
631
- if schedule_timesteps is None:
632
- schedule_timesteps = self.timesteps
633
-
634
- indices = (schedule_timesteps == timestep).nonzero()
635
-
636
- # The sigma index that is taken for the **very** first `step`
637
- # is always the second index (or the last index if there is only 1)
638
- # This way we can ensure we don't accidentally skip a sigma in
639
- # case we start in the middle of the denoising schedule (e.g. for image-to-image)
640
- pos = 1 if len(indices) > 1 else 0
641
-
642
- return indices[pos].item()
643
-
644
- # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler._init_step_index
645
- def _init_step_index(self, timestep):
646
- """
647
- Initialize the step_index counter for the scheduler.
648
- """
649
-
650
- if self.begin_index is None:
651
- if isinstance(timestep, torch.Tensor):
652
- timestep = timestep.to(self.timesteps.device)
653
- self._step_index = self.index_for_timestep(timestep)
654
- else:
655
- self._step_index = self._begin_index
656
-
657
- def step(self,
658
- model_output: torch.Tensor,
659
- timestep: Union[int, torch.Tensor],
660
- sample: torch.Tensor,
661
- return_dict: bool = True,
662
- generator=None) -> Union[SchedulerOutput, Tuple]:
663
- """
664
- Predict the sample from the previous timestep by reversing the SDE. This function propagates the sample with
665
- the multistep UniPC.
666
-
667
- Args:
668
- model_output (`torch.Tensor`):
669
- The direct output from learned diffusion model.
670
- timestep (`int`):
671
- The current discrete timestep in the diffusion chain.
672
- sample (`torch.Tensor`):
673
- A current instance of a sample created by the diffusion process.
674
- return_dict (`bool`):
675
- Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`.
676
-
677
- Returns:
678
- [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`:
679
- If return_dict is `True`, [`~schedulers.scheduling_utils.SchedulerOutput`] is returned, otherwise a
680
- tuple is returned where the first element is the sample tensor.
681
-
682
- """
683
- if self.num_inference_steps is None:
684
- raise ValueError(
685
- "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
686
- )
687
-
688
- if self.step_index is None:
689
- self._init_step_index(timestep)
690
-
691
- use_corrector = (
692
- self.step_index > 0 and
693
- self.step_index - 1 not in self.disable_corrector and
694
- self.last_sample is not None # pyright: ignore
695
- )
696
-
697
- model_output_convert = self.convert_model_output(
698
- model_output, sample=sample)
699
- if use_corrector:
700
- sample = self.multistep_uni_c_bh_update(
701
- this_model_output=model_output_convert,
702
- last_sample=self.last_sample,
703
- this_sample=sample,
704
- order=self.this_order,
705
- )
706
-
707
- for i in range(self.config.solver_order - 1):
708
- self.model_outputs[i] = self.model_outputs[i + 1]
709
- self.timestep_list[i] = self.timestep_list[i + 1]
710
-
711
- self.model_outputs[-1] = model_output_convert
712
- self.timestep_list[-1] = timestep # pyright: ignore
713
-
714
- if self.config.lower_order_final:
715
- this_order = min(self.config.solver_order,
716
- len(self.timesteps) -
717
- self.step_index) # pyright: ignore
718
- else:
719
- this_order = self.config.solver_order
720
-
721
- self.this_order = min(this_order,
722
- self.lower_order_nums + 1) # warmup for multistep
723
- assert self.this_order > 0
724
-
725
- self.last_sample = sample
726
- prev_sample = self.multistep_uni_p_bh_update(
727
- model_output=model_output, # pass the original non-converted model output, in case solver-p is used
728
- sample=sample,
729
- order=self.this_order,
730
- )
731
-
732
- if self.lower_order_nums < self.config.solver_order:
733
- self.lower_order_nums += 1
734
-
735
- # upon completion increase step index by one
736
- self._step_index += 1 # pyright: ignore
737
-
738
- if not return_dict:
739
- return (prev_sample,)
740
-
741
- return SchedulerOutput(prev_sample=prev_sample)
742
-
743
- def scale_model_input(self, sample: torch.Tensor, *args,
744
- **kwargs) -> torch.Tensor:
745
- """
746
- Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
747
- current timestep.
748
-
749
- Args:
750
- sample (`torch.Tensor`):
751
- The input sample.
752
-
753
- Returns:
754
- `torch.Tensor`:
755
- A scaled input sample.
756
- """
757
- return sample
758
-
759
- # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.add_noise
760
- def add_noise(
761
- self,
762
- original_samples: torch.Tensor,
763
- noise: torch.Tensor,
764
- timesteps: torch.IntTensor,
765
- ) -> torch.Tensor:
766
- # Make sure sigmas and timesteps have the same device and dtype as original_samples
767
- sigmas = self.sigmas.to(
768
- device=original_samples.device, dtype=original_samples.dtype)
769
- if original_samples.device.type == "mps" and torch.is_floating_point(
770
- timesteps):
771
- # mps does not support float64
772
- schedule_timesteps = self.timesteps.to(
773
- original_samples.device, dtype=torch.float32)
774
- timesteps = timesteps.to(
775
- original_samples.device, dtype=torch.float32)
776
- else:
777
- schedule_timesteps = self.timesteps.to(original_samples.device)
778
- timesteps = timesteps.to(original_samples.device)
779
-
780
- # begin_index is None when the scheduler is used for training or pipeline does not implement set_begin_index
781
- if self.begin_index is None:
782
- step_indices = [
783
- self.index_for_timestep(t, schedule_timesteps)
784
- for t in timesteps
785
- ]
786
- elif self.step_index is not None:
787
- # add_noise is called after first denoising step (for inpainting)
788
- step_indices = [self.step_index] * timesteps.shape[0]
789
- else:
790
- # add_noise is called before the first denoising step to create the initial latent (img2img)
791
- step_indices = [self.begin_index] * timesteps.shape[0]
792
-
793
- sigma = sigmas[step_indices].flatten()
794
- while len(sigma.shape) < len(original_samples.shape):
795
- sigma = sigma.unsqueeze(-1)
796
-
797
- alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma)
798
- noisy_samples = alpha_t * original_samples + sigma_t * noise
799
- return noisy_samples
800
-
801
- def __len__(self):
802
- return self.config.num_train_timesteps
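For orientation, the scheduler deleted above follows the usual diffusers-style driving loop: set_timesteps() builds the timestep/sigma schedule, and step() is then called once per timestep, running the UniC corrector on the previous sample (when enabled) before the UniP predictor produces the next latent. A minimal sketch of that loop, assuming the class defined in fm_solvers_unipc.py is exported as FlowUniPCMultistepScheduler (adjust the name if it differs) and using a dummy denoiser in place of the real model:

import torch
from wan.utils.fm_solvers_unipc import FlowUniPCMultistepScheduler  # assumed class name

scheduler = FlowUniPCMultistepScheduler()      # default solver order and bh2 solver type
scheduler.set_timesteps(50)                    # must run before step(), see the ValueError above

latents = torch.randn(1, 16, 21, 60, 104)      # example latent shape, not prescriptive
denoiser = lambda x, t: torch.zeros_like(x)    # stand-in for the actual diffusion model

for t in scheduler.timesteps:
    model_output = denoiser(latents, t)
    # step() applies the corrector and predictor updates and advances the internal
    # step_index; prev_sample is the latent passed into the next iteration.
    latents = scheduler.step(model_output, t, latents).prev_sample
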
wan/utils/prompt_extend.py DELETED
@@ -1,542 +0,0 @@
1
- # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
2
- import json
3
- import logging
4
- import math
5
- import os
6
- import random
7
- import sys
8
- import tempfile
9
- from dataclasses import dataclass
10
- from http import HTTPStatus
11
- from typing import Optional, Union
12
-
13
- import dashscope
14
- import torch
15
- from PIL import Image
16
-
17
- try:
18
- from flash_attn import flash_attn_varlen_func
19
- FLASH_VER = 2
20
- except ModuleNotFoundError:
21
- flash_attn_varlen_func = None # keep the module importable on CPU-only machines
22
- FLASH_VER = None
23
-
24
- from .system_prompt import *
25
-
26
- DEFAULT_SYS_PROMPTS = {
27
- "t2v-A14B": {
28
- "zh": T2V_A14B_ZH_SYS_PROMPT,
29
- "en": T2V_A14B_EN_SYS_PROMPT,
30
- },
31
- "i2v-A14B": {
32
- "zh": I2V_A14B_ZH_SYS_PROMPT,
33
- "en": I2V_A14B_EN_SYS_PROMPT,
34
- "empty": {
35
- "zh": I2V_A14B_EMPTY_ZH_SYS_PROMPT,
36
- "en": I2V_A14B_EMPTY_EN_SYS_PROMPT,
37
- }
38
- },
39
- "ti2v-5B": {
40
- "t2v": {
41
- "zh": T2V_A14B_ZH_SYS_PROMPT,
42
- "en": T2V_A14B_EN_SYS_PROMPT,
43
- },
44
- "i2v": {
45
- "zh": I2V_A14B_ZH_SYS_PROMPT,
46
- "en": I2V_A14B_EN_SYS_PROMPT,
47
- }
48
- },
49
- }
50
-
51
-
52
- @dataclass
53
- class PromptOutput(object):
54
- status: bool
55
- prompt: str
56
- seed: int
57
- system_prompt: str
58
- message: str
59
-
60
- def add_custom_field(self, key: str, value) -> None:
61
- self.__setattr__(key, value)
62
-
63
-
64
- class PromptExpander:
65
-
66
- def __init__(self, model_name, task, is_vl=False, device=0, **kwargs):
67
- self.model_name = model_name
68
- self.task = task
69
- self.is_vl = is_vl
70
- self.device = device
71
-
72
- def extend_with_img(self,
73
- prompt,
74
- system_prompt,
75
- image=None,
76
- seed=-1,
77
- *args,
78
- **kwargs):
79
- pass
80
-
81
- def extend(self, prompt, system_prompt, seed=-1, *args, **kwargs):
82
- pass
83
-
84
- def decide_system_prompt(self, tar_lang="zh", prompt=None):
85
- assert self.task is not None
86
- if "ti2v" in self.task:
87
- if self.is_vl:
88
- return DEFAULT_SYS_PROMPTS[self.task]["i2v"][tar_lang]
89
- else:
90
- return DEFAULT_SYS_PROMPTS[self.task]["t2v"][tar_lang]
91
- if "i2v" in self.task and len(prompt) == 0:
92
- return DEFAULT_SYS_PROMPTS[self.task]["empty"][tar_lang]
93
- return DEFAULT_SYS_PROMPTS[self.task][tar_lang]
94
-
95
- def __call__(self,
96
- prompt,
97
- system_prompt=None,
98
- tar_lang="zh",
99
- image=None,
100
- seed=-1,
101
- *args,
102
- **kwargs):
103
- if system_prompt is None:
104
- system_prompt = self.decide_system_prompt(
105
- tar_lang=tar_lang, prompt=prompt)
106
- if seed < 0:
107
- seed = random.randint(0, sys.maxsize)
108
- if image is not None and self.is_vl:
109
- return self.extend_with_img(
110
- prompt, system_prompt, image=image, seed=seed, *args, **kwargs)
111
- elif not self.is_vl:
112
- return self.extend(prompt, system_prompt, seed, *args, **kwargs)
113
- else:
114
- raise NotImplementedError
115
-
116
-
117
- class DashScopePromptExpander(PromptExpander):
118
-
119
- def __init__(self,
120
- api_key=None,
121
- model_name=None,
122
- task=None,
123
- max_image_size=512 * 512,
124
- retry_times=4,
125
- is_vl=False,
126
- **kwargs):
127
- '''
128
- Args:
129
- api_key: The API key for DashScope authentication and access to related services.
130
- model_name: Model name, 'qwen-plus' for extending prompts, 'qwen-vl-max' for extending prompt-images.
131
- task: Task name. This is required to determine the default system prompt.
132
- max_image_size: The maximum image area in pixels (width * height); larger images are resized to fit within this area before being sent to the API.
133
- retry_times: Number of retry attempts in case of request failure.
134
- is_vl: A flag indicating whether the task involves visual-language processing.
135
- **kwargs: Additional keyword arguments that can be passed to the function or method.
136
- '''
137
- if model_name is None:
138
- model_name = 'qwen-plus' if not is_vl else 'qwen-vl-max'
139
- super().__init__(model_name, task, is_vl, **kwargs)
140
- if api_key is not None:
141
- dashscope.api_key = api_key
142
- elif 'DASH_API_KEY' in os.environ and os.environ[
143
- 'DASH_API_KEY'] is not None:
144
- dashscope.api_key = os.environ['DASH_API_KEY']
145
- else:
146
- raise ValueError("DASH_API_KEY is not set")
147
- if 'DASH_API_URL' in os.environ and os.environ[
148
- 'DASH_API_URL'] is not None:
149
- dashscope.base_http_api_url = os.environ['DASH_API_URL']
150
- else:
151
- dashscope.base_http_api_url = 'https://dashscope.aliyuncs.com/api/v1'
152
- self.api_key = api_key
153
-
154
- self.max_image_size = max_image_size
155
- self.model = model_name
156
- self.retry_times = retry_times
157
-
158
- def extend(self, prompt, system_prompt, seed=-1, *args, **kwargs):
159
- messages = [{
160
- 'role': 'system',
161
- 'content': system_prompt
162
- }, {
163
- 'role': 'user',
164
- 'content': prompt
165
- }]
166
-
167
- exception = None
168
- for _ in range(self.retry_times):
169
- try:
170
- response = dashscope.Generation.call(
171
- self.model,
172
- messages=messages,
173
- seed=seed,
174
- result_format='message', # set the result to be "message" format.
175
- )
176
- assert response.status_code == HTTPStatus.OK, response
177
- expanded_prompt = response['output']['choices'][0]['message'][
178
- 'content']
179
- return PromptOutput(
180
- status=True,
181
- prompt=expanded_prompt,
182
- seed=seed,
183
- system_prompt=system_prompt,
184
- message=json.dumps(response, ensure_ascii=False))
185
- except Exception as e:
186
- exception = e
187
- return PromptOutput(
188
- status=False,
189
- prompt=prompt,
190
- seed=seed,
191
- system_prompt=system_prompt,
192
- message=str(exception))
193
-
194
- def extend_with_img(self,
195
- prompt,
196
- system_prompt,
197
- image: Union[Image.Image, str] = None,
198
- seed=-1,
199
- *args,
200
- **kwargs):
201
- if isinstance(image, str):
202
- image = Image.open(image).convert('RGB')
203
- w = image.width
204
- h = image.height
205
- area = min(w * h, self.max_image_size)
206
- aspect_ratio = h / w
207
- resized_h = round(math.sqrt(area * aspect_ratio))
208
- resized_w = round(math.sqrt(area / aspect_ratio))
209
- image = image.resize((resized_w, resized_h))
210
- with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as f:
211
- image.save(f.name)
212
- fname = f.name
213
- image_path = f"file://{f.name}"
214
- prompt = f"{prompt}"
215
- messages = [
216
- {
217
- 'role': 'system',
218
- 'content': [{
219
- "text": system_prompt
220
- }]
221
- },
222
- {
223
- 'role': 'user',
224
- 'content': [{
225
- "text": prompt
226
- }, {
227
- "image": image_path
228
- }]
229
- },
230
- ]
231
- response = None
232
- result_prompt = prompt
233
- exception = None
234
- status = False
235
- for _ in range(self.retry_times):
236
- try:
237
- response = dashscope.MultiModalConversation.call(
238
- self.model,
239
- messages=messages,
240
- seed=seed,
241
- result_format='message', # set the result to be "message" format.
242
- )
243
- assert response.status_code == HTTPStatus.OK, response
244
- result_prompt = response['output']['choices'][0]['message'][
245
- 'content'][0]['text'].replace('\n', '\\n')
246
- status = True
247
- break
248
- except Exception as e:
249
- exception = e
250
- result_prompt = result_prompt.replace('\n', '\\n')
251
- os.remove(fname)
252
-
253
- return PromptOutput(
254
- status=status,
255
- prompt=result_prompt,
256
- seed=seed,
257
- system_prompt=system_prompt,
258
- message=str(exception) if not status else json.dumps(
259
- response, ensure_ascii=False))
260
-
261
-
262
- class QwenPromptExpander(PromptExpander):
263
- model_dict = {
264
- "QwenVL2.5_3B": "Qwen/Qwen2.5-VL-3B-Instruct",
265
- "QwenVL2.5_7B": "Qwen/Qwen2.5-VL-7B-Instruct",
266
- "Qwen2.5_3B": "Qwen/Qwen2.5-3B-Instruct",
267
- "Qwen2.5_7B": "Qwen/Qwen2.5-7B-Instruct",
268
- "Qwen2.5_14B": "Qwen/Qwen2.5-14B-Instruct",
269
- }
270
-
271
- def __init__(self,
272
- model_name=None,
273
- task=None,
274
- device=0,
275
- is_vl=False,
276
- **kwargs):
277
- '''
278
- Args:
279
- model_name: Use predefined model names such as 'QwenVL2.5_7B' and 'Qwen2.5_14B',
280
- which are specific versions of the Qwen model. Alternatively, you can use the
281
- local path to a downloaded model or the model name from Hugging Face.
282
- Detailed Breakdown:
283
- Predefined Model Names:
284
- * 'QwenVL2.5_7B' and 'Qwen2.5_14B' are specific versions of the Qwen model.
285
- Local Path:
286
- * You can provide the path to a model that you have downloaded locally.
287
- Hugging Face Model Name:
288
- * You can also specify the model name from Hugging Face's model hub.
289
- task: Task name. This is required to determine the default system prompt.
290
- is_vl: A flag indicating whether the task involves visual-language processing.
291
- **kwargs: Additional keyword arguments that can be passed to the function or method.
292
- '''
293
- if model_name is None:
294
- model_name = 'Qwen2.5_14B' if not is_vl else 'QwenVL2.5_7B'
295
- super().__init__(model_name, task, is_vl, device, **kwargs)
296
- if (not os.path.exists(self.model_name)) and (self.model_name
297
- in self.model_dict):
298
- self.model_name = self.model_dict[self.model_name]
299
-
300
- if self.is_vl:
301
- # default: Load the model on the available device(s)
302
- from transformers import (
303
- AutoProcessor,
304
- AutoTokenizer,
305
- Qwen2_5_VLForConditionalGeneration,
306
- )
307
- try:
308
- from .qwen_vl_utils import process_vision_info
309
- except ImportError:
310
- from qwen_vl_utils import process_vision_info
311
- self.process_vision_info = process_vision_info
312
- min_pixels = 256 * 28 * 28
313
- max_pixels = 1280 * 28 * 28
314
- self.processor = AutoProcessor.from_pretrained(
315
- self.model_name,
316
- min_pixels=min_pixels,
317
- max_pixels=max_pixels,
318
- use_fast=True)
319
- self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
320
- self.model_name,
321
- torch_dtype=torch.bfloat16 if FLASH_VER == 2 else
322
- torch.float16 if "AWQ" in self.model_name else "auto",
323
- attn_implementation="flash_attention_2"
324
- if FLASH_VER == 2 else None,
325
- device_map="cpu")
326
- else:
327
- from transformers import AutoModelForCausalLM, AutoTokenizer
328
- self.model = AutoModelForCausalLM.from_pretrained(
329
- self.model_name,
330
- torch_dtype=torch.float16
331
- if "AWQ" in self.model_name else "auto",
332
- attn_implementation="flash_attention_2"
333
- if FLASH_VER == 2 else None,
334
- device_map="cpu")
335
- self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
336
-
337
- def extend(self, prompt, system_prompt, seed=-1, *args, **kwargs):
338
- self.model = self.model.to(self.device)
339
- messages = [{
340
- "role": "system",
341
- "content": system_prompt
342
- }, {
343
- "role": "user",
344
- "content": prompt
345
- }]
346
- text = self.tokenizer.apply_chat_template(
347
- messages, tokenize=False, add_generation_prompt=True)
348
- model_inputs = self.tokenizer([text],
349
- return_tensors="pt").to(self.model.device)
350
-
351
- generated_ids = self.model.generate(**model_inputs, max_new_tokens=512)
352
- generated_ids = [
353
- output_ids[len(input_ids):] for input_ids, output_ids in zip(
354
- model_inputs.input_ids, generated_ids)
355
- ]
356
-
357
- expanded_prompt = self.tokenizer.batch_decode(
358
- generated_ids, skip_special_tokens=True)[0]
359
- self.model = self.model.to("cpu")
360
- return PromptOutput(
361
- status=True,
362
- prompt=expanded_prompt,
363
- seed=seed,
364
- system_prompt=system_prompt,
365
- message=json.dumps({"content": expanded_prompt},
366
- ensure_ascii=False))
367
-
368
- def extend_with_img(self,
369
- prompt,
370
- system_prompt,
371
- image: Union[Image.Image, str] = None,
372
- seed=-1,
373
- *args,
374
- **kwargs):
375
- self.model = self.model.to(self.device)
376
- messages = [{
377
- 'role': 'system',
378
- 'content': [{
379
- "type": "text",
380
- "text": system_prompt
381
- }]
382
- }, {
383
- "role":
384
- "user",
385
- "content": [
386
- {
387
- "type": "image",
388
- "image": image,
389
- },
390
- {
391
- "type": "text",
392
- "text": prompt
393
- },
394
- ],
395
- }]
396
-
397
- # Preparation for inference
398
- text = self.processor.apply_chat_template(
399
- messages, tokenize=False, add_generation_prompt=True)
400
- image_inputs, video_inputs = self.process_vision_info(messages)
401
- inputs = self.processor(
402
- text=[text],
403
- images=image_inputs,
404
- videos=video_inputs,
405
- padding=True,
406
- return_tensors="pt",
407
- )
408
- inputs = inputs.to(self.device)
409
-
410
- # Inference: Generation of the output
411
- generated_ids = self.model.generate(**inputs, max_new_tokens=512)
412
- generated_ids_trimmed = [
413
- out_ids[len(in_ids):]
414
- for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
415
- ]
416
- expanded_prompt = self.processor.batch_decode(
417
- generated_ids_trimmed,
418
- skip_special_tokens=True,
419
- clean_up_tokenization_spaces=False)[0]
420
- self.model = self.model.to("cpu")
421
- return PromptOutput(
422
- status=True,
423
- prompt=expanded_prompt,
424
- seed=seed,
425
- system_prompt=system_prompt,
426
- message=json.dumps({"content": expanded_prompt},
427
- ensure_ascii=False))
428
-
429
-
430
- if __name__ == "__main__":
431
- logging.basicConfig(
432
- level=logging.INFO,
433
- format="[%(asctime)s] %(levelname)s: %(message)s",
434
- handlers=[logging.StreamHandler(stream=sys.stdout)])
435
-
436
- seed = 100
437
- prompt = "夏日海滩度假风格,一只戴着墨镜的白色猫咪坐在冲浪板上。猫咪毛发蓬松,表情悠闲,直视镜头。背景是模糊的海滩景色,海水清澈,远处有绿色的山丘和蓝天白云。猫咪的姿态自然放松,仿佛在享受海风和阳光。近景特写,强调猫咪的细节和海滩的清新氛围。"
438
- en_prompt = "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline's intricate details and the refreshing atmosphere of the seaside."
439
- image = "./examples/i2v_input.JPG"
440
-
441
- def test(method,
442
- prompt,
443
- model_name,
444
- task,
445
- image=None,
446
- en_prompt=None,
447
- seed=None):
448
- prompt_expander = method(
449
- model_name=model_name, task=task, is_vl=image is not None)
450
- result = prompt_expander(prompt, image=image, tar_lang="zh")
451
- logging.info(f"zh prompt -> zh: {result.prompt}")
452
- result = prompt_expander(prompt, image=image, tar_lang="en")
453
- logging.info(f"zh prompt -> en: {result.prompt}")
454
- if en_prompt is not None:
455
- result = prompt_expander(en_prompt, image=image, tar_lang="zh")
456
- logging.info(f"en prompt -> zh: {result.prompt}")
457
- result = prompt_expander(en_prompt, image=image, tar_lang="en")
458
- logging.info(f"en prompt -> en: {result.prompt}")
459
-
460
- ds_model_name = None
461
- ds_vl_model_name = None
462
- qwen_model_name = None
463
- qwen_vl_model_name = None
464
-
465
- for task in ["t2v-A14B", "i2v-A14B", "ti2v-5B"]:
466
- # test prompt extend
467
- if "t2v" in task or "ti2v" in task:
468
- # test dashscope api
469
- logging.info(f"-" * 40)
470
- logging.info(f"Testing {task} dashscope prompt extend")
471
- test(
472
- DashScopePromptExpander,
473
- prompt,
474
- ds_model_name,
475
- task,
476
- image=None,
477
- en_prompt=en_prompt,
478
- seed=seed)
479
-
480
- # test qwen api
481
- logging.info(f"-" * 40)
482
- logging.info(f"Testing {task} qwen prompt extend")
483
- test(
484
- QwenPromptExpander,
485
- prompt,
486
- qwen_model_name,
487
- task,
488
- image=None,
489
- en_prompt=en_prompt,
490
- seed=seed)
491
-
492
- # test prompt-image extend
493
- if "i2v" in task:
494
- # test dashscope api
495
- logging.info(f"-" * 40)
496
- logging.info(f"Testing {task} dashscope vl prompt extend")
497
- test(
498
- DashScopePromptExpander,
499
- prompt,
500
- ds_vl_model_name,
501
- task,
502
- image=image,
503
- en_prompt=en_prompt,
504
- seed=seed)
505
-
506
- # test qwen api
507
- logging.info(f"-" * 40)
508
- logging.info(f"Testing {task} qwen vl prompt extend")
509
- test(
510
- QwenPromptExpander,
511
- prompt,
512
- qwen_vl_model_name,
513
- task,
514
- image=image,
515
- en_prompt=en_prompt,
516
- seed=seed)
517
-
518
- # test empty prompt extend
519
- if "i2v-A14B" in task:
520
- # test dashscope api
521
- logging.info(f"-" * 40)
522
- logging.info(f"Testing {task} dashscope vl empty prompt extend")
523
- test(
524
- DashScopePromptExpander,
525
- "",
526
- ds_vl_model_name,
527
- task,
528
- image=image,
529
- en_prompt=None,
530
- seed=seed)
531
-
532
- # test qwen api
533
- logging.info(f"-" * 40)
534
- logging.info(f"Testing {task} qwen vl empty prompt extend")
535
- test(
536
- QwenPromptExpander,
537
- "",
538
- qwen_vl_model_name,
539
- task,
540
- image=image,
541
- en_prompt=None,
542
- seed=seed)
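
All expanders above share the PromptExpander.__call__ interface, so callers only choose a backend and a task. A minimal usage sketch, assuming the module is importable as wan.utils.prompt_extend and that either a local Qwen checkpoint or a valid DASH_API_KEY is available:

from wan.utils.prompt_extend import DashScopePromptExpander, QwenPromptExpander

# Local Qwen backend: resolves to Qwen/Qwen2.5-14B-Instruct by default;
# pass model_name="/path/to/checkpoint" to use a local copy instead.
expander = QwenPromptExpander(task="t2v-A14B", is_vl=False, device=0)
result = expander("a white cat surfing at the beach", tar_lang="en", seed=42)
print(result.status, result.prompt)

# DashScope backend with an input image (requires the DASH_API_KEY environment variable).
vl_expander = DashScopePromptExpander(task="i2v-A14B", is_vl=True)
result = vl_expander("", image="./examples/i2v_input.JPG", tar_lang="zh")
print(result.prompt if result.status else result.message)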