aigc-apps · bubbliiiing · Dec 29, 2025 · Dec 30, 2025 · Dec 31, 2025 · Dec 31, 2025
diff --git a/examples/z_image/predict_t2i_omni.py b/examples/z_image/predict_t2i_omni.py
@@ -0,0 +1,233 @@
+import os
+import sys
+
+import torch
+from diffusers import FlowMatchEulerDiscreteScheduler
+
+current_file_path = os.path.abspath(__file__)
+project_roots = [os.path.dirname(current_file_path), os.path.dirname(os.path.dirname(current_file_path)), os.path.dirname(os.path.dirname(os.path.dirname(current_file_path)))]
+for project_root in project_roots:
+    sys.path.insert(0, project_root) if project_root not in sys.path else None
+
+from videox_fun.dist import set_multi_gpus_devices, shard_model
+from videox_fun.models import (AutoencoderKL, AutoProcessor, AutoTokenizer,
+                               Qwen3ForCausalLM, Siglip2VisionModel,
+                               ZImageOmniTransformer2DModel)
+from videox_fun.models.cache_utils import get_teacache_coefficients
+from videox_fun.pipeline import ZImageOmniPipeline
+from videox_fun.utils.fm_solvers import FlowDPMSolverMultistepScheduler
+from videox_fun.utils.fm_solvers_unipc import FlowUniPCMultistepScheduler
+from videox_fun.utils.fp8_optimization import (convert_model_weight_to_float8,
+                                               convert_weight_dtype_wrapper)
+from videox_fun.utils.lora_utils import merge_lora, unmerge_lora
+from videox_fun.utils.utils import (filter_kwargs, get_image, get_image_latent,
+                                    get_image_to_video_latent,
+                                    get_video_to_video_latent,
+                                    save_videos_grid)
+
+# GPU memory mode. Choose one of [model_full_load, model_full_load_and_qfloat8, model_cpu_offload, model_cpu_offload_and_qfloat8, sequential_cpu_offload].
+# Any other value is handled like model_full_load (the pipeline is moved to the device as a whole).
+#
+# model_full_load means that the entire model will be moved to the GPU.
+#
+# model_full_load_and_qfloat8 means that the entire model will be moved to the GPU,
+# and the transformer model is quantized to float8, which can save more GPU memory.
+#
+# model_cpu_offload means that the entire model will be moved to the CPU after use, which can save some GPU memory.
+#
+# model_cpu_offload_and_qfloat8 means that the entire model will be moved to the CPU after use,
+# and the transformer model is quantized to float8, which can save more GPU memory.
+#
+# sequential_cpu_offload means that each layer of the model will be moved to the CPU after use,
+# resulting in slower speeds but saving a large amount of GPU memory.
+GPU_memory_mode     = "model_cpu_offload"
+# Multi-GPU config
+# Please ensure that the product of ulysses_degree and ring_degree equals the number of GPUs used.
+# For example, if you are using 8 GPUs, you can set ulysses_degree = 2 and ring_degree = 4.
+# If you are using 1 GPU, you can set ulysses_degree = 1 and ring_degree = 1.
+ulysses_degree      = 1
+ring_degree         = 1
+# Use FSDP to save more GPU memory when running on multiple GPUs.
+# These flags are only read when ulysses_degree > 1 or ring_degree > 1.
+fsdp_dit            = False
+fsdp_text_encoder   = False
+# Compiling gives a speedup at a fixed resolution and needs a little extra GPU memory.
+# compile_dit is not compatible with fsdp_dit or sequential_cpu_offload.
+compile_dit         = False
+
+# Model path: root directory whose sub-folders (transformer, vae, tokenizer,
+# text_encoder, clip_encoder, scheduler) are loaded below.
+model_name          = "models/Diffusion_Transformer/Z-Image-Base-Omni"
+
+# Choose the sampler from "Flow", "Flow_Unipc", "Flow_DPM++"
+sampler_name        = "Flow"
+
+# Optional extra weights to load on top of model_name; leave as None to skip.
+# transformer_path / vae_path: a file ending in "safetensors" is read with safetensors,
+# anything else with torch.load. lora_path is passed to merge_lora.
+transformer_path    = None
+vae_path            = None
+lora_path           = None
+
+# Other params
+# Output size as [height, width].
+sample_size         = [1568, 1184]
+
+# Use torch.float16 if the GPU does not support torch.bfloat16.
+# Some graphics cards, such as V100 and 2080 Ti, do not support torch.bfloat16.
+weight_dtype        = torch.bfloat16
+# Optional input image(s): a single item or a list, each loaded with get_image
+# and converted to RGB. Leave as None to run without an input image.
+image               = None
+
+# Please use as detailed a prompt as possible to describe the object that needs to be generated.
+prompt              = "这是一张充满东方古典韵味的人像摄影作品，画面中的年轻女子身着一袭精致的香槟色旗袍蹲在地上，面料上点缀着精美的白色刺绣花纹，在阳光照射下泛着柔和的光泽。"
+negative_prompt     = ""
+guidance_scale      = 5.00
+# Seed for the torch.Generator passed to the pipeline.
+seed                = 42
+num_inference_steps = 40
+# Weight passed to merge_lora / unmerge_lora; only used when lora_path is set.
+lora_weight         = 0.55
+# Directory the resulting PNG is written to (created if missing).
+save_path           = "samples/z-image-omni"
+
+device = set_multi_gpus_devices(ulysses_degree, ring_degree)
+
+transformer = ZImageOmniTransformer2DModel.from_pretrained(
+    model_name, 
+    subfolder="transformer",
+    low_cpu_mem_usage=True,
+    torch_dtype=weight_dtype,
+).to(weight_dtype)
+
+if transformer_path is not None:
+    print(f"From checkpoint: {transformer_path}")
+    if transformer_path.endswith("safetensors"):
+        from safetensors.torch import load_file, safe_open
+        state_dict = load_file(transformer_path)
+    else:
+        state_dict = torch.load(transformer_path, map_location="cpu")
+    state_dict = state_dict["state_dict"] if "state_dict" in state_dict else state_dict
+
+    m, u = transformer.load_state_dict(state_dict, strict=False)
+    print(f"missing keys: {len(m)}, unexpected keys: {len(u)}")
+
+# Get Vae
+vae = AutoencoderKL.from_pretrained(
+    model_name, 
+    subfolder="vae"
+).to(weight_dtype)
+
+if vae_path is not None:
+    print(f"From checkpoint: {vae_path}")
+    if vae_path.endswith("safetensors"):
+        from safetensors.torch import load_file, safe_open
+        state_dict = load_file(vae_path)
+    else:
+        state_dict = torch.load(vae_path, map_location="cpu")
+    state_dict = state_dict["state_dict"] if "state_dict" in state_dict else state_dict
+
+    m, u = vae.load_state_dict(state_dict, strict=False)
+    print(f"missing keys: {len(m)}, unexpected keys: {len(u)}")
+
+# Get tokenizer and text_encoder
+tokenizer = AutoTokenizer.from_pretrained(
+    model_name, subfolder="tokenizer"
+)
+text_encoder = Qwen3ForCausalLM.from_pretrained(
+    model_name, subfolder="text_encoder", torch_dtype=weight_dtype,
+    low_cpu_mem_usage=True,
+)
+
+siglip = Siglip2VisionModel.from_pretrained(
+    model_name, subfolder="clip_encoder", 
+    torch_dtype=weight_dtype,
+)
+siglip_processor = AutoProcessor.from_pretrained(
+    model_name, subfolder="clip_encoder", 
+)
+
+# Get Scheduler
+Chosen_Scheduler = scheduler_dict = {
+    "Flow": FlowMatchEulerDiscreteScheduler,
+    "Flow_Unipc": FlowUniPCMultistepScheduler,
+    "Flow_DPM++": FlowDPMSolverMultistepScheduler,
+}[sampler_name]
+scheduler = Chosen_Scheduler.from_pretrained(
+    model_name, 
+    subfolder="scheduler"
+)
+
+pipeline = ZImageOmniPipeline(
+    vae=vae,
+    tokenizer=tokenizer,
+    text_encoder=text_encoder,
+    transformer=transformer,
+    siglip=siglip,
+    siglip_processor=siglip_processor,
+    scheduler=scheduler,
+)
+
+if ulysses_degree > 1 or ring_degree > 1:
+    from functools import partial
+    transformer.enable_multi_gpus_inference()
+    if fsdp_dit:
+        shard_fn = partial(shard_model, device_id=device, param_dtype=weight_dtype, module_to_wrapper=list(transformer.layers))
+        pipeline.transformer = shard_fn(pipeline.transformer)
+        print("Add FSDP DIT")
+    if fsdp_text_encoder:
+        shard_fn = partial(shard_model, device_id=device, param_dtype=weight_dtype, module_to_wrapper=list(text_encoder.model.layers))
+        text_encoder = shard_fn(text_encoder)
+        print("Add FSDP TEXT ENCODER")
+
+if compile_dit:
+    for i in range(len(pipeline.transformer.transformer_blocks)):
+        pipeline.transformer.transformer_blocks[i] = torch.compile(pipeline.transformer.transformer_blocks[i])
+    print("Add Compile")
+
+if GPU_memory_mode == "sequential_cpu_offload":
+    pipeline.enable_sequential_cpu_offload(device=device)
+elif GPU_memory_mode == "model_cpu_offload_and_qfloat8":
+    convert_model_weight_to_float8(transformer, exclude_module_name=["img_in", "txt_in", "timestep"], device=device)
+    convert_weight_dtype_wrapper(transformer, weight_dtype)
+    pipeline.enable_model_cpu_offload(device=device)
+elif GPU_memory_mode == "model_cpu_offload":
+    pipeline.enable_model_cpu_offload(device=device)
+elif GPU_memory_mode == "model_full_load_and_qfloat8":
+    convert_model_weight_to_float8(transformer, exclude_module_name=["img_in", "txt_in", "timestep"], device=device)
+    convert_weight_dtype_wrapper(transformer, weight_dtype)
+    pipeline.to(device=device)
+else:
+    pipeline.to(device=device)
+
+generator = torch.Generator(device=device).manual_seed(seed)
+
+if lora_path is not None:
+    pipeline = merge_lora(pipeline, lora_path, lora_weight, device=device, dtype=weight_dtype)
+
+with torch.no_grad():
+    if image is not None:
+        if not isinstance(image, list):
+            image = get_image(image).convert("RGB")
+        else:
+            image = [get_image(_image).convert("RGB") for _image in image]
+
+    sample = pipeline(
+        image       = image,
+        prompt      = prompt, 
+        negative_prompt = negative_prompt,
+        height      = sample_size[0],
+        width       = sample_size[1],
+        generator   = generator,
+        guidance_scale = guidance_scale,
+        num_inference_steps = num_inference_steps,
+    ).images
+
+if lora_path is not None:
+    pipeline = unmerge_lora(pipeline, lora_path, lora_weight, device=device, dtype=weight_dtype)
+
+def save_results():
+    if not os.path.exists(save_path):
+        os.makedirs(save_path, exist_ok=True)
+
+    index = len([path for path in os.listdir(save_path)]) + 1
+    prefix = str(index).zfill(8)
+    video_path = os.path.join(save_path, prefix + ".png")
+    image = sample[0]
+    image.save(video_path)
+
+if ulysses_degree * ring_degree > 1:
+    import torch.distributed as dist
+    if dist.get_rank() == 0:
+        save_results()
+else:
+    save_results()