Skip to content

Commit bf8aec3

Browse files
qianyu-dlut
authored and committed
fixbug
1 parent 6b62802 commit bf8aec3

File tree

2 files changed

+21
-35
lines changed

2 files changed

+21
-35
lines changed

examples/community/README.md

Lines changed: 21 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -5566,17 +5566,21 @@ import torch
55665566
from diffusers import VQModel, DiffusionPipeline
55675567
from transformers import AutoTokenizer
55685568

5569+
vqvae = VQModel.from_pretrained("Alpha-VLLM/Lumina-DiMOO", subfolder="vqvae").to(device='cuda', dtype=torch.bfloat16)
5570+
tokenizer = AutoTokenizer.from_pretrained("Alpha-VLLM/Lumina-DiMOO", trust_remote_code=True)
5571+
55695572
pipe = DiffusionPipeline.from_pretrained(
55705573
"Alpha-VLLM/Lumina-DiMOO",
5574+
vqvae=vqvae,
5575+
tokenizer=tokenizer,
55715576
torch_dtype=torch.bfloat16,
5572-
device_map="auto",
55735577
custom_pipeline="lumina_dimoo",
55745578
)
55755579
pipe.to("cuda")
55765580

55775581
prompt = '''A striking photograph of a glass of orange juice on a wooden kitchen table, capturing a playful moment. The orange juice splashes out of the glass and forms the word \"Smile\" in a whimsical, swirling script just above the glass. The background is softly blurred, revealing a cozy, homely kitchen with warm lighting and a sense of comfort.'''
55785582

5579-
out = pipe(
5583+
img = pipe(
55805584
prompt=prompt,
55815585
task="text_to_image",
55825586
height=768,
@@ -5587,7 +5591,7 @@ out = pipe(
55875591
cache_ratio=0.9,
55885592
warmup_ratio=0.3,
55895593
refresh_interval=5
5590-
)
5594+
).images[0]
55915595

55925596
img.save("t2i_test_output.png")
55935597
```
@@ -5604,11 +5608,14 @@ from diffusers import VQModel, DiffusionPipeline
56045608
from transformers import AutoTokenizer
56055609
from diffusers.utils import load_image
56065610

5611+
vqvae = VQModel.from_pretrained("Alpha-VLLM/Lumina-DiMOO", subfolder="vqvae").to(device='cuda', dtype=torch.bfloat16)
5612+
tokenizer = AutoTokenizer.from_pretrained("Alpha-VLLM/Lumina-DiMOO", trust_remote_code=True)
56075613

56085614
pipe = DiffusionPipeline.from_pretrained(
56095615
"Alpha-VLLM/Lumina-DiMOO",
5616+
vqvae=vqvae,
5617+
tokenizer=tokenizer,
56105618
torch_dtype=torch.bfloat16,
5611-
device_map="auto",
56125619
custom_pipeline="lumina_dimoo",
56135620
)
56145621
pipe.to("cuda")
@@ -5619,8 +5626,7 @@ input_image = load_image(
56195626

56205627
prompt = "A functional wooden printer stand.Nestled next to a brick wall in a bustling city street, it stands firm as pedestrians hustle by, illuminated by the warm glow of vintage street lamps."
56215628

5622-
5623-
out = pipe(
5629+
img = pipe(
56245630
prompt=prompt,
56255631
image=input_image,
56265632
edit_type="depth_control",
@@ -5629,9 +5635,10 @@ out = pipe(
56295635
cfg_scale=2.5,
56305636
cfg_img=4.0,
56315637
task="image_to_image"
5632-
)
5638+
).images[0]
56335639

56345640
img.save("i2i_test_output.png")
5641+
56355642
```
56365643

56375644

@@ -5742,11 +5749,14 @@ from diffusers import VQModel, DiffusionPipeline
57425749
from transformers import AutoTokenizer
57435750
from diffusers.utils import load_image
57445751

5752+
vqvae = VQModel.from_pretrained("Alpha-VLLM/Lumina-DiMOO", subfolder="vqvae").to(device='cuda', dtype=torch.bfloat16)
5753+
tokenizer = AutoTokenizer.from_pretrained("Alpha-VLLM/Lumina-DiMOO", trust_remote_code=True)
57455754

57465755
pipe = DiffusionPipeline.from_pretrained(
57475756
"Alpha-VLLM/Lumina-DiMOO",
5757+
vqvae=vqvae,
5758+
tokenizer=tokenizer,
57485759
torch_dtype=torch.bfloat16,
5749-
device_map="auto",
57505760
custom_pipeline="lumina_dimoo",
57515761
)
57525762
pipe.to("cuda")
@@ -5768,5 +5778,7 @@ out = pipe(
57685778
cfg_scale=0.0,
57695779
)
57705780

5771-
img.save("mmu_answer.txt")
5781+
text = getattr(out, "text", out)
5782+
with open("mmu_answer.txt", "w", encoding="utf-8") as f:
5783+
f.write(text.strip() + "\n")
57725784
```

examples/community/lumina_dimoo.py

Lines changed: 0 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1960,8 +1960,6 @@ class LuminaDiMOOPipeline(DiffusionPipeline):
19601960
An `AutoTokenizer` to tokenize text prompts.
19611961
"""
19621962

1963-
model_cpu_offload_seq = "llm->vqvae"
1964-
19651963
def __init__(
19661964
self,
19671965
vqvae: VQModel,
@@ -2349,7 +2347,6 @@ def generate_text_understanding(
23492347

23502348

23512349
@torch.no_grad()
2352-
@replace_example_docstring(EXAMPLE_DOC_STRING)
23532350
def _image_to_image(
23542351
self,
23552352
prompt: str,
@@ -2362,29 +2359,6 @@ def _image_to_image(
23622359
cfg_img: float = 4.0,
23632360
output_type: Optional[str] = "pil",
23642361
):
2365-
"""
2366-
The call function to the pipeline for generation.
2367-
2368-
Args:
2369-
prompt (`str`):
2370-
The prompt to guide image generation.
2371-
image (`PIL.Image.Image` or `str`):
2372-
The input image for image-to-image generation. Can be a PIL Image or a path to an image file.
2373-
ref_image (`PIL.Image.Image` or `str`, *optional*):
2374-
The reference image for style transfer. Can be a PIL Image or a path to an image file.
2375-
edit_type (`str`, *optional*, defaults to `"canny_pred"`):
2376-
The type of image-to-image editing to perform.
2377-
num_inference_steps (`int`, *optional*, defaults to 64):
2378-
The number of denoising steps. More steps usually lead to a higher quality image at the expense of
2379-
slower inference.
2380-
temperature (`float`, *optional*, defaults to 1.0):
2381-
The temperature for sampling. A higher temperature introduces more randomness. 0 for deterministic.
2382-
cfg_scale (`float`, *optional*, defaults to 2.5):
2383-
Classifier-Free Guidance scale for text conditioning.
2384-
cfg_img (`float`, *optional*, defaults to 4.0):
2385-
Classifier-Free Guidance scale for image conditioning.
2386-
2387-
"""
23882362

23892363
if isinstance(prompt, list):
23902364
raise ValueError("Batching is not supported for this pipeline.")

0 commit comments

Comments
 (0)