From 29b2c93c9005c87f8f04b1f0835babbcea736204 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Fri, 30 Dec 2022 11:51:08 +0100 Subject: [PATCH] Make repo structure consistent (#1862) * move files a bit * more refactors * fix more * more fixes * fix more onnx * make style * upload * fix * up * fix more * up again * up * small fix * Update src/diffusers/__init__.py Co-authored-by: Pedro Cuenca * correct Co-authored-by: Pedro Cuenca --- docs/README.md | 6 +- docs/source/api/diffusion_pipeline.mdx | 2 +- docs/source/api/models.mdx | 4 +- docs/source/api/outputs.mdx | 2 +- examples/community/bit_diffusion.py | 3 +- examples/community/checkpoint_merger.py | 8 +- .../community/composable_stable_diffusion.py | 16 +- examples/community/imagic_stable_diffusion.py | 2 +- examples/community/img2img_inpainting.py | 2 +- .../community/interpolate_stable_diffusion.py | 2 +- .../community/lpw_stable_diffusion_onnx.py | 5 +- .../multilingual_stable_diffusion.py | 2 +- examples/community/sd_text2img_k_diffusion.py | 3 +- .../community/seed_resize_stable_diffusion.py | 2 +- examples/community/text_inpainting.py | 2 +- .../community/wildcard_stable_diffusion.py | 2 +- ...ert_stable_diffusion_checkpoint_to_onnx.py | 3 +- src/diffusers/__init__.py | 19 +- .../experimental/rl/value_guided_sampling.py | 2 +- src/diffusers/models/__init__.py | 7 +- src/diffusers/models/attention.py | 2 +- src/diffusers/models/autoencoder_kl.py | 177 ++++ src/diffusers/models/dual_transformer_2d.py | 151 +++ .../modeling_flax_pytorch_utils.py | 2 +- .../{ => models}/modeling_flax_utils.py | 6 +- src/diffusers/{ => models}/modeling_utils.py | 10 +- src/diffusers/models/prior_transformer.py | 2 +- src/diffusers/models/transformer_2d.py | 244 +++++ src/diffusers/models/unet_1d.py | 2 +- src/diffusers/models/unet_2d.py | 2 +- src/diffusers/models/unet_2d_condition.py | 2 +- .../models/unet_2d_condition_flax.py | 2 +- src/diffusers/models/vae.py | 283 +----- src/diffusers/models/vae_flax.py | 2 +- src/diffusers/models/vq_model.py | 148 +++ src/diffusers/pipeline_utils.py | 871 +---------------- src/diffusers/pipelines/__init__.py | 17 + .../alt_diffusion/pipeline_alt_diffusion.py | 2 +- .../pipeline_alt_diffusion_img2img.py | 2 +- .../pipeline_audio_diffusion.py | 2 +- .../pipeline_dance_diffusion.py | 9 +- src/diffusers/pipelines/ddim/pipeline_ddim.py | 9 +- src/diffusers/pipelines/ddpm/pipeline_ddpm.py | 9 +- .../pipeline_latent_diffusion.py | 13 +- ...peline_latent_diffusion_superresolution.py | 9 +- .../pipeline_latent_diffusion_uncond.py | 9 +- src/diffusers/{ => pipelines}/onnx_utils.py | 2 +- .../pipeline_paint_by_example.py | 2 +- .../{ => pipelines}/pipeline_flax_utils.py | 9 +- src/diffusers/pipelines/pipeline_utils.py | 881 ++++++++++++++++++ src/diffusers/pipelines/pndm/pipeline_pndm.py | 9 +- .../pipelines/repaint/pipeline_repaint.py | 9 +- .../score_sde_ve/pipeline_score_sde_ve.py | 9 +- .../pipeline_cycle_diffusion.py | 2 +- .../pipeline_flax_stable_diffusion.py | 2 +- .../pipeline_flax_stable_diffusion_img2img.py | 2 +- .../pipeline_onnx_stable_diffusion.py | 4 +- .../pipeline_onnx_stable_diffusion_img2img.py | 4 +- .../pipeline_onnx_stable_diffusion_inpaint.py | 4 +- ...ne_onnx_stable_diffusion_inpaint_legacy.py | 4 +- .../pipeline_stable_diffusion.py | 2 +- .../pipeline_stable_diffusion_depth2img.py | 2 +- ...peline_stable_diffusion_image_variation.py | 2 +- .../pipeline_stable_diffusion_img2img.py | 2 +- .../pipeline_stable_diffusion_inpaint.py | 2 +- ...ipeline_stable_diffusion_inpaint_legacy.py | 2 +- 
.../pipeline_stable_diffusion_k_diffusion.py | 2 +- .../pipeline_stable_diffusion_upscale.py | 2 +- .../pipeline_stable_diffusion_safe.py | 2 +- .../pipeline_stochastic_karras_ve.py | 9 +- .../pipelines/unclip/pipeline_unclip.py | 8 +- .../unclip/pipeline_unclip_image_variation.py | 8 +- src/diffusers/pipelines/unclip/text_proj.py | 3 +- .../versatile_diffusion/modeling_text_unet.py | 2 +- .../pipeline_versatile_diffusion.py | 2 +- ...ipeline_versatile_diffusion_dual_guided.py | 5 +- ...ine_versatile_diffusion_image_variation.py | 2 +- ...eline_versatile_diffusion_text_to_image.py | 5 +- .../vq_diffusion/pipeline_vq_diffusion.py | 14 +- src/diffusers/utils/__init__.py | 45 +- src/diffusers/utils/constants.py | 44 + src/diffusers/utils/dummy_onnx_objects.py | 19 + src/diffusers/utils/dummy_pt_objects.py | 36 +- .../{ => utils}/dynamic_modules_utils.py | 6 +- src/diffusers/{ => utils}/hub_utils.py | 10 +- tests/fixtures/custom_pipeline/pipeline.py | 6 +- tests/models/test_models_vae.py | 2 +- .../unclip/test_unclip_image_variation.py | 9 +- tests/test_modeling_common.py | 2 +- tests/test_pipelines.py | 4 +- 90 files changed, 1914 insertions(+), 1369 deletions(-) create mode 100644 src/diffusers/models/autoencoder_kl.py create mode 100644 src/diffusers/models/dual_transformer_2d.py rename src/diffusers/{ => models}/modeling_flax_pytorch_utils.py (99%) rename src/diffusers/{ => models}/modeling_flax_utils.py (99%) rename src/diffusers/{ => models}/modeling_utils.py (99%) create mode 100644 src/diffusers/models/transformer_2d.py create mode 100644 src/diffusers/models/vq_model.py rename src/diffusers/{ => pipelines}/onnx_utils.py (98%) rename src/diffusers/{ => pipelines}/pipeline_flax_utils.py (98%) create mode 100644 src/diffusers/pipelines/pipeline_utils.py create mode 100644 src/diffusers/utils/constants.py create mode 100644 src/diffusers/utils/dummy_onnx_objects.py rename src/diffusers/{ => utils}/dynamic_modules_utils.py (99%) rename src/diffusers/{ => utils}/hub_utils.py (96%) diff --git a/docs/README.md b/docs/README.md index b2d48dfe..23b96fa8 100644 --- a/docs/README.md +++ b/docs/README.md @@ -155,9 +155,9 @@ adds a link to its documentation with this syntax: \[\`XXXClass\`\] or \[\`funct function to be in the main package. If you want to create a link to some internal class or function, you need to -provide its path. For instance: \[\`pipeline_utils.ImagePipelineOutput\`\]. This will be converted into a link with -`pipeline_utils.ImagePipelineOutput` in the description. To get rid of the path and only keep the name of the object you are -linking to in the description, add a ~: \[\`~pipeline_utils.ImagePipelineOutput\`\] will generate a link with `ImagePipelineOutput` in the description. +provide its path. For instance: \[\`pipelines.ImagePipelineOutput\`\]. This will be converted into a link with +`pipelines.ImagePipelineOutput` in the description. To get rid of the path and only keep the name of the object you are +linking to in the description, add a ~: \[\`~pipelines.ImagePipelineOutput\`\] will generate a link with `ImagePipelineOutput` in the description. The same works for methods so you can either use \[\`XXXClass.method\`\] or \[~\`XXXClass.method\`\]. 
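The documentation changes above track a rename: `ImagePipelineOutput` now lives under `diffusers.pipelines` rather than the old `diffusers.pipeline_utils` module. A minimal sketch of the resulting import surface, assuming the re-exports added to `src/diffusers/__init__.py` later in this patch (the model ID is the one used in `docs/source/api/outputs.mdx`):

```python
# `ImagePipelineOutput` is re-exported from the `pipelines` subpackage and
# from the top-level `diffusers` namespace after this restructuring.
from diffusers import DDIMPipeline, ImagePipelineOutput

pipeline = DDIMPipeline.from_pretrained("google/ddpm-cifar10-32")
outputs = pipeline()
assert isinstance(outputs, ImagePipelineOutput)
images = outputs.images  # the `images` attribute of the output dataclass
```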
diff --git a/docs/source/api/diffusion_pipeline.mdx b/docs/source/api/diffusion_pipeline.mdx index b037b4e2..ab138e3a 100644 --- a/docs/source/api/diffusion_pipeline.mdx +++ b/docs/source/api/diffusion_pipeline.mdx @@ -39,4 +39,4 @@ Any pipeline object can be saved locally with [`~DiffusionPipeline.save_pretrain ## ImagePipelineOutput By default diffusion pipelines return an object of class -[[autodoc]] pipeline_utils.ImagePipelineOutput +[[autodoc]] pipelines.ImagePipelineOutput diff --git a/docs/source/api/models.mdx b/docs/source/api/models.mdx index 678c902d..0a2183f0 100644 --- a/docs/source/api/models.mdx +++ b/docs/source/api/models.mdx @@ -41,13 +41,13 @@ The models are built on the base class ['ModelMixin'] that is a `torch.nn.module [[autodoc]] models.vae.DecoderOutput ## VQEncoderOutput -[[autodoc]] models.vae.VQEncoderOutput +[[autodoc]] models.vq_model.VQEncoderOutput ## VQModel [[autodoc]] VQModel ## AutoencoderKLOutput -[[autodoc]] models.vae.AutoencoderKLOutput +[[autodoc]] models.autoencoder_kl.AutoencoderKLOutput ## AutoencoderKL [[autodoc]] AutoencoderKL diff --git a/docs/source/api/outputs.mdx b/docs/source/api/outputs.mdx index 010761fb..291a7975 100644 --- a/docs/source/api/outputs.mdx +++ b/docs/source/api/outputs.mdx @@ -25,7 +25,7 @@ pipeline = DDIMPipeline.from_pretrained("google/ddpm-cifar10-32") outputs = pipeline() ``` -The `outputs` object is a [`~pipeline_utils.ImagePipelineOutput`], as we can see in the +The `outputs` object is a [`~pipelines.ImagePipelineOutput`], as we can see in the documentation of that class below, it means it has an image attribute. You can access each attribute as you would usually do, and if that attribute has not been returned by the model, you will get `None`: diff --git a/examples/community/bit_diffusion.py b/examples/community/bit_diffusion.py index 956e25a7..b27b67c9 100644 --- a/examples/community/bit_diffusion.py +++ b/examples/community/bit_diffusion.py @@ -2,8 +2,7 @@ from typing import Optional, Tuple, Union import torch -from diffusers import DDIMScheduler, DDPMScheduler, DiffusionPipeline, UNet2DConditionModel -from diffusers.pipeline_utils import ImagePipelineOutput +from diffusers import DDIMScheduler, DDPMScheduler, DiffusionPipeline, ImagePipelineOutput, UNet2DConditionModel from diffusers.schedulers.scheduling_ddim import DDIMSchedulerOutput from diffusers.schedulers.scheduling_ddpm import DDPMSchedulerOutput from einops import rearrange, reduce diff --git a/examples/community/checkpoint_merger.py b/examples/community/checkpoint_merger.py index 982da664..b6e41822 100644 --- a/examples/community/checkpoint_merger.py +++ b/examples/community/checkpoint_merger.py @@ -5,13 +5,7 @@ from typing import Dict, List, Union import torch from diffusers import DiffusionPipeline, __version__ -from diffusers.pipeline_utils import ( - CONFIG_NAME, - DIFFUSERS_CACHE, - ONNX_WEIGHTS_NAME, - SCHEDULER_CONFIG_NAME, - WEIGHTS_NAME, -) +from diffusers.utils import CONFIG_NAME, DIFFUSERS_CACHE, ONNX_WEIGHTS_NAME, SCHEDULER_CONFIG_NAME, WEIGHTS_NAME from huggingface_hub import snapshot_download diff --git a/examples/community/composable_stable_diffusion.py b/examples/community/composable_stable_diffusion.py index 1d7b6371..7ee99775 100644 --- a/examples/community/composable_stable_diffusion.py +++ b/examples/community/composable_stable_diffusion.py @@ -17,14 +17,10 @@ from typing import Callable, List, Optional, Union import torch -from diffusers.utils import is_accelerate_available -from packaging import version -from transformers import 
CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer - -from ...configuration_utils import FrozenDict -from ...models import AutoencoderKL, UNet2DConditionModel -from ...pipeline_utils import DiffusionPipeline -from ...schedulers import ( +from diffusers import DiffusionPipeline +from diffusers.configuration_utils import FrozenDict +from diffusers.models import AutoencoderKL, UNet2DConditionModel +from diffusers.schedulers import ( DDIMScheduler, DPMSolverMultistepScheduler, EulerAncestralDiscreteScheduler, @@ -32,6 +28,10 @@ from ...schedulers import ( LMSDiscreteScheduler, PNDMScheduler, ) +from diffusers.utils import is_accelerate_available +from packaging import version +from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer + from ...utils import deprecate, logging from . import StableDiffusionPipelineOutput from .safety_checker import StableDiffusionSafetyChecker diff --git a/examples/community/imagic_stable_diffusion.py b/examples/community/imagic_stable_diffusion.py index 2488675f..8b5bb7d0 100644 --- a/examples/community/imagic_stable_diffusion.py +++ b/examples/community/imagic_stable_diffusion.py @@ -12,8 +12,8 @@ import torch.nn.functional as F import PIL from accelerate import Accelerator +from diffusers import DiffusionPipeline from diffusers.models import AutoencoderKL, UNet2DConditionModel -from diffusers.pipeline_utils import DiffusionPipeline from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler diff --git a/examples/community/img2img_inpainting.py b/examples/community/img2img_inpainting.py index 3fa7db13..cb8071e8 100644 --- a/examples/community/img2img_inpainting.py +++ b/examples/community/img2img_inpainting.py @@ -5,9 +5,9 @@ import numpy as np import torch import PIL +from diffusers import DiffusionPipeline from diffusers.configuration_utils import FrozenDict from diffusers.models import AutoencoderKL, UNet2DConditionModel -from diffusers.pipeline_utils import DiffusionPipeline from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler diff --git a/examples/community/interpolate_stable_diffusion.py b/examples/community/interpolate_stable_diffusion.py index 4d7a73f5..9087dd7d 100644 --- a/examples/community/interpolate_stable_diffusion.py +++ b/examples/community/interpolate_stable_diffusion.py @@ -6,9 +6,9 @@ from typing import Callable, List, Optional, Union import numpy as np import torch +from diffusers import DiffusionPipeline from diffusers.configuration_utils import FrozenDict from diffusers.models import AutoencoderKL, UNet2DConditionModel -from diffusers.pipeline_utils import DiffusionPipeline from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler diff --git a/examples/community/lpw_stable_diffusion_onnx.py b/examples/community/lpw_stable_diffusion_onnx.py index 58165dbd..f37bd6e8 100644 --- a/examples/community/lpw_stable_diffusion_onnx.py +++ b/examples/community/lpw_stable_diffusion_onnx.py @@ -7,8 +7,7 @@ import torch import diffusers import PIL 
-from diffusers import OnnxStableDiffusionPipeline, SchedulerMixin -from diffusers.onnx_utils import OnnxRuntimeModel +from diffusers import OnnxRuntimeModel, OnnxStableDiffusionPipeline, SchedulerMixin from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput from diffusers.utils import deprecate, logging from packaging import version @@ -16,7 +15,7 @@ from transformers import CLIPFeatureExtractor, CLIPTokenizer try: - from diffusers.onnx_utils import ORT_TO_NP_TYPE + from diffusers.pipelines.onnx_utils import ORT_TO_NP_TYPE except ImportError: ORT_TO_NP_TYPE = { "tensor(bool)": np.bool_, diff --git a/examples/community/multilingual_stable_diffusion.py b/examples/community/multilingual_stable_diffusion.py index 19974d6d..bcadc803 100644 --- a/examples/community/multilingual_stable_diffusion.py +++ b/examples/community/multilingual_stable_diffusion.py @@ -3,9 +3,9 @@ from typing import Callable, List, Optional, Union import torch +from diffusers import DiffusionPipeline from diffusers.configuration_utils import FrozenDict from diffusers.models import AutoencoderKL, UNet2DConditionModel -from diffusers.pipeline_utils import DiffusionPipeline from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler diff --git a/examples/community/sd_text2img_k_diffusion.py b/examples/community/sd_text2img_k_diffusion.py index 6b15674e..d5b32024 100755 --- a/examples/community/sd_text2img_k_diffusion.py +++ b/examples/community/sd_text2img_k_diffusion.py @@ -18,8 +18,7 @@ from typing import Callable, List, Optional, Union import torch -from diffusers import LMSDiscreteScheduler -from diffusers.pipeline_utils import DiffusionPipeline +from diffusers import DiffusionPipeline, LMSDiscreteScheduler from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput from diffusers.utils import is_accelerate_available, logging from k_diffusion.external import CompVisDenoiser, CompVisVDenoiser diff --git a/examples/community/seed_resize_stable_diffusion.py b/examples/community/seed_resize_stable_diffusion.py index 92cd1c04..a3d17441 100644 --- a/examples/community/seed_resize_stable_diffusion.py +++ b/examples/community/seed_resize_stable_diffusion.py @@ -6,8 +6,8 @@ from typing import Callable, List, Optional, Union import torch +from diffusers import DiffusionPipeline from diffusers.models import AutoencoderKL, UNet2DConditionModel -from diffusers.pipeline_utils import DiffusionPipeline from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler diff --git a/examples/community/text_inpainting.py b/examples/community/text_inpainting.py index f02d449f..8a12044d 100644 --- a/examples/community/text_inpainting.py +++ b/examples/community/text_inpainting.py @@ -3,9 +3,9 @@ from typing import Callable, List, Optional, Union import torch import PIL +from diffusers import DiffusionPipeline from diffusers.configuration_utils import FrozenDict from diffusers.models import AutoencoderKL, UNet2DConditionModel -from diffusers.pipeline_utils import DiffusionPipeline from diffusers.pipelines.stable_diffusion import StableDiffusionInpaintPipeline from diffusers.pipelines.stable_diffusion.safety_checker import 
StableDiffusionSafetyChecker from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler diff --git a/examples/community/wildcard_stable_diffusion.py b/examples/community/wildcard_stable_diffusion.py index ee45e62a..6ba574cd 100644 --- a/examples/community/wildcard_stable_diffusion.py +++ b/examples/community/wildcard_stable_diffusion.py @@ -7,9 +7,9 @@ from typing import Callable, Dict, List, Optional, Union import torch +from diffusers import DiffusionPipeline from diffusers.configuration_utils import FrozenDict from diffusers.models import AutoencoderKL, UNet2DConditionModel -from diffusers.pipeline_utils import DiffusionPipeline from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import StableDiffusionPipelineOutput from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler diff --git a/scripts/convert_stable_diffusion_checkpoint_to_onnx.py b/scripts/convert_stable_diffusion_checkpoint_to_onnx.py index 26d3d561..7a2a682d 100644 --- a/scripts/convert_stable_diffusion_checkpoint_to_onnx.py +++ b/scripts/convert_stable_diffusion_checkpoint_to_onnx.py @@ -21,8 +21,7 @@ import torch from torch.onnx import export import onnx -from diffusers import OnnxStableDiffusionPipeline, StableDiffusionPipeline -from diffusers.onnx_utils import OnnxRuntimeModel +from diffusers import OnnxRuntimeModel, OnnxStableDiffusionPipeline, StableDiffusionPipeline from packaging import version diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index 7c158a4f..46480270 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -1,7 +1,6 @@ __version__ = "0.12.0.dev0" from .configuration_utils import ConfigMixin -from .onnx_utils import OnnxRuntimeModel from .utils import ( OptionalDependencyNotAvailable, is_flax_available, @@ -18,15 +17,23 @@ from .utils import ( ) +try: + if not is_onnx_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from .utils.dummy_onnx_objects import * # noqa F403 +else: + from .pipelines import OnnxRuntimeModel + try: if not is_torch_available(): raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: from .utils.dummy_pt_objects import * # noqa F403 else: - from .modeling_utils import ModelMixin from .models import ( AutoencoderKL, + ModelMixin, PriorTransformer, Transformer2DModel, UNet1DModel, @@ -43,11 +50,13 @@ else: get_polynomial_decay_schedule_with_warmup, get_scheduler, ) - from .pipeline_utils import DiffusionPipeline from .pipelines import ( + AudioPipelineOutput, DanceDiffusionPipeline, DDIMPipeline, DDPMPipeline, + DiffusionPipeline, + ImagePipelineOutput, KarrasVePipeline, LDMPipeline, LDMSuperResolutionPipeline, @@ -150,10 +159,10 @@ try: except OptionalDependencyNotAvailable: from .utils.dummy_flax_objects import * # noqa F403 else: - from .modeling_flax_utils import FlaxModelMixin + from .models.modeling_flax_utils import FlaxModelMixin from .models.unet_2d_condition_flax import FlaxUNet2DConditionModel from .models.vae_flax import FlaxAutoencoderKL - from .pipeline_flax_utils import FlaxDiffusionPipeline + from .pipelines import FlaxDiffusionPipeline from .schedulers import ( FlaxDDIMScheduler, FlaxDDPMScheduler, diff --git a/src/diffusers/experimental/rl/value_guided_sampling.py b/src/diffusers/experimental/rl/value_guided_sampling.py index 27bef081..1c840123 100644 --- 
a/src/diffusers/experimental/rl/value_guided_sampling.py +++ b/src/diffusers/experimental/rl/value_guided_sampling.py @@ -18,7 +18,7 @@ import torch import tqdm from ...models.unet_1d import UNet1DModel -from ...pipeline_utils import DiffusionPipeline +from ...pipelines import DiffusionPipeline from ...utils.dummy_pt_objects import DDPMScheduler diff --git a/src/diffusers/models/__init__.py b/src/diffusers/models/__init__.py index d0ee290f..474b8412 100644 --- a/src/diffusers/models/__init__.py +++ b/src/diffusers/models/__init__.py @@ -16,12 +16,15 @@ from ..utils import is_flax_available, is_torch_available if is_torch_available(): - from .attention import Transformer2DModel + from .autoencoder_kl import AutoencoderKL + from .dual_transformer_2d import DualTransformer2DModel + from .modeling_utils import ModelMixin from .prior_transformer import PriorTransformer + from .transformer_2d import Transformer2DModel from .unet_1d import UNet1DModel from .unet_2d import UNet2DModel from .unet_2d_condition import UNet2DConditionModel - from .vae import AutoencoderKL, VQModel + from .vq_model import VQModel if is_flax_available(): from .unet_2d_condition_flax import FlaxUNet2DConditionModel diff --git a/src/diffusers/models/attention.py b/src/diffusers/models/attention.py index b6f5158e..acee0ff6 100644 --- a/src/diffusers/models/attention.py +++ b/src/diffusers/models/attention.py @@ -20,11 +20,11 @@ import torch.nn.functional as F from torch import nn from ..configuration_utils import ConfigMixin, register_to_config -from ..modeling_utils import ModelMixin from ..models.embeddings import ImagePositionalEmbeddings from ..utils import BaseOutput from ..utils.import_utils import is_xformers_available from .cross_attention import CrossAttention +from .modeling_utils import ModelMixin @dataclass diff --git a/src/diffusers/models/autoencoder_kl.py b/src/diffusers/models/autoencoder_kl.py new file mode 100644 index 00000000..1bf86627 --- /dev/null +++ b/src/diffusers/models/autoencoder_kl.py @@ -0,0 +1,177 @@ +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from dataclasses import dataclass +from typing import Optional, Tuple, Union + +import torch +import torch.nn as nn + +from ..configuration_utils import ConfigMixin, register_to_config +from ..utils import BaseOutput +from .modeling_utils import ModelMixin +from .vae import Decoder, DecoderOutput, DiagonalGaussianDistribution, Encoder + + +@dataclass +class AutoencoderKLOutput(BaseOutput): + """ + Output of AutoencoderKL encoding method. + + Args: + latent_dist (`DiagonalGaussianDistribution`): + Encoded outputs of `Encoder` represented as the mean and logvar of `DiagonalGaussianDistribution`. + `DiagonalGaussianDistribution` allows for sampling latents from the distribution. 
+ """ + + latent_dist: "DiagonalGaussianDistribution" + + +class AutoencoderKL(ModelMixin, ConfigMixin): + r"""Variational Autoencoder (VAE) model with KL loss from the paper Auto-Encoding Variational Bayes by Diederik P. Kingma + and Max Welling. + + This model inherits from [`ModelMixin`]. Check the superclass documentation for the generic methods the library + implements for all the model (such as downloading or saving, etc.) + + Parameters: + in_channels (int, *optional*, defaults to 3): Number of channels in the input image. + out_channels (int, *optional*, defaults to 3): Number of channels in the output. + down_block_types (`Tuple[str]`, *optional*, defaults to : + obj:`("DownEncoderBlock2D",)`): Tuple of downsample block types. + up_block_types (`Tuple[str]`, *optional*, defaults to : + obj:`("UpDecoderBlock2D",)`): Tuple of upsample block types. + block_out_channels (`Tuple[int]`, *optional*, defaults to : + obj:`(64,)`): Tuple of block output channels. + act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use. + latent_channels (`int`, *optional*, defaults to `4`): Number of channels in the latent space. + sample_size (`int`, *optional*, defaults to `32`): TODO + """ + + @register_to_config + def __init__( + self, + in_channels: int = 3, + out_channels: int = 3, + down_block_types: Tuple[str] = ("DownEncoderBlock2D",), + up_block_types: Tuple[str] = ("UpDecoderBlock2D",), + block_out_channels: Tuple[int] = (64,), + layers_per_block: int = 1, + act_fn: str = "silu", + latent_channels: int = 4, + norm_num_groups: int = 32, + sample_size: int = 32, + ): + super().__init__() + + # pass init params to Encoder + self.encoder = Encoder( + in_channels=in_channels, + out_channels=latent_channels, + down_block_types=down_block_types, + block_out_channels=block_out_channels, + layers_per_block=layers_per_block, + act_fn=act_fn, + norm_num_groups=norm_num_groups, + double_z=True, + ) + + # pass init params to Decoder + self.decoder = Decoder( + in_channels=latent_channels, + out_channels=out_channels, + up_block_types=up_block_types, + block_out_channels=block_out_channels, + layers_per_block=layers_per_block, + norm_num_groups=norm_num_groups, + act_fn=act_fn, + ) + + self.quant_conv = nn.Conv2d(2 * latent_channels, 2 * latent_channels, 1) + self.post_quant_conv = nn.Conv2d(latent_channels, latent_channels, 1) + self.use_slicing = False + + def encode(self, x: torch.FloatTensor, return_dict: bool = True) -> AutoencoderKLOutput: + h = self.encoder(x) + moments = self.quant_conv(h) + posterior = DiagonalGaussianDistribution(moments) + + if not return_dict: + return (posterior,) + + return AutoencoderKLOutput(latent_dist=posterior) + + def _decode(self, z: torch.FloatTensor, return_dict: bool = True) -> Union[DecoderOutput, torch.FloatTensor]: + z = self.post_quant_conv(z) + dec = self.decoder(z) + + if not return_dict: + return (dec,) + + return DecoderOutput(sample=dec) + + def enable_slicing(self): + r""" + Enable sliced VAE decoding. + + When this option is enabled, the VAE will split the input tensor in slices to compute decoding in several + steps. This is useful to save some memory and allow larger batch sizes. + """ + self.use_slicing = True + + def disable_slicing(self): + r""" + Disable sliced VAE decoding. If `enable_slicing` was previously invoked, this method will go back to computing + decoding in one step. 
+ """ + self.use_slicing = False + + def decode(self, z: torch.FloatTensor, return_dict: bool = True) -> Union[DecoderOutput, torch.FloatTensor]: + if self.use_slicing and z.shape[0] > 1: + decoded_slices = [self._decode(z_slice).sample for z_slice in z.split(1)] + decoded = torch.cat(decoded_slices) + else: + decoded = self._decode(z).sample + + if not return_dict: + return (decoded,) + + return DecoderOutput(sample=decoded) + + def forward( + self, + sample: torch.FloatTensor, + sample_posterior: bool = False, + return_dict: bool = True, + generator: Optional[torch.Generator] = None, + ) -> Union[DecoderOutput, torch.FloatTensor]: + r""" + Args: + sample (`torch.FloatTensor`): Input sample. + sample_posterior (`bool`, *optional*, defaults to `False`): + Whether to sample from the posterior. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`DecoderOutput`] instead of a plain tuple. + """ + x = sample + posterior = self.encode(x).latent_dist + if sample_posterior: + z = posterior.sample(generator=generator) + else: + z = posterior.mode() + dec = self.decode(z).sample + + if not return_dict: + return (dec,) + + return DecoderOutput(sample=dec) diff --git a/src/diffusers/models/dual_transformer_2d.py b/src/diffusers/models/dual_transformer_2d.py new file mode 100644 index 00000000..654ce405 --- /dev/null +++ b/src/diffusers/models/dual_transformer_2d.py @@ -0,0 +1,151 @@ +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Optional + +from torch import nn + +from .transformer_2d import Transformer2DModel, Transformer2DModelOutput + + +class DualTransformer2DModel(nn.Module): + """ + Dual transformer wrapper that combines two `Transformer2DModel`s for mixed inference. + + Parameters: + num_attention_heads (`int`, *optional*, defaults to 16): The number of heads to use for multi-head attention. + attention_head_dim (`int`, *optional*, defaults to 88): The number of channels in each head. + in_channels (`int`, *optional*): + Pass if the input is continuous. The number of channels in the input and output. + num_layers (`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use. + dropout (`float`, *optional*, defaults to 0.1): The dropout probability to use. + cross_attention_dim (`int`, *optional*): The number of encoder_hidden_states dimensions to use. + sample_size (`int`, *optional*): Pass if the input is discrete. The width of the latent images. + Note that this is fixed at training time as it is used for learning a number of position embeddings. See + `ImagePositionalEmbeddings`. + num_vector_embeds (`int`, *optional*): + Pass if the input is discrete. The number of classes of the vector embeddings of the latent pixels. + Includes the class for the masked latent pixel. + activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward. 
+ num_embeds_ada_norm ( `int`, *optional*): Pass if at least one of the norm_layers is `AdaLayerNorm`. + The number of diffusion steps used during training. Note that this is fixed at training time as it is used + to learn a number of embeddings that are added to the hidden states. During inference, you can denoise for + up to but not more than steps than `num_embeds_ada_norm`. + attention_bias (`bool`, *optional*): + Configure if the TransformerBlocks' attention should contain a bias parameter. + """ + + def __init__( + self, + num_attention_heads: int = 16, + attention_head_dim: int = 88, + in_channels: Optional[int] = None, + num_layers: int = 1, + dropout: float = 0.0, + norm_num_groups: int = 32, + cross_attention_dim: Optional[int] = None, + attention_bias: bool = False, + sample_size: Optional[int] = None, + num_vector_embeds: Optional[int] = None, + activation_fn: str = "geglu", + num_embeds_ada_norm: Optional[int] = None, + ): + super().__init__() + self.transformers = nn.ModuleList( + [ + Transformer2DModel( + num_attention_heads=num_attention_heads, + attention_head_dim=attention_head_dim, + in_channels=in_channels, + num_layers=num_layers, + dropout=dropout, + norm_num_groups=norm_num_groups, + cross_attention_dim=cross_attention_dim, + attention_bias=attention_bias, + sample_size=sample_size, + num_vector_embeds=num_vector_embeds, + activation_fn=activation_fn, + num_embeds_ada_norm=num_embeds_ada_norm, + ) + for _ in range(2) + ] + ) + + # Variables that can be set by a pipeline: + + # The ratio of transformer1 to transformer2's output states to be combined during inference + self.mix_ratio = 0.5 + + # The shape of `encoder_hidden_states` is expected to be + # `(batch_size, condition_lengths[0]+condition_lengths[1], num_features)` + self.condition_lengths = [77, 257] + + # Which transformer to use to encode which condition. + # E.g. `(1, 0)` means that we'll use `transformers[1](conditions[0])` and `transformers[0](conditions[1])` + self.transformer_index_for_condition = [1, 0] + + def forward( + self, + hidden_states, + encoder_hidden_states, + timestep=None, + attention_mask=None, + cross_attention_kwargs=None, + return_dict: bool = True, + ): + """ + Args: + hidden_states ( When discrete, `torch.LongTensor` of shape `(batch size, num latent pixels)`. + When continuous, `torch.FloatTensor` of shape `(batch size, channel, height, width)`): Input + hidden_states + encoder_hidden_states ( `torch.LongTensor` of shape `(batch size, encoder_hidden_states dim)`, *optional*): + Conditional embeddings for cross attention layer. If not given, cross-attention defaults to + self-attention. + timestep ( `torch.long`, *optional*): + Optional timestep to be applied as an embedding in AdaLayerNorm's. Used to indicate denoising step. + attention_mask (`torch.FloatTensor`, *optional*): + Optional attention mask to be applied in CrossAttention + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain tuple. + + Returns: + [`~models.attention.Transformer2DModelOutput`] or `tuple`: [`~models.attention.Transformer2DModelOutput`] + if `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is the sample + tensor. 
+ """ + input_states = hidden_states + + encoded_states = [] + tokens_start = 0 + # attention_mask is not used yet + for i in range(2): + # for each of the two transformers, pass the corresponding condition tokens + condition_state = encoder_hidden_states[:, tokens_start : tokens_start + self.condition_lengths[i]] + transformer_index = self.transformer_index_for_condition[i] + encoded_state = self.transformers[transformer_index]( + input_states, + encoder_hidden_states=condition_state, + timestep=timestep, + cross_attention_kwargs=cross_attention_kwargs, + return_dict=False, + )[0] + encoded_states.append(encoded_state - input_states) + tokens_start += self.condition_lengths[i] + + output_states = encoded_states[0] * self.mix_ratio + encoded_states[1] * (1 - self.mix_ratio) + output_states = output_states + input_states + + if not return_dict: + return (output_states,) + + return Transformer2DModelOutput(sample=output_states) diff --git a/src/diffusers/modeling_flax_pytorch_utils.py b/src/diffusers/models/modeling_flax_pytorch_utils.py similarity index 99% rename from src/diffusers/modeling_flax_pytorch_utils.py rename to src/diffusers/models/modeling_flax_pytorch_utils.py index 9c7a5de2..7463b408 100644 --- a/src/diffusers/modeling_flax_pytorch_utils.py +++ b/src/diffusers/models/modeling_flax_pytorch_utils.py @@ -19,7 +19,7 @@ import jax.numpy as jnp from flax.traverse_util import flatten_dict, unflatten_dict from jax.random import PRNGKey -from .utils import logging +from ..utils import logging logger = logging.get_logger(__name__) diff --git a/src/diffusers/modeling_flax_utils.py b/src/diffusers/models/modeling_flax_utils.py similarity index 99% rename from src/diffusers/modeling_flax_utils.py rename to src/diffusers/models/modeling_flax_utils.py index be9b7dee..aeeeccad 100644 --- a/src/diffusers/modeling_flax_utils.py +++ b/src/diffusers/models/modeling_flax_utils.py @@ -27,9 +27,8 @@ from huggingface_hub import hf_hub_download from huggingface_hub.utils import EntryNotFoundError, RepositoryNotFoundError, RevisionNotFoundError from requests import HTTPError -from . import __version__, is_torch_available -from .modeling_flax_pytorch_utils import convert_pytorch_state_dict_to_flax -from .utils import ( +from .. import __version__, is_torch_available +from ..utils import ( CONFIG_NAME, DIFFUSERS_CACHE, FLAX_WEIGHTS_NAME, @@ -37,6 +36,7 @@ from .utils import ( WEIGHTS_NAME, logging, ) +from .modeling_flax_pytorch_utils import convert_pytorch_state_dict_to_flax logger = logging.get_logger(__name__) diff --git a/src/diffusers/modeling_utils.py b/src/diffusers/models/modeling_utils.py similarity index 99% rename from src/diffusers/modeling_utils.py rename to src/diffusers/models/modeling_utils.py index 9afece32..91c44973 100644 --- a/src/diffusers/modeling_utils.py +++ b/src/diffusers/models/modeling_utils.py @@ -26,11 +26,11 @@ from huggingface_hub import hf_hub_download from huggingface_hub.utils import EntryNotFoundError, RepositoryNotFoundError, RevisionNotFoundError from requests import HTTPError -from . import __version__ -from .hub_utils import HF_HUB_OFFLINE -from .utils import ( +from .. import __version__ +from ..utils import ( CONFIG_NAME, DIFFUSERS_CACHE, + HF_HUB_OFFLINE, HUGGINGFACE_CO_RESOLVE_ENDPOINT, SAFETENSORS_WEIGHTS_NAME, WEIGHTS_NAME, @@ -149,7 +149,7 @@ class ModelMixin(torch.nn.Module): and saving models. - **config_name** ([`str`]) -- A filename under which the model should be stored when calling - [`~modeling_utils.ModelMixin.save_pretrained`]. 
+ [`~models.ModelMixin.save_pretrained`]. """ config_name = CONFIG_NAME _automatically_saved_args = ["_diffusers_version", "_class_name", "_name_or_path"] @@ -231,7 +231,7 @@ class ModelMixin(torch.nn.Module): ): """ Save a model and its configuration file to a directory, so that it can be re-loaded using the - `[`~modeling_utils.ModelMixin.from_pretrained`]` class method. + `[`~models.ModelMixin.from_pretrained`]` class method. Arguments: save_directory (`str` or `os.PathLike`): diff --git a/src/diffusers/models/prior_transformer.py b/src/diffusers/models/prior_transformer.py index 714d5d52..998ca494 100644 --- a/src/diffusers/models/prior_transformer.py +++ b/src/diffusers/models/prior_transformer.py @@ -6,10 +6,10 @@ import torch.nn.functional as F from torch import nn from ..configuration_utils import ConfigMixin, register_to_config -from ..modeling_utils import ModelMixin from ..utils import BaseOutput from .attention import BasicTransformerBlock from .embeddings import TimestepEmbedding, Timesteps +from .modeling_utils import ModelMixin @dataclass diff --git a/src/diffusers/models/transformer_2d.py b/src/diffusers/models/transformer_2d.py new file mode 100644 index 00000000..2513b57e --- /dev/null +++ b/src/diffusers/models/transformer_2d.py @@ -0,0 +1,244 @@ +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from dataclasses import dataclass +from typing import Optional + +import torch +import torch.nn.functional as F +from torch import nn + +from ..configuration_utils import ConfigMixin, register_to_config +from ..models.embeddings import ImagePositionalEmbeddings +from ..utils import BaseOutput +from .attention import BasicTransformerBlock +from .modeling_utils import ModelMixin + + +@dataclass +class Transformer2DModelOutput(BaseOutput): + """ + Args: + sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` or `(batch size, num_vector_embeds - 1, num_latent_pixels)` if [`Transformer2DModel`] is discrete): + Hidden states conditioned on `encoder_hidden_states` input. If discrete, returns probability distributions + for the unnoised latent pixels. + """ + + sample: torch.FloatTensor + + +class Transformer2DModel(ModelMixin, ConfigMixin): + """ + Transformer model for image-like data. Takes either discrete (classes of vector embeddings) or continuous (actual + embeddings) inputs. + + When input is continuous: First, project the input (aka embedding) and reshape to b, t, d. Then apply standard + transformer action. Finally, reshape to image. + + When input is discrete: First, input (classes of latent pixels) is converted to embeddings and has positional + embeddings applied, see `ImagePositionalEmbeddings`. Then apply standard transformer action. Finally, predict + classes of unnoised image. + + Note that it is assumed one of the input classes is the masked latent pixel. The predicted classes of the unnoised + image do not contain a prediction for the masked pixel as the unnoised image cannot be masked. 
+ + Parameters: + num_attention_heads (`int`, *optional*, defaults to 16): The number of heads to use for multi-head attention. + attention_head_dim (`int`, *optional*, defaults to 88): The number of channels in each head. + in_channels (`int`, *optional*): + Pass if the input is continuous. The number of channels in the input and output. + num_layers (`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use. + dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use. + cross_attention_dim (`int`, *optional*): The number of encoder_hidden_states dimensions to use. + sample_size (`int`, *optional*): Pass if the input is discrete. The width of the latent images. + Note that this is fixed at training time as it is used for learning a number of position embeddings. See + `ImagePositionalEmbeddings`. + num_vector_embeds (`int`, *optional*): + Pass if the input is discrete. The number of classes of the vector embeddings of the latent pixels. + Includes the class for the masked latent pixel. + activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward. + num_embeds_ada_norm ( `int`, *optional*): Pass if at least one of the norm_layers is `AdaLayerNorm`. + The number of diffusion steps used during training. Note that this is fixed at training time as it is used + to learn a number of embeddings that are added to the hidden states. During inference, you can denoise for + up to but not more steps than `num_embeds_ada_norm`. + attention_bias (`bool`, *optional*): + Configure if the TransformerBlocks' attention should contain a bias parameter. + """ + + @register_to_config + def __init__( + self, + num_attention_heads: int = 16, + attention_head_dim: int = 88, + in_channels: Optional[int] = None, + num_layers: int = 1, + dropout: float = 0.0, + norm_num_groups: int = 32, + cross_attention_dim: Optional[int] = None, + attention_bias: bool = False, + sample_size: Optional[int] = None, + num_vector_embeds: Optional[int] = None, + activation_fn: str = "geglu", + num_embeds_ada_norm: Optional[int] = None, + use_linear_projection: bool = False, + only_cross_attention: bool = False, + upcast_attention: bool = False, + ): + super().__init__() + self.use_linear_projection = use_linear_projection + self.num_attention_heads = num_attention_heads + self.attention_head_dim = attention_head_dim + inner_dim = num_attention_heads * attention_head_dim + + # 1. Transformer2DModel can process both standard continuous images of shape `(batch_size, num_channels, width, height)` as well as quantized image embeddings of shape `(batch_size, num_image_vectors)` + # Define whether input is continuous or discrete depending on configuration + self.is_input_continuous = in_channels is not None + self.is_input_vectorized = num_vector_embeds is not None + + if self.is_input_continuous and self.is_input_vectorized: + raise ValueError( + f"Cannot define both `in_channels`: {in_channels} and `num_vector_embeds`: {num_vector_embeds}. Make" + " sure that either `in_channels` or `num_vector_embeds` is None." + ) + elif not self.is_input_continuous and not self.is_input_vectorized: + raise ValueError( + f"Has to define either `in_channels`: {in_channels} or `num_vector_embeds`: {num_vector_embeds}. Make" + " sure that either `in_channels` or `num_vector_embeds` is not None." + ) + + # 2.
Define input layers + if self.is_input_continuous: + self.in_channels = in_channels + + self.norm = torch.nn.GroupNorm(num_groups=norm_num_groups, num_channels=in_channels, eps=1e-6, affine=True) + if use_linear_projection: + self.proj_in = nn.Linear(in_channels, inner_dim) + else: + self.proj_in = nn.Conv2d(in_channels, inner_dim, kernel_size=1, stride=1, padding=0) + elif self.is_input_vectorized: + assert sample_size is not None, "Transformer2DModel over discrete input must provide sample_size" + assert num_vector_embeds is not None, "Transformer2DModel over discrete input must provide num_embed" + + self.height = sample_size + self.width = sample_size + self.num_vector_embeds = num_vector_embeds + self.num_latent_pixels = self.height * self.width + + self.latent_image_embedding = ImagePositionalEmbeddings( + num_embed=num_vector_embeds, embed_dim=inner_dim, height=self.height, width=self.width + ) + + # 3. Define transformers blocks + self.transformer_blocks = nn.ModuleList( + [ + BasicTransformerBlock( + inner_dim, + num_attention_heads, + attention_head_dim, + dropout=dropout, + cross_attention_dim=cross_attention_dim, + activation_fn=activation_fn, + num_embeds_ada_norm=num_embeds_ada_norm, + attention_bias=attention_bias, + only_cross_attention=only_cross_attention, + upcast_attention=upcast_attention, + ) + for d in range(num_layers) + ] + ) + + # 4. Define output layers + if self.is_input_continuous: + if use_linear_projection: + self.proj_out = nn.Linear(in_channels, inner_dim) + else: + self.proj_out = nn.Conv2d(inner_dim, in_channels, kernel_size=1, stride=1, padding=0) + elif self.is_input_vectorized: + self.norm_out = nn.LayerNorm(inner_dim) + self.out = nn.Linear(inner_dim, self.num_vector_embeds - 1) + + def forward( + self, + hidden_states, + encoder_hidden_states=None, + timestep=None, + cross_attention_kwargs=None, + return_dict: bool = True, + ): + """ + Args: + hidden_states ( When discrete, `torch.LongTensor` of shape `(batch size, num latent pixels)`. + When continuous, `torch.FloatTensor` of shape `(batch size, channel, height, width)`): Input + hidden_states + encoder_hidden_states ( `torch.LongTensor` of shape `(batch size, encoder_hidden_states dim)`, *optional*): + Conditional embeddings for cross attention layer. If not given, cross-attention defaults to + self-attention. + timestep ( `torch.long`, *optional*): + Optional timestep to be applied as an embedding in AdaLayerNorm's. Used to indicate denoising step. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain tuple. + + Returns: + [`~models.attention.Transformer2DModelOutput`] or `tuple`: [`~models.attention.Transformer2DModelOutput`] + if `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is the sample + tensor. + """ + # 1.
Input + if self.is_input_continuous: + batch, channel, height, width = hidden_states.shape + residual = hidden_states + + hidden_states = self.norm(hidden_states) + if not self.use_linear_projection: + hidden_states = self.proj_in(hidden_states) + inner_dim = hidden_states.shape[1] + hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * width, inner_dim) + else: + inner_dim = hidden_states.shape[1] + hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * width, inner_dim) + hidden_states = self.proj_in(hidden_states) + elif self.is_input_vectorized: + hidden_states = self.latent_image_embedding(hidden_states) + + # 2. Blocks + for block in self.transformer_blocks: + hidden_states = block( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + timestep=timestep, + cross_attention_kwargs=cross_attention_kwargs, + ) + + # 3. Output + if self.is_input_continuous: + if not self.use_linear_projection: + hidden_states = hidden_states.reshape(batch, height, width, inner_dim).permute(0, 3, 1, 2).contiguous() + hidden_states = self.proj_out(hidden_states) + else: + hidden_states = self.proj_out(hidden_states) + hidden_states = hidden_states.reshape(batch, height, width, inner_dim).permute(0, 3, 1, 2).contiguous() + + output = hidden_states + residual + elif self.is_input_vectorized: + hidden_states = self.norm_out(hidden_states) + logits = self.out(hidden_states) + # (batch, self.num_vector_embeds - 1, self.num_latent_pixels) + logits = logits.permute(0, 2, 1) + + # log(p(x_0)) + output = F.log_softmax(logits.double(), dim=1).float() + + if not return_dict: + return (output,) + + return Transformer2DModelOutput(sample=output) diff --git a/src/diffusers/models/unet_1d.py b/src/diffusers/models/unet_1d.py index 00083fb3..2ab6c69d 100644 --- a/src/diffusers/models/unet_1d.py +++ b/src/diffusers/models/unet_1d.py @@ -19,9 +19,9 @@ import torch import torch.nn as nn from ..configuration_utils import ConfigMixin, register_to_config -from ..modeling_utils import ModelMixin from ..utils import BaseOutput from .embeddings import GaussianFourierProjection, TimestepEmbedding, Timesteps +from .modeling_utils import ModelMixin from .unet_1d_blocks import get_down_block, get_mid_block, get_out_block, get_up_block diff --git a/src/diffusers/models/unet_2d.py b/src/diffusers/models/unet_2d.py index b49931a1..ccbb218b 100644 --- a/src/diffusers/models/unet_2d.py +++ b/src/diffusers/models/unet_2d.py @@ -18,9 +18,9 @@ import torch import torch.nn as nn from ..configuration_utils import ConfigMixin, register_to_config -from ..modeling_utils import ModelMixin from ..utils import BaseOutput from .embeddings import GaussianFourierProjection, TimestepEmbedding, Timesteps +from .modeling_utils import ModelMixin from .unet_2d_blocks import UNetMidBlock2D, get_down_block, get_up_block diff --git a/src/diffusers/models/unet_2d_condition.py b/src/diffusers/models/unet_2d_condition.py index d5ccb169..8099cd84 100644 --- a/src/diffusers/models/unet_2d_condition.py +++ b/src/diffusers/models/unet_2d_condition.py @@ -19,10 +19,10 @@ import torch.nn as nn import torch.utils.checkpoint from ..configuration_utils import ConfigMixin, register_to_config -from ..modeling_utils import ModelMixin from ..utils import BaseOutput, logging from .cross_attention import AttnProcessor from .embeddings import TimestepEmbedding, Timesteps +from .modeling_utils import ModelMixin from .unet_2d_blocks import ( CrossAttnDownBlock2D, CrossAttnUpBlock2D, diff --git 
a/src/diffusers/models/unet_2d_condition_flax.py b/src/diffusers/models/unet_2d_condition_flax.py index fb9caca9..8d8308c5 100644 --- a/src/diffusers/models/unet_2d_condition_flax.py +++ b/src/diffusers/models/unet_2d_condition_flax.py @@ -20,9 +20,9 @@ import jax.numpy as jnp from flax.core.frozen_dict import FrozenDict from ..configuration_utils import ConfigMixin, flax_register_to_config -from ..modeling_flax_utils import FlaxModelMixin from ..utils import BaseOutput from .embeddings_flax import FlaxTimestepEmbedding, FlaxTimesteps +from .modeling_flax_utils import FlaxModelMixin from .unet_2d_blocks_flax import ( FlaxCrossAttnDownBlock2D, FlaxCrossAttnUpBlock2D, diff --git a/src/diffusers/models/vae.py b/src/diffusers/models/vae.py index bcfc8789..f46cf7bd 100644 --- a/src/diffusers/models/vae.py +++ b/src/diffusers/models/vae.py @@ -12,14 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. from dataclasses import dataclass -from typing import Optional, Tuple, Union +from typing import Optional import numpy as np import torch import torch.nn as nn -from ..configuration_utils import ConfigMixin, register_to_config -from ..modeling_utils import ModelMixin from ..utils import BaseOutput from .unet_2d_blocks import UNetMidBlock2D, get_down_block, get_up_block @@ -37,33 +35,6 @@ class DecoderOutput(BaseOutput): sample: torch.FloatTensor -@dataclass -class VQEncoderOutput(BaseOutput): - """ - Output of VQModel encoding method. - - Args: - latents (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): - Encoded output sample of the model. Output of the last layer of the model. - """ - - latents: torch.FloatTensor - - -@dataclass -class AutoencoderKLOutput(BaseOutput): - """ - Output of AutoencoderKL encoding method. - - Args: - latent_dist (`DiagonalGaussianDistribution`): - Encoded outputs of `Encoder` represented as the mean and logvar of `DiagonalGaussianDistribution`. - `DiagonalGaussianDistribution` allows for sampling latents from the distribution. - """ - - latent_dist: "DiagonalGaussianDistribution" - - class Encoder(nn.Module): def __init__( self, @@ -384,255 +355,3 @@ class DiagonalGaussianDistribution(object): def mode(self): return self.mean - - -class VQModel(ModelMixin, ConfigMixin): - r"""VQ-VAE model from the paper Neural Discrete Representation Learning by Aaron van den Oord, Oriol Vinyals and Koray - Kavukcuoglu. - - This model inherits from [`ModelMixin`]. Check the superclass documentation for the generic methods the library - implements for all the model (such as downloading or saving, etc.) - - Parameters: - in_channels (int, *optional*, defaults to 3): Number of channels in the input image. - out_channels (int, *optional*, defaults to 3): Number of channels in the output. - down_block_types (`Tuple[str]`, *optional*, defaults to : - obj:`("DownEncoderBlock2D",)`): Tuple of downsample block types. - up_block_types (`Tuple[str]`, *optional*, defaults to : - obj:`("UpDecoderBlock2D",)`): Tuple of upsample block types. - block_out_channels (`Tuple[int]`, *optional*, defaults to : - obj:`(64,)`): Tuple of block output channels. - act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use. - latent_channels (`int`, *optional*, defaults to `3`): Number of channels in the latent space. - sample_size (`int`, *optional*, defaults to `32`): TODO - num_vq_embeddings (`int`, *optional*, defaults to `256`): Number of codebook vectors in the VQ-VAE. 
- vq_embed_dim (`int`, *optional*): Hidden dim of codebook vectors in the VQ-VAE. - """ - - @register_to_config - def __init__( - self, - in_channels: int = 3, - out_channels: int = 3, - down_block_types: Tuple[str] = ("DownEncoderBlock2D",), - up_block_types: Tuple[str] = ("UpDecoderBlock2D",), - block_out_channels: Tuple[int] = (64,), - layers_per_block: int = 1, - act_fn: str = "silu", - latent_channels: int = 3, - sample_size: int = 32, - num_vq_embeddings: int = 256, - norm_num_groups: int = 32, - vq_embed_dim: Optional[int] = None, - ): - super().__init__() - - # pass init params to Encoder - self.encoder = Encoder( - in_channels=in_channels, - out_channels=latent_channels, - down_block_types=down_block_types, - block_out_channels=block_out_channels, - layers_per_block=layers_per_block, - act_fn=act_fn, - norm_num_groups=norm_num_groups, - double_z=False, - ) - - vq_embed_dim = vq_embed_dim if vq_embed_dim is not None else latent_channels - - self.quant_conv = torch.nn.Conv2d(latent_channels, vq_embed_dim, 1) - self.quantize = VectorQuantizer(num_vq_embeddings, vq_embed_dim, beta=0.25, remap=None, sane_index_shape=False) - self.post_quant_conv = torch.nn.Conv2d(vq_embed_dim, latent_channels, 1) - - # pass init params to Decoder - self.decoder = Decoder( - in_channels=latent_channels, - out_channels=out_channels, - up_block_types=up_block_types, - block_out_channels=block_out_channels, - layers_per_block=layers_per_block, - act_fn=act_fn, - norm_num_groups=norm_num_groups, - ) - - def encode(self, x: torch.FloatTensor, return_dict: bool = True) -> VQEncoderOutput: - h = self.encoder(x) - h = self.quant_conv(h) - - if not return_dict: - return (h,) - - return VQEncoderOutput(latents=h) - - def decode( - self, h: torch.FloatTensor, force_not_quantize: bool = False, return_dict: bool = True - ) -> Union[DecoderOutput, torch.FloatTensor]: - # also go through quantization layer - if not force_not_quantize: - quant, emb_loss, info = self.quantize(h) - else: - quant = h - quant = self.post_quant_conv(quant) - dec = self.decoder(quant) - - if not return_dict: - return (dec,) - - return DecoderOutput(sample=dec) - - def forward(self, sample: torch.FloatTensor, return_dict: bool = True) -> Union[DecoderOutput, torch.FloatTensor]: - r""" - Args: - sample (`torch.FloatTensor`): Input sample. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`DecoderOutput`] instead of a plain tuple. - """ - x = sample - h = self.encode(x).latents - dec = self.decode(h).sample - - if not return_dict: - return (dec,) - - return DecoderOutput(sample=dec) - - -class AutoencoderKL(ModelMixin, ConfigMixin): - r"""Variational Autoencoder (VAE) model with KL loss from the paper Auto-Encoding Variational Bayes by Diederik P. Kingma - and Max Welling. - - This model inherits from [`ModelMixin`]. Check the superclass documentation for the generic methods the library - implements for all the model (such as downloading or saving, etc.) - - Parameters: - in_channels (int, *optional*, defaults to 3): Number of channels in the input image. - out_channels (int, *optional*, defaults to 3): Number of channels in the output. - down_block_types (`Tuple[str]`, *optional*, defaults to : - obj:`("DownEncoderBlock2D",)`): Tuple of downsample block types. - up_block_types (`Tuple[str]`, *optional*, defaults to : - obj:`("UpDecoderBlock2D",)`): Tuple of upsample block types. - block_out_channels (`Tuple[int]`, *optional*, defaults to : - obj:`(64,)`): Tuple of block output channels. 
- act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use. - latent_channels (`int`, *optional*, defaults to `4`): Number of channels in the latent space. - sample_size (`int`, *optional*, defaults to `32`): TODO - """ - - @register_to_config - def __init__( - self, - in_channels: int = 3, - out_channels: int = 3, - down_block_types: Tuple[str] = ("DownEncoderBlock2D",), - up_block_types: Tuple[str] = ("UpDecoderBlock2D",), - block_out_channels: Tuple[int] = (64,), - layers_per_block: int = 1, - act_fn: str = "silu", - latent_channels: int = 4, - norm_num_groups: int = 32, - sample_size: int = 32, - ): - super().__init__() - - # pass init params to Encoder - self.encoder = Encoder( - in_channels=in_channels, - out_channels=latent_channels, - down_block_types=down_block_types, - block_out_channels=block_out_channels, - layers_per_block=layers_per_block, - act_fn=act_fn, - norm_num_groups=norm_num_groups, - double_z=True, - ) - - # pass init params to Decoder - self.decoder = Decoder( - in_channels=latent_channels, - out_channels=out_channels, - up_block_types=up_block_types, - block_out_channels=block_out_channels, - layers_per_block=layers_per_block, - norm_num_groups=norm_num_groups, - act_fn=act_fn, - ) - - self.quant_conv = torch.nn.Conv2d(2 * latent_channels, 2 * latent_channels, 1) - self.post_quant_conv = torch.nn.Conv2d(latent_channels, latent_channels, 1) - self.use_slicing = False - - def encode(self, x: torch.FloatTensor, return_dict: bool = True) -> AutoencoderKLOutput: - h = self.encoder(x) - moments = self.quant_conv(h) - posterior = DiagonalGaussianDistribution(moments) - - if not return_dict: - return (posterior,) - - return AutoencoderKLOutput(latent_dist=posterior) - - def _decode(self, z: torch.FloatTensor, return_dict: bool = True) -> Union[DecoderOutput, torch.FloatTensor]: - z = self.post_quant_conv(z) - dec = self.decoder(z) - - if not return_dict: - return (dec,) - - return DecoderOutput(sample=dec) - - def enable_slicing(self): - r""" - Enable sliced VAE decoding. - - When this option is enabled, the VAE will split the input tensor in slices to compute decoding in several - steps. This is useful to save some memory and allow larger batch sizes. - """ - self.use_slicing = True - - def disable_slicing(self): - r""" - Disable sliced VAE decoding. If `enable_slicing` was previously invoked, this method will go back to computing - decoding in one step. - """ - self.use_slicing = False - - def decode(self, z: torch.FloatTensor, return_dict: bool = True) -> Union[DecoderOutput, torch.FloatTensor]: - if self.use_slicing and z.shape[0] > 1: - decoded_slices = [self._decode(z_slice).sample for z_slice in z.split(1)] - decoded = torch.cat(decoded_slices) - else: - decoded = self._decode(z).sample - - if not return_dict: - return (decoded,) - - return DecoderOutput(sample=decoded) - - def forward( - self, - sample: torch.FloatTensor, - sample_posterior: bool = False, - return_dict: bool = True, - generator: Optional[torch.Generator] = None, - ) -> Union[DecoderOutput, torch.FloatTensor]: - r""" - Args: - sample (`torch.FloatTensor`): Input sample. - sample_posterior (`bool`, *optional*, defaults to `False`): - Whether to sample from the posterior. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`DecoderOutput`] instead of a plain tuple. 
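For orientation while this block is relocated: `AutoencoderKL`'s public API is unchanged by the move. A minimal sketch of the encode/sample/decode round trip the `forward` docstring above describes, assuming a hypothetical local checkpoint directory `./vae-checkpoint` saved in diffusers format:

```py
import torch
from diffusers import AutoencoderKL

# Hypothetical local checkpoint, assumed to have been saved via `save_pretrained`.
vae = AutoencoderKL.from_pretrained("./vae-checkpoint")

image = torch.randn(1, 3, 256, 256)        # stand-in for a preprocessed image batch
posterior = vae.encode(image).latent_dist  # DiagonalGaussianDistribution
z = posterior.sample()                     # stochastic, mirrors sample_posterior=True
# z = posterior.mode()                     # deterministic, mirrors sample_posterior=False
reconstruction = vae.decode(z).sample      # DecoderOutput.sample
```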
- """ - x = sample - posterior = self.encode(x).latent_dist - if sample_posterior: - z = posterior.sample(generator=generator) - else: - z = posterior.mode() - dec = self.decode(z).sample - - if not return_dict: - return (dec,) - - return DecoderOutput(sample=dec) diff --git a/src/diffusers/models/vae_flax.py b/src/diffusers/models/vae_flax.py index 9e18d66b..4533bb55 100644 --- a/src/diffusers/models/vae_flax.py +++ b/src/diffusers/models/vae_flax.py @@ -25,8 +25,8 @@ import jax.numpy as jnp from flax.core.frozen_dict import FrozenDict from ..configuration_utils import ConfigMixin, flax_register_to_config -from ..modeling_flax_utils import FlaxModelMixin from ..utils import BaseOutput +from .modeling_flax_utils import FlaxModelMixin @flax.struct.dataclass diff --git a/src/diffusers/models/vq_model.py b/src/diffusers/models/vq_model.py new file mode 100644 index 00000000..18fa80cb --- /dev/null +++ b/src/diffusers/models/vq_model.py @@ -0,0 +1,148 @@ +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from dataclasses import dataclass +from typing import Optional, Tuple, Union + +import torch +import torch.nn as nn + +from ..configuration_utils import ConfigMixin, register_to_config +from ..utils import BaseOutput +from .modeling_utils import ModelMixin +from .vae import Decoder, DecoderOutput, Encoder, VectorQuantizer + + +@dataclass +class VQEncoderOutput(BaseOutput): + """ + Output of VQModel encoding method. + + Args: + latents (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Encoded output sample of the model. Output of the last layer of the model. + """ + + latents: torch.FloatTensor + + +class VQModel(ModelMixin, ConfigMixin): + r"""VQ-VAE model from the paper Neural Discrete Representation Learning by Aaron van den Oord, Oriol Vinyals and Koray + Kavukcuoglu. + + This model inherits from [`ModelMixin`]. Check the superclass documentation for the generic methods the library + implements for all the model (such as downloading or saving, etc.) + + Parameters: + in_channels (int, *optional*, defaults to 3): Number of channels in the input image. + out_channels (int, *optional*, defaults to 3): Number of channels in the output. + down_block_types (`Tuple[str]`, *optional*, defaults to : + obj:`("DownEncoderBlock2D",)`): Tuple of downsample block types. + up_block_types (`Tuple[str]`, *optional*, defaults to : + obj:`("UpDecoderBlock2D",)`): Tuple of upsample block types. + block_out_channels (`Tuple[int]`, *optional*, defaults to : + obj:`(64,)`): Tuple of block output channels. + act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use. + latent_channels (`int`, *optional*, defaults to `3`): Number of channels in the latent space. + sample_size (`int`, *optional*, defaults to `32`): TODO + num_vq_embeddings (`int`, *optional*, defaults to `256`): Number of codebook vectors in the VQ-VAE. + vq_embed_dim (`int`, *optional*): Hidden dim of codebook vectors in the VQ-VAE. 
+ """ + + @register_to_config + def __init__( + self, + in_channels: int = 3, + out_channels: int = 3, + down_block_types: Tuple[str] = ("DownEncoderBlock2D",), + up_block_types: Tuple[str] = ("UpDecoderBlock2D",), + block_out_channels: Tuple[int] = (64,), + layers_per_block: int = 1, + act_fn: str = "silu", + latent_channels: int = 3, + sample_size: int = 32, + num_vq_embeddings: int = 256, + norm_num_groups: int = 32, + vq_embed_dim: Optional[int] = None, + ): + super().__init__() + + # pass init params to Encoder + self.encoder = Encoder( + in_channels=in_channels, + out_channels=latent_channels, + down_block_types=down_block_types, + block_out_channels=block_out_channels, + layers_per_block=layers_per_block, + act_fn=act_fn, + norm_num_groups=norm_num_groups, + double_z=False, + ) + + vq_embed_dim = vq_embed_dim if vq_embed_dim is not None else latent_channels + + self.quant_conv = nn.Conv2d(latent_channels, vq_embed_dim, 1) + self.quantize = VectorQuantizer(num_vq_embeddings, vq_embed_dim, beta=0.25, remap=None, sane_index_shape=False) + self.post_quant_conv = nn.Conv2d(vq_embed_dim, latent_channels, 1) + + # pass init params to Decoder + self.decoder = Decoder( + in_channels=latent_channels, + out_channels=out_channels, + up_block_types=up_block_types, + block_out_channels=block_out_channels, + layers_per_block=layers_per_block, + act_fn=act_fn, + norm_num_groups=norm_num_groups, + ) + + def encode(self, x: torch.FloatTensor, return_dict: bool = True) -> VQEncoderOutput: + h = self.encoder(x) + h = self.quant_conv(h) + + if not return_dict: + return (h,) + + return VQEncoderOutput(latents=h) + + def decode( + self, h: torch.FloatTensor, force_not_quantize: bool = False, return_dict: bool = True + ) -> Union[DecoderOutput, torch.FloatTensor]: + # also go through quantization layer + if not force_not_quantize: + quant, emb_loss, info = self.quantize(h) + else: + quant = h + quant = self.post_quant_conv(quant) + dec = self.decoder(quant) + + if not return_dict: + return (dec,) + + return DecoderOutput(sample=dec) + + def forward(self, sample: torch.FloatTensor, return_dict: bool = True) -> Union[DecoderOutput, torch.FloatTensor]: + r""" + Args: + sample (`torch.FloatTensor`): Input sample. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`DecoderOutput`] instead of a plain tuple. + """ + x = sample + h = self.encode(x).latents + dec = self.decode(h).sample + + if not return_dict: + return (dec,) + + return DecoderOutput(sample=dec) diff --git a/src/diffusers/pipeline_utils.py b/src/diffusers/pipeline_utils.py index 705c9e33..e8df9e93 100644 --- a/src/diffusers/pipeline_utils.py +++ b/src/diffusers/pipeline_utils.py @@ -1,6 +1,4 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# Copyright 2022 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,869 +10,10 @@ # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and + # limitations under the License. -import importlib -import inspect -import os -from dataclasses import dataclass -from pathlib import Path -from typing import Any, Dict, List, Optional, Union +# NOTE: This file is deprecated and will be removed in a future version. 
+# It only exists so that `from diffusers.pipeline_utils import DiffusionPipeline` temporarily keeps working
-import numpy as np
-import torch
-
-import diffusers
-import PIL
-from huggingface_hub import model_info, snapshot_download
-from packaging import version
-from PIL import Image
-from tqdm.auto import tqdm
-
-from .configuration_utils import ConfigMixin
-from .dynamic_modules_utils import get_class_from_dynamic_module
-from .hub_utils import HF_HUB_OFFLINE, http_user_agent
-from .modeling_utils import _LOW_CPU_MEM_USAGE_DEFAULT
-from .schedulers.scheduling_utils import SCHEDULER_CONFIG_NAME
-from .utils import (
-    CONFIG_NAME,
-    DIFFUSERS_CACHE,
-    ONNX_WEIGHTS_NAME,
-    WEIGHTS_NAME,
-    BaseOutput,
-    deprecate,
-    is_accelerate_available,
-    is_safetensors_available,
-    is_torch_version,
-    is_transformers_available,
-    logging,
-)
-
-
-if is_transformers_available():
-    import transformers
-    from transformers import PreTrainedModel
-
-
-INDEX_FILE = "diffusion_pytorch_model.bin"
-CUSTOM_PIPELINE_FILE_NAME = "pipeline.py"
-DUMMY_MODULES_FOLDER = "diffusers.utils"
-TRANSFORMERS_DUMMY_MODULES_FOLDER = "transformers.utils"
-
-
-logger = logging.get_logger(__name__)
-
-
-LOADABLE_CLASSES = {
-    "diffusers": {
-        "ModelMixin": ["save_pretrained", "from_pretrained"],
-        "SchedulerMixin": ["save_pretrained", "from_pretrained"],
-        "DiffusionPipeline": ["save_pretrained", "from_pretrained"],
-        "OnnxRuntimeModel": ["save_pretrained", "from_pretrained"],
-    },
-    "transformers": {
-        "PreTrainedTokenizer": ["save_pretrained", "from_pretrained"],
-        "PreTrainedTokenizerFast": ["save_pretrained", "from_pretrained"],
-        "PreTrainedModel": ["save_pretrained", "from_pretrained"],
-        "FeatureExtractionMixin": ["save_pretrained", "from_pretrained"],
-        "ProcessorMixin": ["save_pretrained", "from_pretrained"],
-        "ImageProcessingMixin": ["save_pretrained", "from_pretrained"],
-    },
-    "onnxruntime.training": {
-        "ORTModule": ["save_pretrained", "from_pretrained"],
-    },
-}
-
-ALL_IMPORTABLE_CLASSES = {}
-for library in LOADABLE_CLASSES:
-    ALL_IMPORTABLE_CLASSES.update(LOADABLE_CLASSES[library])
-
-
-@dataclass
-class ImagePipelineOutput(BaseOutput):
-    """
-    Output class for image pipelines.
-
-    Args:
-        images (`List[PIL.Image.Image]` or `np.ndarray`)
-            List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width,
-            num_channels)`. PIL images or numpy array present the denoised images of the diffusion pipeline.
-    """
-
-    images: Union[List[PIL.Image.Image], np.ndarray]
-
-
-@dataclass
-class AudioPipelineOutput(BaseOutput):
-    """
-    Output class for audio pipelines.
-
-    Args:
-        audios (`np.ndarray`)
-            List of denoised samples of shape `(batch_size, num_channels, sample_rate)`. Numpy array present the
-            denoised audio samples of the diffusion pipeline.
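These output dataclasses move verbatim into `pipelines/pipeline_utils.py` and stay importable from the package root. Since they derive from `BaseOutput`, both attribute access and tuple-style indexing work; a small sketch:

```py
import numpy as np
from diffusers.pipelines import ImagePipelineOutput

out = ImagePipelineOutput(images=np.zeros((1, 64, 64, 3)))
assert out.images is out[0]             # BaseOutput supports attribute and index access
assert out.to_tuple()[0] is out.images  # and conversion to a plain tuple
```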
- """ - - audios: np.ndarray - - -def is_safetensors_compatible(info) -> bool: - filenames = set(sibling.rfilename for sibling in info.siblings) - pt_filenames = set(filename for filename in filenames if filename.endswith(".bin")) - is_safetensors_compatible = any(file.endswith(".safetensors") for file in filenames) - for pt_filename in pt_filenames: - prefix, raw = os.path.split(pt_filename) - if raw == "pytorch_model.bin": - # transformers specific - sf_filename = os.path.join(prefix, "model.safetensors") - else: - sf_filename = pt_filename[: -len(".bin")] + ".safetensors" - if is_safetensors_compatible and sf_filename not in filenames: - logger.warning(f"{sf_filename} not found") - is_safetensors_compatible = False - return is_safetensors_compatible - - -class DiffusionPipeline(ConfigMixin): - r""" - Base class for all models. - - [`DiffusionPipeline`] takes care of storing all components (models, schedulers, processors) for diffusion pipelines - and handles methods for loading, downloading and saving models as well as a few methods common to all pipelines to: - - - move all PyTorch modules to the device of your choice - - enabling/disabling the progress bar for the denoising iteration - - Class attributes: - - - **config_name** (`str`) -- name of the config file that will store the class and module names of all - components of the diffusion pipeline. - - **_optional_components** (List[`str`]) -- list of all components that are optional so they don't have to be - passed for the pipeline to function (should be overridden by subclasses). - """ - config_name = "model_index.json" - _optional_components = [] - - def register_modules(self, **kwargs): - # import it here to avoid circular import - from diffusers import pipelines - - for name, module in kwargs.items(): - # retrieve library - if module is None: - register_dict = {name: (None, None)} - else: - library = module.__module__.split(".")[0] - - # check if the module is a pipeline module - pipeline_dir = module.__module__.split(".")[-2] if len(module.__module__.split(".")) > 2 else None - path = module.__module__.split(".") - is_pipeline_module = pipeline_dir in path and hasattr(pipelines, pipeline_dir) - - # if library is not in LOADABLE_CLASSES, then it is a custom module. - # Or if it's a pipeline module, then the module is inside the pipeline - # folder so we set the library to module name. - if library not in LOADABLE_CLASSES or is_pipeline_module: - library = pipeline_dir - - # retrieve class_name - class_name = module.__class__.__name__ - - register_dict = {name: (library, class_name)} - - # save model index config - self.register_to_config(**register_dict) - - # set models - setattr(self, name, module) - - def save_pretrained( - self, - save_directory: Union[str, os.PathLike], - safe_serialization: bool = False, - ): - """ - Save all variables of the pipeline that can be saved and loaded as well as the pipelines configuration file to - a directory. A pipeline variable can be saved and loaded if its class implements both a save and loading - method. The pipeline can easily be re-loaded using the `[`~DiffusionPipeline.from_pretrained`]` class method. - - Arguments: - save_directory (`str` or `os.PathLike`): - Directory to which to save. Will be created if it doesn't exist. - safe_serialization (`bool`, *optional*, defaults to `False`): - Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). 
- """ - self.save_config(save_directory) - - model_index_dict = dict(self.config) - model_index_dict.pop("_class_name") - model_index_dict.pop("_diffusers_version") - model_index_dict.pop("_module", None) - - expected_modules, optional_kwargs = self._get_signature_keys(self) - - def is_saveable_module(name, value): - if name not in expected_modules: - return False - if name in self._optional_components and value[0] is None: - return False - return True - - model_index_dict = {k: v for k, v in model_index_dict.items() if is_saveable_module(k, v)} - - for pipeline_component_name in model_index_dict.keys(): - sub_model = getattr(self, pipeline_component_name) - model_cls = sub_model.__class__ - - save_method_name = None - # search for the model's base class in LOADABLE_CLASSES - for library_name, library_classes in LOADABLE_CLASSES.items(): - library = importlib.import_module(library_name) - for base_class, save_load_methods in library_classes.items(): - class_candidate = getattr(library, base_class, None) - if class_candidate is not None and issubclass(model_cls, class_candidate): - # if we found a suitable base class in LOADABLE_CLASSES then grab its save method - save_method_name = save_load_methods[0] - break - if save_method_name is not None: - break - - save_method = getattr(sub_model, save_method_name) - - # Call the save method with the argument safe_serialization only if it's supported - save_method_signature = inspect.signature(save_method) - save_method_accept_safe = "safe_serialization" in save_method_signature.parameters - if save_method_accept_safe: - save_method( - os.path.join(save_directory, pipeline_component_name), safe_serialization=safe_serialization - ) - else: - save_method(os.path.join(save_directory, pipeline_component_name)) - - def to(self, torch_device: Optional[Union[str, torch.device]] = None): - if torch_device is None: - return self - - module_names, _, _ = self.extract_init_dict(dict(self.config)) - for name in module_names.keys(): - module = getattr(self, name) - if isinstance(module, torch.nn.Module): - if module.dtype == torch.float16 and str(torch_device) in ["cpu"]: - logger.warning( - "Pipelines loaded with `torch_dtype=torch.float16` cannot run with `cpu` device. It" - " is not recommended to move them to `cpu` as running them will fail. Please make" - " sure to use an accelerator to run the pipeline in inference, due to the lack of" - " support for`float16` operations on this device in PyTorch. Please, remove the" - " `torch_dtype=torch.float16` argument, or use another device for inference." - ) - module.to(torch_device) - return self - - @property - def device(self) -> torch.device: - r""" - Returns: - `torch.device`: The torch device on which the pipeline is located. - """ - module_names, _, _ = self.extract_init_dict(dict(self.config)) - for name in module_names.keys(): - module = getattr(self, name) - if isinstance(module, torch.nn.Module): - return module.device - return torch.device("cpu") - - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], **kwargs): - r""" - Instantiate a PyTorch diffusion pipeline from pre-trained pipeline weights. - - The pipeline is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated). - - The warning *Weights from XXX not initialized from pretrained model* means that the weights of XXX do not come - pretrained with the rest of the model. It is up to you to train those weights with a downstream fine-tuning - task. 
- - The warning *Weights from XXX not used in YYY* means that the layer XXX is not used by YYY, therefore those - weights are discarded. - - Parameters: - pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*): - Can be either: - - - A string, the *repo id* of a pretrained pipeline hosted inside a model repo on - https://huggingface.co/ Valid repo ids have to be located under a user or organization name, like - `CompVis/ldm-text2im-large-256`. - - A path to a *directory* containing pipeline weights saved using - [`~DiffusionPipeline.save_pretrained`], e.g., `./my_pipeline_directory/`. - torch_dtype (`str` or `torch.dtype`, *optional*): - Override the default `torch.dtype` and load the model under this dtype. If `"auto"` is passed the dtype - will be automatically derived from the model's weights. - custom_pipeline (`str`, *optional*): - - - - This is an experimental feature and is likely to change in the future. - - - - Can be either: - - - A string, the *repo id* of a custom pipeline hosted inside a model repo on - https://huggingface.co/. Valid repo ids have to be located under a user or organization name, - like `hf-internal-testing/diffusers-dummy-pipeline`. - - - - It is required that the model repo has a file, called `pipeline.py` that defines the custom - pipeline. - - - - - A string, the *file name* of a community pipeline hosted on GitHub under - https://github.com/huggingface/diffusers/tree/main/examples/community. Valid file names have to - match exactly the file name without `.py` located under the above link, *e.g.* - `clip_guided_stable_diffusion`. - - - - Community pipelines are always loaded from the current `main` branch of GitHub. - - - - - A path to a *directory* containing a custom pipeline, e.g., `./my_pipeline_directory/`. - - - - It is required that the directory has a file, called `pipeline.py` that defines the custom - pipeline. - - - - For more information on how to load and create custom pipelines, please have a look at [Loading and - Adding Custom - Pipelines](https://huggingface.co/docs/diffusers/using-diffusers/custom_pipeline_overview) - - torch_dtype (`str` or `torch.dtype`, *optional*): - force_download (`bool`, *optional*, defaults to `False`): - Whether or not to force the (re-)download of the model weights and configuration files, overriding the - cached versions if they exist. - resume_download (`bool`, *optional*, defaults to `False`): - Whether or not to delete incompletely received files. Will attempt to resume the download if such a - file exists. - proxies (`Dict[str, str]`, *optional*): - A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', - 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. - output_loading_info(`bool`, *optional*, defaults to `False`): - Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages. - local_files_only(`bool`, *optional*, defaults to `False`): - Whether or not to only look at local files (i.e., do not try to download the model). - use_auth_token (`str` or *bool*, *optional*): - The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated - when running `huggingface-cli login` (stored in `~/.huggingface`). - revision (`str`, *optional*, defaults to `"main"`): - The specific model version to use. 
It can be a branch name, a tag name, or a commit id, since we use a - git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any - identifier allowed by git. - custom_revision (`str`, *optional*, defaults to `"main"` when loading from the Hub and to local version of `diffusers` when loading from GitHub): - The specific model version to use. It can be a branch name, a tag name, or a commit id similar to - `revision` when loading a custom pipeline from the Hub. It can be a diffusers version when loading a - custom pipeline from GitHub. - mirror (`str`, *optional*): - Mirror source to accelerate downloads in China. If you are from China and have an accessibility - problem, you can set this option to resolve it. Note that we do not guarantee the timeliness or safety. - Please refer to the mirror site for more information. specify the folder name here. - device_map (`str` or `Dict[str, Union[int, str, torch.device]]`, *optional*): - A map that specifies where each submodule should go. It doesn't need to be refined to each - parameter/buffer name, once a given module name is inside, every submodule of it will be sent to the - same device. - - To have Accelerate compute the most optimized `device_map` automatically, set `device_map="auto"`. For - more information about each option see [designing a device - map](https://hf.co/docs/accelerate/main/en/usage_guides/big_modeling#designing-a-device-map). - low_cpu_mem_usage (`bool`, *optional*, defaults to `True` if torch version >= 1.9.0 else `False`): - Speed up model loading by not initializing the weights and only loading the pre-trained weights. This - also tries to not use more than 1x model size in CPU memory (including peak memory) while loading the - model. This is only supported when torch version >= 1.9.0. If you are using an older version of torch, - setting this argument to `True` will raise an error. - return_cached_folder (`bool`, *optional*, defaults to `False`): - If set to `True`, path to downloaded cached folder will be returned in addition to loaded pipeline. - kwargs (remaining dictionary of keyword arguments, *optional*): - Can be used to overwrite load - and saveable variables - *i.e.* the pipeline components - of the - specific pipeline class. The overwritten components are then directly passed to the pipelines - `__init__` method. See example below for more information. - - - - It is required to be logged in (`huggingface-cli login`) when you want to use private or [gated - models](https://huggingface.co/docs/hub/models-gated#gated-models), *e.g.* `"runwayml/stable-diffusion-v1-5"` - - - - - - Activate the special ["offline-mode"](https://huggingface.co/diffusers/installation.html#offline-mode) to use - this method in a firewalled environment. - - - - Examples: - - ```py - >>> from diffusers import DiffusionPipeline - - >>> # Download pipeline from huggingface.co and cache. 
- >>> pipeline = DiffusionPipeline.from_pretrained("CompVis/ldm-text2im-large-256") - - >>> # Download pipeline that requires an authorization token - >>> # For more information on access tokens, please refer to this section - >>> # of the documentation](https://huggingface.co/docs/hub/security-tokens) - >>> pipeline = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") - - >>> # Use a different scheduler - >>> from diffusers import LMSDiscreteScheduler - - >>> scheduler = LMSDiscreteScheduler.from_config(pipeline.scheduler.config) - >>> pipeline.scheduler = scheduler - ``` - """ - cache_dir = kwargs.pop("cache_dir", DIFFUSERS_CACHE) - resume_download = kwargs.pop("resume_download", False) - force_download = kwargs.pop("force_download", False) - proxies = kwargs.pop("proxies", None) - local_files_only = kwargs.pop("local_files_only", HF_HUB_OFFLINE) - use_auth_token = kwargs.pop("use_auth_token", None) - revision = kwargs.pop("revision", None) - torch_dtype = kwargs.pop("torch_dtype", None) - custom_pipeline = kwargs.pop("custom_pipeline", None) - custom_revision = kwargs.pop("custom_revision", None) - provider = kwargs.pop("provider", None) - sess_options = kwargs.pop("sess_options", None) - device_map = kwargs.pop("device_map", None) - low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", _LOW_CPU_MEM_USAGE_DEFAULT) - return_cached_folder = kwargs.pop("return_cached_folder", False) - - # 1. Download the checkpoints and configs - # use snapshot download here to get it working from from_pretrained - if not os.path.isdir(pretrained_model_name_or_path): - config_dict = cls.load_config( - pretrained_model_name_or_path, - cache_dir=cache_dir, - resume_download=resume_download, - force_download=force_download, - proxies=proxies, - local_files_only=local_files_only, - use_auth_token=use_auth_token, - revision=revision, - ) - # make sure we only download sub-folders and `diffusers` filenames - folder_names = [k for k in config_dict.keys() if not k.startswith("_")] - allow_patterns = [os.path.join(k, "*") for k in folder_names] - allow_patterns += [WEIGHTS_NAME, SCHEDULER_CONFIG_NAME, CONFIG_NAME, ONNX_WEIGHTS_NAME, cls.config_name] - - # make sure we don't download flax weights - ignore_patterns = ["*.msgpack"] - - if custom_pipeline is not None: - allow_patterns += [CUSTOM_PIPELINE_FILE_NAME] - - if cls != DiffusionPipeline: - requested_pipeline_class = cls.__name__ - else: - requested_pipeline_class = config_dict.get("_class_name", cls.__name__) - user_agent = {"pipeline_class": requested_pipeline_class} - if custom_pipeline is not None and not custom_pipeline.endswith(".py"): - user_agent["custom_pipeline"] = custom_pipeline - - user_agent = http_user_agent(user_agent) - - if is_safetensors_available(): - info = model_info( - pretrained_model_name_or_path, - use_auth_token=use_auth_token, - revision=revision, - ) - if is_safetensors_compatible(info): - ignore_patterns.append("*.bin") - else: - ignore_patterns.append("*.safetensors") - - # download all allow_patterns - cached_folder = snapshot_download( - pretrained_model_name_or_path, - cache_dir=cache_dir, - resume_download=resume_download, - proxies=proxies, - local_files_only=local_files_only, - use_auth_token=use_auth_token, - revision=revision, - allow_patterns=allow_patterns, - ignore_patterns=ignore_patterns, - user_agent=user_agent, - ) - else: - cached_folder = pretrained_model_name_or_path - - config_dict = cls.load_config(cached_folder) - - # 2. 
Load the pipeline class, if using custom module then load it from the hub - # if we load from explicit class, let's use it - if custom_pipeline is not None: - if custom_pipeline.endswith(".py"): - path = Path(custom_pipeline) - # decompose into folder & file - file_name = path.name - custom_pipeline = path.parent.absolute() - else: - file_name = CUSTOM_PIPELINE_FILE_NAME - - pipeline_class = get_class_from_dynamic_module( - custom_pipeline, module_file=file_name, cache_dir=cache_dir, revision=custom_revision - ) - elif cls != DiffusionPipeline: - pipeline_class = cls - else: - diffusers_module = importlib.import_module(cls.__module__.split(".")[0]) - pipeline_class = getattr(diffusers_module, config_dict["_class_name"]) - - # To be removed in 1.0.0 - if pipeline_class.__name__ == "StableDiffusionInpaintPipeline" and version.parse( - version.parse(config_dict["_diffusers_version"]).base_version - ) <= version.parse("0.5.1"): - from diffusers import StableDiffusionInpaintPipeline, StableDiffusionInpaintPipelineLegacy - - pipeline_class = StableDiffusionInpaintPipelineLegacy - - deprecation_message = ( - "You are using a legacy checkpoint for inpainting with Stable Diffusion, therefore we are loading the" - f" {StableDiffusionInpaintPipelineLegacy} class instead of {StableDiffusionInpaintPipeline}. For" - " better inpainting results, we strongly suggest using Stable Diffusion's official inpainting" - " checkpoint: https://huggingface.co/runwayml/stable-diffusion-inpainting instead or adapting your" - f" checkpoint {pretrained_model_name_or_path} to the format of" - " https://huggingface.co/runwayml/stable-diffusion-inpainting. Note that we do not actively maintain" - " the {StableDiffusionInpaintPipelineLegacy} class and will likely remove it in version 1.0.0." - ) - deprecate("StableDiffusionInpaintPipelineLegacy", "1.0.0", deprecation_message, standard_warn=False) - - # some modules can be passed directly to the init - # in this case they are already instantiated in `kwargs` - # extract them here - expected_modules, optional_kwargs = cls._get_signature_keys(pipeline_class) - passed_class_obj = {k: kwargs.pop(k) for k in expected_modules if k in kwargs} - passed_pipe_kwargs = {k: kwargs.pop(k) for k in optional_kwargs if k in kwargs} - - init_dict, unused_kwargs, _ = pipeline_class.extract_init_dict(config_dict, **kwargs) - - # define init kwargs - init_kwargs = {k: init_dict.pop(k) for k in optional_kwargs if k in init_dict} - init_kwargs = {**init_kwargs, **passed_pipe_kwargs} - - # remove `null` components - def load_module(name, value): - if value[0] is None: - return False - if name in passed_class_obj and passed_class_obj[name] is None: - return False - return True - - init_dict = {k: v for k, v in init_dict.items() if load_module(k, v)} - - if len(unused_kwargs) > 0: - logger.warning( - f"Keyword arguments {unused_kwargs} are not expected by {pipeline_class.__name__} and will be ignored." - ) - - if low_cpu_mem_usage and not is_accelerate_available(): - low_cpu_mem_usage = False - logger.warning( - "Cannot initialize model with low cpu memory usage because `accelerate` was not found in the" - " environment. Defaulting to `low_cpu_mem_usage=False`. It is strongly recommended to install" - " `accelerate` for faster and less memory-intense model loading. You can do so with: \n```\npip" - " install accelerate\n```\n." - ) - - if device_map is not None and not is_torch_version(">=", "1.9.0"): - raise NotImplementedError( - "Loading and dispatching requires torch >= 1.9.0. 
Please either update your PyTorch version or set" - " `device_map=None`." - ) - - if low_cpu_mem_usage is True and not is_torch_version(">=", "1.9.0"): - raise NotImplementedError( - "Low memory initialization requires torch >= 1.9.0. Please either update your PyTorch version or set" - " `low_cpu_mem_usage=False`." - ) - - if low_cpu_mem_usage is False and device_map is not None: - raise ValueError( - f"You cannot set `low_cpu_mem_usage` to False while using device_map={device_map} for loading and" - " dispatching. Please make sure to set `low_cpu_mem_usage=True`." - ) - - # import it here to avoid circular import - from diffusers import pipelines - - # 3. Load each module in the pipeline - for name, (library_name, class_name) in init_dict.items(): - # 3.1 - now that JAX/Flax is an official framework of the library, we might load from Flax names - if class_name.startswith("Flax"): - class_name = class_name[4:] - - is_pipeline_module = hasattr(pipelines, library_name) - loaded_sub_model = None - - # if the model is in a pipeline module, then we load it from the pipeline - if name in passed_class_obj: - # 1. check that passed_class_obj has correct parent class - if not is_pipeline_module: - library = importlib.import_module(library_name) - class_obj = getattr(library, class_name) - importable_classes = LOADABLE_CLASSES[library_name] - class_candidates = {c: getattr(library, c, None) for c in importable_classes.keys()} - - expected_class_obj = None - for class_name, class_candidate in class_candidates.items(): - if class_candidate is not None and issubclass(class_obj, class_candidate): - expected_class_obj = class_candidate - - if not issubclass(passed_class_obj[name].__class__, expected_class_obj): - raise ValueError( - f"{passed_class_obj[name]} is of type: {type(passed_class_obj[name])}, but should be" - f" {expected_class_obj}" - ) - else: - logger.warning( - f"You have passed a non-standard module {passed_class_obj[name]}. We cannot verify whether it" - " has the correct type" - ) - - # set passed class object - loaded_sub_model = passed_class_obj[name] - elif is_pipeline_module: - pipeline_module = getattr(pipelines, library_name) - class_obj = getattr(pipeline_module, class_name) - importable_classes = ALL_IMPORTABLE_CLASSES - class_candidates = {c: class_obj for c in importable_classes.keys()} - else: - # else we just import it from the library. - library = importlib.import_module(library_name) - - class_obj = getattr(library, class_name) - importable_classes = LOADABLE_CLASSES[library_name] - class_candidates = {c: getattr(library, c, None) for c in importable_classes.keys()} - - if loaded_sub_model is None: - load_method_name = None - for class_name, class_candidate in class_candidates.items(): - if class_candidate is not None and issubclass(class_obj, class_candidate): - load_method_name = importable_classes[class_name][1] - - if load_method_name is None: - none_module = class_obj.__module__ - is_dummy_path = none_module.startswith(DUMMY_MODULES_FOLDER) or none_module.startswith( - TRANSFORMERS_DUMMY_MODULES_FOLDER - ) - if is_dummy_path and "dummy" in none_module: - # call class_obj for nice error message of missing requirements - class_obj() - - raise ValueError( - f"The component {class_obj} of {pipeline_class} cannot be loaded as it does not seem to have" - f" any of the loading methods defined in {ALL_IMPORTABLE_CLASSES}." 
- ) - - load_method = getattr(class_obj, load_method_name) - loading_kwargs = {} - - if issubclass(class_obj, torch.nn.Module): - loading_kwargs["torch_dtype"] = torch_dtype - if issubclass(class_obj, diffusers.OnnxRuntimeModel): - loading_kwargs["provider"] = provider - loading_kwargs["sess_options"] = sess_options - - is_diffusers_model = issubclass(class_obj, diffusers.ModelMixin) - is_transformers_model = ( - is_transformers_available() - and issubclass(class_obj, PreTrainedModel) - and version.parse(version.parse(transformers.__version__).base_version) >= version.parse("4.20.0") - ) - - # When loading a transformers model, if the device_map is None, the weights will be initialized as opposed to diffusers. - # To make default loading faster we set the `low_cpu_mem_usage=low_cpu_mem_usage` flag which is `True` by default. - # This makes sure that the weights won't be initialized which significantly speeds up loading. - if is_diffusers_model or is_transformers_model: - loading_kwargs["device_map"] = device_map - loading_kwargs["low_cpu_mem_usage"] = low_cpu_mem_usage - - # check if the module is in a subdirectory - if os.path.isdir(os.path.join(cached_folder, name)): - loaded_sub_model = load_method(os.path.join(cached_folder, name), **loading_kwargs) - else: - # else load from the root directory - loaded_sub_model = load_method(cached_folder, **loading_kwargs) - - init_kwargs[name] = loaded_sub_model # UNet(...), # DiffusionSchedule(...) - - # 4. Potentially add passed objects if expected - missing_modules = set(expected_modules) - set(init_kwargs.keys()) - passed_modules = list(passed_class_obj.keys()) - optional_modules = pipeline_class._optional_components - if len(missing_modules) > 0 and missing_modules <= set(passed_modules + optional_modules): - for module in missing_modules: - init_kwargs[module] = passed_class_obj.get(module, None) - elif len(missing_modules) > 0: - passed_modules = set(list(init_kwargs.keys()) + list(passed_class_obj.keys())) - optional_kwargs - raise ValueError( - f"Pipeline {pipeline_class} expected {expected_modules}, but only {passed_modules} were passed." - ) - - # 5. Instantiate the pipeline - model = pipeline_class(**init_kwargs) - - if return_cached_folder: - return model, cached_folder - return model - - @staticmethod - def _get_signature_keys(obj): - parameters = inspect.signature(obj.__init__).parameters - required_parameters = {k: v for k, v in parameters.items() if v.default == inspect._empty} - optional_parameters = set({k for k, v in parameters.items() if v.default != inspect._empty}) - expected_modules = set(required_parameters.keys()) - set(["self"]) - return expected_modules, optional_parameters - - @property - def components(self) -> Dict[str, Any]: - r""" - - The `self.components` property can be useful to run different pipelines with the same weights and - configurations to not have to re-allocate memory. - - Examples: - - ```py - >>> from diffusers import ( - ... StableDiffusionPipeline, - ... StableDiffusionImg2ImgPipeline, - ... StableDiffusionInpaintPipeline, - ... ) - - >>> text2img = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") - >>> img2img = StableDiffusionImg2ImgPipeline(**text2img.components) - >>> inpaint = StableDiffusionInpaintPipeline(**text2img.components) - ``` - - Returns: - A dictionary containing all the modules needed to initialize the pipeline. 
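Beyond the reuse example in the docstring above, `components` also makes it easy to swap a single part while sharing the rest of the weights; a hedged sketch (the scheduler choice is illustrative):

```py
from diffusers import DPMSolverMultistepScheduler, StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
parts = pipe.components  # dict keyed by the pipeline's __init__ argument names
parts["scheduler"] = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
pipe2 = StableDiffusionPipeline(**parts)  # shares weights with `pipe`, different scheduler
```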
- """ - expected_modules, optional_parameters = self._get_signature_keys(self) - components = { - k: getattr(self, k) for k in self.config.keys() if not k.startswith("_") and k not in optional_parameters - } - - if set(components.keys()) != expected_modules: - raise ValueError( - f"{self} has been incorrectly initialized or {self.__class__} is incorrectly implemented. Expected" - f" {expected_modules} to be defined, but {components} are defined." - ) - - return components - - @staticmethod - def numpy_to_pil(images): - """ - Convert a numpy image or a batch of images to a PIL image. - """ - if images.ndim == 3: - images = images[None, ...] - images = (images * 255).round().astype("uint8") - if images.shape[-1] == 1: - # special case for grayscale (single channel) images - pil_images = [Image.fromarray(image.squeeze(), mode="L") for image in images] - else: - pil_images = [Image.fromarray(image) for image in images] - - return pil_images - - def progress_bar(self, iterable=None, total=None): - if not hasattr(self, "_progress_bar_config"): - self._progress_bar_config = {} - elif not isinstance(self._progress_bar_config, dict): - raise ValueError( - f"`self._progress_bar_config` should be of type `dict`, but is {type(self._progress_bar_config)}." - ) - - if iterable is not None: - return tqdm(iterable, **self._progress_bar_config) - elif total is not None: - return tqdm(total=total, **self._progress_bar_config) - else: - raise ValueError("Either `total` or `iterable` has to be defined.") - - def set_progress_bar_config(self, **kwargs): - self._progress_bar_config = kwargs - - def enable_xformers_memory_efficient_attention(self): - r""" - Enable memory efficient attention as implemented in xformers. - - When this option is enabled, you should observe lower GPU memory usage and a potential speed up at inference - time. Speed up at training time is not guaranteed. - - Warning: When Memory Efficient Attention and Sliced attention are both enabled, the Memory Efficient Attention - is used. - """ - self.set_use_memory_efficient_attention_xformers(True) - - def disable_xformers_memory_efficient_attention(self): - r""" - Disable memory efficient attention as implemented in xformers. - """ - self.set_use_memory_efficient_attention_xformers(False) - - def set_use_memory_efficient_attention_xformers(self, valid: bool) -> None: - # Recursively walk through all the children. - # Any children which exposes the set_use_memory_efficient_attention_xformers method - # gets the message - def fn_recursive_set_mem_eff(module: torch.nn.Module): - if hasattr(module, "set_use_memory_efficient_attention_xformers"): - module.set_use_memory_efficient_attention_xformers(valid) - - for child in module.children(): - fn_recursive_set_mem_eff(child) - - module_names, _, _ = self.extract_init_dict(dict(self.config)) - for module_name in module_names: - module = getattr(self, module_name) - if isinstance(module, torch.nn.Module): - fn_recursive_set_mem_eff(module) - - def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"): - r""" - Enable sliced attention computation. - - When this option is enabled, the attention module will split the input tensor in slices, to compute attention - in several steps. This is useful to save some memory in exchange for a small speed decrease. - - Args: - slice_size (`str` or `int`, *optional*, defaults to `"auto"`): - When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. 
If - `"max"`, maxium amount of memory will be saved by running only one slice at a time. If a number is - provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim` - must be a multiple of `slice_size`. - """ - self.set_attention_slice(slice_size) - - def disable_attention_slicing(self): - r""" - Disable sliced attention computation. If `enable_attention_slicing` was previously invoked, this method will go - back to computing attention in one step. - """ - # set slice_size = `None` to disable `attention slicing` - self.enable_attention_slicing(None) - - def set_attention_slice(self, slice_size: Optional[int]): - module_names, _, _ = self.extract_init_dict(dict(self.config)) - for module_name in module_names: - module = getattr(self, module_name) - if isinstance(module, torch.nn.Module) and hasattr(module, "set_attention_slice"): - module.set_attention_slice(slice_size) +from .pipelines import DiffusionPipeline, ImagePipelineOutput # noqa: F401 diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index ced96e60..dcf77c1a 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -20,6 +20,7 @@ else: from .ddpm import DDPMPipeline from .latent_diffusion import LDMSuperResolutionPipeline from .latent_diffusion_uncond import LDMPipeline + from .pipeline_utils import AudioPipelineOutput, DiffusionPipeline, ImagePipelineOutput from .pndm import PNDMPipeline from .repaint import RePaintPipeline from .score_sde_ve import ScoreSdeVePipeline @@ -62,6 +63,14 @@ else: ) from .vq_diffusion import VQDiffusionPipeline +try: + if not is_onnx_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ..utils.dummy_onnx_objects import * # noqa F403 +else: + from .onnx_utils import OnnxRuntimeModel + try: if not (is_torch_available() and is_transformers_available() and is_onnx_available()): raise OptionalDependencyNotAvailable() @@ -84,6 +93,14 @@ except OptionalDependencyNotAvailable: else: from .stable_diffusion import StableDiffusionKDiffusionPipeline +try: + if not is_flax_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ..utils.dummy_flax_objects import * # noqa F403 +else: + from .pipeline_flax_utils import FlaxDiffusionPipeline + try: if not (is_flax_available() and is_transformers_available()): diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py index 932c4afc..ab849c2f 100644 --- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py +++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py @@ -23,7 +23,6 @@ from transformers import CLIPFeatureExtractor, XLMRobertaTokenizer from ...configuration_utils import FrozenDict from ...models import AutoencoderKL, UNet2DConditionModel -from ...pipeline_utils import DiffusionPipeline from ...schedulers import ( DDIMScheduler, DPMSolverMultistepScheduler, @@ -33,6 +32,7 @@ from ...schedulers import ( PNDMScheduler, ) from ...utils import deprecate, logging +from ..pipeline_utils import DiffusionPipeline from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker from . 
import AltDiffusionPipelineOutput, RobertaSeriesModelWithTransformation
 
 
diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py
index 6d712cac..c5579e4d 100644
--- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py
+++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py
@@ -25,7 +25,6 @@ from transformers import CLIPFeatureExtractor, XLMRobertaTokenizer
 
 from ...configuration_utils import FrozenDict
 from ...models import AutoencoderKL, UNet2DConditionModel
-from ...pipeline_utils import DiffusionPipeline
 from ...schedulers import (
     DDIMScheduler,
     DPMSolverMultistepScheduler,
@@ -35,6 +34,7 @@ from ...schedulers import (
     PNDMScheduler,
 )
 from ...utils import PIL_INTERPOLATION, deprecate, logging
+from ..pipeline_utils import DiffusionPipeline
 from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker
 from . import AltDiffusionPipelineOutput, RobertaSeriesModelWithTransformation
 
diff --git a/src/diffusers/pipelines/audio_diffusion/pipeline_audio_diffusion.py b/src/diffusers/pipelines/audio_diffusion/pipeline_audio_diffusion.py
index 1d772212..120c1690 100644
--- a/src/diffusers/pipelines/audio_diffusion/pipeline_audio_diffusion.py
+++ b/src/diffusers/pipelines/audio_diffusion/pipeline_audio_diffusion.py
@@ -22,8 +22,8 @@ import torch
 from PIL import Image
 
 from ...models import AutoencoderKL, UNet2DConditionModel
-from ...pipeline_utils import AudioPipelineOutput, BaseOutput, DiffusionPipeline, ImagePipelineOutput
 from ...schedulers import DDIMScheduler, DDPMScheduler
+from ..pipeline_utils import AudioPipelineOutput, BaseOutput, DiffusionPipeline, ImagePipelineOutput
 from .mel import Mel
 
 
diff --git a/src/diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py b/src/diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py
index 903b50cf..4bf93417 100644
--- a/src/diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py
+++ b/src/diffusers/pipelines/dance_diffusion/pipeline_dance_diffusion.py
@@ -17,8 +17,8 @@ from typing import List, Optional, Tuple, Union
 
 import torch
 
-from ...pipeline_utils import AudioPipelineOutput, DiffusionPipeline
 from ...utils import logging
+from ..pipeline_utils import AudioPipelineOutput, DiffusionPipeline
 
 
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
@@ -63,12 +63,11 @@ class DanceDiffusionPipeline(DiffusionPipeline):
                 The length of the generated audio sample in seconds. Note that the output of the pipeline, *i.e.*
                 `sample_size`, will be `audio_length_in_s` * `self.unet.sample_rate`.
             return_dict (`bool`, *optional*, defaults to `True`):
-                Whether or not to return a [`~pipeline_utils.AudioPipelineOutput`] instead of a plain tuple.
+                Whether or not to return a [`~pipelines.AudioPipelineOutput`] instead of a plain tuple.
 
         Returns:
-            [`~pipeline_utils.AudioPipelineOutput`] or `tuple`: [`~pipelines.utils.AudioPipelineOutput`] if
-            `return_dict` is True, otherwise a `tuple. When returning a tuple, the first element is a list with the
-            generated images.
+            [`~pipelines.AudioPipelineOutput`] or `tuple`: [`~pipelines.AudioPipelineOutput`] if `return_dict` is
+            True, otherwise a `tuple`. When returning a tuple, the first element is a list with the generated audio.
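All the pipelines touched here follow the same `return_dict` convention these docstrings describe; a compact sketch with an illustrative checkpoint:

```py
from diffusers import DDIMPipeline

pipe = DDIMPipeline.from_pretrained("google/ddpm-cifar10-32")  # illustrative repo id

out = pipe(num_inference_steps=2)  # ImagePipelineOutput by default
images = out.images

(images,) = pipe(num_inference_steps=2, return_dict=False)  # plain-tuple variant
```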
""" if audio_length_in_s is None: diff --git a/src/diffusers/pipelines/ddim/pipeline_ddim.py b/src/diffusers/pipelines/ddim/pipeline_ddim.py index a3d4b589..5489abf3 100644 --- a/src/diffusers/pipelines/ddim/pipeline_ddim.py +++ b/src/diffusers/pipelines/ddim/pipeline_ddim.py @@ -16,8 +16,8 @@ from typing import List, Optional, Tuple, Union import torch -from ...pipeline_utils import DiffusionPipeline, ImagePipelineOutput from ...utils import deprecate +from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput class DDIMPipeline(DiffusionPipeline): @@ -66,12 +66,11 @@ class DDIMPipeline(DiffusionPipeline): The output format of the generate image. Choose between [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipeline_utils.ImagePipelineOutput`] instead of a plain tuple. + Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. Returns: - [`~pipeline_utils.ImagePipelineOutput`] or `tuple`: [`~pipelines.utils.ImagePipelineOutput`] if - `return_dict` is True, otherwise a `tuple. When returning a tuple, the first element is a list with the - generated images. + [`~pipelines.ImagePipelineOutput`] or `tuple`: [`~pipelines.utils.ImagePipelineOutput`] if `return_dict` is + True, otherwise a `tuple. When returning a tuple, the first element is a list with the generated images. """ if ( diff --git a/src/diffusers/pipelines/ddpm/pipeline_ddpm.py b/src/diffusers/pipelines/ddpm/pipeline_ddpm.py index 123b4f84..f10e3aa9 100644 --- a/src/diffusers/pipelines/ddpm/pipeline_ddpm.py +++ b/src/diffusers/pipelines/ddpm/pipeline_ddpm.py @@ -18,8 +18,8 @@ from typing import List, Optional, Tuple, Union import torch from ...configuration_utils import FrozenDict -from ...pipeline_utils import DiffusionPipeline, ImagePipelineOutput from ...utils import deprecate +from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput class DDPMPipeline(DiffusionPipeline): @@ -62,12 +62,11 @@ class DDPMPipeline(DiffusionPipeline): The output format of the generate image. Choose between [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipeline_utils.ImagePipelineOutput`] instead of a plain tuple. + Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. Returns: - [`~pipeline_utils.ImagePipelineOutput`] or `tuple`: [`~pipelines.utils.ImagePipelineOutput`] if - `return_dict` is True, otherwise a `tuple. When returning a tuple, the first element is a list with the - generated images. + [`~pipelines.ImagePipelineOutput`] or `tuple`: [`~pipelines.utils.ImagePipelineOutput`] if `return_dict` is + True, otherwise a `tuple. When returning a tuple, the first element is a list with the generated images. """ message = ( "Please make sure to instantiate your scheduler with `prediction_type` instead. E.g. 
diff --git a/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py b/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py
index ec0c71af..ca3408d8 100644
--- a/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py
+++ b/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py
@@ -19,16 +19,14 @@ import torch
 import torch.nn as nn
 import torch.utils.checkpoint
 
+from transformers import PretrainedConfig, PreTrainedModel, PreTrainedTokenizer
 from transformers.activations import ACT2FN
-from transformers.configuration_utils import PretrainedConfig
 from transformers.modeling_outputs import BaseModelOutput
-from transformers.modeling_utils import PreTrainedModel
-from transformers.tokenization_utils import PreTrainedTokenizer
 from transformers.utils import logging
 
 from ...models import AutoencoderKL, UNet2DConditionModel, UNet2DModel, VQModel
-from ...pipeline_utils import DiffusionPipeline, ImagePipelineOutput
 from ...schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
+from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
 
 
 class LDMTextToImagePipeline(DiffusionPipeline):
@@ -105,12 +103,11 @@ class LDMTextToImagePipeline(DiffusionPipeline):
                 The output format of the generate image. Choose between
                 [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
             return_dict (`bool`, *optional*):
-                Whether or not to return a [`~pipeline_utils.ImagePipelineOutput`] instead of a plain tuple.
+                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
 
         Returns:
-            [`~pipeline_utils.ImagePipelineOutput`] or `tuple`: [`~pipelines.utils.ImagePipelineOutput`] if
-            `return_dict` is True, otherwise a `tuple. When returning a tuple, the first element is a list with the
-            generated images.
+            [`~pipelines.ImagePipelineOutput`] or `tuple`: [`~pipelines.ImagePipelineOutput`] if `return_dict` is
+            True, otherwise a `tuple`. When returning a tuple, the first element is a list with the generated images.
         """
         # 0. Default height and width to unet
         height = height or self.unet.config.sample_size * self.vae_scale_factor
diff --git a/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py b/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py
index a97d18c9..18b8c498 100644
--- a/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py
+++ b/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion_superresolution.py
@@ -8,7 +8,6 @@ import torch.utils.checkpoint
 import PIL
 
 from ...models import UNet2DModel, VQModel
-from ...pipeline_utils import DiffusionPipeline, ImagePipelineOutput
 from ...schedulers import (
     DDIMScheduler,
     DPMSolverMultistepScheduler,
@@ -18,6 +17,7 @@ from ...schedulers import (
     PNDMScheduler,
 )
 from ...utils import PIL_INTERPOLATION, deprecate
+from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
 
 
 def preprocess(image):
@@ -95,12 +95,11 @@ class LDMSuperResolutionPipeline(DiffusionPipeline):
                 The output format of the generate image. Choose between
                 [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
             return_dict (`bool`, *optional*):
-                Whether or not to return a [`~pipeline_utils.ImagePipelineOutput`] instead of a plain tuple.
+                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
Returns: - [`~pipeline_utils.ImagePipelineOutput`] or `tuple`: [`~pipelines.utils.ImagePipelineOutput`] if - `return_dict` is True, otherwise a `tuple. When returning a tuple, the first element is a list with the - generated images. + [`~pipelines.ImagePipelineOutput`] or `tuple`: [`~pipelines.utils.ImagePipelineOutput`] if `return_dict` is + True, otherwise a `tuple. When returning a tuple, the first element is a list with the generated images. """ message = "Please use `image` instead of `init_image`." init_image = deprecate("init_image", "0.13.0", message, take_from=kwargs) diff --git a/src/diffusers/pipelines/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py b/src/diffusers/pipelines/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py index 19380d36..d8717023 100644 --- a/src/diffusers/pipelines/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py +++ b/src/diffusers/pipelines/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py @@ -18,8 +18,8 @@ from typing import List, Optional, Tuple, Union import torch from ...models import UNet2DModel, VQModel -from ...pipeline_utils import DiffusionPipeline, ImagePipelineOutput from ...schedulers import DDIMScheduler +from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput class LDMPipeline(DiffusionPipeline): @@ -64,12 +64,11 @@ class LDMPipeline(DiffusionPipeline): The output format of the generate image. Choose between [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipeline_utils.ImagePipelineOutput`] instead of a plain tuple. + Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. Returns: - [`~pipeline_utils.ImagePipelineOutput`] or `tuple`: [`~pipelines.utils.ImagePipelineOutput`] if - `return_dict` is True, otherwise a `tuple. When returning a tuple, the first element is a list with the - generated images. + [`~pipelines.ImagePipelineOutput`] or `tuple`: [`~pipelines.utils.ImagePipelineOutput`] if `return_dict` is + True, otherwise a `tuple. When returning a tuple, the first element is a list with the generated images. 
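The `ImagePipelineOutput` convention these docstrings keep referring to is easiest to see end to end. A minimal sketch of the default output versus `return_dict=False`; the model id is only an illustrative example, not part of this patch:

```py
from diffusers import DDPMPipeline

pipe = DDPMPipeline.from_pretrained("google/ddpm-cifar10-32")

output = pipe()  # an ImagePipelineOutput by default
image = output.images[0]  # `images` is a list of PIL images

(images,) = pipe(return_dict=False)  # plain tuple; the first element is the image list
```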
""" latents = torch.randn( diff --git a/src/diffusers/onnx_utils.py b/src/diffusers/pipelines/onnx_utils.py similarity index 98% rename from src/diffusers/onnx_utils.py rename to src/diffusers/pipelines/onnx_utils.py index b2c533ed..9308a187 100644 --- a/src/diffusers/onnx_utils.py +++ b/src/diffusers/pipelines/onnx_utils.py @@ -24,7 +24,7 @@ import numpy as np from huggingface_hub import hf_hub_download -from .utils import ONNX_EXTERNAL_WEIGHTS_NAME, ONNX_WEIGHTS_NAME, is_onnx_available, logging +from ..utils import ONNX_EXTERNAL_WEIGHTS_NAME, ONNX_WEIGHTS_NAME, is_onnx_available, logging if is_onnx_available(): diff --git a/src/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py b/src/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py index f9de7d91..b97e46b3 100644 --- a/src/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py +++ b/src/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py @@ -23,9 +23,9 @@ from diffusers.utils import is_accelerate_available from transformers import CLIPFeatureExtractor from ...models import AutoencoderKL, UNet2DConditionModel -from ...pipeline_utils import DiffusionPipeline from ...schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler from ...utils import logging +from ..pipeline_utils import DiffusionPipeline from ..stable_diffusion import StableDiffusionPipelineOutput from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker from .image_encoder import PaintByExampleImageEncoder diff --git a/src/diffusers/pipeline_flax_utils.py b/src/diffusers/pipelines/pipeline_flax_utils.py similarity index 98% rename from src/diffusers/pipeline_flax_utils.py rename to src/diffusers/pipelines/pipeline_flax_utils.py index 84817423..1922d0fa 100644 --- a/src/diffusers/pipeline_flax_utils.py +++ b/src/diffusers/pipelines/pipeline_flax_utils.py @@ -28,11 +28,10 @@ from huggingface_hub import snapshot_download from PIL import Image from tqdm.auto import tqdm -from .configuration_utils import ConfigMixin -from .hub_utils import http_user_agent -from .modeling_flax_utils import FLAX_WEIGHTS_NAME, FlaxModelMixin -from .schedulers.scheduling_utils_flax import SCHEDULER_CONFIG_NAME, FlaxSchedulerMixin -from .utils import CONFIG_NAME, DIFFUSERS_CACHE, BaseOutput, is_transformers_available, logging +from ..configuration_utils import ConfigMixin +from ..models.modeling_flax_utils import FLAX_WEIGHTS_NAME, FlaxModelMixin +from ..schedulers.scheduling_utils_flax import SCHEDULER_CONFIG_NAME, FlaxSchedulerMixin +from ..utils import CONFIG_NAME, DIFFUSERS_CACHE, BaseOutput, http_user_agent, is_transformers_available, logging if is_transformers_available(): diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py new file mode 100644 index 00000000..854a003e --- /dev/null +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -0,0 +1,881 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py
new file mode 100644
index 00000000..854a003e
--- /dev/null
+++ b/src/diffusers/pipelines/pipeline_utils.py
@@ -0,0 +1,881 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team.
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import importlib
+import inspect
+import os
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Union
+
+import numpy as np
+import torch
+
+import diffusers
+import PIL
+from huggingface_hub import model_info, snapshot_download
+from packaging import version
+from PIL import Image
+from tqdm.auto import tqdm
+
+from ..configuration_utils import ConfigMixin
+from ..models.modeling_utils import _LOW_CPU_MEM_USAGE_DEFAULT
+from ..schedulers.scheduling_utils import SCHEDULER_CONFIG_NAME
+from ..utils import (
+    CONFIG_NAME,
+    DIFFUSERS_CACHE,
+    HF_HUB_OFFLINE,
+    ONNX_WEIGHTS_NAME,
+    WEIGHTS_NAME,
+    BaseOutput,
+    deprecate,
+    get_class_from_dynamic_module,
+    http_user_agent,
+    is_accelerate_available,
+    is_safetensors_available,
+    is_torch_version,
+    is_transformers_available,
+    logging,
+)
+
+
+if is_transformers_available():
+    import transformers
+    from transformers import PreTrainedModel
+
+
+INDEX_FILE = "diffusion_pytorch_model.bin"
+CUSTOM_PIPELINE_FILE_NAME = "pipeline.py"
+DUMMY_MODULES_FOLDER = "diffusers.utils"
+TRANSFORMERS_DUMMY_MODULES_FOLDER = "transformers.utils"
+
+
+logger = logging.get_logger(__name__)
+
+
+LOADABLE_CLASSES = {
+    "diffusers": {
+        "ModelMixin": ["save_pretrained", "from_pretrained"],
+        "SchedulerMixin": ["save_pretrained", "from_pretrained"],
+        "DiffusionPipeline": ["save_pretrained", "from_pretrained"],
+        "OnnxRuntimeModel": ["save_pretrained", "from_pretrained"],
+    },
+    "transformers": {
+        "PreTrainedTokenizer": ["save_pretrained", "from_pretrained"],
+        "PreTrainedTokenizerFast": ["save_pretrained", "from_pretrained"],
+        "PreTrainedModel": ["save_pretrained", "from_pretrained"],
+        "FeatureExtractionMixin": ["save_pretrained", "from_pretrained"],
+        "ProcessorMixin": ["save_pretrained", "from_pretrained"],
+        "ImageProcessingMixin": ["save_pretrained", "from_pretrained"],
+    },
+    "onnxruntime.training": {
+        "ORTModule": ["save_pretrained", "from_pretrained"],
+    },
+}
+
+ALL_IMPORTABLE_CLASSES = {}
+for library in LOADABLE_CLASSES:
+    ALL_IMPORTABLE_CLASSES.update(LOADABLE_CLASSES[library])
+
+
+@dataclass
+class ImagePipelineOutput(BaseOutput):
+    """
+    Output class for image pipelines.
+
+    Args:
+        images (`List[PIL.Image.Image]` or `np.ndarray`):
+            List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width,
+            num_channels)`. PIL images or numpy array representing the denoised images of the diffusion pipeline.
+    """
+
+    images: Union[List[PIL.Image.Image], np.ndarray]
+
+
+@dataclass
+class AudioPipelineOutput(BaseOutput):
+    """
+    Output class for audio pipelines.
+
+    Args:
+        audios (`np.ndarray`):
+            List of denoised samples of shape `(batch_size, num_channels, sample_rate)`. Numpy array representing the
+            denoised audio samples of the diffusion pipeline.
+ """ + + audios: np.ndarray + + +def is_safetensors_compatible(info) -> bool: + filenames = set(sibling.rfilename for sibling in info.siblings) + pt_filenames = set(filename for filename in filenames if filename.endswith(".bin")) + is_safetensors_compatible = any(file.endswith(".safetensors") for file in filenames) + for pt_filename in pt_filenames: + prefix, raw = os.path.split(pt_filename) + if raw == "pytorch_model.bin": + # transformers specific + sf_filename = os.path.join(prefix, "model.safetensors") + else: + sf_filename = pt_filename[: -len(".bin")] + ".safetensors" + if is_safetensors_compatible and sf_filename not in filenames: + logger.warning(f"{sf_filename} not found") + is_safetensors_compatible = False + return is_safetensors_compatible + + +class DiffusionPipeline(ConfigMixin): + r""" + Base class for all models. + + [`DiffusionPipeline`] takes care of storing all components (models, schedulers, processors) for diffusion pipelines + and handles methods for loading, downloading and saving models as well as a few methods common to all pipelines to: + + - move all PyTorch modules to the device of your choice + - enabling/disabling the progress bar for the denoising iteration + + Class attributes: + + - **config_name** (`str`) -- name of the config file that will store the class and module names of all + components of the diffusion pipeline. + - **_optional_components** (List[`str`]) -- list of all components that are optional so they don't have to be + passed for the pipeline to function (should be overridden by subclasses). + """ + config_name = "model_index.json" + _optional_components = [] + + def register_modules(self, **kwargs): + # import it here to avoid circular import + from diffusers import pipelines + + for name, module in kwargs.items(): + # retrieve library + if module is None: + register_dict = {name: (None, None)} + else: + library = module.__module__.split(".")[0] + + # check if the module is a pipeline module + pipeline_dir = module.__module__.split(".")[-2] if len(module.__module__.split(".")) > 2 else None + path = module.__module__.split(".") + is_pipeline_module = pipeline_dir in path and hasattr(pipelines, pipeline_dir) + + # if library is not in LOADABLE_CLASSES, then it is a custom module. + # Or if it's a pipeline module, then the module is inside the pipeline + # folder so we set the library to module name. + if library not in LOADABLE_CLASSES or is_pipeline_module: + library = pipeline_dir + + # retrieve class_name + class_name = module.__class__.__name__ + + register_dict = {name: (library, class_name)} + + # save model index config + self.register_to_config(**register_dict) + + # set models + setattr(self, name, module) + + def save_pretrained( + self, + save_directory: Union[str, os.PathLike], + safe_serialization: bool = False, + ): + """ + Save all variables of the pipeline that can be saved and loaded as well as the pipelines configuration file to + a directory. A pipeline variable can be saved and loaded if its class implements both a save and loading + method. The pipeline can easily be re-loaded using the `[`~DiffusionPipeline.from_pretrained`]` class method. + + Arguments: + save_directory (`str` or `os.PathLike`): + Directory to which to save. Will be created if it doesn't exist. + safe_serialization (`bool`, *optional*, defaults to `False`): + Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). 
+ """ + self.save_config(save_directory) + + model_index_dict = dict(self.config) + model_index_dict.pop("_class_name") + model_index_dict.pop("_diffusers_version") + model_index_dict.pop("_module", None) + + expected_modules, optional_kwargs = self._get_signature_keys(self) + + def is_saveable_module(name, value): + if name not in expected_modules: + return False + if name in self._optional_components and value[0] is None: + return False + return True + + model_index_dict = {k: v for k, v in model_index_dict.items() if is_saveable_module(k, v)} + + for pipeline_component_name in model_index_dict.keys(): + sub_model = getattr(self, pipeline_component_name) + model_cls = sub_model.__class__ + + save_method_name = None + # search for the model's base class in LOADABLE_CLASSES + for library_name, library_classes in LOADABLE_CLASSES.items(): + library = importlib.import_module(library_name) + for base_class, save_load_methods in library_classes.items(): + class_candidate = getattr(library, base_class, None) + if class_candidate is not None and issubclass(model_cls, class_candidate): + # if we found a suitable base class in LOADABLE_CLASSES then grab its save method + save_method_name = save_load_methods[0] + break + if save_method_name is not None: + break + + save_method = getattr(sub_model, save_method_name) + + # Call the save method with the argument safe_serialization only if it's supported + save_method_signature = inspect.signature(save_method) + save_method_accept_safe = "safe_serialization" in save_method_signature.parameters + if save_method_accept_safe: + save_method( + os.path.join(save_directory, pipeline_component_name), safe_serialization=safe_serialization + ) + else: + save_method(os.path.join(save_directory, pipeline_component_name)) + + def to(self, torch_device: Optional[Union[str, torch.device]] = None): + if torch_device is None: + return self + + module_names, _, _ = self.extract_init_dict(dict(self.config)) + for name in module_names.keys(): + module = getattr(self, name) + if isinstance(module, torch.nn.Module): + if module.dtype == torch.float16 and str(torch_device) in ["cpu"]: + logger.warning( + "Pipelines loaded with `torch_dtype=torch.float16` cannot run with `cpu` device. It" + " is not recommended to move them to `cpu` as running them will fail. Please make" + " sure to use an accelerator to run the pipeline in inference, due to the lack of" + " support for`float16` operations on this device in PyTorch. Please, remove the" + " `torch_dtype=torch.float16` argument, or use another device for inference." + ) + module.to(torch_device) + return self + + @property + def device(self) -> torch.device: + r""" + Returns: + `torch.device`: The torch device on which the pipeline is located. + """ + module_names, _, _ = self.extract_init_dict(dict(self.config)) + for name in module_names.keys(): + module = getattr(self, name) + if isinstance(module, torch.nn.Module): + return module.device + return torch.device("cpu") + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], **kwargs): + r""" + Instantiate a PyTorch diffusion pipeline from pre-trained pipeline weights. + + The pipeline is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated). + + The warning *Weights from XXX not initialized from pretrained model* means that the weights of XXX do not come + pretrained with the rest of the model. It is up to you to train those weights with a downstream fine-tuning + task. 
+
+        The warning *Weights from XXX not used in YYY* means that the layer XXX is not used by YYY, therefore those
+        weights are discarded.
+
+        Parameters:
+            pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*):
+                Can be either:
+
+                    - A string, the *repo id* of a pretrained pipeline hosted inside a model repo on
+                      https://huggingface.co/. Valid repo ids have to be located under a user or organization name,
+                      like `CompVis/ldm-text2im-large-256`.
+                    - A path to a *directory* containing pipeline weights saved using
+                      [`~DiffusionPipeline.save_pretrained`], e.g., `./my_pipeline_directory/`.
+            torch_dtype (`str` or `torch.dtype`, *optional*):
+                Override the default `torch.dtype` and load the model under this dtype. If `"auto"` is passed the dtype
+                will be automatically derived from the model's weights.
+            custom_pipeline (`str`, *optional*):
+
+                This is an experimental feature and is likely to change in the future.
+
+                Can be either:
+
+                    - A string, the *repo id* of a custom pipeline hosted inside a model repo on
+                      https://huggingface.co/. Valid repo ids have to be located under a user or organization name,
+                      like `hf-internal-testing/diffusers-dummy-pipeline`.
+
+                      It is required that the model repo has a file, called `pipeline.py`, that defines the custom
+                      pipeline.
+
+                    - A string, the *file name* of a community pipeline hosted on GitHub under
+                      https://github.com/huggingface/diffusers/tree/main/examples/community. Valid file names have to
+                      match exactly the file name without `.py` located under the above link, *e.g.*
+                      `clip_guided_stable_diffusion`.
+
+                      Community pipelines are always loaded from the current `main` branch of GitHub.
+
+                    - A path to a *directory* containing a custom pipeline, e.g., `./my_pipeline_directory/`.
+
+                      It is required that the directory has a file, called `pipeline.py`, that defines the custom
+                      pipeline.
+
+                For more information on how to load and create custom pipelines, please have a look at [Loading and
+                Adding Custom
+                Pipelines](https://huggingface.co/docs/diffusers/using-diffusers/custom_pipeline_overview).
+            force_download (`bool`, *optional*, defaults to `False`):
+                Whether or not to force the (re-)download of the model weights and configuration files, overriding the
+                cached versions if they exist.
+            resume_download (`bool`, *optional*, defaults to `False`):
+                Whether or not to delete incompletely received files. Will attempt to resume the download if such a
+                file exists.
+            proxies (`Dict[str, str]`, *optional*):
+                A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
+                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
+            output_loading_info (`bool`, *optional*, defaults to `False`):
+                Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages.
+            local_files_only (`bool`, *optional*, defaults to `False`):
+                Whether or not to only look at local files (i.e., do not try to download the model).
+            use_auth_token (`str` or *bool*, *optional*):
+                The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
+                when running `huggingface-cli login` (stored in `~/.huggingface`).
+            revision (`str`, *optional*, defaults to `"main"`):
+                The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
+                git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
+                identifier allowed by git.
+            custom_revision (`str`, *optional*, defaults to `"main"` when loading from the Hub and to the local version of `diffusers` when loading from GitHub):
+                The specific model version to use. It can be a branch name, a tag name, or a commit id similar to
+                `revision` when loading a custom pipeline from the Hub. It can be a diffusers version when loading a
+                custom pipeline from GitHub.
+            mirror (`str`, *optional*):
+                Mirror source to accelerate downloads in China. If you are from China and have an accessibility
+                problem, you can set this option to resolve it. Note that we do not guarantee the timeliness or safety
+                of the mirror. Please refer to the mirror site for more information.
+            device_map (`str` or `Dict[str, Union[int, str, torch.device]]`, *optional*):
+                A map that specifies where each submodule should go. It doesn't need to be refined to each
+                parameter/buffer name; once a given module name is inside, every submodule of it will be sent to the
+                same device.
+
+                To have Accelerate compute the most optimized `device_map` automatically, set `device_map="auto"`. For
+                more information about each option see [designing a device
+                map](https://hf.co/docs/accelerate/main/en/usage_guides/big_modeling#designing-a-device-map).
+            low_cpu_mem_usage (`bool`, *optional*, defaults to `True` if torch version >= 1.9.0 else `False`):
+                Speed up model loading by not initializing the weights and only loading the pre-trained weights. This
+                also tries to not use more than 1x model size in CPU memory (including peak memory) while loading the
+                model. This is only supported when torch version >= 1.9.0. If you are using an older version of torch,
+                setting this argument to `True` will raise an error.
+            return_cached_folder (`bool`, *optional*, defaults to `False`):
+                If set to `True`, the path to the downloaded cached folder will be returned in addition to the loaded
+                pipeline.
+            kwargs (remaining dictionary of keyword arguments, *optional*):
+                Can be used to overwrite load- and saveable variables, *i.e.* the pipeline components, of the specific
+                pipeline class. The overwritten components are then directly passed to the pipeline's `__init__`
+                method. See example below for more information.
+
+        It is required to be logged in (`huggingface-cli login`) when you want to use private or [gated
+        models](https://huggingface.co/docs/hub/models-gated#gated-models), *e.g.* `"runwayml/stable-diffusion-v1-5"`.
+
+        Activate the special ["offline-mode"](https://huggingface.co/diffusers/installation.html#offline-mode) to use
+        this method in a firewalled environment.
+
+        Examples:
+
+        ```py
+        >>> from diffusers import DiffusionPipeline
+
+        >>> # Download pipeline from huggingface.co and cache.
+ >>> pipeline = DiffusionPipeline.from_pretrained("CompVis/ldm-text2im-large-256") + + >>> # Download pipeline that requires an authorization token + >>> # For more information on access tokens, please refer to this section + >>> # of the documentation](https://huggingface.co/docs/hub/security-tokens) + >>> pipeline = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") + + >>> # Use a different scheduler + >>> from diffusers import LMSDiscreteScheduler + + >>> scheduler = LMSDiscreteScheduler.from_config(pipeline.scheduler.config) + >>> pipeline.scheduler = scheduler + ``` + """ + cache_dir = kwargs.pop("cache_dir", DIFFUSERS_CACHE) + resume_download = kwargs.pop("resume_download", False) + force_download = kwargs.pop("force_download", False) + proxies = kwargs.pop("proxies", None) + local_files_only = kwargs.pop("local_files_only", HF_HUB_OFFLINE) + use_auth_token = kwargs.pop("use_auth_token", None) + revision = kwargs.pop("revision", None) + torch_dtype = kwargs.pop("torch_dtype", None) + custom_pipeline = kwargs.pop("custom_pipeline", None) + custom_revision = kwargs.pop("custom_revision", None) + provider = kwargs.pop("provider", None) + sess_options = kwargs.pop("sess_options", None) + device_map = kwargs.pop("device_map", None) + low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", _LOW_CPU_MEM_USAGE_DEFAULT) + return_cached_folder = kwargs.pop("return_cached_folder", False) + + # 1. Download the checkpoints and configs + # use snapshot download here to get it working from from_pretrained + if not os.path.isdir(pretrained_model_name_or_path): + config_dict = cls.load_config( + pretrained_model_name_or_path, + cache_dir=cache_dir, + resume_download=resume_download, + force_download=force_download, + proxies=proxies, + local_files_only=local_files_only, + use_auth_token=use_auth_token, + revision=revision, + ) + # make sure we only download sub-folders and `diffusers` filenames + folder_names = [k for k in config_dict.keys() if not k.startswith("_")] + allow_patterns = [os.path.join(k, "*") for k in folder_names] + allow_patterns += [WEIGHTS_NAME, SCHEDULER_CONFIG_NAME, CONFIG_NAME, ONNX_WEIGHTS_NAME, cls.config_name] + + # make sure we don't download flax weights + ignore_patterns = ["*.msgpack"] + + if custom_pipeline is not None: + allow_patterns += [CUSTOM_PIPELINE_FILE_NAME] + + if cls != DiffusionPipeline: + requested_pipeline_class = cls.__name__ + else: + requested_pipeline_class = config_dict.get("_class_name", cls.__name__) + user_agent = {"pipeline_class": requested_pipeline_class} + if custom_pipeline is not None and not custom_pipeline.endswith(".py"): + user_agent["custom_pipeline"] = custom_pipeline + + user_agent = http_user_agent(user_agent) + + if is_safetensors_available(): + info = model_info( + pretrained_model_name_or_path, + use_auth_token=use_auth_token, + revision=revision, + ) + if is_safetensors_compatible(info): + ignore_patterns.append("*.bin") + else: + ignore_patterns.append("*.safetensors") + + # download all allow_patterns + cached_folder = snapshot_download( + pretrained_model_name_or_path, + cache_dir=cache_dir, + resume_download=resume_download, + proxies=proxies, + local_files_only=local_files_only, + use_auth_token=use_auth_token, + revision=revision, + allow_patterns=allow_patterns, + ignore_patterns=ignore_patterns, + user_agent=user_agent, + ) + else: + cached_folder = pretrained_model_name_or_path + + config_dict = cls.load_config(cached_folder) + + # 2. 
+        # 2. Load the pipeline class, if using custom module then load it from the hub
+        # if we load from explicit class, let's use it
+        if custom_pipeline is not None:
+            if custom_pipeline.endswith(".py"):
+                path = Path(custom_pipeline)
+                # decompose into folder & file
+                file_name = path.name
+                custom_pipeline = path.parent.absolute()
+            else:
+                file_name = CUSTOM_PIPELINE_FILE_NAME
+
+            pipeline_class = get_class_from_dynamic_module(
+                custom_pipeline, module_file=file_name, cache_dir=cache_dir, revision=custom_revision
+            )
+        elif cls != DiffusionPipeline:
+            pipeline_class = cls
+        else:
+            diffusers_module = importlib.import_module(cls.__module__.split(".")[0])
+            pipeline_class = getattr(diffusers_module, config_dict["_class_name"])
+
+        # To be removed in 1.0.0
+        if pipeline_class.__name__ == "StableDiffusionInpaintPipeline" and version.parse(
+            version.parse(config_dict["_diffusers_version"]).base_version
+        ) <= version.parse("0.5.1"):
+            from diffusers import StableDiffusionInpaintPipeline, StableDiffusionInpaintPipelineLegacy
+
+            pipeline_class = StableDiffusionInpaintPipelineLegacy
+
+            deprecation_message = (
+                "You are using a legacy checkpoint for inpainting with Stable Diffusion, therefore we are loading the"
+                f" {StableDiffusionInpaintPipelineLegacy} class instead of {StableDiffusionInpaintPipeline}. For"
+                " better inpainting results, we strongly suggest using Stable Diffusion's official inpainting"
+                " checkpoint: https://huggingface.co/runwayml/stable-diffusion-inpainting instead or adapting your"
+                f" checkpoint {pretrained_model_name_or_path} to the format of"
+                " https://huggingface.co/runwayml/stable-diffusion-inpainting. Note that we do not actively maintain"
+                f" the {StableDiffusionInpaintPipelineLegacy} class and will likely remove it in version 1.0.0."
+            )
+            deprecate("StableDiffusionInpaintPipelineLegacy", "1.0.0", deprecation_message, standard_warn=False)
+
+        # some modules can be passed directly to the init
+        # in this case they are already instantiated in `kwargs`
+        # extract them here
+        expected_modules, optional_kwargs = cls._get_signature_keys(pipeline_class)
+        passed_class_obj = {k: kwargs.pop(k) for k in expected_modules if k in kwargs}
+        passed_pipe_kwargs = {k: kwargs.pop(k) for k in optional_kwargs if k in kwargs}
+
+        init_dict, unused_kwargs, _ = pipeline_class.extract_init_dict(config_dict, **kwargs)
+
+        # define init kwargs
+        init_kwargs = {k: init_dict.pop(k) for k in optional_kwargs if k in init_dict}
+        init_kwargs = {**init_kwargs, **passed_pipe_kwargs}
+
+        # remove `null` components
+        def load_module(name, value):
+            if value[0] is None:
+                return False
+            if name in passed_class_obj and passed_class_obj[name] is None:
+                return False
+            return True
+
+        init_dict = {k: v for k, v in init_dict.items() if load_module(k, v)}
+
+        if len(unused_kwargs) > 0:
+            logger.warning(
+                f"Keyword arguments {unused_kwargs} are not expected by {pipeline_class.__name__} and will be ignored."
+            )
+
+        if low_cpu_mem_usage and not is_accelerate_available():
+            low_cpu_mem_usage = False
+            logger.warning(
+                "Cannot initialize model with low cpu memory usage because `accelerate` was not found in the"
+                " environment. Defaulting to `low_cpu_mem_usage=False`. It is strongly recommended to install"
+                " `accelerate` for faster and less memory-intense model loading. You can do so with: \n```\npip"
+                " install accelerate\n```\n."
+            )
+
+        if device_map is not None and not is_torch_version(">=", "1.9.0"):
+            raise NotImplementedError(
+                "Loading and dispatching requires torch >= 1.9.0. Please either update your PyTorch version or set"
+                " `device_map=None`."
+            )
+
+        if low_cpu_mem_usage is True and not is_torch_version(">=", "1.9.0"):
+            raise NotImplementedError(
+                "Low memory initialization requires torch >= 1.9.0. Please either update your PyTorch version or set"
+                " `low_cpu_mem_usage=False`."
+            )
+
+        if low_cpu_mem_usage is False and device_map is not None:
+            raise ValueError(
+                f"You cannot set `low_cpu_mem_usage` to False while using device_map={device_map} for loading and"
+                " dispatching. Please make sure to set `low_cpu_mem_usage=True`."
+            )
+
+        # import it here to avoid circular import
+        from diffusers import pipelines
+
+        # 3. Load each module in the pipeline
+        for name, (library_name, class_name) in init_dict.items():
+            # 3.1 - now that JAX/Flax is an official framework of the library, we might load from Flax names
+            if class_name.startswith("Flax"):
+                class_name = class_name[4:]
+
+            is_pipeline_module = hasattr(pipelines, library_name)
+            loaded_sub_model = None
+
+            # if the model is in a pipeline module, then we load it from the pipeline
+            if name in passed_class_obj:
+                # 1. check that passed_class_obj has correct parent class
+                if not is_pipeline_module:
+                    library = importlib.import_module(library_name)
+                    class_obj = getattr(library, class_name)
+                    importable_classes = LOADABLE_CLASSES[library_name]
+                    class_candidates = {c: getattr(library, c, None) for c in importable_classes.keys()}
+
+                    expected_class_obj = None
+                    for class_name, class_candidate in class_candidates.items():
+                        if class_candidate is not None and issubclass(class_obj, class_candidate):
+                            expected_class_obj = class_candidate
+
+                    if not issubclass(passed_class_obj[name].__class__, expected_class_obj):
+                        raise ValueError(
+                            f"{passed_class_obj[name]} is of type: {type(passed_class_obj[name])}, but should be"
+                            f" {expected_class_obj}"
+                        )
+                else:
+                    logger.warning(
+                        f"You have passed a non-standard module {passed_class_obj[name]}. We cannot verify whether it"
+                        " has the correct type"
+                    )
+
+                # set passed class object
+                loaded_sub_model = passed_class_obj[name]
+            elif is_pipeline_module:
+                pipeline_module = getattr(pipelines, library_name)
+                class_obj = getattr(pipeline_module, class_name)
+                importable_classes = ALL_IMPORTABLE_CLASSES
+                class_candidates = {c: class_obj for c in importable_classes.keys()}
+            else:
+                # else we just import it from the library.
+                library = importlib.import_module(library_name)
+
+                class_obj = getattr(library, class_name)
+                importable_classes = LOADABLE_CLASSES[library_name]
+                class_candidates = {c: getattr(library, c, None) for c in importable_classes.keys()}
+
+            if loaded_sub_model is None:
+                load_method_name = None
+                for class_name, class_candidate in class_candidates.items():
+                    if class_candidate is not None and issubclass(class_obj, class_candidate):
+                        load_method_name = importable_classes[class_name][1]
+
+                if load_method_name is None:
+                    none_module = class_obj.__module__
+                    is_dummy_path = none_module.startswith(DUMMY_MODULES_FOLDER) or none_module.startswith(
+                        TRANSFORMERS_DUMMY_MODULES_FOLDER
+                    )
+                    if is_dummy_path and "dummy" in none_module:
+                        # call class_obj for nice error message of missing requirements
+                        class_obj()
+
+                    raise ValueError(
+                        f"The component {class_obj} of {pipeline_class} cannot be loaded as it does not seem to have"
+                        f" any of the loading methods defined in {ALL_IMPORTABLE_CLASSES}."
+                    )
+
+                load_method = getattr(class_obj, load_method_name)
+                loading_kwargs = {}
+
+                if issubclass(class_obj, torch.nn.Module):
+                    loading_kwargs["torch_dtype"] = torch_dtype
+                if issubclass(class_obj, diffusers.OnnxRuntimeModel):
+                    loading_kwargs["provider"] = provider
+                    loading_kwargs["sess_options"] = sess_options
+
+                is_diffusers_model = issubclass(class_obj, diffusers.ModelMixin)
+                is_transformers_model = (
+                    is_transformers_available()
+                    and issubclass(class_obj, PreTrainedModel)
+                    and version.parse(version.parse(transformers.__version__).base_version) >= version.parse("4.20.0")
+                )
+
+                # When loading a transformers model, the weights will be initialized if `device_map` is None, unlike
+                # for diffusers models. To make default loading faster we set the
+                # `low_cpu_mem_usage=low_cpu_mem_usage` flag, which is `True` by default.
+                # This makes sure that the weights won't be initialized, which significantly speeds up loading.
+                if is_diffusers_model or is_transformers_model:
+                    loading_kwargs["device_map"] = device_map
+                    loading_kwargs["low_cpu_mem_usage"] = low_cpu_mem_usage
+
+                # check if the module is in a subdirectory
+                if os.path.isdir(os.path.join(cached_folder, name)):
+                    loaded_sub_model = load_method(os.path.join(cached_folder, name), **loading_kwargs)
+                else:
+                    # else load from the root directory
+                    loaded_sub_model = load_method(cached_folder, **loading_kwargs)
+
+            init_kwargs[name] = loaded_sub_model  # UNet(...), # DiffusionScheduler(...)
+
+        # 4. Potentially add passed objects if expected
+        missing_modules = set(expected_modules) - set(init_kwargs.keys())
+        passed_modules = list(passed_class_obj.keys())
+        optional_modules = pipeline_class._optional_components
+        if len(missing_modules) > 0 and missing_modules <= set(passed_modules + optional_modules):
+            for module in missing_modules:
+                init_kwargs[module] = passed_class_obj.get(module, None)
+        elif len(missing_modules) > 0:
+            passed_modules = set(list(init_kwargs.keys()) + list(passed_class_obj.keys())) - optional_kwargs
+            raise ValueError(
+                f"Pipeline {pipeline_class} expected {expected_modules}, but only {passed_modules} were passed."
+            )
+
+        # 5. Instantiate the pipeline
+        model = pipeline_class(**init_kwargs)
+
+        if return_cached_folder:
+            return model, cached_folder
+        return model
+
+    @staticmethod
+    def _get_signature_keys(obj):
+        parameters = inspect.signature(obj.__init__).parameters
+        required_parameters = {k: v for k, v in parameters.items() if v.default == inspect._empty}
+        optional_parameters = set({k for k, v in parameters.items() if v.default != inspect._empty})
+        expected_modules = set(required_parameters.keys()) - set(["self"])
+        return expected_modules, optional_parameters
+
+    @property
+    def components(self) -> Dict[str, Any]:
+        r"""
+
+        The `self.components` property can be useful to run different pipelines with the same weights and
+        configurations without re-allocating memory.
+
+        Examples:
+
+        ```py
+        >>> from diffusers import (
+        ...     StableDiffusionPipeline,
+        ...     StableDiffusionImg2ImgPipeline,
+        ...     StableDiffusionInpaintPipeline,
+        ... )
+
+        >>> text2img = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
+        >>> img2img = StableDiffusionImg2ImgPipeline(**text2img.components)
+        >>> inpaint = StableDiffusionInpaintPipeline(**text2img.components)
+        ```
+
+        Returns:
+            A dictionary containing all the modules needed to initialize the pipeline.
+ """ + expected_modules, optional_parameters = self._get_signature_keys(self) + components = { + k: getattr(self, k) for k in self.config.keys() if not k.startswith("_") and k not in optional_parameters + } + + if set(components.keys()) != expected_modules: + raise ValueError( + f"{self} has been incorrectly initialized or {self.__class__} is incorrectly implemented. Expected" + f" {expected_modules} to be defined, but {components} are defined." + ) + + return components + + @staticmethod + def numpy_to_pil(images): + """ + Convert a numpy image or a batch of images to a PIL image. + """ + if images.ndim == 3: + images = images[None, ...] + images = (images * 255).round().astype("uint8") + if images.shape[-1] == 1: + # special case for grayscale (single channel) images + pil_images = [Image.fromarray(image.squeeze(), mode="L") for image in images] + else: + pil_images = [Image.fromarray(image) for image in images] + + return pil_images + + def progress_bar(self, iterable=None, total=None): + if not hasattr(self, "_progress_bar_config"): + self._progress_bar_config = {} + elif not isinstance(self._progress_bar_config, dict): + raise ValueError( + f"`self._progress_bar_config` should be of type `dict`, but is {type(self._progress_bar_config)}." + ) + + if iterable is not None: + return tqdm(iterable, **self._progress_bar_config) + elif total is not None: + return tqdm(total=total, **self._progress_bar_config) + else: + raise ValueError("Either `total` or `iterable` has to be defined.") + + def set_progress_bar_config(self, **kwargs): + self._progress_bar_config = kwargs + + def enable_xformers_memory_efficient_attention(self): + r""" + Enable memory efficient attention as implemented in xformers. + + When this option is enabled, you should observe lower GPU memory usage and a potential speed up at inference + time. Speed up at training time is not guaranteed. + + Warning: When Memory Efficient Attention and Sliced attention are both enabled, the Memory Efficient Attention + is used. + """ + self.set_use_memory_efficient_attention_xformers(True) + + def disable_xformers_memory_efficient_attention(self): + r""" + Disable memory efficient attention as implemented in xformers. + """ + self.set_use_memory_efficient_attention_xformers(False) + + def set_use_memory_efficient_attention_xformers(self, valid: bool) -> None: + # Recursively walk through all the children. + # Any children which exposes the set_use_memory_efficient_attention_xformers method + # gets the message + def fn_recursive_set_mem_eff(module: torch.nn.Module): + if hasattr(module, "set_use_memory_efficient_attention_xformers"): + module.set_use_memory_efficient_attention_xformers(valid) + + for child in module.children(): + fn_recursive_set_mem_eff(child) + + module_names, _, _ = self.extract_init_dict(dict(self.config)) + for module_name in module_names: + module = getattr(self, module_name) + if isinstance(module, torch.nn.Module): + fn_recursive_set_mem_eff(module) + + def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"): + r""" + Enable sliced attention computation. + + When this option is enabled, the attention module will split the input tensor in slices, to compute attention + in several steps. This is useful to save some memory in exchange for a small speed decrease. + + Args: + slice_size (`str` or `int`, *optional*, defaults to `"auto"`): + When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. 
+                If `"max"`, maximum amount of memory will be saved by running only one slice at a time. If a number is
+                provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim`
+                must be a multiple of `slice_size`.
+        """
+        self.set_attention_slice(slice_size)
+
+    def disable_attention_slicing(self):
+        r"""
+        Disable sliced attention computation. If `enable_attention_slicing` was previously invoked, this method will go
+        back to computing attention in one step.
+        """
+        # set slice_size = `None` to disable `attention slicing`
+        self.enable_attention_slicing(None)
+
+    def set_attention_slice(self, slice_size: Optional[int]):
+        module_names, _, _ = self.extract_init_dict(dict(self.config))
+        for module_name in module_names:
+            module = getattr(self, module_name)
+            if isinstance(module, torch.nn.Module) and hasattr(module, "set_attention_slice"):
+                module.set_attention_slice(slice_size)
diff --git a/src/diffusers/pipelines/pndm/pipeline_pndm.py b/src/diffusers/pipelines/pndm/pipeline_pndm.py
index 020dd7c4..34204b12 100644
--- a/src/diffusers/pipelines/pndm/pipeline_pndm.py
+++ b/src/diffusers/pipelines/pndm/pipeline_pndm.py
@@ -18,8 +18,8 @@ from typing import List, Optional, Tuple, Union
 
 import torch
 
 from ...models import UNet2DModel
-from ...pipeline_utils import DiffusionPipeline, ImagePipelineOutput
 from ...schedulers import PNDMScheduler
+from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
 
 
 class PNDMPipeline(DiffusionPipeline):
@@ -62,12 +62,11 @@ class PNDMPipeline(DiffusionPipeline):
             output_type (`str`, `optional`, defaults to `"pil"`): The output format of the generate image. Choose
                 between [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
             return_dict (`bool`, `optional`, defaults to `True`): Whether or not to return a
-                [`~pipeline_utils.ImagePipelineOutput`] instead of a plain tuple.
+                [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
 
         Returns:
-            [`~pipeline_utils.ImagePipelineOutput`] or `tuple`: [`~pipelines.utils.ImagePipelineOutput`] if
-            `return_dict` is True, otherwise a `tuple. When returning a tuple, the first element is a list with the
-            generated images.
+            [`~pipelines.ImagePipelineOutput`] or `tuple`: [`~pipelines.utils.ImagePipelineOutput`] if `return_dict` is
+            True, otherwise a `tuple`. When returning a tuple, the first element is a list with the generated images.
         """
         # For more information on the sampling method you can take a look at Algorithm 2 of
         # the official paper: https://arxiv.org/pdf/2202.09778.pdf
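The filename rule that `is_safetensors_compatible` (added above) applies when deciding whether `*.bin` weights can be skipped is easy to restate in isolation. A sketch of that mapping with an illustrative helper name, not a public API:

```py
import os

def expected_safetensors_name(pt_filename: str) -> str:
    """Mirror of the mapping inside `is_safetensors_compatible` (illustrative only)."""
    prefix, raw = os.path.split(pt_filename)
    if raw == "pytorch_model.bin":
        # transformers-style weights map to model.safetensors
        return os.path.join(prefix, "model.safetensors")
    # diffusers-style weights just swap the extension
    return pt_filename[: -len(".bin")] + ".safetensors"

print(expected_safetensors_name("unet/diffusion_pytorch_model.bin"))  # unet/diffusion_pytorch_model.safetensors
print(expected_safetensors_name("text_encoder/pytorch_model.bin"))  # text_encoder/model.safetensors
```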
diff --git a/src/diffusers/pipelines/repaint/pipeline_repaint.py b/src/diffusers/pipelines/repaint/pipeline_repaint.py
index a316e5bc..32374e9a 100644
--- a/src/diffusers/pipelines/repaint/pipeline_repaint.py
+++ b/src/diffusers/pipelines/repaint/pipeline_repaint.py
@@ -21,9 +21,9 @@ import torch
 import PIL
 
 from ...models import UNet2DModel
-from ...pipeline_utils import DiffusionPipeline, ImagePipelineOutput
 from ...schedulers import RePaintScheduler
 from ...utils import PIL_INTERPOLATION, deprecate, logging
+from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
 
 
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
@@ -118,12 +118,11 @@ class RePaintPipeline(DiffusionPipeline):
                 The output format of the generate image. Choose between
                 [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
             return_dict (`bool`, *optional*, defaults to `True`):
-                Whether or not to return a [`~pipeline_utils.ImagePipelineOutput`] instead of a plain tuple.
+                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
 
         Returns:
-            [`~pipeline_utils.ImagePipelineOutput`] or `tuple`: [`~pipelines.utils.ImagePipelineOutput`] if
-            `return_dict` is True, otherwise a `tuple. When returning a tuple, the first element is a list with the
-            generated images.
+            [`~pipelines.ImagePipelineOutput`] or `tuple`: [`~pipelines.utils.ImagePipelineOutput`] if `return_dict` is
+            True, otherwise a `tuple`. When returning a tuple, the first element is a list with the generated images.
         """
 
         message = "Please use `image` instead of `original_image`."
diff --git a/src/diffusers/pipelines/score_sde_ve/pipeline_score_sde_ve.py b/src/diffusers/pipelines/score_sde_ve/pipeline_score_sde_ve.py
index 8dc7001a..a53d0840 100644
--- a/src/diffusers/pipelines/score_sde_ve/pipeline_score_sde_ve.py
+++ b/src/diffusers/pipelines/score_sde_ve/pipeline_score_sde_ve.py
@@ -17,8 +17,8 @@ from typing import List, Optional, Tuple, Union
 
 import torch
 
 from ...models import UNet2DModel
-from ...pipeline_utils import DiffusionPipeline, ImagePipelineOutput
 from ...schedulers import ScoreSdeVeScheduler
+from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
 
 
 class ScoreSdeVePipeline(DiffusionPipeline):
@@ -57,12 +57,11 @@ class ScoreSdeVePipeline(DiffusionPipeline):
                 The output format of the generate image. Choose between
                 [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
             return_dict (`bool`, *optional*, defaults to `True`):
-                Whether or not to return a [`~pipeline_utils.ImagePipelineOutput`] instead of a plain tuple.
+                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
 
         Returns:
-            [`~pipeline_utils.ImagePipelineOutput`] or `tuple`: [`~pipelines.utils.ImagePipelineOutput`] if
-            `return_dict` is True, otherwise a `tuple. When returning a tuple, the first element is a list with the
-            generated images.
+            [`~pipelines.ImagePipelineOutput`] or `tuple`: [`~pipelines.utils.ImagePipelineOutput`] if `return_dict` is
+            True, otherwise a `tuple`. When returning a tuple, the first element is a list with the generated images.
         """
         img_size = self.unet.config.sample_size
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py
index 3e8917bd..4392d9d8 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py
@@ -25,9 +25,9 @@ from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
 
 from ...configuration_utils import FrozenDict
 from ...models import AutoencoderKL, UNet2DConditionModel
-from ...pipeline_utils import DiffusionPipeline
 from ...schedulers import DDIMScheduler
 from ...utils import PIL_INTERPOLATION, deprecate, logging
+from ..pipeline_utils import DiffusionPipeline
 from . import StableDiffusionPipelineOutput
 from .safety_checker import StableDiffusionSafetyChecker
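Because `components` (defined in the new `pipeline_utils.py` above) returns the live modules rather than copies, sibling Stable Diffusion pipelines like the ones touched below can share one set of loaded weights. A minimal sketch:

```py
from diffusers import StableDiffusionImg2ImgPipeline, StableDiffusionPipeline

text2img = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
img2img = StableDiffusionImg2ImgPipeline(**text2img.components)

# The modules are shared, not copied, so no extra memory is allocated
assert img2img.unet is text2img.unet
```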
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py
index b4f57668..d5add171 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion.py
@@ -28,7 +28,6 @@ from PIL import Image
 from transformers import CLIPFeatureExtractor, CLIPTokenizer, FlaxCLIPTextModel
 
 from ...models import FlaxAutoencoderKL, FlaxUNet2DConditionModel
-from ...pipeline_flax_utils import FlaxDiffusionPipeline
 from ...schedulers import (
     FlaxDDIMScheduler,
     FlaxDPMSolverMultistepScheduler,
@@ -36,6 +35,7 @@ from ...schedulers import (
     FlaxPNDMScheduler,
 )
 from ...utils import deprecate, logging
+from ..pipeline_flax_utils import FlaxDiffusionPipeline
 from . import FlaxStableDiffusionPipelineOutput
 from .safety_checker_flax import FlaxStableDiffusionSafetyChecker
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_img2img.py
index 5eeede7d..16ac2ba1 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_img2img.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_flax_stable_diffusion_img2img.py
@@ -27,7 +27,6 @@ from PIL import Image
 from transformers import CLIPFeatureExtractor, CLIPTokenizer, FlaxCLIPTextModel
 
 from ...models import FlaxAutoencoderKL, FlaxUNet2DConditionModel
-from ...pipeline_flax_utils import FlaxDiffusionPipeline
 from ...schedulers import (
     FlaxDDIMScheduler,
     FlaxDPMSolverMultistepScheduler,
@@ -35,6 +34,7 @@ from ...schedulers import (
     FlaxPNDMScheduler,
 )
 from ...utils import PIL_INTERPOLATION, logging
+from ..pipeline_flax_utils import FlaxDiffusionPipeline
 from . import FlaxStableDiffusionPipelineOutput
 from .safety_checker_flax import FlaxStableDiffusionSafetyChecker
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py
index 1b9a8ff7..d7ae6aa8 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py
@@ -21,10 +21,10 @@ import torch
 from transformers import CLIPFeatureExtractor, CLIPTokenizer
 
 from ...configuration_utils import FrozenDict
-from ...onnx_utils import ORT_TO_NP_TYPE, OnnxRuntimeModel
-from ...pipeline_utils import DiffusionPipeline
 from ...schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
 from ...utils import deprecate, logging
+from ..onnx_utils import ORT_TO_NP_TYPE, OnnxRuntimeModel
+from ..pipeline_utils import DiffusionPipeline
 from . import StableDiffusionPipelineOutput
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py
index 71b5fdbb..6e216a67 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py
@@ -22,10 +22,10 @@ import PIL
 from transformers import CLIPFeatureExtractor, CLIPTokenizer
 
 from ...configuration_utils import FrozenDict
-from ...onnx_utils import ORT_TO_NP_TYPE, OnnxRuntimeModel
-from ...pipeline_utils import DiffusionPipeline
 from ...schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
 from ...utils import PIL_INTERPOLATION, deprecate, logging
+from ..onnx_utils import ORT_TO_NP_TYPE, OnnxRuntimeModel
+from ..pipeline_utils import DiffusionPipeline
 from . import StableDiffusionPipelineOutput
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py
index 930d61de..f7bb440a 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint.py
@@ -22,10 +22,10 @@ import PIL
 from transformers import CLIPFeatureExtractor, CLIPTokenizer
 
 from ...configuration_utils import FrozenDict
-from ...onnx_utils import ORT_TO_NP_TYPE, OnnxRuntimeModel
-from ...pipeline_utils import DiffusionPipeline
 from ...schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
 from ...utils import PIL_INTERPOLATION, deprecate, logging
+from ..onnx_utils import ORT_TO_NP_TYPE, OnnxRuntimeModel
+from ..pipeline_utils import DiffusionPipeline
 from . import StableDiffusionPipelineOutput
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint_legacy.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint_legacy.py
index d32586fe..98914eaa 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint_legacy.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_inpaint_legacy.py
@@ -8,10 +8,10 @@ import PIL
 from transformers import CLIPFeatureExtractor, CLIPTokenizer
 
 from ...configuration_utils import FrozenDict
-from ...onnx_utils import OnnxRuntimeModel
-from ...pipeline_utils import DiffusionPipeline
 from ...schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
 from ...utils import deprecate, logging
+from ..onnx_utils import OnnxRuntimeModel
+from ..pipeline_utils import DiffusionPipeline
 from . import StableDiffusionPipelineOutput
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
index 37a041bd..b9b87e8f 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
@@ -23,7 +23,6 @@ from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
 
 from ...configuration_utils import FrozenDict
 from ...models import AutoencoderKL, UNet2DConditionModel
-from ...pipeline_utils import DiffusionPipeline
 from ...schedulers import (
     DDIMScheduler,
     DPMSolverMultistepScheduler,
@@ -33,6 +32,7 @@ from ...schedulers import (
     PNDMScheduler,
 )
 from ...utils import deprecate, logging
+from ..pipeline_utils import DiffusionPipeline
 from . import StableDiffusionPipelineOutput
 from .safety_checker import StableDiffusionSafetyChecker
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py
index dfd3d233..e86f2d72 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py
@@ -26,7 +26,6 @@ from transformers import CLIPTextModel, CLIPTokenizer, DPTFeatureExtractor, DPTF
 
 from ...configuration_utils import FrozenDict
 from ...models import AutoencoderKL, UNet2DConditionModel
-from ...pipeline_utils import DiffusionPipeline, ImagePipelineOutput
 from ...schedulers import (
     DDIMScheduler,
     DPMSolverMultistepScheduler,
@@ -36,6 +35,7 @@ from ...schedulers import (
     PNDMScheduler,
 )
 from ...utils import PIL_INTERPOLATION, deprecate, logging
+from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
 
 
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py
index 975afddf..fd2d4afb 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py
@@ -24,7 +24,6 @@ from transformers import CLIPFeatureExtractor, CLIPVisionModelWithProjection
 
 from ...configuration_utils import FrozenDict
 from ...models import AutoencoderKL, UNet2DConditionModel
-from ...pipeline_utils import DiffusionPipeline
 from ...schedulers import (
     DDIMScheduler,
     DPMSolverMultistepScheduler,
@@ -34,6 +33,7 @@ from ...schedulers import (
     PNDMScheduler,
 )
 from ...utils import deprecate, logging
+from ..pipeline_utils import DiffusionPipeline
 from . import StableDiffusionPipelineOutput
 from .safety_checker import StableDiffusionSafetyChecker
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
index 857ad6f3..0213a1a5 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py
@@ -25,7 +25,6 @@ from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
 
 from ...configuration_utils import FrozenDict
 from ...models import AutoencoderKL, UNet2DConditionModel
-from ...pipeline_utils import DiffusionPipeline
 from ...schedulers import (
     DDIMScheduler,
     DPMSolverMultistepScheduler,
@@ -35,6 +34,7 @@ from ...schedulers import (
     PNDMScheduler,
 )
 from ...utils import PIL_INTERPOLATION, deprecate, logging
+from ..pipeline_utils import DiffusionPipeline
 from . import StableDiffusionPipelineOutput
 from .safety_checker import StableDiffusionSafetyChecker
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
index 985b138d..77e1df6e 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
@@ -25,9 +25,9 @@ from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
 
 from ...configuration_utils import FrozenDict
 from ...models import AutoencoderKL, UNet2DConditionModel
-from ...pipeline_utils import DiffusionPipeline
 from ...schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
 from ...utils import deprecate, logging
+from ..pipeline_utils import DiffusionPipeline
 from . import StableDiffusionPipelineOutput
 from .safety_checker import StableDiffusionSafetyChecker
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
index 429e2da2..40e026e2 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
@@ -25,7 +25,6 @@ from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
 
 from ...configuration_utils import FrozenDict
 from ...models import AutoencoderKL, UNet2DConditionModel
-from ...pipeline_utils import DiffusionPipeline
 from ...schedulers import (
     DDIMScheduler,
     DPMSolverMultistepScheduler,
@@ -35,6 +34,7 @@ from ...schedulers import (
     PNDMScheduler,
 )
 from ...utils import PIL_INTERPOLATION, deprecate, logging
+from ..pipeline_utils import DiffusionPipeline
 from . import StableDiffusionPipelineOutput
 from .safety_checker import StableDiffusionSafetyChecker
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py
index 1bb0cb05..c3915272 100755
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py
@@ -19,7 +19,7 @@ import torch
 
 from k_diffusion.external import CompVisDenoiser, CompVisVDenoiser
 
-from ... import DiffusionPipeline
+from ...pipelines import DiffusionPipeline
 from ...schedulers import LMSDiscreteScheduler
 from ...utils import is_accelerate_available, logging
 from . import StableDiffusionPipelineOutput
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py
index 1bb0cb05..c3915272 100755
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py
@@ -19,7 +19,7 @@ import torch
 
 from k_diffusion.external import CompVisDenoiser, CompVisVDenoiser
 
-from ... import DiffusionPipeline
+from ...pipelines import DiffusionPipeline
 from ...schedulers import LMSDiscreteScheduler
 from ...utils import is_accelerate_available, logging
 from . import StableDiffusionPipelineOutput
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py
index 528ed0dc..236ecea5 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py
@@ -23,9 +23,9 @@ from diffusers.utils import is_accelerate_available
 from transformers import CLIPTextModel, CLIPTokenizer
 
 from ...models import AutoencoderKL, UNet2DConditionModel
-from ...pipeline_utils import DiffusionPipeline, ImagePipelineOutput
 from ...schedulers import DDIMScheduler, DDPMScheduler, LMSDiscreteScheduler, PNDMScheduler
 from ...utils import logging
+from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
 
 
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
diff --git a/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py b/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py
index d5b163ba..c7b58bbf 100644
--- a/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py
+++ b/src/diffusers/pipelines/stable_diffusion_safe/pipeline_stable_diffusion_safe.py
@@ -10,7 +10,6 @@ from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
 
 from ...configuration_utils import FrozenDict
 from ...models import AutoencoderKL, UNet2DConditionModel
-from ...pipeline_utils import DiffusionPipeline
 from ...schedulers import (
     DDIMScheduler,
     DPMSolverMultistepScheduler,
@@ -20,6 +19,7 @@ from ...schedulers import (
     PNDMScheduler,
 )
 from ...utils import deprecate, is_accelerate_available, logging
+from ..pipeline_utils import DiffusionPipeline
 from . import StableDiffusionSafePipelineOutput
 from .safety_checker import SafeStableDiffusionSafetyChecker
diff --git a/src/diffusers/pipelines/stochastic_karras_ve/pipeline_stochastic_karras_ve.py b/src/diffusers/pipelines/stochastic_karras_ve/pipeline_stochastic_karras_ve.py
index 8da1faf9..90f86837 100644
--- a/src/diffusers/pipelines/stochastic_karras_ve/pipeline_stochastic_karras_ve.py
+++ b/src/diffusers/pipelines/stochastic_karras_ve/pipeline_stochastic_karras_ve.py
@@ -17,8 +17,8 @@ from typing import List, Optional, Tuple, Union
 import torch
 
 from ...models import UNet2DModel
-from ...pipeline_utils import DiffusionPipeline, ImagePipelineOutput
 from ...schedulers import KarrasVeScheduler
+from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
 
 
 class KarrasVePipeline(DiffusionPipeline):
@@ -68,12 +68,11 @@ class KarrasVePipeline(DiffusionPipeline):
                 The output format of the generate image. Choose between
                 [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
             return_dict (`bool`, *optional*, defaults to `True`):
-                Whether or not to return a [`~pipeline_utils.ImagePipelineOutput`] instead of a plain tuple.
+                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
 
         Returns:
-            [`~pipeline_utils.ImagePipelineOutput`] or `tuple`: [`~pipelines.utils.ImagePipelineOutput`] if
-            `return_dict` is True, otherwise a `tuple. When returning a tuple, the first element is a list with the
-            generated images.
+            [`~pipelines.ImagePipelineOutput`] or `tuple`: [`~pipelines.ImagePipelineOutput`] if `return_dict` is
+            True, otherwise a `tuple`. When returning a tuple, the first element is a list with the generated images.
         """
 
         img_size = self.unet.config.sample_size
diff --git a/src/diffusers/pipelines/unclip/pipeline_unclip.py b/src/diffusers/pipelines/unclip/pipeline_unclip.py
index a8e8cba8..52f15056 100644
--- a/src/diffusers/pipelines/unclip/pipeline_unclip.py
+++ b/src/diffusers/pipelines/unclip/pipeline_unclip.py
@@ -18,11 +18,11 @@ from typing import List, Optional, Union
 import torch
 from torch.nn import functional as F
 
-from diffusers import PriorTransformer, UNet2DConditionModel, UNet2DModel
-from diffusers.pipeline_utils import DiffusionPipeline, ImagePipelineOutput
-from diffusers.schedulers import UnCLIPScheduler
 from transformers import CLIPTextModelWithProjection, CLIPTokenizer
 
+from ...models import PriorTransformer, UNet2DConditionModel, UNet2DModel
+from ...pipelines import DiffusionPipeline, ImagePipelineOutput
+from ...schedulers import UnCLIPScheduler
 from ...utils import is_accelerate_available, logging
 from .text_proj import UnCLIPTextProjModel
 
@@ -291,7 +291,7 @@ class UnCLIPPipeline(DiffusionPipeline):
                 The output format of the generated image. Choose between
                 [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
             return_dict (`bool`, *optional*, defaults to `True`):
-                Whether or not to return a [`~pipeline_utils.ImagePipelineOutput`] instead of a plain tuple.
+                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
         """
         if isinstance(prompt, str):
             batch_size = 1
diff --git a/src/diffusers/pipelines/unclip/pipeline_unclip_image_variation.py b/src/diffusers/pipelines/unclip/pipeline_unclip_image_variation.py
index 32b95039..1d0320d3 100644
--- a/src/diffusers/pipelines/unclip/pipeline_unclip_image_variation.py
+++ b/src/diffusers/pipelines/unclip/pipeline_unclip_image_variation.py
@@ -19,9 +19,6 @@ import torch
 from torch.nn import functional as F
 
 import PIL
-from diffusers import UNet2DConditionModel, UNet2DModel
-from diffusers.pipeline_utils import DiffusionPipeline, ImagePipelineOutput
-from diffusers.schedulers import UnCLIPScheduler
 from transformers import (
     CLIPFeatureExtractor,
     CLIPTextModelWithProjection,
@@ -29,6 +26,9 @@ from transformers import (
     CLIPVisionModelWithProjection,
 )
 
+from ...models import UNet2DConditionModel, UNet2DModel
+from ...pipelines import DiffusionPipeline, ImagePipelineOutput
+from ...schedulers import UnCLIPScheduler
 from ...utils import is_accelerate_available, logging
 from .text_proj import UnCLIPTextProjModel
 
@@ -303,7 +303,7 @@ class UnCLIPImageVariationPipeline(DiffusionPipeline):
                 The output format of the generated image. Choose between
                 [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
             return_dict (`bool`, *optional*, defaults to `True`):
-                Whether or not to return a [`~pipeline_utils.ImagePipelineOutput`] instead of a plain tuple.
+                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
         """
         if isinstance(image, PIL.Image.Image):
             batch_size = 1
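These docstrings now point readers at [`~pipelines.ImagePipelineOutput`]. A short sketch of the output contract they describe, assuming a diffusers build that includes this PR; the checkpoint id is illustrative:

```python
from diffusers import DDPMPipeline
from diffusers.pipelines import ImagePipelineOutput

pipe = DDPMPipeline.from_pretrained("google/ddpm-cat-256")  # illustrative checkpoint
out = pipe(num_inference_steps=2)  # ImagePipelineOutput when return_dict=True (default)
assert isinstance(out, ImagePipelineOutput)
images = out.images  # list of PIL.Image.Image

# With return_dict=False, a plain tuple comes back; its first element is the image list.
(images_tuple,) = pipe(num_inference_steps=2, return_dict=False)
```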
""" if isinstance(image, PIL.Image.Image): batch_size = 1 diff --git a/src/diffusers/pipelines/unclip/text_proj.py b/src/diffusers/pipelines/unclip/text_proj.py index 31e7e064..44820105 100644 --- a/src/diffusers/pipelines/unclip/text_proj.py +++ b/src/diffusers/pipelines/unclip/text_proj.py @@ -15,9 +15,8 @@ import torch from torch import nn -from diffusers.modeling_utils import ModelMixin - from ...configuration_utils import ConfigMixin, register_to_config +from ...models import ModelMixin class UnCLIPTextProjModel(ModelMixin, ConfigMixin): diff --git a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py index 3d3f210c..40c6225c 100644 --- a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py +++ b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py @@ -5,7 +5,7 @@ import torch import torch.nn as nn from ...configuration_utils import ConfigMixin, register_to_config -from ...modeling_utils import ModelMixin +from ...models import ModelMixin from ...models.attention import CrossAttention, DualTransformer2DModel, Transformer2DModel from ...models.cross_attention import AttnProcessor, CrossAttnAddedKVProcessor from ...models.embeddings import TimestepEmbedding, Timesteps diff --git a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion.py b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion.py index d0202922..88e7e4b6 100644 --- a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion.py +++ b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion.py @@ -7,9 +7,9 @@ import PIL.Image from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer, CLIPVisionModel from ...models import AutoencoderKL, UNet2DConditionModel -from ...pipeline_utils import DiffusionPipeline from ...schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler from ...utils import logging +from ..pipeline_utils import DiffusionPipeline from .pipeline_versatile_diffusion_dual_guided import VersatileDiffusionDualGuidedPipeline from .pipeline_versatile_diffusion_image_variation import VersatileDiffusionImageVariationPipeline from .pipeline_versatile_diffusion_text_to_image import VersatileDiffusionTextToImagePipeline diff --git a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py index b7c22494..74902665 100644 --- a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py +++ b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_dual_guided.py @@ -27,11 +27,10 @@ from transformers import ( CLIPVisionModelWithProjection, ) -from ...models import AutoencoderKL, UNet2DConditionModel -from ...models.attention import DualTransformer2DModel, Transformer2DModel -from ...pipeline_utils import DiffusionPipeline, ImagePipelineOutput +from ...models import AutoencoderKL, DualTransformer2DModel, Transformer2DModel, UNet2DConditionModel from ...schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler from ...utils import is_accelerate_available, logging +from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput from .modeling_text_unet import UNetFlatConditionModel diff --git a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py 
diff --git a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py
index 122fd7a9..93c70688 100644
--- a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py
+++ b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_image_variation.py
@@ -23,9 +23,9 @@ import PIL
 from transformers import CLIPFeatureExtractor, CLIPVisionModelWithProjection
 
 from ...models import AutoencoderKL, UNet2DConditionModel
-from ...pipeline_utils import DiffusionPipeline, ImagePipelineOutput
 from ...schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
 from ...utils import is_accelerate_available, logging
+from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
 
 
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
diff --git a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py
index e12eabde..e05cb036 100644
--- a/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py
+++ b/src/diffusers/pipelines/versatile_diffusion/pipeline_versatile_diffusion_text_to_image.py
@@ -20,11 +20,10 @@ import torch.utils.checkpoint
 from transformers import CLIPFeatureExtractor, CLIPTextModelWithProjection, CLIPTokenizer
 
-from ...models import AutoencoderKL, UNet2DConditionModel
-from ...models.attention import Transformer2DModel
-from ...pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+from ...models import AutoencoderKL, Transformer2DModel, UNet2DConditionModel
 from ...schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
 from ...utils import is_accelerate_available, logging
+from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
 from .modeling_text_unet import UNetFlatConditionModel
diff --git a/src/diffusers/pipelines/vq_diffusion/pipeline_vq_diffusion.py b/src/diffusers/pipelines/vq_diffusion/pipeline_vq_diffusion.py
index a536f91d..bd63eda0 100644
--- a/src/diffusers/pipelines/vq_diffusion/pipeline_vq_diffusion.py
+++ b/src/diffusers/pipelines/vq_diffusion/pipeline_vq_diffusion.py
@@ -16,14 +16,13 @@ from typing import Callable, List, Optional, Tuple, Union
 
 import torch
 
-from diffusers import Transformer2DModel, VQModel
-from diffusers.schedulers.scheduling_vq_diffusion import VQDiffusionScheduler
 from transformers import CLIPTextModel, CLIPTokenizer
 
 from ...configuration_utils import ConfigMixin, register_to_config
-from ...modeling_utils import ModelMixin
-from ...pipeline_utils import DiffusionPipeline, ImagePipelineOutput
+from ...models import ModelMixin, Transformer2DModel, VQModel
+from ...schedulers import VQDiffusionScheduler
 from ...utils import logging
+from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput
 
 
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 
@@ -212,7 +211,7 @@ class VQDiffusionPipeline(DiffusionPipeline):
                 The output format of the generated image. Choose between
                 [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
             return_dict (`bool`, *optional*, defaults to `True`):
-                Whether or not to return a [`~pipeline_utils.ImagePipelineOutput`] instead of a plain tuple.
+                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
             callback (`Callable`, *optional*):
                 A function that will be called every `callback_steps` steps during inference.
                 The function will be called with the following arguments: `callback(step: int, timestep: int,
                 latents: torch.FloatTensor)`.
@@ -221,9 +220,8 @@
                 called at every step.
 
         Returns:
-            [`~pipeline_utils.ImagePipelineOutput`] or `tuple`: [`~ pipeline_utils.ImagePipelineOutput `] if
-            `return_dict` is True, otherwise a `tuple. When returning a tuple, the first element is a list with the
-            generated images.
+            [`~pipelines.ImagePipelineOutput`] or `tuple`: [`~pipelines.ImagePipelineOutput`] if `return_dict`
+            is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the generated images.
         """
         if isinstance(prompt, str):
             batch_size = 1
diff --git a/src/diffusers/utils/__init__.py b/src/diffusers/utils/__init__.py
index e5a4d323..332a9917 100644
--- a/src/diffusers/utils/__init__.py
+++ b/src/diffusers/utils/__init__.py
@@ -18,7 +18,22 @@ import os
 
 from packaging import version
 
 from .. import __version__
+from .constants import (
+    _COMPATIBLE_STABLE_DIFFUSION_SCHEDULERS,
+    CONFIG_NAME,
+    DIFFUSERS_CACHE,
+    DIFFUSERS_DYNAMIC_MODULE_NAME,
+    FLAX_WEIGHTS_NAME,
+    HF_MODULES_CACHE,
+    HUGGINGFACE_CO_RESOLVE_ENDPOINT,
+    ONNX_EXTERNAL_WEIGHTS_NAME,
+    ONNX_WEIGHTS_NAME,
+    SAFETENSORS_WEIGHTS_NAME,
+    WEIGHTS_NAME,
+)
 from .deprecation_utils import deprecate
+from .dynamic_modules_utils import get_class_from_dynamic_module
+from .hub_utils import HF_HUB_OFFLINE, http_user_agent
 from .import_utils import (
     ENV_VARS_TRUE_AND_AUTO_VALUES,
     ENV_VARS_TRUE_VALUES,
@@ -67,36 +82,6 @@ if is_torch_available():
 logger = get_logger(__name__)
 
 
-hf_cache_home = os.path.expanduser(
-    os.getenv("HF_HOME", os.path.join(os.getenv("XDG_CACHE_HOME", "~/.cache"), "huggingface"))
-)
-default_cache_path = os.path.join(hf_cache_home, "diffusers")
-
-
-CONFIG_NAME = "config.json"
-WEIGHTS_NAME = "diffusion_pytorch_model.bin"
-FLAX_WEIGHTS_NAME = "diffusion_flax_model.msgpack"
-ONNX_WEIGHTS_NAME = "model.onnx"
-SAFETENSORS_WEIGHTS_NAME = "diffusion_pytorch_model.safetensors"
-ONNX_EXTERNAL_WEIGHTS_NAME = "weights.pb"
-HUGGINGFACE_CO_RESOLVE_ENDPOINT = "https://huggingface.co"
-DIFFUSERS_CACHE = default_cache_path
-DIFFUSERS_DYNAMIC_MODULE_NAME = "diffusers_modules"
-HF_MODULES_CACHE = os.getenv("HF_MODULES_CACHE", os.path.join(hf_cache_home, "modules"))
-
-_COMPATIBLE_STABLE_DIFFUSION_SCHEDULERS = [
-    "DDIMScheduler",
-    "DDPMScheduler",
-    "PNDMScheduler",
-    "LMSDiscreteScheduler",
-    "EulerDiscreteScheduler",
-    "HeunDiscreteScheduler",
-    "EulerAncestralDiscreteScheduler",
-    "DPMSolverMultistepScheduler",
-    "DPMSolverSinglestepScheduler",
-]
-
-
 def check_min_version(min_version):
     if version.parse(__version__) < version.parse(min_version):
         if "dev" in min_version:
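The constants deleted from `utils/__init__.py` above reappear verbatim in the new `utils/constants.py` below and are re-exported, so both import sites should resolve to the same values. A quick check, assuming a diffusers build with this PR:

```python
# Sketch: old re-exported path and new direct path agree after the move.
from diffusers.utils import DIFFUSERS_CACHE, WEIGHTS_NAME
from diffusers.utils.constants import WEIGHTS_NAME as DIRECT_WEIGHTS_NAME

assert WEIGHTS_NAME == DIRECT_WEIGHTS_NAME == "diffusion_pytorch_model.bin"
print(DIFFUSERS_CACHE)  # typically ~/.cache/huggingface/diffusers
```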
diff --git a/src/diffusers/utils/constants.py b/src/diffusers/utils/constants.py
new file mode 100644
index 00000000..eaa82122
--- /dev/null
+++ b/src/diffusers/utils/constants.py
@@ -0,0 +1,44 @@
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+
+
+hf_cache_home = os.path.expanduser(
+    os.getenv("HF_HOME", os.path.join(os.getenv("XDG_CACHE_HOME", "~/.cache"), "huggingface"))
+)
+default_cache_path = os.path.join(hf_cache_home, "diffusers")
+
+
+CONFIG_NAME = "config.json"
+WEIGHTS_NAME = "diffusion_pytorch_model.bin"
+FLAX_WEIGHTS_NAME = "diffusion_flax_model.msgpack"
+ONNX_WEIGHTS_NAME = "model.onnx"
+SAFETENSORS_WEIGHTS_NAME = "diffusion_pytorch_model.safetensors"
+ONNX_EXTERNAL_WEIGHTS_NAME = "weights.pb"
+HUGGINGFACE_CO_RESOLVE_ENDPOINT = "https://huggingface.co"
+DIFFUSERS_CACHE = default_cache_path
+DIFFUSERS_DYNAMIC_MODULE_NAME = "diffusers_modules"
+HF_MODULES_CACHE = os.getenv("HF_MODULES_CACHE", os.path.join(hf_cache_home, "modules"))
+
+_COMPATIBLE_STABLE_DIFFUSION_SCHEDULERS = [
+    "DDIMScheduler",
+    "DDPMScheduler",
+    "PNDMScheduler",
+    "LMSDiscreteScheduler",
+    "EulerDiscreteScheduler",
+    "HeunDiscreteScheduler",
+    "EulerAncestralDiscreteScheduler",
+    "DPMSolverMultistepScheduler",
+    "DPMSolverSinglestepScheduler",
+]
diff --git a/src/diffusers/utils/dummy_onnx_objects.py b/src/diffusers/utils/dummy_onnx_objects.py
new file mode 100644
index 00000000..963906b2
--- /dev/null
+++ b/src/diffusers/utils/dummy_onnx_objects.py
@@ -0,0 +1,19 @@
+# This file is autogenerated by the command `make fix-copies`, do not edit.
+# flake8: noqa
+
+from ..utils import DummyObject, requires_backends
+
+
+class OnnxRuntimeModel(metaclass=DummyObject):
+    _backends = ["onnx"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["onnx"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["onnx"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["onnx"])
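`dummy_onnx_objects.py` keeps `from diffusers import OnnxRuntimeModel` importable when the onnx backend is missing; the placeholder only fails at use time. Expected behavior, sketched under the assumption that onnxruntime is not installed:

```python
from diffusers import OnnxRuntimeModel  # resolves to the dummy without onnxruntime

try:
    OnnxRuntimeModel()  # any use routes through requires_backends
except ImportError as err:
    print(err)  # the message names the backend to install
```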
["torch"]) + + class KarrasVePipeline(metaclass=DummyObject): _backends = ["torch"] diff --git a/src/diffusers/dynamic_modules_utils.py b/src/diffusers/utils/dynamic_modules_utils.py similarity index 99% rename from src/diffusers/dynamic_modules_utils.py rename to src/diffusers/utils/dynamic_modules_utils.py index 693d9811..464257bd 100644 --- a/src/diffusers/dynamic_modules_utils.py +++ b/src/diffusers/utils/dynamic_modules_utils.py @@ -28,8 +28,8 @@ from urllib import request from huggingface_hub import HfFolder, cached_download, hf_hub_download, model_info -from . import __version__ -from .utils import DIFFUSERS_DYNAMIC_MODULE_NAME, HF_MODULES_CACHE, logging +from .. import __version__ +from . import DIFFUSERS_DYNAMIC_MODULE_NAME, HF_MODULES_CACHE, logging COMMUNITY_PIPELINES_URL = ( @@ -172,7 +172,7 @@ def find_pipeline_class(loaded_module): Retrieve pipeline class that inherits from `DiffusionPipeline`. Note that there has to be exactly one class inheriting from `DiffusionPipeline`. """ - from .pipeline_utils import DiffusionPipeline + from ..pipelines import DiffusionPipeline cls_members = dict(inspect.getmembers(loaded_module, inspect.isclass)) diff --git a/src/diffusers/hub_utils.py b/src/diffusers/utils/hub_utils.py similarity index 96% rename from src/diffusers/hub_utils.py rename to src/diffusers/utils/hub_utils.py index 33e17635..e22b3644 100644 --- a/src/diffusers/hub_utils.py +++ b/src/diffusers/utils/hub_utils.py @@ -22,9 +22,10 @@ from uuid import uuid4 from huggingface_hub import HfFolder, whoami -from . import __version__ -from .utils import ENV_VARS_TRUE_VALUES, HUGGINGFACE_CO_RESOLVE_ENDPOINT, logging -from .utils.import_utils import ( +from .. import __version__ +from .constants import HUGGINGFACE_CO_RESOLVE_ENDPOINT +from .import_utils import ( + ENV_VARS_TRUE_VALUES, _flax_version, _jax_version, _onnxruntime_version, @@ -34,13 +35,14 @@ from .utils.import_utils import ( is_onnx_available, is_torch_available, ) +from .logging import get_logger if is_modelcards_available(): from modelcards import CardData, ModelCard -logger = logging.get_logger(__name__) +logger = get_logger(__name__) MODEL_CARD_TEMPLATE_PATH = Path(__file__).parent / "utils" / "model_card_template.md" diff --git a/tests/fixtures/custom_pipeline/pipeline.py b/tests/fixtures/custom_pipeline/pipeline.py index e7429d0a..0667edcf 100644 --- a/tests/fixtures/custom_pipeline/pipeline.py +++ b/tests/fixtures/custom_pipeline/pipeline.py @@ -18,7 +18,7 @@ from typing import Optional, Tuple, Union import torch -from diffusers.pipeline_utils import DiffusionPipeline, ImagePipelineOutput +from diffusers import DiffusionPipeline, ImagePipelineOutput class CustomLocalPipeline(DiffusionPipeline): @@ -63,10 +63,10 @@ class CustomLocalPipeline(DiffusionPipeline): The output format of the generate image. Choose between [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipeline_utils.ImagePipelineOutput`] instead of a plain tuple. + Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. Returns: - [`~pipeline_utils.ImagePipelineOutput`] or `tuple`: [`~pipelines.utils.ImagePipelineOutput`] if + [`~pipelines.ImagePipelineOutput`] or `tuple`: [`~pipelines.utils.ImagePipelineOutput`] if `return_dict` is True, otherwise a `tuple. When returning a tuple, the first element is a list with the generated images. 
""" diff --git a/tests/models/test_models_vae.py b/tests/models/test_models_vae.py index 2948151e..75481ecb 100644 --- a/tests/models/test_models_vae.py +++ b/tests/models/test_models_vae.py @@ -19,7 +19,7 @@ import unittest import torch from diffusers import AutoencoderKL -from diffusers.modeling_utils import ModelMixin +from diffusers.models import ModelMixin from diffusers.utils import floats_tensor, load_hf_numpy, require_torch_gpu, slow, torch_all_close, torch_device from parameterized import parameterized diff --git a/tests/pipelines/unclip/test_unclip_image_variation.py b/tests/pipelines/unclip/test_unclip_image_variation.py index 64b51536..5835a216 100644 --- a/tests/pipelines/unclip/test_unclip_image_variation.py +++ b/tests/pipelines/unclip/test_unclip_image_variation.py @@ -20,8 +20,13 @@ import unittest import numpy as np import torch -from diffusers import UnCLIPImageVariationPipeline, UnCLIPScheduler, UNet2DConditionModel, UNet2DModel -from diffusers.pipeline_utils import DiffusionPipeline +from diffusers import ( + DiffusionPipeline, + UnCLIPImageVariationPipeline, + UnCLIPScheduler, + UNet2DConditionModel, + UNet2DModel, +) from diffusers.pipelines.unclip.text_proj import UnCLIPTextProjModel from diffusers.utils import floats_tensor, load_numpy, slow, torch_device from diffusers.utils.testing_utils import load_image, require_torch_gpu diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index e4c32c0e..42f683f8 100644 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -21,7 +21,7 @@ from typing import Dict, List, Tuple import numpy as np import torch -from diffusers.modeling_utils import ModelMixin +from diffusers.models import ModelMixin from diffusers.training_utils import EMAModel from diffusers.utils import torch_device diff --git a/tests/test_pipelines.py b/tests/test_pipelines.py index ff02ee8e..3ed90748 100644 --- a/tests/test_pipelines.py +++ b/tests/test_pipelines.py @@ -33,6 +33,7 @@ from diffusers import ( DDIMScheduler, DDPMPipeline, DDPMScheduler, + DiffusionPipeline, DPMSolverMultistepScheduler, EulerAncestralDiscreteScheduler, EulerDiscreteScheduler, @@ -45,7 +46,6 @@ from diffusers import ( UNet2DModel, logging, ) -from diffusers.pipeline_utils import DiffusionPipeline from diffusers.schedulers.scheduling_utils import SCHEDULER_CONFIG_NAME from diffusers.utils import CONFIG_NAME, WEIGHTS_NAME, floats_tensor, nightly, slow, torch_device from diffusers.utils.testing_utils import CaptureLogger, get_tests_dir, require_torch_gpu @@ -704,7 +704,7 @@ class PipelineSlowTests(unittest.TestCase): def test_warning_unused_kwargs(self): model_id = "hf-internal-testing/unet-pipeline-dummy" - logger = logging.get_logger("diffusers.pipeline_utils") + logger = logging.get_logger("diffusers.pipelines") with tempfile.TemporaryDirectory() as tmpdirname: with CaptureLogger(logger) as cap_logger: DiffusionPipeline.from_pretrained(