diff --git a/README.md b/README.md
index f6889baf..32dc7c82 100644
--- a/README.md
+++ b/README.md
@@ -249,24 +249,24 @@ image_pil = PIL.Image.fromarray(image_processed[0])
 image_pil.save("test.png")
 ```

-#### **Text to speech with GradTTS and BDDM**
+#### **Text to speech with GradTTS and BDDMPipeline**

 ```python
 import torch
-from diffusers import BDDM, GradTTS
+from diffusers import BDDMPipeline, GradTTS

 torch_device = "cuda"

 # load grad tts and bddm pipelines
 grad_tts = GradTTS.from_pretrained("fusing/grad-tts-libri-tts")
-bddm = BDDM.from_pretrained("fusing/diffwave-vocoder-ljspeech")
+bddm = BDDMPipeline.from_pretrained("fusing/diffwave-vocoder-ljspeech")

 text = "Hello world, I missed you so much."

 # generate mel spectograms using text
 mel_spec = grad_tts(text, torch_device=torch_device)

-# generate the speech by passing mel spectograms to BDDM pipeline
+# generate the speech by passing mel spectrograms to the BDDMPipeline
 generator = torch.manual_seed(42)
 audio = bddm(mel_spec, generator, torch_device=torch_device)

diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py
index efb89e85..aaca3d34 100644
--- a/src/diffusers/__init__.py
+++ b/src/diffusers/__init__.py
@@ -11,19 +11,19 @@ from .models.unet import UNetModel
 from .models.unet_ldm import UNetLDMModel
 from .models.unet_rl import TemporalUNet
 from .pipeline_utils import DiffusionPipeline
-from .pipelines import BDDM, DDIM, DDPM, PNDM
+from .pipelines import BDDMPipeline, DDIMPipeline, DDPMPipeline, PNDMPipeline
 from .schedulers import DDIMScheduler, DDPMScheduler, GradTTSScheduler, PNDMScheduler, SchedulerMixin


 if is_transformers_available():
     from .models.unet_glide import GlideSuperResUNetModel, GlideTextToImageUNetModel, GlideUNetModel
     from .models.unet_grad_tts import UNetGradTTSModel
-    from .pipelines import Glide, LatentDiffusion
+    from .pipelines import GlidePipeline, LatentDiffusionPipeline
 else:
     from .utils.dummy_transformers_objects import *


 if is_transformers_available() and is_inflect_available() and is_unidecode_available():
-    from .pipelines import GradTTS
+    from .pipelines import GradTTSPipeline
 else:
     from .utils.dummy_transformers_and_inflect_and_unidecode_objects import *

diff --git a/src/diffusers/pipeline_utils.py b/src/diffusers/pipeline_utils.py
index d8a2644d..339ebb07 100644
--- a/src/diffusers/pipeline_utils.py
+++ b/src/diffusers/pipeline_utils.py
@@ -21,7 +21,6 @@ from typing import Optional, Union
 from huggingface_hub import snapshot_download

 from .configuration_utils import ConfigMixin
-from .dynamic_modules_utils import get_class_from_dynamic_module
 from .utils import DIFFUSERS_CACHE, logging


@@ -81,9 +80,6 @@ class DiffusionPipeline(ConfigMixin):
             # set models
             setattr(self, name, module)

-        register_dict = {"_module": self.__module__.split(".")[-1]}
-        self.register_to_config(**register_dict)
-
     def save_pretrained(self, save_directory: Union[str, os.PathLike]):
         self.save_config(save_directory)

@@ -139,11 +135,7 @@ class DiffusionPipeline(ConfigMixin):

         config_dict = cls.get_config_dict(cached_folder)

-        # 2. Get class name and module candidates to load custom models
-        module_candidate_name = config_dict["_module"]
-        module_candidate = module_candidate_name + ".py"
-
-        # 3. Load the pipeline class, if using custom module then load it from the hub
+        # 2. Load the pipeline class, if using custom module then load it from the hub
         # if we load from explicit class, let's use it
         if cls != DiffusionPipeline:
             pipeline_class = cls
@@ -151,11 +143,6 @@ class DiffusionPipeline(ConfigMixin):
             diffusers_module = importlib.import_module(cls.__module__.split(".")[0])
             pipeline_class = getattr(diffusers_module, config_dict["_class_name"])

-        # (TODO - we should allow to load custom pipelines
-        # else we need to load the correct module from the Hub
-        # module = module_candidate
-        # pipeline_class = get_class_from_dynamic_module(cached_folder, module, class_name_, cached_folder)
-
         init_dict, _ = pipeline_class.extract_init_dict(config_dict, **kwargs)

         init_kwargs = {}
@@ -163,7 +150,7 @@ class DiffusionPipeline(ConfigMixin):
         # import it here to avoid circular import
         from diffusers import pipelines

-        # 4. Load each module in the pipeline
+        # 3. Load each module in the pipeline
         for name, (library_name, class_name) in init_dict.items():
             is_pipeline_module = hasattr(pipelines, library_name)
             # if the model is in a pipeline module, then we load it from the pipeline
@@ -171,14 +158,7 @@ class DiffusionPipeline(ConfigMixin):
                 pipeline_module = getattr(pipelines, library_name)
                 class_obj = getattr(pipeline_module, class_name)
                 importable_classes = ALL_IMPORTABLE_CLASSES
-                class_candidates = {c: class_obj for c in ALL_IMPORTABLE_CLASSES.keys()}
-            elif library_name == module_candidate_name:
-                # if the model is not in diffusers or transformers, we need to load it from the hub
-                # assumes that it's a subclass of ModelMixin
-                class_obj = get_class_from_dynamic_module(cached_folder, module_candidate, class_name, cached_folder)
-                # since it's not from a library, we need to check class candidates for all importable classes
-                importable_classes = ALL_IMPORTABLE_CLASSES
-                class_candidates = {c: class_obj for c in ALL_IMPORTABLE_CLASSES.keys()}
+                class_candidates = {c: class_obj for c in importable_classes.keys()}
             else:
                 # else we just import it from the library.
                 library = importlib.import_module(library_name)
diff --git a/src/diffusers/pipelines/README.md b/src/diffusers/pipelines/README.md
index 61e653a8..c0558d35 100644
--- a/src/diffusers/pipelines/README.md
+++ b/src/diffusers/pipelines/README.md
@@ -15,5 +15,5 @@ TODO(Patrick, Anton, Suraj)
 - PNDM for unconditional image generation in [pipeline_pndm](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/pipeline_pndm.py).
 - Latent diffusion for text to image generation / conditional image generation in [pipeline_latent_diffusion](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/pipeline_latent_diffusion.py).
 - Glide for text to image generation / conditional image generation in [pipeline_glide](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/pipeline_glide.py).
-- BDDM for spectrogram-to-sound vocoding in [pipeline_bddm](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/pipeline_bddm.py).
+- BDDMPipeline for spectrogram-to-sound vocoding in [pipeline_bddm](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/pipeline_bddm.py).
 - Grad-TTS for text to audio generation / conditional audio generation in [pipeline_grad_tts](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/pipeline_grad_tts.py).
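For readers tracking what the rename means for downstream code, here is a minimal usage sketch with the new class names. It only uses calls that appear in the updated test suite below (the `fusing/ddpm-cifar10` model id, the `generator` keyword, and generic loading through `DiffusionPipeline`), so treat it as an illustration of the naming change rather than a canonical recipe:

```python
import torch

from diffusers import DDPMPipeline, DiffusionPipeline

# load the renamed pipeline class directly from the Hub (model id taken from the tests below)
ddpm = DDPMPipeline.from_pretrained("fusing/ddpm-cifar10")

# or resolve it generically: from_pretrained reads `_class_name` from the saved config
# and looks the class up in the diffusers module, as shown in pipeline_utils.py above
ddpm_generic = DiffusionPipeline.from_pretrained("fusing/ddpm-cifar10")

# sample with a fixed seed, mirroring the tests
generator = torch.manual_seed(0)
image = ddpm(generator=generator)
```

The old short names (`DDPM`, `DDIM`, `PNDM`, `BDDM`, `Glide`, `LatentDiffusion`, `GradTTS`) are removed from the public namespace, so any caller still importing them needs the same one-line rename.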
diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py
index 7ba126b0..d26c5fc8 100644
--- a/src/diffusers/pipelines/__init__.py
+++ b/src/diffusers/pipelines/__init__.py
@@ -1,14 +1,14 @@
 from ..utils import is_inflect_available, is_transformers_available, is_unidecode_available

-from .pipeline_bddm import BDDM
-from .pipeline_ddim import DDIM
-from .pipeline_ddpm import DDPM
-from .pipeline_pndm import PNDM
+from .pipeline_bddm import BDDMPipeline
+from .pipeline_ddim import DDIMPipeline
+from .pipeline_ddpm import DDPMPipeline
+from .pipeline_pndm import PNDMPipeline


 if is_transformers_available():
-    from .pipeline_glide import Glide
-    from .pipeline_latent_diffusion import LatentDiffusion
+    from .pipeline_glide import GlidePipeline
+    from .pipeline_latent_diffusion import LatentDiffusionPipeline

 if is_transformers_available() and is_unidecode_available() and is_inflect_available():
-    from .pipeline_grad_tts import GradTTS
+    from .pipeline_grad_tts import GradTTSPipeline
diff --git a/src/diffusers/pipelines/pipeline_bddm.py b/src/diffusers/pipelines/pipeline_bddm.py
index 3ca79c3d..8b24cb9c 100644
--- a/src/diffusers/pipelines/pipeline_bddm.py
+++ b/src/diffusers/pipelines/pipeline_bddm.py
@@ -271,7 +271,7 @@ class DiffWave(ModelMixin, ConfigMixin):
         return self.final_conv(x)


-class BDDM(DiffusionPipeline):
+class BDDMPipeline(DiffusionPipeline):
     def __init__(self, diffwave, noise_scheduler):
         super().__init__()
         noise_scheduler = noise_scheduler.set_format("pt")
diff --git a/src/diffusers/pipelines/pipeline_ddim.py b/src/diffusers/pipelines/pipeline_ddim.py
index 272d3edb..8da24dbf 100644
--- a/src/diffusers/pipelines/pipeline_ddim.py
+++ b/src/diffusers/pipelines/pipeline_ddim.py
@@ -21,7 +21,7 @@ import tqdm
 from ..pipeline_utils import DiffusionPipeline


-class DDIM(DiffusionPipeline):
+class DDIMPipeline(DiffusionPipeline):
     def __init__(self, unet, noise_scheduler):
         super().__init__()
         noise_scheduler = noise_scheduler.set_format("pt")
diff --git a/src/diffusers/pipelines/pipeline_ddpm.py b/src/diffusers/pipelines/pipeline_ddpm.py
index ebcce773..9cf83bfb 100644
--- a/src/diffusers/pipelines/pipeline_ddpm.py
+++ b/src/diffusers/pipelines/pipeline_ddpm.py
@@ -21,7 +21,7 @@ import tqdm
 from ..pipeline_utils import DiffusionPipeline


-class DDPM(DiffusionPipeline):
+class DDPMPipeline(DiffusionPipeline):
     def __init__(self, unet, noise_scheduler):
         super().__init__()
         noise_scheduler = noise_scheduler.set_format("pt")
diff --git a/src/diffusers/pipelines/pipeline_glide.py b/src/diffusers/pipelines/pipeline_glide.py
index 00460553..8680b754 100644
--- a/src/diffusers/pipelines/pipeline_glide.py
+++ b/src/diffusers/pipelines/pipeline_glide.py
@@ -711,7 +711,7 @@ def _extract_into_tensor(arr, timesteps, broadcast_shape):
     return res + torch.zeros(broadcast_shape, device=timesteps.device)


-class Glide(DiffusionPipeline):
+class GlidePipeline(DiffusionPipeline):
     def __init__(
         self,
         text_unet: GlideTextToImageUNetModel,
diff --git a/src/diffusers/pipelines/pipeline_grad_tts.py b/src/diffusers/pipelines/pipeline_grad_tts.py
index 42011249..51c861a2 100644
--- a/src/diffusers/pipelines/pipeline_grad_tts.py
+++ b/src/diffusers/pipelines/pipeline_grad_tts.py
@@ -420,7 +420,7 @@ class TextEncoder(ModelMixin, ConfigMixin):
         return mu, logw, x_mask


-class GradTTS(DiffusionPipeline):
+class GradTTSPipeline(DiffusionPipeline):
     def __init__(self, unet, text_encoder, noise_scheduler, tokenizer):
         super().__init__()
         noise_scheduler = noise_scheduler.set_format("pt")
@@ -430,7 +430,14 @@ class GradTTS(DiffusionPipeline):

     @torch.no_grad()
     def __call__(
-        self, text, num_inference_steps=50, temperature=1.3, length_scale=0.91, speaker_id=15, torch_device=None
+        self,
+        text,
+        num_inference_steps=50,
+        temperature=1.3,
+        length_scale=0.91,
+        speaker_id=15,
+        torch_device=None,
+        generator=None,
     ):
         if torch_device is None:
             torch_device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
@@ -464,7 +471,7 @@ class GradTTS(DiffusionPipeline):
         mu_y = mu_y.transpose(1, 2)

         # Sample latent representation from terminal distribution N(mu_y, I)
-        z = mu_y + torch.randn_like(mu_y, device=mu_y.device) / temperature
+        z = mu_y + torch.randn(mu_y.shape, device=mu_y.device, generator=generator) / temperature
         xt = z * y_mask

         h = 1.0 / num_inference_steps
diff --git a/src/diffusers/pipelines/pipeline_latent_diffusion.py b/src/diffusers/pipelines/pipeline_latent_diffusion.py
index cd7f653b..7d386765 100644
--- a/src/diffusers/pipelines/pipeline_latent_diffusion.py
+++ b/src/diffusers/pipelines/pipeline_latent_diffusion.py
@@ -860,7 +860,7 @@ class AutoencoderKL(ModelMixin, ConfigMixin):
         return dec, posterior


-class LatentDiffusion(DiffusionPipeline):
+class LatentDiffusionPipeline(DiffusionPipeline):
     def __init__(self, vqvae, bert, tokenizer, unet, noise_scheduler):
         super().__init__()
         noise_scheduler = noise_scheduler.set_format("pt")
diff --git a/src/diffusers/pipelines/pipeline_pndm.py b/src/diffusers/pipelines/pipeline_pndm.py
index a19f933e..5fd8a984 100644
--- a/src/diffusers/pipelines/pipeline_pndm.py
+++ b/src/diffusers/pipelines/pipeline_pndm.py
@@ -21,7 +21,7 @@ import tqdm
 from ..pipeline_utils import DiffusionPipeline


-class PNDM(DiffusionPipeline):
+class PNDMPipeline(DiffusionPipeline):
     def __init__(self, unet, noise_scheduler):
         super().__init__()
         noise_scheduler = noise_scheduler.set_format("pt")
diff --git a/tests/test_modeling_utils.py b/tests/test_modeling_utils.py
index 372435de..720e6874 100755
--- a/tests/test_modeling_utils.py
+++ b/tests/test_modeling_utils.py
@@ -22,17 +22,17 @@ import numpy as np
 import torch

 from diffusers import (
-    BDDM,
-    DDIM,
-    DDPM,
-    Glide,
-    PNDM,
+    BDDMPipeline,
+    DDIMPipeline,
     DDIMScheduler,
+    DDPMPipeline,
     DDPMScheduler,
+    GlidePipeline,
     GlideSuperResUNetModel,
     GlideTextToImageUNetModel,
-    GradTTS,
-    LatentDiffusion,
+    GradTTSPipeline,
+    LatentDiffusionPipeline,
+    PNDMPipeline,
     PNDMScheduler,
     UNetGradTTSModel,
     UNetLDMModel,
@@ -583,11 +583,11 @@ class PipelineTesterMixin(unittest.TestCase):
         model = UNetModel(ch=32, ch_mult=(1, 2), num_res_blocks=2, attn_resolutions=(16,), resolution=32)
         schedular = DDPMScheduler(timesteps=10)

-        ddpm = DDPM(model, schedular)
+        ddpm = DDPMPipeline(model, schedular)

         with tempfile.TemporaryDirectory() as tmpdirname:
             ddpm.save_pretrained(tmpdirname)
-            new_ddpm = DDPM.from_pretrained(tmpdirname)
+            new_ddpm = DDPMPipeline.from_pretrained(tmpdirname)

         generator = torch.manual_seed(0)

@@ -601,7 +601,7 @@ class PipelineTesterMixin(unittest.TestCase):
     def test_from_pretrained_hub(self):
         model_path = "fusing/ddpm-cifar10"

-        ddpm = DDPM.from_pretrained(model_path)
+        ddpm = DDPMPipeline.from_pretrained(model_path)
         ddpm_from_hub = DiffusionPipeline.from_pretrained(model_path)

         ddpm.noise_scheduler.num_timesteps = 10
@@ -624,7 +624,7 @@ class PipelineTesterMixin(unittest.TestCase):
         noise_scheduler = DDPMScheduler.from_config(model_id)
         noise_scheduler = noise_scheduler.set_format("pt")

-        ddpm = DDPM(unet=unet, noise_scheduler=noise_scheduler)
+        ddpm = DDPMPipeline(unet=unet, noise_scheduler=noise_scheduler)
         image = ddpm(generator=generator)

         image_slice = image[0, -1, -3:, -3:].cpu()
@@ -641,7 +641,7 @@ class PipelineTesterMixin(unittest.TestCase):
         unet = UNetModel.from_pretrained(model_id)
         noise_scheduler = DDIMScheduler(tensor_format="pt")

-        ddim = DDIM(unet=unet, noise_scheduler=noise_scheduler)
+        ddim = DDIMPipeline(unet=unet, noise_scheduler=noise_scheduler)
         image = ddim(generator=generator, eta=0.0)

         image_slice = image[0, -1, -3:, -3:].cpu()
@@ -660,7 +660,7 @@ class PipelineTesterMixin(unittest.TestCase):
         unet = UNetModel.from_pretrained(model_id)
         noise_scheduler = PNDMScheduler(tensor_format="pt")

-        pndm = PNDM(unet=unet, noise_scheduler=noise_scheduler)
+        pndm = PNDMPipeline(unet=unet, noise_scheduler=noise_scheduler)
         image = pndm(generator=generator)

         image_slice = image[0, -1, -3:, -3:].cpu()
@@ -674,7 +674,7 @@ class PipelineTesterMixin(unittest.TestCase):
     @slow
     def test_ldm_text2img(self):
         model_id = "fusing/latent-diffusion-text2im-large"
-        ldm = LatentDiffusion.from_pretrained(model_id)
+        ldm = LatentDiffusionPipeline.from_pretrained(model_id)

         prompt = "A painting of a squirrel eating a burger"
         generator = torch.manual_seed(0)
@@ -689,7 +689,7 @@ class PipelineTesterMixin(unittest.TestCase):
     @slow
     def test_glide_text2img(self):
         model_id = "fusing/glide-base"
-        glide = Glide.from_pretrained(model_id)
+        glide = GlidePipeline.from_pretrained(model_id)

         prompt = "a pencil sketch of a corgi"
         generator = torch.manual_seed(0)
@@ -704,22 +704,25 @@ class PipelineTesterMixin(unittest.TestCase):
     @slow
     def test_grad_tts(self):
         model_id = "fusing/grad-tts-libri-tts"
-        grad_tts = GradTTS.from_pretrained(model_id)
+        grad_tts = GradTTSPipeline.from_pretrained(model_id)

         text = "Hello world, I missed you so much."
+        generator = torch.manual_seed(0)

         # generate mel spectograms using text
-        mel_spec = grad_tts(text)
+        mel_spec = grad_tts(text, generator=generator)

-        assert mel_spec.shape == (1, 256, 256, 3)
-        expected_slice = torch.tensor([0.7119, 0.7073, 0.6460, 0.7780, 0.7423, 0.6926, 0.7378, 0.7189, 0.7784])
-        assert (mel_spec.flatten() - expected_slice).abs().max() < 1e-2
+        assert mel_spec.shape == (1, 80, 143)
+        expected_slice = torch.tensor(
+            [-6.6119, -6.5963, -6.2776, -6.7496, -6.7096, -6.5131, -6.4643, -6.4817, -6.7185]
+        )
+        assert (mel_spec[0, :3, :3].flatten() - expected_slice).abs().max() < 1e-2

     def test_module_from_pipeline(self):
         model = DiffWave(num_res_layers=4)
         noise_scheduler = DDPMScheduler(timesteps=12)

-        bddm = BDDM(model, noise_scheduler)
+        bddm = BDDMPipeline(model, noise_scheduler)

         # check if the library name for the diffwave moduel is set to pipeline module
         self.assertTrue(bddm.config["diffwave"][0] == "pipeline_bddm")
@@ -727,6 +730,6 @@ class PipelineTesterMixin(unittest.TestCase):
         # check if we can save and load the pipeline
         with tempfile.TemporaryDirectory() as tmpdirname:
             bddm.save_pretrained(tmpdirname)
-            _ = BDDM.from_pretrained(tmpdirname)
+            _ = BDDMPipeline.from_pretrained(tmpdirname)
             # check if the same works using the DifusionPipeline class
             _ = DiffusionPipeline.from_pretrained(tmpdirname)
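With the `_module` config entry and `get_class_from_dynamic_module` gone, modules that live inside a pipeline file (such as `DiffWave`) are resolved through the `pipelines` package, which is what `test_module_from_pipeline` exercises. The following standalone sketch mirrors that test; the import path for `DiffWave` is an assumption, since the test file's import of it is not shown in this diff:

```python
import tempfile

from diffusers import BDDMPipeline, DDPMScheduler, DiffusionPipeline
from diffusers.pipelines.pipeline_bddm import DiffWave  # assumed import path; the test's own import is not shown

# build a small vocoder pipeline from scratch, as the test does
model = DiffWave(num_res_layers=4)
noise_scheduler = DDPMScheduler(timesteps=12)
bddm = BDDMPipeline(model, noise_scheduler)

# the diffwave module is registered against the pipeline module name, not a custom "_module" entry
assert bddm.config["diffwave"][0] == "pipeline_bddm"

# save, then reload both through the concrete class and the generic DiffusionPipeline entry point
with tempfile.TemporaryDirectory() as tmpdirname:
    bddm.save_pretrained(tmpdirname)
    reloaded = BDDMPipeline.from_pretrained(tmpdirname)
    reloaded_generic = DiffusionPipeline.from_pretrained(tmpdirname)
```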