more fixes

This commit is contained in:
Patrick von Platen 2022-06-22 13:40:08 +00:00
parent e45dae7dc0
commit 48269070d2
13 changed files with 59 additions and 69 deletions

View File

@ -249,24 +249,24 @@ image_pil = PIL.Image.fromarray(image_processed[0])
image_pil.save("test.png") image_pil.save("test.png")
``` ```
#### **Text to speech with GradTTS and BDDM** #### **Text to speech with GradTTS and BDDMPipeline**
```python ```python
import torch import torch
from diffusers import BDDM, GradTTS from diffusers import BDDMPipeline, GradTTS
torch_device = "cuda" torch_device = "cuda"
# load grad tts and bddm pipelines # load grad tts and bddm pipelines
grad_tts = GradTTS.from_pretrained("fusing/grad-tts-libri-tts") grad_tts = GradTTS.from_pretrained("fusing/grad-tts-libri-tts")
bddm = BDDM.from_pretrained("fusing/diffwave-vocoder-ljspeech") bddm = BDDMPipeline.from_pretrained("fusing/diffwave-vocoder-ljspeech")
text = "Hello world, I missed you so much." text = "Hello world, I missed you so much."
# generate mel spectrograms using text # generate mel spectrograms using text
mel_spec = grad_tts(text, torch_device=torch_device) mel_spec = grad_tts(text, torch_device=torch_device)
# generate the speech by passing mel spectrograms to BDDM pipeline # generate the speech by passing mel spectrograms to BDDMPipeline pipeline
generator = torch.manual_seed(42) generator = torch.manual_seed(42)
audio = bddm(mel_spec, generator, torch_device=torch_device) audio = bddm(mel_spec, generator, torch_device=torch_device)

View File

@ -11,19 +11,19 @@ from .models.unet import UNetModel
from .models.unet_ldm import UNetLDMModel from .models.unet_ldm import UNetLDMModel
from .models.unet_rl import TemporalUNet from .models.unet_rl import TemporalUNet
from .pipeline_utils import DiffusionPipeline from .pipeline_utils import DiffusionPipeline
from .pipelines import BDDM, DDIM, DDPM, PNDM from .pipelines import BDDMPipeline, DDIMPipeline, DDPMPipeline, PNDMPipeline
from .schedulers import DDIMScheduler, DDPMScheduler, GradTTSScheduler, PNDMScheduler, SchedulerMixin from .schedulers import DDIMScheduler, DDPMScheduler, GradTTSScheduler, PNDMScheduler, SchedulerMixin
if is_transformers_available(): if is_transformers_available():
from .models.unet_glide import GlideSuperResUNetModel, GlideTextToImageUNetModel, GlideUNetModel from .models.unet_glide import GlideSuperResUNetModel, GlideTextToImageUNetModel, GlideUNetModel
from .models.unet_grad_tts import UNetGradTTSModel from .models.unet_grad_tts import UNetGradTTSModel
from .pipelines import Glide, LatentDiffusion from .pipelines import GlidePipeline, LatentDiffusionPipeline
else: else:
from .utils.dummy_transformers_objects import * from .utils.dummy_transformers_objects import *
if is_transformers_available() and is_inflect_available() and is_unidecode_available(): if is_transformers_available() and is_inflect_available() and is_unidecode_available():
from .pipelines import GradTTS from .pipelines import GradTTSPipeline
else: else:
from .utils.dummy_transformers_and_inflect_and_unidecode_objects import * from .utils.dummy_transformers_and_inflect_and_unidecode_objects import *

View File

@ -21,7 +21,6 @@ from typing import Optional, Union
from huggingface_hub import snapshot_download from huggingface_hub import snapshot_download
from .configuration_utils import ConfigMixin from .configuration_utils import ConfigMixin
from .dynamic_modules_utils import get_class_from_dynamic_module
from .utils import DIFFUSERS_CACHE, logging from .utils import DIFFUSERS_CACHE, logging
@ -81,9 +80,6 @@ class DiffusionPipeline(ConfigMixin):
# set models # set models
setattr(self, name, module) setattr(self, name, module)
register_dict = {"_module": self.__module__.split(".")[-1]}
self.register_to_config(**register_dict)
def save_pretrained(self, save_directory: Union[str, os.PathLike]): def save_pretrained(self, save_directory: Union[str, os.PathLike]):
self.save_config(save_directory) self.save_config(save_directory)
@ -139,11 +135,7 @@ class DiffusionPipeline(ConfigMixin):
config_dict = cls.get_config_dict(cached_folder) config_dict = cls.get_config_dict(cached_folder)
# 2. Get class name and module candidates to load custom models # 2. Load the pipeline class, if using custom module then load it from the hub
module_candidate_name = config_dict["_module"]
module_candidate = module_candidate_name + ".py"
# 3. Load the pipeline class, if using custom module then load it from the hub
# if we load from explicit class, let's use it # if we load from explicit class, let's use it
if cls != DiffusionPipeline: if cls != DiffusionPipeline:
pipeline_class = cls pipeline_class = cls
@ -151,11 +143,6 @@ class DiffusionPipeline(ConfigMixin):
diffusers_module = importlib.import_module(cls.__module__.split(".")[0]) diffusers_module = importlib.import_module(cls.__module__.split(".")[0])
pipeline_class = getattr(diffusers_module, config_dict["_class_name"]) pipeline_class = getattr(diffusers_module, config_dict["_class_name"])
# (TODO - we should allow to load custom pipelines
# else we need to load the correct module from the Hub
# module = module_candidate
# pipeline_class = get_class_from_dynamic_module(cached_folder, module, class_name_, cached_folder)
init_dict, _ = pipeline_class.extract_init_dict(config_dict, **kwargs) init_dict, _ = pipeline_class.extract_init_dict(config_dict, **kwargs)
init_kwargs = {} init_kwargs = {}
@ -163,7 +150,7 @@ class DiffusionPipeline(ConfigMixin):
# import it here to avoid circular import # import it here to avoid circular import
from diffusers import pipelines from diffusers import pipelines
# 4. Load each module in the pipeline # 3. Load each module in the pipeline
for name, (library_name, class_name) in init_dict.items(): for name, (library_name, class_name) in init_dict.items():
is_pipeline_module = hasattr(pipelines, library_name) is_pipeline_module = hasattr(pipelines, library_name)
# if the model is in a pipeline module, then we load it from the pipeline # if the model is in a pipeline module, then we load it from the pipeline
@ -171,14 +158,7 @@ class DiffusionPipeline(ConfigMixin):
pipeline_module = getattr(pipelines, library_name) pipeline_module = getattr(pipelines, library_name)
class_obj = getattr(pipeline_module, class_name) class_obj = getattr(pipeline_module, class_name)
importable_classes = ALL_IMPORTABLE_CLASSES importable_classes = ALL_IMPORTABLE_CLASSES
class_candidates = {c: class_obj for c in ALL_IMPORTABLE_CLASSES.keys()} class_candidates = {c: class_obj for c in importable_classes.keys()}
elif library_name == module_candidate_name:
# if the model is not in diffusers or transformers, we need to load it from the hub
# assumes that it's a subclass of ModelMixin
class_obj = get_class_from_dynamic_module(cached_folder, module_candidate, class_name, cached_folder)
# since it's not from a library, we need to check class candidates for all importable classes
importable_classes = ALL_IMPORTABLE_CLASSES
class_candidates = {c: class_obj for c in ALL_IMPORTABLE_CLASSES.keys()}
else: else:
# else we just import it from the library. # else we just import it from the library.
library = importlib.import_module(library_name) library = importlib.import_module(library_name)

View File

@ -15,5 +15,5 @@ TODO(Patrick, Anton, Suraj)
- PNDM for unconditional image generation in [pipeline_pndm](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/pipeline_pndm.py). - PNDM for unconditional image generation in [pipeline_pndm](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/pipeline_pndm.py).
- Latent diffusion for text to image generation / conditional image generation in [pipeline_latent_diffusion](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/pipeline_latent_diffusion.py). - Latent diffusion for text to image generation / conditional image generation in [pipeline_latent_diffusion](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/pipeline_latent_diffusion.py).
- Glide for text to image generation / conditional image generation in [pipeline_glide](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/pipeline_glide.py). - Glide for text to image generation / conditional image generation in [pipeline_glide](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/pipeline_glide.py).
- BDDM for spectrogram-to-sound vocoding in [pipeline_bddm](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/pipeline_bddm.py). - BDDMPipeline for spectrogram-to-sound vocoding in [pipeline_bddm](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/pipeline_bddm.py).
- Grad-TTS for text to audio generation / conditional audio generation in [pipeline_grad_tts](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/pipeline_grad_tts.py). - Grad-TTS for text to audio generation / conditional audio generation in [pipeline_grad_tts](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/pipeline_grad_tts.py).

View File

@ -1,14 +1,14 @@
from ..utils import is_inflect_available, is_transformers_available, is_unidecode_available from ..utils import is_inflect_available, is_transformers_available, is_unidecode_available
from .pipeline_bddm import BDDM from .pipeline_bddm import BDDMPipeline
from .pipeline_ddim import DDIM from .pipeline_ddim import DDIMPipeline
from .pipeline_ddpm import DDPM from .pipeline_ddpm import DDPMPipeline
from .pipeline_pndm import PNDM from .pipeline_pndm import PNDMPipeline
if is_transformers_available(): if is_transformers_available():
from .pipeline_glide import Glide from .pipeline_glide import GlidePipeline
from .pipeline_latent_diffusion import LatentDiffusion from .pipeline_latent_diffusion import LatentDiffusionPipeline
if is_transformers_available() and is_unidecode_available() and is_inflect_available(): if is_transformers_available() and is_unidecode_available() and is_inflect_available():
from .pipeline_grad_tts import GradTTS from .pipeline_grad_tts import GradTTSPipeline

View File

@ -271,7 +271,7 @@ class DiffWave(ModelMixin, ConfigMixin):
return self.final_conv(x) return self.final_conv(x)
class BDDM(DiffusionPipeline): class BDDMPipeline(DiffusionPipeline):
def __init__(self, diffwave, noise_scheduler): def __init__(self, diffwave, noise_scheduler):
super().__init__() super().__init__()
noise_scheduler = noise_scheduler.set_format("pt") noise_scheduler = noise_scheduler.set_format("pt")

View File

@ -21,7 +21,7 @@ import tqdm
from ..pipeline_utils import DiffusionPipeline from ..pipeline_utils import DiffusionPipeline
class DDIM(DiffusionPipeline): class DDIMPipeline(DiffusionPipeline):
def __init__(self, unet, noise_scheduler): def __init__(self, unet, noise_scheduler):
super().__init__() super().__init__()
noise_scheduler = noise_scheduler.set_format("pt") noise_scheduler = noise_scheduler.set_format("pt")

View File

@ -21,7 +21,7 @@ import tqdm
from ..pipeline_utils import DiffusionPipeline from ..pipeline_utils import DiffusionPipeline
class DDPM(DiffusionPipeline): class DDPMPipeline(DiffusionPipeline):
def __init__(self, unet, noise_scheduler): def __init__(self, unet, noise_scheduler):
super().__init__() super().__init__()
noise_scheduler = noise_scheduler.set_format("pt") noise_scheduler = noise_scheduler.set_format("pt")

View File

@ -711,7 +711,7 @@ def _extract_into_tensor(arr, timesteps, broadcast_shape):
return res + torch.zeros(broadcast_shape, device=timesteps.device) return res + torch.zeros(broadcast_shape, device=timesteps.device)
class Glide(DiffusionPipeline): class GlidePipeline(DiffusionPipeline):
def __init__( def __init__(
self, self,
text_unet: GlideTextToImageUNetModel, text_unet: GlideTextToImageUNetModel,

View File

@ -420,7 +420,7 @@ class TextEncoder(ModelMixin, ConfigMixin):
return mu, logw, x_mask return mu, logw, x_mask
class GradTTS(DiffusionPipeline): class GradTTSPipeline(DiffusionPipeline):
def __init__(self, unet, text_encoder, noise_scheduler, tokenizer): def __init__(self, unet, text_encoder, noise_scheduler, tokenizer):
super().__init__() super().__init__()
noise_scheduler = noise_scheduler.set_format("pt") noise_scheduler = noise_scheduler.set_format("pt")
@ -430,7 +430,14 @@ class GradTTS(DiffusionPipeline):
@torch.no_grad() @torch.no_grad()
def __call__( def __call__(
self, text, num_inference_steps=50, temperature=1.3, length_scale=0.91, speaker_id=15, torch_device=None self,
text,
num_inference_steps=50,
temperature=1.3,
length_scale=0.91,
speaker_id=15,
torch_device=None,
generator=None,
): ):
if torch_device is None: if torch_device is None:
torch_device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") torch_device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
@ -464,7 +471,7 @@ class GradTTS(DiffusionPipeline):
mu_y = mu_y.transpose(1, 2) mu_y = mu_y.transpose(1, 2)
# Sample latent representation from terminal distribution N(mu_y, I) # Sample latent representation from terminal distribution N(mu_y, I)
z = mu_y + torch.randn_like(mu_y, device=mu_y.device) / temperature z = mu_y + torch.randn(mu_y.shape, device=mu_y.device, generator=generator) / temperature
xt = z * y_mask xt = z * y_mask
h = 1.0 / num_inference_steps h = 1.0 / num_inference_steps

View File

@ -860,7 +860,7 @@ class AutoencoderKL(ModelMixin, ConfigMixin):
return dec, posterior return dec, posterior
class LatentDiffusion(DiffusionPipeline): class LatentDiffusionPipeline(DiffusionPipeline):
def __init__(self, vqvae, bert, tokenizer, unet, noise_scheduler): def __init__(self, vqvae, bert, tokenizer, unet, noise_scheduler):
super().__init__() super().__init__()
noise_scheduler = noise_scheduler.set_format("pt") noise_scheduler = noise_scheduler.set_format("pt")

View File

@ -21,7 +21,7 @@ import tqdm
from ..pipeline_utils import DiffusionPipeline from ..pipeline_utils import DiffusionPipeline
class PNDM(DiffusionPipeline): class PNDMPipeline(DiffusionPipeline):
def __init__(self, unet, noise_scheduler): def __init__(self, unet, noise_scheduler):
super().__init__() super().__init__()
noise_scheduler = noise_scheduler.set_format("pt") noise_scheduler = noise_scheduler.set_format("pt")

View File

@ -22,17 +22,17 @@ import numpy as np
import torch import torch
from diffusers import ( from diffusers import (
BDDM, BDDMPipeline,
DDIM, DDIMPipeline,
DDPM,
Glide,
PNDM,
DDIMScheduler, DDIMScheduler,
DDPMPipeline,
DDPMScheduler, DDPMScheduler,
GlidePipeline,
GlideSuperResUNetModel, GlideSuperResUNetModel,
GlideTextToImageUNetModel, GlideTextToImageUNetModel,
GradTTS, GradTTSPipeline,
LatentDiffusion, LatentDiffusionPipeline,
PNDMPipeline,
PNDMScheduler, PNDMScheduler,
UNetGradTTSModel, UNetGradTTSModel,
UNetLDMModel, UNetLDMModel,
@ -583,11 +583,11 @@ class PipelineTesterMixin(unittest.TestCase):
model = UNetModel(ch=32, ch_mult=(1, 2), num_res_blocks=2, attn_resolutions=(16,), resolution=32) model = UNetModel(ch=32, ch_mult=(1, 2), num_res_blocks=2, attn_resolutions=(16,), resolution=32)
schedular = DDPMScheduler(timesteps=10) schedular = DDPMScheduler(timesteps=10)
ddpm = DDPM(model, schedular) ddpm = DDPMPipeline(model, schedular)
with tempfile.TemporaryDirectory() as tmpdirname: with tempfile.TemporaryDirectory() as tmpdirname:
ddpm.save_pretrained(tmpdirname) ddpm.save_pretrained(tmpdirname)
new_ddpm = DDPM.from_pretrained(tmpdirname) new_ddpm = DDPMPipeline.from_pretrained(tmpdirname)
generator = torch.manual_seed(0) generator = torch.manual_seed(0)
@ -601,7 +601,7 @@ class PipelineTesterMixin(unittest.TestCase):
def test_from_pretrained_hub(self): def test_from_pretrained_hub(self):
model_path = "fusing/ddpm-cifar10" model_path = "fusing/ddpm-cifar10"
ddpm = DDPM.from_pretrained(model_path) ddpm = DDPMPipeline.from_pretrained(model_path)
ddpm_from_hub = DiffusionPipeline.from_pretrained(model_path) ddpm_from_hub = DiffusionPipeline.from_pretrained(model_path)
ddpm.noise_scheduler.num_timesteps = 10 ddpm.noise_scheduler.num_timesteps = 10
@ -624,7 +624,7 @@ class PipelineTesterMixin(unittest.TestCase):
noise_scheduler = DDPMScheduler.from_config(model_id) noise_scheduler = DDPMScheduler.from_config(model_id)
noise_scheduler = noise_scheduler.set_format("pt") noise_scheduler = noise_scheduler.set_format("pt")
ddpm = DDPM(unet=unet, noise_scheduler=noise_scheduler) ddpm = DDPMPipeline(unet=unet, noise_scheduler=noise_scheduler)
image = ddpm(generator=generator) image = ddpm(generator=generator)
image_slice = image[0, -1, -3:, -3:].cpu() image_slice = image[0, -1, -3:, -3:].cpu()
@ -641,7 +641,7 @@ class PipelineTesterMixin(unittest.TestCase):
unet = UNetModel.from_pretrained(model_id) unet = UNetModel.from_pretrained(model_id)
noise_scheduler = DDIMScheduler(tensor_format="pt") noise_scheduler = DDIMScheduler(tensor_format="pt")
ddim = DDIM(unet=unet, noise_scheduler=noise_scheduler) ddim = DDIMPipeline(unet=unet, noise_scheduler=noise_scheduler)
image = ddim(generator=generator, eta=0.0) image = ddim(generator=generator, eta=0.0)
image_slice = image[0, -1, -3:, -3:].cpu() image_slice = image[0, -1, -3:, -3:].cpu()
@ -660,7 +660,7 @@ class PipelineTesterMixin(unittest.TestCase):
unet = UNetModel.from_pretrained(model_id) unet = UNetModel.from_pretrained(model_id)
noise_scheduler = PNDMScheduler(tensor_format="pt") noise_scheduler = PNDMScheduler(tensor_format="pt")
pndm = PNDM(unet=unet, noise_scheduler=noise_scheduler) pndm = PNDMPipeline(unet=unet, noise_scheduler=noise_scheduler)
image = pndm(generator=generator) image = pndm(generator=generator)
image_slice = image[0, -1, -3:, -3:].cpu() image_slice = image[0, -1, -3:, -3:].cpu()
@ -674,7 +674,7 @@ class PipelineTesterMixin(unittest.TestCase):
@slow @slow
def test_ldm_text2img(self): def test_ldm_text2img(self):
model_id = "fusing/latent-diffusion-text2im-large" model_id = "fusing/latent-diffusion-text2im-large"
ldm = LatentDiffusion.from_pretrained(model_id) ldm = LatentDiffusionPipeline.from_pretrained(model_id)
prompt = "A painting of a squirrel eating a burger" prompt = "A painting of a squirrel eating a burger"
generator = torch.manual_seed(0) generator = torch.manual_seed(0)
@ -689,7 +689,7 @@ class PipelineTesterMixin(unittest.TestCase):
@slow @slow
def test_glide_text2img(self): def test_glide_text2img(self):
model_id = "fusing/glide-base" model_id = "fusing/glide-base"
glide = Glide.from_pretrained(model_id) glide = GlidePipeline.from_pretrained(model_id)
prompt = "a pencil sketch of a corgi" prompt = "a pencil sketch of a corgi"
generator = torch.manual_seed(0) generator = torch.manual_seed(0)
@ -704,22 +704,25 @@ class PipelineTesterMixin(unittest.TestCase):
@slow @slow
def test_grad_tts(self): def test_grad_tts(self):
model_id = "fusing/grad-tts-libri-tts" model_id = "fusing/grad-tts-libri-tts"
grad_tts = GradTTS.from_pretrained(model_id) grad_tts = GradTTSPipeline.from_pretrained(model_id)
text = "Hello world, I missed you so much." text = "Hello world, I missed you so much."
generator = torch.manual_seed(0)
# generate mel spectrograms using text # generate mel spectrograms using text
mel_spec = grad_tts(text) mel_spec = grad_tts(text, generator=generator)
assert mel_spec.shape == (1, 256, 256, 3) assert mel_spec.shape == (1, 80, 143)
expected_slice = torch.tensor([0.7119, 0.7073, 0.6460, 0.7780, 0.7423, 0.6926, 0.7378, 0.7189, 0.7784]) expected_slice = torch.tensor(
assert (mel_spec.flatten() - expected_slice).abs().max() < 1e-2 [-6.6119, -6.5963, -6.2776, -6.7496, -6.7096, -6.5131, -6.4643, -6.4817, -6.7185]
)
assert (mel_spec[0, :3, :3].flatten() - expected_slice).abs().max() < 1e-2
def test_module_from_pipeline(self): def test_module_from_pipeline(self):
model = DiffWave(num_res_layers=4) model = DiffWave(num_res_layers=4)
noise_scheduler = DDPMScheduler(timesteps=12) noise_scheduler = DDPMScheduler(timesteps=12)
bddm = BDDM(model, noise_scheduler) bddm = BDDMPipeline(model, noise_scheduler)
# check if the library name for the diffwave module is set to pipeline module # check if the library name for the diffwave module is set to pipeline module
self.assertTrue(bddm.config["diffwave"][0] == "pipeline_bddm") self.assertTrue(bddm.config["diffwave"][0] == "pipeline_bddm")
@ -727,6 +730,6 @@ class PipelineTesterMixin(unittest.TestCase):
# check if we can save and load the pipeline # check if we can save and load the pipeline
with tempfile.TemporaryDirectory() as tmpdirname: with tempfile.TemporaryDirectory() as tmpdirname:
bddm.save_pretrained(tmpdirname) bddm.save_pretrained(tmpdirname)
_ = BDDM.from_pretrained(tmpdirname) _ = BDDMPipeline.from_pretrained(tmpdirname)
# check if the same works using the DiffusionPipeline class # check if the same works using the DiffusionPipeline class
_ = DiffusionPipeline.from_pretrained(tmpdirname) _ = DiffusionPipeline.from_pretrained(tmpdirname)