diff --git a/README.md b/README.md index 67cb31a..853d619 100644 --- a/README.md +++ b/README.md @@ -108,9 +108,9 @@ Execute: python -m riffusion.cli image-to-audio --image spectrogram_image.png --audio clip.wav ``` -## Streamlit playground +## Riffusion Playground -Riffusion also has a streamlit app for interactive use and exploration. +Riffusion also has a [streamlit](https://streamlit.io/) app for interactive use and exploration. This app is called the Riffusion Playground. Run with: diff --git a/riffusion/server.py b/riffusion/server.py index 42ad7b7..aad1ca0 100644 --- a/riffusion/server.py +++ b/riffusion/server.py @@ -155,8 +155,9 @@ def compute_request( ) # Reconstruct audio from the image - # TODO(hayk): It may help performance to cache this object + # TODO(hayk): It may help performance a bit to cache this object converter = SpectrogramImageConverter(params=params, device=str(pipeline.device)) + segment = converter.audio_from_spectrogram_image( image, apply_filters=True, diff --git a/riffusion/spectrogram_converter.py b/riffusion/spectrogram_converter.py index a1c5e00..9fbfc65 100644 --- a/riffusion/spectrogram_converter.py +++ b/riffusion/spectrogram_converter.py @@ -155,7 +155,10 @@ class SpectrogramConverter: # Optionally apply post-processing filters if apply_filters: - segment = audio_util.apply_filters(segment) + segment = audio_util.apply_filters( + segment, + compression=False, + ) return segment diff --git a/riffusion/util/audio_util.py b/riffusion/util/audio_util.py index 251b5b8..aa02533 100644 --- a/riffusion/util/audio_util.py +++ b/riffusion/util/audio_util.py @@ -32,7 +32,7 @@ def audio_from_waveform( return pydub.AudioSegment.from_wav(wav_bytes) -def apply_filters(segment: pydub.AudioSegment) -> pydub.AudioSegment: +def apply_filters(segment: pydub.AudioSegment, compression: bool = False) -> pydub.AudioSegment: """ Apply post-processing filters to the audio segment to compress it and keep at a -10 dBFS level. @@ -40,20 +40,22 @@ def apply_filters(segment: pydub.AudioSegment) -> pydub.AudioSegment: # TODO(hayk): Come up with a principled strategy for these filters and experiment end-to-end. # TODO(hayk): Is this going to make audio unbalanced between sequential clips? - segment = pydub.effects.normalize( - segment, - headroom=0.1, - ) + if compression: + segment = pydub.effects.normalize( + segment, + headroom=0.1, + ) - segment = segment.apply_gain(-10 - segment.dBFS) + segment = segment.apply_gain(-10 - segment.dBFS) - segment = pydub.effects.compress_dynamic_range( - segment, - threshold=-20.0, - ratio=4.0, - attack=5.0, - release=50.0, - ) + # TODO(hayk): This is quite slow, ~1.7 seconds on a beefy CPU + segment = pydub.effects.compress_dynamic_range( + segment, + threshold=-20.0, + ratio=4.0, + attack=5.0, + release=50.0, + ) desired_db = -12 segment = segment.apply_gain(desired_db - segment.dBFS)