2022-12-26 22:03:30 -07:00
|
|
|
import typing as T
|
2022-12-26 21:01:27 -07:00
|
|
|
|
|
|
|
import streamlit as st
|
|
|
|
|
|
|
|
from riffusion.spectrogram_params import SpectrogramParams
|
|
|
|
from riffusion.streamlit import util as streamlit_util
|
|
|
|
|
|
|
|
|
|
|
|
def render_text_to_audio() -> None:
    """
    Render the "Text to Audio" Streamlit page.

    Reads a prompt, a negative prompt and the sampling settings from the page
    widgets, generates a spectrogram image via streamlit_util.run_txt2img, shows
    the image, and then converts it to mp3 audio that is embedded in the page.
    If the prompt box is empty, an info message is shown and nothing is generated.
    """
    # Page configuration is issued ahead of every other Streamlit call here.
    st.set_page_config(layout="wide", page_icon="🎸")

    st.subheader(":pencil2: Text to Audio")
    st.write(
        """
    Generate audio from text prompts.
    """
    )

    # Collapsed-by-default explanation of what this mode does.
    with st.expander("Help", False):
        st.write(
            """
            This tool runs riffusion in the simplest text to image form to generate an audio
            clip from a text prompt. There is no seed image or interpolation here. This mode
            allows more diversity and creativity than when using a seed image, but it also
            leads to having less control. Play with the seed to get infinite variations.
            """
        )

    device = streamlit_util.select_device(st.sidebar)

    # Free-text inputs live in the main column.
    prompt = st.text_input("Prompt")
    negative_prompt = st.text_input("Negative prompt")

    # Numeric sampling settings live in an expanded sidebar section.
    with st.sidebar.expander("Text to Audio Params", expanded=True):
        seed = T.cast(int, st.number_input("Seed", value=42))
        num_inference_steps = T.cast(int, st.number_input("Inference steps", value=50))
        width = T.cast(int, st.number_input("Width", value=512))
        guidance = st.number_input(
            "Guidance",
            value=7.0,
            help="How much the model listens to the text prompt",
        )

    # Without a prompt there is nothing to generate; ask for one and stop.
    if not prompt:
        st.info("Enter a prompt")
        return

    # Only the width is user-adjustable; the height is fixed at 512.
    spectrogram_image = streamlit_util.run_txt2img(
        prompt=prompt,
        num_inference_steps=num_inference_steps,
        guidance=guidance,
        negative_prompt=negative_prompt,
        seed=seed,
        width=width,
        height=512,
        device=device,
    )
    st.image(spectrogram_image)

    # TODO(hayk): Change the frequency range to [20, 20k] once the model is retrained
    mp3_bytes = streamlit_util.audio_bytes_from_spectrogram_image(
        image=spectrogram_image,
        params=SpectrogramParams(
            min_frequency=0,
            max_frequency=10000,
        ),
        device=device,
        output_format="mp3",
    )
    st.audio(mp3_bytes)
|
2022-12-26 21:01:27 -07:00
|
|
|
|
|
|
|
|
|
|
|
# Script entry point: render the page only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    render_text_to_audio()
|