Text to audio supports multiple clips

Topic: text_to_audio_multiple_riffs
This commit is contained in:
Hayk Martiros 2023-01-09 00:58:14 +00:00
parent a4784bb4dc
commit ca72f418b6
1 changed file with 51 additions and 24 deletions

View File

@ -28,11 +28,33 @@ def render_text_to_audio() -> None:
device = streamlit_util.select_device(st.sidebar)
prompt = st.text_input("Prompt") with st.form("Inputs"):
negative_prompt = st.text_input("Negative prompt") prompt = st.text_input("Prompt")
negative_prompt = st.text_input("Negative prompt")
with st.sidebar.expander("Text to Audio Params", expanded=True): row = st.columns(4)
seed = T.cast(int, st.number_input("Seed", value=42)) num_clips = T.cast(
int,
row[0].number_input(
"Number of clips",
value=1,
min_value=1,
max_value=25,
help="How many outputs to generate (seed gets incremented)",
),
)
starting_seed = T.cast(
int,
row[1].number_input(
"Seed",
value=42,
help="Change this to generate different variations",
),
)
st.form_submit_button("Riff", type="primary")
with st.sidebar:
num_inference_steps = T.cast(int, st.number_input("Inference steps", value=50))
width = T.cast(int, st.number_input("Width", value=512))
guidance = st.number_input(
@ -43,32 +65,37 @@ def render_text_to_audio() -> None:
st.info("Enter a prompt")
return
image = streamlit_util.run_txt2img(
prompt=prompt,
num_inference_steps=num_inference_steps,
guidance=guidance,
negative_prompt=negative_prompt,
seed=seed,
width=width,
height=512,
device=device,
)
st.image(image)
# TODO(hayk): Change the frequency range to [20, 20k] once the model is retrained
params = SpectrogramParams(
    min_frequency=0,
    max_frequency=10000,
)
audio_bytes = streamlit_util.audio_bytes_from_spectrogram_image( seed = starting_seed
image=image, for i in range(1, num_clips + 1):
params=params, st.write(f"#### Riff {i} / {num_clips} - Seed {seed}")
device=device,
output_format="mp3", image = streamlit_util.run_txt2img(
) prompt=prompt,
st.audio(audio_bytes) num_inference_steps=num_inference_steps,
guidance=guidance,
negative_prompt=negative_prompt,
seed=seed,
width=width,
height=512,
device=device,
)
st.image(image)
audio_bytes = streamlit_util.audio_bytes_from_spectrogram_image(
image=image,
params=params,
device=device,
output_format="mp3",
)
st.audio(audio_bytes)
seed += 1
if __name__ == "__main__":