Text to audio supports multiple clips
Topic: text_to_audio_multiple_riffs
This commit is contained in:
parent a4784bb4dc
commit ca72f418b6
|
@ -28,11 +28,33 @@ def render_text_to_audio() -> None:
|
||||||
|
|
||||||
device = streamlit_util.select_device(st.sidebar)
|
device = streamlit_util.select_device(st.sidebar)
|
||||||
|
|
||||||
prompt = st.text_input("Prompt")
|
with st.form("Inputs"):
|
||||||
negative_prompt = st.text_input("Negative prompt")
|
prompt = st.text_input("Prompt")
|
||||||
|
negative_prompt = st.text_input("Negative prompt")
|
||||||
|
|
||||||
with st.sidebar.expander("Text to Audio Params", expanded=True):
|
row = st.columns(4)
|
||||||
seed = T.cast(int, st.number_input("Seed", value=42))
|
num_clips = T.cast(
|
||||||
|
int,
|
||||||
|
row[0].number_input(
|
||||||
|
"Number of clips",
|
||||||
|
value=1,
|
||||||
|
min_value=1,
|
||||||
|
max_value=25,
|
||||||
|
help="How many outputs to generate (seed gets incremented)",
|
||||||
|
),
|
||||||
|
)
|
||||||
|
starting_seed = T.cast(
|
||||||
|
int,
|
||||||
|
row[1].number_input(
|
||||||
|
"Seed",
|
||||||
|
value=42,
|
||||||
|
help="Change this to generate different variations",
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
|
st.form_submit_button("Riff", type="primary")
|
||||||
|
|
||||||
|
with st.sidebar:
|
||||||
num_inference_steps = T.cast(int, st.number_input("Inference steps", value=50))
|
num_inference_steps = T.cast(int, st.number_input("Inference steps", value=50))
|
||||||
width = T.cast(int, st.number_input("Width", value=512))
|
width = T.cast(int, st.number_input("Width", value=512))
|
||||||
guidance = st.number_input(
|
guidance = st.number_input(
|
||||||
|
@ -43,32 +65,37 @@ def render_text_to_audio() -> None:
|
||||||
st.info("Enter a prompt")
|
st.info("Enter a prompt")
|
||||||
return
|
return
|
||||||
|
|
||||||
image = streamlit_util.run_txt2img(
|
|
||||||
prompt=prompt,
|
|
||||||
num_inference_steps=num_inference_steps,
|
|
||||||
guidance=guidance,
|
|
||||||
negative_prompt=negative_prompt,
|
|
||||||
seed=seed,
|
|
||||||
width=width,
|
|
||||||
height=512,
|
|
||||||
device=device,
|
|
||||||
)
|
|
||||||
|
|
||||||
st.image(image)
|
|
||||||
|
|
||||||
# TODO(hayk): Change the frequency range to [20, 20k] once the model is retrained
|
# TODO(hayk): Change the frequency range to [20, 20k] once the model is retrained
|
||||||
params = SpectrogramParams(
|
params = SpectrogramParams(
|
||||||
min_frequency=0,
|
min_frequency=0,
|
||||||
max_frequency=10000,
|
max_frequency=10000,
|
||||||
)
|
)
|
||||||
|
|
||||||
audio_bytes = streamlit_util.audio_bytes_from_spectrogram_image(
|
seed = starting_seed
|
||||||
image=image,
|
for i in range(1, num_clips + 1):
|
||||||
params=params,
|
st.write(f"#### Riff {i} / {num_clips} - Seed {seed}")
|
||||||
device=device,
|
|
||||||
output_format="mp3",
|
image = streamlit_util.run_txt2img(
|
||||||
)
|
prompt=prompt,
|
||||||
st.audio(audio_bytes)
|
num_inference_steps=num_inference_steps,
|
||||||
|
guidance=guidance,
|
||||||
|
negative_prompt=negative_prompt,
|
||||||
|
seed=seed,
|
||||||
|
width=width,
|
||||||
|
height=512,
|
||||||
|
device=device,
|
||||||
|
)
|
||||||
|
st.image(image)
|
||||||
|
|
||||||
|
audio_bytes = streamlit_util.audio_bytes_from_spectrogram_image(
|
||||||
|
image=image,
|
||||||
|
params=params,
|
||||||
|
device=device,
|
||||||
|
output_format="mp3",
|
||||||
|
)
|
||||||
|
st.audio(audio_bytes)
|
||||||
|
|
||||||
|
seed += 1
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
Loading…
Reference in New Issue