Text to audio supports multiple clips

Topic: text_to_audio_multiple_riffs
2023-01-09 00:58:14 +00:00 · 2023-01-09 00:58:14 +00:00 · ca72f418b6
parent a4784bb4dc
commit ca72f418b6
1 changed file with 51 additions and 24 deletions
--- a/riffusion/streamlit/pages/text_to_audio.py
+++ b/riffusion/streamlit/pages/text_to_audio.py
@@ -28,11 +28,33 @@ def render_text_to_audio() -> None:
    device = streamlit_util.select_device(st.sidebar)
-    prompt = st.text_input("Prompt")
+    with st.form("Inputs"):
-    negative_prompt = st.text_input("Negative prompt")
+        prompt = st.text_input("Prompt")
        negative_prompt = st.text_input("Negative prompt")
-    with st.sidebar.expander("Text to Audio Params", expanded=True):
+        row = st.columns(4)
-        seed = T.cast(int, st.number_input("Seed", value=42))
+        num_clips = T.cast(
            int,
            row[0].number_input(
                "Number of clips",
                value=1,
                min_value=1,
                max_value=25,
                help="How many outputs to generate (seed gets incremented)",
            ),
        )
        starting_seed = T.cast(
            int,
            row[1].number_input(
                "Seed",
                value=42,
                help="Change this to generate different variations",
            ),
        )
        st.form_submit_button("Riff", type="primary")
    with st.sidebar:
        num_inference_steps = T.cast(int, st.number_input("Inference steps", value=50))
        width = T.cast(int, st.number_input("Width", value=512))
        guidance = st.number_input(
@@ -43,32 +65,37 @@ def render_text_to_audio() -> None:
        st.info("Enter a prompt")
        return
    image = streamlit_util.run_txt2img(
        prompt=prompt,
        num_inference_steps=num_inference_steps,
        guidance=guidance,
        negative_prompt=negative_prompt,
        seed=seed,
        width=width,
        height=512,
        device=device,
    )
    st.image(image)
    # TODO(hayk): Change the frequency range to [20, 20k] once the model is retrained
    params = SpectrogramParams(
        min_frequency=0,
        max_frequency=10000,
    )
-    audio_bytes = streamlit_util.audio_bytes_from_spectrogram_image(
+    seed = starting_seed
-        image=image,
+    for i in range(1, num_clips + 1):
-        params=params,
+        st.write(f"#### Riff {i} / {num_clips} - Seed {seed}")
-        device=device,
+
-        output_format="mp3",
+        image = streamlit_util.run_txt2img(
-    )
+            prompt=prompt,
-    st.audio(audio_bytes)
+            num_inference_steps=num_inference_steps,
            guidance=guidance,
            negative_prompt=negative_prompt,
            seed=seed,
            width=width,
            height=512,
            device=device,
        )
        st.image(image)
        audio_bytes = streamlit_util.audio_bytes_from_spectrogram_image(
            image=image,
            params=params,
            device=device,
            output_format="mp3",
        )
        st.audio(audio_bytes)
        seed += 1
 if __name__ == "__main__":