Audio splitting with demucs hybrid transformer model

Topic: audio_splitter_transformer
2023-01-07 20:36:43 +00:00 · 2023-01-07 20:36:43 +00:00 · 8e87c133c8
parent f8595d7b29
commit 8e87c133c8
3 changed files with 78 additions and 3 deletions
--- a/requirements.txt
+++ b/requirements.txt
@ -1,6 +1,7 @@
 accelerate
 argh
 dacite
+demucs
 diffusers>=0.9.0
 flask
 flask_cors
--- a/riffusion/audio_splitter.py
+++ b/riffusion/audio_splitter.py
@ -1,4 +1,8 @@
+import shutil
+import subprocess
+import tempfile
 import typing as T
+from pathlib import Path

 import numpy as np
 import pydub
@ -9,10 +13,65 @@ from torchaudio.transforms import Fade
 from riffusion.util import audio_util


+def split_audio(
+    segment: pydub.AudioSegment,
+    model_name: str = "htdemucs_6s",
+    extension: str = "wav",
+    jobs: int = 4,
+    device: str = "cuda",
+) -> T.Dict[str, pydub.AudioSegment]:
+    """
+    Split audio into stems using demucs.
+    """
+    tmp_dir = Path(tempfile.mkdtemp(prefix="split_audio_"))
+
+    # Save the audio to a temporary file
+    audio_path = tmp_dir / "audio.mp3"
+    segment.export(audio_path, format="mp3")
+
+    # Assemble command
+    command = [
+        "demucs",
+        str(audio_path),
+        "--name",
+        model_name,
+        "--out",
+        str(tmp_dir),
+        "--jobs",
+        str(jobs),
+        "--device",
+        device if device != "mps" else "cpu",
+    ]
+    print(" ".join(command))
+
+    if extension == "mp3":
+        command.append("--mp3")
+
+    # Run demucs
+    subprocess.run(
+        command,
+        check=True,
+    )
+
+    # Load the stems
+    stems = {}
+    for stem_path in tmp_dir.glob(f"{model_name}/audio/*.{extension}"):
+        stem = pydub.AudioSegment.from_file(stem_path)
+        stems[stem_path.stem] = stem
+
+    # Delete tmp dir
+    shutil.rmtree(tmp_dir)
+
+    return stems
+
+
 class AudioSplitter:
    """
    Split audio into instrument stems like {drums, bass, vocals, etc.}

+    NOTE(hayk): This is deprecated as it has inferior performance to the newer hybrid transformer
+    model in the demucs repo. See the function above. Probably just delete this.
+
    See:
        https://pytorch.org/audio/main/tutorials/hybrid_demucs_tutorial.html
    """
--- a/riffusion/streamlit/pages/split_audio.py
+++ b/riffusion/streamlit/pages/split_audio.py
@ -2,6 +2,7 @@ import io

 import streamlit as st

+from riffusion.audio_splitter import split_audio
 from riffusion.streamlit import util as streamlit_util


@ -32,11 +33,13 @@ def render_split_audio() -> None:

    audio_file = st.file_uploader(
        "Upload audio",
-        type=["mp3", "m4a", "ogg", "wav", "flac"],
+        type=["mp3", "m4a", "ogg", "wav", "flac", "webm"],
        label_visibility="collapsed",
    )

-    splitter = streamlit_util.get_audio_splitter(device=device)
+    recombine = st.sidebar.checkbox(
+        "Recombine", value=False, help="Show recombined audio at the end for comparison"
+    )

    if not audio_file:
        st.info("Upload audio to get started")
@ -51,7 +54,7 @@ def render_split_audio() -> None:
    segment = streamlit_util.load_audio_file(audio_file)

    # Split
-    stems = splitter.split(segment)
+    stems = split_audio(segment, device=device)

    # Display each
    for name, stem in stems.items():
@ -60,6 +63,18 @@ def render_split_audio() -> None:
        stem.export(audio_bytes, format="mp3")
        st.audio(audio_bytes)

+    if recombine:
+        stems_list = list(stems.values())
+        recombined = stems_list[0]
+        for stem in stems_list[1:]:
+            recombined = recombined.overlay(stem)
+
+        # Display
+        st.write("#### recombined")
+        audio_bytes = io.BytesIO()
+        recombined.export(audio_bytes, format="mp3")
+        st.audio(audio_bytes)
+

 if __name__ == "__main__":
    render_split_audio()