2022-12-26 18:15:05 -07:00
|
|
|
"""
|
|
|
|
Audio utility functions.
|
|
|
|
"""
|
|
|
|
|
|
|
|
import io
|
2023-01-14 12:31:33 -07:00
|
|
|
import typing as T
|
2022-12-26 18:15:05 -07:00
|
|
|
|
|
|
|
import numpy as np
|
|
|
|
import pydub
|
|
|
|
from scipy.io import wavfile
|
|
|
|
|
|
|
|
|
|
|
|
def audio_from_waveform(
    samples: np.ndarray, sample_rate: int, normalize: bool = False
) -> pydub.AudioSegment:
    """
    Convert a numpy array of samples of a waveform to an audio segment.

    The samples are cast to int16, written to an in-memory WAV file, and read
    back through pydub. The caller's array is never modified.

    Args:
        samples: (channels, samples) array
        sample_rate: sample rate handed to the WAV writer
        normalize: if True, rescale so the largest absolute sample maps to the
            int16 maximum before the cast

    Returns:
        The audio segment pydub reads from the in-memory WAV data.
    """
    # Normalize volume to fit in int16
    if normalize:
        peak = np.max(np.abs(samples))
        # An all-zero waveform has no peak to scale to; skipping avoids a
        # divide-by-zero that would fill the array with NaN.
        if peak > 0:
            # Out-of-place multiply: leaves the caller's array untouched and
            # also works when the input has an integer dtype.
            samples = samples * (np.iinfo(np.int16).max / peak)

    # Transpose to (samples, channels) and convert to int16
    samples = samples.transpose(1, 0)
    samples = samples.astype(np.int16)

    # Write to the bytes of a WAV file
    wav_bytes = io.BytesIO()
    wavfile.write(wav_bytes, sample_rate, samples)
    # Rewind so pydub reads from the start of the buffer
    wav_bytes.seek(0)

    # Read into pydub
    return pydub.AudioSegment.from_wav(wav_bytes)
|
|
|
|
|
|
|
|
|
2022-12-27 08:44:39 -07:00
|
|
|
def apply_filters(segment: pydub.AudioSegment, compression: bool = False) -> pydub.AudioSegment:
    """
    Apply post-processing filters to the audio segment to level it and,
    optionally, compress its dynamic range.

    The segment is gained so that its dBFS reading is -12 and is then
    normalized with 0.1 dB of headroom. With `compression` enabled it is first
    normalized, gained to -10 dBFS and run through a dynamic range compressor.

    Args:
        segment: the audio segment to process
        compression: if True, run the normalize / gain / compressor stage
            before the final leveling

    Returns:
        The processed segment. A segment whose dBFS is -inf is returned unchanged.
    """
    # TODO(hayk): Come up with a principled strategy for these filters and experiment end-to-end.
    # TODO(hayk): Is this going to make audio unbalanced between sequential clips?

    # pydub reports dBFS as -inf for a segment with zero RMS (silent or empty).
    # The target-level gains below would then be +inf, so there is nothing
    # meaningful to level; hand the segment back untouched.
    if segment.dBFS == float("-inf"):
        return segment

    if compression:
        segment = pydub.effects.normalize(
            segment,
            headroom=0.1,
        )

        # Bring the level to -10 dBFS before compressing
        segment = segment.apply_gain(-10 - segment.dBFS)

        # TODO(hayk): This is quite slow, ~1.7 seconds on a beefy CPU
        segment = pydub.effects.compress_dynamic_range(
            segment,
            threshold=-20.0,
            ratio=4.0,
            attack=5.0,
            release=50.0,
        )

    # Gain by the difference between the target and the current level
    desired_db = -12
    segment = segment.apply_gain(desired_db - segment.dBFS)

    segment = pydub.effects.normalize(
        segment,
        headroom=0.1,
    )

    return segment
|
2023-01-14 12:31:33 -07:00
|
|
|
|
|
|
|
|
|
|
|
def stitch_segments(
    segments: T.Sequence[pydub.AudioSegment], crossfade_s: float
) -> pydub.AudioSegment:
    """
    Join audio segments end to end, crossfading at every junction.

    Args:
        segments: segments to join, in playback order; the first element is
            read by index, so an empty sequence raises IndexError
        crossfade_s: crossfade duration in seconds; converted to whole
            milliseconds (truncated) before being passed to pydub

    Returns:
        The segment produced by successively appending each segment to the
        running result.
    """
    # pydub takes the crossfade in milliseconds
    fade_ms = int(crossfade_s * 1000)

    head, tail = segments[0], segments[1:]

    stitched = head
    for piece in tail:
        stitched = stitched.append(piece, crossfade=fade_ms)

    return stitched
|
2023-01-14 14:59:36 -07:00
|
|
|
|
|
|
|
|
|
|
|
def overlay_segments(segments: T.Sequence[pydub.AudioSegment]) -> pydub.AudioSegment:
    """
    Overlay a sequence of audio segments on top of each other.

    The first segment is the base; every later segment is overlaid, in order,
    onto the running result with `AudioSegment.overlay`.

    Args:
        segments: non-empty sequence of segments to mix together

    Returns:
        The result of overlaying all segments onto the first one. With a
        single segment, that segment is returned as is.

    Raises:
        ValueError: if `segments` is empty
    """
    # Explicit check instead of `assert`: asserts are stripped under
    # `python -O`, which would let empty input fall through and return None.
    if len(segments) == 0:
        raise ValueError("overlay_segments requires at least one segment")

    remaining = iter(segments)

    # Seed with the first segment, then layer the rest on top of it
    output = next(remaining)
    for segment in remaining:
        output = output.overlay(segment)

    return output
|