diff --git a/components/about/CaptionedImage.tsx b/components/about/CaptionedImage.tsx new file mode 100644 index 0000000..e072f21 --- /dev/null +++ b/components/about/CaptionedImage.tsx @@ -0,0 +1,25 @@ +import Image from "next/image"; + +interface CaptionedImageProps { + image_url: string; + caption: string; + marginLeft?: number; +} + +export default function CaptionedImage({ + image_url, + caption, + marginLeft = 16, +}: CaptionedImageProps) { + return ( +
+

{caption}

+ {caption} +
+ ); +} diff --git a/pages/about.tsx b/pages/about.tsx new file mode 100644 index 0000000..9e4194a --- /dev/null +++ b/pages/about.tsx @@ -0,0 +1,217 @@ +import Image from "next/image"; + +import PageHead from "../components/PageHead"; +import CaptionedImage from "../components/about/CaptionedImage"; + +import funkySaxImg from "../public/about/funky_sax.png"; +import funkySaxToPianoImg from "../public/about/funky_sax_to_piano.png"; +import handDrawnSpectrogramImg from "../public/about/hand_drawn_spectrogram.png"; +import fourierTransformImg from "../public/about/fourier_transform.png"; +import spectrogramLabelImg from "../public/about/spectrogram_label.png"; + +export default function Home() { + return ( + <> + + +
+
+

Riffusion

+

+ (verb): riff + diffusion +

+

+ You’ve heard of{" "} + + Stable Diffusion + + , the open-source AI model that generates images from text? +

+ +

+ Well, we fine-tuned the model to generate images of spectrograms, + like this: +

+ +

+ The magic is that this spectrogram can then be converted to audio: +

+
+ +
+

🔥🔥🔥😱

+

+ Really? Yup. +

+

+ This is the v1.5 stable diffusion model with no modifications, just + fine-tuned on images of spectrograms. Audio processing happens + downstream of the model. +

+

+ It can generate infinite variations of a prompt by varying the seed. + All the same web UIs and techniques like img2img, inpainting, + negative prompts, and interpolation work out of the box. +

+

Spectrograms

+

+ An audio{" "} + spectrogram{" "} + is a visual way to represent the frequency content of a sound clip. + The x-axis represents time, and the y-axis represents frequency. The + color of each pixel gives the amplitude of the audio at the + frequency and time given by its row and column. +

+ {"spectrogram +

+ The spectrogram can be computed from audio using the{" "} + + Short-time Fourier transform + {" "} + (STFT), which approximates the audio as a combination of sine waves + of varying amplitudes and phases. +

+ {"fourier +

+ The STFT is invertible, so the original audio can be reconstructed + from a spectrogram. However, the spectrogram images from our model + only contain the amplitude of the sine waves and not the phases, + because the phases are chaotic and hard to learn. Instead, we use + the{" "} + + Griffin-Lim + {" "} + algorithm to approximate the phase when reconstructing the audio + clip. +

+

+ The frequency bins in our spectrogram use the{" "} + Mel scale, + which is a perceptual scale of pitches judged by listeners to be + equal in distance from one another. +

+

+ Below is a hand-drawn image interpreted as a spectrogram and + converted to audio. Play it back to get an intuitive sense of how + they work. Note how you can hear the pitches of the two curves on + the bottom half, and how the four vertical lines at the top make + beats similar to a hi-hat sound. +

+ {"hand +
+ +
+

+ We use{" "} + + Torchaudio + + , which has excellent modules for efficient audio processing on the + GPU. Check out our audio processing code{" "} + + here + + . +

+

Image-to-Image

+

+ With diffusion models, it is possible to condition their creations + not only on a text prompt but also on other images. This is + incredibly useful for modifying sounds while preserving the + structure of an original clip you like. A denoising strength + parameter trades off between sounding similar to the original and + adapting to the new prompt. +

+

+ For example, here is a modification of that funky sax solo to crank + up the piano: +

+
+
+ + +
+ +
+
+
+ + +
+ +
+
+
+

+ And here’s an example that adapts a rock and roll solo to an + acoustic folk fiddle: +

+

TODO(hayk): This is as far as I got.

+
+
+ + +
+ +
+
+
+ + +
+ +
+
+
+
+
+ + ); +} diff --git a/pages/api/hello.js b/pages/api/hello.js deleted file mode 100644 index 13a03c2..0000000 --- a/pages/api/hello.js +++ /dev/null @@ -1,4 +0,0 @@ -// Next.js API route support: https://nextjs.org/docs/api-routes/introduction -export default function handler(req, res) { - res.status(200).json({ name: 'John Doe' }); -} diff --git a/public/about/astronaut.gif b/public/about/astronaut.gif new file mode 100644 index 0000000..ce127f3 Binary files /dev/null and b/public/about/astronaut.gif differ diff --git a/public/about/fourier_transform.png b/public/about/fourier_transform.png new file mode 100644 index 0000000..2a1d5af Binary files /dev/null and b/public/about/fourier_transform.png differ diff --git a/public/about/funky_sax.gif b/public/about/funky_sax.gif new file mode 100644 index 0000000..0fe4370 Binary files /dev/null and b/public/about/funky_sax.gif differ diff --git a/public/about/funky_sax.mp3 b/public/about/funky_sax.mp3 new file mode 100644 index 0000000..3c11ab9 Binary files /dev/null and b/public/about/funky_sax.mp3 differ diff --git a/public/about/funky_sax.png b/public/about/funky_sax.png new file mode 100644 index 0000000..b3b9bbd Binary files /dev/null and b/public/about/funky_sax.png differ diff --git a/public/about/funky_sax_to_piano.mp3 b/public/about/funky_sax_to_piano.mp3 new file mode 100644 index 0000000..c961dd4 Binary files /dev/null and b/public/about/funky_sax_to_piano.mp3 differ diff --git a/public/about/funky_sax_to_piano.png b/public/about/funky_sax_to_piano.png new file mode 100644 index 0000000..e2e3e84 Binary files /dev/null and b/public/about/funky_sax_to_piano.png differ diff --git a/public/about/hand_drawn.mp3 b/public/about/hand_drawn.mp3 new file mode 100644 index 0000000..50cb304 Binary files /dev/null and b/public/about/hand_drawn.mp3 differ diff --git a/public/about/hand_drawn_spectrogram.png b/public/about/hand_drawn_spectrogram.png new file mode 100644 index 0000000..25330ba Binary files /dev/null and 
b/public/about/hand_drawn_spectrogram.png differ diff --git a/public/about/spectrogram_label.png b/public/about/spectrogram_label.png new file mode 100644 index 0000000..cb7773c Binary files /dev/null and b/public/about/spectrogram_label.png differ diff --git a/styles/globals.css b/styles/globals.css index b5c61c9..c2310dd 100644 --- a/styles/globals.css +++ b/styles/globals.css @@ -1,3 +1,7 @@ @tailwind base; @tailwind components; @tailwind utilities; + +a { + @apply text-gray-700 underline +}