Add first half of about page

2022-11-27 18:23:53 -08:00 · 2022-11-27 18:23:53 -08:00 · 97077ae5c9
parent e51c7d3129
commit 97077ae5c9
14 changed files with 246 additions and 4 deletions
--- a/components/about/CaptionedImage.tsx
+++ b/components/about/CaptionedImage.tsx
@ -0,0 +1,25 @@
+import Image from "next/image";
+
+interface CaptionedImageProps {
+  image_url: string; // forwarded to next/image as `src`
+  caption: string; // rendered above the image and reused as its alt text
+  marginLeft?: number; // left margin in Tailwind spacing units (default 16)
+}
+
+/** Renders a bold caption above a 300x300 image, indented from the left. */
+export default function CaptionedImage({
+  image_url,
+  caption,
+  marginLeft = 16,
+}: CaptionedImageProps) {
+  // Tailwind only emits classes that appear as complete strings in the source,
+  // so an interpolated `ml-${marginLeft}` is not reliably generated. Apply the
+  // margin inline instead (assumes the default scale of 0.25rem per unit).
+  const indent = { marginLeft: `${marginLeft * 0.25}rem` };
+  return (
+    <div className="m-5" style={indent}>
+      <p className="font-bold pb-2 text-base">{caption}</p>
+      <Image src={image_url} width={300} height={300} alt={caption} />
+    </div>
+  );
+}
--- a/pages/about.tsx
+++ b/pages/about.tsx
@ -0,0 +1,217 @@
+import Image from "next/image";
+
+import PageHead from "../components/PageHead";
+import CaptionedImage from "../components/about/CaptionedImage";
+
+import funkySaxImg from "../public/about/funky_sax.png";
+import funkySaxToPianoImg from "../public/about/funky_sax_to_piano.png";
+import handDrawnSpectrogramImg from "../public/about/hand_drawn_spectrogram.png";
+import fourierTransformImg from "../public/about/fourier_transform.png";
+import spectrogramLabelImg from "../public/about/spectrogram_label.png";
+
+/**
+ * About page: explains how Riffusion fine-tunes Stable Diffusion to produce
+ * spectrogram images and turns them into audio. All content is static markup.
+ */
+export default function Home() {
+  return (
+    <>
+      <PageHead />
+
+      <main className="bg-white flex flex-row text-black place-content-center">
+        <div className="w-3/4 md:w-2/3 lg:w-1/2 text-lg">
+          <h1 className="pt-20 pb-1 text-5xl font-bold">Riffusion</h1>
+          <h3 className="font-medium italic text-xl pb-10">
+            (verb): riff + diffusion
+          </h3>
+          <p>
+            You’ve heard of{" "}
+            <a href="https://en.wikipedia.org/wiki/Stable_Diffusion">
+              Stable Diffusion
+            </a>
+            , the open-source AI model that generates images from text?
+          </p>
+          <CaptionedImage
+            image_url="/about/astronaut.gif"
+            caption="photograph of an astronaut riding a horse"
+          />
+          <p>
+            Well, we fine-tuned the model to generate images of spectrograms,
+            like this:
+          </p>
+          <CaptionedImage
+            image_url="/about/funky_sax.gif"
+            caption="funk bassline with a jazzy saxophone solo"
+          />
+          <p>
+            The magic is that this spectrogram can then be converted to audio:
+          </p>
+          <div className="m-5 ml-16">
+            <audio controls src="/about/funky_sax.mp3" className="w-1/2">
+              Your browser does not support audio.
+            </audio>
+          </div>
+          <p className="text-4xl mb-2">🔥🔥🔥😱</p>
+          <p>
+            <b>Really? </b> Yup.
+          </p>
+          <p className="mt-3">
+            This is the v1.5 Stable Diffusion model with no modifications, just
+            fine-tuned on images of spectrograms. Audio processing happens
+            downstream of the model.
+          </p>
+          <p className="mt-3">
+            It can generate infinite variations of a prompt by varying the seed.
+            All the same web UIs and techniques like img2img, inpainting,
+            negative prompts, and interpolation work out of the box.
+          </p>
+          <h2 className="pt-10 pb-5 text-3xl font-bold">Spectrograms</h2>
+          <p>
+            An audio{" "}
+            <a href="https://en.wikipedia.org/wiki/Spectrogram">spectrogram</a>{" "}
+            is a visual way to represent the frequency content of a sound clip.
+            The x-axis represents time, and the y-axis represents frequency. The
+            color of each pixel gives the amplitude of the audio at the
+            frequency and time given by its row and column.
+          </p>
+          <Image
+            className="ml-16 m-5 w-2/3"
+            src={spectrogramLabelImg}
+            alt="spectrogram with axes labeled"
+          />
+          <p>
+            The spectrogram can be computed from audio using the{" "}
+            <a href="https://en.wikipedia.org/wiki/Short-time_Fourier_transform">
+              Short-time Fourier transform
+            </a>{" "}
+            (STFT), which approximates the audio as a combination of sine waves
+            of varying amplitudes and phases.
+          </p>
+          <Image
+            className="ml-24 m-5 w-1/2"
+            src={fourierTransformImg}
+            alt="fourier transform explanation"
+          />
+          <p>
+            The STFT is invertible, so the original audio can be reconstructed
+            from a spectrogram. However, the spectrogram images from our model
+            only contain the amplitude of the sine waves and not the phases,
+            because the phases are chaotic and hard to learn. Instead, we use
+            the{" "}
+            <a href="https://ieeexplore.ieee.org/document/1164317">
+              Griffin-Lim
+            </a>{" "}
+            algorithm to approximate the phase when reconstructing the audio
+            clip.
+          </p>
+          <p>
+            The frequency bins in our spectrogram use the{" "}
+            <a href="https://en.wikipedia.org/wiki/Mel_scale">Mel scale</a>,
+            which is a perceptual scale of pitches judged by listeners to be
+            equal in distance from one another.
+          </p>
+          <p className="mt-3">
+            Below is a hand-drawn image interpreted as a spectrogram and
+            converted to audio. Play it back to get an intuitive sense of how
+            they work. Note how you can hear the pitches of the two curves on
+            the bottom half, and how the four vertical lines at the top make
+            beats similar to a hi-hat sound.
+          </p>
+          <Image
+            className="ml-20 m-5"
+            src={handDrawnSpectrogramImg}
+            width={300}
+            alt="hand drawn spectrogram"
+          />
+          <div className="m-5 ml-16">
+            <audio controls src="/about/hand_drawn.mp3" className="w-1/2">
+              Your browser does not support audio.
+            </audio>
+          </div>
+          <p>
+            We use{" "}
+            <a href="https://pytorch.org/audio/stable/transforms.html">
+              Torchaudio
+            </a>
+            , which has excellent modules for efficient audio processing on the
+            GPU. Check out our audio processing code{" "}
+            <a href="https://github.com/hmartiro/riffusion-inference/blob/main/riffusion/audio.py">
+              here
+            </a>
+            .
+          </p>
+          <h2 className="pt-10 pb-5 text-3xl font-bold">Image-to-Image</h2>
+          <p>
+            With diffusion models, it is possible to condition their creations
+            not only on a text prompt but also on other images. This is
+            incredibly useful for modifying sounds while preserving the
+            structure of an original clip you like. A denoising strength
+            parameter trades off between sounding similar to the original and
+            adapting to the new prompt.
+          </p>
+          <p className="mt-3">
+            For example, here is a modification of that funky sax solo to crank
+            up the piano:
+          </p>
+          <div className="grid grid-cols-2 gap-3">
+            <div>
+              <CaptionedImage
+                image_url="/about/funky_sax.png"
+                caption="funk bassline with a jazzy saxophone solo"
+                marginLeft={5}
+              />
+              <div className="m-4">
+                <audio controls src="/about/funky_sax.mp3">
+                  Your browser does not support audio.
+                </audio>
+              </div>
+            </div>
+            <div className="text-red text-xl">
+              <CaptionedImage
+                image_url="/about/funky_sax_to_piano.png"
+                caption="piano funk"
+                marginLeft={5}
+              />
+              <div className="m-4">
+                <audio controls src="/about/funky_sax_to_piano.mp3">
+                  Your browser does not support audio.
+                </audio>
+              </div>
+            </div>
+          </div>
+          <p>
+            And here’s an example that adapts a rock and roll solo to an
+            acoustic folk fiddle:
+          </p>
+          <p className="text-4xl">TODO(hayk): This is as far as I got.</p>
+          <div className="grid grid-cols-2 gap-3">
+            <div>
+              <CaptionedImage
+                image_url="/about/funky_sax.png"
+                caption="funk bassline with a jazzy saxophone solo"
+                marginLeft={5}
+              />
+              <div className="m-4">
+                <audio controls src="/about/funky_sax.mp3">
+                  Your browser does not support audio.
+                </audio>
+              </div>
+            </div>
+            <div className="text-red text-xl">
+              <CaptionedImage
+                image_url="/about/funky_sax_to_piano.png"
+                caption="piano funk"
+                marginLeft={5}
+              />
+              <div className="m-4">
+                <audio controls src="/about/funky_sax_to_piano.mp3">
+                  Your browser does not support audio.
+                </audio>
+              </div>
+            </div>
+          </div>
+        </div>
+      </main>
+    </>
+  );
+}
--- a/pages/api/hello.js
+++ b/pages/api/hello.js
@ -1,4 +0,0 @@
-// Next.js API route support: https://nextjs.org/docs/api-routes/introduction
-export default function handler(req, res) {
-  res.status(200).json({ name: 'John Doe' });
-}
--- a/public/about/astronaut.gif
+++ b/public/about/astronaut.gif
--- a/public/about/fourier_transform.png
+++ b/public/about/fourier_transform.png
--- a/public/about/funky_sax.gif
+++ b/public/about/funky_sax.gif
--- a/public/about/funky_sax.mp3
+++ b/public/about/funky_sax.mp3
--- a/public/about/funky_sax.png
+++ b/public/about/funky_sax.png
--- a/public/about/funky_sax_to_piano.mp3
+++ b/public/about/funky_sax_to_piano.mp3
--- a/public/about/funky_sax_to_piano.png
+++ b/public/about/funky_sax_to_piano.png
--- a/public/about/hand_drawn.mp3
+++ b/public/about/hand_drawn.mp3
--- a/public/about/hand_drawn_spectrogram.png
+++ b/public/about/hand_drawn_spectrogram.png
--- a/public/about/spectrogram_label.png
+++ b/public/about/spectrogram_label.png
--- a/styles/globals.css
+++ b/styles/globals.css
@ -1,3 +1,7 @@
@tailwind base;
@tailwind components;
@tailwind utilities;
+
+a {
+    /* Base look for every <a> element: gray, underlined text. */ @apply text-gray-700 underline;
+}