import * as React from "react";
import {
  Footer,
  ManySamplesTCESS,
  Section,
  PaperSummary,
  ManySamplesTemporalControl
} from "./components";
import { useAsDefaultTheme } from "../../providers/Theme";
import * as samples from "./tcess";
import { H4, Paragraph } from "../../components/Typography";
import { useGA } from "../../hooks/analytics/useGA";

// Title of the Ctrl-P paper; passed as the `title` prop of PaperSummary below.
const TITLE =
  "Ctrl-P: Temporal Control of Prosodic Variation for Speech Synthesis";
// Author list of the paper; passed as the `authors` prop of PaperSummary below.
// A trailing `*` on a name marks equal contribution (see the note at the end
// of the string).
const AUTHORS =
  "Devang S Ram Mohan*, Vivian Hu*, Tian Huey Teh*, Alexandra Torresquintero, Christopher Wallis, Marlene Staib, Lorenzo Foglianti, Jiameng Gao, Simon King. (*: equal contribution)";
/**
 * Props accepted by {@link TCESSPaperSummary}.
 *
 * Both links are forwarded untouched to `PaperSummary`, so their types are
 * derived from that component instead of being redeclared here. They are
 * optional because the summary is also rendered without any links.
 */
type TCESSPaperSummaryProps = Partial<
  Pick<React.ComponentProps<typeof PaperSummary>, "samplesLink" | "paperLink">
>;

/**
 * Summary card for the Ctrl-P paper: renders `PaperSummary` with the fixed
 * title and author list and the paper's abstract as children.
 *
 * @param samplesLink - Forwarded as-is to `PaperSummary`; presumably the link
 *   to the audio samples page. May be omitted.
 * @param paperLink - Forwarded as-is to `PaperSummary`; presumably the link to
 *   the paper itself. May be omitted.
 *
 * NOTE(review): the styled-jsx block below is declared `global`, so the
 * `.formula` and bare `sub` rules are not scoped to this component — confirm
 * that restyling every `<sub>` on the page is intended.
 */
export const TCESSPaperSummary = ({
  samplesLink,
  paperLink
}: TCESSPaperSummaryProps) => (
  <PaperSummary
    title={TITLE}
    authors={AUTHORS}
    samplesLink={samplesLink}
    paperLink={paperLink}
  >
    Text does not fully specify the spoken form, so text-to-speech models must
    be able to learn from speech data that vary in ways not explained by the
    corresponding text. One way to reduce the amount of unexplained variation in
    training data is to provide acoustic information as an additional learning
    signal. When generating speech, modifying this acoustic information enables
    multiple distinct renditions of a text to be produced.
    <br />
    <br />
    Since much of the unexplained variation is in the prosody, we propose a
    model that generates speech explicitly conditioned on the three primary
    acoustic correlates of prosody: <span className="formula">F</span>
    <sub>0</sub>, energy and duration. The model is flexible about how the
    values of these features are specified: they can be externally provided, or
    predicted from text, or predicted then subsequently modified.
    <br />
    <br />
    Compared to a model that employs a variational auto-encoder to learn
    unsupervised latent features, our model provides more interpretable,
    temporally-precise, and disentangled control. When automatically predicting
    the acoustic features from text, it generates speech that is more natural
    than that from a Tacotron 2 model with reference encoder. Subsequent
    human-in-the-loop modification of the predicted acoustic features can
    significantly further increase naturalness.
    <style global jsx>
      {`
        .formula {
          font-family: Georgia !important;
          font-weight: 700;
          font-style: italic;
        }
        sub {
          font-family: Georgia;
          font-weight: 500;
          vertical-align: sub;
          font-size: smaller;
        }
      `}
    </style>
  </PaperSummary>
);

/**
 * Samples page for the Ctrl-P paper.
 *
 * Renders, in order: the paper summary (without links), a "Disentanglement"
 * section comparing Ctrl-P against the T-VAE baseline on two sentences, a
 * "Temporal Controllability" section, and the shared footer.
 */
export const TCESSPaper: React.FunctionComponent = () => {
  // Hooks are called unconditionally and in a fixed order on every render.
  useAsDefaultTheme("light");
  useGA();

  // Column titles shared by both sentences in the disentanglement section.
  const ctrlPTitle = "Ctrl-P (our model)";
  const baselineTitle = "T-VAE Baseline";

  // Sample sets exported by the ./tcess module.
  const {
    TCESS_SAMPLES,
    TVAE_SAMPLES,
    TCESS_SAMPLES2,
    TVAE_SAMPLES2,
    TEMPORAL_CONTROL
  } = samples;

  return (
    <div className="mw8 ph4 center mt4">
      <TCESSPaperSummary />
      <Section title="Disentanglement">
        <Paragraph>
          For each utterance, we shift the entire contour of an individual input
          feature/latent dimension (holding the others fixed) by a multiple of
          the speaker-specific standard deviation for that dimension.
        </Paragraph>

        <H4>Sentence 1</H4>
        <ManySamplesTCESS samples={TCESS_SAMPLES} title={ctrlPTitle} />
        <ManySamplesTCESS samples={TVAE_SAMPLES} title={baselineTitle} />
        <H4>Sentence 2</H4>
        <ManySamplesTCESS samples={TCESS_SAMPLES2} title={ctrlPTitle} />
        <ManySamplesTCESS samples={TVAE_SAMPLES2} title={baselineTitle} />
      </Section>
      <Section title="Temporal Controllability">
        <Paragraph>
          To demonstrate the ability of our model to provide fine-grained
          temporal control, we synthesise the same sentence but modify the
          acoustic features corresponding to specific phones to elicit
          semantically distinct renditions.
        </Paragraph>

        <ManySamplesTemporalControl samples={TEMPORAL_CONTROL} />
      </Section>

      <Footer />
    </div>
  );
};
