import * as React from "react";
import { Footer, ManySamples, Section, PaperSummary } from "./components";
import { useAsDefaultTheme } from "../../providers/Theme";
import * as samples from "./phonologicalFeatures";
import { Paragraph } from "../../components/Typography";
import { useGA } from "../../hooks/analytics/useGA";

/** Paper title; passed as the `title` prop of the paper summary below. */
const TITLE = "Phonological Features for 0-shot Multilingual Speech Synthesis";

/**
 * Author byline; passed as the `authors` prop of the paper summary below.
 *
 * Built from a list of names so that the separator is always exactly ", "
 * (the previous hand-written literal contained a stray double space).
 */
const AUTHORS = [
  "Marlene Staib",
  "Tian Huey Teh",
  "Alexandra Torresquintero",
  "Devang S Ram Mohan",
  "Lorenzo Foglianti",
  "Raphael Lenain",
  "Jiameng Gao"
].join(", ");
/**
 * Props of {@link PhonologicalFeaturesPaperSummary}.
 *
 * Both values are forwarded untouched to `PaperSummary`, so their types are
 * derived from that component instead of being re-declared (or left as `any`).
 * They are optional because the paper page in this file renders the summary
 * without passing either of them.
 */
type PhonologicalFeaturesPaperSummaryProps = Partial<
  Pick<React.ComponentProps<typeof PaperSummary>, "samplesLink" | "paperLink">
>;

/**
 * Summary card for the "Phonological Features" paper: renders `PaperSummary`
 * with this paper's title, authors and abstract as children.
 *
 * @param samplesLink - forwarded as `PaperSummary`'s `samplesLink` prop.
 * @param paperLink - forwarded as `PaperSummary`'s `paperLink` prop.
 *
 * NOTE(review): the abstract uses `&#9473;` (U+2501, a box-drawing character)
 * where an em dash (`&mdash;`) would be the typographic choice. Left unchanged
 * because swapping it alters the rendered glyph; confirm with design.
 */
export const PhonologicalFeaturesPaperSummary = ({
  samplesLink,
  paperLink
}: PhonologicalFeaturesPaperSummaryProps) => (
  <PaperSummary
    title={TITLE}
    authors={AUTHORS}
    samplesLink={samplesLink}
    paperLink={paperLink}
  >
    Code-switching &#9473; the intra-utterance use of multiple languages &#9473;
    is prevalent across the world. Within text-to-speech (TTS), multilingual
    models have been found to enable code-switching [
    <a href="https://arxiv.org/pdf/1907.04448.pdf">1</a>,
    <a href="https://storage.googleapis.com/pub-tools-public-publication-data/pdf/c00f67e71e0462c7572d3d8ca11a6bb2caf2a3b3.pdf">
      2
    </a>
    ,
    <a href="https://storage.googleapis.com/pub-tools-public-publication-data/pdf/528457d0e1af5e841b28223f19bc4468589c1320.pdf">
      3
    </a>
    ]. By modifying the linguistic input to sequence-to-sequence TTS, we show
    that code-switching is possible for languages unseen during training, even
    within monolingual models. We use a small set of phonological features
    derived from the International Phonetic Alphabet (IPA), such as vowel height
    and frontness, consonant place and manner. This allows the model topology to
    stay unchanged for different languages, and enables new, previously unseen
    feature combinations to be interpreted by the model. We show that this
    allows us to generate intelligible, code-switched speech in a new language
    at test time, including the approximation of sounds never seen in training.
  </PaperSummary>
);

/**
 * Full page for the "Phonological Features" paper.
 *
 * Calls `useAsDefaultTheme("light")` and `useGA()` on render, then lays out the
 * paper summary followed by three sample sections (0-shot German from the
 * Spanish-English models, 0-shot German from the English-only models, and
 * English with code-switching into German), a footer, and a global
 * styled-jsx rule for `.highlight`.
 *
 * NOTE(review): `.highlight` is not used by any element in this component; the
 * rule is presumably consumed by child components, which is why it is global.
 */
export const PhonologicalFeaturesPaper: React.FunctionComponent = () => {
  useAsDefaultTheme("light");
  useGA();

  return (
    <div className="mw8 ph4 center mt4">
      <PhonologicalFeaturesPaperSummary />

      <Section title="0-shot German from multilingual Spanish-English models">
        <Paragraph>
          The following samples are produced with the multilingual
          Spanish-English models, trained on a combination of the{" "}
          <a href="https://homepages.inf.ed.ac.uk/jyamagis/page3/page58/page58.html">
            CSTR VCTK
          </a>
          -corpus and our proprietary, Mexican Spanish Adrianex-corpus (referred
          to as the MIX data condition in the paper). Sentences are chosen from
          randomly sampled Wikipedia articles in German. 0-shot methods for
          producing unseen phonemes include 1) phonological features, or AUTO
          (our proposed method), and 2 baselines: 2) a manual, linguistic
          mapping to previously seen phonemes (MAPPED), and 3) a new, randomly
          initialized embedding (RANDOM).
        </Paragraph>
        {/*
          NOTE(review): this section describes the MIX data condition but
          renders `VCTK_SAMPLES`, while the VCTK section below renders
          `ZERO_SHOT`. Possibly just naming in the samples module; verify
          against ./phonologicalFeatures.
        */}
        <ManySamples samples={samples.VCTK_SAMPLES} />
      </Section>

      <Section title="0-shot German from monolingual English models">
        <Paragraph>
          The following samples in German are produced with the monolingual
          English models, trained on the{" "}
          <a href="https://homepages.inf.ed.ac.uk/jyamagis/page3/page58/page58.html">
            CSTR VCTK
          </a>
          -corpus (referred to as the VCTK data condition in the paper).
          Sentences and 0-shot methods are as above. Note the collapse of [R] to
          [g] in AUTO due to the absence of trills in the phoneme set used for
          English, for example in the sentence{" "}
          <span className="i">
            "Die Blütezeit (r)eicht von Juni bis September"
          </span>
          .
        </Paragraph>
        <ManySamples samples={samples.ZERO_SHOT} />
      </Section>

      <Section title="English samples with code-switching into German">
        <Paragraph>
          The following samples in English demonstrate an application of 0-shot
          modelling of a new language (here: German), where individual words or
          phrases, such as foreign names of people, institutions or places, are
          part of the English sentence (code-switching). Code-switching,
          especially <i>into</i> English, is common in many languages, and these
          samples are only meant to serve as a small demonstration of what can
          be achieved with the method we presented. Pronunciations for foreign
          words can be easily approximated by using a target language dictionary
          in combination with either phonological features (AUTO) or an expert
          mapping from unseen to seen phonemes (MAPPED). Words containing unseen
          phonemes are usually mispronounced or completely skipped when new
          phonemes are represented with a random embedding (RANDOM).
        </Paragraph>
        <ManySamples
          title="Multilingual Spanish-English model"
          samples={samples.MIX_CODE_SWITCH}
        />
      </Section>

      <Footer />
      {/*
        Opacity is written as a plain number (0.5) rather than a percentage:
        both mean the same, but the number form is understood by every browser
        and CSS tool, whereas the percentage form is a newer syntax.
      */}
      <style global jsx>
        {`
          .highlight:before {
            content: "";
            border-radius: 2px;
            width: 100%;
            height: 5px;
            opacity: 0.5;
            background: rgb(252, 78, 54);
            position: absolute;
            bottom: 0px;
          }
        `}
      </style>
    </div>
  );
};
