You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

278 lines
8.5 KiB
Clojure

(ns com.owoga.phonetics
(:require [clojure.set]
[clojure.string :as string]
[clojure.java.io :as io]
[clojure.set :as set]
[clojure.math.combinatorics :as combinatorics])
(:import (com.sun.speech.freetts.en.us CMULexicon)))
#_(set! *warn-on-reflection* true)
;; From http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b.phones
(def phonemap
{"T" "stop",
"CH" "affricate",
"K" "stop",
"HH" "aspirate",
"UH" "vowel",
"AY" "vowel",
"AH" "vowel",
"OW" "vowel",
"L" "liquid",
"JH" "affricate",
"UW" "vowel",
"G" "stop",
"EH" "vowel",
"M" "nasal",
"OY" "vowel",
"S" "fricative",
"Y" "semivowel",
"EY" "vowel",
"Z" "fricative",
"R" "liquid",
"F" "fricative",
"AW" "vowel",
"IY" "vowel",
"B" "stop",
"SH" "fricative",
"P" "stop",
"V" "fricative",
"TH" "fricative",
"IH" "vowel",
"AA" "vowel",
"AO" "vowel",
"N" "nasal",
"DH" "fricative",
"W" "semivowel",
"ZH" "fricative",
"NG" "nasal",
"D" "stop",
"ER" "vowel",
"AE" "vowel"})
(def long-vowel #{"EY" "IY" "AY" "OW" "UW"})
(def short-vowel #{"AA" "AE" "AH" "AO" "AW" "EH" "ER" "IH" "OY" "UH"})
(def vowel (clojure.set/union long-vowel short-vowel))
(def consonant (clojure.set/difference (into #{} (keys phonemap)) vowel))
(def syllable-end (clojure.set/union consonant long-vowel))
(def single-sound-bigram #{"TH" "SH" "PH" "WH" "CH"})
(def cmu-word-to-stressed-phones-map
"Map of lowercase English words to their phonetic sounding based on
the CMU Pronouncing Dictionary at http://www.speech.cs.cmu.edu/cgi-bin/cmudict/
Includes words with apostrophes, like possessive aaronson's.
Words with multiple pronunciations have keys with a `(1)` or `(2)` after their
duplicates, like [aaronsons(1) (AA1 R AH0 N S AH0 N Z)]
Primary stress is indicated by a `1` after the phoneme. Secondary stress with a `2`.
Unstressed with a `0`."
(->> "cmudict-0.7b"
io/resource
io/reader
line-seq
(drop-while #(= \; (first %)))
(map #(string/split % #"\s+"))
(map (partial split-at 1))
(map #(vector
(string/lower-case
(first (first %)))
(vec (second %))))
(into {})))
(def cmu-word-alternatives
"For words with multiple pronunciations in the CMU dictionary,
this maps from the word to its variations.
reputed -> reputed, reputed(1), reputed(2).
Not particularly useful itself since reputed(1) doesn't tell you how it's
different from reputed. But it's useful to look up the pronunciations in the
CMU dictionary."
(reduce
(fn [m k]
(let [norm-key (string/replace k #"\(\d\)" "")]
(update m norm-key (fnil (comp sort conj) []) k)))
{}
(keys cmu-word-to-stressed-phones-map)))
(defn word-alternatives
"For words with multiple pronunciations in the CMU dictionary,
this maps from the word to its variations.
reputed -> reputed, reputed(1), reputed(2).
Not particularly useful itself since reputed(1) doesn't tell you how it's
different from reputed. But it's useful to look up the pronunciations in the
CMU dictionary."
[word]
(get cmu-word-alternatives word))
(def stressed-phones-to-cmu-word-map
"The same sequence of phones can map to multiple words."
(reduce
(fn [m [k v]]
(update m v (fnil conj []) k))
{}
cmu-word-to-stressed-phones-map))
(def cmu-word-to-unstressed-phones-map
(->> cmu-word-to-stressed-phones-map
(mapv (fn [[k v]] [k (mapv #(string/replace % #"\d" "") v)]))
(into {})))
(def unstressed-phones-to-cmu-word-map
"There might be unstressed phones that can map
to two different pronunciations when stress is added,
so this maps unstressed phones to a vector of words that
can be looked up in the CMU Pronouncing dictionary to
see what their stressed phones are.
Another example, look at how many words map to [N IY S].
[[N IY S]
[neice neece niece nice kneece kniess neiss neace niess]]"
(reduce
(fn [m [k v]]
(let [v (map #(string/replace % #"\d" "") v)]
(update m v (fnil conj []) k)))
{}
cmu-word-to-stressed-phones-map))
(CMULexicon. "cmulex" true)
(def ^CMULexicon cmu-lexicon
"The CMULexicon can get phones for words that aren't in the
CMU Pronouncing Dictionary. But the phones are slightly different.
The `AH` sound, as in `allow`, is returned as `ax` from the CMULexicon.
Also, unstressed vowels don't have a `0` suffix. Instead, the CMULexicon
just returns unstressed vowels as the vowel itself with no suffix.
The above is important to note if you want clean interplay between these
two different ways of getting phonemes."
(CMULexicon/getInstance true))
;; This sonority hierarchy may not be perfect.
;; It stems from: http://www.glottopedia.org/index.php/Sonority_hierarchy
;; I tried to match the phones provided by the CMU dict to the hierarchies
;; listed on that page:
;; vowels > liquids > nasals > voiced fricatives
;; > voiceless fricatives = voiced plosives
;; > voiceless plosives (Anderson & Ewen 1987)
(def ^clojure.lang.PersistentVector sonority-hierarchy
;; more sonorous < < < vowel < < < (maximal onset) vowel > > > less sonorous
["vowel" "liquid" "semivowel" "aspirate" "affricate" "nasal" "fricative" "stop"])
(def lax-vowels #{"EH" "IH" "AE" "AH" "UH"})
(defn sonority [phone]
(.indexOf sonority-hierarchy (phonemap phone)))
(defn vowel? [phone]
(vowel (string/replace phone #"\d" "")))
(def consonant? (complement vowel?))
(defn >sonorous [a b]
(< (sonority a) (sonority b)))
(defn <sonorous [a b]
(> (sonority a) (sonority b)))
(defn remove-stress [phonemes]
(mapv #(string/replace % #"\d" "") phonemes))
(defn cmu-lexicon->cmu-pronouncing-dict
"The CMULexicon returns the `AH` sound, as in `allow`, as `ax`.
The Sphinx dictionary treates that sound as `AH`. This
converts `ax` to `AH`. It also adds `0` to phonemes that are
unstressed, which CMULexicon returns as the plain phoneme with
no stress marker."
[phonemes]
(mapv
(fn [phoneme]
(->> phoneme
((fn [^String phoneme]
(if (.equals phoneme "ax")
"ah"
phoneme)))
string/upper-case
(#(if (vowel %) (str % "0") %))))
phonemes))
(comment
(type (.getPhones cmu-lexicon "allow" nil)) ;; => [Ljava.lang.String;
(vec (.getPhones cmu-lexicon "allow" nil)) ;; => ["ax" "l" "aw1"]
(cmu-lexicon->cmu-pronouncing-dict
(.getPhones cmu-lexicon "allowance" nil))
;; => ["AH0" "L" "AW1" "AH0" "N" "S"]
(cmu-word-to-stressed-phones-map "allowance")
;; => ["AH0" "L" "AW1" "AH0" "N" "S"]
)
(defn get-phones
"Tries to get phones first from the CMU Pronouncing Dictionary
and falls back to the CMULexicon if the word doesn't exist in
the dictionary.
Input must be lower-case.
Returns a vector of all possible pronunciations."
[word]
(let [cmu-phones (mapv cmu-word-to-stressed-phones-map (word-alternatives word))]
(if (seq cmu-phones)
cmu-phones
[(cmu-lexicon->cmu-pronouncing-dict
(.getPhones cmu-lexicon word nil))])))
(defn get-word
"Returns vector of all words that are in the CMU pronouncing dictionary
that have the pronunciation given `phones`.
Expects phones to have stress removed.
Not an exact inverse of `get-phones` since `get-phones` can figure out
somewhat appropriate phones for a made-up word. This function cannot
figure out the spelling of a made-up word provided the made-up word's phones.
Returns nil if no word can be found."
[phones]
(let [stressed? (some #(re-matches #".*\d" %) phones)]
(if stressed?
(stressed-phones-to-cmu-word-map phones)
(unstressed-phones-to-cmu-word-map phones))))
(defn phrase-phones
"Pronunciations of a words seperated by spaces."
[phrase]
(->> phrase
(#(string/split % #" "))
(map get-phones)
(apply combinatorics/cartesian-product)
(mapv (partial reduce into []))))
(comment
(get-phones "alaska")
;; => [["AH0" "L" "AE1" "S" "K" "AH0"]]
(syllabify (first (get-phones "alaska")))
;; => [["AH0"] ["L" "AE1" "S"] ["K" "AH0"]]
(syllabify (first (get-phones "foobarbazia")))
;; => [["F" "UW1"] ["B" "AA1" "R"] ["B" "AA1"] ["Z" "IY0"] ["AH0"]]
(get-word ["AH" "L" "AE" "S" "K" "AH"])
;; => ["alaska"]
(get-word ["N" "IY" "S"])
;; => ["neice" "neece" "niece" "nice(1)" "kneece" "kniess" "neiss" "neace" "niess"]
(get-word ["F" "UW" "B" "AE" "Z"])
;; => nil
(phrase-phones "bog hog")
;; [["B" "AA1" "G" "HH" "AA1" "G"]
;; ["B" "AO1" "G" "HH" "AA1" "G"]]
)