Initial commit

6 years ago · 5ce14fefe2
commit 5ce14fefe2
18 changed files with 547395 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,6 @@
 .DS_Store
 .idea
 *.log
 tmp/
 .cpcache
 .nrepl-port
--- a/README.rst
+++ b/README.rst
@ -0,0 +1,83 @@
 ==============
 Articulation
 ==============
 Terminology and types of rhymes
 -------------------------------
 1. HAT - CAT
 2. HAT - HALF
 3. HAT - PACK
 The first of those examples clearly rhymes by anyone's definition of "rhyme". The first sound of the syllable known as the "onset", differs. The vowel sound, known as the "nuclei", and the final consonant sound, known as the "coda", are the same.
 The second example might not technically rhyme, but it can still be useful. The "onset", the "H" sound, and the "nuclei", the "AE" sound, are the same in both HAT and HALF. But they differ in their "coda".
 The third example is even less of a proper rhyme, but again it can be useful. The only matching sound is the "nuclei", the "AE" sound.
 Words with multiple syllables give us even more options.
 What is more important: to find the fewest words that rhyme any number of syllables (STUPIFIED - DIGNIFIED), or to find the fewest words that rhyme the greatest number of onsets/nuclei/codas (STUPIFIED - SCOOBY DIED)?
 1. STUPIFIED - SCOOBY DIED
 1. STUPIFIED - GROOVY FINE
 1. STUPIFIED - DIGNIFIED
 1. STUPIFIED - PRIDE
 Program Output
 --------------
 Perfect rhymes
 DOG     -> [ [ [FOG COG HOG ...] ] ]
 Onset rhymes
 DOG     -> [ [ [DOLL DAWN ...] ] ]
 Nuclei rhymes
 DOG     -> [ [ [BALL CAUGHT FOUGHT ...] ] ]
 For multiple syllables, show rhymes for each possible partitioning of syllables.
 Order by rhymes that use the fewest number of words.
 BEEHIVE -> [ [ [REVIVE DEPRIVE] ]
             [ [SEE WE BE ...] [THRIVE DIVE ...] ] ]
 For multi-syllable words, remove restriction to rhyme on every syllable.
 Order by words matching greatest number of syllables.
 BEEHIVE -> [ [ [REVIVE DEPRIVE ALIVE] ]
             [ [SEE WE BE ... ] [THRIVE DIVE ...] ] ]
 Syllables
 ---------
 Typical model
 In the typical theory[citation needed] of syllable structure, the general structure of a syllable (σ) consists of three segments. These segments are grouped into two components:
 Onset (ω)
    a consonant or consonant cluster, obligatory in some languages, optional or even restricted in others
 Rime (ρ)
    right branch, contrasts with onset, splits into nucleus and coda
    Nucleus (ν)
        a vowel or syllabic consonant, obligatory in most languages
    Coda (κ)
        consonant, optional in some languages, highly restricted or prohibited in others
 Rules
 ~~~~~
 Also, for "ellipsis", /ps/ is not a legal internal coda in English. The /s/ can only occur as an appendix, e.g. the plural -s at the end of a word. So it should be e.lip.sis
 http://www.glottopedia.org/index.php/Sonority_hierarchy
 http://www.glottopedia.org/index.php/Maximal_Onset_Principle
 Nasal
 -----
 Air flow goes through nose.
 Examples: "n" in "nose", "m" in "may", "ŋ" in "funk".
 "ŋ" is known as the letter "eng" and the technical name of the consonant is the "voiced velar nasal"
 "voiced" in the above sentence refers to whether or not your vocal chords are active. Your voice chord doesn't vibrate with voiceless consonants, like "sh" "th" "p" "f". In contrast, notice the vibration in phonemes like "m" "r" "z".
--- a/deps.edn
+++ b/deps.edn
@ -0,0 +1,9 @@
 {:paths ["src" "resources"]
 :deps {org.clojure/clojure {:mvn/version "1.10.0"}
        org.clojure/math.combinatorics {:mvn/version "0.1.6"}
        org.clojure/data.priority-map {:mvn/version "1.0.0"}
        org.clojure/core.async {:mvn/version "1.2.603"}
        inflections {:mvn/version "0.13.2"}
        com.taoensso/timbre {:mvn/version "4.10.0"}}
 :aliases {:dev {:extra-paths ["test"]
                 :extra-deps {}}}}
--- a/resources/adjectives.txt
+++ b/resources/adjectives.txt
--- a/resources/adverbs.txt
+++ b/resources/adverbs.txt
--- a/resources/cmudict-0.7b
+++ b/resources/cmudict-0.7b
--- a/resources/cmudict-0.7b.phones
+++ b/resources/cmudict-0.7b.phones
@ -0,0 +1,39 @@
 AA	vowel
 AE	vowel
 AH	vowel
 AO	vowel
 AW	vowel
 AY	vowel
 B	stop
 CH	affricate
 D	stop
 DH	fricative
 EH	vowel
 ER	vowel
 EY	vowel
 F	fricative
 G	stop
 HH	aspirate
 IH	vowel
 IY	vowel
 JH	affricate
 K	stop
 L	liquid
 M	nasal
 N	nasal
 NG	nasal
 OW	vowel
 OY	vowel
 P	stop
 R	liquid
 S	fricative
 SH	fricative
 T	stop
 TH	fricative
 UH	vowel
 UW	vowel
 V	fricative
 W	semivowel
 Y	semivowel
 Z	fricative
 ZH	fricative
--- a/resources/cmudict-0.7b.symbols
+++ b/resources/cmudict-0.7b.symbols
@ -0,0 +1,84 @@
 AA
 AA0
 AA1
 AA2
 AE
 AE0
 AE1
 AE2
 AH
 AH0
 AH1
 AH2
 AO
 AO0
 AO1
 AO2
 AW
 AW0
 AW1
 AW2
 AY
 AY0
 AY1
 AY2
 B
 CH
 D
 DH
 EH
 EH0
 EH1
 EH2
 ER
 ER0
 ER1
 ER2
 EY
 EY0
 EY1
 EY2
 F
 G
 HH
 IH
 IH0
 IH1
 IH2
 IY
 IY0
 IY1
 IY2
 JH
 K
 L
 M
 N
 NG
 OW
 OW0
 OW1
 OW2
 OY
 OY0
 OY1
 OY2
 P
 R
 S
 SH
 T
 TH
 UH
 UH0
 UH1
 UH2
 UW
 UW0
 UW1
 UW2
 V
 W
 Y
 Z
 ZH
--- a/resources/cmudict_SPHINX_40
+++ b/resources/cmudict_SPHINX_40
--- a/resources/nouns.txt
+++ b/resources/nouns.txt
--- a/resources/popular.txt
+++ b/resources/popular.txt
--- a/resources/verbs.txt
+++ b/resources/verbs.txt
--- a/resources/words-by-commonality.txt
+++ b/resources/words-by-commonality.txt
--- a/src/com/owoga/prhyme/core.clj
+++ b/src/com/owoga/prhyme/core.clj
@ -0,0 +1,255 @@
 (ns com.owoga.prhyme.core
  (:require [clojure.java.io :as io]
            [clojure.pprint :as pprint]
            [clojure.string :as string]
            [clojure.set :as set]
            [com.owoga.prhyme.util :as u]
            [com.owoga.prhyme.syllabify :as s]))
 (def dictionary
  (line-seq (io/reader (io/resource "cmudict_SPHINX_40"))))
 (def words (map u/prepare-word dictionary))
 (def popular
  (set (line-seq (io/reader (io/resource "popular.txt")))))
 (def adverbs
  (set/intersection popular (set (line-seq (io/reader (io/resource "adverbs.txt"))))))
 (def adjectives
  (set/intersection popular (set (line-seq (io/reader (io/resource "adjectives.txt"))))))
 (def verbs
  (set/intersection popular (set (line-seq (io/reader (io/resource "verbs.txt"))))))
 (def nouns
  (set/intersection popular (set (line-seq (io/reader (io/resource "nouns.txt"))))))
 (defn words-by-rime* [words]
  (let [words-with-rime (->> words
                             (map rest)
                             (map s/syllabify)
                             (map #(map reverse %))
                             (map #(map
                                    (fn [syllable]
                                      (first (u/take-through u/vowel syllable))) %))
                             (map #(map reverse %))
                             (map reverse)
                             (map #(cons %1 %2) (map first words)))]
    (loop [by-rime {}
           words words-with-rime]
      (let [key (rest (first words))
            val (first (first words))
            existing (get-in by-rime key {:words '()})]
        (cond
          (empty? words) by-rime
          (empty? key) (recur by-rime (rest words))
          :else (recur (assoc-in by-rime
                                 (concat key [:words])
                                 (cons val (:words existing)))
                       (rest words)))))))
 (def words-by-rime (words-by-rime* words))
 (defn words-by-onset-nucleus* [words]
  (let [words-with-onset-nucleus (->> words
                                      (map rest)
                                      (map s/syllabify)
                                      (map #(map
                                             (fn [syllable]
                                               (first (u/take-through u/vowel syllable)))
                                             %))
                                      (map #(cons %1 %2) (map first words)))]
    (loop [by-onset {}
           words words-with-onset-nucleus]
      (let [key (rest (first words))
            val (ffirst words)
            existing (get-in by-onset key {:words '()})]
        (cond
          (empty? words) by-onset
          (empty? key) (recur by-onset (rest words))
          :else (recur (assoc-in by-onset
                                 (concat key [:words])
                                 (cons val (:words existing)))
                       (rest words)))))))
 (def words-by-onset-nucleus (words-by-onset-nucleus* words))
 (defn words-by-nucleus* [words]
  (let [words-with-nucleus (->> words
                                (map rest)
                                (map s/syllabify)
                                (map #(map
                                       (fn [syllable]
                                         (list
                                          (last
                                           (first (u/take-through u/vowel syllable)))))
                                       %))
                                (map #(cons %1 %2) (map first words)))]
    (loop [by-nucleus {}
           words words-with-nucleus]
      (let [key (rest (first words))
            val (ffirst words)
            existing (get-in by-nucleus key {:words '()})]
        (cond
          (empty? words) by-nucleus
          (empty? key) (recur by-nucleus (rest words))
          :else (recur (assoc-in by-nucleus
                                 (concat key [:words])
                                 (cons val (:words existing)))
                       (rest words)))))))
 (def words-by-nucleus (words-by-nucleus* words))
 (defn words-by-syllables* [words]
  (loop [by-syllables {}
         words words]
    (let [word (first words)
          syllable-count (count (s/syllabify word))
          entry (get by-syllables syllable-count '())]
      (cond
        (empty? words) by-syllables
        :else (recur (assoc by-syllables syllable-count (cons word entry))
                     (rest words))))))
 (defn add-word-to-tree [tree word]
  (let [phonemes (reverse (rest word))]
    (assoc-in tree (conj (vec phonemes) :word) word)))
 (defn build-tree [words]
  (reduce add-word-to-tree {} words))
 (def phone-tree (build-tree words))
 (defn rhyme-node [rhyme-tree phonemes]
  (let [phonemes (reverse phonemes)
        node (get-in rhyme-tree phonemes)]
    node))
 (defn filter-to-syllable-count [n words]
  (filter (fn [word] (= n (count (s/syllabify (rest word))))) words))
 (defn rimes [syllables]
  (->> syllables
       (map reverse)
       (map #(first (u/take-through u/vowel %)))
       (map reverse)))
 (defn onset+nucleus [syllables]
  (->> syllables
       (map #(first (u/take-through u/vowel %)))))
 (defn nucleus [syllables]
  (map #(list (last (first (u/take-through u/vowel %)))) syllables))
 (defn single? [coll] (= 1 (count coll)))
 (defn partitions
  "There is a partitions in clojure.combinatorics that might be more
  efficient. This was fun to write. Want to understand more ways to
  write this algorithm. How to make it lazy? How to jump immediately
  to a specific rank?"
  ([coll]
   (partitions coll '()))
  ([coll acc]
   (cond
     (empty? coll) acc
     (single? coll) `((~coll))
     :else
     (let [x (first coll)]
       (reduce (fn [val el]
                 (cons
                  (cons (cons x (first el)) (rest el))
                  (cons (cons (list x) el) val)))
               '()
               (partitions (rest coll) acc))))))
 (defn rhyming-word
  "Simple lookup in data.
  Data is a tree of syllables to words.
  {(IH TH) {:words [WITH SMITH ...]
            (IY Z) {:words [SMITHIES PITHIES ...]
                    (OW) {:words [DITHIESOH ...]}]}}}"
  [data syllables]
  (get-in data (into '(:words) syllables)))
 (defn rhyming-words
  "A rime is made of lists of syllables.
  Each of the following is a rime.
  ([(AH L)] [(IH TH) (IY Z)])
  ([(AH L)] [(IH TH)] [(IY Z)])
  The first represents rhymes of a single-syllable word
  followed by a two-syllable word. The second represents
  a rhyme of three single-syllable words.
  This returns the list of possible words that fulfill each
  collection of syllables. If no rhyme matches, nil is in that
  spot in the list."
  [data rime]
  (map (partial rhyming-word data) rime))
 (defn prhyme [phones]
  (let [syllables (s/syllabify phones)
        rhymes (remove #(some nil? %)
                       (map (partial rhyming-words words-by-rime)
                            (partitions (rimes syllables))))
        onsets (remove #(some nil? %)
                       (map (partial rhyming-words words-by-onset-nucleus)
                            (partitions (onset+nucleus syllables))))
        nuclei (remove #(some nil? %)
                       (map (partial rhyming-words words-by-nucleus)
                            (partitions (nucleus (reverse syllables)))))
        popular-rhymes
        (let [popular (into #{} (map string/upper-case popular))]
          (remove #(some empty? %)
                  (map (fn [rhyme]
                         (map (fn [words-list]
                                (set/intersection popular (into #{} words-list)))
                              rhyme))
                       rhymes)))]
    {:rhymes popular-rhymes
     :onsets onsets
     :nuclei nuclei}))
 (comment
  (take 10 popular)
  (prhyme ["R" "OY" "AH" "L"])
  (let [phones ["D" "R" "IY" "M" "S" "AE" "N" "D" "HH" "OW" "P" "S"]]
    (prhyme phones))
  (let [phones ["AE" "N" "D" "HH" "OW" "P" "S"]]
    (prhyme phones)
    (get-in words-by-nucleus (nucleus (s/syllabify phones)))
    (prhyme phones)
    (partitions (nucleus (s/syllabify phones)))
    (prhyme phones))
  (let [phones ["T" "AY" "M" "T" "UW" "TH" "IH" "NG" "K"]]
    (rimes (s/syllabify phones))
    (prhyme phones))
  (let [phones ["R" "UH" "N" "AW" "T" "AH" "F" "S" "L" "IY" "P"]]
    (prhyme phones)
    (s/syllabify phones))
  (let [phones ["S" "L" "IY" "P"]]
    (prhyme phones))
  (let [phones ["AH" "F"]]
    (prhyme phones))
  (let [phones ["D" "OW" "N" "T" "F" "UH" "K" "W" "IH" "TH" "M" "IY"]]
    (prhyme phones))
  (prhyme ["B" "Y" "UW" "T" "IH" "F" "AH" "L" "G" "ER" "L"])
  (let [r (rimes (s/syllabify ["R" "OY" "AH" "L" "W" "IH" "TH" "CH" "IY" "Z"]))]
    (remove #(some nil? %) (map rhyming-words (partitions r))))
  (let [r (rimes (s/syllabify ["B" "Y" "UW" "T" "IH" "F" "AH" "L" "G" "ER" "L"]))]
    (remove #(some nil? %) (map (partial rhyming-words words-by-rime) (partitions r))))
  (get
   (->> words
        (filter-to-syllable-count 1)
        (words-by-rime*))
   '("AA" "L"))
  )
 (comment
  (-> (s/syllabify ["HH" "AA" "R" "D" "B" "AA" "L"])
      (rimes))
  ;; => (("AA" "R" "D") ("AA" "L")) 
  )
--- a/src/com/owoga/prhyme/grammar.clj
+++ b/src/com/owoga/prhyme/grammar.clj
@ -0,0 +1,38 @@
 (ns com.owoga.prhyme.grammar)
 (def root-states
  [{::tk/name :failed
    ::tk/transitions [{::tk/on tk/_ ::tk/to :failed}]}
   {::tk/name :object
    ::tk/transitions [{::tk/on :adjectives ::tk/to :obj-adj}
                      {::tk/on :nouns ::tk/to :obj-noun}
                      {::tk/on tk/_ ::tk/to :object ::tk/actions [:failed]}]}
   {::tk/name :obj-adj
    ::tk/transitions [{::tk/on :nouns ::tk/to :obj-noun}
                      {::tk/on tk/_ ::tk/to :object ::tk/actions [:failed]}]}
   {::tk/name :obj-noun
    ::tk/transitions [{::tk/on :verbs ::tk/to :verbs}
                      {::tk/on :adverbs ::tk/to :adverbs}
                      {::tk/on tk/_ ::tk/to :object ::tk/actions [:failed]}]}
   {::tk/name :verbs
    ::tk/transitions [{::tk/on :nouns ::tk/to :subj-noun}
                      {::tk/on :adjectives ::tk/to :subj-adj}
                      {::tk/on tk/_ ::tk/to :object ::tk/actions [:failed]}]}
   {::tk/name :adverbs
    ::tk/transitions [{::tk/on :verbs ::tk/to :verbs}
                      {::tk/on tk/_ ::tk/to :object ::tk/actions [:failed]}]}
   {::tk/name :subj-noun
    ::tk/transitions [{::tk/on :nouns ::tk/to :obj-noun}
                      {::tk/on :adjectives ::tk/to :obj-adj}
                      {::tk/on tk/_ ::tk/to :object ::tk/actions [:failed]}]}
   {::tk/name :subj-adj
    ::tk/transitions [{::tk/on :nouns ::tk/to :subj-noun}
                      {::tk/on tk/_ ::tk/to :object ::tk/actions [:failed]}]}])
 (def root-fsm
  {::tk/states root-states
   ::tk/action! (fn [{::tk/keys [signal action] :as fsm}]
                  (case signal
                    :failed (println "Failed! " signal " " action))
                  fsm)
   ::tk/state :object})
--- a/src/com/owoga/prhyme/syllabify.clj
+++ b/src/com/owoga/prhyme/syllabify.clj
@ -0,0 +1,115 @@
 (ns com.owoga.prhyme.syllabify
  (:require [com.owoga.prhyme.util :as p]))
 ;; ER is not yet handled properly.
 ;; PARENTHESES is syllabified as ("P" "ER" "IH" "N") ("TH" "UH") ("S" "IY" "S")
 ;; Glides are also broken. "R OY AH L" gets syllabified as a single syllable.
 ;; This sonority hierarchy is far from perfect.
 ;; It stems from: http://www.glottopedia.org/index.php/Sonority_hierarchy
 ;; I tried to match the phones provided by the CMU dict to the hierarchies
 ;; listed on that page:
 ;;   vowels > liquids > nasals > voiced fricatives
 ;;   > voiceless fricatives = voiced plosives
 ;;   > voiceless plosives (Anderson & Ewen 1987)
 ;;
 ;; *** Comment below this line is left as a future reference
 ;; *** but it does not reflect the true code.
 ;; One other modification I made is to put fricatives after stops.
 ;; I think that fricatives technically have priority over stops with the
 ;; exception of "s" at the end of codas. To quote a comment from a Reddit thread,
 ;; https://www.reddit.com/r/phonetics/comments/i7hp5f/what_is_the_alaska_rule_in_reference_to/
 ;;   Also, for "ellipsis", /ps/ is not a legal internal coda in English.
 ;;   The /s/ can only occur as an appendix, e.g. the plural -s at the end
 ;;   of a word. So it should be e.lip.sis
 ;; As an alternative to handling the isolated "s"-at-the-end-of-internal-coda case,
 ;; it works well-enough for me to treat all fricatives as lowest priority.
 (def sonority-hierarchy
  ["vowel" "semivowel" "liquid" "nasal" "aspirate" "affricate" "fricative" "stop"])
 (def lax-vowels #{"EH" "IH" "AE" "AH" "UH"})
 (defn sonority [phone]
  (.indexOf sonority-hierarchy (p/phonemap phone)))
 (defn vowel? [phone]
  (p/vowel phone))
 (defn >sonorous [a b]
  (> (sonority a) (sonority b)))
 (defn slurp-rime [phones]
  (let [splits (p/take-through vowel? phones)]
    [(first splits) (flatten (rest splits))]))
 (defn slurp-onset [phones]
  (if (empty? (take-while #(not (vowel? %)) phones))
    [[] phones]
    (loop [phones phones
          onset []]
     (cond
       (empty? phones) [onset []]
       (empty? onset) (recur (rest phones) [(first phones)])
       (not (>sonorous (first phones) (last onset))) [onset phones]
       :else (recur (rest phones) (conj onset (first phones)))))))
 (defn fix-lax
  "https://www.reddit.com/r/phonetics/comments/i7hp5f/what_is_the_alaska_rule_in_reference_to/
  He wants to ensure that vowels that cannot form legal codas (lax vowels like
  /æ/) always have some sort of consonantal coda after them.
  "
  [syllables]
  (loop [old-syllables syllables
         new-syllables '()]
    (cond
      (empty? old-syllables) (reverse new-syllables)
      (and (lax-vowels (last (first old-syllables)))
           (< 1 (count old-syllables)))
      (recur (cons (rest (first (rest old-syllables)))
                   (rest (rest old-syllables)))
             (cons (concat (first old-syllables)
                           (list (first (first (rest old-syllables)))))
                   new-syllables))
      :else (recur (rest old-syllables)
                   (cons (first old-syllables) new-syllables)))))
 (defn syllabify [phones]
  (let [phones (reverse phones)]
    (loop [phones phones
           segments []]
      (if (empty? phones)
        (reverse (map reverse segments))
        (let [[rime phones] (slurp-rime phones)
              [onset phones] (slurp-onset phones)]
          (cond
            (= \Y (last (first onset)))
            (recur phones (into segments [rime onset]))
            :else
            (recur phones (conj segments (concat rime onset)))))))))
 (= \Y (ffirst '("YO")))
 (first (ffirst (slurp-onset ["OY" "G" "AH"])))
 (comment
  (syllabify ["AH" "L" "AE" "S" "K" "AH"])
  (syllabify ["H" "ER" "AH" "L" "D"])
  (syllabify ["H" "EH" "R" "AH" "L" "D"])
  (syllabify ["B" "OY" "N" "K"])
  (syllabify ["H" "ER" "AH" "L" "D"])
  (syllabify ["G" "L" "IH" "M" "P" "S" "T"])
  (syllabify ["B" "IY" "G" "L" "IH" "M" "P" "S" "T"])
  (syllabify ["G" "L" "IH" "M" "P" "S" "T" "R" "EH" "D"])
  (syllabify ["UH" "P" "R" "AY" "S" "IY" "NG"])
  (syllabify ["UH" "L" "AE" "S" "K" "UH"])
  (syllabify ["R" "OY" "AH" "L"])
  (syllabify ["R" "AY" "AH" "L"])
  (syllabify ["R" "OY" "AH" "L" "W" "IH" "TH" "CH" "IY" "Z"])
  )
 ;; ["GLIMPSED" "G" "L" "IH" "M" "P" "S" "T"]
 ;; ["BEGLIMPSED" "B" "IY" "G" "L" "IH" "M" "P" "S" "T"]
 ;; ["BEGLIMPSED" "B" "EH" "G" "L" "IH" "M" "P" "S" "T"]
 ;; ["GLIMSTEST" "G" "L" "IH" "M" "S" "T" "EH" "S" "T"]
 ;; ["GLIMPSTRED" "G" "L" "IH" "M" "P" "S" "T" "R" "EH" "D"]
 ;; ["GLIMSTRED" "G" "L" "IH" "M" "S" "T" "R" "EH" "D"]
--- a/src/com/owoga/prhyme/util.clj
+++ b/src/com/owoga/prhyme/util.clj
@ -0,0 +1,139 @@
 (ns com.owoga.prhyme.util
  (:require [clojure.java.io :as io]
            [clojure.string :as string]
            [clojure.set :as set]
            [clojure.zip :as z]))
 ;; Pulled from cmudict-0.7b.phones.
 (def phonemap
  {"AA" "vowel"
   "AE" "vowel"
   "AH" "vowel"
   "AO" "vowel"
   "AW" "vowel"
   "AY" "vowel"
   "B"  "stop"
   "CH" "affricate"
   "D"  "stop"
   "DH" "fricative"
   "EH" "vowel"
   "ER" "vowel"
   "EY" "vowel"
   "F"  "fricative"
   "G"  "stop"
   "HH" "aspirate"
   "IH" "vowel"
   "IY" "vowel"
   "JH" "affricate"
   "K"  "stop"
   "L"  "liquid"
   "M"  "nasal"
   "N"  "nasal"
   "NG" "nasal"
   "OW" "vowel"
   "OY" "vowel"
   "P"  "stop"
   "R"  "liquid"
   "S"  "fricative"
   "SH" "fricative"
   "T"  "stop"
   "TH" "fricative"
   "UH" "vowel"
   "UW" "vowel"
   "V"  "fricative"
   "W"  "semivowel"
   "Y"  "semivowel"
   "Z"  "fricative"
   "ZH" "fricative"})
 (def long-vowel #{"EY" "IY" "AY" "OW" "UW"})
 (def short-vowel #{"AA" "AE" "AH" "AO" "AW" "EH" "ER" "IH" "OY" "UH"})
 (def vowel (set/union long-vowel short-vowel))
 (def consonant (set/difference (into #{} (keys phonemap)) vowel))
 (def syllable-end (set/union consonant long-vowel))
 (def single-sound-bigram #{"TH" "SH" "PH" "WH" "CH"})
 (def dictionary
  (line-seq (io/reader (io/resource "cmudict_SPHINX_40"))))
 (defn prepare-word
  "Splits whitespace-separated fields into a sequence."
  [line]
  (string/split line #"[\t ]"))
 (defn take-through [pred coll]
  "(take-through even? [1 2 3 4 7 7 5 2 8 10])
   returns '((1 2 3 4) (7 7 5 2) (8) (10))"
  (loop [coll coll
         acc '()]
    (cond
      (empty? coll)
      (if (empty? acc) acc (list (reverse acc)))
      (pred (first coll))
      (let [acc (cons (first coll) acc)]
        (lazy-seq (cons (reverse acc) (take-through pred (rest coll)))))
      :else
      (recur (rest coll)
             (cons (first coll) acc)))))
 (defn max-consecutive [pred coll]
  (loop [coll coll
         cur-count 0
         max-count 0]
    (cond
      (empty? coll) max-count
      (pred (first coll)) (recur (rest coll) (inc cur-count) max-count)
      :else (recur (rest coll) 0 (max cur-count max-count)))))
 (defn pp-word [word]
  (let [spelling (first word)
        phones (rest word)
        phonetypes (map phonemap phones)
        formatted-phones (map #(format "%-10s" %) phones)
        formatted-phonetypes (map #(format "%-10s" %) phonetypes)]
    (format "%s\n%s\n%s"
            spelling
            (string/join " " formatted-phones)
            (string/join " " formatted-phonetypes))))
 (defn count-pred [pred coll]
  (count (filter pred coll)))
 (def count-vowels (partial count-pred vowel))
 (defn node->zipper [node]
  (z/zipper (fn branch? [node]
              (cond
                (map? node)
                (->> (keys (into {} node))
                     (remove #{:word})
                     ((complement empty?)))
                :else
                (do
                  (let [b (->> (keys (into {} (second node)))
                               (remove #{:word})
                               ((complement empty?)))]
                    b))))
            (fn children [node]
              (let [node (if (map? node) node (second node))
                    ch (seq (select-keys node (remove #{:word} (keys node))))]
                ch))
            (fn make-node [node ch]
              (into {} ch))
            node))
 (defn leafs [leaf? zipper]
  (->> zipper
       (iterate z/next)
       (take-while (complement z/end?))
       (map z/node)
       (filter leaf?)))
 (def word-leafs (partial leafs (fn [node] (:word (second node)))))
--- a/syllabify.py
+++ b/syllabify.py
@ -0,0 +1,279 @@
 #!/usr/bin/env python
 # Copyright (c) 2012-2013 Kyle Gorman <gormanky@ohsu.edu>
 #
 # Permission is hereby granted, free of charge, to any person obtaining a
 # copy of this software and associated documentation files (the
 # "Software"), to deal in the Software without restriction, including
 # without limitation the rights to use, copy, modify, merge, publish,
 # distribute, sublicense, and/or sell copies of the Software, and to
 # permit persons to whom the Software is furnished to do so, subject to
 # the following conditions:
 #
 # The above copyright notice and this permission notice shall be included
 # in all copies or substantial portions of the Software.
 #
 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 # OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 # IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 # CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 # TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 # SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #
 # syllabify.py: prosodic parsing of ARPABET entries
 from itertools import chain
 # constants
 SLAX = {
    "IH1",
    "IH2",
    "EH1",
    "EH2",
    "AE1",
    "AE2",
    "AH1",
    "AH2",
    "UH1",
    "UH2",
 }
 VOWELS = {
    "IY1",
    "IY2",
    "IY0",
    "EY1",
    "EY2",
    "EY0",
    "AA1",
    "AA2",
    "AA0",
    "ER1",
    "ER2",
    "ER0",
    "AW1",
    "AW2",
    "AW0",
    "AO1",
    "AO2",
    "AO0",
    "AY1",
    "AY2",
    "AY0",
    "OW1",
    "OW2",
    "OW0",
    "OY1",
    "OY2",
    "OY0",
    "IH0",
    "EH0",
    "AE0",
    "AH0",
    "UH0",
    "UW1",
    "UW2",
    "UW0",
    "UW",
    "IY",
    "EY",
    "AA",
    "ER",
    "AW",
    "AO",
    "AY",
    "OW",
    "OY",
    "UH",
    "IH",
    "EH",
    "AE",
    "AH",
    "UH",
 } | SLAX
 # licit medial onsets
 O2 = {
    ("P", "R"),
    ("T", "R"),
    ("K", "R"),
    ("B", "R"),
    ("D", "R"),
    ("G", "R"),
    ("F", "R"),
    ("TH", "R"),
    ("P", "L"),
    ("K", "L"),
    ("B", "L"),
    ("G", "L"),
    ("F", "L"),
    ("S", "L"),
    ("K", "W"),
    ("G", "W"),
    ("S", "W"),
    ("S", "P"),
    ("S", "T"),
    ("S", "K"),
    ("HH", "Y"),  # "clerihew"
    ("R", "W"),
 }
 O3 = {("S", "T", "R"), ("S", "K", "L"), ("T", "R", "W")}  # "octroi"
 # This does not represent anything like a complete list of onsets, but
 # merely those that need to be maximized in medial position.
 def syllabify(pron, alaska_rule=True):
    """
    Syllabifies a CMU dictionary (ARPABET) word string
    # Alaska rule:
    >>> pprint(syllabify('AH0 L AE1 S K AH0'.split())) # Alaska
    '-AH0-.L-AE1-S.K-AH0-'
    >>> pprint(syllabify('AH0 L AE1 S K AH0'.split(), 0)) # Alaska
    '-AH0-.L-AE1-.S K-AH0-'
    # huge medial onsets:
    >>> pprint(syllabify('M IH1 N S T R AH0 L'.split())) # minstrel
    'M-IH1-N.S T R-AH0-L'
    >>> pprint(syllabify('AA1  K T R W AA0 R'.split())) # octroi
    '-AA1-K.T R W-AA0-R'
    # destressing
    >>> pprint(destress(syllabify('M IH1 L AH0 T EH2 R IY0'.split())))
    'M-IH-.L-AH-.T-EH-.R-IY-'
    # normal treatment of 'j':
    >>> pprint(syllabify('M EH1 N Y UW0'.split())) # menu
    'M-EH1-N.Y-UW0-'
    >>> pprint(syllabify('S P AE1 N Y AH0 L'.split())) # spaniel
    'S P-AE1-N.Y-AH0-L'
    >>> pprint(syllabify('K AE1 N Y AH0 N'.split())) # canyon
    'K-AE1-N.Y-AH0-N'
    >>> pprint(syllabify('M IH0 N Y UW2 EH1 T'.split())) # minuet
    'M-IH0-N.Y-UW2-.-EH1-T'
    >>> pprint(syllabify('JH UW1 N Y ER0'.split())) # junior
    'JH-UW1-N.Y-ER0-'
    >>> pprint(syllabify('K L EH R IH HH Y UW'.split())) # clerihew
    'K L-EH-.R-IH-.HH Y-UW-'
    # nuclear treatment of 'j'
    >>> pprint(syllabify('R EH1 S K Y UW0'.split())) # rescue
    'R-EH1-S.K-Y UW0-'
    >>> pprint(syllabify('T R IH1 B Y UW0 T'.split())) # tribute
    'T R-IH1-B.Y-UW0-T'
    >>> pprint(syllabify('N EH1 B Y AH0 L AH0'.split())) # nebula
    'N-EH1-B.Y-AH0-.L-AH0-'
    >>> pprint(syllabify('S P AE1 CH UH0 L AH0'.split())) # spatula
    'S P-AE1-.CH-UH0-.L-AH0-'
    >>> pprint(syllabify('AH0 K Y UW1 M AH0 N'.split())) # acumen
    '-AH0-K.Y-UW1-.M-AH0-N'
    >>> pprint(syllabify('S AH1 K Y AH0 L IH0 N T'.split())) # succulent
    'S-AH1-K.Y-AH0-.L-IH0-N T'
    >>> pprint(syllabify('F AO1 R M Y AH0 L AH0'.split())) # formula
    'F-AO1 R-M.Y-AH0-.L-AH0-'
    >>> pprint(syllabify('V AE1 L Y UW0'.split())) # value
    'V-AE1-L.Y-UW0-'
    # everything else
    >>> pprint(syllabify('N AO0 S T AE1 L JH IH0 K'.split())) # nostalgic
    'N-AO0-.S T-AE1-L.JH-IH0-K'
    >>> pprint(syllabify('CH ER1 CH M AH0 N'.split())) # churchmen
    'CH-ER1-CH.M-AH0-N'
    >>> pprint(syllabify('K AA1 M P AH0 N S EY2 T'.split())) # compensate
    'K-AA1-M.P-AH0-N.S-EY2-T'
    >>> pprint(syllabify('IH0 N S EH1 N S'.split())) # inCENSE
    '-IH0-N.S-EH1-N S'
    >>> pprint(syllabify('IH1 N S EH2 N S'.split())) # INcense
    '-IH1-N.S-EH2-N S'
    >>> pprint(syllabify('AH0 S EH1 N D'.split())) # ascend
    '-AH0-.S-EH1-N D'
    >>> pprint(syllabify('R OW1 T EY2 T'.split())) # rotate
    'R-OW1-.T-EY2-T'
    >>> pprint(syllabify('AA1 R T AH0 S T'.split())) # artist
    '-AA1 R-.T-AH0-S T'
    >>> pprint(syllabify('AE1 K T ER0'.split())) # actor
    '-AE1-K.T-ER0-'
    >>> pprint(syllabify('P L AE1 S T ER0'.split())) # plaster
    'P L-AE1-S.T-ER0-'
    >>> pprint(syllabify('B AH1 T ER0'.split())) # butter
    'B-AH1-.T-ER0-'
    >>> pprint(syllabify('K AE1 M AH0 L'.split())) # camel
    'K-AE1-.M-AH0-L'
    >>> pprint(syllabify('AH1 P ER0'.split())) # upper
    '-AH1-.P-ER0-'
    >>> pprint(syllabify('B AH0 L UW1 N'.split())) # balloon
    'B-AH0-.L-UW1-N'
    >>> pprint(syllabify('P R OW0 K L EY1 M'.split())) # proclaim
    'P R-OW0-.K L-EY1-M'
    >>> pprint(syllabify('IH0 N S EY1 N'.split())) # insane
    '-IH0-N.S-EY1-N'
    >>> pprint(syllabify('IH0 K S K L UW1 D'.split())) # exclude
    '-IH0-K.S K L-UW1-D'
    """
    ## main pass
    mypron = list(pron)
    nuclei = []
    onsets = []
    i = -1
    for (j, seg) in enumerate(mypron):
        if seg in VOWELS:
            nuclei.append([seg])
            onsets.append(mypron[i + 1 : j])  # actually interludes, r.n.
            i = j
    codas = [mypron[i + 1 :]]
    ## resolve disputes and compute coda
    for i in range(1, len(onsets)):
        coda = []
        # boundary cases
        if len(onsets[i]) > 1 and onsets[i][0] == "R":
            nuclei[i - 1].append(onsets[i].pop(0))
        if len(onsets[i]) > 2 and onsets[i][-1] == "Y":
            nuclei[i].insert(0, onsets[i].pop())
        if (
            len(onsets[i]) > 1
            and alaska_rule
            and nuclei[i - 1][-1] in SLAX
            and onsets[i][0] == "S"
        ):
            coda.append(onsets[i].pop(0))
        # onset maximization
        depth = 1
        if len(onsets[i]) > 1:
            if tuple(onsets[i][-2:]) in O2:
                depth = 3 if tuple(onsets[i][-3:]) in O3 else 2
        for j in range(len(onsets[i]) - depth):
            coda.append(onsets[i].pop(0))
        # store coda
        codas.insert(i - 1, coda)
    ## verify that all segments are included in the ouput
    output = list(zip(onsets, nuclei, codas))  # in Python3 zip is a generator
    flat_output = list(chain.from_iterable(chain.from_iterable(output)))
    if flat_output != mypron:
        raise ValueError(f"could not syllabify {mypron}, got {flat_output}")
    return output
 def pprint(syllab):
    """
    Pretty-print a syllabification
    """
    return ".".join("-".join(" ".join(p) for p in syl) for syl in syllab)
 def destress(syllab):
    """
    Generate a syllabification with nuclear stress information removed
    """
    syls = []
    for (onset, nucleus, coda) in syllab:
        nuke = [p[:-1] if p[-1] in {"0", "1", "2"} else p for p in nucleus]
        syls.append((onset, nuke, coda))
    return syls
 if __name__ == "__main__":
    import doctest
    doctest.testmod()