Initial commit

5 years ago · 5ce14fefe2
commit 5ce14fefe2
18 changed files with 547395 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,6 @@
+.DS_Store
+.idea
+*.log
+tmp/
+.cpcache
+.nrepl-port
--- a/README.rst
+++ b/README.rst
@ -0,0 +1,83 @@
+==============
+ Articulation
+==============
+
+Terminology and types of rhymes
+-------------------------------
+
+1. HAT - CAT
+2. HAT - HALF
+3. HAT - PACK
+
+The first of those examples clearly rhymes by anyone's definition of "rhyme". The first sound of the syllable, known as the "onset", differs. The vowel sound, known as the "nucleus" (plural "nuclei"), and the final consonant sound, known as the "coda", are the same.
+
+The second example might not technically rhyme, but it can still be useful. The onset (the "H" sound) and the nucleus (the "AE" sound) are the same in both HAT and HALF, but the two words differ in their coda.
+
+The third example is even less of a proper rhyme, but again it can be useful. The only matching sound is the nucleus, the "AE" sound.
+
+Words with multiple syllables give us even more options.
+
+What is more important: to find the fewest words that rhyme any number of syllables (STUPIFIED - DIGNIFIED), or to find the fewest words that rhyme the greatest number of onsets/nuclei/codas (STUPIFIED - SCOOBY DIED)?
+
+1. STUPIFIED - SCOOBY DIED
+2. STUPIFIED - GROOVY FINE
+3. STUPIFIED - DIGNIFIED
+4. STUPIFIED - PRIDE
+
+Program Output
+--------------
+
+Perfect rhymes
+DOG     -> [ [ [FOG COG HOG ...] ] ]
+
+Onset rhymes
+DOG     -> [ [ [DOLL DAWN ...] ] ]
+
+Nuclei rhymes
+DOG     -> [ [ [BALL CAUGHT FOUGHT ...] ] ]
+
+For multiple syllables, show rhymes for each possible partitioning of syllables.
+Order by rhymes that use the fewest number of words.
+BEEHIVE -> [ [ [REVIVE DEPRIVE] ]
+             [ [SEE WE BE ...] [THRIVE DIVE ...] ] ]
+
+For multi-syllable words, remove restriction to rhyme on every syllable.
+Order by words matching greatest number of syllables.
+BEEHIVE -> [ [ [REVIVE DEPRIVE ALIVE] ]
+             [ [SEE WE BE ... ] [THRIVE DIVE ...] ] ]
+
+Syllables
+---------
+Typical model
+
+In the typical theory[citation needed] of syllable structure, the general structure of a syllable (σ) consists of three segments. These segments are grouped into two components:
+
+Onset (ω)
+    a consonant or consonant cluster, obligatory in some languages, optional or even restricted in others
+Rime (ρ)
+    right branch, contrasts with onset, splits into nucleus and coda
+
+    Nucleus (ν)
+        a vowel or syllabic consonant, obligatory in most languages
+    Coda (κ)
+        consonant, optional in some languages, highly restricted or prohibited in others
+
+Rules
+~~~~~
+
+For example, in "ellipsis", /ps/ is not a legal word-internal coda in English. The /s/ can only occur as an appendix, e.g. the plural -s at the end of a word. So the word should be syllabified as e.lip.sis.
+
+http://www.glottopedia.org/index.php/Sonority_hierarchy
+
+http://www.glottopedia.org/index.php/Maximal_Onset_Principle
+
+Nasal
+-----
+
+Air flow goes through nose.
+
+Examples: "n" in "nose", "m" in "may", "ŋ" in "funk".
+
+"ŋ" is known as the letter "eng" and the technical name of the consonant is the "voiced velar nasal"
+
+"Voiced" in the above sentence refers to whether or not your vocal cords are active. Your vocal cords don't vibrate with voiceless consonants, like "sh", "th" (as in "thin"), "p", and "f". In contrast, notice the vibration in phonemes like "m", "r", and "z".
--- a/deps.edn
+++ b/deps.edn
@ -0,0 +1,9 @@
+{:paths ["src" "resources"]
+ :deps {org.clojure/clojure {:mvn/version "1.10.0"}
+        org.clojure/math.combinatorics {:mvn/version "0.1.6"}
+        org.clojure/data.priority-map {:mvn/version "1.0.0"}
+        org.clojure/core.async {:mvn/version "1.2.603"}
+        inflections {:mvn/version "0.13.2"}
+        com.taoensso/timbre {:mvn/version "4.10.0"}}
+ :aliases {:dev {:extra-paths ["test"]
+                 :extra-deps {}}}}
--- a/resources/adjectives.txt
+++ b/resources/adjectives.txt
--- a/resources/adverbs.txt
+++ b/resources/adverbs.txt
--- a/resources/cmudict-0.7b
+++ b/resources/cmudict-0.7b
--- a/resources/cmudict-0.7b.phones
+++ b/resources/cmudict-0.7b.phones
@ -0,0 +1,39 @@
+AA	vowel
+AE	vowel
+AH	vowel
+AO	vowel
+AW	vowel
+AY	vowel
+B	stop
+CH	affricate
+D	stop
+DH	fricative
+EH	vowel
+ER	vowel
+EY	vowel
+F	fricative
+G	stop
+HH	aspirate
+IH	vowel
+IY	vowel
+JH	affricate
+K	stop
+L	liquid
+M	nasal
+N	nasal
+NG	nasal
+OW	vowel
+OY	vowel
+P	stop
+R	liquid
+S	fricative
+SH	fricative
+T	stop
+TH	fricative
+UH	vowel
+UW	vowel
+V	fricative
+W	semivowel
+Y	semivowel
+Z	fricative
+ZH	fricative
--- a/resources/cmudict-0.7b.symbols
+++ b/resources/cmudict-0.7b.symbols
@ -0,0 +1,84 @@
+AA
+AA0
+AA1
+AA2
+AE
+AE0
+AE1
+AE2
+AH
+AH0
+AH1
+AH2
+AO
+AO0
+AO1
+AO2
+AW
+AW0
+AW1
+AW2
+AY
+AY0
+AY1
+AY2
+B
+CH
+D
+DH
+EH
+EH0
+EH1
+EH2
+ER
+ER0
+ER1
+ER2
+EY
+EY0
+EY1
+EY2
+F
+G
+HH
+IH
+IH0
+IH1
+IH2
+IY
+IY0
+IY1
+IY2
+JH
+K
+L
+M
+N
+NG
+OW
+OW0
+OW1
+OW2
+OY
+OY0
+OY1
+OY2
+P
+R
+S
+SH
+T
+TH
+UH
+UH0
+UH1
+UH2
+UW
+UW0
+UW1
+UW2
+V
+W
+Y
+Z
+ZH
--- a/resources/cmudict_SPHINX_40
+++ b/resources/cmudict_SPHINX_40
--- a/resources/nouns.txt
+++ b/resources/nouns.txt
--- a/resources/popular.txt
+++ b/resources/popular.txt
--- a/resources/verbs.txt
+++ b/resources/verbs.txt
--- a/resources/words-by-commonality.txt
+++ b/resources/words-by-commonality.txt
--- a/src/com/owoga/prhyme/core.clj
+++ b/src/com/owoga/prhyme/core.clj
@ -0,0 +1,255 @@
+(ns com.owoga.prhyme.core
+  (:require [clojure.java.io :as io]
+            [clojure.pprint :as pprint]
+            [clojure.string :as string]
+            [clojure.set :as set]
+            [com.owoga.prhyme.util :as u]
+            [com.owoga.prhyme.syllabify :as s]))
+
+(defn- resource-lines
+  "Returns every line of the classpath resource `resource-name` as a fully
+  realized sequence. The reader is closed before the function returns, so no
+  file handle outlives the call."
+  [resource-name]
+  (with-open [reader (io/reader (io/resource resource-name))]
+    ;; doall forces the lazy line-seq while the reader is still open.
+    (doall (line-seq reader))))
+
+;; Raw lines of the pronunciation dictionary resource.
+(def dictionary
+  (resource-lines "cmudict_SPHINX_40"))
+
+;; Each dictionary line split into fields by `u/prepare-word`.
+(def words (map u/prepare-word dictionary))
+
+;; Lines of popular.txt as a set.
+(def popular
+  (set (resource-lines "popular.txt")))
+
+;; The part-of-speech sets below keep only entries that also appear in `popular`.
+(def adverbs
+  (set/intersection popular (set (resource-lines "adverbs.txt"))))
+
+(def adjectives
+  (set/intersection popular (set (resource-lines "adjectives.txt"))))
+
+(def verbs
+  (set/intersection popular (set (resource-lines "verbs.txt"))))
+
+(def nouns
+  (set/intersection popular (set (resource-lines "nouns.txt"))))
+
+(defn words-by-rime*
+  "Builds a nested map that indexes `words` by the rimes of their syllables.
+
+  Each entry of `words` is a sequence whose first element is the spelling and
+  whose remaining elements are phones. The phones are syllabified, each
+  syllable is reduced to the part running from its last vowel to its end, and
+  the syllables are put in reverse order (last syllable first) to form a path
+  into the map. The spelling is prepended to the list stored under `:words`
+  at the end of that path. Entries that yield no syllables are skipped."
+  [words]
+  (letfn [(rime-of [syllable]
+            ;; Scan from the end of the syllable back through the first
+            ;; vowel met, then restore the original phone order.
+            (reverse (first (u/take-through u/vowel (reverse syllable)))))
+          (rime-path [word]
+            (reverse (map rime-of (s/syllabify (rest word)))))
+          (index-word [tree word]
+            (let [path (rime-path word)]
+              (if (empty? path)
+                tree
+                (update-in tree
+                           (concat path [:words])
+                           #(cons (first word) %)))))]
+    (reduce index-word {} words)))
+
+(def words-by-rime (words-by-rime* words))
+
+(defn words-by-onset-nucleus*
+  "Builds a nested map that indexes `words` by the onset and nucleus of each
+  syllable.
+
+  Each entry of `words` is a sequence whose first element is the spelling and
+  whose remaining elements are phones. The phones are syllabified and each
+  syllable is cut off after its first vowel. The resulting syllables, in
+  their original order (first syllable first), form a path into the map. The
+  spelling is prepended to the list stored under `:words` at the end of that
+  path. Entries that yield no syllables are skipped."
+  [words]
+  (letfn [(onset-nucleus-of [syllable]
+            (first (u/take-through u/vowel syllable)))
+          (index-word [tree word]
+            (let [path (map onset-nucleus-of (s/syllabify (rest word)))]
+              (if (empty? path)
+                tree
+                (update-in tree
+                           (concat path [:words])
+                           #(cons (first word) %)))))]
+    (reduce index-word {} words)))
+
+(def words-by-onset-nucleus (words-by-onset-nucleus* words))
+
+(defn words-by-nucleus*
+  "Builds a nested map that indexes `words` by the nucleus of each syllable.
+
+  Each entry of `words` is a sequence whose first element is the spelling and
+  whose remaining elements are phones. The phones are syllabified and each
+  syllable is reduced to a one-element list holding the last phone of the
+  chunk that runs through its first vowel. Those lists, in original syllable
+  order, form a path into the map. The spelling is prepended to the list
+  stored under `:words` at the end of that path. Entries that yield no
+  syllables are skipped."
+  [words]
+  (letfn [(nucleus-of [syllable]
+            (list (last (first (u/take-through u/vowel syllable)))))
+          (index-word [tree word]
+            (let [path (map nucleus-of (s/syllabify (rest word)))]
+              (if (empty? path)
+                tree
+                (update-in tree
+                           (concat path [:words])
+                           #(cons (first word) %)))))]
+    (reduce index-word {} words)))
+
+(def words-by-nucleus (words-by-nucleus* words))
+
+(defn words-by-syllables*
+  "Groups dictionary entries by syllable count.
+
+  `words` is a sequence of entries whose first element is the spelling and
+  whose remaining elements are phones. Returns a map from syllable count to a
+  list of entries; within each list the most recently seen entry comes first.
+
+  Only the phones are syllabified. Passing the whole entry would make the
+  syllabifier treat the spelling as one more phone, which can add a spurious
+  syllable."
+  [words]
+  (reduce (fn [by-syllables word]
+            (update by-syllables
+                    (count (s/syllabify (rest word)))
+                    #(cons word %)))
+          {}
+          words))
+
+(defn add-word-to-tree
+  "Stores `word` in `tree` under a path made of its phones in reverse order
+  (last phone first) followed by the key `:word`. `word` is a sequence whose
+  first element is the spelling and whose remaining elements are phones."
+  [tree word]
+  (let [path (concat (reverse (rest word)) [:word])]
+    (assoc-in tree path word)))
+
+(defn build-tree
+  "Folds every entry of `words` into an initially empty tree with
+  `add-word-to-tree` and returns the resulting tree."
+  [words]
+  (loop [tree {}
+         remaining (seq words)]
+    (if remaining
+      (recur (add-word-to-tree tree (first remaining)) (next remaining))
+      tree)))
+
+(def phone-tree (build-tree words))
+
+(defn rhyme-node
+  "Returns the node of `rhyme-tree` reached by following `phonemes` in
+  reverse order (last phoneme first), or nil when no such node exists."
+  [rhyme-tree phonemes]
+  (get-in rhyme-tree (reverse phonemes)))
+
+(defn filter-to-syllable-count
+  "Keeps the entries of `words` whose phones syllabify into exactly `n`
+  syllables. Each entry is a sequence whose first element is the spelling
+  and whose remaining elements are phones."
+  [n words]
+  (letfn [(syllable-count [word]
+            (count (s/syllabify (rest word))))]
+    (filter #(= n (syllable-count %)) words)))
+
+(defn rimes
+  "Maps each syllable to its rime: the phones from the syllable's last vowel
+  through its end, in original order."
+  [syllables]
+  (map (fn [syllable]
+         ;; Scan backwards through the first vowel met, then flip back.
+         (reverse (first (u/take-through u/vowel (reverse syllable)))))
+       syllables))
+
+(defn onset+nucleus
+  "Maps each syllable to its leading phones up to and including its first
+  vowel."
+  [syllables]
+  (map (fn [syllable]
+         (first (u/take-through u/vowel syllable)))
+       syllables))
+
+(defn nucleus
+  "Maps each syllable to a one-element list holding the last phone of the
+  chunk that runs through the syllable's first vowel."
+  [syllables]
+  (for [syllable syllables]
+    (list (last (first (u/take-through u/vowel syllable))))))
+
+(defn single?
+  "True when `coll` holds exactly one element."
+  [coll]
+  (let [size (count coll)]
+    (= size 1)))
+
+(defn partitions
+  "Returns every way of splitting `coll` into contiguous, non-empty groups
+  while keeping element order. Each result is a sequence of groups.
+
+  An empty `coll` returns `acc`, which is an empty list in the one-argument
+  arity. A one-element `coll` returns a single partition whose only group is
+  `coll` itself.
+
+  The original author notes that a library version may be more efficient and
+  that this one was written as an exercise; making it lazy or jumping
+  directly to a given rank are open questions."
+  ([coll]
+   (partitions coll '()))
+  ([coll acc]
+   (cond
+     (empty? coll) acc
+     (single? coll) (list (list coll))
+     :else
+     (let [head (first coll)
+           extend-with-head
+           (fn [built partition]
+             (let [;; head joins the first group of the smaller partition
+                   joined (cons (cons head (first partition)) (rest partition))
+                   ;; head becomes a new group of its own
+                   split (cons (list head) partition)]
+               (cons joined (cons split built))))]
+       (reduce extend-with-head '() (partitions (rest coll) acc))))))
+
+(defn rhyming-word
+  "Simple lookup in `data`, a nested map keyed by syllables.
+
+  The given `syllables` are followed in reverse order (last syllable first)
+  and the value stored under `:words` at the node reached is returned, or nil
+  when the path does not exist. For example, syllables ((IH TH) (IY Z)) read
+  the path (IY Z) -> (IH TH) -> :words."
+  [data syllables]
+  (get-in data (concat (reverse syllables) [:words])))
+
+(defn rhyming-words
+  "Looks up each group of syllables in `rime` with `rhyming-word`.
+
+  `rime` is a sequence of syllable groups, one group per word to be found.
+  Both of the following are rimes:
+  ([(AH L)] [(IH TH) (IY Z)])
+  ([(AH L)] [(IH TH)] [(IY Z)])
+  The first asks for a one-syllable word followed by a two-syllable word;
+  the second asks for three one-syllable words.
+
+  Returns one lookup result per group, in the same order. A group with no
+  match contributes nil at its position."
+  [data rime]
+  (map #(rhyming-word data %) rime))
+
+(defn prhyme
+  "Finds rhyme candidates for a sequence of phones.
+
+  The phones are syllabified and every contiguous partitioning of the
+  syllables is looked up in three indexes. Partitionings in which any group
+  has no match are dropped.
+
+  Returns a map with:
+    :rhymes  rime matches from `words-by-rime`, each word list narrowed to
+             the upper-cased `popular` set; partitionings left with an empty
+             group are dropped
+    :onsets  onset+nucleus matches from `words-by-onset-nucleus`
+    :nuclei  nucleus matches from `words-by-nucleus`
+
+  `rhyming-word` follows each syllable group in reverse order. That suits
+  `words-by-rime`, which is keyed last syllable first, but the other two
+  indexes are keyed first syllable first. Nuclei compensate by reversing the
+  syllables before partitioning. Onset groups are reversed individually here
+  so that multi-syllable groups match the index while the groups keep their
+  original left-to-right order."
+  [phones]
+  (let [syllables (s/syllabify phones)
+        lookup (fn [index partitionings]
+                 (remove #(some nil? %)
+                         (map (partial rhyming-words index) partitionings)))
+        rhymes (lookup words-by-rime
+                       (partitions (rimes syllables)))
+        onsets (lookup words-by-onset-nucleus
+                       ;; Pre-reverse each group; rhyming-word reverses it back.
+                       (map #(map reverse %)
+                            (partitions (onset+nucleus syllables))))
+        nuclei (lookup words-by-nucleus
+                       (partitions (nucleus (reverse syllables))))
+        popular-upper (into #{} (map string/upper-case popular))
+        popular-rhymes (remove #(some empty? %)
+                               (map (fn [rhyme]
+                                      (map (fn [words-list]
+                                             (set/intersection popular-upper
+                                                               (into #{} words-list)))
+                                           rhyme))
+                                    rhymes))]
+    {:rhymes popular-rhymes
+     :onsets onsets
+     :nuclei nuclei}))
+
+(comment
+  ;; REPL scratch: nothing in this form is evaluated when the namespace loads.
+  (take 10 popular)
+  (prhyme ["R" "OY" "AH" "L"])
+  (let [phones ["D" "R" "IY" "M" "S" "AE" "N" "D" "HH" "OW" "P" "S"]]
+    (prhyme phones))
+  (let [phones ["AE" "N" "D" "HH" "OW" "P" "S"]]
+    (prhyme phones)
+    (get-in words-by-nucleus (nucleus (s/syllabify phones)))
+    (prhyme phones)
+    (partitions (nucleus (s/syllabify phones)))
+    (prhyme phones))
+  (let [phones ["T" "AY" "M" "T" "UW" "TH" "IH" "NG" "K"]]
+    (rimes (s/syllabify phones))
+    (prhyme phones))
+  (let [phones ["R" "UH" "N" "AW" "T" "AH" "F" "S" "L" "IY" "P"]]
+    (prhyme phones)
+    (s/syllabify phones))
+  (let [phones ["S" "L" "IY" "P"]]
+    (prhyme phones))
+  (let [phones ["AH" "F"]]
+    (prhyme phones))
+  (let [phones ["D" "OW" "N" "T" "F" "UH" "K" "W" "IH" "TH" "M" "IY"]]
+    (prhyme phones))
+  (prhyme ["B" "Y" "UW" "T" "IH" "F" "AH" "L" "G" "ER" "L"])
+  ;; rhyming-words takes the index as its first argument.
+  (let [r (rimes (s/syllabify ["R" "OY" "AH" "L" "W" "IH" "TH" "CH" "IY" "Z"]))]
+    (remove #(some nil? %) (map (partial rhyming-words words-by-rime) (partitions r))))
+
+  (let [r (rimes (s/syllabify ["B" "Y" "UW" "T" "IH" "F" "AH" "L" "G" "ER" "L"]))]
+    (remove #(some nil? %) (map (partial rhyming-words words-by-rime) (partitions r))))
+
+  (get
+   (->> words
+        (filter-to-syllable-count 1)
+        (words-by-rime*))
+   '("AA" "L"))
+  )
+(comment
+  ;; REPL example: rimes of each syllable of "HH AA R D B AA L".
+  (-> (s/syllabify ["HH" "AA" "R" "D" "B" "AA" "L"])
+      (rimes))
+  ;; => (("AA" "R" "D") ("AA" "L"))
+  )
+
--- a/src/com/owoga/prhyme/grammar.clj
+++ b/src/com/owoga/prhyme/grammar.clj
@ -0,0 +1,38 @@
+(ns com.owoga.prhyme.grammar
+  ;; The `tk` alias is required for every `::tk/...` keyword and `tk/_` below
+  ;; to resolve. The tilakone artifact must also be listed in deps.edn.
+  (:require [tilakone.core :as tk]))
+
+;; State table for the grammar state machine. Each state lists its
+;; transitions: `::tk/on` is the signal that selects the transition and
+;; `::tk/to` is the state it leads to. The signals used here are the
+;; part-of-speech keywords :adjectives, :nouns, :verbs and :adverbs.
+;; Every state other than :failed ends with a `tk/_` transition that returns
+;; to :object and attaches the :failed action.
+;; NOTE(review): `tk/_` is presumably the library's match-anything signal --
+;; confirm against the tilakone documentation.
+(def root-states
+  ;; :failed only ever transitions back to itself.
+  [{::tk/name :failed
+    ::tk/transitions [{::tk/on tk/_ ::tk/to :failed}]}
+   ;; :object is the state `root-fsm` starts in.
+   {::tk/name :object
+    ::tk/transitions [{::tk/on :adjectives ::tk/to :obj-adj}
+                      {::tk/on :nouns ::tk/to :obj-noun}
+                      {::tk/on tk/_ ::tk/to :object ::tk/actions [:failed]}]}
+   {::tk/name :obj-adj
+    ::tk/transitions [{::tk/on :nouns ::tk/to :obj-noun}
+                      {::tk/on tk/_ ::tk/to :object ::tk/actions [:failed]}]}
+   {::tk/name :obj-noun
+    ::tk/transitions [{::tk/on :verbs ::tk/to :verbs}
+                      {::tk/on :adverbs ::tk/to :adverbs}
+                      {::tk/on tk/_ ::tk/to :object ::tk/actions [:failed]}]}
+   {::tk/name :verbs
+    ::tk/transitions [{::tk/on :nouns ::tk/to :subj-noun}
+                      {::tk/on :adjectives ::tk/to :subj-adj}
+                      {::tk/on tk/_ ::tk/to :object ::tk/actions [:failed]}]}
+   {::tk/name :adverbs
+    ::tk/transitions [{::tk/on :verbs ::tk/to :verbs}
+                      {::tk/on tk/_ ::tk/to :object ::tk/actions [:failed]}]}
+   ;; From :subj-noun and :subj-adj the machine can loop back into the
+   ;; :obj-* states.
+   {::tk/name :subj-noun
+    ::tk/transitions [{::tk/on :nouns ::tk/to :obj-noun}
+                      {::tk/on :adjectives ::tk/to :obj-adj}
+                      {::tk/on tk/_ ::tk/to :object ::tk/actions [:failed]}]}
+   {::tk/name :subj-adj
+    ::tk/transitions [{::tk/on :nouns ::tk/to :subj-noun}
+                      {::tk/on tk/_ ::tk/to :object ::tk/actions [:failed]}]}])
+
+;; The grammar state machine: the states above, an action handler, and
+;; :object as the starting state.
+(def root-fsm
+  {::tk/states root-states
+   ;; Called with the machine whenever a taken transition carries an action.
+   ;; Dispatch is on the action keyword, because the transitions in
+   ;; `root-states` attach the action :failed; the signal is only the input
+   ;; that triggered the transition and is printed for context. The trailing
+   ;; nil is the `case` default, so an unrecognised action is ignored instead
+   ;; of throwing. The machine is returned unchanged.
+   ::tk/action! (fn [{::tk/keys [signal action] :as fsm}]
+                  (case action
+                    :failed (println "Failed! " signal " " action)
+                    nil)
+                  fsm)
+   ::tk/state :object})
--- a/src/com/owoga/prhyme/syllabify.clj
+++ b/src/com/owoga/prhyme/syllabify.clj
@ -0,0 +1,115 @@
+(ns com.owoga.prhyme.syllabify
+  (:require [com.owoga.prhyme.util :as p]))
+;; ER is not yet handled properly.
+;; PARENTHESES is syllabified as ("P" "ER" "IH" "N") ("TH" "UH") ("S" "IY" "S")
+;; Glides are also broken. "R OY AH L" gets syllabified as a single syllable.
+
+;; This sonority hierarchy is far from perfect.
+;; It stems from: http://www.glottopedia.org/index.php/Sonority_hierarchy
+;; I tried to match the phones provided by the CMU dict to the hierarchies
+;; listed on that page:
+;;   vowels > liquids > nasals > voiced fricatives
+;;   > voiceless fricatives = voiced plosives
+;;   > voiceless plosives (Anderson & Ewen 1987)
+;;
+;; *** Comment below this line is left as a future reference
+;; *** but it does not reflect the true code.
+;; One other modification I made is to put fricatives after stops.
+;; I think that fricatives technically have priority over stops with the
+;; exception of "s" at the end of codas. To quote a comment from a Reddit thread,
+;; https://www.reddit.com/r/phonetics/comments/i7hp5f/what_is_the_alaska_rule_in_reference_to/
+;;   Also, for "ellipsis", /ps/ is not a legal internal coda in English.
+;;   The /s/ can only occur as an appendix, e.g. the plural -s at the end
+;;   of a word. So it should be e.lip.sis
+;; As an alternative to handling the isolated "s"-at-the-end-of-internal-coda case,
+;; it works well-enough for me to treat all fricatives as lowest priority.
+;; Phone categories in hierarchy order. `sonority` returns a category's index
+;; in this vector, so "vowel" has the lowest index and "stop" the highest.
+(def sonority-hierarchy
+  ["vowel" "semivowel" "liquid" "nasal" "aspirate" "affricate" "fricative" "stop"])
+
+;; Vowels that `fix-lax` treats as needing a following coda phone.
+(def lax-vowels #{"EH" "IH" "AE" "AH" "UH"})
+
+(defn sonority
+  "Returns the index of `phone`'s category within `sonority-hierarchy`, or
+  -1 when the phone's category is not in the hierarchy."
+  [phone]
+  (let [category (p/phonemap phone)]
+    (.indexOf sonority-hierarchy category)))
+
+(defn vowel?
+  "Returns `phone` when it is a member of the `p/vowel` set, otherwise nil."
+  [phone]
+  (get p/vowel phone))
+
+(defn >sonorous
+  "True when the hierarchy index of phone `a` is greater than that of phone
+  `b`, i.e. when `a`'s category comes later in `sonority-hierarchy`."
+  [a b]
+  (let [index-a (sonority a)
+        index-b (sonority b)]
+    (> index-a index-b)))
+
+(defn slurp-rime
+  "Splits `phones` after its first vowel. Returns a two-element vector: the
+  phones up to and including that vowel, and all remaining phones flattened
+  into one sequence."
+  [phones]
+  (let [[rime & others] (p/take-through vowel? phones)]
+    [rime (flatten others)]))
+
+(defn slurp-onset
+  "Takes an onset from the front of `phones` and returns a two-element vector
+  of the onset and the phones left over.
+
+  When `phones` is empty or begins with a vowel the onset is empty and
+  `phones` is returned untouched. Otherwise the first phone starts the onset
+  and each following phone is added for as long as `>sonorous` holds between
+  it and the phone added just before it."
+  [phones]
+  (if (or (empty? phones) (vowel? (first phones)))
+    [[] phones]
+    (loop [remaining (rest phones)
+           onset [(first phones)]]
+      (cond
+        (empty? remaining)
+        [onset []]
+
+        (>sonorous (first remaining) (peek onset))
+        (recur (rest remaining) (conj onset (first remaining)))
+
+        :else
+        [onset remaining]))))
+
+(defn fix-lax
+  "https://www.reddit.com/r/phonetics/comments/i7hp5f/what_is_the_alaska_rule_in_reference_to/
+
+  He wants to ensure that vowels that cannot form legal codas (lax vowels like
+  /æ/) always have some sort of consonantal coda after them.
+
+  Takes a sequence of syllables and returns a new sequence in which every
+  syllable that ends in a member of `lax-vowels`, and is not the last
+  syllable, has the first phone of the following syllable appended to it.
+  That phone is removed from the following syllable.
+  "
+  [syllables]
+  ;; new-syllables is built in reverse and flipped once at the end.
+  (loop [old-syllables syllables
+         new-syllables '()]
+    (cond
+      (empty? old-syllables) (reverse new-syllables)
+
+      ;; The current syllable ends in a lax vowel and another syllable follows.
+      (and (lax-vowels (last (first old-syllables)))
+           (< 1 (count old-syllables)))
+      ;; Continue with the next syllable minus its first phone ...
+      (recur (cons (rest (first (rest old-syllables)))
+                   (rest (rest old-syllables)))
+             ;; ... and emit the current syllable with that phone appended.
+             (cons (concat (first old-syllables)
+                           (list (first (first (rest old-syllables)))))
+                   new-syllables))
+
+      ;; Otherwise the syllable is carried over unchanged.
+      :else (recur (rest old-syllables)
+                   (cons (first old-syllables) new-syllables)))))
+
+(defn syllabify
+  "Splits a sequence of phones into syllables.
+
+  The phones are processed from the end of the word towards its start. Each
+  step takes a rime with `slurp-rime` and then an onset with `slurp-onset`
+  and stores the two joined together as one segment. When the onset phone
+  nearest the vowel ends in the character Y, the rime and the onset are
+  stored as two separate segments instead.
+
+  Returns the segments in original word order, each with its phones in
+  original order."
+  [phones]
+  (loop [remaining (reverse phones)
+         segments []]
+    (if (empty? remaining)
+      ;; Everything was collected back to front; flip both levels.
+      (reverse (map reverse segments))
+      (let [[rime after-rime] (slurp-rime remaining)
+            [onset after-onset] (slurp-onset after-rime)]
+        (if (= \Y (last (first onset)))
+          (recur after-onset (into segments [rime onset]))
+          (recur after-onset (conj segments (concat rime onset))))))))
+
+(comment
+  ;; REPL scratch. Everything lives inside `comment` so that nothing here is
+  ;; evaluated when the namespace is loaded.
+  (= \Y (ffirst '("YO")))
+  (first (ffirst (slurp-onset ["OY" "G" "AH"])))
+  (syllabify ["AH" "L" "AE" "S" "K" "AH"])
+  (syllabify ["H" "ER" "AH" "L" "D"])
+  (syllabify ["H" "EH" "R" "AH" "L" "D"])
+  (syllabify ["B" "OY" "N" "K"])
+  (syllabify ["H" "ER" "AH" "L" "D"])
+  (syllabify ["G" "L" "IH" "M" "P" "S" "T"])
+  (syllabify ["B" "IY" "G" "L" "IH" "M" "P" "S" "T"])
+  (syllabify ["G" "L" "IH" "M" "P" "S" "T" "R" "EH" "D"])
+  (syllabify ["UH" "P" "R" "AY" "S" "IY" "NG"])
+  (syllabify ["UH" "L" "AE" "S" "K" "UH"])
+  (syllabify ["R" "OY" "AH" "L"])
+  (syllabify ["R" "AY" "AH" "L"])
+  (syllabify ["R" "OY" "AH" "L" "W" "IH" "TH" "CH" "IY" "Z"])
+  )
+ ;; ["GLIMPSED" "G" "L" "IH" "M" "P" "S" "T"]
+ ;; ["BEGLIMPSED" "B" "IY" "G" "L" "IH" "M" "P" "S" "T"]
+ ;; ["BEGLIMPSED" "B" "EH" "G" "L" "IH" "M" "P" "S" "T"]
+ ;; ["GLIMSTEST" "G" "L" "IH" "M" "S" "T" "EH" "S" "T"]
+ ;; ["GLIMPSTRED" "G" "L" "IH" "M" "P" "S" "T" "R" "EH" "D"]
+ ;; ["GLIMSTRED" "G" "L" "IH" "M" "S" "T" "R" "EH" "D"]
--- a/src/com/owoga/prhyme/util.clj
+++ b/src/com/owoga/prhyme/util.clj
@ -0,0 +1,139 @@
+(ns com.owoga.prhyme.util
+  (:require [clojure.java.io :as io]
+            [clojure.string :as string]
+            [clojure.set :as set]
+            [clojure.zip :as z]))
+
+;; Pulled from cmudict-0.7b.phones.
+(def phonemap
+  {"AA" "vowel"
+   "AE" "vowel"
+   "AH" "vowel"
+   "AO" "vowel"
+   "AW" "vowel"
+   "AY" "vowel"
+   "B"  "stop"
+   "CH" "affricate"
+   "D"  "stop"
+   "DH" "fricative"
+   "EH" "vowel"
+   "ER" "vowel"
+   "EY" "vowel"
+   "F"  "fricative"
+   "G"  "stop"
+   "HH" "aspirate"
+   "IH" "vowel"
+   "IY" "vowel"
+   "JH" "affricate"
+   "K"  "stop"
+   "L"  "liquid"
+   "M"  "nasal"
+   "N"  "nasal"
+   "NG" "nasal"
+   "OW" "vowel"
+   "OY" "vowel"
+   "P"  "stop"
+   "R"  "liquid"
+   "S"  "fricative"
+   "SH" "fricative"
+   "T"  "stop"
+   "TH" "fricative"
+   "UH" "vowel"
+   "UW" "vowel"
+   "V"  "fricative"
+   "W"  "semivowel"
+   "Y"  "semivowel"
+   "Z"  "fricative"
+   "ZH" "fricative"})
+
+;; Vowel phones classed as long.
+(def long-vowel #{"EY" "IY" "AY" "OW" "UW"})
+
+;; All remaining vowel phones.
+(def short-vowel #{"AA" "AE" "AH" "AO" "AW" "EH" "ER" "IH" "OY" "UH"})
+
+;; Every vowel phone. Being a set, it can be called as a predicate that
+;; returns the phone itself or nil.
+(def vowel (set/union long-vowel short-vowel))
+
+;; Every key of `phonemap` that is not a vowel.
+(def consonant (set/difference (into #{} (keys phonemap)) vowel))
+
+;; Consonants together with the long vowels.
+(def syllable-end (set/union consonant long-vowel))
+
+;; Two-letter strings grouped under the name single-sound-bigram.
+;; NOTE(review): "PH" and "WH" are not keys of `phonemap`, so these look
+;; like spelling digraphs, not phones -- confirm the intended use.
+(def single-sound-bigram #{"TH" "SH" "PH" "WH" "CH"})
+
+;; Every line of the pronunciation dictionary resource, fully realized.
+;; `with-open` closes the reader once the lines have been read; `doall`
+;; forces the lazy `line-seq` while the reader is still open.
+(def dictionary
+  (with-open [reader (io/reader (io/resource "cmudict_SPHINX_40"))]
+    (doall (line-seq reader))))
+
+(defn prepare-word
+  "Splits a dictionary line into a vector of fields.
+
+  Fields are separated by one or more spaces or tabs. Treating a run of
+  separators as a single delimiter keeps lines that use two spaces between
+  the spelling and the phones from producing empty-string fields."
+  [line]
+  (string/split line #"[\t ]+"))
+
+(defn take-through
+  "Splits `coll` into chunks, closing a chunk right after each element for
+  which `pred` is truthy. Elements that follow the last match form a final
+  chunk. An empty `coll` gives an empty list.
+
+  (take-through even? [1 2 3 4 7 7 5 2 8 10])
+  ;; => ((1 2) (3 4) (7 7 5 2) (8) (10))"
+  [pred coll]
+  ;; acc collects the current chunk in reverse order.
+  (loop [coll coll
+         acc '()]
+    (cond
+      (empty? coll)
+      (if (empty? acc) acc (list (reverse acc)))
+
+      (pred (first coll))
+      (let [acc (cons (first coll) acc)]
+        ;; Emit the finished chunk and produce the rest lazily.
+        (lazy-seq (cons (reverse acc) (take-through pred (rest coll)))))
+
+      :else
+      (recur (rest coll)
+             (cons (first coll) acc)))))
+
+(defn max-consecutive
+  "Returns the length of the longest run of consecutive elements of `coll`
+  for which `pred` is truthy, or 0 when there is none."
+  [pred coll]
+  (loop [coll coll
+         cur-count 0
+         max-count 0]
+    (cond
+      ;; A run that reaches the end of the collection has not been compared
+      ;; with the maximum yet, so it is taken into account here.
+      (empty? coll) (max cur-count max-count)
+      (pred (first coll)) (recur (rest coll) (inc cur-count) max-count)
+      :else (recur (rest coll) 0 (max cur-count max-count)))))
+
+(defn pp-word
+  "Formats a dictionary entry as three lines: the spelling, the phones, and
+  the `phonemap` category of each phone. Every phone and category is
+  left-aligned in a ten-character column and the columns are joined with a
+  single space. `word` is a sequence whose first element is the spelling and
+  whose remaining elements are phones."
+  [word]
+  (let [[spelling & phones] word
+        columns (fn [items]
+                  (string/join " " (map #(format "%-10s" %) items)))]
+    (format "%s\n%s\n%s"
+            spelling
+            (columns phones)
+            (columns (map phonemap phones)))))
+
+(defn count-pred
+  "Counts the elements of `coll` for which `pred` is truthy."
+  [pred coll]
+  (reduce (fn [total item]
+            (if (pred item) (inc total) total))
+          0
+          coll))
+
+(def count-vowels (partial count-pred vowel))
+
+;; Wraps a nested map in a clojure.zip zipper. The root is the map itself;
+;; every other node is a key/value entry produced by `seq` on a map, so its
+;; map of children is read with `second`. The :word key is never treated as
+;; a child.
+(defn node->zipper [node]
+  (z/zipper (fn branch? [node]
+              (cond
+                ;; Root: a branch when it has any key other than :word.
+                (map? node)
+                (->> (keys (into {} node))
+                     (remove #{:word})
+                     ((complement empty?)))
+                ;; Entry: apply the same test to the entry's value.
+                :else
+                (do
+                  (let [b (->> (keys (into {} (second node)))
+                               (remove #{:word})
+                               ((complement empty?)))]
+                    b))))
+            (fn children [node]
+              ;; Children are the entries of the node's map, minus :word.
+              (let [node (if (map? node) node (second node))
+                    ch (seq (select-keys node (remove #{:word} (keys node))))]
+                ch))
+            ;; Rebuilds a node from its children; the previous node is ignored.
+            (fn make-node [node ch]
+              (into {} ch))
+            node))
+
+(defn leafs
+  "Walks `zipper` with `z/next` until the end is reached and returns, lazily,
+  every visited node for which `leaf?` is truthy."
+  [leaf? zipper]
+  (for [loc (take-while (complement z/end?) (iterate z/next zipper))
+        :let [node (z/node loc)]
+        :when (leaf? node)]
+    node))
+
+;; `leafs` specialised to nodes whose second element has a :word entry.
+(def word-leafs
+  (partial leafs (fn [node] (:word (second node)))))
--- a/syllabify.py
+++ b/syllabify.py
@ -0,0 +1,279 @@
+#!/usr/bin/env python
+# Copyright (c) 2012-2013 Kyle Gorman <gormanky@ohsu.edu>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the
+# "Software"), to deal in the Software without restriction, including
+# without limitation the rights to use, copy, modify, merge, publish,
+# distribute, sublicense, and/or sell copies of the Software, and to
+# permit persons to whom the Software is furnished to do so, subject to
+# the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+#
+# syllabify.py: prosodic parsing of ARPABET entries
+
+from itertools import chain
+
+# constants
+SLAX = {
+    "IH1",
+    "IH2",
+    "EH1",
+    "EH2",
+    "AE1",
+    "AE2",
+    "AH1",
+    "AH2",
+    "UH1",
+    "UH2",
+}
+VOWELS = {
+    "IY1",
+    "IY2",
+    "IY0",
+    "EY1",
+    "EY2",
+    "EY0",
+    "AA1",
+    "AA2",
+    "AA0",
+    "ER1",
+    "ER2",
+    "ER0",
+    "AW1",
+    "AW2",
+    "AW0",
+    "AO1",
+    "AO2",
+    "AO0",
+    "AY1",
+    "AY2",
+    "AY0",
+    "OW1",
+    "OW2",
+    "OW0",
+    "OY1",
+    "OY2",
+    "OY0",
+    "IH0",
+    "EH0",
+    "AE0",
+    "AH0",
+    "UH0",
+    "UW1",
+    "UW2",
+    "UW0",
+    "UW",
+    "IY",
+    "EY",
+    "AA",
+    "ER",
+    "AW",
+    "AO",
+    "AY",
+    "OW",
+    "OY",
+    "UH",
+    "IH",
+    "EH",
+    "AE",
+    "AH",
+    "UH",
+} | SLAX
+
+# licit medial onsets
+
+O2 = {
+    ("P", "R"),
+    ("T", "R"),
+    ("K", "R"),
+    ("B", "R"),
+    ("D", "R"),
+    ("G", "R"),
+    ("F", "R"),
+    ("TH", "R"),
+    ("P", "L"),
+    ("K", "L"),
+    ("B", "L"),
+    ("G", "L"),
+    ("F", "L"),
+    ("S", "L"),
+    ("K", "W"),
+    ("G", "W"),
+    ("S", "W"),
+    ("S", "P"),
+    ("S", "T"),
+    ("S", "K"),
+    ("HH", "Y"),  # "clerihew"
+    ("R", "W"),
+}
+O3 = {("S", "T", "R"), ("S", "K", "L"), ("T", "R", "W")}  # "octroi"
+
+# This does not represent anything like a complete list of onsets, but
+# merely those that need to be maximized in medial position.
+
+
+def syllabify(pron, alaska_rule=True):
+    """
+    Syllabifies a CMU dictionary (ARPABET) word string
+
+    # Alaska rule:
+    >>> pprint(syllabify('AH0 L AE1 S K AH0'.split())) # Alaska
+    '-AH0-.L-AE1-S.K-AH0-'
+    >>> pprint(syllabify('AH0 L AE1 S K AH0'.split(), 0)) # Alaska
+    '-AH0-.L-AE1-.S K-AH0-'
+
+    # huge medial onsets:
+    >>> pprint(syllabify('M IH1 N S T R AH0 L'.split())) # minstrel
+    'M-IH1-N.S T R-AH0-L'
+    >>> pprint(syllabify('AA1  K T R W AA0 R'.split())) # octroi
+    '-AA1-K.T R W-AA0-R'
+
+    # destressing
+    >>> pprint(destress(syllabify('M IH1 L AH0 T EH2 R IY0'.split())))
+    'M-IH-.L-AH-.T-EH-.R-IY-'
+
+    # normal treatment of 'j':
+    >>> pprint(syllabify('M EH1 N Y UW0'.split())) # menu
+    'M-EH1-N.Y-UW0-'
+    >>> pprint(syllabify('S P AE1 N Y AH0 L'.split())) # spaniel
+    'S P-AE1-N.Y-AH0-L'
+    >>> pprint(syllabify('K AE1 N Y AH0 N'.split())) # canyon
+    'K-AE1-N.Y-AH0-N'
+    >>> pprint(syllabify('M IH0 N Y UW2 EH1 T'.split())) # minuet
+    'M-IH0-N.Y-UW2-.-EH1-T'
+    >>> pprint(syllabify('JH UW1 N Y ER0'.split())) # junior
+    'JH-UW1-N.Y-ER0-'
+    >>> pprint(syllabify('K L EH R IH HH Y UW'.split())) # clerihew
+    'K L-EH-.R-IH-.HH Y-UW-'
+
+    # nuclear treatment of 'j'
+    >>> pprint(syllabify('R EH1 S K Y UW0'.split())) # rescue
+    'R-EH1-S.K-Y UW0-'
+    >>> pprint(syllabify('T R IH1 B Y UW0 T'.split())) # tribute
+    'T R-IH1-B.Y-UW0-T'
+    >>> pprint(syllabify('N EH1 B Y AH0 L AH0'.split())) # nebula
+    'N-EH1-B.Y-AH0-.L-AH0-'
+    >>> pprint(syllabify('S P AE1 CH UH0 L AH0'.split())) # spatula
+    'S P-AE1-.CH-UH0-.L-AH0-'
+    >>> pprint(syllabify('AH0 K Y UW1 M AH0 N'.split())) # acumen
+    '-AH0-K.Y-UW1-.M-AH0-N'
+    >>> pprint(syllabify('S AH1 K Y AH0 L IH0 N T'.split())) # succulent
+    'S-AH1-K.Y-AH0-.L-IH0-N T'
+    >>> pprint(syllabify('F AO1 R M Y AH0 L AH0'.split())) # formula
+    'F-AO1 R-M.Y-AH0-.L-AH0-'
+    >>> pprint(syllabify('V AE1 L Y UW0'.split())) # value
+    'V-AE1-L.Y-UW0-'
+
+    # everything else
+    >>> pprint(syllabify('N AO0 S T AE1 L JH IH0 K'.split())) # nostalgic
+    'N-AO0-.S T-AE1-L.JH-IH0-K'
+    >>> pprint(syllabify('CH ER1 CH M AH0 N'.split())) # churchmen
+    'CH-ER1-CH.M-AH0-N'
+    >>> pprint(syllabify('K AA1 M P AH0 N S EY2 T'.split())) # compensate
+    'K-AA1-M.P-AH0-N.S-EY2-T'
+    >>> pprint(syllabify('IH0 N S EH1 N S'.split())) # inCENSE
+    '-IH0-N.S-EH1-N S'
+    >>> pprint(syllabify('IH1 N S EH2 N S'.split())) # INcense
+    '-IH1-N.S-EH2-N S'
+    >>> pprint(syllabify('AH0 S EH1 N D'.split())) # ascend
+    '-AH0-.S-EH1-N D'
+    >>> pprint(syllabify('R OW1 T EY2 T'.split())) # rotate
+    'R-OW1-.T-EY2-T'
+    >>> pprint(syllabify('AA1 R T AH0 S T'.split())) # artist
+    '-AA1 R-.T-AH0-S T'
+    >>> pprint(syllabify('AE1 K T ER0'.split())) # actor
+    '-AE1-K.T-ER0-'
+    >>> pprint(syllabify('P L AE1 S T ER0'.split())) # plaster
+    'P L-AE1-S.T-ER0-'
+    >>> pprint(syllabify('B AH1 T ER0'.split())) # butter
+    'B-AH1-.T-ER0-'
+    >>> pprint(syllabify('K AE1 M AH0 L'.split())) # camel
+    'K-AE1-.M-AH0-L'
+    >>> pprint(syllabify('AH1 P ER0'.split())) # upper
+    '-AH1-.P-ER0-'
+    >>> pprint(syllabify('B AH0 L UW1 N'.split())) # balloon
+    'B-AH0-.L-UW1-N'
+    >>> pprint(syllabify('P R OW0 K L EY1 M'.split())) # proclaim
+    'P R-OW0-.K L-EY1-M'
+    >>> pprint(syllabify('IH0 N S EY1 N'.split())) # insane
+    '-IH0-N.S-EY1-N'
+    >>> pprint(syllabify('IH0 K S K L UW1 D'.split())) # exclude
+    '-IH0-K.S K L-UW1-D'
+    """
+    # Returns a list of (onset, nucleus, coda) tuples, one per vowel found in
+    # `pron`; each element of a tuple is a list of phone strings.
+    ## main pass
+    mypron = list(pron)
+    nuclei = []
+    onsets = []
+    i = -1  # index of the most recently seen vowel
+    for (j, seg) in enumerate(mypron):
+        if seg in VOWELS:
+            nuclei.append([seg])
+            onsets.append(mypron[i + 1 : j])  # actually interludes, r.n.
+            i = j
+    # whatever follows the last vowel is the final coda
+    codas = [mypron[i + 1 :]]
+    ## resolve disputes and compute coda
+    # onsets[0] is word-initial and is left as is; every later entry holds
+    # the consonants between two vowels and is divided between the coda of
+    # the previous syllable and the onset of this one.
+    for i in range(1, len(onsets)):
+        coda = []
+        # boundary cases
+        # a leading "R" in a cluster joins the preceding nucleus
+        if len(onsets[i]) > 1 and onsets[i][0] == "R":
+            nuclei[i - 1].append(onsets[i].pop(0))
+        # a trailing "Y" in a cluster of three or more joins this nucleus
+        if len(onsets[i]) > 2 and onsets[i][-1] == "Y":
+            nuclei[i].insert(0, onsets[i].pop())
+        # Alaska rule: after a nucleus ending in a SLAX vowel, a leading "S"
+        # in a cluster goes to the coda
+        if (
+            len(onsets[i]) > 1
+            and alaska_rule
+            and nuclei[i - 1][-1] in SLAX
+            and onsets[i][0] == "S"
+        ):
+            coda.append(onsets[i].pop(0))
+        # onset maximization
+        # depth is how many trailing consonants stay in the onset
+        depth = 1
+        if len(onsets[i]) > 1:
+            if tuple(onsets[i][-2:]) in O2:
+                depth = 3 if tuple(onsets[i][-3:]) in O3 else 2
+        # everything before the kept onset moves to the coda
+        for j in range(len(onsets[i]) - depth):
+            coda.append(onsets[i].pop(0))
+        # store coda
+        codas.insert(i - 1, coda)
+
+    ## verify that all segments are included in the output
+    output = list(zip(onsets, nuclei, codas))  # in Python3 zip is a generator
+    flat_output = list(chain.from_iterable(chain.from_iterable(output)))
+    if flat_output != mypron:
+        raise ValueError(f"could not syllabify {mypron}, got {flat_output}")
+    return output
+
+
+def pprint(syllab):
+    """
+    Pretty-print a syllabification
+    """
+    return ".".join("-".join(" ".join(p) for p in syl) for syl in syllab)
+
+
+def destress(syllab):
+    """
+    Generate a syllabification with nuclear stress information removed
+    """
+    syls = []
+    for (onset, nucleus, coda) in syllab:
+        nuke = [p[:-1] if p[-1] in {"0", "1", "2"} else p for p in nucleus]
+        syls.append((onset, nuke, coda))
+    return syls
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod()