fixup! Fix bug from refactor

Move non-syllable-related defs to phonetics ns
Add stress manipulation functions
10 changed files with 176 additions and 60 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,24 +1,22 @@
 # Change Log
 All notable changes to this project will be documented in this file. This change log follows the conventions of [keepachangelog.com](http://keepachangelog.com/).

-## [Unreleased]
-### Changed
- Add a new arity to `make-widget-async` to provide a different widget shape.
-
-## [0.1.1] - 2021-04-22
-### Changed
- Documentation on how to make the widgets.
-
-### Removed
- `make-widget-sync` - we're all async, all the time.
+## [0.1.3] - 2021-05-02
+### Fixed
+- Fixed bug syllabifying words that begin with consonants that don't adhere to sonority hierarchy.
+  - "Steel", for example. "T" is less sonorous than "S" and typically wouldn't be included in an onset, but since there are no vowels preceding the "ST" then both *should* be included in the onset.

+## [0.1.2] - 2021-04-22
 ### Fixed
- Fixed widget maker to keep working when daylight savings switches over.
+- Fixed bug when getting phones from CMULexicon because the word wasn't found in the CMU dictionary. (Missing parens)
+- Comment out warn-on-reflection code that was just being used to find performance gains.

-## 0.1.0 - 2021-04-22
+## 0.1.1
 ### Added
- Files from the new template.
- Widget maker public API - `make-widget-sync`.

-[Unreleased]: https://github.com/com.owoga/phonetics/compare/0.1.1...HEAD
-[0.1.1]: https://github.com/com.owoga/phonetics/compare/0.1.0...0.1.1
+Initial release
+
+- Phonetics and syllabification utilities 
+
+[Unreleased]: https://github.com/com.owoga/phonetics/compare/0.1.2...HEAD
+[0.1.2]: https://github.com/com.owoga/phonetics/compare/0.1.1...0.1.2
--- a/deps.edn
+++ b/deps.edn
@ -1,6 +1,7 @@
 {:paths ["src" "resources"]
 :deps {org.clojure/clojure {:mvn/version "1.10.3"}
-        net.sf.sociaal/freetts {:mvn/version "1.2.2"}}
+        net.sf.sociaal/freetts {:mvn/version "1.2.2"}
+        org.clojure/math.combinatorics {:mvn/version "0.1.6"}}
 :aliases
 {:test {:extra-paths ["test"]
         :extra-deps {org.clojure/test.check {:mvn/version "1.1.0"}}}
--- a/pom.xml
+++ b/pom.xml
@ -3,10 +3,10 @@
  <modelVersion>4.0.0</modelVersion>
  <groupId>com.owoga</groupId>
  <artifactId>phonetics</artifactId>
-  <version>0.1.1</version>
+  <version>0.1.3</version>
  <name>com.owoga/phonetics</name>
  <description>Phonetics and syllabification of English words.</description>
-  <url>https://github.com/com.owoga/phonetics</url>
+  <url>https://github.com/eihli/phonetics</url>
  <licenses>
    <license>
      <name>MIT License</name>
@ -19,9 +19,9 @@
    </developer>
  </developers>
  <scm>
-    <url>https://github.com/com.owoga/phonetics</url>
-    <connection>scm:git:git://github.com/com.owoga/phonetics.git</connection>
-    <developerConnection>scm:git:ssh://git@github.com/com.owoga/phonetics.git</developerConnection>
+    <url>https://github.com/eihli/phonetics</url>
+    <connection>scm:git:git://github.com/eihli/phonetics.git</connection>
+    <developerConnection>scm:git:ssh://git@github.com/eihli/phonetics.git</developerConnection>
    <tag>v0.1.0-SNAPSHOT</tag>
  </scm>
  <dependencies>
--- a/resources/07IJCAI-spelling-variants.pdf
+++ b/resources/07IJCAI-spelling-variants.pdf
--- a/src/com/owoga/phonetics.clj
+++ b/src/com/owoga/phonetics.clj
@ -2,7 +2,8 @@
  (:require [clojure.set]
            [clojure.string :as string]
            [clojure.java.io :as io]
-            [clojure.set :as set])
+            [clojure.set :as set]
+            [clojure.math.combinatorics :as combinatorics])
  (:import (com.sun.speech.freetts.en.us CMULexicon)))

 #_(set! *warn-on-reflection* true)
@ -155,6 +156,34 @@
  two different ways of getting phonemes."
  (CMULexicon/getInstance true))

+
+;; This sonority hierarchy may not be perfect.
+;; It stems from: http://www.glottopedia.org/index.php/Sonority_hierarchy
+;; I tried to match the phones provided by the CMU dict to the hierarchies
+;; listed on that page:
+;;   vowels > liquids > nasals > voiced fricatives
+;;   > voiceless fricatives = voiced plosives
+;;   > voiceless plosives (Anderson & Ewen 1987)
+(def ^clojure.lang.PersistentVector sonority-hierarchy
+  ;;   more sonorous  < < < vowel < < < (maximal onset) vowel > > > less sonorous
+  ["vowel" "liquid" "semivowel" "aspirate" "affricate" "nasal" "fricative" "stop"])
+
+(def lax-vowels #{"EH" "IH" "AE" "AH" "UH"})
+
+(defn sonority [phone]
+  (.indexOf sonority-hierarchy (phonemap phone)))
+
+(defn vowel? [phone]
+  (vowel (string/replace phone #"\d" "")))
+
+(def consonant? (complement vowel?))
+
+(defn >sonorous [a b]
+  (< (sonority a) (sonority b)))
+
+(defn <sonorous [a b]
+  (> (sonority a) (sonority b)))
+
 (defn remove-stress [phonemes]
  (mapv #(string/replace % #"\d" "") phonemes))

@ -202,12 +231,31 @@
        (.getPhones cmu-lexicon word nil))])))

 (defn get-word
+  "Returns vector of all words that are in the CMU pronouncing dictionary
+  that have the pronunciation given `phones`.
+
+  Expects phones to have stress removed.
+
+  Not an exact inverse of `get-phones` since `get-phones` can figure out
+  somewhat appropriate phones for a made-up word. This function cannot
+  figure out the spelling of a made-up word provided the made-up word's phones.
+
+  Returns nil if no word can be found."
  [phones]
  (let [stressed? (some #(re-matches #".*\d" %) phones)]
    (if stressed?
      (stressed-phones-to-cmu-word-map phones)
      (unstressed-phones-to-cmu-word-map phones))))

+(defn phrase-phones
+  "Pronunciations of a phrase of words separated by spaces."
+  [phrase]
+  (->> phrase
+       (#(string/split % #" "))
+       (map get-phones)
+       (apply combinatorics/cartesian-product)
+       (mapv (partial reduce into []))))
+
 (comment
  (get-phones "alaska")
  ;; => [["AH0" "L" "AE1" "S" "K" "AH0"]]
@ -220,4 +268,10 @@
  ;; => ["alaska"]
  (get-word ["N" "IY" "S"])
  ;; => ["neice" "neece" "niece" "nice(1)" "kneece" "kniess" "neiss" "neace" "niess"]
+  (get-word ["F" "UW" "B" "AE" "Z"])
+  ;; => nil
+  (phrase-phones "bog hog")
+  ;;  [["B" "AA1" "G" "HH" "AA1" "G"]
+  ;;   ["B" "AO1" "G" "HH" "AA1" "G"]]
+
  )
--- a/src/com/owoga/phonetics/stress_manip.clj
+++ b/src/com/owoga/phonetics/stress_manip.clj
@ -0,0 +1,64 @@
+(ns com.owoga.phonetics.stress-manip
+  (:require [clojure.string :as string]))
+
+(defn primary-stress?
+  [phone]
+  (re-find #"1" phone))
+
+(defn non-primary-stress?
+  [phone]
+  (re-find #"[2-9]" phone))
+
+(defn unstressed?
+  [phone]
+  (re-find #"0" phone))
+
+(defn remove-any-stress-signifiers
+  [phones]
+  (map #(string/replace % #"\d" "") phones))
+
+(defn remove-non-primary-stress-signifiers
+  [phones]
+  (map #(string/replace % #"[02-9]" "") phones))
+
+(defn unify-stressed
+  [phones]
+  (map #(string/replace % #"[2-9]" "1") phones))
+
+(def consonant-unification-map
+  "This almost aligns with the phonemap that maps phones to whether they are vowels, aspirates, nasals, etc...
+  Slight but possibly important difference in stops. For example, I think T and D
+  are more unified than T and G; and G and K are more unified than G and T."
+  {"T" "T"
+   "CH" "CH"
+   "K" "K"
+   "HH" "HH"
+   "L" "L"
+   "JH" "CH" ;; <-
+   "G" "K"   ;; <-
+   "M" "M"   ;; <-
+   "S" "S"
+   "Y" "Y"
+   "Z" "S"   ;; <-
+   "R" "R"
+   "F" "F"
+   "B" "B"
+   "SH" "CH" ;; <-
+   "P" "B"   ;; <-
+   "V" "F"   ;; <-
+   "TH" "T"  ;; <-
+   "N" "M"   ;; <-
+   "DH" "T"  ;; <-
+   "W"  "Y"  ;; <-
+   "ZH" "S"  ;; <-
+   "NG" "M"  ;; <-
+   "D" "T"   ;; <-
+   })
+
+(defn unify-consonants
+  [phones]
+  (mapv #(get consonant-unification-map % %) phones))
+
+(defn remove-unstressed-signifiers
+  [phones] ;; seq of phone strings, e.g. ["AH0" "L" "AE1"]
+  (map #(string/replace % #"0" "") phones)) ;; strip only the "0" (unstressed) marker; pass `phones` so a seq, not a transducer, is returned
--- a/src/com/owoga/phonetics/syllabify.clj
+++ b/src/com/owoga/phonetics/syllabify.clj
@ -5,37 +5,11 @@

 #_(set! *warn-on-reflection* true)

-;; This sonority hierarchy may not be perfect.
-;; It stems from: http://www.glottopedia.org/index.php/Sonority_hierarchy
-;; I tried to match the phones provided by the CMU dict to the hierarchies
-;; listed on that page:
-;;   vowels > liquids > nasals > voiced fricatives
-;;   > voiceless fricatives = voiced plosives
-;;   > voiceless plosives (Anderson & Ewen 1987)
-(def ^clojure.lang.PersistentVector sonority-hierarchy
-  ;;   more sonorous  < < < vowel < < < (maximal onset) vowel > > > less sonorous
-  ["vowel" "liquid" "semivowel" "aspirate" "affricate" "nasal" "fricative" "stop"])
-
-(def lax-vowels #{"EH" "IH" "AE" "AH" "UH"})
-
-(defn sonority [phone]
-  (.indexOf sonority-hierarchy (phonetics/phonemap phone)))
-
-(defn vowel? [phone]
-  (phonetics/vowel phone))
-
-(def consonant? (complement vowel?))
-
-(defn >sonorous [a b]
-  (< (sonority a) (sonority b)))
-
-(defn <sonorous [a b]
-  (> (sonority a) (sonority b)))
-
 (defn slurp-rime
-  "Returns a vector of the rime and the remaining phones to process."
+  "Expects the phones in reverse order.
+  Returns a vector of the rime (in forwards order) and the remaining phones to process."
  [phones]
-  (let [splits (util/take-through vowel? phones)]
+  (let [splits (util/take-through phonetics/vowel? phones)]
    [(vec (reverse (first splits))) (vec (flatten (rest splits)))]))

 (comment
@ -61,18 +35,22 @@
      ;; Two vowels next to each other is treated as two syllables.
      ;; This might not always be the case if the vowels are lax.
      ;; Is "royal" 1 syllable or two? This treats it as two.
-      (vowel? (nth phones 0))
+      (phonetics/vowel? (nth phones 0))
      [syllable phones]

      ;; Maximal onset principle with exception for lax vowels occurring in
      ;; closed syllables.
-      (and (consonant? (nth syllable 0))
-           (<sonorous (nth phones 0) (nth syllable 0))
-           (not (lax-vowels (nth phones 1 nil))))
+      (and (phonetics/consonant? (nth syllable 0))
+           (phonetics/<sonorous (nth phones 0) (nth syllable 0))
+           (not (phonetics/lax-vowels (nth phones 1 nil))))
+      (recur (subvec phones 1)
+             (into [(nth phones 0)] syllable))
+
+      (phonetics/vowel? (nth syllable 0))
      (recur (subvec phones 1)
             (into [(nth phones 0)] syllable))

-      (vowel? (nth syllable 0))
+      (not-any? phonetics/vowel? phones)
      (recur (subvec phones 1)
             (into [(nth phones 0)] syllable))

@ -128,6 +106,9 @@
          (recur phones'' (into [syllable] segments)))))))

 (comment
+  (syllabify ["S" "T" "IY" "L"])
+  (slurp-rime (reverse ["S" "T" "IY" "L"]))
+  (slurp-onset-given-rime ["T" "S"] ["IY" "L"])
  (phonetics/remove-stress ["AH" "L" "AE" "S" "K" "AH"])
  (slurp-onset-given-rime ["L" "AE" "S" "K" "AH"] ["AH"])
  (syllabify ["AH0" "L" "AE1" "S" "K" "AH0"])
--- a/src/com/owoga/phonetics/util.clj
+++ b/src/com/owoga/phonetics/util.clj
@ -4,7 +4,7 @@

 (defn take-through
  "(take-through even? [1 2 3 4 7 7 5 2 8 10])
-   returns '((1 2 3 4) (7 7 5 2) (8) (10))"
+  returns '((1 2) (3 4) (7 7 5 2) (8) (10))"
  [pred coll]
  (loop [coll coll
         acc '()]
@ -19,3 +19,8 @@
      :else
      (recur (rest coll)
             (cons (first coll) acc)))))
+
+(comment
+  (take-through even? [1 2 3 4 7 7 5 2 8 10])
+  ;; => ((1 2) (3 4) (7 7 5 2) (8) (10))
+  )
--- a/test/com/owoga/phonetics/syllabify_test.clj
+++ b/test/com/owoga/phonetics/syllabify_test.clj
@ -38,4 +38,10 @@
  ;; about handling ambisyllabic words. There's no such thing.
  (testing "pillow"
    (is (= '(("P" "IH") ("L" "OW"))
-           (syllabify ["P" "IH" "L" "OW"])))))
+           (syllabify ["P" "IH" "L" "OW"]))))
+  (testing "steel"
+    (is (= [["S" "T" "IY1" "L"]]
+           (syllabify ["S" "T" "IY1" "L"]))))
+  (testing "scotch"
+    (is (= [["S" "K" "AA1" "CH"]]
+           (syllabify ["S" "K" "AA1" "CH"])))))
--- a/test/com/owoga/phonetics_test.clj
+++ b/test/com/owoga/phonetics_test.clj
@ -13,4 +13,8 @@
    (is (= ["hello(1)"]
           (get-word ["HH" "EH" "L" "OW"])))
    (is (= ["ensure(1)" "insure"]
-           (get-word ["IH" "N" "SH" "UH" "R"])))))
+           (get-word ["IH" "N" "SH" "UH" "R"]))))
+  (testing "phrase to phones"
+    (is (= [["B" "AA1" "G" "HH" "AA1" "G"]
+            ["B" "AO1" "G" "HH" "AA1" "G"]]
+           (phrase-phones "bog hog")))))
Author	SHA1	Message	Date
Eric Ihli	a93544a9c3	fixup! Fix bug from refactor	4 years ago
Eric Ihli	d0cf546bf7	Move non-syllable-related defs to phonetics ns	4 years ago
Eric Ihli	3385e7fd15	Add stress manipulation functions Useful for comparing the phonetics in a more flexible manner.	4 years ago
Eric Ihli	6164773515	Add function phonify a space-separated phrase	4 years ago
Eric Ihli	9d547a2733	Improve docstring on slurp-rime	4 years ago
Eric Ihli	e6481308cc	Fix pom.xml repo url	4 years ago
Eric Ihli	a44feaa166	Bump version for release, update changelog.	4 years ago
Eric Ihli	e4722dbb69	Add resource for spelling variants	4 years ago
Eric Ihli	d0f5ed1733	Fix bug incorrectly syllabifying steel If there are no vowels left when slurping an onset, then every remaining phone belongs to the syllable, regardless of any other rule.	4 years ago
Eric Ihli	249919d684	Bump version	4 years ago