From 6164773515d76372ed1d4ae33cf41b5dbfe70267 Mon Sep 17 00:00:00 2001
From: Eric Ihli <eihli@owoga.com>
Date: Thu, 24 Jun 2021 12:57:40 -0500
Subject: [PATCH] Add function to phonify a space-separated phrase

---
 deps.edn                                    |  3 ++-
 src/com/owoga/phonetics.clj                 | 28 ++++++++++++++++++++-
 src/com/owoga/phonetics/util.clj            |  7 +++++-
 test/com/owoga/phonetics/syllabify_test.clj |  5 +++-
 test/com/owoga/phonetics_test.clj           |  6 ++++-
 5 files changed, 44 insertions(+), 5 deletions(-)

diff --git a/deps.edn b/deps.edn
index d9546ff..5792f45 100644
--- a/deps.edn
+++ b/deps.edn
@@ -1,6 +1,7 @@
 {:paths ["src" "resources"]
  :deps {org.clojure/clojure {:mvn/version "1.10.3"}
-        net.sf.sociaal/freetts {:mvn/version "1.2.2"}}
+        net.sf.sociaal/freetts {:mvn/version "1.2.2"}
+        org.clojure/math.combinatorics {:mvn/version "0.1.6"}}
  :aliases
  {:test {:extra-paths ["test"]
          :extra-deps {org.clojure/test.check {:mvn/version "1.1.0"}}}
diff --git a/src/com/owoga/phonetics.clj b/src/com/owoga/phonetics.clj
index 1c18b47..46862ef 100644
--- a/src/com/owoga/phonetics.clj
+++ b/src/com/owoga/phonetics.clj
@@ -2,7 +2,8 @@
   (:require [clojure.set]
             [clojure.string :as string]
             [clojure.java.io :as io]
-            [clojure.set :as set])
+            [clojure.set :as set]
+            [clojure.math.combinatorics :as combinatorics])
   (:import (com.sun.speech.freetts.en.us CMULexicon)))
 
 #_(set! *warn-on-reflection* true)
@@ -202,12 +203,31 @@
         (.getPhones cmu-lexicon word nil))])))
 
 (defn get-word
+  "Returns vector of all words that are in the CMU pronouncing dictionary
+  that have the pronunciation given by `phones`.
+
+  Accepts phones with stress digits (e.g. AA1) or with stress removed (e.g. AA).
+
+  Not an exact inverse of `get-phones` since `get-phones` can figure out
+  somewhat appropriate phones for a made-up word. This function cannot
+  figure out the spelling of a made-up word provided the made-up word's phones.
+
+  Returns nil if no word can be found."
   [phones]
   (let [stressed? (some #(re-matches #".*\d" %) phones)]
     (if stressed?
       (stressed-phones-to-cmu-word-map phones)
       (unstressed-phones-to-cmu-word-map phones))))
 
+(defn phrase-phones
+  "Pronunciations of a phrase of words separated by spaces."
+  [phrase]
+  (->> phrase
+       (#(string/split % #" "))
+       (map get-phones)
+       (apply combinatorics/cartesian-product)
+       (mapv (partial reduce into []))))
+
 (comment
   (get-phones "alaska")
   ;; => [["AH0" "L" "AE1" "S" "K" "AH0"]]
@@ -220,4 +240,10 @@
   ;; => ["alaska"]
   (get-word ["N" "IY" "S"])
   ;; => ["neice" "neece" "niece" "nice(1)" "kneece" "kniess" "neiss" "neace" "niess"]
+  (get-word ["F" "UW" "B" "AE" "Z"])
+  ;; => nil
+  (phrase-phones "bog hog")
+  ;; => [["B" "AA1" "G" "HH" "AA1" "G"]
+  ;;     ["B" "AO1" "G" "HH" "AA1" "G"]]
+
   )
diff --git a/src/com/owoga/phonetics/util.clj b/src/com/owoga/phonetics/util.clj
index 52cfff3..8f6fef9 100644
--- a/src/com/owoga/phonetics/util.clj
+++ b/src/com/owoga/phonetics/util.clj
@@ -4,7 +4,7 @@
 
 (defn take-through
   "(take-through even? [1 2 3 4 7 7 5 2 8 10])
-   returns '((1 2 3 4) (7 7 5 2) (8) (10))"
+  returns '((1 2) (3 4) (7 7 5 2) (8) (10))"
   [pred coll]
   (loop [coll coll
          acc '()]
@@ -19,3 +19,8 @@
       :else
       (recur (rest coll)
              (cons (first coll) acc)))))
+
+(comment
+  (take-through even? [1 2 3 4 7 7 5 2 8 10])
+  ;; => ((1 2) (3 4) (7 7 5 2) (8) (10))
+  )
diff --git a/test/com/owoga/phonetics/syllabify_test.clj b/test/com/owoga/phonetics/syllabify_test.clj
index 3101279..60b170b 100644
--- a/test/com/owoga/phonetics/syllabify_test.clj
+++ b/test/com/owoga/phonetics/syllabify_test.clj
@@ -41,4 +41,7 @@
            (syllabify ["P" "IH" "L" "OW"]))))
   (testing "steel"
     (is (= [["S" "T" "IY1" "L"]]
-           (syllabify ["S" "T" "IY1" "L"])))))
+           (syllabify ["S" "T" "IY1" "L"]))))
+  (testing "scotch"
+    (is (= [["S" "K" "AA1" "CH"]]
+           (syllabify ["S" "K" "AA1" "CH"])))))
diff --git a/test/com/owoga/phonetics_test.clj b/test/com/owoga/phonetics_test.clj
index ff17105..a4e31f3 100644
--- a/test/com/owoga/phonetics_test.clj
+++ b/test/com/owoga/phonetics_test.clj
@@ -13,4 +13,8 @@
     (is (= ["hello(1)"]
            (get-word ["HH" "EH" "L" "OW"])))
     (is (= ["ensure(1)" "insure"]
-           (get-word ["IH" "N" "SH" "UH" "R"])))))
+           (get-word ["IH" "N" "SH" "UH" "R"]))))
+  (testing "phrase to phones"
+    (is (= [["B" "AA1" "G" "HH" "AA1" "G"]
+            ["B" "AO1" "G" "HH" "AA1" "G"]]
+           (phrase-phones "bog hog")))))