From 6164773515d76372ed1d4ae33cf41b5dbfe70267 Mon Sep 17 00:00:00 2001 From: Eric Ihli Date: Thu, 24 Jun 2021 12:57:40 -0500 Subject: [PATCH] Add function phonify a space-separated phrase --- deps.edn | 3 ++- src/com/owoga/phonetics.clj | 28 ++++++++++++++++++++- src/com/owoga/phonetics/util.clj | 7 +++++- test/com/owoga/phonetics/syllabify_test.clj | 5 +++- test/com/owoga/phonetics_test.clj | 6 ++++- 5 files changed, 44 insertions(+), 5 deletions(-) diff --git a/deps.edn b/deps.edn index d9546ff..5792f45 100644 --- a/deps.edn +++ b/deps.edn @@ -1,6 +1,7 @@ {:paths ["src" "resources"] :deps {org.clojure/clojure {:mvn/version "1.10.3"} - net.sf.sociaal/freetts {:mvn/version "1.2.2"}} + net.sf.sociaal/freetts {:mvn/version "1.2.2"} + org.clojure/math.combinatorics {:mvn/version "0.1.6"}} :aliases {:test {:extra-paths ["test"] :extra-deps {org.clojure/test.check {:mvn/version "1.1.0"}}} diff --git a/src/com/owoga/phonetics.clj b/src/com/owoga/phonetics.clj index 1c18b47..46862ef 100644 --- a/src/com/owoga/phonetics.clj +++ b/src/com/owoga/phonetics.clj @@ -2,7 +2,8 @@ (:require [clojure.set] [clojure.string :as string] [clojure.java.io :as io] - [clojure.set :as set]) + [clojure.set :as set] + [clojure.math.combinatorics :as combinatorics]) (:import (com.sun.speech.freetts.en.us CMULexicon))) #_(set! *warn-on-reflection* true) @@ -202,12 +203,31 @@ (.getPhones cmu-lexicon word nil))]))) (defn get-word + "Returns vector of all words that are in the CMU pronouncing dictionary + that have the pronunciation given `phones`. + + Expects phones to have stress removed. + + Not an exact inverse of `get-phones` since `get-phones` can figure out + somewhat appropriate phones for a made-up word. This function cannot + figure out the spelling of a made-up word provided the made-up word's phones. + + Returns nil if no word can be found." [phones] (let [stressed? (some #(re-matches #".*\d" %) phones)] (if stressed? 
(stressed-phones-to-cmu-word-map phones) (unstressed-phones-to-cmu-word-map phones)))) +(defn phrase-phones + "Pronunciations of words separated by spaces." + [phrase] + (->> phrase + (#(string/split % #" ")) + (map get-phones) + (apply combinatorics/cartesian-product) + (mapv (partial reduce into [])))) + (comment (get-phones "alaska") ;; => [["AH0" "L" "AE1" "S" "K" "AH0"]] @@ -220,4 +240,10 @@ ;; => ["alaska"] (get-word ["N" "IY" "S"]) ;; => ["neice" "neece" "niece" "nice(1)" "kneece" "kniess" "neiss" "neace" "niess"] + (get-word ["F" "UW" "B" "AE" "Z"]) + ;; => nil + (phrase-phones "bog hog") + ;; [["B" "AA1" "G" "HH" "AA1" "G"] + ;; ["B" "AO1" "G" "HH" "AA1" "G"]] + ) diff --git a/src/com/owoga/phonetics/util.clj b/src/com/owoga/phonetics/util.clj index 52cfff3..8f6fef9 100644 --- a/src/com/owoga/phonetics/util.clj +++ b/src/com/owoga/phonetics/util.clj @@ -4,7 +4,7 @@ (defn take-through "(take-through even? [1 2 3 4 7 7 5 2 8 10]) - returns '((1 2 3 4) (7 7 5 2) (8) (10))" + returns '((1 2) (3 4) (7 7 5 2) (8) (10))" [pred coll] (loop [coll coll acc '()] @@ -19,3 +19,8 @@ :else (recur (rest coll) (cons (first coll) acc))))) + +(comment + (take-through even? 
[1 2 3 4 7 7 5 2 8 10]) + ;; => ((1 2) (3 4) (7 7 5 2) (8) (10)) + ) diff --git a/test/com/owoga/phonetics/syllabify_test.clj b/test/com/owoga/phonetics/syllabify_test.clj index 3101279..60b170b 100644 --- a/test/com/owoga/phonetics/syllabify_test.clj +++ b/test/com/owoga/phonetics/syllabify_test.clj @@ -41,4 +41,7 @@ (syllabify ["P" "IH" "L" "OW"])))) (testing "steel" (is (= [["S" "T" "IY1" "L"]] - (syllabify ["S" "T" "IY1" "L"]))))) + (syllabify ["S" "T" "IY1" "L"])))) + (testing "scotch" + (is (= [["S" "K" "AA1" "CH"]] + (syllabify ["S" "K" "AA1" "CH"]))))) diff --git a/test/com/owoga/phonetics_test.clj b/test/com/owoga/phonetics_test.clj index ff17105..a4e31f3 100644 --- a/test/com/owoga/phonetics_test.clj +++ b/test/com/owoga/phonetics_test.clj @@ -13,4 +13,8 @@ (is (= ["hello(1)"] (get-word ["HH" "EH" "L" "OW"]))) (is (= ["ensure(1)" "insure"] - (get-word ["IH" "N" "SH" "UH" "R"]))))) + (get-word ["IH" "N" "SH" "UH" "R"])))) + (testing "phrase to phones" + (is (= [["B" "AA1" "G" "HH" "AA1" "G"] + ["B" "AO1" "G" "HH" "AA1" "G"]] + (phrase-phones "bog hog")))))