From 6cf1236708a4417f9700f71530b0dc08e13505b6 Mon Sep 17 00:00:00 2001 From: Eric Ihli Date: Mon, 26 Apr 2021 10:06:13 -0500 Subject: [PATCH] Misc updates to tpt --- .gitattributes | 1 + deps.edn | 3 +- dev/examples/tpt.clj | 71 ++++++++++++++++++------- src/com/owoga/prhyme/data/phonetics.clj | 50 +++++++++++++---- src/com/owoga/prhyme/syllabify.clj | 3 ++ 5 files changed, 98 insertions(+), 30 deletions(-) diff --git a/.gitattributes b/.gitattributes index 0b1d71a..182eaf9 100644 --- a/.gitattributes +++ b/.gitattributes @@ -15,3 +15,4 @@ resources/dark-corpus-tpt.bin filter=lfs diff=lfs merge=lfs -text resources/database.bin filter=lfs diff=lfs merge=lfs -text resources/backwards-database.bin filter=lfs diff=lfs merge=lfs -text resources/dark-corpus-backwards-tpt.bin filter=lfs diff=lfs merge=lfs -text +**.bin filter=lfs diff=lfs merge=lfs -text diff --git a/deps.edn b/deps.edn index 66e50ad..e3828bc 100644 --- a/deps.edn +++ b/deps.edn @@ -21,6 +21,7 @@ com.taoensso/nippy {:mvn/version "3.0.0"} com.taoensso/timbre {:mvn/version "4.10.0"} com.owoga/tightly-packed-trie - {:local/root "/home/eihli/code/tightly-packed-trie"}} + {:local/root "/home/eihli/code/tightly-packed-trie"} + com.owoga/phonetics {:mvn/version "0.1.1"}} :aliases {:dev {:extra-paths ["test" "examples" "dev"] :extra-deps {}}}} diff --git a/dev/examples/tpt.clj b/dev/examples/tpt.clj index 62c33c0..89ed2d2 100644 --- a/dev/examples/tpt.clj +++ b/dev/examples/tpt.clj @@ -1,6 +1,8 @@ (ns examples.tpt (:require [clojure.string :as string] [clojure.java.io :as io] + [com.owoga.phonetics :as owoga.phonetics] + [com.owoga.phonetics.syllabify :as owoga.syllabify] [com.owoga.prhyme.core :as prhyme] [com.owoga.prhyme.nlp.core :as nlp] [taoensso.tufte :as tufte :refer (defnp p profiled profile)] @@ -22,13 +24,6 @@ (drop start) (take end))) -(defn dark-corpus-file-seq [start end] - (let [xf (comp (remove #(.isDirectory %)) - (drop start) - (take end)) - documents (file-seq (io/file "dark-corpus"))] - (transduce xf conj documents))) - (def re-word "Regex for tokenizing a string into words (including contractions and hyphenations), @@ -262,6 +257,14 @@ ngrams-ids)))))) (comment + (transduce (comp (xf-file-seq 0 10) + (map slurp) + (map (partial n-to-m-grams 1 5)) + #_#_(map (fn [ngrams] (map #(prep-ngram-for-trie %) ngrams))) + stateful-transducer) + conj + (file-seq (io/file "dark-corpus"))) + (time (def trie (transduce (comp (xf-file-seq 0 250000) @@ -384,7 +387,7 @@ (defn syllabify-with-stress [word] (let [phones (word->phones word) phones-without-stress (map #(string/replace % #"\d" "") phones) - syllables (syllabify/syllabify phones-without-stress)] + syllables (first (owoga.syllabify/syllabify phones-without-stress))] (loop [phones phones syllables syllables result [[]]] @@ -411,6 +414,8 @@ (comment (syllabify-phrase-with-stress "bother me") + (word->phones "bother me") + (map (comp owoga.syllabify/syllabify first owoga.phonetics/get-phones) ["bother" "me"]) [(syllabify-phrase-with-stress "on poverty") (syllabify-phrase-with-stress "can bother me")] @@ -501,7 +506,7 @@ (swap! context assoc - :flex-rhyme-trie' + :flex-rhyme-trie3' (transduce (comp (map (fn [[k v]] @@ -514,7 +519,7 @@ (fn [trie [k v]] (update trie k (fnil conj [v]) v))) (trie/make-trie) - (tpt/children-at-depth (@context :trie) 0 2)))) + (trie/children-at-depth (@context :trie) 0 3)))) nil) ) @@ -526,13 +531,14 @@ (take 500)) (trie/children (trie/lookup (@context :flex-rhyme-trie') - (reverse (rest (phrase->flex-rhyme-phones "technology"))))) + (reverse (rest (phrase->flex-rhyme-phones "i love you"))))) (trie/lookup (@context :flex-rhyme-trie') '("IY" "AH" "AA")) + (map (@context :database) '()) (take 5 (@context :flex-rhyme-trie')) - (map #(get (@context :database) %) [6177 13036]) + (map #(get (@context :database) %) [21 8953]) (map #(get (@context :database) %) [410 48670]) (get (@context :trie) [1 2 2]) @@ -545,8 +551,36 @@ ) -(comment +(defn flex-rhymes->phrases [flex-rhymes database] + (->> flex-rhymes + (map second) + (map + (fn [rhymes] + (reduce + (fn [acc [k [v fr]]] + (update acc k (fnil #(+ % fr) 0))) + {} + rhymes))) + (map (partial sort-by (comp - second))) + (map + (fn [rhymes] + (map + (fn [[k fr]] + [(map database k) fr]) + rhymes))))) +(comment + (->> (trie/lookup + (@context :flex-rhyme-trie3') + (reverse (phrase->flex-rhyme-phones "taylor my dear"))) + (#(flex-rhymes->phrases % (@context :database))) + (apply concat) + (sort-by (comp - second)) + (remove + (fn [[k fr]] + (or (= 1 (count k)) + (= "" (first k)) + (= "" (second k)))))) (filter dict/english? @@ -557,7 +591,9 @@ (@context :flex-rhyme-trie) '("IY" "AH" "AA")))))) - (take 5 (drop 500 (@context :flex-rhyme-trie))) + (->> (take 5 (drop 500 (@context :flex-rhyme-trie'))) + (#(flex-rhymes->phrases % (@context :database)))) + (let [key (reverse (phrase->flex-rhyme-phones "technology"))] [key (reverse (phrase->flex-rhyme-phones "sociology")) @@ -728,11 +764,6 @@ - - - - - @@ -797,7 +828,7 @@ (map #(vector % (reverse (word->phones %)))) (map reverse) (map (fn [[phones v]] - [(map #(if (phonetics/vowel + [(map #(if (owoga.phonetics/vowel (string/replace % #"\d" "")) % "?") diff --git a/src/com/owoga/prhyme/data/phonetics.clj b/src/com/owoga/prhyme/data/phonetics.clj index c01773e..d8f7072 100644 --- a/src/com/owoga/prhyme/data/phonetics.clj +++ b/src/com/owoga/prhyme/data/phonetics.clj @@ -3,16 +3,48 @@ [clojure.set] [clojure.java.io :as io])) +;; From http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b.phones + (def phonemap - (->> (io/reader (io/resource "cmudict-0.7b.phones")) - (line-seq) - (map #(string/split % #"\t")) - (into {}))) - -(comment - (take 5 (seq phonemap)) - ;; => (["T" "stop"] ["CH" "affricate"] ["K" "stop"] ["HH" "aspirate"] ["UH" "vowel"]) - ) + {"T" "stop", + "CH" "affricate", + "K" "stop", + "HH" "aspirate", + "UH" "vowel", + "AY" "vowel", + "AH" "vowel", + "OW" "vowel", + "L" "liquid", + "JH" "affricate", + "UW" "vowel", + "G" "stop", + "EH" "vowel", + "M" "nasal", + "OY" "vowel", + "S" "fricative", + "Y" "semivowel", + "EY" "vowel", + "Z" "fricative", + "R" "liquid", + "F" "fricative", + "AW" "vowel", + "IY" "vowel", + "B" "stop", + "SH" "fricative", + "P" "stop", + "V" "fricative", + "TH" "fricative", + "IH" "vowel", + "AA" "vowel", + "AO" "vowel", + "N" "nasal", + "DH" "fricative", + "W" "semivowel", + "ZH" "fricative", + "NG" "nasal", + "D" "stop", + "ER" "vowel", + "AE" "vowel"}) (def long-vowel #{"EY" "IY" "AY" "OW" "UW"}) diff --git a/src/com/owoga/prhyme/syllabify.clj b/src/com/owoga/prhyme/syllabify.clj index cafd485..8958e80 100644 --- a/src/com/owoga/prhyme/syllabify.clj +++ b/src/com/owoga/prhyme/syllabify.clj @@ -2,7 +2,10 @@ (:require [com.owoga.prhyme.data.phonetics :as phonetics] [com.owoga.prhyme.util :as util] [clojure.string :as string])) + (set! *warn-on-reflection* true) + +;; TODO: ;; ER is not yet handled properly. ;; PARENTHESES is syllabified as ("P" "ER" "IH" "N") ("TH" "UH") ("S" "IY" "S") ;; Glides are also broken. "R OY AH L" gets syllabified as a single syllable.