Misc updates to tpt

main
Eric Ihli 4 years ago
parent e8f56ed043
commit 6cf1236708

1
.gitattributes vendored

@ -15,3 +15,4 @@ resources/dark-corpus-tpt.bin filter=lfs diff=lfs merge=lfs -text
resources/database.bin filter=lfs diff=lfs merge=lfs -text resources/database.bin filter=lfs diff=lfs merge=lfs -text
resources/backwards-database.bin filter=lfs diff=lfs merge=lfs -text resources/backwards-database.bin filter=lfs diff=lfs merge=lfs -text
resources/dark-corpus-backwards-tpt.bin filter=lfs diff=lfs merge=lfs -text resources/dark-corpus-backwards-tpt.bin filter=lfs diff=lfs merge=lfs -text
**.bin filter=lfs diff=lfs merge=lfs -text

@ -21,6 +21,7 @@
com.taoensso/nippy {:mvn/version "3.0.0"} com.taoensso/nippy {:mvn/version "3.0.0"}
com.taoensso/timbre {:mvn/version "4.10.0"} com.taoensso/timbre {:mvn/version "4.10.0"}
com.owoga/tightly-packed-trie com.owoga/tightly-packed-trie
{:local/root "/home/eihli/code/tightly-packed-trie"}} {:local/root "/home/eihli/code/tightly-packed-trie"}
com.owoga/phonetics {:mvn/version "0.1.1"}}
:aliases {:dev {:extra-paths ["test" "examples" "dev"] :aliases {:dev {:extra-paths ["test" "examples" "dev"]
:extra-deps {}}}} :extra-deps {}}}}

@ -1,6 +1,8 @@
(ns examples.tpt (ns examples.tpt
(:require [clojure.string :as string] (:require [clojure.string :as string]
[clojure.java.io :as io] [clojure.java.io :as io]
[com.owoga.phonetics :as owoga.phonetics]
[com.owoga.phonetics.syllabify :as owoga.syllabify]
[com.owoga.prhyme.core :as prhyme] [com.owoga.prhyme.core :as prhyme]
[com.owoga.prhyme.nlp.core :as nlp] [com.owoga.prhyme.nlp.core :as nlp]
[taoensso.tufte :as tufte :refer (defnp p profiled profile)] [taoensso.tufte :as tufte :refer (defnp p profiled profile)]
@ -22,13 +24,6 @@
(drop start) (drop start)
(take end))) (take end)))
(defn dark-corpus-file-seq
  "Walks the \"dark-corpus\" directory with `file-seq` and returns a vector
  of the entries that are not directories, after skipping the first
  `start` of them and keeping at most `end` of those that remain.

  NOTE(review): `end` is applied with `take`, so it acts as a count of
  files to keep rather than an end index -- confirm this is intended."
  [start end]
  (let [regular-file? (fn [f] (not (.isDirectory f)))
        window        (comp (filter regular-file?)
                            (drop start)
                            (take end))]
    (->> (io/file "dark-corpus")
         (file-seq)
         (into [] window))))
(def re-word (def re-word
"Regex for tokenizing a string into words "Regex for tokenizing a string into words
(including contractions and hyphenations), (including contractions and hyphenations),
@ -262,6 +257,14 @@
ngrams-ids)))))) ngrams-ids))))))
(comment (comment
(transduce (comp (xf-file-seq 0 10)
(map slurp)
(map (partial n-to-m-grams 1 5))
#_#_(map (fn [ngrams] (map #(prep-ngram-for-trie %) ngrams)))
stateful-transducer)
conj
(file-seq (io/file "dark-corpus")))
(time (time
(def trie (def trie
(transduce (comp (xf-file-seq 0 250000) (transduce (comp (xf-file-seq 0 250000)
@ -384,7 +387,7 @@
(defn syllabify-with-stress [word] (defn syllabify-with-stress [word]
(let [phones (word->phones word) (let [phones (word->phones word)
phones-without-stress (map #(string/replace % #"\d" "") phones) phones-without-stress (map #(string/replace % #"\d" "") phones)
syllables (syllabify/syllabify phones-without-stress)] syllables (first (owoga.syllabify/syllabify phones-without-stress))]
(loop [phones phones (loop [phones phones
syllables syllables syllables syllables
result [[]]] result [[]]]
@ -411,6 +414,8 @@
(comment (comment
(syllabify-phrase-with-stress "bother me") (syllabify-phrase-with-stress "bother me")
(word->phones "bother me")
(map (comp owoga.syllabify/syllabify first owoga.phonetics/get-phones) ["bother" "me"])
[(syllabify-phrase-with-stress "on poverty") [(syllabify-phrase-with-stress "on poverty")
(syllabify-phrase-with-stress "can bother me")] (syllabify-phrase-with-stress "can bother me")]
@ -501,7 +506,7 @@
(swap! (swap!
context context
assoc assoc
:flex-rhyme-trie' :flex-rhyme-trie3'
(transduce (transduce
(comp (comp
(map (fn [[k v]] (map (fn [[k v]]
@ -514,7 +519,7 @@
(fn [trie [k v]] (fn [trie [k v]]
(update trie k (fnil conj [v]) v))) (update trie k (fnil conj [v]) v)))
(trie/make-trie) (trie/make-trie)
(tpt/children-at-depth (@context :trie) 0 2)))) (trie/children-at-depth (@context :trie) 0 3))))
nil) nil)
) )
@ -526,13 +531,14 @@
(take 500)) (take 500))
(trie/children (trie/lookup (@context :flex-rhyme-trie') (trie/children (trie/lookup (@context :flex-rhyme-trie')
(reverse (rest (phrase->flex-rhyme-phones "technology"))))) (reverse (rest (phrase->flex-rhyme-phones "i love you")))))
(trie/lookup (@context :flex-rhyme-trie') '("IY" "AH" "AA")) (trie/lookup (@context :flex-rhyme-trie') '("IY" "AH" "AA"))
(map (@context :database) '()) (map (@context :database) '())
(take 5 (@context :flex-rhyme-trie')) (take 5 (@context :flex-rhyme-trie'))
(map #(get (@context :database) %) [6177 13036]) (map #(get (@context :database) %) [21 8953])
(map #(get (@context :database) %) [410 48670]) (map #(get (@context :database) %) [410 48670])
(get (@context :trie) [1 2 2]) (get (@context :trie) [1 2 2])
@ -545,8 +551,36 @@
) )
(defn flex-rhymes->phrases
  "Turns flex-rhyme trie entries into readable, frequency-ranked phrases.

  `flex-rhymes` is a sequence of pairs; only the second item of each pair
  is used, and it must be a collection of `[k [v fr]]` entries. For each
  such collection:

    1. the `fr` values of entries sharing the same `k` are summed,
    2. the resulting `[k total]` pairs are sorted by descending total,
    3. every element of `k` is looked up by calling `database` on it.

  Returns a lazy sequence (one item per input pair) of sequences of
  `[looked-up-k total]` pairs.

  NOTE(review): presumably `k` is an n-gram of word ids, `fr` a frequency
  and `database` an id->word lookup -- confirm against the callers."
  [flex-rhymes database]
  (letfn [(sum-frequencies [rhymes]
            ;; The first element of the value pair is not needed here.
            (reduce (fn [totals [k [_ fr]]]
                      (update totals k (fnil + 0) fr))
                    {}
                    rhymes))
          (resolve-keys [[k fr]]
            [(map database k) fr])]
    (->> flex-rhymes
         (map second)
         (map sum-frequencies)
         (map (partial sort-by (comp - second)))
         (map (partial map resolve-keys)))))
(comment
(->> (trie/lookup
(@context :flex-rhyme-trie3')
(reverse (phrase->flex-rhyme-phones "taylor my dear")))
(#(flex-rhymes->phrases % (@context :database)))
(apply concat)
(sort-by (comp - second))
(remove
(fn [[k fr]]
(or (= 1 (count k))
(= "</s>" (first k))
(= "<s>" (second k))))))
(filter (filter
dict/english? dict/english?
@ -557,7 +591,9 @@
(@context :flex-rhyme-trie) (@context :flex-rhyme-trie)
'("IY" "AH" "AA")))))) '("IY" "AH" "AA"))))))
(take 5 (drop 500 (@context :flex-rhyme-trie))) (->> (take 5 (drop 500 (@context :flex-rhyme-trie')))
(#(flex-rhymes->phrases % (@context :database))))
(let [key (reverse (phrase->flex-rhyme-phones "technology"))] (let [key (reverse (phrase->flex-rhyme-phones "technology"))]
[key [key
(reverse (phrase->flex-rhyme-phones "sociology")) (reverse (phrase->flex-rhyme-phones "sociology"))
@ -728,11 +764,6 @@
@ -797,7 +828,7 @@
(map #(vector % (reverse (word->phones %)))) (map #(vector % (reverse (word->phones %))))
(map reverse) (map reverse)
(map (fn [[phones v]] (map (fn [[phones v]]
[(map #(if (phonetics/vowel [(map #(if (owoga.phonetics/vowel
(string/replace % #"\d" "")) (string/replace % #"\d" ""))
% %
"?") "?")

@ -3,16 +3,48 @@
[clojure.set] [clojure.set]
[clojure.java.io :as io])) [clojure.java.io :as io]))
;; From http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b.phones
(def phonemap
  "Map from each phoneme symbol to its class name (\"vowel\", \"stop\",
  \"fricative\", ...). The entries are inlined as a literal map instead of
  being read from the \"cmudict-0.7b.phones\" resource at load time."
  {"T" "stop",
   "CH" "affricate",
   "K" "stop",
   "HH" "aspirate",
   "UH" "vowel",
   "AY" "vowel",
   "AH" "vowel",
   "OW" "vowel",
   "L" "liquid",
   "JH" "affricate",
   "UW" "vowel",
   "G" "stop",
   "EH" "vowel",
   "M" "nasal",
   "OY" "vowel",
   "S" "fricative",
   "Y" "semivowel",
   "EY" "vowel",
   "Z" "fricative",
   "R" "liquid",
   "F" "fricative",
   "AW" "vowel",
   "IY" "vowel",
   "B" "stop",
   "SH" "fricative",
   "P" "stop",
   "V" "fricative",
   "TH" "fricative",
   "IH" "vowel",
   "AA" "vowel",
   "AO" "vowel",
   "N" "nasal",
   "DH" "fricative",
   "W" "semivowel",
   "ZH" "fricative",
   "NG" "nasal",
   "D" "stop",
   "ER" "vowel",
   "AE" "vowel"})
;; Set of the phoneme symbols treated as long vowels.
(def long-vowel #{"EY" "IY" "AY" "OW" "UW"})

@ -2,7 +2,10 @@
(:require [com.owoga.prhyme.data.phonetics :as phonetics] (:require [com.owoga.prhyme.data.phonetics :as phonetics]
[com.owoga.prhyme.util :as util] [com.owoga.prhyme.util :as util]
[clojure.string :as string])) [clojure.string :as string]))
(set! *warn-on-reflection* true) (set! *warn-on-reflection* true)
;; TODO:
;; ER is not yet handled properly.
;; PARENTHESES is syllabified as ("P" "ER" "IH" "N") ("TH" "UH") ("S" "IY" "S")
;; Glides are also broken. "R OY AH L" gets syllabified as a single syllable.

Loading…
Cancel
Save