Working reverse rhyme lookup efficient

main
Eric Ihli 4 years ago
parent 2e8c108299
commit baf290650f

2
.gitattributes vendored

@ -13,3 +13,5 @@ resources/structure-freqs filter=lfs diff=lfs merge=lfs -text
research filter=lfs diff=lfs merge=lfs -text research filter=lfs diff=lfs merge=lfs -text
resources/dark-corpus-tpt.bin filter=lfs diff=lfs merge=lfs -text resources/dark-corpus-tpt.bin filter=lfs diff=lfs merge=lfs -text
resources/database.bin filter=lfs diff=lfs merge=lfs -text resources/database.bin filter=lfs diff=lfs merge=lfs -text
resources/backwards-database.bin filter=lfs diff=lfs merge=lfs -text
resources/dark-corpus-backwards-tpt.bin filter=lfs diff=lfs merge=lfs -text

@ -60,6 +60,23 @@
(mapv #(mapv vec %)) (mapv #(mapv vec %))
(reduce #(into %1 %2) []))) (reduce #(into %1 %2) [])))
(defn text->backwards-ngrams
"Takes text from a file, including newlines.
Pads lines with <s> and </s> for start/end of line.
Pads beginning with n - 1 <s>s"
[text n]
(->> text
util/clean-text
(#(string/split % #"\n+"))
(remove empty?)
(mapv tokenize-line)
(mapv #(pad-tokens % n))
reverse
(mapv reverse)
(mapv #(partition n 1 %))
(mapv #(mapv vec %))
(reduce #(into %1 %2) [])))
(defn n-to-m-grams (defn n-to-m-grams
"Exclusive of m, similar to range." "Exclusive of m, similar to range."
[n m text] [n m text]
@ -72,6 +89,18 @@
(recur (inc i) (recur (inc i)
(into r (text->ngrams text i)))))) (into r (text->ngrams text i))))))
(defn n-to-m-backwards-grams
"Exclusive of m, similar to range."
[n m text]
(loop [i n
r []]
(cond
(= i m)
r
:else
(recur (inc i)
(into r (text->backwards-ngrams text i))))))
(declare ->TrieKey) (declare ->TrieKey)
(deftype TrieKey [key] (deftype TrieKey [key]
@ -188,6 +217,46 @@
(sort-by :count) (sort-by :count)
reverse)) reverse))
(defn rhyme-trie-transducer [xf]
(let [trie (volatile! (trie/make-trie))
database (atom {})
next-id (volatile! 1)]
(fn
([] (xf))
([result]
(reset! trie-database @database)
(xf result))
([result input]
(let [ngrams-ids
(mapv
(fn [ngrams]
(mapv
(fn [ngram]
(let [gram-ids (mapv
(fn [gram]
(let [gram-id (get @database gram @next-id)]
(when (.equals gram-id @next-id)
(swap! database
#(-> %
(assoc gram gram-id)
(assoc gram-id gram)))
(vswap! next-id inc))
gram-id))
ngram)
ngram-id (get database gram-ids @next-id)]
gram-ids))
ngrams))
input)]
(vswap!
trie
(fn [trie ngrams-ids]
(reduce
(fn [trie [ngram-ids _]]
(update trie ngram-ids (fnil #(update % 1 inc) [(peek ngram-ids) 0])))
trie
ngrams-ids))
ngrams-ids))))))
(comment (comment
(time (time
(def trie (def trie
@ -199,7 +268,16 @@
conj conj
(file-seq (io/file "dark-corpus"))))) (file-seq (io/file "dark-corpus")))))
(take 20 trie) (time
(def backwards-trie
(transduce (comp (xf-file-seq 0 1000)
(map slurp)
(map (partial n-to-m-backwards-grams 1 4))
(map (fn [ngrams] (map #(prep-ngram-for-trie %) ngrams)))
stateful-transducer)
conj
(file-seq (io/file "dark-corpus")))))
) )
(defn encode-fn [v] (defn encode-fn [v]
@ -225,6 +303,13 @@
encode-fn encode-fn
(decode-fn @trie-database)))) (decode-fn @trie-database))))
(time
(def tightly-packed-backwards-trie
(tpt/tightly-packed-trie
backwards-trie
encode-fn
(decode-fn @trie-database))))
) )
(defn key-get-in-tpt [tpt db ks] (defn key-get-in-tpt [tpt db ks]
@ -240,7 +325,97 @@
(defn word->phones [word]
(or (dict/word->cmu-phones word)
(util/get-phones-with-stress word)))
(defn perfect-rhymes [rhyme-trie phones]
(let [rhyme-suffix (first
(util/take-through
#(= (last %) \1)
(reverse phones)))]
(trie/lookup rhyme-trie rhyme-suffix)))
(defn n+1grams [trie k]
(->> (trie/lookup trie k)
(trie/children)
(map #(get % []))))
(comment
;; Bigrams of rhyme
(->> (perfect-rhymes rhyme-trie (or (dict/cmu-with-stress-map "pleasing")
(util/get-phones-with-stress "pleasing")))
(map (comp first second))
(remove nil?)
(map @trie-database)
(map #(vector [%] (n+1grams
tightly-packed-backwards-trie
[%])))
(map (fn [[w1 w2s]]
(mapv #(into w1 [(nth % 0)]) w2s)))
(reduce into [])
(map (fn [k]
(let [children (->> (n+1grams tightly-packed-backwards-trie k)
(mapv first))]
(mapv #(into k [%]) children))))
(reduce into [])
(map #(map @trie-database %)))
)
(comment (comment
(do
(time
(def backwards-trie
(transduce (comp (xf-file-seq 0 250000)
(map slurp)
(map (partial n-to-m-backwards-grams 1 4))
(map (fn [ngrams] (map #(prep-ngram-for-trie %) ngrams)))
stateful-transducer)
conj
(file-seq (io/file "dark-corpus")))))
(time
(def tightly-packed-backwards-trie
(tpt/tightly-packed-trie
backwards-trie
encode-fn
(decode-fn @trie-database))))
(tpt/save-tightly-packed-trie-to-file
"resources/dark-corpus-backwards-tpt.bin"
tightly-packed-backwards-trie)
(with-open [wtr (clojure.java.io/writer "resources/backwards-database.bin")]
(let [lines (->> (seq @trie-database)
(map pr-str)
(map #(str % "\n")))]
(doseq [line lines]
(.write wtr line))))
(def loaded-backwards-trie
(tpt/load-tightly-packed-trie-from-file
"resources/dark-corpus-backwards-tpt.bin"
(decode-fn @trie-database)))
(def loaded-database
(atom (with-open [rdr (clojure.java.io/reader "resources/backwards-database.bin")]
(into {} (map read-string (line-seq rdr))))))
(->> (take 20 loaded-backwards-trie)
(map first)
(map (partial map @loaded-database)))
(def rhyme-database (atom {}))
(def rhyme-trie
(transduce
(comp
(map first)
(filter string?)
(map #(vector % (reverse (word->phones %))))
(map reverse))
(completing
(fn [trie [k v]]
(update trie k (fnil #(update % 1 inc) [v 0]))))
(trie/make-trie)
@loaded-database))
(trie/lookup rhyme-trie '("IY0" "JH"))
)
(tpt/save-tightly-packed-trie-to-file "dark-corpus-tpt.bin" tightly-packed-trie) (tpt/save-tightly-packed-trie-to-file "dark-corpus-tpt.bin" tightly-packed-trie)
(def loaded-tightly-packed-trie (tpt/load-tightly-packed-trie-from-file (def loaded-tightly-packed-trie (tpt/load-tightly-packed-trie-from-file
@ -292,6 +467,11 @@
(atom (with-open [rdr (clojure.java.io/reader "database.bin")] (atom (with-open [rdr (clojure.java.io/reader "database.bin")]
(into {} (map read-string (line-seq rdr)))))) (into {} (map read-string (line-seq rdr))))))
(->> loaded-tightly-packed-trie
(take 20)
(map first)
(map (partial map @trie-database)))
(profile (profile
{} {}
(def example-story (def example-story

BIN
resources/backwards-database.bin (Stored with Git LFS)

Binary file not shown.

BIN
resources/dark-corpus-backwards-tpt.bin (Stored with Git LFS)

Binary file not shown.

@ -1,6 +0,0 @@
(ns com.owoga.prhyme.corpus.db
(:require [integrant.core :as ig]))
(defn tokens->db
[tokens]
)

@ -12,6 +12,18 @@
(drop-while #(= \; (first %))) (drop-while #(= \; (first %)))
(map #(string/split % #"\s+")))) (map #(string/split % #"\s+"))))
(def cmu-with-stress-map
(->> cmu-with-stress
(map (partial split-at 1))
(map #(vector
(string/lower-case
(first (first %)))
(second %)))
(into {})))
(defn word->cmu-phones [word]
(get cmu-with-stress-map word))
(def cmu-dict (def cmu-dict
(->> (io/reader (io/resource "cmudict_SPHINX_40")) (->> (io/reader (io/resource "cmudict_SPHINX_40"))
(line-seq) (line-seq)
@ -27,6 +39,15 @@
:else (recur (rest words) :else (recur (rest words)
(update accum key (fnil conj []) (rest word))))))) (update accum key (fnil conj []) (rest word)))))))
(comment
(take 5 (seq spelling->phonemes))
;; => (["daphnis" [("D" "AE" "F" "N" "AH" "S")]]
;; ["dammam" [("D" "AE" "M" "AH" "M")]]
;; ["kirschenmann" [("K" "ER" "SH" "AH" "N" "M" "AH" "N")]]
;; ["baumgart" [("B" "AW" "M" "G" "AA" "R" "T") ("B" "AA" "M" "G" "AA" "R" "T")]]
;; ["probasco" [("P" "R" "OW" "B" "AA" "S" "K" "OW")]])
)
(def phrase->Word (def phrase->Word
(into (into
{} {}
@ -36,6 +57,15 @@
phonemes]) phonemes])
cmu-dict))) cmu-dict)))
(comment
(take 5 (seq phrase->Word))
;; => (["daphnis" ("D" "AE" "F" "N" "AH" "S")]
;; ["dammam" ("D" "AE" "M" "AH" "M")]
;; ["kirschenmann" ("K" "ER" "SH" "AH" "N" "M" "AH" "N")]
;; ["baumgart" ("B" "AW" "M" "G" "AA" "R" "T")]
;; ["probasco" ("P" "R" "OW" "B" "AA" "S" "K" "OW")])
)
(def prhyme-dict (def prhyme-dict
(into [] (map prhyme/cmu->prhyme cmu-dict))) (into [] (map prhyme/cmu->prhyme cmu-dict)))

@ -9,6 +9,11 @@
(map #(string/split % #"\t")) (map #(string/split % #"\t"))
(into {}))) (into {})))
(comment
(take 5 (seq phonemap))
;; => (["T" "stop"] ["CH" "affricate"] ["K" "stop"] ["HH" "aspirate"] ["UH" "vowel"])
)
(def long-vowel #{"EY" "IY" "AY" "OW" "UW"}) (def long-vowel #{"EY" "IY" "AY" "OW" "UW"})
(def short-vowel #{"AA" "AE" "AH" "AO" "AW" "EH" "ER" "IH" "OY" "UH"}) (def short-vowel #{"AA" "AE" "AH" "AO" "AW" "EH" "ER" "IH" "OY" "UH"})

@ -1,295 +1,7 @@
(ns com.owoga.prhyme.data.tpt (ns com.owoga.prhyme.data.tpt
(:require [clojure.string :as string] (:require [com.owoga.tightly-packed-trie :as tpt]
[com.owoga.trie :as trie]
[clojure.string :as string]
[clojure.java.io :as io]) [clojure.java.io :as io])
(:import (java.nio ByteBuffer) (:import (java.nio ByteBuffer)
(java.lang.reflect Array))) (java.lang.reflect Array)))
(defn ubyte [b]
(bit-and 0xff b))
(defn sbyte [b]
(bit-or -0x100 b))
(defn nth-bit [n b]
(let [mask (bit-shift-left 1 n)]
(bit-and
1
(bit-shift-right
(bit-and mask b)
n))))
(reduce (fn [a _] (bit-or 1 (bit-shift-left a 1))) 0 (range 2))
(defn ones-mask [n]
(reduce (fn [a _] (bit-or 1 (bit-shift-left a 1))) 0 (range n)))
(defn bit-slice
"Start is least-significant bit.
(bit-slice 2 6 10101010)
-> ,1010,
"
[start end b]
(let [mask (bit-shift-left (ones-mask (- end start)) start)]
(bit-shift-right (bit-and b mask) start)))
(defn as-binary-string [b]
(string/replace
(format "%8s" (Integer/toBinaryString b))
#" "
"0"))
(defn bits [binary-string]
(Integer/parseInt binary-string 2))
(defn bit-on [i b]
(bit-or b (int (Math/pow 2 i))))
;; I've got a dictionary of ~120,000 words.
;; I need to reference them in a compact way.
;; It will be a little bit of a waste, but we can use a 32 bit index.
(/ (Math/log 1.2e5) (Math/log 2))
(defn vb-encode [n]
(loop [n n
b '()]
(cond
(< n 128) (let [ba (byte-array (cons n b))
i (dec (count ba))]
(aset-byte ba i (sbyte (+ 128 (aget ba i))))
ba)
:else (recur (quot n 128) (cons (mod n 128) b)))))
(defn vb-decode-1 [ba]
(loop [n 0
i 0]
(let [b (aget ba i)]
(cond
(> (bit-and b 0x80) 0)
[(bit-or (bit-and 0x7f b)
(bit-shift-left n 7))
(inc i)]
:else
(recur (bit-or (bit-and 0x7f b)
(bit-shift-left n 7))
(inc i))))))
(defn vb-decode
([ba]
(vb-decode ba 0))
([ba i]
(if (>= i (count ba))
(cons (first (vb-decode-1 ba))
nil)
(let [[value byte-count] (vb-decode-1 ba)]
(lazy-seq
(cons
value
(vb-decode (byte-array (drop byte-count ba))
(+ i byte-count))))))))
(comment
(let [n1 0
n2 1
n3 127
n4 128
n5 257
n6 9876543210
baos (java.io.ByteArrayOutputStream.)]
(->> [n1 n2 n3 n4 n5 n6]
(map vb-encode)
(run! #(.writeBytes baos %)))
(let [ba (.toByteArray baos)]
(vb-decode ba)))
;; => ([0 1] [1 1] [127 1] [128 2] [257 2] [9876543210 5])
)
(defn combine-significant-bits [num-significant-bits & bytes]
(reduce
(fn [a b]
(bit-or b (bit-shift-left a num-significant-bits)))
bytes))
(comment
(let [b1 (bits "0110110")
b2 (bits "1001001")
;; remove 2 flag bits
slice (partial bit-slice 0 6)
b1' (slice b1)
b2' (slice b2)]
(map
as-binary-string
[b1
b2
b1'
b2'
(combine-significant-bits 6 b1' b2' )]))
;; => ("00110110" "01001001" "00110110" "00001001" "110110001001")
)
(defn byte-buffer-variable-length-decode
[bb]
(let [combine (fn [n b]
(bit-or (bit-and 0x7f b)
(bit-shift-left n 7)))]
(loop [n 0 i 0]
(let [b (.get bb)]
(if (zero? (bit-and b 0x80))
(recur
(combine n b)
(inc i))
(combine n b))))))
(comment
(let [bb (java.nio.ByteBuffer/wrap (vb-encode 9876543210))]
(println (.limit bb))
(byte-buffer-variable-length-decode bb))
)
(defn encode-with-flag-bits
"Flag is a binary string.
Returns byte array."
[flag n]
(let [flag-len (count flag)
data-len (- 8 flag-len)
max-data-val (int (Math/pow 2 data-len))
flag-val (bit-shift-left (bits flag) data-len)]
(loop [n n r '()]
(if (< n max-data-val)
(byte-array (cons (bit-or flag-val n) r))
(recur
(quot n max-data-val)
(cons
(bit-or flag-val (mod n max-data-val))
r))))))
(comment
(let [ns [0 1 127 128 257 9876543210]
encoded (map
(partial encode-with-flag-bits "101")
ns)
as-binary (->> encoded
(map (partial map as-binary-string))
(map (partial map (partial take-last 8)))
(map (partial map (partial apply str))))]
as-binary)
;; => (("10100000")
;; ("10100001")
;; ("10100011" "10111111")
;; ("10100100" "10100000")
;; ("10101000" "10100001")
;; ("10101001" "10100110" "10101011" "10100000" "10100101" "10110111" "10101010"))
)
(defn decode-with-flag-bits
"Flag is only used for length. It could be refactored to take an int.
Or it could be refactored to take a function that does work based on the flags.
For now though, flag bits must be checked external to this. This is just parity
decoding of the encoding above.
Returns [decoded-number number-of-bytes-read]."
[flag ba]
(let [flag-len (count flag)
data-len (- 8 flag-len)
mask-val (int (dec (Math/pow 2 data-len)))]
(loop [ba ba r 0 i 0]
(if (empty? ba)
[r i]
(let [data-val (bit-and mask-val (first ba))]
(recur
(rest ba)
(bit-or
(bit-shift-left r data-len)
data-val)
(inc i)))))))
(comment
(let [ns [0 1 127 128 257 9876543210]
encoded (map
(partial encode-with-flag-bits "101")
ns)
decoded (->> encoded
(map (partial decode-with-flag-bits "101")))]
decoded)
;; => ([0 1] [1 1] [127 2] [128 2] [257 2] [9876543210 7])
(let [ns [0 1 127 128 257 9876543210]
encoded (map
(partial encode-with-flag-bits "001")
ns)
decoded (->> encoded
(map (partial decode-with-flag-bits "101")))]
decoded)
;; => ([0 1] [1 1] [127 2] [128 2] [257 2] [9876543210 7])
)
(defn encode-byte-with-n-flag-bits [n b]
(let [max-size-with-flag (int (Math/pow 2 (- 8 n)))]
(loop [b b r '()]
(if (< b max-size-with-flag)
(byte-array (cons b r))
(recur (quot b max-size-with-flag)
(cons (mod b max-size-with-flag) r))))))
(defn decode-byte-with-n-flag-bits [n ba]
(let [max-size-with-flag (int (Math/pow 2 (- 8 n)))]
(loop [ba ba r 0]
(if (nil? ba)
r
(recur (rest ba)
(+ r ))))))
(def dictionary ["hi" "my" "name" "is" "what"])
(defn slurp-bytes [x]
(with-open [out (java.io.ByteArrayOutputStream.)]
(io/copy (io/input-stream x) out)
(.toByteArray out)))
(def markov
{"hello" {:freq 5
:children {"world" {:freq 2}
"eric" {:freq 1}}}
"goodbye" {:freq 2
:children {"cruel" {:freq 1
:children {"world" {:freq 1}}}}}
"world" {:freq 3}
"eric" {:freq 1}
"cruel" {:freq 1
:children {"world" {:freq 1}}}})
(def markov
{"world"
{:freq 3
:children {"hello" {:freq 2}
"goodbye" {:freq 2}
"cruel" {:freq 1
:children {"goodbye" {:freq 1}}}}}
"hello"
{:freq 2}
"goodbye"
{:freq 3}
"cruel"
{:freq 1
:children {"goodbye" {:freq 1}}}})
(comment
(with-open [o (io/output-stream "test.bin")]
(.write o (byte-array 8) 0 8))
(let [ba (slurp-bytes "test.bin")
len (count ba)]
)
(with-open [o (io/output-stream "test.bin")]
(.write o (count dictionary))
(run!
(fn [[i w]]
(let [b (vb-encode i)]
(.write o b 0 (count b)))
(let [b (.getBytes w)]
(.write o b 0 (count b))))
(map vector (range) dictionary))
)
)

@ -24,14 +24,18 @@
(comment (comment
(map str (.getPhones cmu-lexicon "two" nil))) (map str (.getPhones cmu-lexicon "two" nil)))
(defn get-phones [word] (defn get-phones
"String must be lowercase."
[word]
(->> (map str (.getPhones cmu-lexicon word nil)) (->> (map str (.getPhones cmu-lexicon word nil))
(map remove-stress) (map remove-stress)
(map convert-to-sphinx) (map convert-to-sphinx)
(map string/upper-case))) (map string/upper-case)))
(defn get-phones-with-stress (defn get-phones-with-stress
".getPhones only " "String must be lowercase.
.getPhones only.
Might be different from stress in cmu-dict"
[word] [word]
(->> (map str (.getPhones cmu-lexicon word nil)) (->> (map str (.getPhones cmu-lexicon word nil))
(map convert-to-sphinx) (map convert-to-sphinx)

Loading…
Cancel
Save