Clean up training code, save tpt on train

main
Eric Ihli 3 years ago
parent a00f30fabe
commit f42cdfb59a

@ -229,7 +229,7 @@
(time
(def trie
(transduce (comp (xf-file-seq 0 250000)
(transduce (comp (xf-file-seq 0 5)
(map slurp)
(map (partial n-to-m-grams 1 4))
(map (fn [ngrams] (map #(prep-ngram-for-trie %) ngrams)))
@ -272,6 +272,14 @@
encode-fn
(decode-fn @trie-database))))
(take 5 trie)
;; => ([(1 1 2) [2 1]]
;; [(1 1 3) [3 4]]
;; [(1 1 10) [10 1]]
;; [(1 1 12) [12 1]]
;; [(1 1 14) [14 3]])
(time
(def tightly-packed-backwards-trie
(tpt/tightly-packed-trie

@ -6,6 +6,7 @@
[com.owoga.prhyme.data-transform :as data-transform]
[com.owoga.trie :as trie]
[com.owoga.tightly-packed-trie :as tpt]
[com.owoga.tightly-packed-trie.encoding :as encoding]
[clojure.string :as string]
[clojure.java.io :as io]
[com.owoga.phonetics :as phonetics]
@ -296,13 +297,13 @@
(map (partial transduce data-transform/xf-filter-english conj))
(map (partial remove empty?))
(map (partial map (comp vec reverse)))
;; xf-pad-tokens works on vectors due to `into`
;; xf-pad-tokens needs vectors to properly pad due to `into`
(map (partial into [] (data-transform/xf-pad-tokens (dec m) "</s>" 1 "<s>")))
(map (partial mapcat (partial data-transform/n-to-m-partitions n (inc m))))
(mapcat (partial mapv (data-transform/make-database-processor database))))
(completing
(fn [trie lookup]
(update trie lookup (fnil #(update % 1 inc) [lookup 0]))))
(update trie lookup (fnil #(update % 1 inc) [(peek lookup) 0]))))
(trie/make-trie)
files))
@ -330,35 +331,91 @@
;; [(",") [[19] 14]]
;; [("you") [[63] 11]]
;; [("to") [[15] 7]])]
)
)
;;;; Packing the trie into a small memory footprint
(defn encode-fn
  "Encodes a trie node value `v` for storage in a tightly-packed trie.

  `v` is destructured as a two-element sequence `[id freq]`. When `v` is
  not seqable, or its first element is nil, the result is
  `(encoding/encode 0)`, so 0 acts as the marker for \"no value\".
  Otherwise the result is a byte array holding the encoding of `id`
  followed by the encoding of `freq`."
  [v]
  ;; Destructuring nil binds both locals to nil, which covers the
  ;; non-seqable case without needing an explicit [nil nil] placeholder.
  (let [[id freq] (when (seqable? v) v)]
    (if (some? id)
      (byte-array (concat (encoding/encode id)
                          (encoding/encode freq)))
      (encoding/encode 0))))
(defn decode-fn
  "Returns a function that decodes a trie node value from `byte-buffer`.

  The returned function reads one value with `encoding/decode`. If that
  value is zero it returns nil; otherwise it performs a second
  `encoding/decode` on the same buffer and returns `[first second]`.

  `db` is accepted but is not used by the returned function.
  NOTE(review): presumably `encoding/decode` advances the buffer position
  between the two reads -- confirm against the encoding namespace."
  [db]
  (fn [byte-buffer]
    (let [id (encoding/decode byte-buffer)]
      ;; A leading zero marks \"no value\"; nothing more is read in that case.
      (when-not (zero? id)
        [id (encoding/decode byte-buffer)]))))
(defn save-tightly-packed-trie
  "Builds a tightly-packed trie from `trie` and writes it to `filepath`.

  `database` is dereferenced and its value is handed to `decode-fn` to
  build the decoder for the packed trie; `encode-fn` is used as the
  encoder. Returns whatever `tpt/save-tightly-packed-trie-to-file`
  returns."
  [trie database filepath]
  (->> (tpt/tightly-packed-trie trie encode-fn (decode-fn @database))
       (tpt/save-tightly-packed-trie-to-file filepath)))
(defn load-tightly-packed-trie
  "Loads a tightly-packed trie from `filepath`.

  `database` is dereferenced and its value is handed to `decode-fn` to
  build the decoder passed to `tpt/load-tightly-packed-trie-from-file`.
  Note the argument order is `[filepath database]`, unlike
  `save-tightly-packed-trie`, which takes `filepath` last."
  [filepath database]
  (let [decoder (decode-fn @database)]
    (tpt/load-tightly-packed-trie-from-file filepath decoder)))
;;;; Training
(defn train-backwards
"For building lines backwards so they can be seeded with a target rhyme."
[files n m trie-filepath database-filepath]
(let [database (atom {:next-id 0})
[files n m trie-filepath database-filepath tightly-packed-trie-filepath]
(let [database (atom {:next-id 1})
trie (file-seq->backwards-markov-trie database files n m)]
(nippy/freeze-to-file trie-filepath (seq trie))
(nippy/freeze-to-file database-filepath @database)
(save-tightly-packed-trie trie database tightly-packed-trie-filepath)
(let [loaded-trie (->> trie-filepath
nippy/thaw-from-file
(into (trie/make-trie)))
loaded-db (->> database-filepath
nippy/thaw-from-file)]
(println "Successfully loaded trie and database.")
(println (take 5 loaded-trie))
(println (take 5 loaded-db)))))
nippy/thaw-from-file)
loaded-tightly-packed-trie (tpt/load-tightly-packed-trie-from-file
tightly-packed-trie-filepath
(decode-fn loaded-db))]
(println "Loaded trie:" (take 5 loaded-trie))
(println "Loaded database:" (take 5 loaded-db))
(println "Loaded tightly-packed-trie:" (take 5 loaded-tightly-packed-trie))
(println "Successfully loaded trie and database."))))
(comment
(time
(let [files (->> "dark-corpus"
io/file
file-seq
(eduction (xf-file-seq 0 4000)))
[trie database] (train-backwards files 1 4 "/tmp/trie.bin" "/tmp/database.bin")]))
(eduction (xf-file-seq 0 1000)))
[trie database] (train-backwards files 1 4 "/tmp/trie.bin" "/tmp/database.bin" "/tmp/tpt.bin")]))
(def trie (into (trie/make-trie) (nippy/thaw-from-file "/tmp/trie.bin")))
(take 5 trie)
;; => ([(0 0 0 1) [1 2]]
;; [(0 0 0 3) [3 1]]
;; [(0 0 0 4) [4 1]]
;; [(0 0 0 5) [5 8]]
;; [(0 0 0 10) [10 1]])
(def tight (tpt/tightly-packed-trie trie encode-fn (decode-fn db)))
tight
(def db (nippy/thaw-from-file "/tmp/database.bin"))
(db 4)
)
(defn gen-rhyme-model
[rhyme-type-fn database database-filepath]
(let [words (filter string? (keys @database))
@ -370,11 +427,12 @@
(println (take 5 loaded-trie)))))
(comment
(let [database (atom (nippy/thaw-from-file "/tmp/database.edn"))]
(let [database (atom (nippy/thaw-from-file "/tmp/database.bin"))]
(gen-rhyme-model prhyme/phrase->all-flex-rhyme-tailing-consonants-phones database "/tmp/rhyme-trie.bin"))
(def rt (into (trie/make-trie) (nippy/thaw-from-file "/tmp/rhyme-trie.bin")))
(take 5 rt)
(take 100 rt)
(prhyme/phrase->all-flex-rhyme-tailing-consonants-phones "brasilia")
(phonetics/get-phones "brasilia")

@ -298,7 +298,11 @@
(map #(update % 0 reverse))
(reduce
(fn [trie [phones word]]
(update trie phones conj word))
;; Use a set? If rhyme-type-fn filters out the phones that distinguish
;; a word's different pronunciations, those pronunciations end up with
;; the same phones and the word will be duplicated.
;; Alternatively, place word and pronunciation in value of trie.
(update trie phones (fnil conj #{}) word))
(trie/make-trie))))

Loading…
Cancel
Save