Saving and loading corpus from disk

main
Eric Ihli 4 years ago
parent c7b6ee1a32
commit fc0cbe3b2e

@ -158,9 +158,6 @@
gram-id)) gram-id))
ngram) ngram)
ngram-id (get database gram-ids @next-id)] ngram-id (get database gram-ids @next-id)]
(when (.equals ngram-id @next-id)
(swap! database #(-> % (assoc gram-ids @next-id)))
(vswap! next-id inc))
gram-ids)) gram-ids))
ngrams)) ngrams))
input)] input)]
@ -180,33 +177,6 @@
[ngram] [ngram]
(clojure.lang.MapEntry. (vec ngram) ngram)) (clojure.lang.MapEntry. (vec ngram) ngram))
(defn create-trie-from-texts
  "Folds every n-gram produced from `texts` into a trie keyed by sequences
  of integer ids.

  Each text is passed through `(n-to-m-grams 1 5 text)` (presumably the
  1-grams through 5-grams of the text -- confirm against its definition),
  the results are concatenated, and each n-gram is run through
  `prep-ngram-for-trie`, which must yield `[k v]` pairs. Only `k` is used.

  Returns a vector `[trie next-id db]` where:
    - `trie` maps an id-sequence key to `[id count]`,
    - `next-id` is the next unused integer id (ids start at 1),
    - `db` maps element -> id and id -> element for every element of every
      key, plus id -> id-sequence for every n-gram stored in the trie."
  [texts]
  (letfn [;; Gives `token` an id if it has none and records both directions
          ;; (id -> token, token -> id) in `db`.
          (intern-token [[db next-id] token]
            (let [id (get db token next-id)
                  bumped (if (= id next-id) (inc next-id) next-id)]
              [(-> db
                   (assoc id token)
                   (assoc token id))
               bumped]))
          ;; Adds one prepared n-gram to the accumulator.
          (add-ngram [[trie next-id db] [k v]]
            (let [[db next-id] (reduce intern-token [db next-id] k)
                  ;; The key translated to ids, using the updated `db`.
                  k' (map #(get db %) k)]
              (if-let [[ngram-id seen] (get trie k')]
                ;; Already present: only the count changes.
                [(assoc trie k' [ngram-id (inc seen)]) next-id db]
                ;; New n-gram: give it a fresh id and remember id -> key.
                [(assoc trie k' [next-id 1])
                 (inc next-id)
                 (assoc db next-id k')])))]
    (reduce add-ngram
            [(trie/make-trie) 1 {}]
            (map prep-ngram-for-trie
                 (mapcat #(n-to-m-grams 1 5 %) texts)))))
(defn seq-of-nodes->sorted-by-count (defn seq-of-nodes->sorted-by-count
"Sorted first by the rank of the ngram, lowest ranks first. "Sorted first by the rank of the ngram, lowest ranks first.
Sorted second by the frequency of the ngram, highest frequencies first. Sorted second by the frequency of the ngram, highest frequencies first.
@ -218,9 +188,10 @@
(sort-by :count) (sort-by :count)
reverse)) reverse))
(time (comment
(time
(def trie (def trie
(transduce (comp (xf-file-seq 0 10) (transduce (comp (xf-file-seq 0 250000)
(map slurp) (map slurp)
(map (partial n-to-m-grams 1 4)) (map (partial n-to-m-grams 1 4))
(map (fn [ngrams] (map #(prep-ngram-for-trie %) ngrams))) (map (fn [ngrams] (map #(prep-ngram-for-trie %) ngrams)))
@ -228,12 +199,7 @@
conj conj
(file-seq (io/file "dark-corpus"))))) (file-seq (io/file "dark-corpus")))))
(comment (take 20 trie)
(let [texts (->> (dark-corpus-file-seq 0 5)
(map slurp))
[trie _ db] (create-trie-from-texts texts)]
texts)
) )
(defn encode-fn [v] (defn encode-fn [v]
@ -251,13 +217,16 @@
nil nil
[value (encoding/decode byte-buffer)])))) [value (encoding/decode byte-buffer)]))))
(time (comment
(time
(def tightly-packed-trie (def tightly-packed-trie
(tpt/tightly-packed-trie (tpt/tightly-packed-trie
trie trie
encode-fn encode-fn
(decode-fn @trie-database)))) (decode-fn @trie-database))))
)
(defn key-get-in-tpt [tpt db ks] (defn key-get-in-tpt [tpt db ks]
(let [id (map #(get-in db [(list %) :id]) ks) (let [id (map #(get-in db [(list %) :id]) ks)
v (get tpt id)] v (get tpt id)]
@ -272,40 +241,37 @@
(comment (comment
(->> (trie/lookup tightly-packed-trie [1]) (tpt/save-tightly-packed-trie-to-file "dark-corpus-tpt.bin" tightly-packed-trie)
(trie/children)
(map #(get % []))
(remove #(nil? (first %)))
(math/weighted-selection second))
(->> trie (def loaded-tightly-packed-trie (tpt/load-tightly-packed-trie-from-file
(#(trie/lookup % [1])) "dark-corpus-tpt.bin"
(trie/children) (decode-fn @trie-database)))
(map #(get % []))
(remove nil?)
(map first)
(map #(trie-database %))
(map #(map trie-database %)))
(->> tightly-packed-trie [(first tightly-packed-trie)
(#(trie/lookup % [1])) (first loaded-tightly-packed-trie)]
(trie/children)
(map #(get % []))
(remove nil?)
(math/weighted-selection second)
first)
(->> trie (take-last 10 (.array (.byte-buffer loaded-tightly-packed-trie)))
(#(trie/lookup % [1])) ;; => (-127 -124 -42 -23 28 -127 -124 -41 -90 9)
(trie/children) ;; => (0 0 0 0 0 37 0 6 -124 -56 -128 -121 1 -17 -128 -118 -117 -128 -115 2)
(map #(get % []))
(remove nil?)
(math/weighted-selection second)
first)
(take 20 (seq @trie-database)) (take-last 10 (.array (.byte-buffer tightly-packed-trie)))
(take 20 trie) ;; => (-127 -124 -42 -23 28 -127 -124 -41 -90 9)
(take 20 tightly-packed-trie) ;; => (0 0 0 0 0 37 0 6 -124 -56 -128 -121 1 -17 -128 -118 -117 -128 -115 2)
(.byte-buffer loaded-tightly-packed-trie)
;; => #object[java.nio.HeapByteBuffer 0x21b8291a "java.nio.HeapByteBuffer[pos=8 lim=2548630 cap=2548630]"]
(.byte-buffer tightly-packed-trie)
;; => #object[java.nio.HeapByteBuffer 0x7dc15357 "java.nio.HeapByteBuffer[pos=8 lim=2548630 cap=2548630]"]
[(.key loaded-tightly-packed-trie)
(.address loaded-tightly-packed-trie)
(.limit loaded-tightly-packed-trie)]
;; => [0 2424838 2548630]
[(.key tightly-packed-trie)
(.address tightly-packed-trie)
(.limit tightly-packed-trie)]
;; => [0 2424838 2548630]
(->> (trie/lookup tightly-packed-trie [1]) (->> (trie/lookup tightly-packed-trie [1])
(trie/children) (trie/children)
@ -315,13 +281,23 @@
first first
(@trie-database)) (@trie-database))
;; Dump the current contents of the `trie-database` atom to "database.bin",
;; one `pr-str`-printed map entry per line.
(with-open [out (clojure.java.io/writer "database.bin")]
  (doseq [entry (seq @trie-database)]
    (.write out (str (pr-str entry) "\n"))))
;; Rebuild `trie-database` from "database.bin": every line is read back as
;; one form and all forms are poured into a map, which is wrapped in an atom.
;; Each line must therefore read as a map entry / two-element vector.
;; NOTE(review): clojure.core/read-string can evaluate code embedded in its
;; input unless *read-eval* is disabled; consider clojure.edn/read-string if
;; this file could ever come from an untrusted source.
(def trie-database
  (with-open [in (clojure.java.io/reader "database.bin")]
    (->> (line-seq in)
         (into {} (map read-string))
         atom)))
(profile (profile
{} {}
(def example-story (def example-story
(loop [generated-text [(get @trie-database "<s>")] (loop [generated-text [(get @trie-database "<s>")]
i 0] i 0]
(println generated-text) (if (> i 20)
(if (> i 100)
generated-text generated-text
(let [children (loop [i 4] (let [children (loop [i 4]
(let [node (p :lookup (let [node (p :lookup
@ -362,32 +338,7 @@
(id-get-in-tpt (id-get-in-tpt
tightly-packed-trie tightly-packed-trie
trie-database trie-database
'(2 2 3)) '(2 2 3)))
;; => {("<s>" "<s>" "the") {:value ("<s>" "<s>" "the"), :count 462}} ;; => {("<s>" "<s>" "the") {:value ("<s>" "<s>" "the"), :count 462}}
)
(comment
  ;; REPL scratch: none of these forms are evaluated when the file loads.
  ;; NOTE(review): create-trie-from-texts, as defined in this file, returns a
  ;; [trie next-id db] vector; the forms below bind the whole result to one
  ;; local -- confirm that is what transform-trie->ids / tpt/as-map expect.

  ;; Map view of the id-transformed trie built from a slice of the corpus.
  (let [corpus (map slurp (dark-corpus-file-seq 500 2))
        built (create-trie-from-texts corpus)]
    (tpt/as-map (transform-trie->ids built)))

  ;; Pack the id-transformed trie and look up a single key.
  (let [corpus (map slurp (dark-corpus-file-seq 500 2))
        built (create-trie-from-texts corpus)
        packed (tpt/tightly-packed-trie (transform-trie->ids built))]
    (get packed '(2 2 3)))

  ;; Map view of the untransformed result.
  (let [corpus (map slurp (dark-corpus-file-seq 500 2))
        built (create-trie-from-texts corpus)]
    (tpt/as-map built))

  ;; Clean one corpus file and split it on runs of newlines.
  (let [raw (slurp (first (dark-corpus-file-seq 500 1)))]
    (string/split (util/clean-text raw) #"\n+"))
  )

Loading…
Cancel
Save