Start re-using code from data_transform.clj

main
Eric Ihli 4 years ago
parent 60e1c4fd73
commit d84b2a0204

@ -70,12 +70,19 @@
(drop start) (drop start)
(take end))) (take end)))
;;;; Efficient Tries with Database
;; To make a more memory-efficient trie, and
;; to more easily support the conversion of a trie
;; to a tightly packed trie, convert all keys and values
;; to integers.
;;
;; Also, create a database to map integer IDs back to
;; their string values and string values to integer IDs.
(defn stateful-transducer (defn stateful-transducer
"Stateful transform that creates a trie. "Stateful transform that creates a trie and populates an `atom` database."
" [database xf]
[xf]
(let [trie (volatile! (trie/make-trie)) (let [trie (volatile! (trie/make-trie))
database (atom {})
next-id (volatile! 1)] next-id (volatile! 1)]
(fn (fn
([] (xf)) ([] (xf))
@ -94,8 +101,7 @@
(assoc key key-id) (assoc key key-id)
(assoc key-id key))) (assoc key-id key)))
(vswap! next-id inc)) (vswap! next-id inc))
(mapv @database lookup)))
[(mapv @database lookup) v]))
lookup)) lookup))
map-entries-in)] map-entries-in)]
(vswap! (vswap!
@ -166,20 +172,74 @@
;; ["dog" "</s>"]] ;; ["dog" "</s>"]]
) )
(defn text->backwards-ngrams
  "Builds reversed n-grams from a block of text (newlines included).

  The text is cleaned with `util/clean-text`, split on runs of newlines,
  and empty lines are dropped. Each remaining line is tokenized and padded
  via `pad-tokens` (<s> / </s> start- and end-of-line markers, with n - 1
  leading <s>s). Lines are then visited last-to-first and each line's
  tokens are reversed before sliding a window of size `n` (step 1) over
  them.

  Returns one flat vector containing every window as a vector. A padded
  line with fewer than `n` tokens contributes nothing."
  [text n]
  (let [lines        (->> (string/split (util/clean-text text) #"\n+")
                          (remove empty?))
        ;; One line of text -> seq of reversed n-gram vectors.
        line->ngrams (fn [line]
                       (->> (pad-tokens (tokenize-line line) n)
                            reverse
                            (partition n 1)
                            (map vec)))]
    ;; Walk the lines in reverse order and splice every line's n-grams
    ;; into a single vector.
    (into [] (mapcat line->ngrams) (reverse lines))))
(defn n-to-m-backwards-grams
  "Returns a vector of the backwards i-grams of `text` for every size i
  in [n, m) -- exclusive of m, similar to `range`.

  The i-grams for each size come from `text->backwards-ngrams` and are
  appended in increasing order of i. Returns an empty vector when
  n >= m."
  [n m text]
  ;; Sizes are driven by `range` instead of counting up until (= i m):
  ;; the equality check never fires when n > m, so the old loop would
  ;; never terminate for such input.
  (into []
        (mapcat #(text->backwards-ngrams text %))
        (range n m)))
(defn prep-ngram-for-trie (defn prep-ngram-for-trie
"The tpt/trie expects values conjed into an ngram "The tpt/trie expects values conjed into an ngram
to be of format '[[k1 k2 k3] value]." to be of format '[[k1 k2 k3] value]."
[ngram] [ngram]
(clojure.lang.MapEntry. (vec ngram) ngram)) (clojure.lang.MapEntry. (vec ngram) ngram))
(comment (defn make-trie-and-database
(transduce (comp (xf-file-seq 501 2) "Takes a file seq, like (file-seq (io/file \"dark-corpus\"))"
[file-seq]
(let [database (atom {})
trie (transduce (comp (xf-file-seq 501 2)
(map slurp) (map slurp)
(map (partial n-to-m-grams 1 4)) (map (partial n-to-m-grams 1 4))
(map (fn [ngrams] (mapv #(prep-ngram-for-trie %) ngrams))) (map (fn [ngrams] (mapv #(prep-ngram-for-trie %) ngrams)))
stateful-transducer) (partial stateful-transducer database))
conj
file-seq)]
[trie database]))
(defn make-backwards-trie-and-database
  "Takes a file seq, like (file-seq (io/file \"dark-corpus\")), and returns
  a pair `[trie database]`.

  Each file that passes through `(xf-file-seq 0 1000)` is slurped, turned
  into backwards 1-, 2- and 3-grams, and each n-gram is wrapped by
  `prep-ngram-for-trie` before being fed to `stateful-transducer`.

  `database` is a fresh atom handed to `stateful-transducer`; `trie` is
  whatever that transduction yields."
  [file-seq]
  (let [database (atom {})
        trie     (transduce (comp (xf-file-seq 0 1000)
                                  (map slurp)
                                  (map (partial n-to-m-backwards-grams 1 4))
                                  (map (fn [ngrams] (mapv prep-ngram-for-trie ngrams)))
                                  (partial stateful-transducer database))
                            conj
                            ;; The parameter shadows clojure.core/file-seq, so it
                            ;; must be used as data. Calling it as a function
                            ;; would throw and would ignore the caller's files.
                            file-seq)]
    [trie database]))
(comment
(take 20 trie)
(take 20 @trie-database)
(->> (map #(get % []) (trie/children (trie/lookup trie [1])))
(map first)
(map @trie-database))
) )
(defn initialize (defn initialize

@ -57,7 +57,9 @@
(transduce (transduce
xf-untokenize xf-untokenize
conj conj
tokens)])) tokens)])
)
(def xf-filter-english (def xf-filter-english
(let [word? (fn [x] (or (#{"." "?" ","} x) (let [word? (fn [x] (or (#{"." "?" ","} x)
@ -65,12 +67,26 @@
(filter (partial every? word?)))) (filter (partial every? word?))))
(defn n-to-m-partitions (defn n-to-m-partitions
"Exclusive of m, similar to range." "Returns a concatenated list of n-partitions, n+1-partitions, ..., m-1-partitions of coll.
[n m partitions] Exclusive of m, similar to range."
[n m coll]
(mapcat (mapcat
#(partition % 1 partitions) (fn [partition-size]
(partition partition-size 1 coll))
(range n m))) (range n m)))
(comment
(n-to-m-partitions 1 4 (range 6))
;; => ((0)
;; (1)
;; ,,,
;; (3 4)
;; (4 5)
;; ,,,
;; (2 3 4)
;; (3 4 5))
)
(defn new-key [database k] (defn new-key [database k]
(let [next-id (@database :next-id)] (let [next-id (@database :next-id)]
(swap! (swap!
@ -83,6 +99,11 @@
(defn make-database-processor (defn make-database-processor
"Takes an atom and returns a function that takes a Trie key/value. "Takes an atom and returns a function that takes a Trie key/value.
Expects `database` to have a `:next-id` key, which should start at 1
so that 0 can remain the id for the root node of the trie. That is important
for the encode/decode functions.
When the returned function is called, it checks to see When the returned function is called, it checks to see
if the key is in the database and if so it returns the associated id. if the key is in the database and if so it returns the associated id.
If not, it increments the id (which is stored in the database If not, it increments the id (which is stored in the database
@ -115,7 +136,7 @@
sentence)))) sentence))))
(comment (comment
(let [database (atom {})] (let [database (atom {:next-id 0})]
(transduce (transduce
(map (partial mapv (part-of-speech-database database))) (map (partial mapv (part-of-speech-database database)))
conj conj
@ -173,6 +194,20 @@
(trie/make-trie) (trie/make-trie)
files)) files))
(comment
;; REPL scratch: build a trie from a two-file slice of the corpus
;; (directories removed, first 501 entries skipped) and peek at it.
;; NOTE(review): the `make-database-processor` docstring says `:next-id`
;; should start at 1 so that 0 stays reserved for the trie root; this seeds
;; it with 0 -- confirm that is intended.
(def trie
(let [database (atom {:next-id 0})
files (->> (file-seq (io/file "dark-corpus"))
(remove #(.isDirectory %))
(drop 501)
(take 2))
;; presumably 1 and 3 are the n-gram size bounds -- verify against file-seq->trie
trie (file-seq->trie database files 1 3)]
trie))
(take 20 trie)
)
(defn trie->tightly-packed-trie (defn trie->tightly-packed-trie
[trie encode-fn decode-fn] [trie encode-fn decode-fn]
(tpt/tightly-packed-trie trie encode-fn decode-fn)) (tpt/tightly-packed-trie trie encode-fn decode-fn))

Loading…
Cancel
Save