From d84b2a0204c26e6e8c8a143cf6a19665b55d739f Mon Sep 17 00:00:00 2001
From: Eric Ihli
Date: Sun, 20 Jun 2021 10:39:15 -0500
Subject: [PATCH] Start re-using code from data_transform.clj

---
 src/com/owoga/corpus/markov.clj         | 86 +++++++++++++++++++++----
 src/com/owoga/prhyme/data_transform.clj | 45 +++++++++++--
 2 files changed, 113 insertions(+), 18 deletions(-)

diff --git a/src/com/owoga/corpus/markov.clj b/src/com/owoga/corpus/markov.clj
index 6333a21..0611e74 100644
--- a/src/com/owoga/corpus/markov.clj
+++ b/src/com/owoga/corpus/markov.clj
@@ -70,12 +70,19 @@
        (drop start)
        (take end)))
 
+;;;; Efficient Tries with Database
+;; To make a more memory-efficient trie, and
+;; to more easily support the conversion of a trie
+;; to a tightly packed trie, convert all keys and values
+;; to integers.
+;;
+;; Also, create a database to map integer IDs back to
+;; their string values and string values to integer IDs.
+
 (defn stateful-transducer
-  "Stateful transform that crates a trie.
-  "
-  [xf]
+  "Stateful transform that creates a trie and populates an `atom` database."
+  [database xf]
   (let [trie (volatile! (trie/make-trie))
-        database (atom {})
         next-id (volatile! 1)]
     (fn
       ([] (xf))
@@ -94,8 +101,7 @@
                                        (assoc key key-id)
                                        (assoc key-id key)))
                               (vswap! next-id inc))
-
-                            [(mapv @database lookup) v]))
+                            (mapv @database lookup)))
                         lookup))
                      map-entries-in)]
          (vswap!
@@ -166,20 +172,74 @@
   ;;     ["dog" ""]]
   )
 
+
+(defn text->backwards-ngrams
+  "Takes text from a file, including newlines.
+  Pads lines with <s> and </s> for start/end of line.
+  Pads beginning with n - 1 <s>s"
+  [text n]
+  (->> text
+       util/clean-text
+       (#(string/split % #"\n+"))
+       (remove empty?)
+       (mapv tokenize-line)
+       (mapv #(pad-tokens % n))
+       reverse
+       (mapv reverse)
+       (mapv #(partition n 1 %))
+       (mapv #(mapv vec %))
+       (reduce #(into %1 %2) [])))
+
+(defn n-to-m-backwards-grams
+  "Exclusive of m, similar to range."
+  [n m text]
+  (loop [i n
+         r []]
+    (cond
+      (= i m)
+      r
+      :else
+      (recur (inc i)
+             (into r (text->backwards-ngrams text i))))))
+
 (defn prep-ngram-for-trie
   "The tpt/trie expects values conjed into an ngram
   to be of format '[[k1 k2 k3] value]."
   [ngram]
   (clojure.lang.MapEntry. (vec ngram) ngram))
 
+(defn make-trie-and-database
+  "Takes a file seq, like (file-seq (io/file \"dark-corpus\"))"
+  [file-seq]
+  (let [database (atom {})
+        trie (transduce (comp (xf-file-seq 501 2)
+                              (map slurp)
+                              (map (partial n-to-m-grams 1 4))
+                              (map (fn [ngrams] (mapv #(prep-ngram-for-trie %) ngrams)))
+                              (partial stateful-transducer database))
+                        conj
+                        file-seq)]
+    [trie database]))
+
+(defn make-backwards-trie-and-database
+  [file-seq]
+  (let [database (atom {})
+        trie (transduce (comp (xf-file-seq 0 1000)
+                              (map slurp)
+                              (map (partial n-to-m-backwards-grams 1 4))
+                              (map (fn [ngrams] (mapv #(prep-ngram-for-trie %) ngrams)))
+                              (partial stateful-transducer database))
+                        conj
+                        file-seq)]
+    [trie database]))
+
 (comment
-  (transduce (comp (xf-file-seq 501 2)
-                   (map slurp)
-                   (map (partial n-to-m-grams 1 4))
-                   (map (fn [ngrams] (mapv #(prep-ngram-for-trie %) ngrams)))
-                   stateful-transducer)
-             conj
-             (file-seq (io/file "dark-corpus")))
+
+  (take 20 trie)
+  (take 20 @trie-database)
+  (->> (map #(get % []) (trie/children (trie/lookup trie [1])))
+       (map first)
+       (map @trie-database))
   )
 
 (defn initialize
diff --git a/src/com/owoga/prhyme/data_transform.clj b/src/com/owoga/prhyme/data_transform.clj
index 628f139..b90ef1d 100644
--- a/src/com/owoga/prhyme/data_transform.clj
+++ b/src/com/owoga/prhyme/data_transform.clj
@@ -57,7 +57,9 @@
      (transduce
       xf-untokenize
       conj
-      tokens)]))
+      tokens)])
+
+  )
 
 (def xf-filter-english
   (let [word? (fn [x] (or (#{"." "?" ","} x)
@@ -65,12 +67,26 @@
     (filter (partial every? word?))))
 
 (defn n-to-m-partitions
-  "Exclusive of m, similar to range."
-  [n m partitions]
+  "Returns a concatenated list of n-partitions, n+1-partitions, ..., m-1-partitions of coll.
+  Exclusive of m, similar to range."
+  [n m coll]
   (mapcat
-   #(partition % 1 partitions)
+   (fn [partition-size]
+     (partition partition-size 1 coll))
    (range n m)))
 
+(comment
+  (n-to-m-partitions 1 4 (range 6))
+  ;; => ((0)
+  ;;     (1)
+  ;;     ,,,
+  ;;     (3 4)
+  ;;     (4 5)
+  ;;     ,,,
+  ;;     (2 3 4)
+  ;;     (3 4 5))
+  )
+
 (defn new-key [database k]
   (let [next-id (@database :next-id)]
     (swap!
@@ -83,6 +99,11 @@
 
 (defn make-database-processor
   "Takes an atom and returns a function that takes a Trie key/value.
+
+  Expects `database` to have a `:next-id` key, which should start at 1
+  so that 0 can remain the id for the root node of the trie. That is important
+  for the encode/decode functions.
+
   When the returned function is called, it checks to see
   if the key is in the database and if so it returns the associated id.
   If not, it increments the id (which is stored in the database
@@ -115,7 +136,7 @@
      sentence))))
 
 (comment
-  (let [database (atom {})]
+  (let [database (atom {:next-id 0})]
     (transduce
      (map (partial mapv (part-of-speech-database database)))
      conj
@@ -173,6 +194,20 @@
    (trie/make-trie)
    files))
 
+(comment
+  (def trie
+    (let [database (atom {:next-id 0})
+          files (->> (file-seq (io/file "dark-corpus"))
+                     (remove #(.isDirectory %))
+                     (drop 501)
+                     (take 2))
+          trie (file-seq->trie database files 1 3)]
+      trie))
+
+  (take 20 trie)
+
+  )
+
 (defn trie->tightly-packed-trie
   [trie encode-fn decode-fn]
   (tpt/tightly-packed-trie trie encode-fn decode-fn))