diff --git a/deps.edn b/deps.edn index 31cde87..66e50ad 100644 --- a/deps.edn +++ b/deps.edn @@ -1,6 +1,7 @@ {:paths ["src" "resources"] :jvm-opts ["-Xmx6g"] :deps {org.clojure/clojure {:mvn/version "1.10.0"} + org.clojure/data.int-map {:mvn/version "1.0.0"} org.clojure/math.combinatorics {:mvn/version "0.1.6"} org.clojure/data.priority-map {:mvn/version "1.0.0"} org.clojure/core.async {:mvn/version "1.2.603"} @@ -19,6 +20,7 @@ org.clojure/data.fressian {:mvn/version "1.0.0"} com.taoensso/nippy {:mvn/version "3.0.0"} com.taoensso/timbre {:mvn/version "4.10.0"} - com.owoga/tightly-packed-trie {:mvn/version "0.2.1"}} + com.owoga/tightly-packed-trie + {:local/root "/home/eihli/code/tightly-packed-trie"}} :aliases {:dev {:extra-paths ["test" "examples" "dev"] :extra-deps {}}}} diff --git a/dev/examples/tpt.clj b/dev/examples/tpt.clj index db90981..e9584ab 100644 --- a/dev/examples/tpt.clj +++ b/dev/examples/tpt.clj @@ -13,28 +13,17 @@ (tufte/add-basic-println-handler! {}) +(defn xf-file-seq [start end] + (comp (remove #(.isDirectory %)) + (drop start) + (take end))) + (defn dark-corpus-file-seq [start end] - (let [documents (->> "dark-corpus" - io/file - file-seq - (remove #(.isDirectory %)) - (drop start) - (take end))] - documents)) - -(defn remove-sentences-with-words-not-in-dictionary - "This gets rid of a lot of good words. All contractions for example... I'll, They'll... - possessives like morning's... - - Might not end up using it." - [dictionary] - (let [dictionary (into #{} dictionary)] - (fn [sentences] - (->> sentences - (map #(string/split % #" ")) - (remove #(some (complement dictionary) %)) - (remove #(some string/blank? %)) - (map #(string/join " " %)))))) + (let [xf (comp (remove #(.isDirectory %)) + (drop start) + (take end)) + documents (file-seq (io/file "dark-corpus"))] + (transduce xf conj documents))) (def re-word "Regex for tokenizing a string into words @@ -46,59 +35,144 @@ "Pads the beginning with n - 1 tokens and the end with 1 token." [tokens n] - (concat (repeat (max 1 (dec n)) "") tokens [""])) + (vec (concat (vec (repeat (max 1 (dec n)) "")) tokens [""]))) (defn tokenize-line [line] (->> line (string/trim) (re-seq re-word) - (map second) - (map string/lower-case))) - -(defn process-files-for-trie - "Expects an entire song, lines seperated by \n." - [files] - (->> files - (map slurp) - (filter dict/english?) - (map util/clean-text) - (map #(string/split % #"\n+")) - (map (remove-sentences-with-words-not-in-dictionary dict/popular)) - (remove empty?) - (remove #(some empty? %)) - (map (fn [lines] - (map tokenize-line lines))) - (map (fn [lines] - (map #(pad-tokens % 1) lines))) - (map (fn [lines] - (map #(partition 2 1 %) lines))))) + (mapv second) + (mapv string/lower-case))) (defn text->ngrams "Takes text from a file, including newlines. Pads lines with and for start/end of line. Pads beginning with n - 1 s" [text n] - (let [words-not-in-dict-filter (remove-sentences-with-words-not-in-dictionary dict/popular)] - (->> text - util/clean-text - (#(string/split % #"\n+")) - (remove empty?) - (map tokenize-line) - (map #(pad-tokens % n)) - (map #(partition n 1 %)) - (apply concat)))) + (->> text + util/clean-text + (#(string/split % #"\n+")) + (remove empty?) + (mapv tokenize-line) + (mapv #(pad-tokens % n)) + (mapv #(partition n 1 %)) + (mapv #(mapv vec %)) + (reduce #(into %1 %2) []))) (defn n-to-m-grams "Exclusive of m, similar to range." [n m text] (loop [i n - r '()] + r []] (cond (= i m) - (apply concat r) + r :else - (recur (inc i) (cons (text->ngrams text i) r))))) + (recur (inc i) + (into r (text->ngrams text i)))))) + +(declare ->TrieKey) + +(deftype TrieKey [key] + clojure.lang.IPersistentStack + (peek [self] + (let [x (last (seq self))] + (if (.equals "" x) + nil + (Integer/parseInt x)))) + (pop [self] + (TrieKey. (string/replace key #"(.*):.*$" "$1"))) + + clojure.lang.ISeq + (first [self] + (let [x (first (seq self))] + (if (.equals x "") + nil + (Integer/parseInt x)))) + (next [self] + (TrieKey. (string/replace key #".*?:(.*)" "$1"))) + (more [self] + (let [xs (string/split key #":")] + (if (.equals xs "") '() (into (->TrieKey "") (rest xs))))) + (cons [self o] + (TrieKey. + (cond + (.equals key "") ":" + (.equals key ":") (str key o) + :else (str key ":" o)))) + + clojure.lang.IPersistentCollection + (count [self] + (count (seq self))) + (empty [self] + (TrieKey. "")) + (equiv [self o] + (.equals self o)) + + clojure.lang.Seqable + (seq [self] + (if (.equals "" key) + nil + (seq (string/split key #":"))))) + +(defmethod print-method TrieKey [trie-key ^java.io.Writer w] + (print-method (.key trie-key) w)) + +(defmethod print-dup TrieKey [trie-key ^java.io.Writer w] + (print-ctor trie-key (fn [o w] (print-dup (.key trie-key) w)) w)) + +(defn trie-key + ([] + (->TrieKey "")) + ([coll] + (->TrieKey (string/join ":" coll)))) + + +(def trie-database (atom nil)) + +(defn stateful-transducer [xf] + (let [trie (volatile! (trie/make-trie)) + database (atom {}) + next-id (volatile! 1)] + (fn + ([] (xf)) + ([result] + (reset! trie-database @database) + (xf result)) + ([result input] + (let [ngrams-ids + (mapv + (fn [ngrams] + (mapv + (fn [ngram] + (let [gram-ids (mapv + (fn [gram] + (let [gram-id (get @database gram @next-id)] + (when (.equals gram-id @next-id) + (swap! database + #(-> % + (assoc gram gram-id) + (assoc gram-id gram))) + (vswap! next-id inc)) + gram-id)) + ngram) + ngram-id (get database gram-ids @next-id)] + (when (.equals ngram-id @next-id) + (swap! database #(-> % (assoc gram-ids @next-id))) + (vswap! next-id inc)) + gram-ids)) + ngrams)) + input)] + (vswap! + trie + (fn [trie ngrams-ids] + (reduce + (fn [trie [ngram-ids _]] + (update trie ngram-ids (fnil #(update % 1 inc) [(peek ngram-ids) 0]))) + trie + ngrams-ids)) + ngrams-ids)))))) (defn prep-ngram-for-trie "The tpt/trie expects values conjed into an ngram @@ -114,21 +188,23 @@ (reduce (fn [[trie i db] [k v]] (let [[db i] (reduce - (fn [[db i] k] - (let [k' (get db k i) - i' (if (= i k') (inc i) i) - db' (-> db - (assoc k' k) - (assoc k k'))] - [db' i'])) - [db i] - k)] - (let [k' (map #(get db %) k)] - (if-let [existing (get trie k')] - (let [[val count] existing - trie (assoc trie k' [val (inc count)])] - [trie i db]) - [(assoc trie k' [i 1]) i db])))) + (fn [[db i] k] + (let [id (get db k i) + i (if (= id i) (inc i) i) + db (-> db + (assoc id k) + (assoc k id))] + [db i])) + [db i] + k) + k' (map #(get db %) k)] + (if-let [existing (get trie k')] + (let [[val count] existing + trie (assoc trie k' [val (inc count)])] + [trie i db]) + [(assoc trie k' [i 1]) + (inc i) + (assoc db i k')]))) [(trie/make-trie) 1 {}]))) (defn seq-of-nodes->sorted-by-count @@ -143,11 +219,22 @@ reverse)) (time - (let [texts (->> (dark-corpus-file-seq 0 250000) - (map slurp)) - [trie _ db] (create-trie-from-texts texts)] - (def trie trie) - (def trie-database db))) + (def trie + (transduce (comp (xf-file-seq 0 10) + (map slurp) + (map (partial n-to-m-grams 1 4)) + (map (fn [ngrams] (map #(prep-ngram-for-trie %) ngrams))) + stateful-transducer) + conj + (file-seq (io/file "dark-corpus"))))) + +(comment + (let [texts (->> (dark-corpus-file-seq 0 5) + (map slurp)) + [trie _ db] (create-trie-from-texts texts)] + texts) + + ) (defn encode-fn [v] (let [[value count] (if (seqable? v) v [nil nil])] @@ -157,20 +244,19 @@ (concat (encoding/encode value) (encoding/encode count)))))) -(defn decode-fn [byte-buffer] - (let [value (encoding/decode byte-buffer)] - (if (zero? value) - [nil nil] - [value (encoding/decode byte-buffer)]))) +(defn decode-fn [db] + (fn [byte-buffer] + (let [value (encoding/decode byte-buffer)] + (if (zero? value) + nil + [value (encoding/decode byte-buffer)])))) (time (def tightly-packed-trie (tpt/tightly-packed-trie trie encode-fn - decode-fn))) - -(take 20 tightly-packed-trie) + (decode-fn @trie-database)))) (defn key-get-in-tpt [tpt db ks] (let [id (map #(get-in db [(list %) :id]) ks) @@ -183,41 +269,85 @@ id (get-in db [ks :id])] {ks (assoc v :value (get db id))})) -(time (count (map #(get % []) (trie/children (trie/lookup tightly-packed-trie [1]))))) -(time (count (trie/children (trie/lookup tightly-packed-trie [1])))) + (comment - (profile {} + (->> (trie/lookup tightly-packed-trie [1]) + (trie/children) + (map #(get % [])) + (remove #(nil? (first %))) + (math/weighted-selection second)) + + (->> trie + (#(trie/lookup % [1])) + (trie/children) + (map #(get % [])) + (remove nil?) + (map first) + (map #(trie-database %)) + (map #(map trie-database %))) + + (->> tightly-packed-trie + (#(trie/lookup % [1])) + (trie/children) + (map #(get % [])) + (remove nil?) + (math/weighted-selection second) + first) + + (->> trie + (#(trie/lookup % [1])) + (trie/children) + (map #(get % [])) + (remove nil?) + (math/weighted-selection second) + first) + + (take 20 (seq @trie-database)) + (take 20 trie) + (take 20 tightly-packed-trie) + + (->> (trie/lookup tightly-packed-trie [1]) + (trie/children) + (map #(get % [])) + (remove nil?) + (math/weighted-selection #(nth % 1)) + first + (@trie-database)) + + (profile + {} (def example-story - (loop [generated-text [(get trie-database "")] - i 0] - (if (> i 10) + (loop [generated-text [(get @trie-database "")] + i 0] + (println generated-text) + (if (> i 100) generated-text (let [children (loop [i 4] (let [node (p :lookup (trie/lookup tightly-packed-trie (vec (take-last i generated-text)))) - children (p :seq-children (doall (seq (and node (trie/children node)))))] + children (p :seq-children (and node (trie/children node)))] (cond - (nil? node) (recur (dec i)) - (< i 0) (throw (Exception. "Error")) - children children - :else (recur (dec i)))))] + (nil? node) (recur (dec i)) + (< i 0) (throw (Exception. "Error")) + (seq children) children + :else (recur (dec i)))))] (recur (conj generated-text (->> children (map #(get % [])) (remove nil?) - (#(p :weighted-selection (math/weighted-selection :count %))) - :value - (get trie-database))) + (#(p :weighted-selection (math/weighted-selection + (fn [[_ c]] c) + %))) + first)) (inc i))))))) (->> example-story - (map #(get-in trie-database [% :value])) - (concat) + (map (fn [v] (get-in @trie-database [v]))) (string/join " ") (#(string/replace % #" ([\.,\?])" "$1")) ((fn [txt]