From f66210d7dd37f4aeee0a1fee751032b56b0bca2a Mon Sep 17 00:00:00 2001 From: Eric Ihli Date: Mon, 1 Mar 2021 21:47:18 -0600 Subject: [PATCH] Add tightly-packed-trie example --- deps.edn | 9 +- dev/examples/tpt.clj | 251 ++++++++++++++++++ src/com/owoga/prhyme/core.clj | 3 - .../prhyme/generation/simple_good_turing.clj | 2 +- src/com/owoga/prhyme/rhyme_trie.clj | 20 +- 5 files changed, 259 insertions(+), 26 deletions(-) create mode 100644 dev/examples/tpt.clj diff --git a/deps.edn b/deps.edn index 041cae4..0d2a69c 100644 --- a/deps.edn +++ b/deps.edn @@ -9,13 +9,14 @@ cljol/cljol {:git/url "https://github.com/jafingerhut/cljol" :sha "11d4aa72fdd19248bd7600fb7b5cde7189f32938"} org.xerial/sqlite-jdbc {:mvn/version "3.32.3.2"} - inflections {:mvn/version "0.13.2"} + inflections/inflections {:mvn/version "0.13.2"} com.taoensso/tufte {:mvn/version "2.2.0"} - clojure-opennlp {:mvn/version "0.5.0"} + clojure-opennlp/clojure-opennlp {:mvn/version "0.5.0"} uk.ac.abdn/SimpleNLG {:mvn/version "4.5.0"} net.sf.sociaal/freetts {:mvn/version "1.2.2"} - enlive {:mvn/version "1.1.6"} - integrant {:mvn/version "0.8.0"} + enlive/enlive {:mvn/version "1.1.6"} + integrant/integrant {:mvn/version "0.8.0"} + com.owoga/tightly-packed-trie {:local/root "/home/eihli/code/tightly-packed-trie/TightlyPackedTrie.jar"} org.clojure/data.fressian {:mvn/version "1.0.0"} com.taoensso/nippy {:mvn/version "3.0.0"} com.taoensso/timbre {:mvn/version "4.10.0"}} diff --git a/dev/examples/tpt.clj b/dev/examples/tpt.clj new file mode 100644 index 0000000..f94d24e --- /dev/null +++ b/dev/examples/tpt.clj @@ -0,0 +1,251 @@ +(ns examples.tpt + (:require [clojure.string :as string] + [clojure.java.io :as io] + [com.owoga.tightly-packed-trie.core :as tpt] + [com.owoga.prhyme.util :as util] + [com.owoga.prhyme.data.dictionary :as dict] + [clojure.zip :as zip])) + +(defn dark-corpus-file-seq [start end] + (let [documents (->> "dark-corpus" + io/file + file-seq + (remove #(.isDirectory %)) + (drop start) + (take end))] + documents)) + +(defn remove-sentences-with-words-not-in-dictionary + "This gets rid of a lot of good words. All contractions for example... I'll, They'll... + possessives like morning's... + + Might not end up using it." + [dictionary] + (let [dictionary (into #{} dictionary)] + (fn [sentences] + (->> sentences + (map #(string/split % #" ")) + (remove #(some (complement dictionary) %)) + (remove #(some string/blank? %)) + (map #(string/join " " %)))))) + +(def re-word + "Regex for tokenizing a string into words + (including contractions and hyphenations), + commas, periods, and newlines." + #"(?s).*?([a-zA-Z\d]+(?:['\-]?[a-zA-Z]+)?|,|\.|\n)") + +(defn pad-tokens + "Pads the beginning with n - 1 tokens and + the end with 1 token." + [tokens n] + (concat (repeat (max 1 (dec n)) "") tokens [""])) + +(defn tokenize-line + [line] + (->> line + (string/trim) + (re-seq re-word) + (map second) + (map string/lower-case))) + +(def database (atom {})) + +(defn process-files-for-trie + "Expects an entire song, lines seperated by \n." + [files] + (->> files + (map slurp) + (filter dict/english?) + (map util/clean-text) + (map #(string/split % #"\n+")) + (map (remove-sentences-with-words-not-in-dictionary dict/popular)) + (remove empty?) + (remove #(some empty? %)) + (map (fn [lines] + (map tokenize-line lines))) + (map (fn [lines] + (map #(pad-tokens % 1) lines))) + (map (fn [lines] + (map #(partition 2 1 %) lines))))) + +(defn text->ngrams + "Takes text from a file, including newlines. + Pads lines with and for start/end of line. + Pads beginning with n - 1 s" + [text n] + (let [words-not-in-dict-filter (remove-sentences-with-words-not-in-dictionary dict/popular)] + (->> text + util/clean-text + (#(string/split % #"\n+")) + (remove empty?) + (map tokenize-line) + (map #(pad-tokens % n)) + (map #(partition n 1 %)) + (apply concat)))) + +(defn make-trie + ([] (tpt/->Trie + (fn update-fn [prev cur] + (if (nil? prev) + {:value (last cur) + :count 1} + (-> prev + (update :count (fnil inc 0)) + (assoc :value (last cur))))) + (sorted-map))) + ([& ks] + (reduce + (fn [t k] + (conj t k)) + (make-trie) + ks))) + +(defn n-to-m-grams + "Exclusive of m, similar to range." + [n m text] + (loop [i n + r '()] + (cond + (= i m) + (apply concat r) + :else + (recur (inc i) (cons (text->ngrams text i) r))))) + + +(defn prep-ngram-for-trie + "The tpt/trie expects values conjed into an ngram + to be of format '(k1 k2 k3 value)." + [ngram] + (concat ngram (list ngram))) + +(defn create-trie-from-texts [texts] + (->> texts + (map #(n-to-m-grams 1 4 %)) + (apply concat) + (map prep-ngram-for-trie) + (apply make-trie))) + +(defn trie->seq-of-nodes [trie] + (->> trie + tpt/as-vec + zip/vector-zip + (iterate zip/next) + (take-while (complement zip/end?)) + (map zip/node) + (filter map?))) + +(defn seq-of-nodes->sorted-by-count + "Sorted first by the rank of the ngram, lowest ranks first. + Sorted second by the frequency of the ngram, highest frequencies first. + This is the order that you'd populate a mapping of keys to IDs." + [nodes] + (->> nodes + (map (comp first seq)) + (map (fn [[k v]] + (vector (:value v) (:count v)))) + ;; root node and padded starts + (remove (comp nil? second)) + (sort-by #(vector (count (first %)) + (- (second %)))))) + +(defn trie->database [trie] + (let [sorted-keys + (->> (trie->seq-of-nodes trie) + seq-of-nodes->sorted-by-count)] + (loop [sorted-keys sorted-keys + database {} + i 1] + (if (empty? sorted-keys) + database + (recur + (rest sorted-keys) + (-> database + (assoc (first (first sorted-keys)) + {:count (second (first sorted-keys)) + :id i}) + (assoc i (first (first sorted-keys)))) + (inc i)))))) + +(defn transform-trie->ids [trie database] + (let [transform-p #(map? (zip/node %)) + transform-f + (fn tf [loc] + (zip/edit + loc + (fn [node] + (let [[k v] (first (seq node))] + {(get-in database [(list k) :id] (if (= k :root) :root)) + (assoc v :value (get-in database [(list k) :count] 0))}))))] + (tpt/transform trie (tpt/visitor-filter transform-p transform-f)))) + +(defonce trie + (let [texts (->> (dark-corpus-file-seq 500 500) + (map slurp))] + (create-trie-from-texts texts))) + +(defonce trie-database + (trie->database trie)) + +(def tightly-packed-trie + (let [trie-with-ids (transform-trie->ids trie trie-database) + tightly-packed-trie (tpt/tightly-packed-trie trie-with-ids)] + tightly-packed-trie)) + +(defn key-get-in-tpt [tpt db ks] + (let [id (map #(get-in db [(list %) :id]) ks) + v (get tpt id)] + (println id ks) + {id v})) + +(defn id-get-in-tpt [tpt db ids] + (let [ks (apply concat (map #(get db %) ids)) + v (get tpt ids) + id (get-in db [ks :id])] + {ks (assoc v :value (get db id))})) + +(comment + (key-get-in-tpt + tightly-packed-trie + trie-database + '("" "" "the")) + ;; => {(2 2 3) {:value 3263, :count 462}} + (id-get-in-tpt + tightly-packed-trie + trie-database + '(2 2 3)) + ;; => {("" "" "the") {:value ("" "" "the"), :count 462}} + ) + +(comment + ;; database + (let [texts (->> (dark-corpus-file-seq 500 2) + (map slurp)) + trie (create-trie-from-texts texts)] + (->> (trie->database trie) + (#(get % 3)))) + + (let [texts (->> (dark-corpus-file-seq 500 2) + (map slurp)) + trie (create-trie-from-texts texts)] + (tpt/as-map (transform-trie->ids trie))) + + (let [texts (->> (dark-corpus-file-seq 500 2) + (map slurp)) + trie (create-trie-from-texts texts) + tightly-packed-trie (tpt/tightly-packed-trie + (transform-trie->ids trie))] + (get tightly-packed-trie '(2 2 3))) + + + (let [texts (->> (dark-corpus-file-seq 500 2) + (map slurp)) + trie (create-trie-from-texts texts)] + (tpt/as-map trie)) + + (let [text (slurp (first (dark-corpus-file-seq 500 1)))] + (->> text + util/clean-text + (#(string/split % #"\n+")))) + + ) diff --git a/src/com/owoga/prhyme/core.clj b/src/com/owoga/prhyme/core.clj index ceaaccd..1811738 100644 --- a/src/com/owoga/prhyme/core.clj +++ b/src/com/owoga/prhyme/core.clj @@ -286,9 +286,6 @@ [data rime] (map (partial rhyming-word data) rime)) -(defn deep-merge-with [& maps] - ((apply merge-with merge maps))) - (defn flatten-node [node] (let [zipper (zip/zipper (fn branch? [node] diff --git a/src/com/owoga/prhyme/generation/simple_good_turing.clj b/src/com/owoga/prhyme/generation/simple_good_turing.clj index 5276424..c314bf9 100644 --- a/src/com/owoga/prhyme/generation/simple_good_turing.clj +++ b/src/com/owoga/prhyme/generation/simple_good_turing.clj @@ -16,7 +16,7 @@ (defn pad-tokens [tokens n] - (concat (repeat (min 1 (dec n)) "") tokens [""])) + (concat (repeat (max 1 (dec n)) "") tokens [""])) (defn tokenize-line [line] diff --git a/src/com/owoga/prhyme/rhyme_trie.clj b/src/com/owoga/prhyme/rhyme_trie.clj index b511921..b5189ba 100644 --- a/src/com/owoga/prhyme/rhyme_trie.clj +++ b/src/com/owoga/prhyme/rhyme_trie.clj @@ -359,7 +359,6 @@ child-nodes) index-ba (let [index-baos (ByteArrayOutputStream.) child-byte-arrays (map pack-index-entry children)] - (println child-byte-arrays) (loop [bas child-byte-arrays] (if (empty? bas) (.toByteArray index-baos) @@ -368,11 +367,9 @@ (zip/edit loc (fn [node] - (println children) (let [[k v] (first (seq node))] (.write baos (pack-node-value v)) (.write baos (tpt/vb-encode (count index-ba))) - (println "writing index" (map int index-ba)) (.write baos index-ba) {k (conj v {:byte-address byte-address :byte-array (.toByteArray baos)})}))))) @@ -428,7 +425,6 @@ clojure.lang.ILookup (valAt [_ k] - (println (cons :root (interleave (repeat :children) k))) (get-in trie (cons :root (interleave (repeat :children) k)))) (valAt [_ k not-found] (get-in trie (cons :root (interleave (repeat :children) k)) not-found)) @@ -503,7 +499,6 @@ (let [slice (partial tpt/bit-slice 0 7) combine (partial tpt/combine-significant-bits 7)] (loop [bytes []] - (println (.position bb) (map int bytes)) (cond (or (< max-position (.position bb)) (zero? (.remaining bb))) @@ -519,8 +514,6 @@ (let [slice (partial tpt/bit-slice 0 7) combine (partial tpt/combine-significant-bits 7)] (loop [bytes []] - (println (.position bb) (map int bytes)) - (println "max" max-position) (cond (or (< max-position (.position bb)) (zero? (.remaining bb))) @@ -545,21 +538,15 @@ (defn find-key-in-index [bb target-key max-address not-found] - (println target-key "pos" (.position bb)) (loop [previous-key nil min-position (.position bb) max-position max-address] (if (zero? (- max-position min-position)) not-found (let [mid-position (+ min-position (quot 2 (- max-position min-position)))] - (Thread/sleep 20) - (println min-position mid-position max-position) (.position bb mid-position) (let [bb (rewind-to-key bb min-position) - _ (println "rewound to key") - current-key (decode-key bb max-position) - _ (println "cur key" current-key)] - (println "keys" current-key target-key) + current-key (decode-key bb max-position)] (cond (= current-key target-key) (decode-offset bb max-position) @@ -595,7 +582,6 @@ (let [val (tpt/byte-buffer-variable-length-decode byte-buffer) freq (tpt/byte-buffer-variable-length-decode byte-buffer) size-of-index (tpt/byte-buffer-variable-length-decode byte-buffer) - _ (println "val" val "freq" freq "size" size-of-index) offset (find-key-in-index byte-buffer (first ks) @@ -619,7 +605,6 @@ (let [val (tpt/byte-buffer-variable-length-decode byte-buffer) freq (tpt/byte-buffer-variable-length-decode byte-buffer) size-of-index (tpt/byte-buffer-variable-length-decode byte-buffer) - _ (println "val" val "freq" freq "size" size-of-index) offset (find-key-in-index byte-buffer (first ks) @@ -630,6 +615,7 @@ (do (.position byte-buffer (- current-address offset)) (recur (rest ks))))))))))) + (comment (let [v1 '(1 2 1 121) v2 '(1 3 1 131) @@ -659,9 +645,7 @@ (.write baos byte-array) loc)))) (let [ba (.toByteArray baos) - _ (println "root-address") root-address (get-in (as-map trie) [:root :byte-address]) - _ (println root-address) byte-buf (java.nio.ByteBuffer/allocate (+ 4 (count ba)))] (.putInt byte-buf root-address) (.put byte-buf ba)