diff --git a/src/com/owoga/prhyme/data/tpt.clj b/src/com/owoga/prhyme/data/tpt.clj index 9cf7e4b..d59562e 100644 --- a/src/com/owoga/prhyme/data/tpt.clj +++ b/src/com/owoga/prhyme/data/tpt.clj @@ -10,17 +10,105 @@ (defn sbyte [b] (bit-or -0x100 b)) +(defn as-binary-string [b] + (string/replace + (format "%8s" (Integer/toBinaryString b)) + #" " + "0")) + +(defn bits [binary-string] + (Integer/parseInt binary-string 2)) + +(defn encode-with-flag-bits + "Flag is a binary string. + Returns byte array." + [flag n] + (let [flag-len (count flag) + data-len (- 8 flag-len) + max-data-val (int (Math/pow 2 data-len)) + flag-val (bit-shift-left (bits flag) data-len)] + (loop [n n r '()] + (if (< n max-data-val) + (byte-array (cons (bit-or flag-val n) r)) + (recur + (quot n max-data-val) + (cons + (bit-or flag-val (mod n max-data-val)) + r)))))) + +(comment + (let [ns [0 1 127 128 257 9876543210] + encoded (map + (partial encode-with-flag-bits "101") + ns) + as-binary (->> encoded + (map (partial map as-binary-string)) + (map (partial map (partial take-last 8))) + (map (partial map (partial apply str))))] + as-binary) + ;; => (("10100000") + ;; ("10100001") + ;; ("10100011" "10111111") + ;; ("10100100" "10100000") + ;; ("10101000" "10100001") + ;; ("10101001" "10100110" "10101011" "10100000" "10100101" "10110111" "10101010")) + ) + +(defn decode-with-flag-bits + "Flag is only used for length. It could be refactored to take an int. + Or it could be refactored to take a function that does work based on the flags. + For now though, flag bits must be checked external to this. This is just parity + decoding of the encoding above." + [flag ba] + (let [flag-len (count flag) + data-len (- 8 flag-len) + mask-val (int (dec (Math/pow 2 data-len)))] + (loop [ba ba r 0] + (if (empty? ba) + r + (let [data-val (bit-and mask-val (first ba))] + (recur + (rest ba) + (bit-or + (bit-shift-left r data-len) + data-val))))))) + +(comment + (let [ns [0 1 127 128 257 9876543210] + encoded (map + (partial encode-with-flag-bits "101") + ns) + decoded (->> encoded + (map (partial decode-with-flag-bits "101")))] + decoded) + ;; => (0 1 127 128 257 9876543210) + ) + +(defn encode-byte-with-n-flag-bits [n b] + (let [max-size-with-flag (int (Math/pow 2 (- 8 n)))] + (loop [b b r '()] + (if (< b max-size-with-flag) + (byte-array (cons b r)) + (recur (quot b max-size-with-flag) + (cons (mod b max-size-with-flag) r)))))) + +(defn decode-byte-with-n-flag-bits [n ba] + (let [max-size-with-flag (int (Math/pow 2 (- 8 n)))] + (loop [ba ba r 0] + (if (nil? ba) + r + (recur (rest ba) + (+ r )))))) +(defn bit-on [i b] + (bit-or b (int (Math/pow 2 i)))) + ;; I've got a dictionary of ~120,000 words. ;; I need to reference them in a compact way. ;; It will be a little bit of a waste, but we can use a 32 bit index. (/ (Math/log 1.2e5) (Math/log 2)) -(defn as-binary-string [b] - (string/replace - (format "%8s" (Integer/toBinaryString b)) - #" " - "0")) + (defn vb-encode [n] (loop [n n diff --git a/src/com/owoga/prhyme/rhyme_trie.clj b/src/com/owoga/prhyme/rhyme_trie.clj index 2ea583f..62f5db2 100644 --- a/src/com/owoga/prhyme/rhyme_trie.clj +++ b/src/com/owoga/prhyme/rhyme_trie.clj @@ -124,9 +124,128 @@ ) +;;;; Tightly Packed Trie +;; The following functions are helpers for converting +;; a trie into a tightly-packed byte-array. + +(defn previous-node [loc] + (loop [loc (zip/prev loc)] + (cond + (nil? loc) nil + (map? (zip/node loc)) loc + :else (recur (zip/prev loc))))) + +(defn loc->byte-address + "Given a loc without a byte-address, calculate it from the previous loc." + ([loc] + (loc->byte-address loc 0)) + ([loc starting-offset] + (let [prev (previous-node loc)] + (when prev (println "prev" (zip/node prev))) + (if prev + (let [[k {:keys [byte-address byte-array]}] (first (seq (zip/node prev)))] + (+ byte-address (count byte-array))) + starting-offset)))) + +(defn child->index + "Given a child gets a map with info needed to build an index." + [child] + (let [[k {:keys [byte-address byte-array] :as v}] (first (seq child))] + (println byte-address byte-array) + {:byte-address byte-address + :key k + :byte-array byte-array})) + +(defn visitor-filter [pred visitor] + (fn [loc] + (if (pred loc) + (visitor loc) + loc))) + +(defn encode-offset + "0-padded" + [n] + (tpt/encode-with-flag-bits "0" n)) + +(defn decode-offset + [ba] + (tpt/decode-with-flag-bits "0" ba)) + +(defn encode-key + "1-padded" + [n] + (tpt/encode-with-flag-bits "1" n)) + +(defn decode-key [ba] + (tpt/decode-with-flag-bits "1" ba)) + +(defn pack-node-value + "Returns byte-array of node value. + Byte-array is 2 variable-length encoded integers. + For a markov trie, this would be an integer ID + of the n-gram and an integer of the frequency. + + Nodes without terminal values get a value and count of 0." + [node] + (let [baos (ByteArrayOutputStream.)] + (println "node-value" node) + (.write baos (tpt/vb-encode (get node :value 0))) + (.write baos (tpt/vb-encode (get node :count 0))) + (.toByteArray baos))) + +(defn pack-index-entry + [child] + (let [baos (ByteArrayOutputStream.)] + (println "index-entry" child) + (.write baos (encode-key (:key child))) + (.write baos (encode-offset (:offset child))) + (.toByteArray baos))) + +(defn pack-index + "Reserves 4 bytes for offset of root node. + Lots of mutation going on here as we write to baos." + [loc] + (println "pack-index" (zip/node loc)) + (let [baos (ByteArrayOutputStream.) + byte-address (loc->byte-address loc 4) + _ (println "byte-address" byte-address) + child-nodes (->> loc + child-seq + (map (comp second zip/node))) + _ (println "child nodes") + children (map + (fn [child-node] + (let [child-index (child->index child-node)] + (println "cn" child-node) + (assoc + child-index + :offset + (- byte-address (:byte-address child-index))))) + child-nodes) + _ (println "children" (count children)) + index-ba (let [index-baos (ByteArrayOutputStream.) + child-byte-arrays (map pack-index-entry children)] + (println "child-byte-arrays" child-byte-arrays) + (loop [bas child-byte-arrays] + (if (empty? bas) + (.toByteArray baos) + (do (.write index-baos (first bas)) + (recur (rest bas))))))] + (zip/edit + loc + (fn [node] + (println node) + (let [[k v] (first (seq node))] + (.write baos (pack-node-value v)) + (.write baos (tpt/vb-encode (count index-ba))) + (.write baos index-ba) + {k (conj v {:byte-address byte-address + :byte-array (.toByteArray baos)})}))))) + (defprotocol ITrie (as-map [this] "Map that underlies trie.") - (as-vec [this] "Depth-first post-order vector") + (as-vec [this] "Depth-first post-order vector.") + (as-byte-array [this] "Tightly-packed byte-array.") (transform [this f] "Depth-first post-order apply each function to each node.")) ;; Seq offers a depth-first post-order traversal @@ -135,6 +254,11 @@ ITrie (as-map [_] trie) (as-vec [_] (map-trie->seq-trie trie)) + (as-byte-array [self] + (->> (transform self (visitor-filter #(map? (zip/node %)) pack-index)) + as-vec + vec-trie->map-trie + (Trie.))) (transform [self f] (->> self as-vec @@ -182,6 +306,37 @@ (trie) entries))) +(defn mapped-byte-array-trie->byte-array + [trie] + (let [baos (ByteArrayOutputStream.)] + (->> trie + as-vec + zip/vector-zip + (iterate zip/next) + (take-while (complement zip/end?)) + (filter (comp map? zip/node)) + ((fn [nodes] + (loop [nodes nodes] + (if (empty? nodes) + baos + (let [node (first nodes)] + (.write baos (:byte-array node)) + (recur (rest nodes)))))))))) + +(comment + (let [v1 '(1 2 1 121) + v2 '(1 3 1 131) + v3 '(1 2 12) + t1 (trie v1) + t2 (trie v2) + t3 (trie v1 v2 v3) + vect (as-vec t3)] + (->> (as-byte-array t3) + (mapped-byte-array-trie->byte-array) + (map zip/node))) + + ) + (comment (let [v1 '("T" "A" "T" "TAT") v2 '("T" "U" "T" "TUT") @@ -245,69 +400,7 @@ ) -(defn previous-node [loc] - (loop [loc (zip/prev loc)] - (cond - (nil? loc) nil - (map? (zip/node loc)) loc - :else (recur (zip/prev loc))))) -(defn loc->byte-address - "Given a loc without a byte-address, calculate it from the previous loc." - [loc] - (let [prev (previous-node loc)] - (if prev - (let [[k {:keys [byte-address bytes]}] (first (seq (zip/node prev)))] - (+ byte-address (count bytes))) - 0))) - -(defn child->index - "Given a child gets a map with info needed to build an index." - [child] - (let [[k {:keys [byte-address] :as v}] (first (seq child))] - {:byte-address byte-address - :key k})) - -(child->index {"T" {:byte-address 20}}) - -(defn pack-index - "" - [loc] - (println "pack" (zip/node loc)) - (if (map? (zip/node loc)) - (let [parent-byte-address (loc->byte-address loc) - _ (println parent-byte-address) - child-nodes (->> loc - child-seq - (map (comp second zip/node))) - _ (println child-nodes) - child-key-offsets (map - (fn [child-node] - (let [{child-byte-address :byte-address - child-key :key} (child->index child-node)] - {:key child-key - :offset (- parent-byte-address child-byte-address)})) - child-nodes)] - (zip/edit - loc - (fn [node] - (println "zip" node) - (let [[k v] (first (seq node))] - {k {:byte-address parent-byte-address - :bytes (byte-array [1 2 3])}})))) - loc)) - -(comment - (let [v1 '("T" "A" "T" "TAT") - v2 '("T" "U" "T" "TUT") - v3 '("T" "A" "AT") - t1 (trie v1) - t2 (trie v2) - t3 (trie v1 v2 v3) - vect (as-vec t3)] - (transform t3 pack-index)) - - ) (comment (let [v1 '("T" "A" "T" "TAT")