diff --git a/src/com/owoga/prhyme/data/tpt.clj b/src/com/owoga/prhyme/data/tpt.clj index 9e1e525..36ed189 100644 --- a/src/com/owoga/prhyme/data/tpt.clj +++ b/src/com/owoga/prhyme/data/tpt.clj @@ -89,10 +89,29 @@ (run! #(.writeBytes baos %))) (let [ba (.toByteArray baos)] (vb-decode ba))) - ;; => ([0 1] [1 1] [127 1] [128 2] [257 2] [9876543210 5]) ) +(defn byte-buffer-variable-length-decode + [bb] + (let [combine (fn [n b] + (bit-or (bit-and 0x7f b) + (bit-shift-left n 7)))] + (loop [n 0 i 0] + (let [b (.get bb)] + (if (zero? (bit-and b 0x80)) + (recur + (combine n b) + (inc i)) + (combine n b)))))) + +(comment + (let [bb (java.nio.ByteBuffer/wrap (vb-encode 9876543210))] + (println (.limit bb)) + (byte-buffer-variable-length-decode bb)) + + ) + (defn encode-with-flag-bits "Flag is a binary string. Returns byte array." diff --git a/src/com/owoga/prhyme/rhyme_trie.clj b/src/com/owoga/prhyme/rhyme_trie.clj index c45dd33..cbacfe0 100644 --- a/src/com/owoga/prhyme/rhyme_trie.clj +++ b/src/com/owoga/prhyme/rhyme_trie.clj @@ -7,35 +7,49 @@ DataOutputStream DataInputStream))) (defprotocol IByteArrayCursor - (loc [_]) + (loc [_] "Current index location of cursor.") (jump [_ loc] "Moves location of cursor to specified index.") - (advance [_] [_ n]) - (retreat [_] [_ n]) - (peek [_]) - (val [_]) - (slice [_ end]) - (ba= [_ oba] (identical? ba oba))) + (forward [_] [_ n] "Jump forward n bytes.") + (backward [_] [_ n] "Jump backward n bytes.") + (slice [_ end] "A view of the byte array between loc and end.") + (ba= [_ other-ba] "Compares for identical equality the backing byte-array.")) (deftype ByteArrayCursor [ba loc] + clojure.lang.Indexed + (nth [_ i] + (if (and (>= i 0) + (< i (count ba))) + (ByteArrayCursor. ba i) + (throw (ex-info "Index out of bounds.")))) + (nth [self i not-found] + (if (and (>= i 0) + (< i (count ba))) + (ByteArrayCursor. ba i) + not-found)) + IByteArrayCursor (loc [_] loc) (jump [_ loc] (ByteArrayCursor. ba loc)) - (advance [_] (ByteArrayCursor. ba (inc loc))) - (advance [_ n] (ByteArrayCursor. ba (+ loc n))) - (retreat [_] (ByteArrayCursor. ba (dec loc))) - (retreat [_ n] (ByteArrayCursor. ba (- loc n))) - (peek [_] (nth ba (inc loc))) - (val [_] (nth ba loc)) + (forward [_ n] (ByteArrayCursor. ba (+ loc n))) + (backward [_ n] (ByteArrayCursor. ba (- loc n))) (slice [_ n] (loop [i 0 r []] (if (or (= i n) (>= (+ loc i) (count ba))) - (byte-array r) + (ByteArrayCursor. (byte-array r) 0) (recur (inc i) (conj r (aget ba (+ i loc))))))) - clojure.lang.Indexed - (nth [_ i] (aget ba (+ loc i))) - (nth [_ i not-found] (aget ba (+ loc i) not-found)) + clojure.lang.ILookup + (valAt [_ i] + (if (or (>= i (count ba)) + (< i 0)) + (throw "Index out of boundes.") + (aget ba i))) + (valAt [_ i not-found] + (if (or (>= i (count ba)) + (< i 0)) + not-found + (aget ba i))) clojure.lang.IPersistentCollection (count [_] (count ba)) @@ -49,6 +63,10 @@ (next [self] (ByteArrayCursor. ba (inc loc))) (more [_] (ByteArrayCursor. ba (inc loc))) + clojure.lang.IPersistentStack + (peek [self] (aget ba loc)) + (pop [_] (ByteArrayCursor. ba (inc loc))) + clojure.lang.Seqable (seq [_] (let [seeker (fn seeker [ba i] (if (>= i (count ba)) @@ -109,7 +127,9 @@ (let [children (first trie) parent (second trie) [parent-key parent-val] (first (seq parent))] - {parent-key (assoc parent-val :children (into {} (map vec-trie->map-trie children)))})) + (sorted-map + parent-key + (assoc parent-val :children (into (sorted-map) (map vec-trie->map-trie children)))))) (comment (let [vect [[[[[[[[] {"T" {:value "TAT", :count 1}}]] {"A" {:value "AT", :count 1}}] @@ -226,10 +246,10 @@ (visitor loc) loc))) -(def *offset-flag* 0) +(def offset-flag 0) (defn offset-byte? [b] - (= *offset-flag* (tpt/nth-bit 8 b))) + (= offset-flag (tpt/nth-bit 8 b))) (defn encode-offset "0-padded" @@ -243,17 +263,17 @@ (defn decode-offset-cursor [cursor] (let [bytes (take-while offset-byte? cursor) [decoded byte-count] (tpt/decode-with-flag-bits "0" (byte-array bytes))] - [decoded (advance cursor byte-count)])) + [decoded (forward cursor byte-count)])) (defn encode-key "1-padded" [n] (tpt/encode-with-flag-bits "1" n)) -(def *key-flag* 1) +(def key-flag 1) (defn key-byte? [b] - (= *key-flag* (tpt/nth-bit 8 b))) + (= key-flag (tpt/nth-bit 8 b))) (defn decode-key [ba] (let [bytes (take-while key-byte? ba)] @@ -262,7 +282,7 @@ (defn decode-key-cursor [cursor] (let [bytes (take-while key-byte? cursor) [decoded byte-count] (tpt/decode-with-flag-bits "1" (byte-array bytes))] - [decoded (advance cursor byte-count)])) + [decoded (forward cursor byte-count)])) (comment ;; Test encoding and decoding key/offset pairs. @@ -303,6 +323,8 @@ (.write baos (tpt/vb-encode (get node :count 0))) (.toByteArray baos))) +(defn unpack-node-value + []) (defn pack-index-entry [child] (let [baos (ByteArrayOutputStream.)] @@ -337,21 +359,48 @@ child-nodes) index-ba (let [index-baos (ByteArrayOutputStream.) child-byte-arrays (map pack-index-entry children)] + (println child-byte-arrays) (loop [bas child-byte-arrays] (if (empty? bas) - (.toByteArray baos) + (.toByteArray index-baos) (do (.write index-baos (first bas)) (recur (rest bas))))))] (zip/edit loc (fn [node] + (println children) (let [[k v] (first (seq node))] (.write baos (pack-node-value v)) (.write baos (tpt/vb-encode (count index-ba))) + (println "writing index" (map int index-ba)) (.write baos index-ba) {k (conj v {:byte-address byte-address :byte-array (.toByteArray baos)})}))))) +(let [baos (ByteArrayOutputStream.)] + (.write baos (.toByteArray (ByteArrayOutputStream.)))) + +(defn update-in-sorted + "'Updates' a value in a nested associative structure, where ks is a + sequence of keys and f is a function that will take the old value + and any supplied args and return the new value, and returns a new + nested structure. If any levels do not exist, hash-maps will be + created." + {:added "1.0" + :static true} + ([m ks f & args] + (let [up (fn up [m ks f args] + (let [m (or m (sorted-map)) + [k & ks] ks] + (assert (instance? clojure.lang.PersistentTreeMap m) + (apply str + "A non-sorted hash-map in a sorted" + "hash-map will probably be the seed of some problems.")) + (if ks + (assoc m k (up (get m k) ks f args)) + (assoc m k (apply f (get m k) args)))))] + (up m ks f args)))) + (defprotocol ITrie (as-map [this] "Map that underlies trie.") (as-vec [this] "Depth-first post-order vector.") @@ -377,6 +426,13 @@ (vec-trie->map-trie) (Trie.))) + clojure.lang.ILookup + (valAt [_ k] + (println (cons :root (interleave (repeat :children) k))) + (get-in trie (cons :root (interleave (repeat :children) k)))) + (valAt [_ k not-found] + (get-in trie (cons :root (interleave (repeat :children) k)) not-found)) + clojure.lang.IPersistentCollection (seq [self] (->> self @@ -392,7 +448,7 @@ id (last o) node (get-in trie path)] (Trie. - (update-in + (update-in-sorted trie path (fn [prev] @@ -408,7 +464,7 @@ (= (as-map o) trie)))) (defn trie - ([] (->Trie {})) + ([] (->Trie (sorted-map))) ([& entries] (reduce (fn [t entry] @@ -433,17 +489,156 @@ (.write baos (:byte-array node)) (recur (rest nodes)))))))))) + +(defn bytes->int [byte-arr] + (reduce + (fn [acc b] + (bit-or + (bit-and 0xFF b) + (bit-shift-left acc 8))) + 0 + byte-arr)) + +(defn decode-key [bb max-position] + (loop [bytes []] + (println (.position bb) (map int bytes)) + (cond + (or (< max-position (.position bb)) + (zero? (.remaining bb))) + (first (tpt/vb-decode-1 (byte-array bytes))) + + (offset-byte? (.get bb (.position bb))) + (first (tpt/vb-decode-1 (byte-array bytes))) + + :else + (recur (conj bytes (.get bb)))))) + +(defn decode-offset [bb max-position] + (loop [bytes []] + (println (.position bb) (map int bytes)) + (println "max" max-position) + (cond + (or (< max-position (.position bb)) + (zero? (.remaining bb))) + (first (tpt/vb-decode-1 (byte-array bytes))) + + (key-byte? (.get bb (.position bb))) + (first (tpt/vb-decode-1 (byte-array bytes))) + + :else + (recur (conj bytes (.get bb)))))) + +(defn rewind-to-key [bb stop] + (loop [] + (let [current (.get bb (.position bb)) + previous (.get bb (dec (.position bb)))] + (if (or (= stop (.position bb)) + (and (key-byte? current) + (offset-byte? previous))) + bb + (do (.position bb (dec (.position bb))) + (recur)))))) + +(defn find-key-in-index + [bb target-key max-address] + (println target-key "pos" (.position bb)) + (loop [previous-key nil + min-position (.position bb) + max-position max-address + mid-position (+ min-position (quot 2 (- max-position min-position)))] + (Thread/sleep 20) + (println min-position mid-position max-position) + (.position bb mid-position) + (let [bb (rewind-to-key bb min-position) + _ (println "rewound to key") + current-key (decode-key bb max-position) + _ (println "cur key" current-key)] + (println "keys" current-key target-key) + (cond + (= current-key target-key) + (do (println "=") + (.position bb (decode-offset bb max-position)) + bb) + (= current-key previous-key) + (throw "Key not found.") + (< current-key target-key) + (recur + current-key + mid-position + max-position + (+ mid-position (quot 2 (- max-position mid-position)))) + (> current-key target-key) + (recur + current-key + min-position + mid-position + (+ min-position (quot 2 (- mid-position min-position)))))))) + +(deftype TightlyPackedTrie [byte-buffer] + clojure.lang.ILookup + (valAt [_ ks] + (let [root-address (.getInt byte-buffer 0)] + (.position byte-buffer root-address) + (loop [ks ks] + (if (empty? ks) + (let [value (tpt/byte-buffer-variable-length-decode byte-buffer) + freq (tpt/byte-buffer-variable-length-decode byte-buffer)] + {:value value + :count freq}) + (let [val (tpt/byte-buffer-variable-length-decode byte-buffer) + freq (tpt/byte-buffer-variable-length-decode byte-buffer) + size-of-index (tpt/byte-buffer-variable-length-decode byte-buffer) + _ (println "val" val "freq" freq "size" size-of-index) + bb (find-key-in-index byte-buffer (first ks) (+ (.position byte-buffer) size-of-index))] + (recur (rest ks)))))))) + (comment (let [v1 '(1 2 1 121) v2 '(1 3 1 131) v3 '(1 2 12) t1 (trie v1) t2 (trie v2) - t3 (trie v1 v2 v3) - vect (as-vec t3)] - (->> (as-byte-array t3) - (mapped-byte-array-trie->byte-array) - (map zip/node))) + t3 (trie '(1 2 2 122) v1 '(1 2 3 123) v2 v3) + vect (as-vec t3) + packed (tightly-packed-trie t3) + tpt (->TightlyPackedTrie packed)] + (println packed) + (get tpt '(1))) + + ) + +(defn tightly-packed-trie + "Assumes 4 bytes for root byte address." + [trie] + (let [baos (ByteArrayOutputStream.) + trie (as-byte-array trie)] + (transform + trie + (visitor-filter + #(map? (zip/node %)) + (fn [loc] + (let [{:keys [byte-array]} (second (first (seq (zip/node loc))))] + (.write baos byte-array) + loc)))) + (let [ba (.toByteArray baos) + _ (println "root-address") + root-address (get-in (as-map trie) [:root :byte-address]) + _ (println root-address) + byte-buf (java.nio.ByteBuffer/allocate (+ 4 (count ba)))] + (.putInt byte-buf root-address) + (.put byte-buf ba) + byte-buf))) + +(comment + (let [v1 '(1 2 1 121) + v2 '(1 3 1 131) + v3 '(1 2 12) + t1 (trie v1) + t2 (trie v2) + t3 (trie '(1 2 2 122) v1 '(1 2 3 123) v2 v3) + vect (as-vec t3) + packed (tightly-packed-trie t3)] + (map byte (.array packed))) )