ByteBuffer is packed with trie. Decoding...

main
Eric Ihli 4 years ago
parent 6e9021495a
commit 809a30b8a3

@ -89,10 +89,29 @@
(run! #(.writeBytes baos %)))
(let [ba (.toByteArray baos)]
(vb-decode ba)))
;; => ([0 1] [1 1] [127 1] [128 2] [257 2] [9876543210 5])
)
(defn byte-buffer-variable-length-decode
[bb]
(let [combine (fn [n b]
(bit-or (bit-and 0x7f b)
(bit-shift-left n 7)))]
(loop [n 0 i 0]
(let [b (.get bb)]
(if (zero? (bit-and b 0x80))
(recur
(combine n b)
(inc i))
(combine n b))))))
(comment
(let [bb (java.nio.ByteBuffer/wrap (vb-encode 9876543210))]
(println (.limit bb))
(byte-buffer-variable-length-decode bb))
)
(defn encode-with-flag-bits
"Flag is a binary string.
Returns byte array."

@ -7,35 +7,49 @@
DataOutputStream DataInputStream)))
(defprotocol IByteArrayCursor
(loc [_])
(loc [_] "Current index location of cursor.")
(jump [_ loc] "Moves location of cursor to specified index.")
(advance [_] [_ n])
(retreat [_] [_ n])
(peek [_])
(val [_])
(slice [_ end])
(ba= [_ oba] (identical? ba oba)))
(forward [_] [_ n] "Jump forward n bytes.")
(backward [_] [_ n] "Jump backward n bytes.")
(slice [_ end] "A view of the byte array between loc and end.")
(ba= [_ other-ba] "Compares for identical equality the backing byte-array."))
(deftype ByteArrayCursor [ba loc]
clojure.lang.Indexed
(nth [_ i]
(if (and (>= i 0)
(< i (count ba)))
(ByteArrayCursor. ba i)
(throw (ex-info "Index out of bounds."))))
(nth [self i not-found]
(if (and (>= i 0)
(< i (count ba)))
(ByteArrayCursor. ba i)
not-found))
IByteArrayCursor
(loc [_] loc)
(jump [_ loc] (ByteArrayCursor. ba loc))
(advance [_] (ByteArrayCursor. ba (inc loc)))
(advance [_ n] (ByteArrayCursor. ba (+ loc n)))
(retreat [_] (ByteArrayCursor. ba (dec loc)))
(retreat [_ n] (ByteArrayCursor. ba (- loc n)))
(peek [_] (nth ba (inc loc)))
(val [_] (nth ba loc))
(forward [_ n] (ByteArrayCursor. ba (+ loc n)))
(backward [_ n] (ByteArrayCursor. ba (- loc n)))
(slice [_ n]
(loop [i 0 r []]
(if (or (= i n)
(>= (+ loc i) (count ba)))
(byte-array r)
(ByteArrayCursor. (byte-array r) 0)
(recur (inc i) (conj r (aget ba (+ i loc)))))))
clojure.lang.Indexed
(nth [_ i] (aget ba (+ loc i)))
(nth [_ i not-found] (aget ba (+ loc i) not-found))
clojure.lang.ILookup
(valAt [_ i]
(if (or (>= i (count ba))
(< i 0))
(throw "Index out of boundes.")
(aget ba i)))
(valAt [_ i not-found]
(if (or (>= i (count ba))
(< i 0))
not-found
(aget ba i)))
clojure.lang.IPersistentCollection
(count [_] (count ba))
@ -49,6 +63,10 @@
(next [self] (ByteArrayCursor. ba (inc loc)))
(more [_] (ByteArrayCursor. ba (inc loc)))
clojure.lang.IPersistentStack
(peek [self] (aget ba loc))
(pop [_] (ByteArrayCursor. ba (inc loc)))
clojure.lang.Seqable
(seq [_] (let [seeker (fn seeker [ba i]
(if (>= i (count ba))
@ -109,7 +127,9 @@
(let [children (first trie)
parent (second trie)
[parent-key parent-val] (first (seq parent))]
{parent-key (assoc parent-val :children (into {} (map vec-trie->map-trie children)))}))
(sorted-map
parent-key
(assoc parent-val :children (into (sorted-map) (map vec-trie->map-trie children))))))
(comment
(let [vect [[[[[[[[] {"T" {:value "TAT", :count 1}}]] {"A" {:value "AT", :count 1}}]
@ -226,10 +246,10 @@
(visitor loc)
loc)))
(def *offset-flag* 0)
(def offset-flag 0)
(defn offset-byte? [b]
(= *offset-flag* (tpt/nth-bit 8 b)))
(= offset-flag (tpt/nth-bit 8 b)))
(defn encode-offset
"0-padded"
@ -243,17 +263,17 @@
(defn decode-offset-cursor [cursor]
(let [bytes (take-while offset-byte? cursor)
[decoded byte-count] (tpt/decode-with-flag-bits "0" (byte-array bytes))]
[decoded (advance cursor byte-count)]))
[decoded (forward cursor byte-count)]))
(defn encode-key
"1-padded"
[n]
(tpt/encode-with-flag-bits "1" n))
(def *key-flag* 1)
(def key-flag 1)
(defn key-byte? [b]
(= *key-flag* (tpt/nth-bit 8 b)))
(= key-flag (tpt/nth-bit 8 b)))
(defn decode-key [ba]
(let [bytes (take-while key-byte? ba)]
@ -262,7 +282,7 @@
(defn decode-key-cursor [cursor]
(let [bytes (take-while key-byte? cursor)
[decoded byte-count] (tpt/decode-with-flag-bits "1" (byte-array bytes))]
[decoded (advance cursor byte-count)]))
[decoded (forward cursor byte-count)]))
(comment
;; Test encoding and decoding key/offset pairs.
@ -303,6 +323,8 @@
(.write baos (tpt/vb-encode (get node :count 0)))
(.toByteArray baos)))
(defn unpack-node-value
[])
(defn pack-index-entry
[child]
(let [baos (ByteArrayOutputStream.)]
@ -337,21 +359,48 @@
child-nodes)
index-ba (let [index-baos (ByteArrayOutputStream.)
child-byte-arrays (map pack-index-entry children)]
(println child-byte-arrays)
(loop [bas child-byte-arrays]
(if (empty? bas)
(.toByteArray baos)
(.toByteArray index-baos)
(do (.write index-baos (first bas))
(recur (rest bas))))))]
(zip/edit
loc
(fn [node]
(println children)
(let [[k v] (first (seq node))]
(.write baos (pack-node-value v))
(.write baos (tpt/vb-encode (count index-ba)))
(println "writing index" (map int index-ba))
(.write baos index-ba)
{k (conj v {:byte-address byte-address
:byte-array (.toByteArray baos)})})))))
(let [baos (ByteArrayOutputStream.)]
(.write baos (.toByteArray (ByteArrayOutputStream.))))
(defn update-in-sorted
"'Updates' a value in a nested associative structure, where ks is a
sequence of keys and f is a function that will take the old value
and any supplied args and return the new value, and returns a new
nested structure. If any levels do not exist, hash-maps will be
created."
{:added "1.0"
:static true}
([m ks f & args]
(let [up (fn up [m ks f args]
(let [m (or m (sorted-map))
[k & ks] ks]
(assert (instance? clojure.lang.PersistentTreeMap m)
(apply str
"A non-sorted hash-map in a sorted"
"hash-map will probably be the seed of some problems."))
(if ks
(assoc m k (up (get m k) ks f args))
(assoc m k (apply f (get m k) args)))))]
(up m ks f args))))
(defprotocol ITrie
(as-map [this] "Map that underlies trie.")
(as-vec [this] "Depth-first post-order vector.")
@ -377,6 +426,13 @@
(vec-trie->map-trie)
(Trie.)))
clojure.lang.ILookup
(valAt [_ k]
(println (cons :root (interleave (repeat :children) k)))
(get-in trie (cons :root (interleave (repeat :children) k))))
(valAt [_ k not-found]
(get-in trie (cons :root (interleave (repeat :children) k)) not-found))
clojure.lang.IPersistentCollection
(seq [self]
(->> self
@ -392,7 +448,7 @@
id (last o)
node (get-in trie path)]
(Trie.
(update-in
(update-in-sorted
trie
path
(fn [prev]
@ -408,7 +464,7 @@
(= (as-map o) trie))))
(defn trie
([] (->Trie {}))
([] (->Trie (sorted-map)))
([& entries]
(reduce
(fn [t entry]
@ -433,17 +489,156 @@
(.write baos (:byte-array node))
(recur (rest nodes))))))))))
(defn bytes->int [byte-arr]
(reduce
(fn [acc b]
(bit-or
(bit-and 0xFF b)
(bit-shift-left acc 8)))
0
byte-arr))
(defn decode-key [bb max-position]
(loop [bytes []]
(println (.position bb) (map int bytes))
(cond
(or (< max-position (.position bb))
(zero? (.remaining bb)))
(first (tpt/vb-decode-1 (byte-array bytes)))
(offset-byte? (.get bb (.position bb)))
(first (tpt/vb-decode-1 (byte-array bytes)))
:else
(recur (conj bytes (.get bb))))))
(defn decode-offset [bb max-position]
(loop [bytes []]
(println (.position bb) (map int bytes))
(println "max" max-position)
(cond
(or (< max-position (.position bb))
(zero? (.remaining bb)))
(first (tpt/vb-decode-1 (byte-array bytes)))
(key-byte? (.get bb (.position bb)))
(first (tpt/vb-decode-1 (byte-array bytes)))
:else
(recur (conj bytes (.get bb))))))
(defn rewind-to-key [bb stop]
(loop []
(let [current (.get bb (.position bb))
previous (.get bb (dec (.position bb)))]
(if (or (= stop (.position bb))
(and (key-byte? current)
(offset-byte? previous)))
bb
(do (.position bb (dec (.position bb)))
(recur))))))
(defn find-key-in-index
[bb target-key max-address]
(println target-key "pos" (.position bb))
(loop [previous-key nil
min-position (.position bb)
max-position max-address
mid-position (+ min-position (quot 2 (- max-position min-position)))]
(Thread/sleep 20)
(println min-position mid-position max-position)
(.position bb mid-position)
(let [bb (rewind-to-key bb min-position)
_ (println "rewound to key")
current-key (decode-key bb max-position)
_ (println "cur key" current-key)]
(println "keys" current-key target-key)
(cond
(= current-key target-key)
(do (println "=")
(.position bb (decode-offset bb max-position))
bb)
(= current-key previous-key)
(throw "Key not found.")
(< current-key target-key)
(recur
current-key
mid-position
max-position
(+ mid-position (quot 2 (- max-position mid-position))))
(> current-key target-key)
(recur
current-key
min-position
mid-position
(+ min-position (quot 2 (- mid-position min-position))))))))
(deftype TightlyPackedTrie [byte-buffer]
clojure.lang.ILookup
(valAt [_ ks]
(let [root-address (.getInt byte-buffer 0)]
(.position byte-buffer root-address)
(loop [ks ks]
(if (empty? ks)
(let [value (tpt/byte-buffer-variable-length-decode byte-buffer)
freq (tpt/byte-buffer-variable-length-decode byte-buffer)]
{:value value
:count freq})
(let [val (tpt/byte-buffer-variable-length-decode byte-buffer)
freq (tpt/byte-buffer-variable-length-decode byte-buffer)
size-of-index (tpt/byte-buffer-variable-length-decode byte-buffer)
_ (println "val" val "freq" freq "size" size-of-index)
bb (find-key-in-index byte-buffer (first ks) (+ (.position byte-buffer) size-of-index))]
(recur (rest ks))))))))
(comment
(let [v1 '(1 2 1 121)
v2 '(1 3 1 131)
v3 '(1 2 12)
t1 (trie v1)
t2 (trie v2)
t3 (trie v1 v2 v3)
vect (as-vec t3)]
(->> (as-byte-array t3)
(mapped-byte-array-trie->byte-array)
(map zip/node)))
t3 (trie '(1 2 2 122) v1 '(1 2 3 123) v2 v3)
vect (as-vec t3)
packed (tightly-packed-trie t3)
tpt (->TightlyPackedTrie packed)]
(println packed)
(get tpt '(1)))
)
(defn tightly-packed-trie
"Assumes 4 bytes for root byte address."
[trie]
(let [baos (ByteArrayOutputStream.)
trie (as-byte-array trie)]
(transform
trie
(visitor-filter
#(map? (zip/node %))
(fn [loc]
(let [{:keys [byte-array]} (second (first (seq (zip/node loc))))]
(.write baos byte-array)
loc))))
(let [ba (.toByteArray baos)
_ (println "root-address")
root-address (get-in (as-map trie) [:root :byte-address])
_ (println root-address)
byte-buf (java.nio.ByteBuffer/allocate (+ 4 (count ba)))]
(.putInt byte-buf root-address)
(.put byte-buf ba)
byte-buf)))
(comment
(let [v1 '(1 2 1 121)
v2 '(1 3 1 131)
v3 '(1 2 12)
t1 (trie v1)
t2 (trie v2)
t3 (trie '(1 2 2 122) v1 '(1 2 3 123) v2 v3)
vect (as-vec t3)
packed (tightly-packed-trie t3)]
(map byte (.array packed)))
)

Loading…
Cancel
Save