Implement cursors to decode from a byte array

main
Eric Ihli 4 years ago
parent 6e143a91cb
commit 6e9021495a

@ -10,6 +10,14 @@
(defn sbyte [b] (defn sbyte [b]
(bit-or -0x100 b)) (bit-or -0x100 b))
(defn nth-bit [n b]
(let [mask (bit-shift-left 1 n)]
(bit-and
1
(bit-shift-right
(bit-and mask b)
n))))
(defn as-binary-string [b] (defn as-binary-string [b]
(string/replace (string/replace
(format "%8s" (Integer/toBinaryString b)) (format "%8s" (Integer/toBinaryString b))
@ -19,6 +27,72 @@
(defn bits [binary-string] (defn bits [binary-string]
(Integer/parseInt binary-string 2)) (Integer/parseInt binary-string 2))
(defn bit-on [i b]
(bit-or b (int (Math/pow 2 i))))
;; I've got a dictionary of ~120,000 words.
;; I need to reference them in a compact way.
;; It will be a little bit of a waste, but we can use a 32 bit index.
(/ (Math/log 1.2e5) (Math/log 2))
(defn vb-encode [n]
(loop [n n
b '()]
(cond
(< n 128) (let [ba (byte-array (cons n b))
i (dec (count ba))]
(aset-byte ba i (sbyte (+ 128 (aget ba i))))
ba)
:else (recur (quot n 128) (cons (mod n 128) b)))))
(defn vb-decode-1 [ba]
(loop [n 0
i 0]
(let [b (aget ba i)]
(cond
(> (bit-and b 0x80) 0)
[(bit-or (bit-and 0x7f b)
(bit-shift-left n 7))
(inc i)]
:else
(recur (bit-or (bit-and 0x7f b)
(bit-shift-left n 7))
(inc i))))))
(defn vb-decode
([ba]
(vb-decode ba 0))
([ba i]
(if (>= i (count ba))
(cons (first (vb-decode-1 ba))
nil)
(let [[value byte-count] (vb-decode-1 ba)]
(lazy-seq
(cons
value
(vb-decode (byte-array (drop byte-count ba))
(+ i byte-count))))))))
(comment
(let [n1 0
n2 1
n3 127
n4 128
n5 257
n6 9876543210
baos (java.io.ByteArrayOutputStream.)]
(->> [n1 n2 n3 n4 n5 n6]
(map vb-encode)
(run! #(.writeBytes baos %)))
(let [ba (.toByteArray baos)]
(vb-decode ba)))
;; => ([0 1] [1 1] [127 1] [128 2] [257 2] [9876543210 5])
)
(defn encode-with-flag-bits (defn encode-with-flag-bits
"Flag is a binary string. "Flag is a binary string.
Returns byte array." Returns byte array."
@ -58,20 +132,22 @@
"Flag is only used for length. It could be refactored to take an int. "Flag is only used for length. It could be refactored to take an int.
Or it could be refactored to take a function that does work based on the flags. Or it could be refactored to take a function that does work based on the flags.
For now though, flag bits must be checked external to this. This is just parity For now though, flag bits must be checked external to this. This is just parity
decoding of the encoding above." decoding of the encoding above.
Returns [decoded-number number-of-bytes-read]."
[flag ba] [flag ba]
(let [flag-len (count flag) (let [flag-len (count flag)
data-len (- 8 flag-len) data-len (- 8 flag-len)
mask-val (int (dec (Math/pow 2 data-len)))] mask-val (int (dec (Math/pow 2 data-len)))]
(loop [ba ba r 0] (loop [ba ba r 0 i 0]
(if (empty? ba) (if (empty? ba)
r [r i]
(let [data-val (bit-and mask-val (first ba))] (let [data-val (bit-and mask-val (first ba))]
(recur (recur
(rest ba) (rest ba)
(bit-or (bit-or
(bit-shift-left r data-len) (bit-shift-left r data-len)
data-val))))))) data-val)
(inc i)))))))
(comment (comment
(let [ns [0 1 127 128 257 9876543210] (let [ns [0 1 127 128 257 9876543210]
@ -81,7 +157,15 @@
decoded (->> encoded decoded (->> encoded
(map (partial decode-with-flag-bits "101")))] (map (partial decode-with-flag-bits "101")))]
decoded) decoded)
;; => (0 1 127 128 257 9876543210) ;; => ([0 1] [1 1] [127 2] [128 2] [257 2] [9876543210 7])
(let [ns [0 1 127 128 257 9876543210]
encoded (map
(partial encode-with-flag-bits "001")
ns)
decoded (->> encoded
(map (partial decode-with-flag-bits "101")))]
decoded)
;; => ([0 1] [1 1] [127 2] [128 2] [257 2] [9876543210 7])
) )
(defn encode-byte-with-n-flag-bits [n b] (defn encode-byte-with-n-flag-bits [n b]
@ -99,71 +183,6 @@
r r
(recur (rest ba) (recur (rest ba)
(+ r )))))) (+ r ))))))
(defn bit-on [i b]
(bit-or b (int (Math/pow 2 i))))
;; I've got a dictionary of ~120,000 words.
;; I need to reference them in a compact way.
;; It will be a little bit of a waste, but we can use a 32 bit index.
(/ (Math/log 1.2e5) (Math/log 2))
(defn vb-encode [n]
(loop [n n
b '()]
(cond
(< n 128) (let [ba (byte-array (cons n b))
i (dec (count ba))]
(aset-byte ba i (sbyte (+ 128 (aget ba i))))
ba)
:else (recur (quot n 128) (cons (mod n 128) b)))))
(defn vb-decode-1 [ba]
(loop [n 0
i 0]
(let [b (aget ba i)]
(cond
(> (bit-and b 0x80) 0)
[(bit-or (bit-and 0x7f b)
(bit-shift-left n 7))
(inc i)]
:else
(recur (bit-or (bit-and 0x7f b)
(bit-shift-left n 7))
(inc i))))))
(defn vb-decode
([ba]
(vb-decode ba 0))
([ba i]
(if (>= i (count ba))
(cons (first (vb-decode-1 ba))
nil)
(let [[value byte-count] (vb-decode-1 ba)]
(lazy-seq
(cons
value
(vb-decode (byte-array (drop byte-count ba))
(+ i byte-count))))))))
(comment
(let [n1 0
n2 1
n3 127
n4 128
n5 257
n6 9876543210
baos (java.io.ByteArrayOutputStream.)]
(->> [n1 n2 n3 n4 n5 n6]
(map vb-encode)
(run! #(.writeBytes baos %)))
(let [ba (.toByteArray baos)]
(vb-decode ba)))
;; => ([0 1] [1 1] [127 1] [128 2] [257 2] [9876543210 5])
)
(def dictionary ["hi" "my" "name" "is" "what"]) (def dictionary ["hi" "my" "name" "is" "what"])

@ -6,6 +6,72 @@
(:import (java.io ByteArrayOutputStream ByteArrayInputStream (:import (java.io ByteArrayOutputStream ByteArrayInputStream
DataOutputStream DataInputStream))) DataOutputStream DataInputStream)))
(defprotocol IByteArrayCursor
(loc [_])
(jump [_ loc] "Moves location of cursor to specified index.")
(advance [_] [_ n])
(retreat [_] [_ n])
(peek [_])
(val [_])
(slice [_ end])
(ba= [_ oba] (identical? ba oba)))
(deftype ByteArrayCursor [ba loc]
IByteArrayCursor
(loc [_] loc)
(jump [_ loc] (ByteArrayCursor. ba loc))
(advance [_] (ByteArrayCursor. ba (inc loc)))
(advance [_ n] (ByteArrayCursor. ba (+ loc n)))
(retreat [_] (ByteArrayCursor. ba (dec loc)))
(retreat [_ n] (ByteArrayCursor. ba (- loc n)))
(peek [_] (nth ba (inc loc)))
(val [_] (nth ba loc))
(slice [_ n]
(loop [i 0 r []]
(if (or (= i n)
(>= (+ loc i) (count ba)))
(byte-array r)
(recur (inc i) (conj r (aget ba (+ i loc)))))))
clojure.lang.Indexed
(nth [_ i] (aget ba (+ loc i)))
(nth [_ i not-found] (aget ba (+ loc i) not-found))
clojure.lang.IPersistentCollection
(count [_] (count ba))
(cons [_ o] (ByteArrayCursor. (byte-array (concat ba [o])) loc))
(empty [_] (ByteArrayCursor. (byte-array 0) 0))
(equiv [self o] (and (ba= o ba)
(= loc (loc o))))
clojure.lang.ISeq
(first [self] (self))
(next [self] (ByteArrayCursor. ba (inc loc)))
(more [_] (ByteArrayCursor. ba (inc loc)))
clojure.lang.Seqable
(seq [_] (let [seeker (fn seeker [ba i]
(if (>= i (count ba))
nil
(lazy-seq
(cons
(aget ba i)
(seeker ba (inc i))))))]
(seeker ba loc))))
(comment
(let [bac1 (->ByteArrayCursor (byte-array (range 100)) 0)
bac2 (->ByteArrayCursor (byte-array (range 100 200)) 0)
bac3 (take-last 5 (concat bac1 bac2))
bac4 (take-last 5 (conj bac1 111))]
[(peek bac1)
(take 5 (next bac1))
(take 5 (rest bac1))
bac3
bac4])
)
(defn branch? [node] (defn branch? [node]
(and (map? node) (and (map? node)
(:children (first (vals node))))) (:children (first (vals node)))))
@ -141,7 +207,6 @@
(loc->byte-address loc 0)) (loc->byte-address loc 0))
([loc starting-offset] ([loc starting-offset]
(let [prev (previous-node loc)] (let [prev (previous-node loc)]
(when prev (println "prev" (zip/node prev)))
(if prev (if prev
(let [[k {:keys [byte-address byte-array]}] (first (seq (zip/node prev)))] (let [[k {:keys [byte-address byte-array]}] (first (seq (zip/node prev)))]
(+ byte-address (count byte-array))) (+ byte-address (count byte-array)))
@ -151,7 +216,6 @@
"Given a child gets a map with info needed to build an index." "Given a child gets a map with info needed to build an index."
[child] [child]
(let [[k {:keys [byte-address byte-array] :as v}] (first (seq child))] (let [[k {:keys [byte-address byte-array] :as v}] (first (seq child))]
(println byte-address byte-array)
{:byte-address byte-address {:byte-address byte-address
:key k :key k
:byte-array byte-array})) :byte-array byte-array}))
@ -162,6 +226,11 @@
(visitor loc) (visitor loc)
loc))) loc)))
(def *offset-flag* 0)
(defn offset-byte? [b]
(= *offset-flag* (tpt/nth-bit 8 b)))
(defn encode-offset (defn encode-offset
"0-padded" "0-padded"
[n] [n]
@ -171,13 +240,55 @@
[ba] [ba]
(tpt/decode-with-flag-bits "0" ba)) (tpt/decode-with-flag-bits "0" ba))
(defn decode-offset-cursor [cursor]
(let [bytes (take-while offset-byte? cursor)
[decoded byte-count] (tpt/decode-with-flag-bits "0" (byte-array bytes))]
[decoded (advance cursor byte-count)]))
(defn encode-key (defn encode-key
"1-padded" "1-padded"
[n] [n]
(tpt/encode-with-flag-bits "1" n)) (tpt/encode-with-flag-bits "1" n))
(def *key-flag* 1)
(defn key-byte? [b]
(= *key-flag* (tpt/nth-bit 8 b)))
(defn decode-key [ba] (defn decode-key [ba]
(tpt/decode-with-flag-bits "1" ba)) (let [bytes (take-while key-byte? ba)]
(tpt/decode-with-flag-bits "1" (byte-array bytes))))
(defn decode-key-cursor [cursor]
(let [bytes (take-while key-byte? cursor)
[decoded byte-count] (tpt/decode-with-flag-bits "1" (byte-array bytes))]
[decoded (advance cursor byte-count)]))
(comment
;; Test encoding and decoding key/offset pairs.
;; Cursor works just the same as a byte-array but
;; instead of the decodes returning a byte-count
;; it returns a cursor with a pointer to the new location.
(let [key (encode-key 128)
offset (encode-offset 65537)
ba (byte-array (concat key offset))
cursor (->ByteArrayCursor
(byte-array (concat (byte-array 3) ba (byte-array (repeat 3 -1))))
3)
[decoded-key key-bytecount] (decode-key ba)
[decoded-offset offset-bytecount] (decode-offset (drop key-bytecount ba))
[decoded-key-cursor new-cursor] (decode-key-cursor cursor)
[decoded-offset-cursor new-cursor] (decode-offset-cursor new-cursor)]
[decoded-key
decoded-offset
decoded-key-cursor
decoded-offset-cursor])
;; => [128 65537 128 65537]
(let [key (encode-key 128)
offset (encode-offset 65537)
bac (->ByteArrayCursor (byte-array (concat key offset)) 0)]
(take 2 ba))
)
(defn pack-node-value (defn pack-node-value
"Returns byte-array of node value. "Returns byte-array of node value.
@ -188,7 +299,6 @@
Nodes without terminal values get a value and count of 0." Nodes without terminal values get a value and count of 0."
[node] [node]
(let [baos (ByteArrayOutputStream.)] (let [baos (ByteArrayOutputStream.)]
(println "node-value" node)
(.write baos (tpt/vb-encode (get node :value 0))) (.write baos (tpt/vb-encode (get node :value 0)))
(.write baos (tpt/vb-encode (get node :count 0))) (.write baos (tpt/vb-encode (get node :count 0)))
(.toByteArray baos))) (.toByteArray baos)))
@ -196,36 +306,37 @@
(defn pack-index-entry (defn pack-index-entry
[child] [child]
(let [baos (ByteArrayOutputStream.)] (let [baos (ByteArrayOutputStream.)]
(println "index-entry" child)
(.write baos (encode-key (:key child))) (.write baos (encode-key (:key child)))
(.write baos (encode-offset (:offset child))) (.write baos (encode-offset (:offset child)))
(.toByteArray baos))) (.toByteArray baos)))
(defn unpack-index-entry
[ba i]
(let [cursor (->ByteArrayCursor ba i)
[key offset] (decode-key )]))
(let [some-array (make-array Integer/TYPE 3)]
(drop 1 some-array))
(defn pack-index (defn pack-index
"Reserves 4 bytes for offset of root node. "Reserves 4 bytes for offset of root node.
Lots of mutation going on here as we write to baos." Lots of mutation going on here as we write to baos."
[loc] [loc]
(println "pack-index" (zip/node loc))
(let [baos (ByteArrayOutputStream.) (let [baos (ByteArrayOutputStream.)
byte-address (loc->byte-address loc 4) byte-address (loc->byte-address loc 4)
_ (println "byte-address" byte-address)
child-nodes (->> loc child-nodes (->> loc
child-seq child-seq
(map (comp second zip/node))) (map (comp second zip/node)))
_ (println "child nodes")
children (map children (map
(fn [child-node] (fn [child-node]
(let [child-index (child->index child-node)] (let [child-index (child->index child-node)]
(println "cn" child-node)
(assoc (assoc
child-index child-index
:offset :offset
(- byte-address (:byte-address child-index))))) (- byte-address (:byte-address child-index)))))
child-nodes) child-nodes)
_ (println "children" (count children))
index-ba (let [index-baos (ByteArrayOutputStream.) index-ba (let [index-baos (ByteArrayOutputStream.)
child-byte-arrays (map pack-index-entry children)] child-byte-arrays (map pack-index-entry children)]
(println "child-byte-arrays" child-byte-arrays)
(loop [bas child-byte-arrays] (loop [bas child-byte-arrays]
(if (empty? bas) (if (empty? bas)
(.toByteArray baos) (.toByteArray baos)
@ -234,7 +345,6 @@
(zip/edit (zip/edit
loc loc
(fn [node] (fn [node]
(println node)
(let [[k v] (first (seq node))] (let [[k v] (first (seq node))]
(.write baos (pack-node-value v)) (.write baos (pack-node-value v))
(.write baos (tpt/vb-encode (count index-ba))) (.write baos (tpt/vb-encode (count index-ba)))

Loading…
Cancel
Save