Closer to tightly packed trie

main
Eric Ihli 4 years ago
parent 633ac1bd18
commit 6e143a91cb

@ -10,17 +10,105 @@
(defn sbyte [b] (defn sbyte [b]
(bit-or -0x100 b)) (bit-or -0x100 b))
(defn as-binary-string [b]
(string/replace
(format "%8s" (Integer/toBinaryString b))
#" "
"0"))
(defn bits [binary-string]
(Integer/parseInt binary-string 2))
(defn encode-with-flag-bits
"Flag is a binary string.
Returns byte array."
[flag n]
(let [flag-len (count flag)
data-len (- 8 flag-len)
max-data-val (int (Math/pow 2 data-len))
flag-val (bit-shift-left (bits flag) data-len)]
(loop [n n r '()]
(if (< n max-data-val)
(byte-array (cons (bit-or flag-val n) r))
(recur
(quot n max-data-val)
(cons
(bit-or flag-val (mod n max-data-val))
r))))))
(comment
(let [ns [0 1 127 128 257 9876543210]
encoded (map
(partial encode-with-flag-bits "101")
ns)
as-binary (->> encoded
(map (partial map as-binary-string))
(map (partial map (partial take-last 8)))
(map (partial map (partial apply str))))]
as-binary)
;; => (("10100000")
;; ("10100001")
;; ("10100011" "10111111")
;; ("10100100" "10100000")
;; ("10101000" "10100001")
;; ("10101001" "10100110" "10101011" "10100000" "10100101" "10110111" "10101010"))
)
(defn decode-with-flag-bits
"Flag is only used for length. It could be refactored to take an int.
Or it could be refactored to take a function that does work based on the flags.
For now though, flag bits must be checked external to this. This is just parity
decoding of the encoding above."
[flag ba]
(let [flag-len (count flag)
data-len (- 8 flag-len)
mask-val (int (dec (Math/pow 2 data-len)))]
(loop [ba ba r 0]
(if (empty? ba)
r
(let [data-val (bit-and mask-val (first ba))]
(recur
(rest ba)
(bit-or
(bit-shift-left r data-len)
data-val)))))))
(comment
(let [ns [0 1 127 128 257 9876543210]
encoded (map
(partial encode-with-flag-bits "101")
ns)
decoded (->> encoded
(map (partial decode-with-flag-bits "101")))]
decoded)
;; => (0 1 127 128 257 9876543210)
)
(defn encode-byte-with-n-flag-bits [n b]
(let [max-size-with-flag (int (Math/pow 2 (- 8 n)))]
(loop [b b r '()]
(if (< b max-size-with-flag)
(byte-array (cons b r))
(recur (quot b max-size-with-flag)
(cons (mod b max-size-with-flag) r))))))
(defn decode-byte-with-n-flag-bits [n ba]
(let [max-size-with-flag (int (Math/pow 2 (- 8 n)))]
(loop [ba ba r 0]
(if (nil? ba)
r
(recur (rest ba)
(+ r ))))))
(defn bit-on [i b]
(bit-or b (int (Math/pow 2 i))))
;; I've got a dictionary of ~120,000 words. ;; I've got a dictionary of ~120,000 words.
;; I need to reference them in a compact way. ;; I need to reference them in a compact way.
;; It will be a little bit of a waste, but we can use a 32 bit index. ;; It will be a little bit of a waste, but we can use a 32 bit index.
(/ (Math/log 1.2e5) (Math/log 2)) (/ (Math/log 1.2e5) (Math/log 2))
(defn as-binary-string [b]
(string/replace
(format "%8s" (Integer/toBinaryString b))
#" "
"0"))
(defn vb-encode [n] (defn vb-encode [n]
(loop [n n (loop [n n

@ -124,9 +124,128 @@
) )
;;;; Tightly Packed Trie
;; The following functions are helpers for converting
;; a trie into a tightly-packed byte-array.
(defn previous-node [loc]
(loop [loc (zip/prev loc)]
(cond
(nil? loc) nil
(map? (zip/node loc)) loc
:else (recur (zip/prev loc)))))
(defn loc->byte-address
"Given a loc without a byte-address, calculate it from the previous loc."
([loc]
(loc->byte-address loc 0))
([loc starting-offset]
(let [prev (previous-node loc)]
(when prev (println "prev" (zip/node prev)))
(if prev
(let [[k {:keys [byte-address byte-array]}] (first (seq (zip/node prev)))]
(+ byte-address (count byte-array)))
starting-offset))))
(defn child->index
"Given a child gets a map with info needed to build an index."
[child]
(let [[k {:keys [byte-address byte-array] :as v}] (first (seq child))]
(println byte-address byte-array)
{:byte-address byte-address
:key k
:byte-array byte-array}))
(defn visitor-filter [pred visitor]
(fn [loc]
(if (pred loc)
(visitor loc)
loc)))
(defn encode-offset
"0-padded"
[n]
(tpt/encode-with-flag-bits "0" n))
(defn decode-offset
[ba]
(tpt/decode-with-flag-bits "0" ba))
(defn encode-key
"1-padded"
[n]
(tpt/encode-with-flag-bits "1" n))
(defn decode-key [ba]
(tpt/decode-with-flag-bits "1" ba))
(defn pack-node-value
"Returns byte-array of node value.
Byte-array is 2 variable-length encoded integers.
For a markov trie, this would be an integer ID
of the n-gram and an integer of the frequency.
Nodes without terminal values get a value and count of 0."
[node]
(let [baos (ByteArrayOutputStream.)]
(println "node-value" node)
(.write baos (tpt/vb-encode (get node :value 0)))
(.write baos (tpt/vb-encode (get node :count 0)))
(.toByteArray baos)))
(defn pack-index-entry
[child]
(let [baos (ByteArrayOutputStream.)]
(println "index-entry" child)
(.write baos (encode-key (:key child)))
(.write baos (encode-offset (:offset child)))
(.toByteArray baos)))
(defn pack-index
"Reserves 4 bytes for offset of root node.
Lots of mutation going on here as we write to baos."
[loc]
(println "pack-index" (zip/node loc))
(let [baos (ByteArrayOutputStream.)
byte-address (loc->byte-address loc 4)
_ (println "byte-address" byte-address)
child-nodes (->> loc
child-seq
(map (comp second zip/node)))
_ (println "child nodes")
children (map
(fn [child-node]
(let [child-index (child->index child-node)]
(println "cn" child-node)
(assoc
child-index
:offset
(- byte-address (:byte-address child-index)))))
child-nodes)
_ (println "children" (count children))
index-ba (let [index-baos (ByteArrayOutputStream.)
child-byte-arrays (map pack-index-entry children)]
(println "child-byte-arrays" child-byte-arrays)
(loop [bas child-byte-arrays]
(if (empty? bas)
(.toByteArray baos)
(do (.write index-baos (first bas))
(recur (rest bas))))))]
(zip/edit
loc
(fn [node]
(println node)
(let [[k v] (first (seq node))]
(.write baos (pack-node-value v))
(.write baos (tpt/vb-encode (count index-ba)))
(.write baos index-ba)
{k (conj v {:byte-address byte-address
:byte-array (.toByteArray baos)})})))))
(defprotocol ITrie (defprotocol ITrie
(as-map [this] "Map that underlies trie.") (as-map [this] "Map that underlies trie.")
(as-vec [this] "Depth-first post-order vector") (as-vec [this] "Depth-first post-order vector.")
(as-byte-array [this] "Tightly-packed byte-array.")
(transform [this f] "Depth-first post-order apply each function to each node.")) (transform [this f] "Depth-first post-order apply each function to each node."))
;; Seq offers a depth-first post-order traversal ;; Seq offers a depth-first post-order traversal
@ -135,6 +254,11 @@
ITrie ITrie
(as-map [_] trie) (as-map [_] trie)
(as-vec [_] (map-trie->seq-trie trie)) (as-vec [_] (map-trie->seq-trie trie))
(as-byte-array [self]
(->> (transform self (visitor-filter #(map? (zip/node %)) pack-index))
as-vec
vec-trie->map-trie
(Trie.)))
(transform [self f] (transform [self f]
(->> self (->> self
as-vec as-vec
@ -182,6 +306,37 @@
(trie) (trie)
entries))) entries)))
(defn mapped-byte-array-trie->byte-array
[trie]
(let [baos (ByteArrayOutputStream.)]
(->> trie
as-vec
zip/vector-zip
(iterate zip/next)
(take-while (complement zip/end?))
(filter (comp map? zip/node))
((fn [nodes]
(loop [nodes nodes]
(if (empty? nodes)
baos
(let [node (first nodes)]
(.write baos (:byte-array node))
(recur (rest nodes))))))))))
(comment
(let [v1 '(1 2 1 121)
v2 '(1 3 1 131)
v3 '(1 2 12)
t1 (trie v1)
t2 (trie v2)
t3 (trie v1 v2 v3)
vect (as-vec t3)]
(->> (as-byte-array t3)
(mapped-byte-array-trie->byte-array)
(map zip/node)))
)
(comment (comment
(let [v1 '("T" "A" "T" "TAT") (let [v1 '("T" "A" "T" "TAT")
v2 '("T" "U" "T" "TUT") v2 '("T" "U" "T" "TUT")
@ -245,69 +400,7 @@
) )
(defn previous-node [loc]
(loop [loc (zip/prev loc)]
(cond
(nil? loc) nil
(map? (zip/node loc)) loc
:else (recur (zip/prev loc)))))
(defn loc->byte-address
"Given a loc without a byte-address, calculate it from the previous loc."
[loc]
(let [prev (previous-node loc)]
(if prev
(let [[k {:keys [byte-address bytes]}] (first (seq (zip/node prev)))]
(+ byte-address (count bytes)))
0)))
(defn child->index
"Given a child gets a map with info needed to build an index."
[child]
(let [[k {:keys [byte-address] :as v}] (first (seq child))]
{:byte-address byte-address
:key k}))
(child->index {"T" {:byte-address 20}})
(defn pack-index
""
[loc]
(println "pack" (zip/node loc))
(if (map? (zip/node loc))
(let [parent-byte-address (loc->byte-address loc)
_ (println parent-byte-address)
child-nodes (->> loc
child-seq
(map (comp second zip/node)))
_ (println child-nodes)
child-key-offsets (map
(fn [child-node]
(let [{child-byte-address :byte-address
child-key :key} (child->index child-node)]
{:key child-key
:offset (- parent-byte-address child-byte-address)}))
child-nodes)]
(zip/edit
loc
(fn [node]
(println "zip" node)
(let [[k v] (first (seq node))]
{k {:byte-address parent-byte-address
:bytes (byte-array [1 2 3])}}))))
loc))
(comment
(let [v1 '("T" "A" "T" "TAT")
v2 '("T" "U" "T" "TUT")
v3 '("T" "A" "AT")
t1 (trie v1)
t2 (trie v2)
t3 (trie v1 v2 v3)
vect (as-vec t3)]
(transform t3 pack-index))
)
(comment (comment
(let [v1 '("T" "A" "T" "TAT") (let [v1 '("T" "A" "T" "TAT")

Loading…
Cancel
Save