Redesign API, add tests, add example useage
- Add tests - Refactor new version of Trie to be more coll-like - Add an example use-case using a Markov chain to generate textmain
parent
56be9e9898
commit
77475e0c13
@ -0,0 +1,326 @@
|
||||
(ns com.owoga.tightly-packed-trie
|
||||
(:require [com.owoga.trie :as trie]
|
||||
[com.owoga.tightly-packed-trie.encoding :as encoding]
|
||||
[clojure.java.io :as io]
|
||||
[clojure.string :as string]
|
||||
[com.owoga.tightly-packed-trie.bit-manip :as bm]
|
||||
[clojure.zip :as zip])
|
||||
(:import (java.io ByteArrayOutputStream ByteArrayInputStream
|
||||
DataOutputStream DataInputStream)))
|
||||
|
||||
;; A trie data structure that can be converted to
|
||||
;; a contiguous array of bytes while maintaining
|
||||
;; efficient lookups.
|
||||
;;
|
||||
;; A regular Clojure hash-map can be used as a trie,
|
||||
;; but there's a lot of memory overhead that comes along
|
||||
;; with hash-maps.
|
||||
;;
|
||||
;; To conveniently get the benefit of packing a trie into a contiguous array of
|
||||
;; bytes, there are a few restrictions.
|
||||
;;
|
||||
;; Everything must be numeric IDs. Keys must be numeric IDs.
|
||||
;; Values must be numeric IDs. Everything must be a number.
|
||||
;;
|
||||
;; This lets us encode everything as variable-length-encoded byte arrays.
|
||||
;;
|
||||
;; To maximize efficiency, your most common keys should have the
|
||||
;; smallest IDs.
|
||||
|
||||
(defmacro wrap-byte-buffer
|
||||
"Saves the position and limit of a byte buffer, runs body,
|
||||
returns byte buffer to original position and limit."
|
||||
[byte-buffer & body]
|
||||
`(let [original-position# (.position ~byte-buffer)
|
||||
original-limit# (.limit ~byte-buffer)]
|
||||
(try (do ~@body)
|
||||
(finally
|
||||
(.limit ~byte-buffer original-limit#)
|
||||
(.position ~byte-buffer original-position#)))))
|
||||
|
||||
(defn trie->depth-first-post-order-traversable-zipperable-vector
|
||||
([path node decode-value-fn]
|
||||
(vec
|
||||
(map
|
||||
(fn [child]
|
||||
[(trie->depth-first-post-order-traversable-zipperable-vector
|
||||
(conj path (.key child))
|
||||
child
|
||||
decode-value-fn)
|
||||
(wrap-byte-buffer
|
||||
(.byte-buffer child)
|
||||
(.limit (.byte-buffer child) (.limit child))
|
||||
(.position (.byte-buffer child) (.address child))
|
||||
(clojure.lang.MapEntry.
|
||||
(conj path (.key child))
|
||||
(decode-value-fn (.byte-buffer child))))])
|
||||
(trie/children node)))))
|
||||
|
||||
(defn rewind-to-key [bb stop]
|
||||
(loop []
|
||||
(let [current (.get bb (.position bb))
|
||||
previous (.get bb (dec (.position bb)))]
|
||||
(if (or (= stop (.position bb))
|
||||
(and (encoding/key-byte? current)
|
||||
(encoding/offset-byte? previous)))
|
||||
bb
|
||||
(do (.position bb (dec (.position bb)))
|
||||
(recur))))))
|
||||
|
||||
(defn forward-to-key [bb stop]
|
||||
(loop []
|
||||
(if (or (= stop (.position bb))
|
||||
(and (encoding/key-byte? (.get bb (.position bb)))
|
||||
(encoding/offset-byte?
|
||||
(.get bb (inc (.position bb))))))
|
||||
bb
|
||||
(do (.position bb (inc (.position bb)))
|
||||
(recur)))))
|
||||
|
||||
(defn find-key-in-index
|
||||
[bb target-key max-address not-found]
|
||||
(.limit bb max-address)
|
||||
(let [key
|
||||
(loop [previous-key nil
|
||||
min-position (.position bb)
|
||||
max-position max-address]
|
||||
(if (zero? (- max-position min-position))
|
||||
not-found
|
||||
(let [mid-position (+ min-position (quot (- max-position min-position) 2))]
|
||||
(.position bb mid-position)
|
||||
(let [bb (rewind-to-key bb min-position)
|
||||
current-key
|
||||
(encoding/decode-number-from-tightly-packed-trie-index bb)]
|
||||
(cond
|
||||
(= current-key target-key)
|
||||
(encoding/decode-number-from-tightly-packed-trie-index bb)
|
||||
|
||||
(= current-key previous-key)
|
||||
(do
|
||||
(encoding/decode-number-from-tightly-packed-trie-index bb)
|
||||
(let [final-key (encoding/decode-number-from-tightly-packed-trie-index bb)]
|
||||
(if (= target-key final-key)
|
||||
(encoding/decode-number-from-tightly-packed-trie-index bb)
|
||||
(throw (Exception. "Key not found.")))))
|
||||
|
||||
(< current-key target-key)
|
||||
;; Chew the next decoded number. It's a useless offset.
|
||||
(do
|
||||
(encoding/decode-number-from-tightly-packed-trie-index bb)
|
||||
(recur
|
||||
current-key
|
||||
(.position bb)
|
||||
max-position))
|
||||
|
||||
(> current-key target-key)
|
||||
;; This could also be rewound.
|
||||
(do
|
||||
(rewind-to-key bb min-position)
|
||||
(recur
|
||||
current-key
|
||||
min-position
|
||||
(.position bb))))))))]
|
||||
(.limit bb (.capacity bb))
|
||||
key))
|
||||
|
||||
(defn tightly-packed-trie-node-value
|
||||
[byte-buffer]
|
||||
(let [value (encoding/decode byte-buffer)
|
||||
freq (encoding/decode byte-buffer)]
|
||||
{:id value
|
||||
:count freq}))
|
||||
|
||||
(defn -value [trie value-decode-fn]
|
||||
(wrap-byte-buffer
|
||||
(.byte-buffer trie)
|
||||
(.limit (.byte-buffer trie) (.limit trie))
|
||||
(.position (.byte-buffer trie) (.address trie))
|
||||
(value-decode-fn (.byte-buffer trie))))
|
||||
|
||||
(deftype TightlyPackedTrie [byte-buffer key address limit value-decode-fn]
|
||||
trie/ITrie
|
||||
(lookup [self ks]
|
||||
(wrap-byte-buffer
|
||||
byte-buffer
|
||||
(.limit byte-buffer limit)
|
||||
(.position byte-buffer address)
|
||||
(if (empty? ks)
|
||||
self
|
||||
(let [val (value-decode-fn byte-buffer)
|
||||
size-of-index (encoding/decode byte-buffer)
|
||||
offset (find-key-in-index
|
||||
byte-buffer
|
||||
(first ks)
|
||||
(+ (.position byte-buffer) size-of-index)
|
||||
:not-found)]
|
||||
(if (= offset :not-found)
|
||||
nil
|
||||
(let [child (TightlyPackedTrie.
|
||||
byte-buffer
|
||||
(first ks)
|
||||
(- address offset)
|
||||
(.capacity byte-buffer)
|
||||
value-decode-fn)]
|
||||
(trie/lookup child (rest ks))))))))
|
||||
(children [self]
|
||||
(wrap-byte-buffer
|
||||
byte-buffer
|
||||
(.limit byte-buffer limit)
|
||||
(.position byte-buffer address)
|
||||
(let [val (value-decode-fn byte-buffer)
|
||||
size-of-index (encoding/decode byte-buffer)]
|
||||
(.limit byte-buffer (+ (.position byte-buffer)
|
||||
size-of-index))
|
||||
(loop [children []]
|
||||
(if (= (.position byte-buffer) (.limit byte-buffer))
|
||||
children
|
||||
(let [child-key (encoding/decode-number-from-tightly-packed-trie-index byte-buffer)
|
||||
child-offset (encoding/decode-number-from-tightly-packed-trie-index byte-buffer)]
|
||||
(recur
|
||||
(conj
|
||||
children
|
||||
(TightlyPackedTrie.
|
||||
byte-buffer
|
||||
child-key
|
||||
(- address child-offset)
|
||||
(.capacity byte-buffer)
|
||||
value-decode-fn)))))))))
|
||||
|
||||
clojure.lang.ILookup
|
||||
(valAt [self ks]
|
||||
(if-let [node (trie/lookup self ks)]
|
||||
(-value node value-decode-fn)
|
||||
nil))
|
||||
(valAt [self ks not-found]
|
||||
(or (get self ks) not-found))
|
||||
|
||||
clojure.lang.Seqable
|
||||
(seq [trie]
|
||||
(->> trie
|
||||
(#(trie->depth-first-post-order-traversable-zipperable-vector
|
||||
[]
|
||||
%
|
||||
value-decode-fn))
|
||||
zip/vector-zip
|
||||
(iterate zip/next)
|
||||
(take-while (complement zip/end?))
|
||||
(map zip/node)
|
||||
(filter (partial instance? clojure.lang.MapEntry))
|
||||
(#(if (empty? %) nil %)))))
|
||||
|
||||
(defn tightly-packed-trie
|
||||
[trie value-encode-fn value-decode-fn]
|
||||
(let [baos (ByteArrayOutputStream.)]
|
||||
(loop [nodes (seq trie)
|
||||
current-offset 8
|
||||
previous-depth 0
|
||||
child-indexes []]
|
||||
(let [current-node (first nodes)
|
||||
current-depth (count (first current-node))]
|
||||
(cond
|
||||
(empty? nodes)
|
||||
(let [child-index (first child-indexes)
|
||||
child-index-baos (ByteArrayOutputStream.)
|
||||
_ (->> child-index
|
||||
(run!
|
||||
(fn [[key offset]]
|
||||
(.write
|
||||
child-index-baos
|
||||
(encoding/encode-key-to-tightly-packed-trie-index key))
|
||||
(.write
|
||||
child-index-baos
|
||||
(encoding/encode-offset-to-tightly-packed-trie-index
|
||||
(- current-offset offset))))))
|
||||
child-index-byte-array (.toByteArray child-index-baos)
|
||||
size-of-child-index (encoding/encode (count child-index-byte-array))
|
||||
root-address current-offset
|
||||
value (value-encode-fn 0)]
|
||||
(.write baos value)
|
||||
(.write baos size-of-child-index)
|
||||
(.write baos child-index-byte-array)
|
||||
(let [ba (.toByteArray baos)
|
||||
byte-buf (java.nio.ByteBuffer/allocate (+ 8 (count ba)))]
|
||||
(do (.putLong byte-buf root-address)
|
||||
(.put byte-buf ba)
|
||||
(.rewind byte-buf)
|
||||
(->TightlyPackedTrie
|
||||
byte-buf
|
||||
0
|
||||
(.getLong byte-buf)
|
||||
(.capacity byte-buf)
|
||||
value-decode-fn))))
|
||||
|
||||
;; Gone up from depth to a parent.
|
||||
;; Process index of children.
|
||||
(> previous-depth current-depth)
|
||||
(do (let [[k v] (first nodes)
|
||||
value (value-encode-fn v)
|
||||
child-index (first child-indexes)
|
||||
child-index-baos (ByteArrayOutputStream.)
|
||||
_ (->> child-index
|
||||
(run!
|
||||
(fn [[key offset]]
|
||||
(.write
|
||||
child-index-baos
|
||||
(encoding/encode-key-to-tightly-packed-trie-index key))
|
||||
(.write
|
||||
child-index-baos
|
||||
(encoding/encode-offset-to-tightly-packed-trie-index
|
||||
(- current-offset offset))))))
|
||||
child-index-byte-array (.toByteArray child-index-baos)
|
||||
size-of-child-index (encoding/encode (count child-index-byte-array))
|
||||
current-index (second child-indexes)]
|
||||
(.write baos value)
|
||||
(.write baos size-of-child-index)
|
||||
(.write baos child-index-byte-array)
|
||||
(recur (rest nodes)
|
||||
(+ current-offset
|
||||
(count value)
|
||||
(count size-of-child-index)
|
||||
(count child-index-byte-array))
|
||||
current-depth
|
||||
(cons (conj current-index
|
||||
[(last k)
|
||||
current-offset])
|
||||
(drop 2 child-indexes)))))
|
||||
|
||||
;; Down or even in depth to children
|
||||
;; Start keeping track of new children index
|
||||
:else
|
||||
(do (let [[k v] (first nodes)
|
||||
value (value-encode-fn v)
|
||||
size-of-child-index (encoding/encode 0)
|
||||
child-indexes (concat (repeat (- current-depth previous-depth) [])
|
||||
child-indexes)
|
||||
current-child-index (first child-indexes)]
|
||||
(.write baos value)
|
||||
(.write baos size-of-child-index)
|
||||
(recur (rest nodes)
|
||||
(+ current-offset
|
||||
(count value)
|
||||
(count size-of-child-index))
|
||||
current-depth
|
||||
(cons (conj current-child-index
|
||||
[(last k)
|
||||
current-offset])
|
||||
(rest child-indexes))))))))))
|
||||
|
||||
;; TODO: Shared "save" interface for Trie?
|
||||
(defn save-tightly-packed-trie-to-file
|
||||
[filepath trie]
|
||||
(with-open [o (io/output-stream filepath)]
|
||||
(.write o (.array (.byte-buffer trie)))))
|
||||
|
||||
(defn load-tightly-packed-trie-from-file
|
||||
[filepath value-decode-fn]
|
||||
(with-open [i (io/input-stream filepath)
|
||||
baos (ByteArrayOutputStream.)]
|
||||
(io/copy i baos)
|
||||
(let [byte-buffer (java.nio.ByteBuffer/wrap (.toByteArray baos))]
|
||||
(.rewind byte-buffer)
|
||||
(->TightlyPackedTrie
|
||||
byte-buffer
|
||||
0
|
||||
(.getLong byte-buffer)
|
||||
(.capacity byte-buffer)
|
||||
value-decode-fn))))
|
@ -1,773 +0,0 @@
|
||||
(ns com.owoga.tightly-packed-trie.core
|
||||
(:require [clojure.zip :as zip]
|
||||
[com.owoga.tightly-packed-trie.encoding :as encoding]
|
||||
[com.owoga.tightly-packed-trie.bit-manip :as bm]
|
||||
[clojure.java.io :as io]
|
||||
[clojure.string :as string])
|
||||
(:import (java.io ByteArrayOutputStream ByteArrayInputStream
|
||||
DataOutputStream DataInputStream)))
|
||||
|
||||
;; A trie data structure that can be converted to
|
||||
;; a contiguous array of bytes while maintaining
|
||||
;; efficient lookups.
|
||||
;;
|
||||
;; A regular Clojure hash-map can be used as a trie,
|
||||
;; but there's a lot of memory overhead that comes along
|
||||
;; with hash-maps.
|
||||
;;
|
||||
;; To conveniently get the benefit of packing a trie into a contiguous array of
|
||||
;; bytes, there are a few restrictions.
|
||||
;;
|
||||
;; Everything must be numeric IDs. Keys must be numeric IDs.
|
||||
;; Values must be numeric IDs. Everything must be a number.
|
||||
;;
|
||||
;; This lets us encode everything as variable-length-encoded byte arrays.
|
||||
;;
|
||||
;; To maximize efficiency, your most common keys should have the
|
||||
;; smallest IDs.
|
||||
|
||||
|
||||
;; It's convenient to work with hash-map representations
|
||||
;; while developing.
|
||||
;;
|
||||
;; {"T" {:children:
|
||||
;; "I": {:value "IT"}
|
||||
;; "A": {:value "AT"}}}
|
||||
;;
|
||||
;; If you want to work with a hash-map representation and
|
||||
;; want this library to handle conversion of the trie,
|
||||
;; then your hash-map version will need to follow a few conventions.
|
||||
;;
|
||||
;; For example, this code expects child nodes to be key/value pairs under
|
||||
;; the :children key.
|
||||
|
||||
|
||||
;; To pack a trie into a contiguous array of bytes
|
||||
;; and still be able to find a key in a list of
|
||||
;; child indexes in an efficient way, the child
|
||||
;; indexes need to be sorted so they can be
|
||||
;; binary-searched.
|
||||
;;
|
||||
;; When working with a hash-map-backed Trie,
|
||||
;; it's convenient to use update-in to add
|
||||
;; new values to the trie. But the default
|
||||
;; update-in creates unsorted hash-maps when it encounters
|
||||
;; a new key. This has the same functionality of update-in,
|
||||
;; but new keys are given sorted-map values.
|
||||
(defn update-in-sorted
|
||||
"'Updates' a value in a nested associative structure, where ks is a
|
||||
sequence of keys and f is a function that will take the old value
|
||||
and any supplied args and return the new value, and returns a new
|
||||
nested structure. If any levels do not exist, hash-maps will be
|
||||
created."
|
||||
{:added "1.0"
|
||||
:static true}
|
||||
([m ks f & args]
|
||||
(let [up (fn up [m ks f args]
|
||||
(let [m (or m (sorted-map))
|
||||
[k & ks] ks]
|
||||
(assert (instance? clojure.lang.PersistentTreeMap m)
|
||||
(apply str
|
||||
"A non-sorted hash-map in a sorted"
|
||||
"hash-map will probably be the seed of some problems."))
|
||||
(if ks
|
||||
(assoc m k (up (get m k) ks f args))
|
||||
(assoc m k (apply f (get m k) args)))))]
|
||||
(up m ks f args))))
|
||||
|
||||
|
||||
(defn children-of-map-trie-node
|
||||
"Grab the children of a node.
|
||||
|
||||
A node is a map of {node-key {:children {,,,} ,,,}}.
|
||||
|
||||
This functions gives you the child nodes not as a single map
|
||||
where all the child keys are part of the same map, but instead as a
|
||||
a seq where each value is a single node.
|
||||
|
||||
This is a useful helper for turning a Trie from a map into a
|
||||
depth-first post-order traversable zipper."
|
||||
[node]
|
||||
(let [[k {:keys [children]}] (first (seq node))]
|
||||
(->> children
|
||||
(map (partial apply hash-map)))))
|
||||
|
||||
(comment
|
||||
(let [root-node {:root {:children {"T" {:some 'val} "U" {:other 'val}}}}]
|
||||
(children-of-map-trie-node root-node))
|
||||
;; => ({"T" {:some val}} {"U" {:other val}})
|
||||
)
|
||||
|
||||
(defn without-children [node]
|
||||
(let [[k v] (first (seq node))]
|
||||
{k (dissoc v :children)}))
|
||||
|
||||
(defn map->depth-first-post-order-traversable-zipperable-vector
|
||||
[node]
|
||||
[(vec
|
||||
(map
|
||||
map->depth-first-post-order-traversable-zipperable-vector
|
||||
(children-of-map-trie-node node)))
|
||||
(without-children node)])
|
||||
|
||||
(comment
|
||||
;; This comment demonstrates how we change the order in which
|
||||
;; we traverse the map. If we don't turn each node into a vector
|
||||
;; where the list of children are first, then the parent node would
|
||||
;; get traversed befor the children. In the example below, the
|
||||
;; "AT" node would be traversed befor the "SAT" and "TAT" nodes.
|
||||
(let [m {:root
|
||||
{:children
|
||||
{"T"
|
||||
{:children
|
||||
{"A" {:children
|
||||
{"T" {:value "TAT", :count 1}
|
||||
"S" {:value "SAT" :count 1}}
|
||||
:value "AT"
|
||||
:count 1},
|
||||
"U" {:children {"T" {:value "TUT", :count 1}}}}}}}}]
|
||||
(let [z (zip/vector-zip
|
||||
(map->depth-first-post-order-traversable-zipperable-vector m))]
|
||||
(->> z
|
||||
(iterate zip/next)
|
||||
(take-while (complement zip/end?))
|
||||
(map zip/node)
|
||||
(filter map?))))
|
||||
;; => ({"T" {:value "TAT", :count 1}}
|
||||
;; {"S" {:value "SAT", :count 1}}
|
||||
;; {"A" {:value "AT", :count 1}}
|
||||
;; {"T" {:value "TUT", :count 1}}
|
||||
;; {"U" {}}
|
||||
;; {"T" {}}
|
||||
;; {:root {}})
|
||||
)
|
||||
|
||||
(defn depth-first-post-order-traversable-zipperable-vector->map
|
||||
"Parity reversal of the code above. Maps are easier to reason about
|
||||
while developing. It's just inconvenient to traverse them in the
|
||||
order needed by the algorithms we use to pack them into a contiguous
|
||||
array of byte for the tightly packed trie."
|
||||
[node]
|
||||
(let [children (first node)
|
||||