Redesign API, add tests, add example useage
- Add tests - Refactor new version of Trie to be more coll-like - Add an example use-case using a Markov chain to generate textmain
parent
56be9e9898
commit
77475e0c13
@ -0,0 +1,326 @@
|
||||
(ns com.owoga.tightly-packed-trie
|
||||
(:require [com.owoga.trie :as trie]
|
||||
[com.owoga.tightly-packed-trie.encoding :as encoding]
|
||||
[clojure.java.io :as io]
|
||||
[clojure.string :as string]
|
||||
[com.owoga.tightly-packed-trie.bit-manip :as bm]
|
||||
[clojure.zip :as zip])
|
||||
(:import (java.io ByteArrayOutputStream ByteArrayInputStream
|
||||
DataOutputStream DataInputStream)))
|
||||
|
||||
;; A trie data structure that can be converted to
|
||||
;; a contiguous array of bytes while maintaining
|
||||
;; efficient lookups.
|
||||
;;
|
||||
;; A regular Clojure hash-map can be used as a trie,
|
||||
;; but there's a lot of memory overhead that comes along
|
||||
;; with hash-maps.
|
||||
;;
|
||||
;; To conveniently get the benefit of packing a trie into a contiguous array of
|
||||
;; bytes, there are a few restrictions.
|
||||
;;
|
||||
;; Everything must be numeric IDs. Keys must be numeric IDs.
|
||||
;; Values must be numeric IDs. Everything must be a number.
|
||||
;;
|
||||
;; This lets us encode everything as variable-length-encoded byte arrays.
|
||||
;;
|
||||
;; To maximize efficiency, your most common keys should have the
|
||||
;; smallest IDs.
|
||||
|
||||
(defmacro wrap-byte-buffer
|
||||
"Saves the position and limit of a byte buffer, runs body,
|
||||
returns byte buffer to original position and limit."
|
||||
[byte-buffer & body]
|
||||
`(let [original-position# (.position ~byte-buffer)
|
||||
original-limit# (.limit ~byte-buffer)]
|
||||
(try (do ~@body)
|
||||
(finally
|
||||
(.limit ~byte-buffer original-limit#)
|
||||
(.position ~byte-buffer original-position#)))))
|
||||
|
||||
(defn trie->depth-first-post-order-traversable-zipperable-vector
|
||||
([path node decode-value-fn]
|
||||
(vec
|
||||
(map
|
||||
(fn [child]
|
||||
[(trie->depth-first-post-order-traversable-zipperable-vector
|
||||
(conj path (.key child))
|
||||
child
|
||||
decode-value-fn)
|
||||
(wrap-byte-buffer
|
||||
(.byte-buffer child)
|
||||
(.limit (.byte-buffer child) (.limit child))
|
||||
(.position (.byte-buffer child) (.address child))
|
||||
(clojure.lang.MapEntry.
|
||||
(conj path (.key child))
|
||||
(decode-value-fn (.byte-buffer child))))])
|
||||
(trie/children node)))))
|
||||
|
||||
(defn rewind-to-key [bb stop]
|
||||
(loop []
|
||||
(let [current (.get bb (.position bb))
|
||||
previous (.get bb (dec (.position bb)))]
|
||||
(if (or (= stop (.position bb))
|
||||
(and (encoding/key-byte? current)
|
||||
(encoding/offset-byte? previous)))
|
||||
bb
|
||||
(do (.position bb (dec (.position bb)))
|
||||
(recur))))))
|
||||
|
||||
(defn forward-to-key [bb stop]
|
||||
(loop []
|
||||
(if (or (= stop (.position bb))
|
||||
(and (encoding/key-byte? (.get bb (.position bb)))
|
||||
(encoding/offset-byte?
|
||||
(.get bb (inc (.position bb))))))
|
||||
bb
|
||||
(do (.position bb (inc (.position bb)))
|
||||
(recur)))))
|
||||
|
||||
(defn find-key-in-index
|
||||
[bb target-key max-address not-found]
|
||||
(.limit bb max-address)
|
||||
(let [key
|
||||
(loop [previous-key nil
|
||||
min-position (.position bb)
|
||||
max-position max-address]
|
||||
(if (zero? (- max-position min-position))
|
||||
not-found
|
||||
(let [mid-position (+ min-position (quot (- max-position min-position) 2))]
|
||||
(.position bb mid-position)
|
||||
(let [bb (rewind-to-key bb min-position)
|
||||
current-key
|
||||
(encoding/decode-number-from-tightly-packed-trie-index bb)]
|
||||
(cond
|
||||
(= current-key target-key)
|
||||
(encoding/decode-number-from-tightly-packed-trie-index bb)
|
||||
|
||||
(= current-key previous-key)
|
||||
(do
|
||||
(encoding/decode-number-from-tightly-packed-trie-index bb)
|
||||
(let [final-key (encoding/decode-number-from-tightly-packed-trie-index bb)]
|
||||
(if (= target-key final-key)
|
||||
(encoding/decode-number-from-tightly-packed-trie-index bb)
|
||||
(throw (Exception. "Key not found.")))))
|
||||
|
||||
(< current-key target-key)
|
||||
;; Chew the next decoded number. It's a useless offset.
|
||||
(do
|
||||
(encoding/decode-number-from-tightly-packed-trie-index bb)
|
||||
(recur
|
||||
current-key
|
||||
(.position bb)
|
||||
max-position))
|
||||
|
||||
(> current-key target-key)
|
||||
;; This could also be rewound.
|
||||
(do
|
||||
(rewind-to-key bb min-position)
|
||||
(recur
|
||||
current-key
|
||||
min-position
|
||||
(.position bb))))))))]
|
||||
(.limit bb (.capacity bb))
|
||||
key))
|
||||
|
||||
(defn tightly-packed-trie-node-value
|
||||
[byte-buffer]
|
||||
(let [value (encoding/decode byte-buffer)
|
||||
freq (encoding/decode byte-buffer)]
|
||||
{:id value
|
||||
:count freq}))
|
||||
|
||||
(defn -value [trie value-decode-fn]
|
||||
(wrap-byte-buffer
|
||||
(.byte-buffer trie)
|
||||
(.limit (.byte-buffer trie) (.limit trie))
|
||||
(.position (.byte-buffer trie) (.address trie))
|
||||
(value-decode-fn (.byte-buffer trie))))
|
||||
|
||||
(deftype TightlyPackedTrie [byte-buffer key address limit value-decode-fn]
|
||||
trie/ITrie
|
||||
(lookup [self ks]
|
||||
(wrap-byte-buffer
|
||||
byte-buffer
|
||||
(.limit byte-buffer limit)
|
||||
(.position byte-buffer address)
|
||||
(if (empty? ks)
|
||||
self
|
||||
(let [val (value-decode-fn byte-buffer)
|
||||
size-of-index (encoding/decode byte-buffer)
|
||||
offset (find-key-in-index
|
||||
byte-buffer
|
||||
(first ks)
|
||||
(+ (.position byte-buffer) size-of-index)
|
||||
:not-found)]
|
||||
(if (= offset :not-found)
|
||||
nil
|
||||
(let [child (TightlyPackedTrie.
|
||||
byte-buffer
|
||||
(first ks)
|
||||
(- address offset)
|
||||
(.capacity byte-buffer)
|
||||
value-decode-fn)]
|
||||
(trie/lookup child (rest ks))))))))
|
||||
(children [self]
|
||||
(wrap-byte-buffer
|
||||
byte-buffer
|
||||
(.limit byte-buffer limit)
|
||||
(.position byte-buffer address)
|
||||
(let [val (value-decode-fn byte-buffer)
|
||||
size-of-index (encoding/decode byte-buffer)]
|
||||
(.limit byte-buffer (+ (.position byte-buffer)
|
||||
size-of-index))
|
||||
(loop [children []]
|
||||
(if (= (.position byte-buffer) (.limit byte-buffer))
|
||||
children
|
||||
(let [child-key (encoding/decode-number-from-tightly-packed-trie-index byte-buffer)
|
||||
child-offset (encoding/decode-number-from-tightly-packed-trie-index byte-buffer)]
|
||||
(recur
|
||||
(conj
|
||||
children
|
||||
(TightlyPackedTrie.
|
||||
byte-buffer
|
||||
child-key
|
||||
(- address child-offset)
|
||||
(.capacity byte-buffer)
|
||||
value-decode-fn)))))))))
|
||||
|
||||
clojure.lang.ILookup
|
||||
(valAt [self ks]
|
||||
(if-let [node (trie/lookup self ks)]
|
||||
(-value node value-decode-fn)
|
||||
nil))
|
||||
(valAt [self ks not-found]
|
||||
(or (get self ks) not-found))
|
||||
|
||||
clojure.lang.Seqable
|
||||
(seq [trie]
|
||||
(->> trie
|
||||
(#(trie->depth-first-post-order-traversable-zipperable-vector
|
||||
[]
|
||||
%
|
||||
value-decode-fn))
|
||||
zip/vector-zip
|
||||
(iterate zip/next)
|
||||
(take-while (complement zip/end?))
|
||||
(map zip/node)
|
||||
(filter (partial instance? clojure.lang.MapEntry))
|
||||
(#(if (empty? %) nil %)))))
|
||||
|
||||
(defn tightly-packed-trie
|
||||
[trie value-encode-fn value-decode-fn]
|
||||
(let [baos (ByteArrayOutputStream.)]
|
||||
(loop [nodes (seq trie)
|
||||
current-offset 8
|
||||
previous-depth 0
|
||||
child-indexes []]
|
||||
(let [current-node (first nodes)
|
||||
current-depth (count (first current-node))]
|
||||
(cond
|
||||
(empty? nodes)
|
||||
(let [child-index (first child-indexes)
|
||||
child-index-baos (ByteArrayOutputStream.)
|
||||
_ (->> child-index
|
||||
(run!
|
||||
(fn [[key offset]]
|
||||
(.write
|
||||
child-index-baos
|
||||
(encoding/encode-key-to-tightly-packed-trie-index key))
|
||||
(.write
|
||||
child-index-baos
|
||||
(encoding/encode-offset-to-tightly-packed-trie-index
|
||||
(- current-offset offset))))))
|
||||
child-index-byte-array (.toByteArray child-index-baos)
|
||||
size-of-child-index (encoding/encode (count child-index-byte-array))
|
||||
root-address current-offset
|
||||
value (value-encode-fn 0)]
|
||||
(.write baos value)
|
||||
(.write baos size-of-child-index)
|
||||
(.write baos child-index-byte-array)
|
||||
(let [ba (.toByteArray baos)
|
||||
byte-buf (java.nio.ByteBuffer/allocate (+ 8 (count ba)))]
|
||||
(do (.putLong byte-buf root-address)
|
||||
(.put byte-buf ba)
|
||||
(.rewind byte-buf)
|
||||
(->TightlyPackedTrie
|
||||
byte-buf
|
||||
0
|
||||
(.getLong byte-buf)
|
||||
(.capacity byte-buf)
|
||||
value-decode-fn))))
|
||||
|
||||
;; Gone up from depth to a parent.
|
||||
;; Process index of children.
|
||||
(> previous-depth current-depth)
|
||||
(do (let [[k v] (first nodes)
|
||||
value (value-encode-fn v)
|
||||
child-index (first child-indexes)
|
||||
child-index-baos (ByteArrayOutputStream.)
|
||||
_ (->> child-index
|
||||
(run!
|
||||
(fn [[key offset]]
|
||||
(.write
|
||||
child-index-baos
|
||||
(encoding/encode-key-to-tightly-packed-trie-index key))
|
||||
(.write
|
||||
child-index-baos
|
||||
(encoding/encode-offset-to-tightly-packed-trie-index
|
||||
(- current-offset offset))))))
|
||||
child-index-byte-array (.toByteArray child-index-baos)
|
||||
size-of-child-index (encoding/encode (count child-index-byte-array))
|
||||
current-index (second child-indexes)]
|
||||
(.write baos value)
|
||||
(.write baos size-of-child-index)
|
||||
(.write baos child-index-byte-array)
|
||||
(recur (rest nodes)
|
||||
(+ current-offset
|
||||
(count value)
|
||||
(count size-of-child-index)
|
||||
(count child-index-byte-array))
|
||||
current-depth
|
||||
(cons (conj current-index
|
||||
[(last k)
|
||||
current-offset])
|
||||
(drop 2 child-indexes)))))
|
||||
|
||||
;; Down or even in depth to children
|
||||
;; Start keeping track of new children index
|
||||
:else
|
||||
(do (let [[k v] (first nodes)
|
||||
value (value-encode-fn v)
|
||||
size-of-child-index (encoding/encode 0)
|
||||
child-indexes (concat (repeat (- current-depth previous-depth) [])
|
||||
child-indexes)
|
||||
current-child-index (first child-indexes)]
|
||||
(.write baos value)
|
||||
(.write baos size-of-child-index)
|
||||
(recur (rest nodes)
|
||||
(+ current-offset
|
||||
(count value)
|
||||
(count size-of-child-index))
|
||||
current-depth
|
||||
(cons (conj current-child-index
|
||||
[(last k)
|
||||
current-offset])
|
||||
(rest child-indexes))))))))))
|
||||
|
||||
;; TODO: Shared "save" interface for Trie?
|
||||
(defn save-tightly-packed-trie-to-file
|
||||
[filepath trie]
|
||||
(with-open [o (io/output-stream filepath)]
|
||||
(.write o (.array (.byte-buffer trie)))))
|
||||
|
||||
(defn load-tightly-packed-trie-from-file
|
||||
[filepath value-decode-fn]
|
||||
(with-open [i (io/input-stream filepath)
|
||||
baos (ByteArrayOutputStream.)]
|
||||
(io/copy i baos)
|
||||
(let [byte-buffer (java.nio.ByteBuffer/wrap (.toByteArray baos))]
|
||||
(.rewind byte-buffer)
|
||||
(->TightlyPackedTrie
|
||||
byte-buffer
|
||||
0
|
||||
(.getLong byte-buffer)
|
||||
(.capacity byte-buffer)
|
||||
value-decode-fn))))
|
@ -1,773 +0,0 @@
|
||||
(ns com.owoga.tightly-packed-trie.core
|
||||
(:require [clojure.zip :as zip]
|
||||
[com.owoga.tightly-packed-trie.encoding :as encoding]
|
||||
[com.owoga.tightly-packed-trie.bit-manip :as bm]
|
||||
[clojure.java.io :as io]
|
||||
[clojure.string :as string])
|
||||
(:import (java.io ByteArrayOutputStream ByteArrayInputStream
|
||||
DataOutputStream DataInputStream)))
|
||||
|
||||
;; A trie data structure that can be converted to
|
||||
;; a contiguous array of bytes while maintaining
|
||||
;; efficient lookups.
|
||||
;;
|
||||
;; A regular Clojure hash-map can be used as a trie,
|
||||
;; but there's a lot of memory overhead that comes along
|
||||
;; with hash-maps.
|
||||
;;
|
||||
;; To conveniently get the benefit of packing a trie into a contiguous array of
|
||||
;; bytes, there are a few restrictions.
|
||||
;;
|
||||
;; Everything must be numeric IDs. Keys must be numeric IDs.
|
||||
;; Values must be numeric IDs. Everything must be a number.
|
||||
;;
|
||||
;; This lets us encode everything as variable-length-encoded byte arrays.
|
||||
;;
|
||||
;; To maximize efficiency, your most common keys should have the
|
||||
;; smallest IDs.
|
||||
|
||||
|
||||
;; It's convenient to work with hash-map representations
|
||||
;; while developing.
|
||||
;;
|
||||
;; {"T" {:children:
|
||||
;; "I": {:value "IT"}
|
||||
;; "A": {:value "AT"}}}
|
||||
;;
|
||||
;; If you want to work with a hash-map representation and
|
||||
;; want this library to handle conversion of the trie,
|
||||
;; then your hash-map version will need to follow a few conventions.
|
||||
;;
|
||||
;; For example, this code expects child nodes to be key/value pairs under
|
||||
;; the :children key.
|
||||
|
||||
|
||||
;; To pack a trie into a contiguous array of bytes
|
||||
;; and still be able to find a key in a list of
|
||||
;; child indexes in an efficient way, the child
|
||||
;; indexes need to be sorted so they can be
|
||||
;; binary-searched.
|
||||
;;
|
||||
;; When working with a hash-map-backed Trie,
|
||||
;; it's convenient to use update-in to add
|
||||
;; new values to the trie. But the default
|
||||
;; update-in creates unsorted hash-maps when it encounters
|
||||
;; a new key. This has the same functionality of update-in,
|
||||
;; but new keys are given sorted-map values.
|
||||
(defn update-in-sorted
|
||||
"'Updates' a value in a nested associative structure, where ks is a
|
||||
sequence of keys and f is a function that will take the old value
|
||||
and any supplied args and return the new value, and returns a new
|
||||
nested structure. If any levels do not exist, hash-maps will be
|
||||
created."
|
||||
{:added "1.0"
|
||||
:static true}
|
||||
([m ks f & args]
|
||||
(let [up (fn up [m ks f args]
|
||||
(let [m (or m (sorted-map))
|
||||
[k & ks] ks]
|
||||
(assert (instance? clojure.lang.PersistentTreeMap m)
|
||||
(apply str
|
||||
"A non-sorted hash-map in a sorted"
|
||||
"hash-map will probably be the seed of some problems."))
|
||||
(if ks
|
||||
(assoc m k (up (get m k) ks f args))
|
||||
(assoc m k (apply f (get m k) args)))))]
|
||||
(up m ks f args))))
|
||||
|
||||
|
||||
(defn children-of-map-trie-node
|
||||
"Grab the children of a node.
|
||||
|
||||
A node is a map of {node-key {:children {,,,} ,,,}}.
|
||||
|
||||
This functions gives you the child nodes not as a single map
|
||||
where all the child keys are part of the same map, but instead as a
|
||||
a seq where each value is a single node.
|
||||
|
||||
This is a useful helper for turning a Trie from a map into a
|
||||
depth-first post-order traversable zipper."
|
||||
[node]
|
||||
(let [[k {:keys [children]}] (first (seq node))]
|
||||
(->> children
|
||||
(map (partial apply hash-map)))))
|
||||
|
||||
(comment
|
||||
(let [root-node {:root {:children {"T" {:some 'val} "U" {:other 'val}}}}]
|
||||
(children-of-map-trie-node root-node))
|
||||
;; => ({"T" {:some val}} {"U" {:other val}})
|
||||
)
|
||||
|
||||
(defn without-children [node]
|
||||
(let [[k v] (first (seq node))]
|
||||
{k (dissoc v :children)}))
|
||||
|
||||
(defn map->depth-first-post-order-traversable-zipperable-vector
|
||||
[node]
|
||||
[(vec
|
||||
(map
|
||||
map->depth-first-post-order-traversable-zipperable-vector
|
||||
(children-of-map-trie-node node)))
|
||||
(without-children node)])
|
||||
|
||||
(comment
|
||||
;; This comment demonstrates how we change the order in which
|
||||
;; we traverse the map. If we don't turn each node into a vector
|
||||
;; where the list of children are first, then the parent node would
|
||||
;; get traversed befor the children. In the example below, the
|
||||
;; "AT" node would be traversed befor the "SAT" and "TAT" nodes.
|
||||
(let [m {:root
|
||||
{:children
|
||||
{"T"
|
||||
{:children
|
||||
{"A" {:children
|
||||
{"T" {:value "TAT", :count 1}
|
||||
"S" {:value "SAT" :count 1}}
|
||||
:value "AT"
|
||||
:count 1},
|
||||
"U" {:children {"T" {:value "TUT", :count 1}}}}}}}}]
|
||||
(let [z (zip/vector-zip
|
||||
(map->depth-first-post-order-traversable-zipperable-vector m))]
|
||||
(->> z
|
||||
(iterate zip/next)
|
||||
(take-while (complement zip/end?))
|
||||
(map zip/node)
|
||||
(filter map?))))
|
||||
;; => ({"T" {:value "TAT", :count 1}}
|
||||
;; {"S" {:value "SAT", :count 1}}
|
||||
;; {"A" {:value "AT", :count 1}}
|
||||
;; {"T" {:value "TUT", :count 1}}
|
||||
;; {"U" {}}
|
||||
;; {"T" {}}
|
||||
;; {:root {}})
|
||||
)
|
||||
|
||||
(defn depth-first-post-order-traversable-zipperable-vector->map
|
||||
"Parity reversal of the code above. Maps are easier to reason about
|
||||
while developing. It's just inconvenient to traverse them in the
|
||||
order needed by the algorithms we use to pack them into a contiguous
|
||||
array of byte for the tightly packed trie."
|
||||
[node]
|
||||
(let [children (first node)
|
||||
parent (second node)
|
||||
[parent-key parent-val] (first (seq parent))]
|
||||
(sorted-map
|
||||
parent-key
|
||||
(assoc
|
||||
parent-val
|
||||
:children
|
||||
(into
|
||||
(sorted-map)
|
||||
(map depth-first-post-order-traversable-zipperable-vector->map children))))))
|
||||
|
||||
(comment
|
||||
(let [m {:root
|
||||
{:children
|
||||
{"T"
|
||||
{:children
|
||||
{"A" {:children
|
||||
{"T" {:value "TAT", :count 1}
|
||||
"S" {:value "SAT" :count 1}}
|
||||
:value "AT"
|
||||
:count 1},
|
||||
"U" {:children {"T" {:value "TUT", :count 1}}}}}}}}]
|
||||
(let [vect (map->depth-first-post-order-traversable-zipperable-vector m)]
|
||||
(depth-first-post-order-traversable-zipperable-vector->map vect)))
|
||||
;; => {:root
|
||||
;; {:children
|
||||
;; {"T"
|
||||
;; {:children
|
||||
;; {"A"
|
||||
;; {:value "AT",
|
||||
;; :count 1,
|
||||
;; :children
|
||||
;; {"S" {:value "SAT", :count 1, :children {}},
|
||||
;; "T" {:value "TAT", :count 1, :children {}}}},
|
||||
;; "U" {:children {"T" {:value "TUT", :count 1, :children {}}}}}}}}}
|
||||
)
|
||||
|
||||
;;;; Zipper utilities
|
||||
;;
|
||||
;; Some useful functions for traversing/transforming zippers.
|
||||
;;
|
||||
;; This is what a tree looks like.
|
||||
;;
|
||||
;; [1 [2 [3 4 5] [6 7]]]
|
||||
;; { }
|
||||
;; / \
|
||||
;; / \
|
||||
;; 1 [2 [3 4 5] [6 7]]
|
||||
;; / | \
|
||||
;; / | \
|
||||
;; 2 [3 4 5] [6 7]
|
||||
;;
|
||||
;; Clojure's zipper gives us convenient ways
|
||||
;; to iterate over the nodes of the zipper.
|
||||
;;
|
||||
;; We can use the functions below to
|
||||
;; recreate the functionality of
|
||||
;; (map #(if (even? %) (* % %) %) coll)
|
||||
;; but with zippers instead of collections.
|
||||
(defn visitor-next
|
||||
"Visits every loc in a zipper and calls zip/next with the result of applying f
|
||||
to the loc.
|
||||
|
||||
Goes without saying that f should return a loc."
|
||||
[zipper f]
|
||||
(loop [loc zipper]
|
||||
(if (zip/end? loc)
|
||||
(zip/root loc)
|
||||
(recur (zip/next (f loc))))))
|
||||
|
||||
(defn visitor-prev
|
||||
"visitor-next in reverse."
|
||||
[zipper f]
|
||||
(loop [loc zipper]
|
||||
(if (nil? (zip/prev loc))
|
||||
(zip/root (f loc))
|
||||
(recur (zip/prev (f loc))))))
|
||||
|
||||
(defn visitor-filter
|
||||
"Helper for traversing a zipper with a visitor function.
|
||||
|
||||
Convenient for pulling filtering logic out of a visitor function
|
||||
into composable and specific filter functions."
|
||||
[pred visitor]
|
||||
(fn [loc]
|
||||
(if (pred loc)
|
||||
(visitor loc)
|
||||
loc)))
|
||||
|
||||
(comment
|
||||
;; This comment shows an example of using the visitor
|
||||
;; helpers to recreate the functionality similar to
|
||||
;; (->> [1 2 3 4 5]
|
||||
;; (map #(if (even? %) (* % %) %))
|
||||
;;
|
||||
;; This first example has the conditional logic inside
|
||||
;; the visitor function.
|
||||
(let [zipper (zip/vector-zip [1 [2 [3 4 5] [6 7]]])]
|
||||
(visitor-next
|
||||
zipper
|
||||
#(if (and (int? (zip/node %)) (even? (zip/node %)))
|
||||
(zip/edit % (fn [n] (int (Math/pow n 2))))
|
||||
%)))
|
||||
;; => [1 [4 [3 16 5] [36 7]]]
|
||||
;;
|
||||
;; This second example has a seperate filter predicate
|
||||
;; from the transformation visitor and combines them
|
||||
;; with the visitor-filter function.
|
||||
(let [zipper (zip/vector-zip [1 [2 [3 4 5] [6 7]]])
|
||||
pred (fn [loc]
|
||||
(let [node (zip/node loc)]
|
||||
(and (int? node)
|
||||
(even? node))))
|
||||
transform (fn [loc]
|
||||
(zip/edit loc (fn [n] (int (Math/pow n 2)))))]
|
||||
(visitor-next zipper (visitor-filter pred transform)))
|
||||
;; => [1 [4 [3 16 5] [36 7]]]
|
||||
)
|
||||
|
||||
(defn loc-children
|
||||
"Takes a zipper loc and returns seq of children locs.
|
||||
|
||||
Written to work with zippers of a particular structure:
|
||||
[[child1, child2, ,,,] parent]"
|
||||
[loc]
|
||||
(if (and (zip/left loc)
|
||||
(zip/down (zip/left loc)))
|
||||
(let [children (zip/down (zip/left loc))]
|
||||
(->> children
|
||||
(iterate zip/right)
|
||||
(take-while (complement nil?))))))
|
||||
|
||||
(comment
|
||||
(let [v [[1 2 3] :parent]
|
||||
z (zip/vector-zip v)]
|
||||
(->> z
|
||||
zip/down
|
||||
zip/right
|
||||
loc-children
|
||||
(map zip/node)))
|
||||
;; => (1 2 3)
|
||||
)
|
||||
|
||||
;;;; Tightly Packing Tries
|
||||
;;
|
||||
;; These next functions are all helpers
|
||||
;; related to byte-packing nodes in preperation
|
||||
;; for writing them to a byte stream.
|
||||
|
||||
(defn previous-node [loc]
|
||||
(loop [loc (zip/prev loc)]
|
||||
(cond
|
||||
(nil? loc) nil
|
||||
(map? (zip/node loc)) loc
|
||||
:else (recur (zip/prev loc)))))
|
||||
|
||||
(defn loc->byte-address
|
||||
"Given a loc without a byte-address, calculate it from the previous loc.
|
||||
|
||||
0x00|node val of previous loc |
|
||||
0x01|size of child index |
|
||||
0x02|child1 key |
|
||||
0x03|child1 byte address offset|
|
||||
0x04|child2 key |
|
||||
0x05|child2 byte address offset|
|
||||
0x??|node val of current loc |
|
||||
|
||||
We obviously wouldn't need this if we were writing directly to a
|
||||
ByteBuffer. Whatever position we are at is our address.
|
||||
|
||||
But if we want to maintain the byte-packed data as
|
||||
part of the map- or vector-like trie structure, then
|
||||
we need this."
|
||||
([loc]
|
||||
(loc->byte-address loc 0))
|
||||
([loc starting-offset]
|
||||
(let [prev (previous-node loc)]
|
||||
(if prev
|
||||
(let [[k {:keys [byte-address byte-array]}] (first (seq (zip/node prev)))]
|
||||
(+ byte-address (count byte-array)))
|
||||
starting-offset))))
|
||||
|
||||
(defn child->index
|
||||
"Given a child gets a map with info needed to build an index.
|
||||
|
||||
The below info is just a little bit of lagniappe. The only thing we're
|
||||
pulling off the child are the values of the keys :byte-address and :key.
|
||||
The encoding will happen later.
|
||||
But for reference, the encoding will be:
|
||||
The index will be a list of pairs of variable-length encoded bytes.
|
||||
The first number of the pair, the bytes will be encoded with a flag bit of 1.
|
||||
The second number of the pair, the bytes will be encoded with a flag bit of 0."
|
||||
[child]
|
||||
(let [[k {:keys [byte-address byte-array] :as v}] (first (seq child))]
|
||||
{:byte-address byte-address
|
||||
:key k}))
|
||||
|
||||
(defn child-node-key-and-offset-from-parent-as-byte-array
|
||||
[{:keys [key offset]}]
|
||||
(let [baos (ByteArrayOutputStream.)]
|
||||
(.write baos (encoding/encode-key-to-tightly-packed-trie-index key))
|
||||
(.write baos (encoding/encode-offset-to-tightly-packed-trie-index offset))
|
||||
(.toByteArray baos)))
|
||||
|
||||
(defn pack-node-value
|
||||
"Returns byte-array of node value.
|
||||
Byte-array is 2 variable-length encoded numbers.
|
||||
For a markov trie, this would be an number ID
|
||||
of the n-gram and an number of the frequency.
|
||||
|
||||
Nodes without terminal values get a value and count of 0."
|
||||
[node]
|
||||
(let [baos (ByteArrayOutputStream.)]
|
||||
(.write baos (encoding/encode (get node :value 0)))
|
||||
(.write baos (encoding/encode (get node :count 0)))
|
||||
(.toByteArray baos)))
|
||||
|
||||
(defn transform-trie-add-byte-pack-to-each-node
|
||||
"Visitor for a vector-based trie of structure [[child1, child2, ,,,,] parent].
|
||||
|
||||
Transforms each node adding keys for a byte-array of the node and its children index key/offsets
|
||||
as well as a key for its own offset.
|
||||
|
||||
Starts at byte 8, reserving the first 8 bytes for the for root address won't be known
|
||||
until the end of the zipper.
|
||||
|
||||
Transforming the trie to add these keys is an intermediary step that can probably be
|
||||
bypassed in the future by writing directly to a ByteArrayOutputStream."
|
||||
[loc]
|
||||
(let [baos (ByteArrayOutputStream.)
|
||||
;; Byte-address of the current node. Gets calculated from the
|
||||
;; previous node's byte address and the size of the previous
|
||||
;; node's byte array.
|
||||
byte-address (loc->byte-address loc 8)
|
||||
child-nodes (->> loc
|
||||
loc-children
|
||||
(map (comp second zip/node)))
|
||||
;; For child, we need to know the offset of the child node's address
|
||||
;; from this parent node's address.
|
||||
children (map
|
||||
(fn [child-node]
|
||||
(let [child-index (child->index child-node)]
|
||||
(assoc
|
||||
child-index
|
||||
:offset
|
||||
(- byte-address (:byte-address child-index)))))
|
||||
child-nodes)
|
||||
;; Create the byte array of the index of the children
|
||||
index-ba (let [index-baos (ByteArrayOutputStream.)
|
||||
child-byte-arrays
|
||||
(map
|
||||
child-node-key-and-offset-from-parent-as-byte-array
|
||||
children)]
|
||||
(loop [bas child-byte-arrays]
|
||||
(if (empty? bas)
|
||||
(.toByteArray index-baos)
|
||||
(do (.write index-baos (first bas))
|
||||
(recur (rest bas))))))]
|
||||
(zip/edit
|
||||
loc
|
||||
(fn [node]
|
||||
(let [[k v] (first (seq node))]
|
||||
(.write baos (pack-node-value v))
|
||||
(.write baos (encoding/encode (count index-ba)))
|
||||
(.write baos index-ba)
|
||||
{k (conj v {:byte-address byte-address
|
||||
:byte-array (.toByteArray baos)})})))))
|
||||
|
||||
(defprotocol ITrie
|
||||
(as-map [this] "Map that underlies trie.")
|
||||
(as-vec [this] "Depth-first post-order vector.")
|
||||
(as-byte-array [this] (str "Add key/values to each node containing"
|
||||
" the tightly-packed byte-array representation of the node."))
|
||||
(transform [this f] "Depth-first post-order apply each function to each node."))
|
||||
|
||||
(def not-found# (gensym))
|
||||
|
||||
(deftype Trie [f trie]
|
||||
ITrie
|
||||
(as-map [_] trie)
|
||||
(as-vec [_] (map->depth-first-post-order-traversable-zipperable-vector trie))
|
||||
(as-byte-array [self]
|
||||
(->> (transform
|
||||
self
|
||||
(visitor-filter
|
||||
#(map? (zip/node %))
|
||||
transform-trie-add-byte-pack-to-each-node))))
|
||||
(transform [self f]
|
||||
(->> self
|
||||
as-vec
|
||||
zip/vector-zip
|
||||
(#(visitor-next % f))
|
||||
depth-first-post-order-traversable-zipperable-vector->map
|
||||
(Trie. f)))
|
||||
|
||||
;; By returning lookups in the same strucure as root-level nodes
|
||||
;; but with the root at the found descendent, we can treat this
|
||||
;; descendant as a new root Trie node. This gives us the advantage
|
||||
;; of being able to re-use the Trie functions on sub-nodes.
|
||||
clojure.lang.ILookup
|
||||
(valAt [_ ks]
|
||||
(let [v (get-in trie (cons :root (interleave (repeat :children) ks)) not-found#)]
|
||||
(if (= v not-found#)
|
||||
(throw (Exception. (format "Key not found: %s" ks)))
|
||||
(Trie. f (sorted-map (last ks) v)))))
|
||||
(valAt [_ ks not-found]
|
||||
(Trie. f (sorted-map
|
||||
(last ks)
|
||||
(get-in trie (cons :root (interleave (repeat :children) ks)) not-found))))
|
||||
|
||||
clojure.lang.IPersistentCollection
|
||||
(seq [self]
|
||||
(->> self
|
||||
as-vec
|
||||
zip/vector-zip
|
||||
(iterate zip/next)
|
||||
(take-while (complement zip/end?))
|
||||
(map zip/node)
|
||||
(filter map?)
|
||||
(filter (comp :value second first seq))))
|
||||
(cons [_ o]
|
||||
(let [path (cons :root (interleave (repeat :children) (butlast o)))
|
||||
node (get-in trie path)]
|
||||
(Trie. f (update-in-sorted trie path f o))))
|
||||
(empty [_] (Trie. f {}))
|
||||
(equiv [_ o]
|
||||
(and (isa? (class o) Trie)
|
||||
(= (as-map o) trie))))
|
||||
|
||||
(defn trie
|
||||
([] (->Trie
|
||||
(fn update-fn [prev cur]
|
||||
(if (nil? prev)
|
||||
{:value (last cur)
|
||||
:count 1}
|
||||
(-> prev
|
||||
(update :count (fnil inc 0))
|
||||
(assoc :value (last cur)))))
|
||||
(sorted-map)))
|
||||
([& ks]
|
||||
(reduce
|
||||
(fn [t k]
|
||||
(conj t k))
|
||||
(trie)
|
||||
ks)))
|
||||
|
||||
(comment
|
||||
(let [v1 '(1 2 3 123)
|
||||
v2 '(1 2 12)
|
||||
v3 '(1 2 2 122)
|
||||
v4 '(1 3 1 131)
|
||||
t1 (trie v1 v2 v3 v4)]
|
||||
{:as-vec (as-vec (get t1 '(1 2)))
|
||||
:as-map (as-map (get t1 '(1 2)))
|
||||
:as-byte-array (as-byte-array (get t1 '(1 2)))})
|
||||
;; => {:as-vec
|
||||
;; [[[[] {2 {:value 122, :count 1}}] [[] {3 {:value 123, :count 1}}]]
|
||||
;; {2 {:count 1, :value 12}}],
|
||||
;; :as-map
|
||||
;; {2
|
||||
;; {:children {2 {:value 122, :count 1}, 3 {:value 123, :count 1}},
|
||||
;; :count 1,
|
||||
;; :value 12}},
|
||||
;; :as-byte-array
|
||||
;; {2
|
||||
;; {:byte-address 14,
|
||||
;; :byte-array [-116, -127, -124, -126, 6, -125, 3],
|
||||
;; :children
|
||||
;; {2
|
||||
;; {:value 122,
|
||||
;; :count 1,
|
||||
;; :byte-address 8,
|
||||
;; :byte-array [-6, -127, -128],
|
||||
;; :children {}},
|
||||
;; 3
|
||||
;; {:value 123,
|
||||
;; :count 1,
|
||||
;; :byte-address 11,
|
||||
;; :byte-array [-5, -127, -128],
|
||||
;; :children {}}},
|
||||
;; :count 1,
|
||||
;; :value 12}}}
|
||||
)
|
||||
|
||||
(defn rewind-to-key [bb stop]
|
||||
(loop []
|
||||
(let [current (.get bb (.position bb))
|
||||
previous (.get bb (dec (.position bb)))]
|
||||
(if (or (= stop (.position bb))
|
||||
(and (encoding/key-byte? current)
|
||||
(encoding/offset-byte? previous)))
|
||||
bb
|
||||
(do (.position bb (dec (.position bb)))
|
||||
(recur))))))
|
||||
|
||||
(defn forward-to-key [bb stop]
|
||||
(loop []
|
||||
(if (or (= stop (.position bb))
|
||||
(and (encoding/key-byte? (.get bb (.position bb)))
|
||||
(encoding/offset-byte?
|
||||
(.get bb (inc (.position bb))))))
|
||||
bb
|
||||
(do (.position bb (inc (.position bb)))
|
||||
(recur)))))
|
||||
|
||||
(defn find-key-in-index
|
||||
[bb target-key max-address not-found]
|
||||
(.limit bb max-address)
|
||||
(let [key
|
||||
(loop [previous-key nil
|
||||
min-position (.position bb)
|
||||
max-position max-address]
|
||||
(if (zero? (- max-position min-position))
|
||||
not-found
|
||||
(let [mid-position (+ min-position (quot (- max-position min-position) 2))]
|
||||
(.position bb mid-position)
|
||||
(let [bb (rewind-to-key bb min-position)
|
||||
current-key
|
||||
(encoding/decode-number-from-tightly-packed-trie-index bb)]
|
||||
(cond
|
||||
(= current-key target-key)
|
||||
(encoding/decode-number-from-tightly-packed-trie-index bb)
|
||||
|
||||
(= current-key previous-key)
|
||||
(do
|
||||
(encoding/decode-number-from-tightly-packed-trie-index bb)
|
||||
(let [final-key (encoding/decode-number-from-tightly-packed-trie-index bb)]
|
||||
(if (= target-key final-key)
|
||||
(encoding/decode-number-from-tightly-packed-trie-index bb)
|
||||
(throw (Exception. "Key not found.")))))
|
||||
|
||||
(< current-key target-key)
|
||||
;; Chew the next decoded number. It's a useless offset.
|
||||
(do
|
||||
(encoding/decode-number-from-tightly-packed-trie-index bb)
|
||||
(recur
|
||||
current-key
|
||||
(.position bb)
|
||||
max-position))
|
||||
|
||||
(> current-key target-key)
|
||||
;; This could also be rewound.
|
||||
(do
|
||||
(rewind-to-key bb min-position)
|
||||
(recur
|
||||
current-key
|
||||
min-position
|
||||
(.position bb))))))))]
|
||||
(.limit bb (.capacity bb))
|
||||
key))
|
||||
|
||||
(defn tightly-packed-trie-node-value
|
||||
[byte-buffer]
|
||||
(let [value (encoding/decode byte-buffer)
|
||||
freq (encoding/decode byte-buffer)]
|
||||
{:value value
|
||||
:count freq}))
|
||||
|
||||
(defmacro wrap-byte-buffer
|
||||
"Saves the position and limit of a byte buffer, runs body,
|
||||
returns byte buffer to original position and limit."
|
||||
[byte-buffer & body]
|
||||
`(let [original-position# (.position ~byte-buffer)
|
||||
original-limit# (.limit ~byte-buffer)]
|
||||
(try (do ~@body)
|
||||
(finally
|
||||
(.limit ~byte-buffer original-limit#)
|
||||
(.position ~byte-buffer original-position#)))))
|
||||
|
||||
(defprotocol ITightlyPackedTrie
|
||||
(children [self] "Immediate children of a node.")
|
||||
(value [self] "Return node value, disassociated with children."))
|
||||
|
||||
(deftype TightlyPackedTrie [byte-buffer key address limit]
|
||||
ITightlyPackedTrie
|
||||
(value [self]
|
||||
(wrap-byte-buffer
|
||||
byte-buffer
|
||||
(.limit byte-buffer limit)
|
||||
(.position byte-buffer address)
|
||||
(tightly-packed-trie-node-value byte-buffer)))
|
||||
(children [self]
|
||||
(wrap-byte-buffer
|
||||
byte-buffer
|
||||
(.limit byte-buffer limit)
|
||||
(.position byte-buffer address)
|
||||
(let [val (encoding/decode byte-buffer)
|
||||
freq (encoding/decode byte-buffer)
|
||||
size-of-index (encoding/decode byte-buffer)]
|
||||
(.limit byte-buffer (+ (.position byte-buffer)
|
||||
size-of-index))
|
||||
(loop [children []]
|
||||
(if (= (.position byte-buffer) (.limit byte-buffer))
|
||||
children
|
||||
(let [child-key (encoding/decode-number-from-tightly-packed-trie-index byte-buffer)
|
||||
child-offset (encoding/decode-number-from-tightly-packed-trie-index byte-buffer)]
|
||||
(recur
|
||||
(conj
|
||||
children
|
||||
(TightlyPackedTrie.
|
||||
byte-buffer
|
||||
child-key
|
||||
(- address child-offset)
|
||||
(.capacity byte-buffer))))))))))
|
||||
|
||||
clojure.lang.ILookup
|
||||
(valAt [self ks]
|
||||
(wrap-byte-buffer
|
||||
byte-buffer
|
||||
(.limit byte-buffer limit)
|
||||
(.position byte-buffer address)
|
||||
(if (empty? ks)
|
||||
self
|
||||
(let [val (encoding/decode byte-buffer)
|
||||
freq (encoding/decode byte-buffer)
|
||||
size-of-index (encoding/decode byte-buffer)
|
||||
offset (find-key-in-index
|
||||
byte-buffer
|
||||
(first ks)
|
||||
(+ (.position byte-buffer) size-of-index)
|
||||
:not-found)]
|
||||
(if (= offset :not-found)
|
||||
(throw (Exception. (format "Index not found %s" ks)))
|
||||
(let [child (TightlyPackedTrie.
|
||||
byte-buffer
|
||||
(first ks)
|
||||
(- address offset)
|
||||
(.capacity byte-buffer))]
|
||||
(get child (rest ks))))))))
|
||||
(valAt [self ks not-found]
|
||||
(wrap-byte-buffer
|
||||
byte-buffer
|
||||
(.limit byte-buffer limit)
|
||||
(.position byte-buffer address)
|
||||
(if (empty? ks)
|
||||
self
|
||||
(let [val (encoding/decode byte-buffer)
|
||||
freq (encoding/decode byte-buffer)
|
||||
size-of-index (encoding/decode byte-buffer)
|
||||
offset (find-key-in-index
|
||||
byte-buffer
|
||||
(first ks)
|
||||
(+ (.position byte-buffer) size-of-index)
|
||||
:not-found)]
|
||||
(if (= offset :not-found)
|
||||
not-found
|
||||
(let [child (TightlyPackedTrie.
|
||||
byte-buffer
|
||||
(first ks)
|
||||
(- address offset)
|
||||
(.capacity byte-buffer))]
|
||||
(get child (rest ks)))))))))
|
||||
|
||||
(defn tightly-packed-trie
|
||||
"Assumes the trie has been transformed so that each node
|
||||
includes a :byte-array key to the byte array that needs to be written
|
||||
for that node and a :byte-address key that has been calculated
|
||||
with an offset of 8. (The first 8 bytes are reserved for the root address.)"
|
||||
[trie]
|
||||
(let [baos (ByteArrayOutputStream.)
|
||||
trie (as-byte-array trie)]
|
||||
;; This transform writes to the ByteArrayOutputStream.
|
||||
(transform
|
||||
trie
|
||||
(visitor-filter
|
||||
#(map? (zip/node %))
|
||||
(fn [loc]
|
||||
(let [{:keys [byte-array]} (second (first (seq (zip/node loc))))]
|
||||
(.write baos byte-array)
|
||||
loc))))
|
||||
(let [ba (.toByteArray baos)
|
||||
root-address (get-in (as-map trie) [:root :byte-address])
|
||||
byte-buf (java.nio.ByteBuffer/allocate (+ 8 (count ba)))]
|
||||
(.putLong byte-buf root-address)
|
||||
(.put byte-buf ba)
|
||||
(.rewind byte-buf)
|
||||
(->TightlyPackedTrie byte-buf 0 (.getLong byte-buf) (.capacity byte-buf)))))
|
||||
|
||||
(defn zipper-tpt
|
||||
"Turns a tightly-packed trie into a zipper.
|
||||
Since the byte buffer that backs the trie can't be edited,
|
||||
`make-node` and the zipper edit functions won't work."
|
||||
[tpt]
|
||||
(let [branch? (fn branch? [node]
|
||||
(and (instance? TightlyPackedTrie node)
|
||||
(not-empty (children node))))
|
||||
zipper-children (fn zippper-children [node]
|
||||
(children node))
|
||||
make-node (fn make-node [node children]
|
||||
(throw (Exception. "Can't add children to Tightly Packed Trie nodes.")))]
|
||||
(zip/zipper branch? zipper-children make-node tpt)))
|
||||
|
||||
(comment
|
||||
(let [v1 '(1 2 3 123)
|
||||
v2 '(1 2 12)
|
||||
v3 '(1 2 2 122)
|
||||
v4 '(1 3 1 131)
|
||||
t1 (trie v1 v2 v3 v4)
|
||||
tpt (tightly-packed-trie t1)]
|
||||
(->> tpt
|
||||
zipper-tpt
|
||||
(iterate zip/next)
|
||||
(take-while (complement zip/end?))
|
||||
(map zip/node)
|
||||
(map #(hash-map (.key %) (value %)))))
|
||||
|
||||
)
|
||||
|
||||
;; TODO: Shared "save" interface for Trie?
|
||||
(defn save-tightly-packed-trie-to-file
|
||||
[filepath trie]
|
||||
(with-open [o (io/output-stream filepath)]
|
||||
(.write o (.array (.byte-buffer trie)))))
|
||||
|
||||
(defn load-tightly-packed-trie-from-file
|
||||
[filepath]
|
||||
(with-open [i (io/input-stream filepath)
|
||||
baos (ByteArrayOutputStream.)]
|
||||
(io/copy i baos)
|
||||
(let [byte-buffer (java.nio.ByteBuffer/wrap (.toByteArray baos))]
|
||||
(.rewind byte-buffer)
|
||||
(->TightlyPackedTrie byte-buffer 0 (.getLong byte-buffer) (.capacity byte-buffer)))))
|
@ -0,0 +1,192 @@
|
||||
(ns com.owoga.trie
|
||||
(:require [clojure.zip :as zip]))
|
||||
|
||||
(defn trie->depth-first-post-order-traversable-zipperable-vector
|
||||
([path node]
|
||||
(vec
|
||||
(map
|
||||
(fn [[k v]]
|
||||
[(trie->depth-first-post-order-traversable-zipperable-vector (conj path k) v)
|
||||
(clojure.lang.MapEntry. (conj path k) (.value v))])
|
||||
(.children- node)))))
|
||||
|
||||
(defn depth-first-post-order-traversable-zipperable-vector->trie
|
||||
[cls [children [key node]]]
|
||||
(sorted-map
|
||||
(last key)
|
||||
(cls (.key node) (.value node)
|
||||
(into (sorted-map)
|
||||
(map depth-first-post-order-traversable-zipperable-vector->trie children)))))
|
||||
|
||||
(declare ->Trie)
|
||||
|
||||
(defn -without
|
||||
[trie [k & ks]]
|
||||
(if k
|
||||
(if-let [next-trie (get (.children- trie) k)]
|
||||
(let [next-trie-without (-without next-trie ks)
|
||||
new-trie (->Trie (.key trie)
|
||||
(.value trie)
|
||||
(if next-trie-without
|
||||
(assoc (.children- trie) k next-trie-without)
|
||||
(dissoc (.children- trie) k)))]
|
||||
(if (and (empty? new-trie)
|
||||
(nil? (.value new-trie)))
|
||||
nil
|
||||
new-trie)))
|
||||
(if (seq (.children- trie))
|
||||
(->Trie
|
||||
(.key trie)
|
||||
nil
|
||||
(.children- trie))
|
||||
nil)))
|
||||
|
||||
(defprotocol ITrie
|
||||
(children [self] "Immediate children of a node.")
|
||||
(lookup [self ks] "Return node at key."))
|
||||
|
||||
(deftype Trie [key value children-]
|
||||
ITrie
|
||||
(children [trie]
|
||||
(map
|
||||
(fn [[k child]]
|
||||
(Trie. k
|
||||
(.value child)
|
||||
#_(sorted-map)
|
||||
(.children- child)))
|
||||
children-))
|
||||
|
||||
(lookup [trie k]
|
||||
(loop [k' k
|
||||
trie' trie]
|
||||
(cond
|
||||
;; Allows `update` to work the same as with maps... can use `fnil`.
|
||||
;; (nil? trie') (throw (Exception. (format "Key not found: %s" k)))
|
||||
(nil? trie') nil
|
||||
(empty? k')
|
||||
(Trie. (.key trie')
|
||||
(.value trie')
|
||||
(.children- trie'))
|
||||
:else (recur
|
||||
(rest k')
|
||||
(get (.children- trie') (first k'))))))
|
||||
|
||||
clojure.lang.ILookup
|
||||
(valAt [trie k]
|
||||
(loop [k' k
|
||||
trie' trie]
|
||||
(cond
|
||||
;; Allows `update` to work the same as with maps... can use `fnil`.
|
||||
;; (nil? trie') (throw (Exception. (format "Key not found: %s" k)))
|
||||
(nil? trie') nil
|
||||
(empty? k') (.value trie')
|
||||
:else (recur
|
||||
(rest k')
|
||||
(get (.children- trie') (first k'))))))
|
||||
|
||||
(valAt [trie k not-found]
|
||||
(loop [k' k
|
||||
trie' trie]
|
||||
(cond
|
||||
(nil? trie') not-found
|
||||
(empty? k') (.value trie')
|
||||
:else (recur
|
||||
(rest k')
|
||||
(get (.children- trie') (first k'))))))
|
||||
|
||||
clojure.lang.IPersistentCollection
|
||||
(cons [trie entry]
|
||||
(cond
|
||||
(instance? Trie (second entry))
|
||||
(assoc trie (first entry) (.value (second entry)))
|
||||
:else
|
||||
(assoc trie (first entry) (second entry))))
|
||||
|
||||
(empty [trie]
|
||||
(Trie. key nil (sorted-map)))
|
||||
|
||||
(equiv [trie o]
|
||||
(and (= (.value trie)
|
||||
(.value o))
|
||||
(= (.children- trie)
|
||||
(.children- o))
|
||||
(= (.key trie)
|
||||
(.key o))))
|
||||
|
||||
clojure.lang.Associative
|
||||
(assoc [trie opath ovalue]
|
||||
(if (empty? opath)
|
||||
(Trie. key ovalue children-)
|
||||
(Trie. key value (update
|
||||
children-
|
||||
(first opath)
|
||||
(fnil assoc (Trie. (first opath) nil (sorted-map)))
|
||||
(rest opath)
|
||||
ovalue))))
|
||||
(entryAt [trie key]
|
||||
(clojure.lang.MapEntry. key (get trie key)))
|
||||
(containsKey [trie key]
|
||||
(boolean (get trie key)))
|
||||
|
||||
clojure.lang.IPersistentMap
|
||||
(assocEx [trie key val]
|
||||
(if (contains? trie key)
|
||||
(throw (Exception. (format "Value already exists at key %s." key)))
|
||||
(assoc trie key val)))
|
||||
(without [trie key]
|
||||
(-without trie key))
|
||||
|
||||
clojure.lang.Counted
|
||||
(count [trie]
|
||||
(count (seq trie)))
|
||||
|
||||
clojure.lang.Seqable
|
||||
(seq [trie]
|
||||
(->> trie
|
||||
((partial trie->depth-first-post-order-traversable-zipperable-vector []))
|
||||
zip/vector-zip
|
||||
(iterate zip/next)
|
||||
(take-while (complement zip/end?))
|
||||
(map zip/node)
|
||||
(filter (partial instance? clojure.lang.MapEntry))
|
||||
(#(if (empty? %) nil %)))))
|
||||
|
||||
(defn make-trie
|
||||
([]
|
||||
(->Trie '() nil (sorted-map)))
|
||||
([& ks]
|
||||
(reduce
|
||||
(fn [t kv]
|
||||
(conj t kv))
|
||||
(make-trie)
|
||||
(partition 2 ks))))
|
||||
|
||||
(comment
|
||||
(make-trie "do" "do" "dot" "dot" "dog" "dog")
|
||||
;; => {[\d \o \g] "dog", [\d \o \t] "dot", [\d \o] "do"}
|
||||
|
||||
;; Access values at a particular key with get
|
||||
;; and the key (as any seqable).
|
||||
(let [trie (make-trie "do" "do" "dot" "dot" "dog" "dog")]
|
||||
(get trie [\d \o]))
|
||||
;; => "do"
|
||||
|
||||
;; Access children with `traverse`
|
||||
(let [trie (make-trie "do" "do" "dot" "dot" "dog" "dog" "doggo" "fluffy")]
|
||||
(traverse trie "do"))
|
||||
;; => {[\g \g \o] "fluffy", [\g] "dog", [\t] "dot"}
|
||||
|
||||
;; Any seq of comparable elements will work for keys.
|
||||
;; But they *must* be comparable, so you can't mix-and-match
|
||||
;; different types.
|
||||
(make-trie '(:k :e :y) 1 '(:k :e :e) 2)
|
||||
;; => {[:k :e :e] 2, [:k :e :y] 1}
|
||||
|
||||
;; Seqing over a trie will return the elements in
|
||||
;; depth-first post-order traversal with children sorted by key.
|
||||
(->> (make-trie '(1 2 4) 124 '(1 2) 12 '(1 2 3) 123 '(1 2 2) 122)
|
||||
(map (fn [[k v]]
|
||||
[k (* 2 v)]))
|
||||
(into (make-trie)))
|
||||
;; => {[1 2 2] 244, [1 2 3] 246, [1 2 4] 248, [1 2] 24}
|
||||
)
|
@ -0,0 +1,99 @@
|
||||
;; Fast weighted random selection thanks to the Vose algorithm.
|
||||
;; https://gist.github.com/ghadishayban/a26cc402958ef3c7ce61
|
||||
(ns com.owoga.trie.math
|
||||
(:import clojure.lang.PersistentQueue))
|
||||
|
||||
;; Vose's alias method
|
||||
;; http://www.keithschwarz.com/darts-dice-coins/
|
||||
(defprotocol Rand
|
||||
(nextr [_ rng]))
|
||||
|
||||
(deftype Vose [n ^ints alias ^doubles prob]
|
||||
Rand
|
||||
;; returns the index of the chosen weight
|
||||
(nextr [_ rng] ;; not using the rng for now
|
||||
(let [i (rand-int n)
|
||||
p (aget prob i)]
|
||||
(if (or (= p 1.0)
|
||||
(< (rand) p))
|
||||
i
|
||||
(aget alias i)))))
|
||||
|
||||
(defn ^:private make-vose [dist]
|
||||
(let [N (count dist)
|
||||
alias (int-array N)
|
||||
prob (double-array N)]
|
||||
(if (zero? N)
|
||||
(->Vose N alias prob)
|
||||
(let [^doubles ps (->> dist
|
||||
(map (partial * N))
|
||||
(into-array Double/TYPE))
|
||||
|
||||
[small large] (loop [i 0
|
||||
[small large] [PersistentQueue/EMPTY
|
||||
PersistentQueue/EMPTY]
|
||||
ps (seq ps)]
|
||||
(if (seq ps)
|
||||
(let [p (first ps)]
|
||||
(if (< p 1)
|
||||
(recur (inc i)
|
||||
[(conj small i) large]
|
||||
(rest ps))
|
||||
(recur (inc i)
|
||||
[small (conj large i)]
|
||||
(rest ps))))
|
||||
[small large]))
|
||||
|
||||
[small large] (loop [small small
|
||||
large large]
|
||||
(if (and (seq small) (seq large))
|
||||
(let [l (first small)
|
||||
g (first large)
|
||||
small (pop small)
|
||||
large (pop large)]
|
||||
(aset-double prob l (aget ps l))
|
||||
(aset-int alias l g)
|
||||
(let [pg (- (+ (aget ps g) (aget ps l))
|
||||
1.0)]
|
||||
(aset-double ps g pg)
|
||||
(if (< pg 1)
|
||||
(recur (conj small g) large)
|
||||
(recur small (conj large g)))))
|
||||
[small large]))]
|
||||
(doseq [g (concat large small)]
|
||||
(aset-double prob g 1))
|
||||
(->Vose N alias prob)))))
|
||||
|
||||
(defn from-weights [ws]
|
||||
(let [tot (reduce + 0.0 ws)]
|
||||
(assert (> tot 0) "Can't Vose RNG from 0 weights.")
|
||||
(let [dist (map #(/ % tot) ws)]
|
||||
(make-vose (vec dist)))))
|
||||
|
||||
(comment
|
||||
(let [ws [1 2 1 3 3]
|
||||
rng (from-weights ws)
|
||||
chosen (repeatedly 1000000 #(nextr rng nil))
|
||||
accuracy (mapv (comp float
|
||||
#(/ % 100000)
|
||||
(frequencies chosen))
|
||||
(range (count ws)))]
|
||||
accuracy))
|
||||
|
||||
(defn weighted-selection
|
||||
"If given a coll, assumes the coll is weights and returns the selected index by
|
||||
weighted random selection.
|
||||
|
||||
If given a key function and a collection, uses the key function to get a
|
||||
collection of weights and returns the value at the randomly selected index."
|
||||
([coll]
|
||||
(assert (not-empty coll) "Can't select from empty coll")
|
||||
(let [rng (from-weights coll)
|
||||
index (nextr rng nil)]
|
||||
index))
|
||||
([key-fn coll]
|
||||
(assert (not-empty coll) "Can't select from empty coll")
|
||||
(let [rng (from-weights (map key-fn coll))
|
||||
index (nextr rng nil)
|
||||
selection (nth coll index)]
|
||||
selection)))
|
@ -0,0 +1,21 @@
|
||||
(ns tightly-packed-trie.core-test
|
||||
"Basic tests for the primary API of `next.jdbc`."
|
||||
(:require [clojure.test :refer [deftest is testing use-fixtures]]
|
||||
[com.owoga.tightly-packed-trie.core :as tpt]))
|
||||
|
||||
(deftest basic-tests
|
||||
;; use ds-opts instead of (ds) anywhere you want default options applied:
|
||||
(testing "map-based trie"
|
||||
(let [trie (tpt/trie)]
|
||||
(testing "key not found"
|
||||
(is (thrown-with-msg?
|
||||
Exception
|
||||
#"Key not found"
|
||||
(get trie '("foo"))))
|
||||
(is (= :not-found
|
||||
(get trie '("foo") :not-found))))
|
||||
(testing "conjing to trie"
|
||||
(let [trie (conj trie '("d" "o" "g" "dog"))]
|
||||
(is (instance? com.owoga.tightly_packed_trie.core.Trie (get trie '("d" "o" "g"))))
|
||||
(is (= (tpt/as-map (get trie '("d" "o" "g")))
|
||||
{"g" {:value "dog" :count 1}})))))))
|
@ -0,0 +1,89 @@
|
||||
(ns tightly-packed-trie-test
|
||||
(:require [clojure.test :refer [deftest is testing] :as t]
|
||||
[com.owoga.trie :as trie]
|
||||
[com.owoga.tightly-packed-trie :as tpt]
|
||||
[com.owoga.tightly-packed-trie.encoding :as encode]
|
||||
[com.owoga.tightly-packed-trie.encoding :as encoding]
|
||||
[com.owoga.tightly-packed-trie.bit-manip :as bm]))
|
||||
|
||||
(defn value-encode-fn [v]
|
||||
(if (or (= v ::tpt/root)
|
||||
(nil? v))
|
||||
(encode/encode 0)
|
||||
(encode/encode v)))
|
||||
|
||||
(defn value-decode-fn [byte-buffer]
|
||||
(let [v (encode/decode byte-buffer)]
|
||||
(if (zero? v)
|
||||
nil
|
||||
v)))
|
||||
|
||||
(comment
|
||||
(let [t (->> '([1 3] 13 [1] 1 [1 2] 12)
|
||||
(apply trie/make-trie)
|
||||
(#(tpt/tightly-packed-trie % value-encode-fn value-decode-fn)))
|
||||
bb (.byte-buffer t)]
|
||||
(trie/lookup t [1 2]))
|
||||
|
||||
)
|
||||
|
||||
(deftest tightly-packed-trie-tests
|
||||
(let [empty-trie (-> (trie/make-trie)
|
||||
(#(tpt/tightly-packed-trie % value-encode-fn value-decode-fn)))
|
||||
initialized-trie (->> '([1 3] 13 [1] 1 [1 2] 12)
|
||||
(apply trie/make-trie)
|
||||
(#(tpt/tightly-packed-trie % value-encode-fn value-decode-fn)))]
|
||||
(testing "ILookup"
|
||||
(is (= 13 (get initialized-trie [1 3])))
|
||||
(is (= :not-found (get initialized-trie [4] :not-found)))
|
||||
(is (= nil (get initialized-trie [4]))))
|
||||
(testing "ITrie"
|
||||
(testing "lookup"
|
||||
(is (= nil (trie/lookup empty-trie [1])))
|
||||
(is (= 1 (get (trie/lookup initialized-trie [1]) [])))
|
||||
(is (= 12 (get (trie/lookup initialized-trie [1]) [2]))))
|
||||
(testing "children"
|
||||
(is (= '(12 13)
|
||||
(map #(get % [])
|
||||
(trie/children (trie/lookup initialized-trie [1])))))))))
|
||||
|
||||
(deftest extended-tightly-packed-trie-tests
|
||||
(let [initialized-trie (->> (trie/make-trie '(1 2 3) 123 '(1 2 1) 121 '(1 2 2) 122 '(1 3 1) 131)
|
||||
(#(tpt/tightly-packed-trie % value-encode-fn value-decode-fn)))]
|
||||
(testing "ILookup"
|
||||
(is (= 123 (get initialized-trie [1 2 3])))
|
||||
(is (= :not-found (get initialized-trie [4] :not-found)))
|
||||
(is (= nil (get initialized-trie [4]))))
|
||||
(testing "ITrie"
|
||||
(testing "lookup"
|
||||
(is (= nil (trie/lookup initialized-trie [4])))
|
||||
(is (= nil (get (trie/lookup initialized-trie [1]) [])))
|
||||
(is (= nil (get (trie/lookup initialized-trie [1]) [2]))))
|
||||
(testing "children"
|
||||
(is (= '(121 122 123)
|
||||
(map #(get % [])
|
||||
(trie/children (trie/lookup initialized-trie [1 2])))))))
|
||||
(testing "Seq"
|
||||
(is (= '([[1 2 1] 121]
|
||||
[[1 2 2] 122]
|
||||
[[1 2 3] 123]
|
||||
[[1 2] nil]
|
||||
[[1 3 1] 131]
|
||||
[[1 3] nil]
|
||||
[[1] nil])
|
||||
(seq initialized-trie))))))
|
||||
|
||||
(comment
|
||||
(let [trie (trie/make-trie '(1 2 3) 123 '(1 2 1) 121 '(1 2 2) 122 '(1 3 1) 131)
|
||||
tpt (tpt/tightly-packed-trie trie value-encode-fn value-decode-fn)
|
||||
byte-buffer (.byte-buffer tpt)]
|
||||
(tpt/wrap-byte-buffer
|
||||
byte-buffer
|
||||
(.limit byte-buffer (.limit tpt))
|
||||
(.position byte-buffer (.address tpt))
|
||||
[(value-decode-fn byte-buffer)
|
||||
(value-decode-fn byte-buffer)
|
||||
(encode/decode-number-from-tightly-packed-trie-index byte-buffer)
|
||||
(encode/decode-number-from-tightly-packed-trie-index byte-buffer)]))
|
||||
|
||||
)
|
@ -0,0 +1,34 @@
|
||||
(ns trie-test
|
||||
(:require [clojure.test :refer [deftest is testing use-fixtures] :as t]
|
||||
[com.owoga.trie :as trie]
|
||||
[clojure.zip :as zip]
|
||||
[clojure.main :as main]))
|
||||
|
||||
(deftest trie-tests
|
||||
(let [empty-trie (trie/make-trie)
|
||||
initialized-trie (trie/make-trie '(1 2) 12)]
|
||||
(testing "assoc"
|
||||
(is (= (assoc empty-trie '(1 2) 12)
|
||||
initialized-trie)))
|
||||
(testing "dissoc"
|
||||
(let [expected (-> (trie/make-trie)
|
||||
(assoc '(1) 1))
|
||||
trie (-> (trie/make-trie)
|
||||
(assoc '(1) 1)
|
||||
(assoc '(1 3) 13))]
|
||||
(is (= expected (dissoc trie '(1 3))))))
|
||||
(testing "ILookup"
|
||||
(is (= 12 (get initialized-trie '(1 2))))
|
||||
(is (= :not-found (get initialized-trie '(1 3) :not-found)))
|
||||
(is (nil? (get initialized-trie '(1 3)))))
|
||||
(testing "IPersistentCollection"
|
||||
(is (empty? empty-trie))
|
||||
(is (= (conj empty-trie ['(1 2) 12])
|
||||
initialized-trie)))
|
||||
(testing "Counted"
|
||||
(is (zero? (count empty-trie)))
|
||||
(is (= 2 (count initialized-trie))))
|
||||
(testing "Seqable"
|
||||
(is (= '([[1 2] 12] [[1] nil])
|
||||
(seq initialized-trie))))))
|
||||
|
Loading…
Reference in New Issue