Add code trie and variable-length code

main
Eric Ihli 4 years ago
parent bf6836bb69
commit 1ef29bf96b

@ -0,0 +1,12 @@
1 32
2 20
3 10
4 3
5 1
6 2
7 1
8 1
9 1
10 2
12 1
26 1

@ -1,2 +1,6 @@
(ns com.owoga.prhyme.corpus.db
(:require [integrant.core :as ig]))
(defn tokens->db
[tokens]
)

@ -46,6 +46,37 @@
(bit-shift-left n 7))
(inc i))))))
(defn vb-decode
([ba]
(vb-decode ba 0))
([ba i]
(if (>= i (count ba))
(cons (first (vb-decode-1 ba))
nil)
(let [[value byte-count] (vb-decode-1 ba)]
(lazy-seq
(cons
value
(vb-decode (byte-array (drop byte-count ba))
(+ i byte-count))))))))
(comment
(let [n1 0
n2 1
n3 127
n4 128
n5 257
n6 9876543210
baos (java.io.ByteArrayOutputStream.)]
(->> [n1 n2 n3 n4 n5 n6]
(map vb-encode)
(run! #(.writeBytes baos %)))
(let [ba (.toByteArray baos)]
(vb-decode ba)))
;; => ([0 1] [1 1] [127 1] [128 2] [257 2] [9876543210 5])
)
(def dictionary ["hi" "my" "name" "is" "what"])
(defn slurp-bytes [x]

@ -14,6 +14,10 @@
commas, periods, and newlines."
#"(?s).*?([a-zA-Z\d]+(?:['\-]?[a-zA-Z]+)?|,|\.|\n)")
(defn pad-tokens
[tokens n]
(concat (repeat (min 1 (dec n)) "<s>") tokens ["</s>"]))
(defn tokenize-line
[line]
(->> line

@ -0,0 +1,287 @@
(ns com.owoga.prhyme.rhyme-trie
(:require [clojure.java.io :as io]
[clojure.walk :as walk]
[clojure.zip :as zip])
(:import (java.io ByteArrayOutputStream ByteArrayInputStream
DataOutputStream DataInputStream)))
(defn branch? [node]
(and (map? node)
(:children (first (vals node)))))
(defn children [node]
(map (partial apply hash-map) (seq (:children (first (vals node))))))
(defn without-children [node]
{(first (keys node))
(dissoc (get node (first (keys node))) :children)})
(defn map-trie->seq-trie
[trie]
[(vec (map map-trie->seq-trie (children trie)))
(without-children trie)])
(let [m {:root
{:children
{"T"
{:children
{"A" {:children
{"T" {:value "TAT", :count 1}
"S" {:value "SAT" :count 1}}
:value "AT"
:count 1},
"U" {:children {"T" {:value "TUT", :count 1}}}}}}}}]
(let [z (zip/vector-zip (map-trie->seq-trie m))]
(->> z
(iterate zip/next)
(take-while (complement zip/end?))
(map zip/node))))
(defn vec-trie->map-trie
[trie]
(let [children (first trie)
parent (second trie)
[parent-key parent-val] (first (seq parent))]
{parent-key (assoc parent-val :children (into {} (map vec-trie->map-trie children)))}))
(comment
(let [vect [[[[[[[[] {"T" {:value "TAT", :count 1}}]] {"A" {:value "AT", :count 1}}]
[[[[] {"T" {:value "TUT", :count 1}}]] {"U" {}}]]
{"T" {}}]]
{:root {}}]]
(vec-trie->map-trie vect))
)
(comment
(let [v1 '("T" "A" "T" "TAT")
v2 '("T" "U" "T" "TUT")
v3 '("T" "A" "AT")
t1 (trie v1)
t2 (trie v2)
t3 (trie v1 v2 v3)
vect (as-vec t3)]
vect)
)
(defn parent?
[node]
(and (vector? (first node))
(map? (second node))))
(defn child-seq [loc]
(if (and (zip/left loc)
(zip/down (zip/left loc)))
((fn inner [child]
(if child
(lazy-seq
(cons child
(inner (zip/right child))))
nil))
(->> loc zip/left zip/down))
'()))
(defn zip-visitor
([visitor zipper]
(loop [zipper zipper]
(if(zip/end? zipper)
(zip/root zipper)
(recur (zip/next (visitor zipper)))))))
(comment
(let [m {:root
{:children
{"T"
{:children
{"A" {:children
{"T" {:value "TAT", :count 1}
"S" {:value "SAT" :count 1}}
:value "AT"
:count 1},
"U" {:children {"T" {:value "TUT", :count 1}}}}}}}}]
(let [z (zip/vector-zip (map-trie->seq-trie m))
pred (fn [loc]
(map? (zip/node loc)))]
(zip-visitor
(fn [loc]
(if (pred loc)
(zip/edit
loc
(fn [node]
(let [[k v] (first (seq node))
children-counts (->> (child-seq loc)
(map zip/node)
(map (comp :count second first seq second)))]
(if (not-empty children-counts)
(update-in node [k :count] (partial apply (fnil + 0)) children-counts)
node))))
loc))
z)))
)
(defprotocol ITrie
(as-map [this] "Map that underlies trie.")
(as-vec [this] "Depth-first post-order vector")
(transform [this f] "Depth-first post-order apply each function to each node."))
;; Seq offers a depth-first post-order traversal
;; with children ordered by key.
(deftype Trie [trie]
ITrie
(as-map [_] trie)
(as-vec [_] (map-trie->seq-trie trie))
(transform [self f]
(->> self
as-vec
zip/vector-zip
(zip-visitor f)
(vec-trie->map-trie)
(Trie.)))
clojure.lang.IPersistentCollection
(seq [self]
(->> self
as-vec
zip/vector-zip
(iterate zip/next)
(take-while (complement zip/end?))
(map zip/node)
(filter map?)
(filter (comp :value second first seq))))
(cons [_ o]
(let [path (cons :root (interleave (repeat :children) (butlast o)))
id (last o)
node (get-in trie path)]
(Trie.
(update-in
trie
path
(fn [prev]
(if (nil? prev)
{:value id
:count 1}
(-> prev
(assoc :value id) ; Assert value same?
(update :count (fnil inc 0)))))))))
(empty [_] (Trie. {}))
(equiv [_ o]
(and (isa? (class o) Trie)
(= (as-map o) trie))))
(defn trie
([] (->Trie {}))
([& entries]
(reduce
(fn [t entry]
(conj t entry))
(trie)
entries)))
(comment
(let [v1 '("T" "A" "T" "TAT")
v2 '("T" "U" "T" "TUT")
v3 '("T" "A" "AT")
t1 (trie v1)
t2 (trie v2)
t3 (trie v1 v2 v3)]
(seq t3))
(let [v1 '("T" "A" "T" "TAT")
v2 '("T" "U" "T" "TUT")
v3 '("T" "A" "AT")
t1 (trie v1)
t2 (trie v2)
t3 (trie v1 v2 v3)
pred (fn [loc]
(map? (zip/node loc)))]
(transform
t3
(fn [loc]
(if (pred loc)
(zip/edit
loc
(fn [node]
(let [[k v] (first (seq node))
children-counts (->> (child-seq loc)
(map zip/node)
(map (comp :count second first seq second)))]
(if (not-empty children-counts)
(update-in node [k :count] (partial apply (fnil + 0)) children-counts)
node))))
loc))))
)
(defn vec->trie
[v]
(let [zipper (zip/vector-zip v)]
(->> zipper
(iterate zip/next)
(take-while (complement zip/end?))
(filter (comp map? zip/node))
#_(map #(concat (zip/path %) [(->> % zip/node keys first)
(->> % zip/node vals first :value)]))
(map zip/path))))
(comment
(let [v1 '("T" "A" "T" "TAT")
v2 '("T" "U" "T" "TUT")
v3 '("T" "A" "AT")
t1 (trie v1)
t2 (trie v2)
t3 (trie v1 v2 v3)
vect (as-vec t3)]
(vec->trie vect))
)
(defn write-node [baos node])
(defn write-index [baos children])
(defn pack-index-to-children [children]
(let [baos (ByteArrayOutputStream.)]
(run!
(fn [[index-key byte-address]]
(.write baos index-key)
(.write baos byte-address))
children)
(.toByteArray baos)))
(defn node->byte-array [index-key node-value children]
(let [baos (ByteArrayOutputStream.)
child-index (pack-index-to-children children)]
(.write baos node-value)
(.write baos (count child-index))
(.writeBytes baos child-index)
(.toByteArray baos)))
(defn tpt [trie]
(let [node? (fn [x]
(and (seq? x)
(not-empty x)
(not (seq? (first x)))))
transform (fn [x]
(if (node? x)
(let [[index-key node-value children] x]
(list index-key node-value (count children) children))
x))]
(walk/postwalk transform trie)))
(comment
(let [trie '(b 3 ())]
(tpt trie))
(let [trie '(nil 20 ((a 17 ())))]
(tpt trie))
(let [trie '(nil 20 ((a 17 ((a 10 ())
(b 7 ())))
(b 3 ())))]
(tpt trie))
)

@ -12,6 +12,11 @@
(with-open [reader (io/reader (io/resource "dark-corpus-test.txt"))]
(->> (line-seq reader) doall)))
(def test-sentence (first test-corpus))
(def test-tokens
(sgt/pad-tokens (sgt/tokenize-line test-sentence) 1))
(def train-trie
(sgt/lines->trie train-corpus 3))
@ -31,3 +36,49 @@
(deftest simple-good-turing
(testing "accuracy"))
(partition 2 1 test-tokens)
;; => (("<s>" "three")
;; ("three" "years")
;; ("years" "passed")
;; ("passed" "since")
;; ("since" "it")
;; ("it" "began")
;; ("began" "</s>"))
(defn ngram-perplexity [model vocab n]
(fn [tokens]
(->> tokens
(partition n 1)
(map #(model vocab %))
(map #(/ (Math/log %)
(Math/log 2))))))
(def unigram-perplexity (ngram-perplexity sgt-model vocab 1))
(def bigram-perplexity (ngram-perplexity sgt-model vocab 2))
(unigram-perplexity ["<s>" "you're" "a" "dweller" "</s>"])
;; => (-2.3988984034800693
;; -9.075314877722885
;; -4.843244892573044
;; -11.69197410313705
;; -2.3988984034800693)
;; => -30.408330680393117
(bigram-perplexity ["<s>" "you're" "a" "dweller" "</s>"])
;; => (-8.228033550648288 -9.95473739539021 -10.916015756609784 -13.731947650675696)
;; => -42.830734353323976
(sgt-model vocab '("it" "began"))
(->> test-corpus
(map sgt/tokenize-line)
(map #(sgt/pad-tokens % 1))
(map #(partition 2 1 %))
(map
(fn [tokens]
(map #(sgt-model vocab %) tokens)))
(take 10)
(map
(fn [tokens]
(map #(/ (Math/log %)
(Math/log 2))
tokens))))
(/ (Math/log 10)
(Math/log 2))

Loading…
Cancel
Save