From 969240f91989b2043421caf40811a7ee4ffb83f7 Mon Sep 17 00:00:00 2001
From: Eric Ihli
Date: Sat, 19 Dec 2020 14:20:04 -0800
Subject: [PATCH] Scratchpad for tightly-packed-trie

Includes variable-length encoding examples
---
 src/com/owoga/prhyme/data/tpt.clj | 138 ++++++++++++++++++++++++++++++
 1 file changed, 138 insertions(+)
 create mode 100644 src/com/owoga/prhyme/data/tpt.clj

diff --git a/src/com/owoga/prhyme/data/tpt.clj b/src/com/owoga/prhyme/data/tpt.clj
new file mode 100644
index 0000000..b65a504
--- /dev/null
+++ b/src/com/owoga/prhyme/data/tpt.clj
@@ -0,0 +1,138 @@
+(ns com.owoga.prhyme.data.tpt
+  (:require [clojure.string :as string]
+            [clojure.java.io :as io])
+  (:import (java.nio ByteBuffer)
+           (java.lang.reflect Array)))
+
+(defn ubyte
+  "Returns the low 8 bits of `b` as an unsigned value in the range 0..255."
+  [b]
+  (bit-and 0xff b))
+
+(defn sbyte
+  "Returns, as a long in the range -128..127, the signed byte that has the
+  same bit pattern as the low 8 bits of `b`."
+  [b]
+  ;; `(bit-or -0x100 b)` only worked for 128..255; it mapped 0..127 to
+  ;; values below -128, which no longer fit in a byte.
+  (long (unchecked-byte b)))
+
+;; I've got a dictionary of ~120,000 words.
+;; I need to reference them in a compact way.
+;; It will be a little bit of a waste, but we can use a 32 bit index.
+
+;; Bits needed to index 120,000 words: log2(1.2e5) is about 16.87, so 17.
+(/ (Math/log 1.2e5) (Math/log 2))
+
+(defn as-binary-string
+  "Returns the low 8 bits of `b` as an 8-character string of 0s and 1s."
+  [b]
+  ;; Mask first: Integer/toBinaryString sign-extends negative bytes to 32
+  ;; digits, which would overflow the 8-character field.
+  (string/replace
+   (format "%8s" (Integer/toBinaryString (bit-and 0xff b)))
+   #" "
+   "0"))
+
+(defn vb-encode
+  "Variable-byte encodes the non-negative integer `n` into a byte array.
+
+  Each byte carries 7 bits of `n`, most significant group first. The high
+  bit is set only on the last byte, marking the end of the number.
+  Throws IllegalArgumentException when `n` is negative."
+  [n]
+  (when (neg? n)
+    (throw (IllegalArgumentException.
+            (str "vb-encode requires a non-negative integer, got: " n))))
+  (loop [n n
+         b '()]
+    (if (< n 128)
+      (let [ba (byte-array (cons n b))
+            i (dec (count ba))]
+        ;; Flag the final byte as the terminator.
+        (aset-byte ba i (sbyte (+ 128 (aget ba i))))
+        ba)
+      (recur (quot n 128) (cons (mod n 128) b)))))
+
+(defn vb-decode-1
+  "Decodes one variable-byte encoded integer from the byte array `ba`,
+  starting at index `offset` (default 0).
+
+  Returns `[value next-index]`, where `next-index` is the position just
+  past the terminating byte (the one with its high bit set)."
+  ([ba]
+   (vb-decode-1 ba 0))
+  ([ba offset]
+   (loop [n 0
+          i (long offset)]
+     (let [b (aget ba i)
+           n (bit-or (bit-and 0x7f b)
+                     (bit-shift-left n 7))]
+       (if (pos? (bit-and b 0x80))
+         [n (inc i)]
+         (recur n (inc i)))))))
+
+(def dictionary ["hi" "my" "name" "is" "what"])
+
+(defn slurp-bytes
+  "Reads everything from `x` and returns it as a byte array.
+  `x` is anything `clojure.java.io/input-stream` accepts."
+  [x]
+  ;; Open the input inside with-open too so it is closed, not leaked.
+  (with-open [in (io/input-stream x)
+              out (java.io.ByteArrayOutputStream.)]
+    (io/copy in out)
+    (.toByteArray out)))
+
+;; First sample trie of words with frequencies and nested children.
+;; NOTE: the next form defines `markov` again, so this value is replaced
+;; as soon as the namespace finishes loading.
+(def markov
+  {"hello" {:freq 5
+            :children {"world" {:freq 2}
+                       "eric" {:freq 1}}}
+   "goodbye" {:freq 2
+              :children {"cruel" {:freq 1
+                                  :children {"world" {:freq 1}}}}}
+   "world" {:freq 3}
+   "eric" {:freq 1}
+   "cruel" {:freq 1
+            :children {"world" {:freq 1}}}})
+
+;; Second sample trie. Because it reuses the name, this is the value that
+;; `markov` is bound to once the namespace has loaded.
+(def markov
+  {"world"
+   {:freq 3
+    :children {"hello" {:freq 2}
+               "goodbye" {:freq 2}
+               "cruel" {:freq 1
+                        :children {"goodbye" {:freq 1}}}}}
+   "hello"
+   {:freq 2}
+   "goodbye"
+   {:freq 3}
+   "cruel"
+   {:freq 1
+    :children {"goodbye" {:freq 1}}}})
+
+(comment
+  (with-open [o (io/output-stream "test.bin")]
+    (.write o (byte-array 8) 0 8))
+
+  (let [ba (slurp-bytes "test.bin")
+        len (count ba)]
+    )
+
+  (with-open [o (io/output-stream "test.bin")]
+    (.write o (count dictionary))
+    (run!
+     (fn [[i w]]
+       (let [b (vb-encode i)]
+         (.write o b 0 (count b)))
+       (let [b (.getBytes w "UTF-8")]
+         (.write o b 0 (count b))))
+     (map vector (range) dictionary))
+    )
+
+  )