diff --git a/src/com/owoga/prhyme/data_transform.clj b/src/com/owoga/prhyme/data_transform.clj
new file mode 100644
index 0000000..378434e
--- /dev/null
+++ b/src/com/owoga/prhyme/data_transform.clj
@@ -0,0 +1,161 @@
+(ns com.owoga.prhyme.data-transform
+ (:require [clojure.string :as string]
+ [clojure.java.io :as io]
+ [com.owoga.prhyme.data.dictionary :as dict]
+ [com.owoga.trie :as trie]
+ [com.owoga.tightly-packed-trie :as tpt]
+ [com.owoga.tightly-packed-trie.encoding :as encoding]
+ [taoensso.nippy :as nippy]))
+
+(def re-word
+ "Regex for tokenizing a string into words, commas, periods, and newlines.
+
+ Intended for repeated matching (e.g. with `re-seq`): each match lazily
+ skips any leading characters that are not part of a token, and the token
+ itself is capture group 1.
+
+ A word is a run of ASCII letters/digits, optionally followed by at most
+ one apostrophe- or hyphen-joined run of letters, which covers simple
+ contractions and hyphenations. The apostrophe/hyphen itself is optional
+ in the pattern. Question marks are not tokens."
+ #"(?s).*?([a-zA-Z\d]+(?:['\-]?[a-zA-Z]+)?|,|\.|\n)")
+
+(defn xf-file-seq
+  "Returns a transducer over file objects that removes directories and then
+  keeps the files at positions `start` (inclusive) to `end` (exclusive),
+  counted among the non-directory files only.
+
+  Yields nothing when `end` is not greater than `start`."
+  [start end]
+  (comp (remove #(.isDirectory %))
+        (drop start)
+        ;; `take` expects a count, not an end index, so convert the
+        ;; [start, end) window into its length. Same result when start is 0.
+        (take (- end start))))
+
+(defn make-token-padder
+  "Returns a function that pads a collection of tokens.
+
+  The returned function takes `tokens` (any seqable, or nil) and returns a
+  vector made of `number-of-beginning-tokens` copies of `beginning-token`,
+  then the tokens in their original order, then `number-of-end-tokens`
+  copies of `end-token`."
+  [beginning-token
+   end-token
+   number-of-beginning-tokens
+   number-of-end-tokens]
+  (let [beginning-pad (vec (repeat number-of-beginning-tokens beginning-token))
+        end-pad (repeat number-of-end-tokens end-token)]
+    (fn [tokens]
+      ;; Always accumulate into the beginning-pad vector. Pouring the end pad
+      ;; into `tokens` itself would prepend (in reverse) whenever `tokens` is
+      ;; a list or seq rather than a vector.
+      (-> beginning-pad
+          (into tokens)
+          (into end-pad)))))
+
+(defn xf-pad-tokens
+  "Returns a mapping transducer that pads every token collection it sees
+  with `num-beg` copies of `beg-tok` in front and `num-end` copies of
+  `end-tok` behind, using `make-token-padder`.
+
+  Note the argument order here is count/token pairs, which differs from
+  the order `make-token-padder` takes."
+  [num-beg beg-tok num-end end-tok]
+  (let [pad (make-token-padder beg-tok end-tok num-beg num-end)]
+    (map pad)))
+
+(def xf-tokenize
+  "Transducer turning each input string into a vector of lower-cased tokens.
+
+  The string is trimmed, scanned with `re-word`, and the first capture
+  group of every match is kept. A string with no matches becomes []."
+  (map (fn [text]
+         (let [matches (re-seq re-word (string/trim text))]
+           (mapv (fn [[_ token]] (string/lower-case token))
+                 matches)))))
+
+(def xf-filter-english
+  "Filtering transducer that keeps a token collection only when every
+  token is either one of the punctuation strings \".\", \"?\", \",\" or
+  yields a truthy result from `dict/cmu-with-stress-map`.
+
+  NOTE(review): `dict/cmu-with-stress-map` is presumably a lookup into
+  the CMU pronouncing dictionary - confirm in the dictionary namespace.
+  An empty token collection passes, because `every?` is vacuously true."
+  (letfn [(punctuation? [token]
+            (#{"." "?" ","} token))
+          (known-word? [token]
+            (dict/cmu-with-stress-map token))
+          (all-english? [tokens]
+            (every? #(or (punctuation? %) (known-word? %)) tokens))]
+    (filter all-english?)))
+
+(defn n-to-m-partitions
+  "Returns a lazy seq of every sliding window (step 1) over `partitions`
+  for each window size from `n` up to, but not including, `m` - like
+  `range`. Windows are grouped by size, smallest size first."
+  [n m partitions]
+  (for [size (range n m)
+        window (partition size 1 partitions)]
+    window))
+
+(defn new-key
+  "Registers `k` in `database` (an atom holding a map) under the next free id
+  and returns that id.
+
+  The map gains two entries, k -> id and id -> k, and its :next-id is
+  incremented. The id is read and consumed inside a single `swap!`, so two
+  concurrent callers can never be handed the same id."
+  [database k]
+  (let [updated (swap! database
+                       (fn [db]
+                         (let [id (:next-id db)]
+                           (-> db
+                               (assoc k id)
+                               (assoc id k)
+                               (update :next-id inc)))))]
+    ;; The id this call consumed is one less than the counter it left behind.
+    (dec (:next-id updated))))
+
+(defn make-database-processor
+  "Takes an atom `database` and returns a function of one trie key/value
+  pair `[k v]`, where `k` is a sequence of tokens.
+
+  The returned function replaces every token in `k` with its integer id:
+  the id already stored in the database when there is one, otherwise a
+  fresh id allocated with `new-key` (which also records the token in the
+  database). It returns `[ids 1]`; the incoming value is ignored.
+
+  The ids are computed eagerly, so the database has been updated by the
+  time the function returns."
+  [database]
+  (fn [[k _]]
+    ;; mapv rather than map: new-key mutates the atom, and side effects must
+    ;; not be deferred inside a lazy sequence.
+    (let [ids (mapv (fn [token]
+                      (or (get @database token)
+                          (new-key database token)))
+                    k)]
+      [ids 1])))
+
+(def encode-fn
+ "Encodes a number as a variable-length encoded value by delegating to
+ `encoding/encode`.
+ A nil argument is replaced with 0 (via `fnil`) before encoding, so
+ nil gets encoded as 0."
+ (fnil encoding/encode 0))
+
+(defn decode-fn
+  "Reads one number from `byte-buffer` with `encoding/decode`.
+  Returns the number, except that zero is reported as nil."
+  [byte-buffer]
+  (let [decoded (encoding/decode byte-buffer)]
+    (when-not (zero? decoded)
+      decoded)))
+
+(defn file-seq->trie
+  "Transduces text files into an n-to-m-gram trie of occurrence counts.
+
+  `database` is an atom that is updated to map integer ids to their string
+  representations and strings to their integer ids. `files` is a reducible
+  collection of things `slurp` can read. N-grams of every size from `n`
+  through `m` (inclusive) are counted; trie keys are sequences of ids.
+
+  Each file is split into sentences on runs of newlines, question marks,
+  and periods. Sentences are tokenized; sentences rejected by
+  `xf-filter-english` and sentences with no tokens are dropped. Each
+  remaining sentence is padded with one <s> at the start and (dec m) </s>
+  at the end.
+
+  Currently configured for a backwards trie, for generation starting from
+  the end of a sentence. To reconfigure, remove the
+  `(map (partial map reverse))` step and change the counts of the
+  beginning/end pads."
+  [database files n m]
+  (transduce
+   (comp
+    (map slurp)
+    ;; The quantifier belongs outside the character class; inside it, `+`
+    ;; is a literal plus sign and runs of delimiters are not collapsed.
+    (map #(string/split % #"[\n?.]+"))
+    (map (partial transduce xf-tokenize conj))
+    ;; `every?` is vacuously true for a sentence with no tokens, so blank
+    ;; sentences are removed explicitly; otherwise they would be padded and
+    ;; counted as n-grams made only of boundary markers.
+    (map (partial transduce (comp xf-filter-english (remove empty?)) conj))
+    ;; Skip files that ended up with no usable sentence at all.
+    (remove empty?)
+    ;; Distinct start/end markers keep the two sentence boundaries apart.
+    (map (partial transduce (xf-pad-tokens 1 "<s>" (dec m) "</s>") conj))
+    (map (partial map reverse))
+    ;; n-to-m-partitions excludes its upper bound, hence (inc m).
+    (mapcat (partial map (partial n-to-m-partitions n (inc m))))
+    (mapcat (partial into []))
+    (map #(clojure.lang.MapEntry. (vec %) %))
+    (map (make-database-processor database)))
+   (completing
+    (fn [trie [k _]]
+      (update trie k (fnil inc 0))))
+   (trie/make-trie)
+   files))
+
+(defn trie->tightly-packed-trie
+ "Builds a tightly packed trie from `trie` by delegating to
+ `tpt/tightly-packed-trie`, passing `encode-fn` and `decode-fn` through
+ unchanged.
+ NOTE(review): presumably the two functions (de)serialize node values,
+ as `encode-fn`/`decode-fn` in this namespace do - confirm against the
+ tightly-packed-trie library."
+ [trie encode-fn decode-fn]
+ (tpt/tightly-packed-trie trie encode-fn decode-fn))
+
+(comment
+  ;; REPL scratch: build a backwards 1-to-4-gram trie from the first 250000
+  ;; non-directory files under dark-corpus, pack it, and write both the
+  ;; packed trie and the token<->id database to /tmp.
+  ;; This goes through file-seq->trie instead of repeating its pipeline, so
+  ;; the two cannot drift apart. n=1, m=4 gives 3 end pads and n-gram
+  ;; sizes 1 through 4.
+  (time
+   (let [database (atom {:next-id 1})
+         files (sequence (xf-file-seq 0 250000)
+                         (file-seq (io/file "dark-corpus")))
+         trie (file-seq->trie database files 1 4)
+         packed (trie->tightly-packed-trie trie encode-fn decode-fn)]
+     (tpt/save-tightly-packed-trie-to-file "/tmp/tpt.bin" packed)
+     (nippy/freeze-to-file "/tmp/db.bin" @database)))
+
+  ;; REPL scratch: load both artifacts back and show the last 10 entries,
+  ;; with each key's ids translated back through the database.
+  (time
+   (let [database (nippy/thaw-from-file "/tmp/db.bin")
+         packed (tpt/load-tightly-packed-trie-from-file "/tmp/tpt.bin" decode-fn)]
+     (->> packed
+          (take-last 10)
+          (map (fn [[k v]] [k (map database k) v])))))
+
+  )