From dc5340be7f8bd3fe95491d18ebb3085ce177ddae Mon Sep 17 00:00:00 2001
From: Eric Ihli <eihli@owoga.com>
Date: Mon, 26 Apr 2021 10:06:43 -0500
Subject: [PATCH] Add package for data transform

Transforms text files to tries.
---
 src/com/owoga/prhyme/data_transform.clj | 161 ++++++++++++++++++++++++
 1 file changed, 161 insertions(+)
 create mode 100644 src/com/owoga/prhyme/data_transform.clj
diff --git a/src/com/owoga/prhyme/data_transform.clj b/src/com/owoga/prhyme/data_transform.clj
new file mode 100644
index 0000000..378434e
--- /dev/null
+++ b/src/com/owoga/prhyme/data_transform.clj
@@ -0,0 +1,161 @@
+(ns com.owoga.prhyme.data-transform
+  (:require [clojure.string :as string]
+            [clojure.java.io :as io]
+            [com.owoga.prhyme.data.dictionary :as dict]
+            [com.owoga.trie :as trie]
+            [com.owoga.tightly-packed-trie :as tpt]
+            [com.owoga.tightly-packed-trie.encoding :as encoding]
+            [taoensso.nippy :as nippy]))
+
+(def re-word
+  "Regex whose capture group 1 is the next token in a string: a word (letters
+  or digits, optionally followed by an apostrophe or hyphen and more letters),
+  a comma, a period, or a newline. Text before the token is skipped lazily."
+  #"(?s).*?([a-zA-Z\d]+(?:['\-]?[a-zA-Z]+)?|,|\.|\n)")
+
+(defn xf-file-seq [start end]
+  ;; Skip directories, then keep the files at positions [start, end): `take`
+  ;; wants a count, so the window size is (- end start), not `end` itself.
+  (comp (remove #(.isDirectory %)) (drop start) (take (- end start))))
+
+(defn make-token-padder
+  "Returns a fn that wraps a token collection in padding, yielding a vector of
+  `number-of-beginning-tokens` copies of `beginning-token`, then the tokens in
+  their original order, then `number-of-end-tokens` copies of `end-token`."
+  [beginning-token end-token number-of-beginning-tokens number-of-end-tokens]
+  (let [beginning-pad (vec (repeat number-of-beginning-tokens beginning-token))
+        end-pad (repeat number-of-end-tokens end-token)]
+    ;; Append onto a vector at each step so list/seq inputs keep their order.
+    (fn [tokens] (-> beginning-pad (into tokens) (into end-pad)))))
+
+(defn xf-pad-tokens [num-beg beg-tok num-end end-tok] ; => transducer that pads each token coll
+  (let [pad (make-token-padder beg-tok end-tok num-beg num-end)] (map pad)))
+
+(def xf-tokenize
+  "Transducer: trims each string, then emits a vector of its lower-cased tokens
+  (capture group 1 of every `re-word` match)."
+  (comp (map string/trim)
+        (map #(re-seq re-word %))
+        (map #(mapv (comp string/lower-case second) %))))
+
+(def xf-filter-english
+  ;; Keep a token coll only if each token is ".", "?", "," or has a truthy dictionary lookup.
+  (filter (fn [tokens]
+            (every? #(or (#{"." "?" ","} %) (dict/cmu-with-stress-map %)) tokens))))
+
+(defn n-to-m-partitions
+  "Returns every sliding window (step 1) of `partitions` for each window size
+  from n up to, but not including, m -- smaller sizes first."
+  [n m partitions]
+  (for [size (range n m), window (partition size 1 partitions)]
+    window))
+
+(defn new-key
+  "Registers `k` in the `database` atom under the id held at :next-id, storing
+  k->id and id->k and incrementing :next-id, all inside one `swap!`. Returns
+  the id of k; a key that is already present keeps the id it has."
+  [database k]
+  (let [register #(let [id (:next-id %)]
+                    (-> % (assoc k id) (assoc id k) (update :next-id inc)))
+        db (swap! database #(if (contains? % k) % (register %)))]
+    (get db k)))
+
+(defn make-database-processor
+  "Takes an atom and returns a function that takes a Trie key/value pair
+  whose key is a sequence of tokens.
+  When the returned function is called, it replaces each token with the id
+  stored for it in the database, or, for a token with no stored id, with the
+  id returned by `new-key`. It returns [ids 1]; the incoming value is ignored.
+  Ids are resolved eagerly, so the database is up to date when the call returns."
+  [database]
+  (let [token->id (fn [token]
+                    (or (get @database token)
+                        (new-key database token)))]
+    (fn [[k _]]
+      ;; mapv, not map: a lazy seq would defer the database writes to its consumer.
+      [(mapv token->id k) 1])))
+
+(def encode-fn
+  "Encodes a number as a variable-length encoded value via `encoding/encode`.
+  nil gets encoded as 0: `fnil` swaps a nil first argument for 0 before the call."
+  (fnil encoding/encode 0))
+
+(defn decode-fn
+  "Reads one variable-length encoded number from `byte-buffer` via
+  `encoding/decode`. A decoded zero is reported as nil, mirroring
+  `encode-fn`, which writes nil as 0."
+  [byte-buffer]
+  (let [n (encoding/decode byte-buffer)]
+    (when-not (zero? n)
+      n)))
+
+(defn file-seq->trie
+  "Transduces text files into a trie counting every n-gram of size n through
+  m, inclusive.
+
+  Takes an atom as a database and updates it to map integer ids to their
+  string representations and strings to their integer ids; trie keys are
+  sequences of those ids.
+
+  Splits text on newlines, question marks, and periods. Pads each split with
+  one <s> in front and (dec m) </s> behind.
+
+  Currently configured for a backwards trie, for generation starting from the
+  end of a sentence. To reconfigure, remove the `(map (partial map reverse))`
+  step and change the counts of the beginning/end pads."
+  [database files n m]
+  (transduce
+   (comp
+    (map slurp)
+    ;; `+` is left out of the class: inside [] it matches a literal plus sign.
+    (map #(string/split % #"[\n\?\.]"))
+    (map (partial transduce xf-tokenize conj))
+    (map (partial transduce xf-filter-english conj))
+    (remove empty?)
+    (map (partial transduce (xf-pad-tokens 1 "<s>" (dec m) "</s>") conj))
+    (map (partial map reverse))
+    (mapcat (partial map (partial n-to-m-partitions n (inc m))))
+    (mapcat (partial into []))
+    (map #(clojure.lang.MapEntry. (vec %) %))
+    (map (make-database-processor database)))
+   (completing (fn [trie [k _]] (update trie k (fnil inc 0))))
+   (trie/make-trie)
+   files))
+
+(defn trie->tightly-packed-trie ; delegates to tpt/tightly-packed-trie; params shadow the top-level encode-fn/decode-fn
+  [trie encode-fn decode-fn]
+  (tpt/tightly-packed-trie trie encode-fn decode-fn))
+
+(comment ; REPL scratch: the forms below are not evaluated when the namespace loads
+  (time ; build a trie from files under "dark-corpus", pack it, write trie and database to /tmp
+   (let [database (atom {:next-id 1})
+         trie (transduce
+               (comp (xf-file-seq 0 250000)
+                     (map slurp)
+                     (map #(string/split % #"[\n+\?\.]")) ; NOTE(review): `+` inside [] is a literal plus sign
+                     (map (partial transduce xf-tokenize conj))
+                     (map (partial transduce xf-filter-english conj))
+                     (remove empty?)
+                     (map (partial transduce (xf-pad-tokens 1 "<s>" 3 "</s>") conj))
+                     (map (partial map reverse))
+                     (mapcat (partial map (partial n-to-m-partitions 1 5))) ; n-gram sizes 1 through 4; 5 is exclusive
+                     (mapcat (partial into []))
+                     (map #(clojure.lang.MapEntry. (vec %) %))
+                     (map (make-database-processor database)))
+               (completing
+                (fn [trie [k v]]
+                  (update trie k (fnil inc 0))))
+               (trie/make-trie)
+               (file-seq (io/file "dark-corpus")))
+         tpt (tpt/tightly-packed-trie trie encode-fn decode-fn)]
+     (tpt/save-tightly-packed-trie-to-file "/tmp/tpt.bin" tpt)
+     (nippy/freeze-to-file "/tmp/db.bin" @database)))
+
+  (time ; reload both files and list the last 10 trie entries with each key's ids looked up in the database
+   (let [database (nippy/thaw-from-file "/tmp/db.bin")
+         tpt (tpt/load-tightly-packed-trie-from-file "/tmp/tpt.bin" decode-fn)]
+     (->> tpt
+          (take-last 10)
+          (map (fn [[k v]] [k (map database k) v])))))
+
+  )