parent
6cf1236708
commit
dc5340be7f
@ -0,0 +1,161 @@
|
|||||||
|
(ns com.owoga.prhyme.data-transform
|
||||||
|
(:require [clojure.string :as string]
|
||||||
|
[clojure.java.io :as io]
|
||||||
|
[com.owoga.prhyme.data.dictionary :as dict]
|
||||||
|
[com.owoga.trie :as trie]
|
||||||
|
[com.owoga.tightly-packed-trie :as tpt]
|
||||||
|
[com.owoga.tightly-packed-trie.encoding :as encoding]
|
||||||
|
[taoensso.nippy :as nippy]))
|
||||||
|
|
||||||
|
(def re-word
|
||||||
|
"Regex for tokenizing a string into words
|
||||||
|
(including contractions and hyphenations),
|
||||||
|
commas, periods, and newlines."
|
||||||
|
#"(?s).*?([a-zA-Z\d]+(?:['\-]?[a-zA-Z]+)?|,|\.|\n)")
|
||||||
|
|
||||||
|
(defn xf-file-seq [start end]
|
||||||
|
(comp (remove #(.isDirectory %))
|
||||||
|
(drop start)
|
||||||
|
(take end)))
|
||||||
|
|
||||||
|
(defn make-token-padder
|
||||||
|
[beginning-token
|
||||||
|
end-token
|
||||||
|
number-of-beginning-tokens
|
||||||
|
number-of-end-tokens]
|
||||||
|
(let [beginning-pad (repeat number-of-beginning-tokens beginning-token)
|
||||||
|
end-pad (repeat number-of-end-tokens end-token)]
|
||||||
|
(fn [tokens]
|
||||||
|
(into (vec beginning-pad) (into tokens end-pad)))))
|
||||||
|
|
||||||
|
(defn xf-pad-tokens [num-beg beg-tok num-end end-tok]
|
||||||
|
(map (make-token-padder beg-tok end-tok num-beg num-end)))
|
||||||
|
|
||||||
|
(def xf-tokenize
|
||||||
|
(comp
|
||||||
|
(map string/trim)
|
||||||
|
(map (partial re-seq re-word))
|
||||||
|
(map (partial map second))
|
||||||
|
(map (partial mapv string/lower-case))))
|
||||||
|
|
||||||
|
(def xf-filter-english
|
||||||
|
(let [word? (fn [x] (or (#{"." "?" ","} x)
|
||||||
|
(dict/cmu-with-stress-map x)))]
|
||||||
|
(filter (partial every? word?))))
|
||||||
|
|
||||||
|
(defn n-to-m-partitions
|
||||||
|
"Exclusive of m, similar to range."
|
||||||
|
[n m partitions]
|
||||||
|
(mapcat
|
||||||
|
#(partition % 1 partitions)
|
||||||
|
(range n m)))
|
||||||
|
|
||||||
|
(defn new-key [database k]
|
||||||
|
(let [next-id (@database :next-id)]
|
||||||
|
(swap!
|
||||||
|
database
|
||||||
|
#(-> %
|
||||||
|
(assoc k next-id)
|
||||||
|
(assoc next-id k)
|
||||||
|
(update :next-id inc)))
|
||||||
|
next-id))
|
||||||
|
|
||||||
|
(defn make-database-processor
|
||||||
|
"Takes an atom and returns a function that takes a Trie key/value.
|
||||||
|
When the returned function is called, it checks to see
|
||||||
|
if the key is in the database and if so it returns the associated id.
|
||||||
|
If not, it increments the id (which is stored in the database
|
||||||
|
under :next-id) and returns that new id."
|
||||||
|
[database]
|
||||||
|
(fn [[k v]]
|
||||||
|
(let [k' (map (fn [kn]
|
||||||
|
(if-let [id (get @database kn)]
|
||||||
|
id
|
||||||
|
(new-key database kn)))
|
||||||
|
k)]
|
||||||
|
[k' 1])))
|
||||||
|
|
||||||
|
(def encode-fn
|
||||||
|
"Encodes a number as a variable-length encoded value.
|
||||||
|
nil gets encoded as 0."
|
||||||
|
(fnil encoding/encode 0))
|
||||||
|
|
||||||
|
(defn decode-fn
|
||||||
|
"Decodes a variable-length encoded number from a byte-buffer.
|
||||||
|
Zero gets decoded to nil."
|
||||||
|
[byte-buffer]
|
||||||
|
(let [value (encoding/decode byte-buffer)]
|
||||||
|
(if (zero? value)
|
||||||
|
nil
|
||||||
|
value)))
|
||||||
|
|
||||||
|
(defn file-seq->trie
|
||||||
|
"Transduces text files into an n-to-m-gram trie.
|
||||||
|
|
||||||
|
Takes an atom as a database and updates it to
|
||||||
|
map integer ids to their string representations and
|
||||||
|
strings to their integer ids.
|
||||||
|
|
||||||
|
Splits text on newline, question marks, and periods. Pads
|
||||||
|
each split with <s> and </s>.
|
||||||
|
|
||||||
|
Currently configured for backwards trie for generation starting from the
|
||||||
|
end of a sentence. To reconfigure, remove the map partial map reverse and change
|
||||||
|
the count of the beginning/end pads."
|
||||||
|
[database files n m]
|
||||||
|
(transduce
|
||||||
|
(comp
|
||||||
|
(map slurp)
|
||||||
|
(map #(string/split % #"[\n+\?\.]"))
|
||||||
|
(map (partial transduce xf-tokenize conj))
|
||||||
|
(map (partial transduce xf-filter-english conj))
|
||||||
|
(remove empty?)
|
||||||
|
(map (partial transduce (xf-pad-tokens 1 "<s>" (dec m) "</s>") conj))
|
||||||
|
(map (partial map reverse))
|
||||||
|
(mapcat (partial map (partial n-to-m-partitions n (inc m))))
|
||||||
|
(mapcat (partial into []))
|
||||||
|
(map #(clojure.lang.MapEntry. (vec %) %))
|
||||||
|
(map (make-database-processor database)))
|
||||||
|
(completing
|
||||||
|
(fn [trie [k v]]
|
||||||
|
(update trie k (fnil inc 0))))
|
||||||
|
(trie/make-trie)
|
||||||
|
files))
|
||||||
|
|
||||||
|
(defn trie->tightly-packed-trie
|
||||||
|
[trie encode-fn decode-fn]
|
||||||
|
(tpt/tightly-packed-trie trie encode-fn decode-fn))
|
||||||
|
|
||||||
|
(comment
|
||||||
|
(time
|
||||||
|
(let [database (atom {:next-id 1})
|
||||||
|
trie (transduce
|
||||||
|
(comp (xf-file-seq 0 250000)
|
||||||
|
(map slurp)
|
||||||
|
(map #(string/split % #"[\n+\?\.]"))
|
||||||
|
(map (partial transduce xf-tokenize conj))
|
||||||
|
(map (partial transduce xf-filter-english conj))
|
||||||
|
(remove empty?)
|
||||||
|
(map (partial transduce (xf-pad-tokens 1 "<s>" 3 "</s>") conj))
|
||||||
|
(map (partial map reverse))
|
||||||
|
(mapcat (partial map (partial n-to-m-partitions 1 5)))
|
||||||
|
(mapcat (partial into []))
|
||||||
|
(map #(clojure.lang.MapEntry. (vec %) %))
|
||||||
|
(map (make-database-processor database)))
|
||||||
|
(completing
|
||||||
|
(fn [trie [k v]]
|
||||||
|
(update trie k (fnil inc 0))))
|
||||||
|
(trie/make-trie)
|
||||||
|
(file-seq (io/file "dark-corpus")))
|
||||||
|
tpt (tpt/tightly-packed-trie trie encode-fn decode-fn)]
|
||||||
|
(tpt/save-tightly-packed-trie-to-file "/tmp/tpt.bin" tpt)
|
||||||
|
(nippy/freeze-to-file "/tmp/db.bin" @database)))
|
||||||
|
|
||||||
|
(time
|
||||||
|
(let [database (nippy/thaw-from-file "/tmp/db.bin")
|
||||||
|
tpt (tpt/load-tightly-packed-trie-from-file "/tmp/tpt.bin" decode-fn)]
|
||||||
|
(->> tpt
|
||||||
|
(take-last 10)
|
||||||
|
(map (fn [[k v]] [k (map database k) v])))))
|
||||||
|
|
||||||
|
)
|
Loading…
Reference in New Issue