You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

278 lines
7.9 KiB
Clojure

(ns examples.tpt
(:require [clojure.string :as string]
[clojure.java.io :as io]
[com.owoga.trie :as trie]
[com.owoga.tightly-packed-trie :as tpt]
[com.owoga.trie.math :as math]
[com.owoga.tightly-packed-trie.encoding :as encoding]
[com.owoga.prhyme.util :as util]
[com.owoga.prhyme.data.dictionary :as dict]
[clojure.zip :as zip]))
(defn dark-corpus-file-seq [start end]
(let [documents (->> "dark-corpus"
io/file
file-seq
(remove #(.isDirectory %))
(drop start)
(take end))]
documents))
(defn remove-sentences-with-words-not-in-dictionary
"This gets rid of a lot of good words. All contractions for example... I'll, They'll...
possessives like morning's...
Might not end up using it."
[dictionary]
(let [dictionary (into #{} dictionary)]
(fn [sentences]
(->> sentences
(map #(string/split % #" "))
(remove #(some (complement dictionary) %))
(remove #(some string/blank? %))
(map #(string/join " " %))))))
(def re-word
"Regex for tokenizing a string into words
(including contractions and hyphenations),
commas, periods, and newlines."
#"(?s).*?([a-zA-Z\d]+(?:['\-]?[a-zA-Z]+)?|,|\.|\n)")
(defn pad-tokens
"Pads the beginning with n - 1 <s> tokens and
the end with 1 </s> token."
[tokens n]
(concat (repeat (max 1 (dec n)) "<s>") tokens ["</s>"]))
(defn tokenize-line
[line]
(->> line
(string/trim)
(re-seq re-word)
(map second)
(map string/lower-case)))
(defn process-files-for-trie
"Expects an entire song, lines seperated by \n."
[files]
(->> files
(map slurp)
(filter dict/english?)
(map util/clean-text)
(map #(string/split % #"\n+"))
(map (remove-sentences-with-words-not-in-dictionary dict/popular))
(remove empty?)
(remove #(some empty? %))
(map (fn [lines]
(map tokenize-line lines)))
(map (fn [lines]
(map #(pad-tokens % 1) lines)))
(map (fn [lines]
(map #(partition 2 1 %) lines)))))
(defn text->ngrams
"Takes text from a file, including newlines.
Pads lines with <s> and </s> for start/end of line.
Pads beginning with n - 1 <s>s"
[text n]
(let [words-not-in-dict-filter (remove-sentences-with-words-not-in-dictionary dict/popular)]
(->> text
util/clean-text
(#(string/split % #"\n+"))
(remove empty?)
(map tokenize-line)
(map #(pad-tokens % n))
(map #(partition n 1 %))
(apply concat))))
(defn n-to-m-grams
"Exclusive of m, similar to range."
[n m text]
(loop [i n
r '()]
(cond
(= i m)
(apply concat r)
:else
(recur (inc i) (cons (text->ngrams text i) r)))))
(defn prep-ngram-for-trie
"The tpt/trie expects values conjed into an ngram
to be of format '(k1 k2 k3 value)."
[ngram]
(clojure.lang.MapEntry. (vec ngram) ngram))
(defn create-trie-from-texts [texts]
(->> texts
(map #(n-to-m-grams 1 5 %))
(apply concat)
(map prep-ngram-for-trie)
(reduce
(fn [trie [k v]]
(let [existing (or (get trie k) {:count 0 :value (last v)})]
(conj trie [k (update existing :count inc)])))
(trie/make-trie))))
(defn seq-of-nodes->sorted-by-count
"Sorted first by the rank of the ngram, lowest ranks first.
Sorted second by the frequency of the ngram, highest frequencies first.
This is the order that you'd populate a mapping of keys to IDs."
[trie]
(->> trie
trie/children
(map #(get % []))
(sort-by :count)
reverse))
(defn trie->database [trie]
(let [sorted-keys
(seq-of-nodes->sorted-by-count trie)]
(loop [sorted-keys sorted-keys
database {}
i 1]
(if (empty? sorted-keys)
database
(recur
(rest sorted-keys)
(-> database
(assoc i {:count (:count (first sorted-keys))
:value (:value (first sorted-keys))})
(assoc (:value (first sorted-keys)) i))
(inc i))))))
(def trie
(let [texts (->> (dark-corpus-file-seq 0 1000)
(map slurp))]
(create-trie-from-texts texts)))
(def trie-database
(trie->database trie))
(defn encode-fn [v]
(let [{:keys [count value]} v]
(if (and (number? v) (not (zero? v)))
(byte-array
(concat (encoding/encode (trie-database value))
(encoding/encode count)))
(encoding/encode 0))))
(defn decode-fn [byte-buffer]
(let [v (encoding/decode byte-buffer)]
(if (and (number? v) (zero? v))
nil
(trie-database v))))
(comment
(def tight-ready-trie
(->> trie
(map (fn [[k v]]
(let [k (map #(get trie-database %) k)]
[k v])))
(into (trie/make-trie))))
)
(def tightly-packed-trie
(let [tight-ready-trie
(->> trie
(map (fn [[k v]]
(let [k (map #(get trie-database %) k)]
[k v])))
(into (trie/make-trie)))]
(tpt/tightly-packed-trie
tight-ready-trie
encode-fn
decode-fn)))
(defn key-get-in-tpt [tpt db ks]
(let [id (map #(get-in db [(list %) :id]) ks)
v (get tpt id)]
(println id ks)
{id v}))
(defn id-get-in-tpt [tpt db ids]
(let [ks (apply concat (map #(get db %) ids))
v (get tpt ids)
id (get-in db [ks :id])]
{ks (assoc v :value (get db id))}))
(comment
(def example-story
(loop [generated-text [(get trie-database "<s>")]
i 0]
(if (> i 100)
generated-text
(let [node (loop [i 3]
(let [node (trie/lookup
tightly-packed-trie
(vec (take-last i generated-text)))]
(cond
(nil? node) (recur (dec i))
(< i 0) (throw (Exception. "Error"))
(seq (trie/children node)) node
:else (recur (dec i)))))]
(recur
(conj
generated-text
(->> node
trie/children
(map #(get % []))
(remove nil?)
(math/weighted-selection :count)
:value
(get trie-database)))
(inc i))))))
(->> example-story
(map #(get-in trie-database [% :value]))
(concat)
(string/join " ")
(#(string/replace % #" ([\.,\?])" "$1"))
((fn [txt]
(string/replace txt #"(^|\. |\? )([a-z])" (fn [[a b c]]
(str b (.toUpperCase c)))))))
(key-get-in-tpt
tightly-packed-trie
trie-database
'("<s>" "<s>" "the"))
;; => {(2 2 3) {:value 3263, :count 462}}
(id-get-in-tpt
tightly-packed-trie
trie-database
'(2 2 3))
;; => {("<s>" "<s>" "the") {:value ("<s>" "<s>" "the"), :count 462}}
)
(comment
;; database
(let [texts (->> (dark-corpus-file-seq 500 2)
(map slurp))
trie (create-trie-from-texts texts)]
(->> (trie->database trie)
(#(get % 3))))
(let [texts (->> (dark-corpus-file-seq 500 2)
(map slurp))
trie (create-trie-from-texts texts)]
(tpt/as-map (transform-trie->ids trie)))
(let [texts (->> (dark-corpus-file-seq 500 2)
(map slurp))
trie (create-trie-from-texts texts)
tightly-packed-trie (tpt/tightly-packed-trie
(transform-trie->ids trie))]
(get tightly-packed-trie '(2 2 3)))
(let [texts (->> (dark-corpus-file-seq 500 2)
(map slurp))
trie (create-trie-from-texts texts)]
(tpt/as-map trie))
(let [text (slurp (first (dark-corpus-file-seq 500 1)))]
(->> text
util/clean-text
(#(string/split % #"\n+"))))
)