Add tightly-packed-trie example
parent
ded92cc0a2
commit
f66210d7dd
@ -0,0 +1,251 @@
|
||||
(ns examples.tpt
|
||||
(:require [clojure.string :as string]
|
||||
[clojure.java.io :as io]
|
||||
[com.owoga.tightly-packed-trie.core :as tpt]
|
||||
[com.owoga.prhyme.util :as util]
|
||||
[com.owoga.prhyme.data.dictionary :as dict]
|
||||
[clojure.zip :as zip]))
|
||||
|
||||
(defn dark-corpus-file-seq [start end]
|
||||
(let [documents (->> "dark-corpus"
|
||||
io/file
|
||||
file-seq
|
||||
(remove #(.isDirectory %))
|
||||
(drop start)
|
||||
(take end))]
|
||||
documents))
|
||||
|
||||
(defn remove-sentences-with-words-not-in-dictionary
|
||||
"This gets rid of a lot of good words. All contractions for example... I'll, They'll...
|
||||
possessives like morning's...
|
||||
|
||||
Might not end up using it."
|
||||
[dictionary]
|
||||
(let [dictionary (into #{} dictionary)]
|
||||
(fn [sentences]
|
||||
(->> sentences
|
||||
(map #(string/split % #" "))
|
||||
(remove #(some (complement dictionary) %))
|
||||
(remove #(some string/blank? %))
|
||||
(map #(string/join " " %))))))
|
||||
|
||||
(def re-word
|
||||
"Regex for tokenizing a string into words
|
||||
(including contractions and hyphenations),
|
||||
commas, periods, and newlines."
|
||||
#"(?s).*?([a-zA-Z\d]+(?:['\-]?[a-zA-Z]+)?|,|\.|\n)")
|
||||
|
||||
(defn pad-tokens
|
||||
"Pads the beginning with n - 1 <s> tokens and
|
||||
the end with 1 </s> token."
|
||||
[tokens n]
|
||||
(concat (repeat (max 1 (dec n)) "<s>") tokens ["</s>"]))
|
||||
|
||||
(defn tokenize-line
|
||||
[line]
|
||||
(->> line
|
||||
(string/trim)
|
||||
(re-seq re-word)
|
||||
(map second)
|
||||
(map string/lower-case)))
|
||||
|
||||
(def database (atom {}))
|
||||
|
||||
(defn process-files-for-trie
|
||||
"Expects an entire song, lines seperated by \n."
|
||||
[files]
|
||||
(->> files
|
||||
(map slurp)
|
||||
(filter dict/english?)
|
||||
(map util/clean-text)
|
||||
(map #(string/split % #"\n+"))
|
||||
(map (remove-sentences-with-words-not-in-dictionary dict/popular))
|
||||
(remove empty?)
|
||||
(remove #(some empty? %))
|
||||
(map (fn [lines]
|
||||
(map tokenize-line lines)))
|
||||
(map (fn [lines]
|
||||
(map #(pad-tokens % 1) lines)))
|
||||
(map (fn [lines]
|
||||
(map #(partition 2 1 %) lines)))))
|
||||
|
||||
(defn text->ngrams
|
||||
"Takes text from a file, including newlines.
|
||||
Pads lines with <s> and </s> for start/end of line.
|
||||
Pads beginning with n - 1 <s>s"
|
||||
[text n]
|
||||
(let [words-not-in-dict-filter (remove-sentences-with-words-not-in-dictionary dict/popular)]
|
||||
(->> text
|
||||
util/clean-text
|
||||
(#(string/split % #"\n+"))
|
||||
(remove empty?)
|
||||
(map tokenize-line)
|
||||
(map #(pad-tokens % n))
|
||||
(map #(partition n 1 %))
|
||||
(apply concat))))
|
||||
|
||||
(defn make-trie
|
||||
([] (tpt/->Trie
|
||||
(fn update-fn [prev cur]
|
||||
(if (nil? prev)
|
||||
{:value (last cur)
|
||||
:count 1}
|
||||
(-> prev
|
||||
(update :count (fnil inc 0))
|
||||
(assoc :value (last cur)))))
|
||||
(sorted-map)))
|
||||
([& ks]
|
||||
(reduce
|
||||
(fn [t k]
|
||||
(conj t k))
|
||||
(make-trie)
|
||||
ks)))
|
||||
|
||||
(defn n-to-m-grams
|
||||
"Exclusive of m, similar to range."
|
||||
[n m text]
|
||||
(loop [i n
|
||||
r '()]
|
||||
(cond
|
||||
(= i m)
|
||||
(apply concat r)
|
||||
:else
|
||||
(recur (inc i) (cons (text->ngrams text i) r)))))
|
||||
|
||||
|
||||
(defn prep-ngram-for-trie
|
||||
"The tpt/trie expects values conjed into an ngram
|
||||
to be of format '(k1 k2 k3 value)."
|
||||
[ngram]
|
||||
(concat ngram (list ngram)))
|
||||
|
||||
(defn create-trie-from-texts [texts]
|
||||
(->> texts
|
||||
(map #(n-to-m-grams 1 4 %))
|
||||
(apply concat)
|
||||
(map prep-ngram-for-trie)
|
||||
(apply make-trie)))
|
||||
|
||||
(defn trie->seq-of-nodes [trie]
|
||||
(->> trie
|
||||
tpt/as-vec
|
||||
zip/vector-zip
|
||||
(iterate zip/next)
|
||||
(take-while (complement zip/end?))
|
||||
(map zip/node)
|
||||
(filter map?)))
|
||||
|
||||
(defn seq-of-nodes->sorted-by-count
|
||||
"Sorted first by the rank of the ngram, lowest ranks first.
|
||||
Sorted second by the frequency of the ngram, highest frequencies first.
|
||||
This is the order that you'd populate a mapping of keys to IDs."
|
||||
[nodes]
|
||||
(->> nodes
|
||||
(map (comp first seq))
|
||||
(map (fn [[k v]]
|
||||
(vector (:value v) (:count v))))
|
||||
;; root node and padded starts
|
||||
(remove (comp nil? second))
|
||||
(sort-by #(vector (count (first %))
|
||||
(- (second %))))))
|
||||
|
||||
(defn trie->database [trie]
|
||||
(let [sorted-keys
|
||||
(->> (trie->seq-of-nodes trie)
|
||||
seq-of-nodes->sorted-by-count)]
|
||||
(loop [sorted-keys sorted-keys
|
||||
database {}
|
||||
i 1]
|
||||
(if (empty? sorted-keys)
|
||||
database
|
||||
(recur
|
||||
(rest sorted-keys)
|
||||
(-> database
|
||||
(assoc (first (first sorted-keys))
|
||||
{:count (second (first sorted-keys))
|
||||
:id i})
|
||||
(assoc i (first (first sorted-keys))))
|
||||
(inc i))))))
|
||||
|
||||
(defn transform-trie->ids [trie database]
|
||||
(let [transform-p #(map? (zip/node %))
|
||||
transform-f
|
||||
(fn tf [loc]
|
||||
(zip/edit
|
||||
loc
|
||||
(fn [node]
|
||||
(let [[k v] (first (seq node))]
|
||||
{(get-in database [(list k) :id] (if (= k :root) :root))
|
||||
(assoc v :value (get-in database [(list k) :count] 0))}))))]
|
||||
(tpt/transform trie (tpt/visitor-filter transform-p transform-f))))
|
||||
|
||||
(defonce trie
|
||||
(let [texts (->> (dark-corpus-file-seq 500 500)
|
||||
(map slurp))]
|
||||
(create-trie-from-texts texts)))
|
||||
|
||||
(defonce trie-database
|
||||
(trie->database trie))
|
||||
|
||||
(def tightly-packed-trie
|
||||
(let [trie-with-ids (transform-trie->ids trie trie-database)
|
||||
tightly-packed-trie (tpt/tightly-packed-trie trie-with-ids)]
|
||||
tightly-packed-trie))
|
||||
|
||||
(defn key-get-in-tpt [tpt db ks]
|
||||
(let [id (map #(get-in db [(list %) :id]) ks)
|
||||
v (get tpt id)]
|
||||
(println id ks)
|
||||
{id v}))
|
||||
|
||||
(defn id-get-in-tpt [tpt db ids]
|
||||
(let [ks (apply concat (map #(get db %) ids))
|
||||
v (get tpt ids)
|
||||
id (get-in db [ks :id])]
|
||||
{ks (assoc v :value (get db id))}))
|
||||
|
||||
(comment
|
||||
(key-get-in-tpt
|
||||
tightly-packed-trie
|
||||
trie-database
|
||||
'("<s>" "<s>" "the"))
|
||||
;; => {(2 2 3) {:value 3263, :count 462}}
|
||||
(id-get-in-tpt
|
||||
tightly-packed-trie
|
||||
trie-database
|
||||
'(2 2 3))
|
||||
;; => {("<s>" "<s>" "the") {:value ("<s>" "<s>" "the"), :count 462}}
|
||||
)
|
||||
|
||||
(comment
|
||||
;; database
|
||||
(let [texts (->> (dark-corpus-file-seq 500 2)
|
||||
(map slurp))
|
||||
trie (create-trie-from-texts texts)]
|
||||
(->> (trie->database trie)
|
||||
(#(get % 3))))
|
||||
|
||||
(let [texts (->> (dark-corpus-file-seq 500 2)
|
||||
(map slurp))
|
||||
trie (create-trie-from-texts texts)]
|
||||
(tpt/as-map (transform-trie->ids trie)))
|
||||
|
||||
(let [texts (->> (dark-corpus-file-seq 500 2)
|
||||
(map slurp))
|
||||
trie (create-trie-from-texts texts)
|
||||
tightly-packed-trie (tpt/tightly-packed-trie
|
||||
(transform-trie->ids trie))]
|
||||
(get tightly-packed-trie '(2 2 3)))
|
||||
|
||||
|
||||
(let [texts (->> (dark-corpus-file-seq 500 2)
|
||||
(map slurp))
|
||||
trie (create-trie-from-texts texts)]
|
||||
(tpt/as-map trie))
|
||||
|
||||
(let [text (slurp (first (dark-corpus-file-seq 500 1)))]
|
||||
(->> text
|
||||
util/clean-text
|
||||
(#(string/split % #"\n+"))))
|
||||
|
||||
)
|
Loading…
Reference in New Issue