|
|
@ -1,13 +1,17 @@
|
|
|
|
(ns examples.tpt
|
|
|
|
(ns examples.tpt
|
|
|
|
(:require [clojure.string :as string]
|
|
|
|
(:require [clojure.string :as string]
|
|
|
|
[clojure.java.io :as io]
|
|
|
|
[clojure.java.io :as io]
|
|
|
|
|
|
|
|
[taoensso.tufte :as tufte :refer (defnp p profiled profile)]
|
|
|
|
[com.owoga.trie :as trie]
|
|
|
|
[com.owoga.trie :as trie]
|
|
|
|
[com.owoga.tightly-packed-trie :as tpt]
|
|
|
|
[com.owoga.tightly-packed-trie :as tpt]
|
|
|
|
[com.owoga.trie.math :as math]
|
|
|
|
[com.owoga.trie.math :as math]
|
|
|
|
[com.owoga.tightly-packed-trie.encoding :as encoding]
|
|
|
|
[com.owoga.tightly-packed-trie.encoding :as encoding]
|
|
|
|
[com.owoga.prhyme.util :as util]
|
|
|
|
[com.owoga.prhyme.util :as util]
|
|
|
|
[com.owoga.prhyme.data.dictionary :as dict]
|
|
|
|
[com.owoga.prhyme.data.dictionary :as dict]
|
|
|
|
[clojure.zip :as zip]))
|
|
|
|
[clojure.zip :as zip]
|
|
|
|
|
|
|
|
[cljol.dig9 :as d]))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
(tufte/add-basic-println-handler! {})
|
|
|
|
|
|
|
|
|
|
|
|
(defn dark-corpus-file-seq [start end]
|
|
|
|
(defn dark-corpus-file-seq [start end]
|
|
|
|
(let [documents (->> "dark-corpus"
|
|
|
|
(let [documents (->> "dark-corpus"
|
|
|
@ -108,10 +112,24 @@
|
|
|
|
(apply concat)
|
|
|
|
(apply concat)
|
|
|
|
(map prep-ngram-for-trie)
|
|
|
|
(map prep-ngram-for-trie)
|
|
|
|
(reduce
|
|
|
|
(reduce
|
|
|
|
(fn [trie [k v]]
|
|
|
|
(fn [[trie i db] [k v]]
|
|
|
|
(let [existing (or (get trie k) {:count 0 :value (last v)})]
|
|
|
|
(let [[db i] (reduce
|
|
|
|
(conj trie [k (update existing :count inc)])))
|
|
|
|
(fn [[db i] k]
|
|
|
|
(trie/make-trie))))
|
|
|
|
(let [k' (get db k i)
|
|
|
|
|
|
|
|
i' (if (= i k') (inc i) i)
|
|
|
|
|
|
|
|
db' (-> db
|
|
|
|
|
|
|
|
(assoc k' k)
|
|
|
|
|
|
|
|
(assoc k k'))]
|
|
|
|
|
|
|
|
[db' i']))
|
|
|
|
|
|
|
|
[db i]
|
|
|
|
|
|
|
|
k)]
|
|
|
|
|
|
|
|
(let [k' (map #(get db %) k)]
|
|
|
|
|
|
|
|
(if-let [existing (get trie k')]
|
|
|
|
|
|
|
|
(let [[val count] existing
|
|
|
|
|
|
|
|
trie (assoc trie k' [val (inc count)])]
|
|
|
|
|
|
|
|
[trie i db])
|
|
|
|
|
|
|
|
[(assoc trie k' [i 1]) i db]))))
|
|
|
|
|
|
|
|
[(trie/make-trie) 1 {}])))
|
|
|
|
|
|
|
|
|
|
|
|
(defn seq-of-nodes->sorted-by-count
|
|
|
|
(defn seq-of-nodes->sorted-by-count
|
|
|
|
"Sorted first by the rank of the ngram, lowest ranks first.
|
|
|
|
"Sorted first by the rank of the ngram, lowest ranks first.
|
|
|
@ -124,69 +142,39 @@
|
|
|
|
(sort-by :count)
|
|
|
|
(sort-by :count)
|
|
|
|
reverse))
|
|
|
|
reverse))
|
|
|
|
|
|
|
|
|
|
|
|
(defn trie->database [trie]
|
|
|
|
(time
|
|
|
|
(let [sorted-keys
|
|
|
|
(let [texts (->> (dark-corpus-file-seq 0 250000)
|
|
|
|
(seq-of-nodes->sorted-by-count trie)]
|
|
|
|
(map slurp))
|
|
|
|
(loop [sorted-keys sorted-keys
|
|
|
|
[trie _ db] (create-trie-from-texts texts)]
|
|
|
|
database {}
|
|
|
|
(def trie trie)
|
|
|
|
i 1]
|
|
|
|
(def trie-database db)))
|
|
|
|
(if (empty? sorted-keys)
|
|
|
|
|
|
|
|
database
|
|
|
|
|
|
|
|
(recur
|
|
|
|
|
|
|
|
(rest sorted-keys)
|
|
|
|
|
|
|
|
(-> database
|
|
|
|
|
|
|
|
(assoc i {:count (:count (first sorted-keys))
|
|
|
|
|
|
|
|
:value (:value (first sorted-keys))})
|
|
|
|
|
|
|
|
(assoc (:value (first sorted-keys)) i))
|
|
|
|
|
|
|
|
(inc i))))))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
(def trie
|
|
|
|
|
|
|
|
(let [texts (->> (dark-corpus-file-seq 0 1000)
|
|
|
|
|
|
|
|
(map slurp))]
|
|
|
|
|
|
|
|
(create-trie-from-texts texts)))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
(def trie-database
|
|
|
|
|
|
|
|
(trie->database trie))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
(defn encode-fn [v]
|
|
|
|
(defn encode-fn [v]
|
|
|
|
(let [{:keys [count value]} v]
|
|
|
|
(let [[value count] (if (seqable? v) v [nil nil])]
|
|
|
|
(if (and (number? v) (not (zero? v)))
|
|
|
|
(if (nil? value)
|
|
|
|
|
|
|
|
(encoding/encode 0)
|
|
|
|
(byte-array
|
|
|
|
(byte-array
|
|
|
|
(concat (encoding/encode (trie-database value))
|
|
|
|
(concat (encoding/encode value)
|
|
|
|
(encoding/encode count)))
|
|
|
|
(encoding/encode count))))))
|
|
|
|
(encoding/encode 0))))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
(defn decode-fn [byte-buffer]
|
|
|
|
(defn decode-fn [byte-buffer]
|
|
|
|
(let [v (encoding/decode byte-buffer)]
|
|
|
|
(let [value (encoding/decode byte-buffer)]
|
|
|
|
(if (and (number? v) (zero? v))
|
|
|
|
(if (zero? value)
|
|
|
|
nil
|
|
|
|
[nil nil]
|
|
|
|
(trie-database v))))
|
|
|
|
[value (encoding/decode byte-buffer)])))
|
|
|
|
|
|
|
|
|
|
|
|
(comment
|
|
|
|
|
|
|
|
(def tight-ready-trie
|
|
|
|
|
|
|
|
(->> trie
|
|
|
|
|
|
|
|
(map (fn [[k v]]
|
|
|
|
|
|
|
|
(let [k (map #(get trie-database %) k)]
|
|
|
|
|
|
|
|
[k v])))
|
|
|
|
|
|
|
|
(into (trie/make-trie))))
|
|
|
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
(time
|
|
|
|
(def tightly-packed-trie
|
|
|
|
(def tightly-packed-trie
|
|
|
|
(let [tight-ready-trie
|
|
|
|
|
|
|
|
(->> trie
|
|
|
|
|
|
|
|
(map (fn [[k v]]
|
|
|
|
|
|
|
|
(let [k (map #(get trie-database %) k)]
|
|
|
|
|
|
|
|
[k v])))
|
|
|
|
|
|
|
|
(into (trie/make-trie)))]
|
|
|
|
|
|
|
|
(tpt/tightly-packed-trie
|
|
|
|
(tpt/tightly-packed-trie
|
|
|
|
tight-ready-trie
|
|
|
|
trie
|
|
|
|
encode-fn
|
|
|
|
encode-fn
|
|
|
|
decode-fn)))
|
|
|
|
decode-fn)))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
(take 20 tightly-packed-trie)
|
|
|
|
|
|
|
|
|
|
|
|
(defn key-get-in-tpt [tpt db ks]
|
|
|
|
(defn key-get-in-tpt [tpt db ks]
|
|
|
|
(let [id (map #(get-in db [(list %) :id]) ks)
|
|
|
|
(let [id (map #(get-in db [(list %) :id]) ks)
|
|
|
|
v (get tpt id)]
|
|
|
|
v (get tpt id)]
|
|
|
|
(println id ks)
|
|
|
|
|
|
|
|
{id v}))
|
|
|
|
{id v}))
|
|
|
|
|
|
|
|
|
|
|
|
(defn id-get-in-tpt [tpt db ids]
|
|
|
|
(defn id-get-in-tpt [tpt db ids]
|
|
|
@ -195,32 +183,37 @@
|
|
|
|
id (get-in db [ks :id])]
|
|
|
|
id (get-in db [ks :id])]
|
|
|
|
{ks (assoc v :value (get db id))}))
|
|
|
|
{ks (assoc v :value (get db id))}))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
(time (count (map #(get % []) (trie/children (trie/lookup tightly-packed-trie [1])))))
|
|
|
|
|
|
|
|
(time (count (trie/children (trie/lookup tightly-packed-trie [1]))))
|
|
|
|
|
|
|
|
|
|
|
|
(comment
|
|
|
|
(comment
|
|
|
|
|
|
|
|
(profile {}
|
|
|
|
(def example-story
|
|
|
|
(def example-story
|
|
|
|
(loop [generated-text [(get trie-database "<s>")]
|
|
|
|
(loop [generated-text [(get trie-database "<s>")]
|
|
|
|
i 0]
|
|
|
|
i 0]
|
|
|
|
(if (> i 100)
|
|
|
|
(if (> i 10)
|
|
|
|
generated-text
|
|
|
|
generated-text
|
|
|
|
(let [node (loop [i 3]
|
|
|
|
(let [children (loop [i 4]
|
|
|
|
(let [node (trie/lookup
|
|
|
|
(let [node (p :lookup
|
|
|
|
|
|
|
|
(trie/lookup
|
|
|
|
tightly-packed-trie
|
|
|
|
tightly-packed-trie
|
|
|
|
(vec (take-last i generated-text)))]
|
|
|
|
(vec (take-last i generated-text))))
|
|
|
|
|
|
|
|
children (p :seq-children (doall (seq (and node (trie/children node)))))]
|
|
|
|
(cond
|
|
|
|
(cond
|
|
|
|
(nil? node) (recur (dec i))
|
|
|
|
(nil? node) (recur (dec i))
|
|
|
|
(< i 0) (throw (Exception. "Error"))
|
|
|
|
(< i 0) (throw (Exception. "Error"))
|
|
|
|
(seq (trie/children node)) node
|
|
|
|
children children
|
|
|
|
:else (recur (dec i)))))]
|
|
|
|
:else (recur (dec i)))))]
|
|
|
|
(recur
|
|
|
|
(recur
|
|
|
|
(conj
|
|
|
|
(conj
|
|
|
|
generated-text
|
|
|
|
generated-text
|
|
|
|
(->> node
|
|
|
|
(->> children
|
|
|
|
trie/children
|
|
|
|
|
|
|
|
(map #(get % []))
|
|
|
|
(map #(get % []))
|
|
|
|
(remove nil?)
|
|
|
|
(remove nil?)
|
|
|
|
(math/weighted-selection :count)
|
|
|
|
(#(p :weighted-selection (math/weighted-selection :count %)))
|
|
|
|
:value
|
|
|
|
:value
|
|
|
|
(get trie-database)))
|
|
|
|
(get trie-database)))
|
|
|
|
(inc i))))))
|
|
|
|
(inc i)))))))
|
|
|
|
|
|
|
|
|
|
|
|
(->> example-story
|
|
|
|
(->> example-story
|
|
|
|
(map #(get-in trie-database [% :value]))
|
|
|
|
(map #(get-in trie-database [% :value]))
|
|
|
@ -244,13 +237,6 @@
|
|
|
|
)
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
(comment
|
|
|
|
(comment
|
|
|
|
;; database
|
|
|
|
|
|
|
|
(let [texts (->> (dark-corpus-file-seq 500 2)
|
|
|
|
|
|
|
|
(map slurp))
|
|
|
|
|
|
|
|
trie (create-trie-from-texts texts)]
|
|
|
|
|
|
|
|
(->> (trie->database trie)
|
|
|
|
|
|
|
|
(#(get % 3))))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
(let [texts (->> (dark-corpus-file-seq 500 2)
|
|
|
|
(let [texts (->> (dark-corpus-file-seq 500 2)
|
|
|
|
(map slurp))
|
|
|
|
(map slurp))
|
|
|
|
trie (create-trie-from-texts texts)]
|
|
|
|
trie (create-trie-from-texts texts)]
|
|
|
|