Faster trie/db gen in tpt example

main
Eric Ihli 3 years ago
parent 26694b5e52
commit a2fd40bd7c

@ -19,6 +19,6 @@
org.clojure/data.fressian {:mvn/version "1.0.0"} org.clojure/data.fressian {:mvn/version "1.0.0"}
com.taoensso/nippy {:mvn/version "3.0.0"} com.taoensso/nippy {:mvn/version "3.0.0"}
com.taoensso/timbre {:mvn/version "4.10.0"} com.taoensso/timbre {:mvn/version "4.10.0"}
com.owoga/tightly-packed-trie {:mvn/version "0.2.0"}} com.owoga/tightly-packed-trie {:mvn/version "0.2.1"}}
:aliases {:dev {:extra-paths ["test" "examples" "dev"] :aliases {:dev {:extra-paths ["test" "examples" "dev"]
:extra-deps {}}}} :extra-deps {}}}}

@ -1,13 +1,17 @@
(ns examples.tpt (ns examples.tpt
(:require [clojure.string :as string] (:require [clojure.string :as string]
[clojure.java.io :as io] [clojure.java.io :as io]
[taoensso.tufte :as tufte :refer (defnp p profiled profile)]
[com.owoga.trie :as trie] [com.owoga.trie :as trie]
[com.owoga.tightly-packed-trie :as tpt] [com.owoga.tightly-packed-trie :as tpt]
[com.owoga.trie.math :as math] [com.owoga.trie.math :as math]
[com.owoga.tightly-packed-trie.encoding :as encoding] [com.owoga.tightly-packed-trie.encoding :as encoding]
[com.owoga.prhyme.util :as util] [com.owoga.prhyme.util :as util]
[com.owoga.prhyme.data.dictionary :as dict] [com.owoga.prhyme.data.dictionary :as dict]
[clojure.zip :as zip])) [clojure.zip :as zip]
[cljol.dig9 :as d]))
(tufte/add-basic-println-handler! {})
(defn dark-corpus-file-seq [start end] (defn dark-corpus-file-seq [start end]
(let [documents (->> "dark-corpus" (let [documents (->> "dark-corpus"
@ -108,10 +112,24 @@
(apply concat) (apply concat)
(map prep-ngram-for-trie) (map prep-ngram-for-trie)
(reduce (reduce
(fn [trie [k v]] (fn [[trie i db] [k v]]
(let [existing (or (get trie k) {:count 0 :value (last v)})] (let [[db i] (reduce
(conj trie [k (update existing :count inc)]))) (fn [[db i] k]
(trie/make-trie)))) (let [k' (get db k i)
i' (if (= i k') (inc i) i)
db' (-> db
(assoc k' k)
(assoc k k'))]
[db' i']))
[db i]
k)]
(let [k' (map #(get db %) k)]
(if-let [existing (get trie k')]
(let [[val count] existing
trie (assoc trie k' [val (inc count)])]
[trie i db])
[(assoc trie k' [i 1]) i db]))))
[(trie/make-trie) 1 {}])))
(defn seq-of-nodes->sorted-by-count (defn seq-of-nodes->sorted-by-count
"Sorted first by the rank of the ngram, lowest ranks first. "Sorted first by the rank of the ngram, lowest ranks first.
@ -124,69 +142,39 @@
(sort-by :count) (sort-by :count)
reverse)) reverse))
(defn trie->database [trie] (time
(let [sorted-keys (let [texts (->> (dark-corpus-file-seq 0 250000)
(seq-of-nodes->sorted-by-count trie)] (map slurp))
(loop [sorted-keys sorted-keys [trie _ db] (create-trie-from-texts texts)]
database {} (def trie trie)
i 1] (def trie-database db)))
(if (empty? sorted-keys)
database
(recur
(rest sorted-keys)
(-> database
(assoc i {:count (:count (first sorted-keys))
:value (:value (first sorted-keys))})
(assoc (:value (first sorted-keys)) i))
(inc i))))))
(def trie
(let [texts (->> (dark-corpus-file-seq 0 1000)
(map slurp))]
(create-trie-from-texts texts)))
(def trie-database
(trie->database trie))
(defn encode-fn [v] (defn encode-fn [v]
(let [{:keys [count value]} v] (let [[value count] (if (seqable? v) v [nil nil])]
(if (and (number? v) (not (zero? v))) (if (nil? value)
(encoding/encode 0)
(byte-array (byte-array
(concat (encoding/encode (trie-database value)) (concat (encoding/encode value)
(encoding/encode count))) (encoding/encode count))))))
(encoding/encode 0))))
(defn decode-fn [byte-buffer] (defn decode-fn [byte-buffer]
(let [v (encoding/decode byte-buffer)] (let [value (encoding/decode byte-buffer)]
(if (and (number? v) (zero? v)) (if (zero? value)
nil [nil nil]
(trie-database v)))) [value (encoding/decode byte-buffer)])))
(comment
(def tight-ready-trie
(->> trie
(map (fn [[k v]]
(let [k (map #(get trie-database %) k)]
[k v])))
(into (trie/make-trie))))
)
(def tightly-packed-trie (time
(let [tight-ready-trie (def tightly-packed-trie
(->> trie
(map (fn [[k v]]
(let [k (map #(get trie-database %) k)]
[k v])))
(into (trie/make-trie)))]
(tpt/tightly-packed-trie (tpt/tightly-packed-trie
tight-ready-trie trie
encode-fn encode-fn
decode-fn))) decode-fn)))
(take 20 tightly-packed-trie)
(defn key-get-in-tpt [tpt db ks] (defn key-get-in-tpt [tpt db ks]
(let [id (map #(get-in db [(list %) :id]) ks) (let [id (map #(get-in db [(list %) :id]) ks)
v (get tpt id)] v (get tpt id)]
(println id ks)
{id v})) {id v}))
(defn id-get-in-tpt [tpt db ids] (defn id-get-in-tpt [tpt db ids]
@ -195,32 +183,37 @@
id (get-in db [ks :id])] id (get-in db [ks :id])]
{ks (assoc v :value (get db id))})) {ks (assoc v :value (get db id))}))
(time (count (map #(get % []) (trie/children (trie/lookup tightly-packed-trie [1])))))
(time (count (trie/children (trie/lookup tightly-packed-trie [1]))))
(comment (comment
(profile {}
(def example-story (def example-story
(loop [generated-text [(get trie-database "<s>")] (loop [generated-text [(get trie-database "<s>")]
i 0] i 0]
(if (> i 100) (if (> i 10)
generated-text generated-text
(let [node (loop [i 3] (let [children (loop [i 4]
(let [node (trie/lookup (let [node (p :lookup
(trie/lookup
tightly-packed-trie tightly-packed-trie
(vec (take-last i generated-text)))] (vec (take-last i generated-text))))
children (p :seq-children (doall (seq (and node (trie/children node)))))]
(cond (cond
(nil? node) (recur (dec i)) (nil? node) (recur (dec i))
(< i 0) (throw (Exception. "Error")) (< i 0) (throw (Exception. "Error"))
(seq (trie/children node)) node children children
:else (recur (dec i)))))] :else (recur (dec i)))))]
(recur (recur
(conj (conj
generated-text generated-text
(->> node (->> children
trie/children
(map #(get % [])) (map #(get % []))
(remove nil?) (remove nil?)
(math/weighted-selection :count) (#(p :weighted-selection (math/weighted-selection :count %)))
:value :value
(get trie-database))) (get trie-database)))
(inc i)))))) (inc i)))))))
(->> example-story (->> example-story
(map #(get-in trie-database [% :value])) (map #(get-in trie-database [% :value]))
@ -244,13 +237,6 @@
) )
(comment (comment
;; database
(let [texts (->> (dark-corpus-file-seq 500 2)
(map slurp))
trie (create-trie-from-texts texts)]
(->> (trie->database trie)
(#(get % 3))))
(let [texts (->> (dark-corpus-file-seq 500 2) (let [texts (->> (dark-corpus-file-seq 500 2)
(map slurp)) (map slurp))
trie (create-trie-from-texts texts)] trie (create-trie-from-texts texts)]

Loading…
Cancel
Save