Add tightly-packed-trie example

main
Eric Ihli 4 years ago
parent ded92cc0a2
commit f66210d7dd

@ -9,13 +9,14 @@
cljol/cljol {:git/url "https://github.com/jafingerhut/cljol"
:sha "11d4aa72fdd19248bd7600fb7b5cde7189f32938"}
org.xerial/sqlite-jdbc {:mvn/version "3.32.3.2"}
inflections {:mvn/version "0.13.2"}
inflections/inflections {:mvn/version "0.13.2"}
com.taoensso/tufte {:mvn/version "2.2.0"}
clojure-opennlp {:mvn/version "0.5.0"}
clojure-opennlp/clojure-opennlp {:mvn/version "0.5.0"}
uk.ac.abdn/SimpleNLG {:mvn/version "4.5.0"}
net.sf.sociaal/freetts {:mvn/version "1.2.2"}
enlive {:mvn/version "1.1.6"}
integrant {:mvn/version "0.8.0"}
enlive/enlive {:mvn/version "1.1.6"}
integrant/integrant {:mvn/version "0.8.0"}
com.owoga/tightly-packed-trie {:local/root "/home/eihli/code/tightly-packed-trie/TightlyPackedTrie.jar"}
org.clojure/data.fressian {:mvn/version "1.0.0"}
com.taoensso/nippy {:mvn/version "3.0.0"}
com.taoensso/timbre {:mvn/version "4.10.0"}}

@ -0,0 +1,251 @@
(ns examples.tpt
(:require [clojure.string :as string]
[clojure.java.io :as io]
[com.owoga.tightly-packed-trie.core :as tpt]
[com.owoga.prhyme.util :as util]
[com.owoga.prhyme.data.dictionary :as dict]
[clojure.zip :as zip]))
(defn dark-corpus-file-seq [start end]
(let [documents (->> "dark-corpus"
io/file
file-seq
(remove #(.isDirectory %))
(drop start)
(take end))]
documents))
(defn remove-sentences-with-words-not-in-dictionary
"This gets rid of a lot of good words. All contractions for example... I'll, They'll...
possessives like morning's...
Might not end up using it."
[dictionary]
(let [dictionary (into #{} dictionary)]
(fn [sentences]
(->> sentences
(map #(string/split % #" "))
(remove #(some (complement dictionary) %))
(remove #(some string/blank? %))
(map #(string/join " " %))))))
(def re-word
"Regex for tokenizing a string into words
(including contractions and hyphenations),
commas, periods, and newlines."
#"(?s).*?([a-zA-Z\d]+(?:['\-]?[a-zA-Z]+)?|,|\.|\n)")
(defn pad-tokens
"Pads the beginning with n - 1 <s> tokens and
the end with 1 </s> token."
[tokens n]
(concat (repeat (max 1 (dec n)) "<s>") tokens ["</s>"]))
(defn tokenize-line
[line]
(->> line
(string/trim)
(re-seq re-word)
(map second)
(map string/lower-case)))
(def database (atom {}))
(defn process-files-for-trie
"Expects an entire song, lines seperated by \n."
[files]
(->> files
(map slurp)
(filter dict/english?)
(map util/clean-text)
(map #(string/split % #"\n+"))
(map (remove-sentences-with-words-not-in-dictionary dict/popular))
(remove empty?)
(remove #(some empty? %))
(map (fn [lines]
(map tokenize-line lines)))
(map (fn [lines]
(map #(pad-tokens % 1) lines)))
(map (fn [lines]
(map #(partition 2 1 %) lines)))))
(defn text->ngrams
"Takes text from a file, including newlines.
Pads lines with <s> and </s> for start/end of line.
Pads beginning with n - 1 <s>s"
[text n]
(let [words-not-in-dict-filter (remove-sentences-with-words-not-in-dictionary dict/popular)]
(->> text
util/clean-text
(#(string/split % #"\n+"))
(remove empty?)
(map tokenize-line)
(map #(pad-tokens % n))
(map #(partition n 1 %))
(apply concat))))
(defn make-trie
([] (tpt/->Trie
(fn update-fn [prev cur]
(if (nil? prev)
{:value (last cur)
:count 1}
(-> prev
(update :count (fnil inc 0))
(assoc :value (last cur)))))
(sorted-map)))
([& ks]
(reduce
(fn [t k]
(conj t k))
(make-trie)
ks)))
(defn n-to-m-grams
"Exclusive of m, similar to range."
[n m text]
(loop [i n
r '()]
(cond
(= i m)
(apply concat r)
:else
(recur (inc i) (cons (text->ngrams text i) r)))))
(defn prep-ngram-for-trie
"The tpt/trie expects values conjed into an ngram
to be of format '(k1 k2 k3 value)."
[ngram]
(concat ngram (list ngram)))
(defn create-trie-from-texts [texts]
(->> texts
(map #(n-to-m-grams 1 4 %))
(apply concat)
(map prep-ngram-for-trie)
(apply make-trie)))
(defn trie->seq-of-nodes [trie]
(->> trie
tpt/as-vec
zip/vector-zip
(iterate zip/next)
(take-while (complement zip/end?))
(map zip/node)
(filter map?)))
(defn seq-of-nodes->sorted-by-count
"Sorted first by the rank of the ngram, lowest ranks first.
Sorted second by the frequency of the ngram, highest frequencies first.
This is the order that you'd populate a mapping of keys to IDs."
[nodes]
(->> nodes
(map (comp first seq))
(map (fn [[k v]]
(vector (:value v) (:count v))))
;; root node and padded starts
(remove (comp nil? second))
(sort-by #(vector (count (first %))
(- (second %))))))
(defn trie->database [trie]
(let [sorted-keys
(->> (trie->seq-of-nodes trie)
seq-of-nodes->sorted-by-count)]
(loop [sorted-keys sorted-keys
database {}
i 1]
(if (empty? sorted-keys)
database
(recur
(rest sorted-keys)
(-> database
(assoc (first (first sorted-keys))
{:count (second (first sorted-keys))
:id i})
(assoc i (first (first sorted-keys))))
(inc i))))))
(defn transform-trie->ids [trie database]
(let [transform-p #(map? (zip/node %))
transform-f
(fn tf [loc]
(zip/edit
loc
(fn [node]
(let [[k v] (first (seq node))]
{(get-in database [(list k) :id] (if (= k :root) :root))
(assoc v :value (get-in database [(list k) :count] 0))}))))]
(tpt/transform trie (tpt/visitor-filter transform-p transform-f))))
(defonce trie
(let [texts (->> (dark-corpus-file-seq 500 500)
(map slurp))]
(create-trie-from-texts texts)))
(defonce trie-database
(trie->database trie))
(def tightly-packed-trie
(let [trie-with-ids (transform-trie->ids trie trie-database)
tightly-packed-trie (tpt/tightly-packed-trie trie-with-ids)]
tightly-packed-trie))
(defn key-get-in-tpt [tpt db ks]
(let [id (map #(get-in db [(list %) :id]) ks)
v (get tpt id)]
(println id ks)
{id v}))
(defn id-get-in-tpt [tpt db ids]
(let [ks (apply concat (map #(get db %) ids))
v (get tpt ids)
id (get-in db [ks :id])]
{ks (assoc v :value (get db id))}))
(comment
(key-get-in-tpt
tightly-packed-trie
trie-database
'("<s>" "<s>" "the"))
;; => {(2 2 3) {:value 3263, :count 462}}
(id-get-in-tpt
tightly-packed-trie
trie-database
'(2 2 3))
;; => {("<s>" "<s>" "the") {:value ("<s>" "<s>" "the"), :count 462}}
)
(comment
;; database
(let [texts (->> (dark-corpus-file-seq 500 2)
(map slurp))
trie (create-trie-from-texts texts)]
(->> (trie->database trie)
(#(get % 3))))
(let [texts (->> (dark-corpus-file-seq 500 2)
(map slurp))
trie (create-trie-from-texts texts)]
(tpt/as-map (transform-trie->ids trie)))
(let [texts (->> (dark-corpus-file-seq 500 2)
(map slurp))
trie (create-trie-from-texts texts)
tightly-packed-trie (tpt/tightly-packed-trie
(transform-trie->ids trie))]
(get tightly-packed-trie '(2 2 3)))
(let [texts (->> (dark-corpus-file-seq 500 2)
(map slurp))
trie (create-trie-from-texts texts)]
(tpt/as-map trie))
(let [text (slurp (first (dark-corpus-file-seq 500 1)))]
(->> text
util/clean-text
(#(string/split % #"\n+"))))
)

@ -286,9 +286,6 @@
[data rime]
(map (partial rhyming-word data) rime))
(defn deep-merge-with [& maps]
((apply merge-with merge maps)))
(defn flatten-node [node]
(let [zipper (zip/zipper
(fn branch? [node]

@ -16,7 +16,7 @@
(defn pad-tokens
[tokens n]
(concat (repeat (min 1 (dec n)) "<s>") tokens ["</s>"]))
(concat (repeat (max 1 (dec n)) "<s>") tokens ["</s>"]))
(defn tokenize-line
[line]

@ -359,7 +359,6 @@
child-nodes)
index-ba (let [index-baos (ByteArrayOutputStream.)
child-byte-arrays (map pack-index-entry children)]
(println child-byte-arrays)
(loop [bas child-byte-arrays]
(if (empty? bas)
(.toByteArray index-baos)
@ -368,11 +367,9 @@
(zip/edit
loc
(fn [node]
(println children)
(let [[k v] (first (seq node))]
(.write baos (pack-node-value v))
(.write baos (tpt/vb-encode (count index-ba)))
(println "writing index" (map int index-ba))
(.write baos index-ba)
{k (conj v {:byte-address byte-address
:byte-array (.toByteArray baos)})})))))
@ -428,7 +425,6 @@
clojure.lang.ILookup
(valAt [_ k]
(println (cons :root (interleave (repeat :children) k)))
(get-in trie (cons :root (interleave (repeat :children) k))))
(valAt [_ k not-found]
(get-in trie (cons :root (interleave (repeat :children) k)) not-found))
@ -503,7 +499,6 @@
(let [slice (partial tpt/bit-slice 0 7)
combine (partial tpt/combine-significant-bits 7)]
(loop [bytes []]
(println (.position bb) (map int bytes))
(cond
(or (< max-position (.position bb))
(zero? (.remaining bb)))
@ -519,8 +514,6 @@
(let [slice (partial tpt/bit-slice 0 7)
combine (partial tpt/combine-significant-bits 7)]
(loop [bytes []]
(println (.position bb) (map int bytes))
(println "max" max-position)
(cond
(or (< max-position (.position bb))
(zero? (.remaining bb)))
@ -545,21 +538,15 @@
(defn find-key-in-index
[bb target-key max-address not-found]
(println target-key "pos" (.position bb))
(loop [previous-key nil
min-position (.position bb)
max-position max-address]
(if (zero? (- max-position min-position))
not-found
(let [mid-position (+ min-position (quot 2 (- max-position min-position)))]
(Thread/sleep 20)
(println min-position mid-position max-position)
(.position bb mid-position)
(let [bb (rewind-to-key bb min-position)
_ (println "rewound to key")
current-key (decode-key bb max-position)
_ (println "cur key" current-key)]
(println "keys" current-key target-key)
current-key (decode-key bb max-position)]
(cond
(= current-key target-key)
(decode-offset bb max-position)
@ -595,7 +582,6 @@
(let [val (tpt/byte-buffer-variable-length-decode byte-buffer)
freq (tpt/byte-buffer-variable-length-decode byte-buffer)
size-of-index (tpt/byte-buffer-variable-length-decode byte-buffer)
_ (println "val" val "freq" freq "size" size-of-index)
offset (find-key-in-index
byte-buffer
(first ks)
@ -619,7 +605,6 @@
(let [val (tpt/byte-buffer-variable-length-decode byte-buffer)
freq (tpt/byte-buffer-variable-length-decode byte-buffer)
size-of-index (tpt/byte-buffer-variable-length-decode byte-buffer)
_ (println "val" val "freq" freq "size" size-of-index)
offset (find-key-in-index
byte-buffer
(first ks)
@ -630,6 +615,7 @@
(do (.position byte-buffer (- current-address offset))
(recur (rest ks)))))))))))
(comment
(let [v1 '(1 2 1 121)
v2 '(1 3 1 131)
@ -659,9 +645,7 @@
(.write baos byte-array)
loc))))
(let [ba (.toByteArray baos)
_ (println "root-address")
root-address (get-in (as-map trie) [:root :byte-address])
_ (println root-address)
byte-buf (java.nio.ByteBuffer/allocate (+ 4 (count ba)))]
(.putInt byte-buf root-address)
(.put byte-buf ba)

Loading…
Cancel
Save