Move generation code to nlg

main
Eric Ihli 4 years ago
parent 1c3a07708a
commit acd22d9b2d

@ -98,7 +98,7 @@
english-words
(->> words
(filter #(word-set (string/lower-case %))))]
(< 0.7 (/ (count english-words) (max 1 (count words))))))
(< 0.8 (/ (count english-words) (max 1 (count words))))))
(comment
(let [phoneme-lookup (into

@ -2,7 +2,6 @@
(:require [clojure.string :as string]
[clojure.java.io :as io]
[com.owoga.prhyme.data.dictionary :as dict]
[com.owoga.prhyme.nlp.core :as nlp]
[com.owoga.trie :as trie]
[com.owoga.tightly-packed-trie :as tpt]
[com.owoga.tightly-packed-trie.encoding :as encoding]
@ -97,21 +96,23 @@
k)]
[k' 1])))
(defn xf-part-of-speech-database
[database]
(fn [sentence]
(let [leafs (->> sentence
nlp/treebank-zipper
nlp/leaf-pos-path-word-freqs)]
(run!
(fn [[k v]]
(swap!
database
assoc
k
(merge-with + (@database k) v)))
leafs)
sentence)))
(comment
;; TODO: Move to nlp.core
(defn xf-part-of-speech-database
[database]
(fn [sentence]
(let [leafs (->> sentence
nlp/treebank-zipper
nlp/leaf-pos-path-word-freqs)]
(run!
(fn [[k v]]
(swap!
database
assoc
k
(merge-with + (@database k) v)))
leafs)
sentence))))
(comment
(let [database (atom {})]
@ -206,32 +207,31 @@
(recur (conj result [k v])
(rest k)))))
(defn process-text
"Processes text into key value pairs where
(comment
;; TODO: Move to nlp.core
(defn process-text
"Processes text into key value pairs where
the keys are parts-of-speech paths and the values
are the children at that path.
Ready to be inserted into a trie."
[text]
(->> text
(split-text-into-sentences)
(map string/trim)
(remove empty?)
(mapv nlp/treebank-zipper)
(remove nil?)
(map nlp/parts-of-speech-trie-entries)
(mapv (fn [file]
(mapv (fn [line]
(mapv vec line))
file)))
(reduce into [])
(map flatten-trie-entry-to-all-subkeys)
(reduce into [])
(mapv normalize-text)
(mapv (fn [[k v]]
(clojure.lang.MapEntry. (into (vec k) [v]) v)))))
[text]
(->> text
(split-text-into-sentences)
(map string/trim)
(remove empty?)
(mapv nlp/treebank-zipper)
(remove nil?)
(map nlp/parts-of-speech-trie-entries)
(reduce into [])
(map flatten-trie-entry-to-all-subkeys)
(reduce into [])
(mapv normalize-text)
(mapv (fn [[k v]]
(clojure.lang.MapEntry. (into (vec k) [v]) v))))))
(comment
(process-text (first texts))
(flatten-trie-entry-to-all-subkeys
'[(TOP S NP) (NP PP)])
;; => [[(TOP S NP) (NP PP)] [(S NP) (NP PP)] [(NP) (NP PP)]]
@ -304,7 +304,7 @@
trie
entries)))
(trie/make-trie)
(take 300 texts))))
(take 3000 texts))))
(nippy/freeze-to-file "/tmp/test-trie.bin" (seq test-trie))
(time
@ -441,32 +441,6 @@
(#(zip/insert-right % (zip/node z2)))
(zip/root))))
(defn generate
[trie database zipper]
(let [k (map first (zip/path zipper))]
(do (Thread/sleep 10) (println k))
(if (vector? (database (last k)))
(loop [zipper zipper]
(let [children (last (map first (zip/path zipper)))]
(Thread/sleep 50) (println children (zip/root zipper))
(if (empty? children)
zipper
(recur
(-> zipper
zip/up
(zip/append-child [(first children)])
(zip/down)
(zip/rightmost)
(zip/down)
(#(generate trie database %))
(zip/up)
(zip/up)
(zip/down)
(zip/replace (subvec 1 children)))))))
(zip/insert-right
zipper
(choose trie database k)))))
(defn generate
[trie database zipper]
(cond
@ -526,12 +500,14 @@
(comment
(trie/lookup test-trie [1])
(->> (generate test-trie @test-database (zip/vector-zip [1]))
(zip/vector-zip)
(iterate zip/next)
(take-while (complement zip/end?))
(map zip/node)
(filter string?))
(repeatedly
20
#(->> (generate test-trie @test-database (zip/vector-zip [1]))
(zip/vector-zip)
(iterate zip/next)
(take-while (complement zip/end?))
(map zip/node)
(filter string?)))
(-> [:a [:b] [:b]]
zip/vector-zip
@ -649,43 +625,47 @@
)
(defn xf-grammar-database
[database]
(fn [sentence]
(let [leafs (->> sentence
nlp/treebank-zipper
nlp/leaf-pos-path-word-freqs)]
(run!
(fn [[k v]]
(swap!
database
assoc
k
(merge-with + (@database k) v)))
leafs)
sentence)))
(defn file-seq->grammar-tree
[files]
(transduce
(comp
(xf-file-seq 0 1000)
(map slurp)
(map #(string/split % #"[\n+\?\.]"))
(map (partial transduce xf-tokenize conj))
(map (partial transduce xf-filter-english conj))
(map (partial remove empty?))
(remove empty?)
(map (partial transduce xf-untokenize conj))
(map nlp/grammar-tree-frequencies)
(map (partial into {})))
(fn
([acc]
(sort-by (comp - second) acc))
([acc m]
(merge-with + acc m)))
{}
files))
(comment
;; TODO: Move to nlp.core
(defn xf-grammar-database
[database]
(fn [sentence]
(let [leafs (->> sentence
nlp/treebank-zipper
nlp/leaf-pos-path-word-freqs)]
(run!
(fn [[k v]]
(swap!
database
assoc
k
(merge-with + (@database k) v)))
leafs)
sentence))))
(comment
;; TODO: remove or move to nlp.core
(defn file-seq->grammar-tree
[files]
(transduce
(comp
(xf-file-seq 0 1000)
(map slurp)
(map #(string/split % #"[\n+\?\.]"))
(map (partial transduce xf-tokenize conj))
(map (partial transduce xf-filter-english conj))
(map (partial remove empty?))
(remove empty?)
(map (partial transduce xf-untokenize conj))
(map nlp/grammar-tree-frequencies)
(map (partial into {})))
(fn
([acc]
(sort-by (comp - second) acc))
([acc m]
(merge-with + acc m)))
{}
files)))
(comment
(time
@ -699,27 +679,29 @@
)
(defn file-seq->part-of-speech-freqs
[files]
(transduce
(comp
(xf-file-seq 0 1000)
(map slurp)
(map #(string/split % #"[\n+\?\.]"))
(map (partial transduce xf-tokenize conj))
(map (partial transduce xf-filter-english conj))
(map (partial remove empty?))
(remove empty?)
(map (partial transduce xf-untokenize conj))
(map (partial map nlp/treebank-zipper))
(map (partial map nlp/leaf-pos-path-word-freqs))
(map (partial reduce (fn [acc m]
(nlp/deep-merge-with + acc m)) {})))
(completing
(fn [result input]
(nlp/deep-merge-with + result input)))
{}
files))
(comment
;; TODO: Remove or move to nlp.core
(defn file-seq->part-of-speech-freqs
[files]
(transduce
(comp
(xf-file-seq 0 1000)
(map slurp)
(map #(string/split % #"[\n+\?\.]"))
(map (partial transduce xf-tokenize conj))
(map (partial transduce xf-filter-english conj))
(map (partial remove empty?))
(remove empty?)
(map (partial transduce xf-untokenize conj))
(map (partial map nlp/treebank-zipper))
(map (partial map nlp/leaf-pos-path-word-freqs))
(map (partial reduce (fn [acc m]
(nlp/deep-merge-with + acc m)) {})))
(completing
(fn [result input]
(nlp/deep-merge-with + result input)))
{}
files)))
(comment
(time (->> (file-seq->part-of-speech-freqs
@ -732,24 +714,26 @@
)
(defn file-seq->parts-of-speech-trie
[files]
(transduce
(comp
(xf-file-seq 0 1000)
(map slurp)
(map #(string/split % #"[\n+\?\.]"))
(map (partial transduce xf-tokenize conj))
(map (partial transduce xf-filter-english conj))
(map (partial remove empty?))
(remove empty?)
(map (partial transduce xf-untokenize conj))
(map nlp/grammar-tree-frequencies)
(map (partial into {})))
(fn
([acc]
(sort-by (comp - second) acc))
([acc m]
(merge-with + acc m)))
{}
files))
(comment
;; TODO: Remove or move to nlp.core
(defn file-seq->parts-of-speech-trie
[files]
(transduce
(comp
(xf-file-seq 0 1000)
(map slurp)
(map #(string/split % #"[\n+\?\.]"))
(map (partial transduce xf-tokenize conj))
(map (partial transduce xf-filter-english conj))
(map (partial remove empty?))
(remove empty?)
(map (partial transduce xf-untokenize conj))
(map nlp/grammar-tree-frequencies)
(map (partial into {})))
(fn
([acc]
(sort-by (comp - second) acc))
([acc m]
(merge-with + acc m)))
{}
files)))

@ -2,9 +2,13 @@
(:require [clojure.zip :as zip]
[clojure.string :as string]
[taoensso.timbre :as timbre]
[com.owoga.prhyme.util.math :as math]
[examples.core :as examples]
[taoensso.nippy :as nippy]
[com.owoga.prhyme.nlp.core :as nlp]
[clojure.java.io :as io]
[com.owoga.prhyme.data-transform :as df]
[com.owoga.trie :as trie]
[com.owoga.prhyme.util.weighted-rand :as weighted-rand]
[clojure.set :as set]))
@ -403,3 +407,181 @@
[(TOP (NP (NP (NN)) (PP (IN) (NP (PRP$) (NN))))) 218]
[(TOP (NP (JJ) (NNS))) 211]
[(TOP (VB)) 204]))
(comment
(def test-database (atom {::nlp/next-id 1}))
(def texts
(eduction
(comp (df/xf-file-seq 0 250000)
(map slurp))
(file-seq (io/file "dark-corpus"))))
(time
(def test-trie
(transduce
(comp
(map
(fn [text]
(try
(nlp/text->grammar-trie-map-entry text)
(catch Exception e
(throw e)))))
(map (partial map (nlp/make-database-stateful-xf test-database))))
(completing
(fn [trie entries]
(reduce
(fn [trie [k v]]
(update trie k (fnil inc 0)))
trie
entries)))
(trie/make-trie)
(take 300 texts))))
)
(defn children
[trie database k]
(->> (trie/lookup trie k)
(trie/children)
(map #(vector (.key %) (get % [])))
(remove (comp nil? second))
(sort-by (comp - second))))
(defn choose
[trie database k]
(math/weighted-selection
second
(children trie database k)))
(defn markov-generate-grammar
[trie database zipper]
(cond
(zip/end? zipper)
(zip/root zipper)
(seqable? (zip/node zipper))
(recur trie database (zip/next zipper))
(symbol? (zip/node zipper))
(recur trie database (zip/next zipper))
(symbol? (database (zip/node zipper)))
(let [sym (database (zip/node zipper))
sym-path (->> (map first (zip/path zipper))
butlast
(filter symbol?)
(#(concat % (list sym))))
path (map database sym-path)
choice (first (choose trie database path))]
(recur
trie
database
(-> zipper
(zip/replace
[sym choice])
(zip/root)
(zip/vector-zip))))
(string? (database (zip/node zipper)))
(let [terminal (database (zip/node zipper))
path (->> (map first (zip/path zipper))
butlast
(filter symbol?))]
(recur
trie
database
(-> zipper
zip/remove
zip/root
zip/vector-zip)))
:else
(recur
trie
database
(-> zipper
(zip/replace
(mapv
database
(database (zip/node zipper))))
(zip/next)
(zip/root)
(zip/vector-zip)))))
(comment
(markov-generate-grammar test-trie @test-database (zip/vector-zip [1]))
)
(defn markov-generate-sentence
[trie database zipper]
(cond
(zip/end? zipper)
(zip/root zipper)
(seqable? (zip/node zipper))
(recur trie database (zip/next zipper))
(symbol? (zip/node zipper))
(recur trie database (zip/next zipper))
(symbol? (database (zip/node zipper)))
(let [sym (database (zip/node zipper))
sym-path (->> (map first (zip/path zipper))
butlast
(filter symbol?)
(#(concat % (list sym))))
path (map database sym-path)
choice (first (choose trie database path))]
(recur
trie
database
(-> zipper
(zip/replace
[sym choice])
(zip/root)
(zip/vector-zip))))
(string? (database (zip/node zipper)))
(let [terminal (database (zip/node zipper))
path (->> (map first (zip/path zipper))
butlast
(filter symbol?))]
(recur
trie
database
(-> zipper
(zip/replace
terminal)
(zip/next)
(zip/root)
(zip/vector-zip))))
:else
(recur
trie
database
(-> zipper
(zip/replace
(mapv
database
(database (zip/node zipper))))
(zip/next)
(zip/root)
(zip/vector-zip)))))
(comment
(generate test-trie @test-database (zip/vector-zip [1]))
(repeatedly
20
#(->> (generate test-trie @test-database (zip/vector-zip [1]))
(zip/vector-zip)
(iterate zip/next)
(take-while (complement zip/end?))
(map zip/node)
(filter string?)))
)

@ -2,11 +2,14 @@
(:require [opennlp.nlp :as nlp]
[opennlp.treebank :as tb]
[clojure.string :as string]
[com.owoga.prhyme.data-transform :as df]
[com.owoga.trie :as trie]
[clojure.java.io :as io]
[clojure.zip :as zip]
[com.owoga.prhyme.nlp.tag-sets.treebank-ii :as tb2]
[com.owoga.prhyme.util.weighted-rand :as weighted-rand]
[clojure.walk :as walk])
[clojure.walk :as walk]
[com.owoga.prhyme.data.dictionary :as dict])
(:import (opennlp.tools.postag POSModel POSTaggerME)
(opennlp.tools.parser Parse ParserModel
ParserFactory)
@ -1209,3 +1212,118 @@
;; [(TOP S NP PP NP NN) ("today")])
)
;;;; Grammar Trie
;;
;; Create a trie from treebank parsed grammar trees.
(defn -split-text-into-sentences
"Splits text on newlines, periods, exclamation and question marks."
[text]
(->> text
(#(string/replace % #"([\.\?\!\n]+)" "$1\n"))
(string/split-lines)))
(defn -flatten-trie-entry-to-all-subkeys
"Turns
[[k1 k2 k3] v]
into
[[[k1 k2 k3] v]]
[[k2 k3] v]]
[[k3] v]]]
This is useful for creating a trie from a grammar tree. It's
nice to know that k3 is a child of both [k1 k2] and [k2] so
if you need to generate a [k2] in isolation, you have
acces to [k1 k2] and [k4 k2] and [kn k2] etc... all under the
top-level key [k2].
"
[[k v]]
(loop [result []
k k]
(if (empty? k)
result
(recur (conj result [k v])
(rest k)))))
(defn -normalize-text
[[k v]]
(if (string? (first v))
[k (string/lower-case (first v))]
[k v]))
(defn english?
[text]
(->> text
(#(string/replace % #"\W" " "))
(#(string/replace % #" +" " "))
(#(string/split % #" "))
(every? #(dict/cmu-with-stress-map (string/lower-case %)))))
(defn text->grammar-trie-map-entry
"Processes text into key value pairs where
the keys are parts-of-speech paths and the values
are the children at that path.
Ready to be inserted into a trie."
[text]
(->> text
(-split-text-into-sentences)
(map string/trim)
(remove empty?)
(mapv treebank-zipper)
(remove nil?)
(map parts-of-speech-trie-entries)
(reduce into [])
(map -flatten-trie-entry-to-all-subkeys)
(reduce into [])
(mapv -normalize-text)
(mapv (fn [[k v]]
(clojure.lang.MapEntry. (into (vec k) [v]) v)))))
(defn -new-key
"Associates key with an auto-incrementing ID
and the ID with the key.
This 'database' is an atom that maps
keys to integer ids and integer ids to keys.
This lets us use integers throughout the trie data structure,
which ends up being a lot more efficient and prepares the trie
for being turned into a tightly-packed-trie."
[database k]
(let [next-id (@database ::next-id)]
(swap!
database
#(-> %
(assoc k next-id)
(assoc next-id k)
(update ::next-id inc)))
next-id))
(defn make-database-stateful-xf
"This 'database' is an atom that maps
keys to integer ids and integer ids to keys.
This lets us use integers throughout the trie data structure,
which ends up being a lot more efficient and prepares the trie
for being turned into a tightly-packed-trie.
Takes an atom and returns a function that takes a Trie key/value.
When the returned function is called, it checks to see
if the key is in the database and if so it returns the associated id.
If not, it increments the id (which is stored in the database
under :next-id) and returns that new id."
[database]
(fn [[k v]]
(let [k' (mapv (fn [kn]
(if-let [id (get @database kn)]
id
(-new-key database kn)))
k)]
[k' 1])))

Loading…
Cancel
Save