Add function to parse top-k

main
Eric Ihli 4 years ago
parent 0c7da21610
commit 75218e770b

@ -12,6 +12,7 @@
[com.owoga.prhyme.data.dictionary :as dict] [com.owoga.prhyme.data.dictionary :as dict]
[com.owoga.prhyme.data.thesaurus :as thesaurus] [com.owoga.prhyme.data.thesaurus :as thesaurus]
[com.owoga.prhyme.data.darklyrics :as darklyrics] [com.owoga.prhyme.data.darklyrics :as darklyrics]
[com.owoga.prhyme.util.weighted-rand :as weighted-rand]
[com.owoga.prhyme.generation.weighted-selection :as weighted] [com.owoga.prhyme.generation.weighted-selection :as weighted]
[clojure.set :as set] [clojure.set :as set]
[clojure.zip :as zip] [clojure.zip :as zip]
@ -193,7 +194,6 @@
(->> rhymes (->> rhymes
(take 5) (take 5)
(map :normalized-word))))) (map :normalized-word)))))
) )
(defn remove-sentences-with-words-not-in-dictionary [dictionary] (defn remove-sentences-with-words-not-in-dictionary [dictionary]
@ -209,13 +209,14 @@
(let [directory "dark-corpus"] (let [directory "dark-corpus"]
(->> (file-seq (io/file directory)) (->> (file-seq (io/file directory))
(remove #(.isDirectory %)) (remove #(.isDirectory %))
(drop 10) (take 1000)
(take 10)
(map slurp) (map slurp)
(map util/clean-text) (map util/clean-text)
(filter dict/english?) (filter dict/english?)
(map #(string/split % #"\n+")) (map #(string/split % #"\n+"))
(map (remove-sentences-with-words-not-in-dictionary dict/popular)) (map (remove-sentences-with-words-not-in-dictionary dict/popular))
(remove empty?)
(remove #(some empty? %))
(map nlp/treebank-zipper) (map nlp/treebank-zipper)
(map nlp/leaf-pos-path-word-freqs) (map nlp/leaf-pos-path-word-freqs)
(apply nlp/deep-merge-with +)))) (apply nlp/deep-merge-with +))))
@ -224,12 +225,14 @@
(let [directory "dark-corpus"] (let [directory "dark-corpus"]
(->> (file-seq (io/file directory)) (->> (file-seq (io/file directory))
(remove #(.isDirectory %)) (remove #(.isDirectory %))
(take 1000) (take 500)
(map slurp) (map slurp)
(map util/clean-text) (map util/clean-text)
(filter dict/english?) (filter dict/english?)
(map #(string/split % #"\n+")) (map #(string/split % #"\n+"))
(map #(remove string/blank? %)) (map (remove-sentences-with-words-not-in-dictionary dict/popular))
(remove empty?)
(remove #(some empty? %))
(map nlp/parse-to-simple-tree) (map nlp/parse-to-simple-tree)
(map nlp/parse-tree-sans-leaf-words) (map nlp/parse-tree-sans-leaf-words)
(map (map
@ -242,39 +245,55 @@
flatten flatten
(apply merge-with +)))) (apply merge-with +))))
(defn weighted-selection-from-map
  "Makes a weighted random pick from the entries of map `m`.

  Each entry's weight is read with `second` (i.e. the entry's value) and the
  pick is delegated to `weighted-rand/weighted-selection`. Returns `first` of
  whatever that call yields -- presumably the key of the chosen entry; verify
  against `weighted-rand/weighted-selection`."
  [m]
  (let [entries (seq m)
        chosen  (weighted-rand/weighted-selection second entries)]
    (first chosen)))
(comment (comment
(time (def example-pos-freqs (dark-pos-freqs))) (time (def example-pos-freqs (dark-pos-freqs)))
example-pos-freqs
(take 20 example-pos-freqs)
(time (def example-structures (dark-structures))) (time (def example-structures (dark-structures)))
(def common-example-structures (let [structure (weighted-selection-from-map example-structures)]
(filter (repeatedly
#(< 10 (second %)) 10
example-structures)) (fn []
(count common-example-structures) (->> (nlp/generate-from-structure-and-pos-freqs
(let [structure (rand-nth (seq common-example-structures)) structure
zipper (zip/seq-zip (first structure))] example-pos-freqs)
(loop [zipper zipper] nlp/leaf-nodes
(let [path (map first (zip/path zipper))] (string/join " ")))))
(cond ;; => ("then get your life"
(zip/end? zipper) (zip/root zipper) ;; "sometimes lie my hand"
(and (not-empty path) ;; "still become your chapter"
(example-pos-freqs path)) ;; "alright fade our surfing"
(recur ;; "far care my band"
(-> zipper ;; "all fake my fallow"
zip/up ;; "here gimme our head"
(zip/append-child ;; "long back my guide"
(first ;; "never stop their seed"
(rand-nth ;; "never consume our tomorrow")
(seq
(example-pos-freqs path))))) ;; => ("now scarred towards the future"
zip/down ;; "never gone among the side"
zip/next ;; "ill removed with the end"
zip/next)) ;; "well filled in the life"
:else (recur (zip/next zipper)))))) ;; "again torn towards the world"
;; "desperately matched in the love"
;; "nowadays matched in the ark"
;; "awhile needed through all night"
;; "so torn in the darkness"
;; "first erased on the land")
;; => ("pictures of the destiny"
;; "tears on the pain"
;; "lights in the disaster"
;; "corpses on the fire"
;; "castles on the universe"
;; "efforts for the king"
;; "visions of the night"
;; "retreats into the darker"
;; "tales into the attack"
;; "pictures into the play")
(get-in {:a 1} '()) (get-in {:a 1} '())
(let [zipper (zip/seq-zip '(TOP (S (NP) (VB))))] (let [zipper (zip/seq-zip '(TOP (S (NP) (VB))))]

@ -38,40 +38,3 @@
{'("away") {"her" 10 {'("away") {"her" 10
"them" 50 "them" 50
"baz" 99}})) "baz" 99}}))
(defn gen-markov
  "Builds a merged markov structure from the text files under `directory`.

  Every non-directory file is slurped, passed through `clean-text`, and kept
  only if `dict/english?` accepts it. The surviving texts are split into
  lines, each line is split on whitespace and reversed, padded through
  `util/extend-coll` with nil and 2, and turned into a markov structure via
  `make-markov` with 2. All per-line results are combined with
  `merge-markov`.

  Writing the result to \"resources/dark-corpus-2.edn\" via
  `util/write-markov` was disabled in the original and stays disabled."
  [directory]
  (let [files     (remove #(.isDirectory %) (file-seq (io/file directory)))
        texts     (filter dict/english? (map clean-text (map slurp files)))
        lines     (flatten (map #(string/split % #"\n+") texts))
        ;; Tokens are reversed before the chains are built.
        backwards (map (fn [line] (reverse (string/split line #"\s+"))) lines)
        chains    (map (fn [tokens] (make-markov (util/extend-coll tokens nil 2) 2))
                       backwards)]
    (apply merge-markov chains)))
(defn gen-pos-markov
  "Computes part-of-speech constituent frequencies for the files under
  `directory`.

  Every non-directory file is slurped, passed through `clean-text`, and kept
  only if `dict/english?` accepts it. Each text is split into lines, keeping
  only the lines that satisfy `nlp/valid-sentence?` and are non-nil. The
  first 400 such texts are flattened into a single sequence of lines and
  handed to `nlp/pos-constituent-frequencies`."
  [directory]
  (let [texts       (->> (io/file directory)
                         file-seq
                         (remove #(.isDirectory %))
                         (map slurp)
                         (map clean-text)
                         (filter dict/english?))
        valid-lines (for [text texts]
                      (->> (string/split text #"\n+")
                           (filter #(nlp/valid-sentence? %))
                           (remove nil?)))]
    ;; The limit of 400 applies to texts (files), not to individual lines.
    (nlp/pos-constituent-frequencies (flatten (take 400 valid-lines)))))
(comment
(time
(let [directory "dark-corpus/"]
(gen-pos-markov directory)))
)

@ -7,7 +7,10 @@
[com.owoga.prhyme.nlp.tag-sets.treebank-ii :as tb2] [com.owoga.prhyme.nlp.tag-sets.treebank-ii :as tb2]
[com.owoga.prhyme.util.weighted-rand :as weighted-rand] [com.owoga.prhyme.util.weighted-rand :as weighted-rand]
[clojure.walk :as walk]) [clojure.walk :as walk])
(:import (opennlp.tools.postag POSModel POSTaggerME))) (:import (opennlp.tools.postag POSModel POSTaggerME)
(opennlp.tools.parser Parse ParserModel
ParserFactory)
(opennlp.tools.cmdline.parser ParserTool)))
(def tokenize (nlp/make-tokenizer (io/resource "models/en-token.bin"))) (def tokenize (nlp/make-tokenizer (io/resource "models/en-token.bin")))
(def get-sentences (nlp/make-sentence-detector (io/resource "models/en-sent.bin"))) (def get-sentences (nlp/make-sentence-detector (io/resource "models/en-sent.bin")))
@ -48,6 +51,42 @@
;; [["DT" "VBG" "VBZ" "."] (0.9758878 0.03690145 0.27251 0.9286113)]) ;; [["DT" "VBG" "VBZ" "."] (0.9758878 0.03690145 0.27251 0.9286113)])
) )
;;;; Custom parser to get access to top N parses
;; Parser built from the bundled chunking model, created with the arguments
;; 3 and 0.95 (beam size and advance percentage in OpenNLP's
;; ParserFactory/create). The model stream is opened with `with-open` so it
;; is closed once the model has been constructed instead of being leaked.
(def custom-parser
  (with-open [model-stream (io/input-stream
                            (io/resource "models/en-parser-chunking.bin"))]
    (ParserFactory/create
     (ParserModel. model-stream)
     3
     0.95)))
(defn parse-probs
  "Returns a lazy sequence holding the result of calling `.getProb` on each
  element of `parses`, in order."
  [parses]
  (for [parse parses]
    (.getProb parse)))
(defn parse-strs
  "Returns a vector with one string per element of `parses`, each produced
  by calling `.show` on that parse with a fresh StringBuffer.

  Rendering each parse into its own buffer keeps the result aligned
  one-to-one with `parses`: an empty input yields an empty vector (not
  [\"\"]), and a rendering that is empty or contains a newline cannot shift
  or split entries."
  [parses]
  (mapv (fn [parse]
          (let [buffer (StringBuffer.)]
            (.show parse buffer)
            (str buffer)))
        parses))
(comment
(tokenize "Eric's testing.")
(let [results (StringBuffer.)
parses (ParserTool/parseLine "Eric 's testing ." custom-parser 3)]
((juxt parse-probs parse-strs) parses))
)
(defn parse-top-n
  "Parses `tokenized` with `custom-parser`, requesting `n` parses, and
  returns a sequence of [parse-string probability] vectors.

  `tokenized` is passed straight to ParserTool/parseLine as the line to
  parse; the REPL example in this file passes a whitespace-separated token
  string such as \"Eric 's testing .\".

  NOTE(review): `custom-parser` is created with a beam size of 3, so asking
  for more than 3 parses may return fewer than `n` -- confirm."
  [tokenized n]
  (let [parses (ParserTool/parseLine tokenized custom-parser n)]
    ;; Pair each rendered parse with its probability.
    (apply map vector ((juxt parse-strs parse-probs) parses))))
(comment
(parse-top-n "." 3)
)
(defn deep-merge-with [f & maps] (defn deep-merge-with [f & maps]
(letfn [(m [& xs] (letfn [(m [& xs]
(if (some map? xs) (if (some map? xs)
@ -593,6 +632,14 @@
;; ??? ;; ???
:else (recur (zip/next zipper))))))) :else (recur (zip/next zipper)))))))
(defn leaf-nodes
  "Returns a lazy sequence of every string node found in `tree`.

  The tree is walked with a `zip/seq-zip` zipper, advancing with `zip/next`
  until the end of the traversal; nodes that are not strings (nested seqs,
  symbols, ...) are skipped."
  [tree]
  (let [locs (take-while (complement zip/end?)
                         (iterate zip/next (zip/seq-zip tree)))]
    (for [loc   locs
          :let  [node (zip/node loc)]
          :when (string? node)]
      node)))
(comment (comment
(let [corpus ["this is a test" (let [corpus ["this is a test"
"that is a test" "that is a test"
@ -970,3 +1017,11 @@
(into {}))) (into {})))
) )
(comment
(let [text ["bother me"]]
(->> text
(map tokenize)
(map #(top-k-sequences prhyme-pos-tagger %))))
)

Loading…
Cancel
Save