From 75218e770b03f19dc855ddd4003fc66b0ead1556 Mon Sep 17 00:00:00 2001 From: Eric Ihli Date: Fri, 6 Nov 2020 14:48:34 -0800 Subject: [PATCH] Add function to parse top-k --- dev/examples/core.clj | 85 +++++++++++++++++++------------ src/com/owoga/corpus/markov.clj | 37 -------------- src/com/owoga/prhyme/nlp/core.clj | 57 ++++++++++++++++++++- 3 files changed, 108 insertions(+), 71 deletions(-) diff --git a/dev/examples/core.clj b/dev/examples/core.clj index 99522b0..98dc696 100644 --- a/dev/examples/core.clj +++ b/dev/examples/core.clj @@ -12,6 +12,7 @@ [com.owoga.prhyme.data.dictionary :as dict] [com.owoga.prhyme.data.thesaurus :as thesaurus] [com.owoga.prhyme.data.darklyrics :as darklyrics] + [com.owoga.prhyme.util.weighted-rand :as weighted-rand] [com.owoga.prhyme.generation.weighted-selection :as weighted] [clojure.set :as set] [clojure.zip :as zip] @@ -193,7 +194,6 @@ (->> rhymes (take 5) (map :normalized-word))))) - ) (defn remove-sentences-with-words-not-in-dictionary [dictionary] @@ -209,13 +209,14 @@ (let [directory "dark-corpus"] (->> (file-seq (io/file directory)) (remove #(.isDirectory %)) - (drop 10) - (take 10) + (take 1000) (map slurp) (map util/clean-text) (filter dict/english?) (map #(string/split % #"\n+")) (map (remove-sentences-with-words-not-in-dictionary dict/popular)) + (remove empty?) + (remove #(some empty? %)) (map nlp/treebank-zipper) (map nlp/leaf-pos-path-word-freqs) (apply nlp/deep-merge-with +)))) @@ -224,12 +225,14 @@ (let [directory "dark-corpus"] (->> (file-seq (io/file directory)) (remove #(.isDirectory %)) - (take 1000) + (take 500) (map slurp) (map util/clean-text) (filter dict/english?) (map #(string/split % #"\n+")) - (map #(remove string/blank? %)) + (map (remove-sentences-with-words-not-in-dictionary dict/popular)) + (remove empty?) + (remove #(some empty? 
%)) (map nlp/parse-to-simple-tree) (map nlp/parse-tree-sans-leaf-words) (map @@ -242,39 +245,55 @@ flatten (apply merge-with +)))) +(defn weighted-selection-from-map [m] + (first (weighted-rand/weighted-selection second (seq m)))) + (comment (time (def example-pos-freqs (dark-pos-freqs))) - example-pos-freqs - - (take 20 example-pos-freqs) (time (def example-structures (dark-structures))) - (def common-example-structures - (filter - #(< 10 (second %)) - example-structures)) - (count common-example-structures) - (let [structure (rand-nth (seq common-example-structures)) - zipper (zip/seq-zip (first structure))] - (loop [zipper zipper] - (let [path (map first (zip/path zipper))] - (cond - (zip/end? zipper) (zip/root zipper) - (and (not-empty path) - (example-pos-freqs path)) - (recur - (-> zipper - zip/up - (zip/append-child - (first - (rand-nth - (seq - (example-pos-freqs path))))) - zip/down - zip/next - zip/next)) - :else (recur (zip/next zipper)))))) + (let [structure (weighted-selection-from-map example-structures)] + (repeatedly + 10 + (fn [] + (->> (nlp/generate-from-structure-and-pos-freqs + structure + example-pos-freqs) + nlp/leaf-nodes + (string/join " "))))) + ;; => ("then get your life" + ;; "sometimes lie my hand" + ;; "still become your chapter" + ;; "alright fade our surfing" + ;; "far care my band" + ;; "all fake my fallow" + ;; "here gimme our head" + ;; "long back my guide" + ;; "never stop their seed" + ;; "never consume our tomorrow") + + ;; => ("now scarred towards the future" + ;; "never gone among the side" + ;; "ill removed with the end" + ;; "well filled in the life" + ;; "again torn towards the world" + ;; "desperately matched in the love" + ;; "nowadays matched in the ark" + ;; "awhile needed through all night" + ;; "so torn in the darkness" + ;; "first erased on the land") + + ;; => ("pictures of the destiny" + ;; "tears on the pain" + ;; "lights in the disaster" + ;; "corpses on the fire" + ;; "castles on the universe" + ;; "efforts for 
the king" + ;; "visions of the night" + ;; "retreats into the darker" + ;; "tales into the attack" + ;; "pictures into the play") (get-in {:a 1} '()) (let [zipper (zip/seq-zip '(TOP (S (NP) (VB))))] diff --git a/src/com/owoga/corpus/markov.clj b/src/com/owoga/corpus/markov.clj index 2920186..e378009 100644 --- a/src/com/owoga/corpus/markov.clj +++ b/src/com/owoga/corpus/markov.clj @@ -38,40 +38,3 @@ {'("away") {"her" 10 "them" 50 "baz" 99}})) - -(defn gen-markov [directory] - (->> (file-seq (io/file directory)) - (remove #(.isDirectory %)) - (map #(slurp %)) - (map clean-text) - (filter dict/english?) - (map #(string/split % #"\n+")) - (flatten) - (map #(string/split % #"\s+")) - (map reverse) - (map #(util/extend-coll % nil 2)) - (map #(make-markov % 2)) - (apply merge-markov) - #_(util/write-markov "resources/dark-corpus-2.edn"))) - -(defn gen-pos-markov [directory] - (->> (file-seq (io/file directory)) - (remove #(.isDirectory %)) - (map #(slurp %)) - (map clean-text) - (filter dict/english?) - (map #(string/split % #"\n+")) - (map (fn [lyrics] (filter #(nlp/valid-sentence? %) lyrics))) - (map #(remove nil? 
%))
-       (take 400)
-       (flatten)
-       (nlp/pos-constituent-frequencies)
-       #_(map (fn [lyrics]
-                (map #(nlp/tags nlp/prhyme-pos-tagger (nlp/tokenize %)) lyrics)))))
-
-(comment
-  (time
-   (let [directory "dark-corpus/"]
-     (gen-pos-markov directory)))
-
-  )
diff --git a/src/com/owoga/prhyme/nlp/core.clj b/src/com/owoga/prhyme/nlp/core.clj
index 145c81d..26736f0 100644
--- a/src/com/owoga/prhyme/nlp/core.clj
+++ b/src/com/owoga/prhyme/nlp/core.clj
@@ -7,7 +7,10 @@
             [com.owoga.prhyme.nlp.tag-sets.treebank-ii :as tb2]
             [com.owoga.prhyme.util.weighted-rand :as weighted-rand]
             [clojure.walk :as walk])
-  (:import (opennlp.tools.postag POSModel POSTaggerME)))
+  (:import (opennlp.tools.postag POSModel POSTaggerME)
+           (opennlp.tools.parser Parse ParserModel
+                                 ParserFactory)
+           (opennlp.tools.cmdline.parser ParserTool)))
 
 (def tokenize (nlp/make-tokenizer (io/resource "models/en-token.bin")))
 (def get-sentences (nlp/make-sentence-detector (io/resource "models/en-sent.bin")))
@@ -48,6 +51,42 @@
   ;; [["DT" "VBG" "VBZ" "."] (0.9758878 0.03690145 0.27251 0.9286113)])
   )
 
+;;;; Custom parser to get access to top N parses
+(def custom-parser
+  ;; Close the model stream once the parser is built instead of leaking it.
+  (with-open [model-in (io/input-stream
+                        (io/resource "models/en-parser-chunking.bin"))]
+    (ParserFactory/create (ParserModel. model-in) 3 0.95)))
+
+(defn parse-probs [parses]
+  (map #(.getProb %) parses))
+
+(defn parse-strs [parses]
+  (let [results (StringBuffer.)]
+    (run!
+     #(do (.show % results)
+          (.append results "\n"))
+     parses)
+    (string/split results #"\n")))
+
+(comment
+  (tokenize "Eric's testing.")
+  (let [parses (ParserTool/parseLine "Eric 's testing ." custom-parser 3)]
+    ((juxt parse-probs parse-strs) parses))
+
+  )
+
+(defn parse-top-n
+  "Parses the space-separated token string `tokenized` with `custom-parser`,
+  asking for `n` parses; returns a seq of [parse-string probability] pairs."
+  [tokenized n]
+  (let [parses (ParserTool/parseLine tokenized custom-parser n)]
+    (map vector (parse-strs parses) (parse-probs parses))))
+
+(comment
+  (parse-top-n "." 3)
+  )
+
 (defn deep-merge-with [f & maps]
   (letfn [(m [& xs]
             (if (some map? xs)
@@ -593,6 +632,14 @@
               ;; ???
:else (recur (zip/next zipper)))))))
 
+(defn leaf-nodes
+  "Lazy seq of every string found in the nested-seq `tree`, in depth-first order."
+  [tree]
+  ;; Only seqs are walked into; vectors, maps, symbols etc. are treated as leaves.
+  (->> tree
+       (tree-seq seq? seq)
+       (filter string?)))
+
 (comment
   (let [corpus ["this is a test"
                 "that is a test"
@@ -970,3 +1017,11 @@
        (into {})))
 
   )
+
+(comment
+  (let [text ["bother me"]]
+    (->> text
+         (map tokenize)
+         (map #(top-k-sequences prhyme-pos-tagger %))))
+
+  )