Add comments to nlp/g code

main
Eric Ihli 4 years ago
parent c0ad5acefc
commit 0c7da21610

@ -1,16 +1,21 @@
(ns examples.core (ns examples.core
(:require [clojure.string :as string] (:require [clojure.string :as string]
[clojure.set] [clojure.set]
[clojure.java.io :as io]
[com.owoga.prhyme.frp :as frp] [com.owoga.prhyme.frp :as frp]
[com.owoga.prhyme.util :as util]
[com.owoga.prhyme.core :as prhyme] [com.owoga.prhyme.core :as prhyme]
[com.owoga.prhyme.data.bigrams :as bigrams] [com.owoga.prhyme.data.bigrams :as bigrams]
[com.owoga.prhyme.gen :as gen] [com.owoga.prhyme.gen :as gen]
[com.owoga.prhyme.nlp.core :as nlp]
[com.owoga.prhyme.nlg.core :as nlg]
[com.owoga.prhyme.data.dictionary :as dict] [com.owoga.prhyme.data.dictionary :as dict]
[com.owoga.prhyme.data.thesaurus :as thesaurus] [com.owoga.prhyme.data.thesaurus :as thesaurus]
[com.owoga.prhyme.data.darklyrics :as darklyrics] [com.owoga.prhyme.data.darklyrics :as darklyrics]
[com.owoga.prhyme.generation.weighted-selection :as weighted] [com.owoga.prhyme.generation.weighted-selection :as weighted]
[clojure.set :as set] [clojure.set :as set]
[clojure.zip :as zip])) [clojure.zip :as zip]
[clojure.walk :as walk]))
(defn weight-fn [word target result] (defn weight-fn [word target result]
(let [rimes (frp/consecutive-matching word target :rimes) (let [rimes (frp/consecutive-matching word target :rimes)
@ -190,3 +195,91 @@
(map :normalized-word))))) (map :normalized-word)))))
) )
(defn remove-sentences-with-words-not-in-dictionary [dictionary]
(let [dictionary (into #{} dictionary)]
(fn [sentences]
(->> sentences
(map #(string/split % #" "))
(remove #(some (complement dictionary) %))
(remove #(some string/blank? %))
(map #(string/join " " %))))))
(defn dark-pos-freqs []
(let [directory "dark-corpus"]
(->> (file-seq (io/file directory))
(remove #(.isDirectory %))
(drop 10)
(take 10)
(map slurp)
(map util/clean-text)
(filter dict/english?)
(map #(string/split % #"\n+"))
(map (remove-sentences-with-words-not-in-dictionary dict/popular))
(map nlp/treebank-zipper)
(map nlp/leaf-pos-path-word-freqs)
(apply nlp/deep-merge-with +))))
(defn dark-structures []
(let [directory "dark-corpus"]
(->> (file-seq (io/file directory))
(remove #(.isDirectory %))
(take 1000)
(map slurp)
(map util/clean-text)
(filter dict/english?)
(map #(string/split % #"\n+"))
(map #(remove string/blank? %))
(map nlp/parse-to-simple-tree)
(map nlp/parse-tree-sans-leaf-words)
(map
(fn [lines]
(map
(fn [line]
(hash-map line 1))
lines)))
(map (partial merge-with +))
flatten
(apply merge-with +))))
(comment
(time (def example-pos-freqs (dark-pos-freqs)))
example-pos-freqs
(take 20 example-pos-freqs)
(time (def example-structures (dark-structures)))
(def common-example-structures
(filter
#(< 10 (second %))
example-structures))
(count common-example-structures)
(let [structure (rand-nth (seq common-example-structures))
zipper (zip/seq-zip (first structure))]
(loop [zipper zipper]
(let [path (map first (zip/path zipper))]
(cond
(zip/end? zipper) (zip/root zipper)
(and (not-empty path)
(example-pos-freqs path))
(recur
(-> zipper
zip/up
(zip/append-child
(first
(rand-nth
(seq
(example-pos-freqs path)))))
zip/down
zip/next
zip/next))
:else (recur (zip/next zipper))))))
(get-in {:a 1} '())
(let [zipper (zip/seq-zip '(TOP (S (NP) (VB))))]
(-> zipper
zip/down
zip/right
zip/node))
)

@ -3,7 +3,7 @@
[com.owoga.prhyme.util.math :as math] [com.owoga.prhyme.util.math :as math]
[com.owoga.prhyme.util :as util] [com.owoga.prhyme.util :as util]
[com.owoga.prhyme.util.weighted-rand :as weighted-rand] [com.owoga.prhyme.util.weighted-rand :as weighted-rand]
[com.owoga.prhyme.util.nlp :as nlp] [com.owoga.prhyme.nlp.core :as nlp]
[com.owoga.prhyme.frp :as frp] [com.owoga.prhyme.frp :as frp]
[com.owoga.prhyme.core :as prhyme])) [com.owoga.prhyme.core :as prhyme]))

@ -4,7 +4,9 @@
[clojure.string :as string] [clojure.string :as string]
[clojure.java.io :as io] [clojure.java.io :as io]
[clojure.zip :as zip] [clojure.zip :as zip]
[com.owoga.prhyme.nlp.tag-sets.treebank-ii :as tb2]) [com.owoga.prhyme.nlp.tag-sets.treebank-ii :as tb2]
[com.owoga.prhyme.util.weighted-rand :as weighted-rand]
[clojure.walk :as walk])
(:import (opennlp.tools.postag POSModel POSTaggerME))) (:import (opennlp.tools.postag POSModel POSTaggerME)))
(def tokenize (nlp/make-tokenizer (io/resource "models/en-token.bin"))) (def tokenize (nlp/make-tokenizer (io/resource "models/en-token.bin")))
@ -44,13 +46,6 @@
(top-k-sequences prhyme-pos-tagger (tokenize phrase)))) (top-k-sequences prhyme-pos-tagger (tokenize phrase))))
;; => ([["DT" "NN" "VBZ" "."] (0.9758878 0.93964833 0.7375927 0.95285994)] ;; => ([["DT" "NN" "VBZ" "."] (0.9758878 0.93964833 0.7375927 0.95285994)]
;; [["DT" "VBG" "VBZ" "."] (0.9758878 0.03690145 0.27251 0.9286113)]) ;; [["DT" "VBG" "VBZ" "."] (0.9758878 0.03690145 0.27251 0.9286113)])
(let [phrase "Blood falls."]
(->> phrase
tokenize
(top-k-sequences prhyme-pos-tagger)
(map (juxt #(.getOutcomes %)
#(map float (.getProbs %))))))
) )
(defn deep-merge-with [f & maps] (defn deep-merge-with [f & maps]
@ -67,6 +62,7 @@
;; => {:a 1, :b {:b 7}, :c 3} ;; => {:a 1, :b {:b 7}, :c 3}
) )
(defn valid-sentence? (defn valid-sentence?
"Tokenizes and parses the phrase using OpenNLP models from "Tokenizes and parses the phrase using OpenNLP models from
http://opennlp.sourceforge.net/models-1.5/ http://opennlp.sourceforge.net/models-1.5/
@ -95,7 +91,16 @@
representation of the tree, then we can `unmake` the tree representation of the tree, then we can `unmake` the tree
to turn it into a list representation of the tree that to turn it into a list representation of the tree that
we can easily use in a clojure zipper. (read-string almost works, we can easily use in a clojure zipper. (read-string almost works,
but falls apart when reading things like commas)." but falls apart when reading things like commas).
This doesn't map to the string returned by the parser.
Children here are all nested inside a single list.
For example: (NP ((DT 'a') (NN ('sample')) (NN 'test')))
rather than: (NP (DT 'a') (NN 'sample') (NN 'test'))
If you want a data structure that is in parity with the
string returned by the parser, look at `parse-to-simple-tree`
"
[node] [node]
(cond (cond
(string? node) node (string? node) node
@ -103,42 +108,78 @@
:else (map unmake-tree node))) :else (map unmake-tree node)))
(comment (comment
(let [phrase "Hello, Eric"] (let [text-lines ["This is a sample test."
(->> phrase "This is another line."]]
tokenize (->> text-lines
(string/join " ") (map tokenize)
vector (map (partial string/join " "))
parse parse
(map tb/make-tree) (map tb/make-tree)
unmake-tree)) (map unmake-tree)))
;; => ((TOP ((S ((INTJ ((UH ("Hello")))) (, (",")) (. ("Eric"))))))) ;; => ((TOP
;; ((S
;; ((NP ((DT ("This"))))
;; (VP ((VBZ ("is")) (NP ((DT ("a")) (NN ("sample")) (NN ("test"))))))
;; (. ("."))))))
;; (TOP
;; ((S
;; ((NP ((DT ("This"))))
;; (VP ((VBZ ("is")) (NP ((DT ("another")) (NN ("line"))))))
;; (. (".")))))))
) )
(defn treebank-zipper
"Turns a bit of text into a parse tree into a zipper." (defn parse-to-simple-tree
[text] "Returns a Clojure data structure, a list, in the shape of a tree, that
(let [tree (->> text matches the structure returned by the parser.
tokenize
(string/join " ") The implementation takes a roundabout path. It uses `tb/make-tree` to
vector get a map/list data structure, it then unmakes the tree info a data
structure that can be `walk`ed, it then walks the tree making modifications
to bring it in line with the parse string structure.
"
[text-lines]
(->> text-lines
(map tokenize)
(map (partial string/join " "))
parse parse
(map tb/make-tree) (map tb/make-tree)
unmake-tree)] (map unmake-tree)
(zip/seq-zip tree))) (map
#(walk/postwalk
(fn [node]
(cond
(and (seq? node)
(symbol? (first node))
(and (seq? (second node))
(seq? (first (second node)))))
(cons (first node) (second node))
(and (seq? node)
(string? (first node)))
(first node)
:else node))
%))))
(comment (comment
(let [zipper (treebank-zipper "This is a zipper test.")] ;; This is what the parsed string looks like.
zipper) (let [text-lines ["Hello, world!"]]
;; => [((TOP (->> text-lines (map tokenize) (map (partial string/join " ")) parse))
;; ((S ;; => ["(TOP (FRAG (INTJ (UH Hello)) (, ,) (NP (NN world)) (. !)))"]
;; ((NP ((DT ("This"))))
;; (VP ((VBZ ("is")) (NP ((DT ("a")) (NN ("zipper")) (NN ("test")))))) ;; And this is what the simple-tree Clojure data structure looks like.
;; (. ("."))))))) (let [text-lines ["Hello, world!"]]
;; nil] (parse-to-simple-tree text-lines))
;; => ((TOP (FRAG (INTJ (UH "Hello")) (, ",") (NP (NN "world")) (. "!"))))
) )
(defn treebank-zipper-1 (defn treebank-zipper
"Turns a bit of text into a parse tree into a zipper." "Turns a bit of text into a parse tree into a zipper.
Porcelain. If you have the simple tree data structure
returned by `parse-to-simple-tree`, then you can just
pass that directly to `zip/seq-zip`."
[texts] [texts]
(let [tree (->> texts (let [tree (->> texts
(map tokenize) (map tokenize)
@ -148,14 +189,30 @@
unmake-tree)] unmake-tree)]
(zip/seq-zip tree))) (zip/seq-zip tree)))
(defn seq-zip [zipper] (comment
(let [texts ["Eric's test is difficult."]]
(loop [zipper (treebank-zipper texts)]
(cond
(zip/end? zipper) (zip/root zipper)
(= 'JJ (zip/node zipper)) (recur (-> zipper
zip/next
(zip/replace '("thorough"))))
:else (recur (zip/next zipper)))))
;; => ((TOP
;; ((S
;; ((NP ((NP ((NNP ("Eric")) (POS ("'s")))) (NN ("test"))))
;; (VP ((VBZ ("is")) (ADJP ((JJ ("thorough"))))))
;; (. (".")))))))
)
(defn iter-zip [zipper]
(->> zipper (->> zipper
(iterate zip/next) (iterate zip/next)
(take-while (complement zip/end?)))) (take-while (complement zip/end?))))
(defn seq-nodes [zipper] (defn iter-nodes [zipper]
(->> zipper (->> zipper
seq-zip iter-zip
(map zip/node))) (map zip/node)))
(defn phrase-level? [node] (defn phrase-level? [node]
@ -168,9 +225,19 @@
(= 1 (count (second node))) (= 1 (count (second node)))
(string? (first (second node))))) (string? (first (second node)))))
(defn leaf-pos-paths [zipper] (defn leaf-pos-paths
"Seq of the path down the parse tree to each leaf part-of-speech.
Useful to aggregate over a corpus information like how often
different words are used as different parts of speech.
Benefit of having the entire path down the tree is that you
can know things like 'A noun phrase as a child of a verb phrase
never has a determiner.'
"
[zipper]
(->> zipper (->> zipper
seq-zip iter-zip
(filter (complement zip/branch?)) (filter (complement zip/branch?))
(map zip/path) (map zip/path)
(map #(map first %)) (map #(map first %))
@ -182,23 +249,79 @@
(filter #(string? (last %))))) (filter #(string? (last %)))))
(comment (comment
(let [zipper (treebank-zipper-1 ["This is a zipper test." (let [zipper (treebank-zipper ["Commas, they work."
"And this is another one."])] "Eric's test is difficult."
"Eric's testing the code."])]
(leaf-pos-paths zipper)) (leaf-pos-paths zipper))
;; => ((TOP S NP DT "This") ;; => ((TOP S NP NNP "Commas")
;; (TOP S VP VBZ "is") ;; (TOP S , ",")
;; (TOP S VP NP DT "a") ;; (TOP S NP PRP "they")
;; (TOP S VP NP NN "zipper") ;; (TOP S VP VBP "work")
;; (TOP S VP NP NN "test")
;; (TOP S . ".") ;; (TOP S . ".")
;; (TOP S CC "And") ;; (TOP S NP NP NNP "Eric")
;; (TOP S NP DT "this") ;; (TOP S NP NP POS "'s")
;; (TOP S NP NN "test")
;; (TOP S VP VBZ "is") ;; (TOP S VP VBZ "is")
;; (TOP S VP NP DT "another") ;; (TOP S VP ADJP JJ "difficult")
;; (TOP S VP NP CD "one") ;; (TOP S . ".")
;; (TOP S NP NNP "Eric")
;; (TOP S VP VBZ "'s")
;; (TOP S VP VP VBG "testing")
;; (TOP S VP VP NP DT "the")
;; (TOP S VP VP NP NN "code")
;; (TOP S . ".")) ;; (TOP S . "."))
) )
(defn leaf-pos-path-word-freqs [zipper]
(->> zipper
leaf-pos-paths
(map #(hash-map (butlast %) {(last %) 1}))
(apply deep-merge-with +)))
(comment
(let [zipper (treebank-zipper ["Eric's test is difficult."
"Eric's test is thorough."
"Eric's testing."])]
(leaf-pos-path-word-freqs zipper))
;; => {(TOP NP .) {"." 1},
;; (TOP NP NP POS) {"'s" 1},
;; (TOP S NP NP NNP) {"Eric" 2},
;; (TOP NP NN) {"testing" 1},
;; (TOP S VP VBZ) {"is" 2},
;; (TOP S .) {"." 2},
;; (TOP S NP NP POS) {"'s" 2},
;; (TOP NP NP NNP) {"Eric" 1},
;; (TOP S NP NN) {"test" 2},
;; (TOP S VP ADJP JJ) {"difficult" 1, "thorough" 1}}
)
(defn parse-tree-sans-leaf-words
"Takes a 'simple' parse tree (`parse-to-simple-tree`)
Removes the leaf words from the tree.
Useful if you want to work with the structure of something
without caring about the actual words."
[tree]
(walk/postwalk
(fn [node]
(if (and (seq? node)
(string? (second node)))
(take 1 node)
node))
tree))
(comment
(let [tree (parse-to-simple-tree ["Eric's test is difficult."
"Eric's test is thorough."
"Eric's testing."
"Eric is testing."])]
(parse-tree-sans-leaf-words tree))
;; => ((TOP (S (NP (NP (NNP) (POS)) (NN)) (VP (VBZ) (ADJP (JJ))) (.)))
;; (TOP (S (NP (NP (NNP) (POS)) (NN)) (VP (VBZ) (ADJP (JJ))) (.)))
;; (TOP (NP (NP (NNP) (POS)) (NN) (.)))
;; (TOP (S (NP (NNP)) (VP (VBZ) (VP (VBG))) (.))))
)
;;;; This is not sufficient ;;;; This is not sufficient
;; You'll end up with a mapping that says a verb phrase can be a ;; You'll end up with a mapping that says a verb phrase can be a
@ -232,7 +355,7 @@
" "
[zipper] [zipper]
(->> zipper (->> zipper
seq-nodes iter-nodes
(filter phrase-level?) (filter phrase-level?)
(map (map
(fn [[pos xs]] (fn [[pos xs]]
@ -246,29 +369,47 @@
(apply deep-merge-with +))) (apply deep-merge-with +)))
(comment (comment
(let [zipper (treebank-zipper-1 ["This is a test and that is not a test." (let [zipper (treebank-zipper ["Eric's test is difficult."
"My name is Eric." "Eric's test is thorough."
"Go to the store." "Eric's testing."])
"Your name is not Eric."
"This is a sample test."])
freqs (->> zipper freqs (->> zipper
pos->children-freqs)] pos->children-freqs)]
freqs) freqs)
;; => {NP {(DT) 3, (DT NN) 3, (PRP$ NN) 2, (NNP) 2, (DT NN NN) 1}, ;; => {NP {(NP NN) 2, (NNP POS) 3, (NP NN .) 1},
;; VB {"Go" 1}, ;; ADJP {(JJ) 2},
;; VBZ {"is" 5}, ;; VBZ {"is" 2},
;; S {(S CC S .) 1, (NP VP) 2, (NP VP .) 3}, ;; S {(NP VP .) 2},
;; RB {"not" 2}, ;; NNP {"Eric" 3},
;; NNP {"Eric" 2}, ;; JJ {"difficult" 1, "thorough" 1},
;; TO {"to" 1}, ;; TOP {(S) 2, (NP) 1},
;; DT {"This" 2, "a" 3, "that" 1, "the" 1}, ;; POS {"'s" 3},
;; TOP {(S) 4, (VP) 1}, ;; NN {"test" 2, "testing" 1},
;; CC {"and" 1}, ;; VP {(VBZ ADJP) 2},
;; NN {"test" 3, "name" 2, "store" 1, "sample" 1}, ;; . {"." 3}}
;; PP {(TO NP) 1}, )
;; VP {(VBZ NP) 3, (VBZ RB NP) 2, (VB PP .) 1},
;; PRP$ {"My" 1, "Your" 1}, (defn remove-sentences-with-words-not-in-dictionary [dictionary]
;; . {"." 5}} (let [dictionary (into #{} dictionary)]
(fn [sentences]
(println sentences)
(println dictionary)
(->> sentences
(map #(string/split % #" "))
(remove #(some (complement dictionary) %))
(remove #(some string/blank? %))
(map #(string/join " " %))))))
(comment
(let [dictionary ["this" "is" "a" "test"]
remove-fn (remove-sentences-with-words-not-in-dictionary
dictionary)
sentences ["this is a foobar test"
"Eric is a test"
"this is a test"
"a test this is"]]
(->> sentences
remove-fn))
;; => ("this is a test" "a test this is")
) )
(defn generate-from-freqs (defn generate-from-freqs
@ -276,86 +417,78 @@
and the number of times each constituent is seen in a corpus. and the number of times each constituent is seen in a corpus.
start is the part-of-speech to start generating for. start is the part-of-speech to start generating for.
Creates a zipper that will initially have just the starting node.
Iterates through the zipper with zip/next. When we encounter a node
that exists in the frequencies map, then replace the node we are on
with a value from the map.
A problem with this is that we might not always get a valid structure.
For example, a noun phrase might not start with a determiner 20% of the time,
but 100% of the time when the noun phrase follows a verb phrase.
So if we are just selecting based on frequency, we'll end up worse off than
if we take into account the entire context we're in.
" "
[freqs start] [freqs start]
(let [zipper (zip/seq-zip (list start))] (let [zipper (zip/seq-zip (list start))]
(loop [zipper zipper] (loop [zipper zipper]
(let [cur-freqs (freqs (zip/node zipper))]
(cond (cond
(zip/end? zipper) (zip/root zipper) (zip/end? zipper) (zip/root zipper)
(zip/branch? zipper) (recur (zip/next zipper)) (zip/branch? zipper) (recur (zip/next zipper))
(freqs (zip/node zipper)) cur-freqs
(let [selection (first (weighted-rand/weighted-selection second cur-freqs))]
(recur (recur
(zip/next (zip/next
(zip/next (zip/next
(zip/replace (zip/replace
zipper zipper
(list (list (zip/node zipper) selection))))))
(zip/node zipper)
(first (rand-nth (seq (freqs (zip/node zipper))))))))))
:else (recur (zip/next zipper)))))) :else (recur (zip/next zipper)))))))
(comment (comment
(let [zipper (treebank-zipper-1 ["This is a zipper test in the car." (letfn [;; This is just a helper function to get the string leafs of a tree and
"And this is another one." ;; join them into a single string.
"Here are some sample sentences." (string-leaf-nodes [tree]
"Let's see what we can generate." (->> tree
"This is a big adjective."
"That's an ugly sentence."
"The corpus will be simple to start."])
freqs (->> zipper
pos->children-freqs)]
freqs
(->> (generate-from-freqs freqs 'TOP))
(->> (repeatedly
(fn []
(->> (generate-from-freqs freqs 'TOP)
(zip/seq-zip) (zip/seq-zip)
(iterate zip/next) (iterate zip/next)
(take-while (complement zip/end?)) (take-while (complement zip/end?))
(filter (complement zip/branch?)) (filter (complement zip/branch?))
(map zip/node) (map zip/node)
(filter string?) (filter string?)
(string/join " ")))) (string/join " ")))]
(filter valid-sentence?) (let [ ;; Create an initial corpus.
(take 10))) zipper (treebank-zipper ["This is a zipper test in the car."
;; => ("Here be what start big And This test see what be That one start . . That one ." "And this is another one."
;; "Here are This ugly zipper ." "Here are some sample sentences."
;; "And an is The sample ." "Let's see what we can generate."
;; "generate simple see This corpus test to Let And a Let what an corpus adjective to be And another ugly car can see ugly start what this one to are . . . . . . ." "This is a big adjective."
;; "This sample sentences Let big 's a sample adjective in we ." "That's a short adjective."
;; "will generate what Let" "The corpus will be a simple start."])
;; "Here be ugly That sentence generate a big test ." ;; Extract parts-of-speech frequencies from the corpus.
;; "Here is this corpus sentences in a test this sample sentences ." freqs (pos->children-freqs zipper)]
;; "And some ugly zipper 's 's ." ;; Let's generate a bunch of noun phrases.
;; "Here generate big the adjective 's another corpus an .") (repeatedly
;; => ((TOP 10
;; ((SINV (fn []
;; ((ADVP ((RB "Here"))) (->> (generate-from-freqs freqs 'NP)
;; (VP string-leaf-nodes)))))
;; ((TO "to") ;; => ("another simple corpus"
;; (VP ;; "we"
;; ((TO "to") ;; "a big start"
;; (VP ;; "This start"
;; ((VB "Let") ;; "'s"
;; (SBAR ;; "a"
;; ((WHNP ((WP "what"))) ;; "a test sentences"
;; (S ;; "a sample"
;; ((NP ((DT "this") (CD "one"))) ;; "another car"
;; (VP ;; "this car sentences")
;; ((VBZ "is")
;; (NP ((DT "The") (NN "adjective") (NN "car")))
;; (PP
;; ((IN "in")
;; (NP
;; ((DT "The") (NN "test") (NNS "sentences")))))))))))))))))
;; (NP ((DT "another") (CD "one")))
;; (. "."))))))
) )
@ -384,39 +517,119 @@
(comment (comment
(let [pos-path '(TOP S NP DT "This")] (let [pos-path '(TOP S NP DT "This")]
(pos-freq pos-path)) (pos-freq pos-path))
;; => {TOP {:freq 1, S {:freq 1, NP {:freq 1, DT {:words {"This" 1}, :freq 1}}}}} ;; => {TOP {:freq 1, S {:freq 1, NP {:freq 1, DT {:words {"This" 1}, :freq 1}}}}}
) )
(defn pos-freqs [pos-paths] (defn pos-freqs [pos-paths]
(apply deep-merge-with + pos-paths)) (apply deep-merge-with + pos-paths))
(defn structure-freqs
"Frequencies of entire parse trees."
[text-lines]
(let [parse-tree (-> text-lines
parse-to-simple-tree
parse-tree-sans-leaf-words)
freqs (->> parse-tree
(map (fn [line] {line 1}))
(apply merge-with +))]
freqs))
(comment (comment
(let [zipper (treebank-zipper-1 ["This is a zipper test." (let [text-lines ["This is a test."
"And this is another one." "This is a line."
"This is not a test."])] "That is a test."
(->> zipper "You are a test."
leaf-pos-paths "This is not a test."
(map pos-freq) "I am a test."
(apply deep-merge-with +))) "This is a sample."
;; => {TOP "Go to the store."
;; {:freq 18, "Run the test."
;; S "The test is running."]]
;; {:freq 18, (structure-freqs text-lines))
;; NP {:freq 3, DT {:words {"This" 2, "this" 1}, :freq 3}}, ;; => {(TOP (S (NP (DT)) (VP (VBZ) (NP (DT) (NN))) (.))) 4,
;; VP ;; (TOP (S (NP (PRP)) (VP (VBP) (NP (DT) (NN))) (.))) 2,
;; {:freq 11, ;; (TOP (S (NP (DT)) (VP (VBZ) (RB) (NP (DT) (NN))) (.))) 1,
;; VBZ {:words {"is" 3}, :freq 3}, ;; (TOP (VP (VB) (PP (TO) (NP (DT) (NN))) (.))) 1,
;; NP ;; (TOP (VP (VB) (NP (DT) (NN)) (.))) 1,
;; {:freq 7, ;; (TOP (S (NP (DT) (NN)) (VP (VBZ) (VP (VBG))) (.))) 1}
;; DT {:words {"a" 2, "another" 1}, :freq 3}, )
;; NN {:words {"zipper" 1, "test" 2}, :freq 3},
;; CD {:words {"one" 1}, :freq 1}}, (defn generate-from-structure-and-pos-freqs
;; RB {:words {"not" 1}, :freq 1}}, "This is an attempt to improve on selecting words for a particular part-of-speech
;; . {:words {"." 3}, :freq 3}, based solely on the part of speech.
;; CC {:words {"And" 1}, :freq 1}}}}
By passing a `structure-freqs` map, we can choose from a collection of known-valid
structures.
S -> NP | VP | NP VP
NP -> NN | DT NN | DT
VP -> VB | VB NN
If we go just based on frequencies, we might have a grammar that allows something like the above.
But it might be such that when S is NP VP, then NP is NEVER DT.
Thak's what supplying the structure frequency map can improve.
"
[structure pos-freqs]
(let [zipper (zip/seq-zip structure)]
(loop [zipper zipper]
(let [path (map first (zip/path zipper))
cur-freqs (pos-freqs path)]
(cond
(zip/end? zipper) (zip/root zipper)
(zip/branch? zipper) (recur (zip/next zipper))
cur-freqs
(let [selection (first (weighted-rand/weighted-selection second cur-freqs))]
(recur
(zip/next
(zip/next
(zip/replace
zipper
(list (zip/node zipper) selection))))))
;; ???
:else (recur (zip/next zipper)))))))
(comment
(let [corpus ["this is a test"
"that is a test"
"this is a sample"
"that is some code"
"there is a car"
"those are some cars"
"that is a dog"
"it is a dog"]
structure '(TOP (S (NP (DT)) (VP (VBZ) (NP (DT) (NN)))))
pos-freqs (->> corpus
treebank-zipper
leaf-pos-path-word-freqs)]
(repeatedly
10
(fn []
(->> (generate-from-structure-and-pos-freqs
structure
pos-freqs)
zip/seq-zip
(iterate zip/next)
(take-while (complement zip/end?))
(filter #(string? (zip/node %)))
(map zip/node)
(string/join " ")))))
;; => ("this is some dog"
;; "that is a test"
;; "those is a dog"
;; "those is a sample"
;; "this is a test"
;; "that is some test"
;; "that is a car"
;; "that is a car"
;; "that is some dog"
;; "this is a test")
) )
(defn node-constituents (defn node-constituents
"Given a node of a parse tree, like ('NP (('PRP$ (\"my\" 'NN (\"name\"))))), "Given a node of a parse tree, like ('NP (('PRP$ (\"my\" 'NN (\"name\"))))),
returns a list of the top-level node tag and its first-level child tags. returns a list of the top-level node tag and its first-level child tags.

@ -68,7 +68,7 @@
(lazy-seq ((window n) (drop n coll))))))) (lazy-seq ((window n) (drop n coll)))))))
(defn clean-text [text] (defn clean-text [text]
(string/lower-case (string/replace text #"[^a-zA-Z'\-\s]" ""))) (string/lower-case (string/replace text #"[^a-zA-Z'\-\s]+" "")))
(defn padr [val n coll] (defn padr [val n coll]
(concat coll (repeat n val))) (concat coll (repeat n val)))

@ -1,210 +0,0 @@
(ns com.owoga.prhyme.util.nlp
(:require [opennlp.nlp :as nlp]
[opennlp.treebank :as tb]
[clojure.string :as string]
[clojure.java.io :as io]
[clojure.zip :as zip]
[com.owoga.prhyme.nlp.tag-sets.treebank-ii :as tb2])
(:import (opennlp.tools.postag POSModel POSTaggerME)))
(def tokenize (nlp/make-tokenizer (io/resource "models/en-token.bin")))
(def get-sentences (nlp/make-sentence-detector (io/resource "models/en-sent.bin")))
(def parse (tb/make-treebank-parser (io/resource "models/en-parser-chunking.bin")))
(def pos-tagger (nlp/make-pos-tagger (io/resource "models/en-pos-maxent.bin")))
;;;; The tagger that onennlp.nlp gives us doesn't provide access
;;;; to the probabilities of all tags. It gives us the probability of the
;;;; top tag through some metadata. But to get probs for all tags, we
;;;; need to implement our own tagger.
(defprotocol Tagger
(tags [this sent])
(probs [this])
(top-k-sequences [this sent]))
(defn make-pos-tagger
[modelfile]
(let [model (with-open [model-stream (io/input-stream modelfile)]
(POSModel. model-stream))
tagger (POSTaggerME. model)]
(reify Tagger
(tags [_ tokens]
(let [token-array (into-array String tokens)]
(map vector tokens (.tag tagger #^"[Ljava.lang.String;" token-array))))
(probs [_] (seq (.probs tagger)))
(top-k-sequences [_ tokens]
(let [token-array (into-array String tokens)]
(.topKSequences tagger #^"[Ljava.lang.String;" token-array))))))
(def prhyme-pos-tagger (make-pos-tagger (io/resource "models/en-pos-maxent.bin")))
(comment
(let [phrase "The feeling hurts."]
(map (juxt #(.getOutcomes %)
#(map float (.getProbs %)))
(top-k-sequences prhyme-pos-tagger (tokenize phrase))))
;; => ([["DT" "NN" "VBZ" "."] (0.9758878 0.93964833 0.7375927 0.95285994)]
;; [["DT" "VBG" "VBZ" "."] (0.9758878 0.03690145 0.27251 0.9286113)])
)
(defn valid-sentence?
"Tokenizes and parses the phrase using OpenNLP models from
http://opennlp.sourceforge.net/models-1.5/
If the parse tree has an clause as the top-level tag, then
we consider it a valid English sentence."
[phrase]
(->> phrase
tokenize
(string/join " ")
vector
parse
first
tb/make-tree
:chunk
first
:tag
tb2/clauses
boolean))
(defn unmake-tree
"Tokenizing and then parsing a sentence returns a string
representation of the parse tree. This is a helper function
to make working with the parse tree more convenient. We
can use `opennlp.treebank/make-tree` to make a clojure map
representation of the tree, then we can `unmake` the tree
to turn it into a list representation of the tree that
we can easily use in a clojure zipper. (read-string almost works,
but falls apart when reading things like commas)."
[node]
(cond
(string? node) node
(map? node) (list (:tag node) (unmake-tree (:chunk node)))
:else (map unmake-tree node)))
(comment
(let [phrase "Hello, Eric"]
(->> phrase
tokenize
(string/join " ")
vector
parse
(map tb/make-tree)
unmake-tree))
;; => ((TOP ((S ((INTJ ((UH ("Hello")))) (, (",")) (. ("Eric")))))))
)
(defn treebank-zipper
"Turns a bit of text into a parse tree into a zipper."
[text]
(let [tree (->> text
tokenize
(string/join " ")
vector
parse
(map tb/make-tree)
unmake-tree)]
(zip/zipper seq? seq (fn [_ c] c) tree)))
(defn node-constituents
"Given a node of a parse tree, like ('NP (('PRP$ (\"my\" 'NN (\"name\"))))),
returns a list of the top-level node tag and its first-level child tags.
"
[node]
(list
(first node)
(if (every? string? (map first (rest node)))
nil
(map first (first (rest node))))))
(defn phrase-constituents
"Given a bit of text that can be parsed into a treebank tree,
Get a sequence of the tags and their chunks.
For example:
My name is Eric.
Returns the sequence:
At the TOP tag, we have a 'S part-of-speech (a clause).
At the 'S tag, we have a 'NP, 'VP, '. (noun-phrase + verb-phrase + period)
At the 'NP tag, we have a 'PRP$, 'NN (personal-pronoun + singular-noun)
...
"
[text]
(->> (treebank-zipper text)
(iterate zip/next)
(take-while (complement zip/end?))
(filter (complement zip/branch?))
(map zip/path)
(map last)
(map node-constituents)
(remove #(string? (first %)))))
(comment
(phrase-constituents "My name is Eric.")
;; => ((TOP (S)) (S (NP VP .)) (NP (PRP$ NN)) (VP (VBZ NP)) (NP (NNP)))
(phrase-constituents "How are you?")
;; => ((TOP (SBARQ)) (SBARQ (WHADVP SQ .)) (WHADVP (WRB)) (SQ (VBP NP)) (NP (PRP)))
)
(defn pos-constituent-frequencies
"Frequencies of the parts of speech that make up phrases.
Example:
Clauses are made up of:
NounPhrase + VerbPhrase 2 times
Clause + CoordinatingConjuction + Clause 1 times
NounPhrases are made up of:
ProperNouns 2 times
PersonalPronoun + SingularNoun 3 times
Does not include frequencies for leaf words. By that I mean: A SingularNoun might
appear 5 times all together, 3 times as part of a PersonalPronoun + SingularNoun pair
and 2 times as part of an Adjective + SingularNoun pair, but the data structure returned
by this function won't include that 5 anywhere. This is due to the (remove #(nil? (second %)))
line. This data structure is used as a kind of markov selection process and we don't really
care how often the leafs are used. We just care about the ratio at which we should pick each
leaf from a given parent.
"
[texts]
(reduce
(fn [acc text]
(let [constituents (->> text
phrase-constituents
(remove #(nil? (second %))))]
(reduce
(fn [acc constituent]
(let [k1 (first constituent)
k2 (second constituent)]
(update-in acc [k1 k2] (fnil inc 0))))
acc
constituents)))
{}
texts))
(comment
(pos-constituent-frequencies
["My name is Eric."
"My hat is blue and I like cake."
"Your name is Taylor."
"How are you?"])
;; => {TOP {(S) 3, (SBARQ) 1},
;; S {(NP VP .) 2, (S CC S .) 1, (NP VP) 2},
;; NP {(PRP$ NN) 3, (NNP) 2, (PRP) 2, (NN) 1},
;; VP {(VBZ NP) 2, (VBZ ADJP) 1, (VBP NP) 1},
;; ADJP {(JJ) 1},
;; SBARQ {(WHADVP SQ .) 1},
;; WHADVP {(WRB) 1},
;; SQ {(VBP NP) 1}}
(let [phrase "How are you today?"]
(->> phrase
tokenize
(string/join " ")
vector
parse
(map tb/make-tree)))
(let [phrase "I gave the cake to John at the store."]
(parse (tokenize phrase)))
(let [phrase "I've got a good feeling"]
(pos-tagger (tokenize phrase)))
)

@ -78,7 +78,8 @@
selection (nth coll index)] selection (nth coll index)]
selection)) selection))
([key-fn coll] ([key-fn coll]
(let [rng (from-weights (map key-fn coll)) (let [coll (seq coll)
rng (from-weights (map key-fn coll))
index (nextr rng nil) index (nextr rng nil)
selection (nth coll index)] selection (nth coll index)]
selection))) selection)))

Loading…
Cancel
Save