From c0ad5acefcdcf7a75f267b85a2d5b5771e1c941b Mon Sep 17 00:00:00 2001 From: Eric Ihli Date: Thu, 5 Nov 2020 14:48:04 -0800 Subject: [PATCH] Add simple-nlg --- deps.edn | 1 + src/com/owoga/corpus/markov.clj | 13 +- src/com/owoga/prhyme/data/dictionary.clj | 6 +- src/com/owoga/prhyme/nlg/core.clj | 162 +++++ src/com/owoga/prhyme/nlp/core.clj | 759 +++++++++++++++++++++++ 5 files changed, 933 insertions(+), 8 deletions(-) create mode 100644 src/com/owoga/prhyme/nlg/core.clj create mode 100644 src/com/owoga/prhyme/nlp/core.clj diff --git a/deps.edn b/deps.edn index 863dc5e..ce3aa9f 100644 --- a/deps.edn +++ b/deps.edn @@ -7,6 +7,7 @@ inflections {:mvn/version "0.13.2"} com.taoensso/tufte {:mvn/version "2.2.0"} clojure-opennlp {:mvn/version "0.5.0"} + uk.ac.abdn/SimpleNLG {:mvn/version "4.5.0"} net.sf.sociaal/freetts {:mvn/version "1.2.2"} enlive {:mvn/version "1.1.6"} integrant {:mvn/version "0.8.0"} diff --git a/src/com/owoga/corpus/markov.clj b/src/com/owoga/corpus/markov.clj index 1e036e8..2920186 100644 --- a/src/com/owoga/corpus/markov.clj +++ b/src/com/owoga/corpus/markov.clj @@ -1,7 +1,7 @@ (ns com.owoga.corpus.markov (:require [com.owoga.prhyme.util :as util] [com.owoga.prhyme.data.dictionary :as dict] - [com.owoga.prhyme.util.nlp :as nlp] + [com.owoga.prhyme.nlp.core :as nlp] [clojure.string :as string] [clojure.java.io :as io])) @@ -63,12 +63,15 @@ (map #(string/split % #"\n+")) (map (fn [lyrics] (filter #(nlp/valid-sentence? %) lyrics))) (map #(remove nil? %)) - (take 5) - (map (fn [lyrics] + (take 400) + (flatten) + (nlp/pos-constituent-frequencies) + #_(map (fn [lyrics] (map #(nlp/tags nlp/prhyme-pos-tagger (nlp/tokenize %)) lyrics))))) (comment - (let [directory "dark-corpus/zero-hour/"] - (gen-pos-markov directory)) + (time + (let [directory "dark-corpus/"] + (gen-pos-markov directory))) ) diff --git a/src/com/owoga/prhyme/data/dictionary.clj b/src/com/owoga/prhyme/data/dictionary.clj index 6e2dd3e..0e3ff48 100644 --- a/src/com/owoga/prhyme/data/dictionary.clj +++ b/src/com/owoga/prhyme/data/dictionary.clj @@ -56,9 +56,9 @@ (set (line-seq (io/reader (io/resource "nouns.txt")))))) (defn english? [text] - (let [words (string/split text #"\s+") + (let [word-set (into #{} (map :normalized-word prhyme-dict)) + words (string/split text #"\s+") english-words (->> words - (filter #((into #{} (map :normalized-word prhyme-dict)) - (string/lower-case %))))] + (filter #(word-set (string/lower-case %))))] (< 0.7 (/ (count english-words) (max 1 (count words)))))) diff --git a/src/com/owoga/prhyme/nlg/core.clj b/src/com/owoga/prhyme/nlg/core.clj new file mode 100644 index 0000000..1fe022d --- /dev/null +++ b/src/com/owoga/prhyme/nlg/core.clj @@ -0,0 +1,162 @@ +(ns com.owoga.prhyme.nlg.core + (:import (simplenlg.framework NLGFactory) + (simplenlg.lexicon Lexicon) + (simplenlg.realiser.english Realiser) + (simplenlg.features Feature Form))) + +(def lexicon (Lexicon/getDefaultLexicon)) +(def nlg-factory (NLGFactory. lexicon)) +(def realiser (Realiser. lexicon)) + +(defn realise [element] + (.realise realiser element)) + +(let [text "my dog is happy" + sentence (.createSentence nlg-factory text)] + (.realiseSentence realiser sentence)) + +(let [clause (.createClause nlg-factory) + _ (.setSubject clause "us") + _ (.setVerb clause "chase") + _ (.setObject clause "the monkey")] + (.realiseSentence realiser clause)) + +(let [NP (.createNounPhrase nlg-factory "zipper") + _ (.setDeterminer NP "another") + VP1 (.createVerbPhrase nlg-factory "eat") + clause (.createSentence nlg-factory [NP VP1])] + (.realiseSentence realiser clause)) + +(defn create-noun-phrase + ([] + (.createNounPhrase nlg-factory)) + ([noun] + (.createNounPhrase nlg-factory noun)) + ([specifier noun] + (.createNounPhrase nlg-factory specifier noun))) + +(defn set-determiner [noun-phrase determiner] + (.setDeterminer noun-phrase determiner) + noun-phrase) + +(defn set-noun [noun-phrase noun] + (.setNoun noun-phrase noun) + noun-phrase) + +(defn create-adjective-phrase + ([] + (.createAdjectivePhrase nlg-factory)) + ([adjective] + (.createAdjectivePhrase nlg-factory adjective))) + +(defn create-adverb-phrase + ([] + (.createAdverbPhrase nlg-factory)) + ([adverb] + (.createAdverbPhrase nlg-factory adverb))) + +(defn create-verb-phrase + ([] + (.createVerbPhrase nlg-factory)) + ([verb] + (.createVerbPhrase nlg-factory verb))) + +(defn set-feature [element feature-name feature-value] + (.setFeature element feature-name feature-value) + element) + +(defn set-pre-modifier [element new-pre-modifier] + (.setPreModifier element new-pre-modifier) + element) + +(ns-unmap *ns* 'create-element) +(defmulti create-element + (fn [children] (doall (map first children)))) + +(defmethod create-element '(PRP$) + [[[_ child]]] (.createNounPhrase nlg-factory child)) + +(defmethod create-element '(NN) + [[[_ child]]] (.createNounPhrase nlg-factory child)) + +(defmethod create-element '(NNP) + [[[_ child]]] + (let [noun-phrase (.createNounPhrase nlg-factory child)] + (.setPlural noun-phrase true) + noun-phrase)) + +;; A determiner by itself is usually a subject. +;; i.e. "This is a test." <- "This" is tagged by the treebank parser +;; as a determiner +(defmethod create-element '(DT) + [[[_ child]]] + (.createNounPhrase nlg-factory child)) + +(defmethod create-element '(DT NN NN) + [[DT NN1 NN2]] + (let [dt (create-element (list DT)) + nn1 (create-element (list NN1)) + nn2 (create-element (list NN2)) + clause (.createNounPhrase nlg-factory)] + (.setSpecifier clause dt) + (.setPreModifier clause nn1) + (.setNoun clause nn2) + clause)) + +(realise (create-element '((DT "a") (NN "sample") (NN "test")))) + +(defmethod create-element '(PRP$ NN) + [[prp$ nn]] + (-> (.createNounPhrase + nlg-factory + (create-element (list nn))) + (set-pre-modifier + (-> (create-element (list prp$)) + (set-feature Feature/POSSESSIVE true))))) + +(defmethod create-element + '(NP) + [[[_ child]]] + (create-element child)) + +(realise (create-element '((NP ((PRP$ "Eric") (NN "test")))))) + +(realise (create-element '((DT "This")))) +(create-element '((NN "test"))) +(realise (create-element '((PRP$ "Eric") (NN "test")))) +(realise (create-element '((NNP "tests")))) + +(defmethod create-element '(VB) + [[[_ child]]] + (.createVerbPhrase nlg-factory child)) + +(realise (create-element '((VB "run")))) + +(defmethod create-element '(VBZ) + [[[_ child]]] + (.createVerbPhrase nlg-factory child)) + +(defmethod create-element '(VBZ NP) + [[VBZ NP]] + (let [np (create-element (list NP)) + vbz (create-element (list VBZ)) + clause (.createClause nlg-factory)] + (.setObject clause np) + (.setVerb clause vbz) + clause)) + +(comment + (realise (create-element '((VBZ "is") (NP ((NN "test")))))) + ) + +(comment + (-> (create-noun-phrase) + (set-determiner "a") + (set-noun "test")) + + (-> (create-verb-phrase "let") + (set-feature Feature/FORM Form/INFINITIVE) + (#(realise %))) + + + ) diff --git a/src/com/owoga/prhyme/nlp/core.clj b/src/com/owoga/prhyme/nlp/core.clj new file mode 100644 index 0000000..e9d819f --- /dev/null +++ b/src/com/owoga/prhyme/nlp/core.clj @@ -0,0 +1,759 @@ +(ns com.owoga.prhyme.nlp.core + (:require [opennlp.nlp :as nlp] + [opennlp.treebank :as tb] + [clojure.string :as string] + [clojure.java.io :as io] + [clojure.zip :as zip] + [com.owoga.prhyme.nlp.tag-sets.treebank-ii :as tb2]) + (:import (opennlp.tools.postag POSModel POSTaggerME))) + +(def tokenize (nlp/make-tokenizer (io/resource "models/en-token.bin"))) +(def get-sentences (nlp/make-sentence-detector (io/resource "models/en-sent.bin"))) +(def parse (tb/make-treebank-parser (io/resource "models/en-parser-chunking.bin"))) +(def pos-tagger (nlp/make-pos-tagger (io/resource "models/en-pos-maxent.bin"))) + +;;;; The tagger that onennlp.nlp gives us doesn't provide access +;;;; to the probabilities of all tags. It gives us the probability of the +;;;; top tag through some metadata. But to get probs for all tags, we +;;;; need to implement our own tagger. +(defprotocol Tagger + (tags [this sent]) + (probs [this]) + (top-k-sequences [this sent])) + +(defn make-pos-tagger + [modelfile] + (let [model (with-open [model-stream (io/input-stream modelfile)] + (POSModel. model-stream)) + tagger (POSTaggerME. model)] + (reify Tagger + (tags [_ tokens] + (let [token-array (into-array String tokens)] + (map vector tokens (.tag tagger #^"[Ljava.lang.String;" token-array)))) + (probs [_] (seq (.probs tagger))) + (top-k-sequences [_ tokens] + (let [token-array (into-array String tokens)] + (.topKSequences tagger #^"[Ljava.lang.String;" token-array)))))) + +(def prhyme-pos-tagger (make-pos-tagger (io/resource "models/en-pos-maxent.bin"))) + +(comment + (let [phrase "The feeling hurts."] + (map (juxt #(.getOutcomes %) + #(map float (.getProbs %))) + (top-k-sequences prhyme-pos-tagger (tokenize phrase)))) + ;; => ([["DT" "NN" "VBZ" "."] (0.9758878 0.93964833 0.7375927 0.95285994)] + ;; [["DT" "VBG" "VBZ" "."] (0.9758878 0.03690145 0.27251 0.9286113)]) + (let [phrase "Blood falls."] + (->> phrase + tokenize + (top-k-sequences prhyme-pos-tagger) + (map (juxt #(.getOutcomes %) + #(map float (.getProbs %)))))) + + ) + +(defn deep-merge-with [f & maps] + (letfn [(m [& xs] + (if (some map? xs) + (apply merge-with m xs) + (apply f xs)))] + (reduce m maps))) + +(comment + (let [m1 {:a 1 :b {:b 2}} + m2 {:c 3 :b {:b 5}}] + (deep-merge-with + m1 m2)) + ;; => {:a 1, :b {:b 7}, :c 3} + ) + +(defn valid-sentence? + "Tokenizes and parses the phrase using OpenNLP models from + http://opennlp.sourceforge.net/models-1.5/ + + If the parse tree has an clause as the top-level tag, then + we consider it a valid English sentence." + [phrase] + (->> phrase + tokenize + (string/join " ") + vector + parse + first + tb/make-tree + :chunk + first + :tag + tb2/clauses + boolean)) + +(defn unmake-tree + "Tokenizing and then parsing a sentence returns a string + representation of the parse tree. This is a helper function + to make working with the parse tree more convenient. We + can use `opennlp.treebank/make-tree` to make a clojure map + representation of the tree, then we can `unmake` the tree + to turn it into a list representation of the tree that + we can easily use in a clojure zipper. (read-string almost works, + but falls apart when reading things like commas)." + [node] + (cond + (string? node) node + (map? node) (list (:tag node) (unmake-tree (:chunk node))) + :else (map unmake-tree node))) + +(comment + (let [phrase "Hello, Eric"] + (->> phrase + tokenize + (string/join " ") + vector + parse + (map tb/make-tree) + unmake-tree)) + ;; => ((TOP ((S ((INTJ ((UH ("Hello")))) (, (",")) (. ("Eric"))))))) + ) + +(defn treebank-zipper + "Turns a bit of text into a parse tree into a zipper." + [text] + (let [tree (->> text + tokenize + (string/join " ") + vector + parse + (map tb/make-tree) + unmake-tree)] + (zip/seq-zip tree))) + +(comment + (let [zipper (treebank-zipper "This is a zipper test.")] + zipper) + ;; => [((TOP + ;; ((S + ;; ((NP ((DT ("This")))) + ;; (VP ((VBZ ("is")) (NP ((DT ("a")) (NN ("zipper")) (NN ("test")))))) + ;; (. ("."))))))) + ;; nil] + ) + +(defn treebank-zipper-1 + "Turns a bit of text into a parse tree into a zipper." + [texts] + (let [tree (->> texts + (map tokenize) + (map (partial string/join " ")) + parse + (map tb/make-tree) + unmake-tree)] + (zip/seq-zip tree))) + +(defn seq-zip [zipper] + (->> zipper + (iterate zip/next) + (take-while (complement zip/end?)))) + +(defn seq-nodes [zipper] + (->> zipper + seq-zip + (map zip/node))) + +(defn phrase-level? [node] + (and (seq? node) + (symbol? (first node)))) + +(defn word-level? [node] + (and (seq? node) + (symbol? (first node)) + (= 1 (count (second node))) + (string? (first (second node))))) + +(defn leaf-pos-paths [zipper] + (->> zipper + seq-zip + (filter (complement zip/branch?)) + (map zip/path) + (map #(map first %)) + (map + #(filter + (fn [v] (or (string? v) + (symbol? v))) + %)) + (filter #(string? (last %))))) + +(comment + (let [zipper (treebank-zipper-1 ["This is a zipper test." + "And this is another one."])] + (leaf-pos-paths zipper)) +;; => ((TOP S NP DT "This") +;; (TOP S VP VBZ "is") +;; (TOP S VP NP DT "a") +;; (TOP S VP NP NN "zipper") +;; (TOP S VP NP NN "test") +;; (TOP S . ".") +;; (TOP S CC "And") +;; (TOP S NP DT "this") +;; (TOP S VP VBZ "is") +;; (TOP S VP NP DT "another") +;; (TOP S VP NP CD "one") +;; (TOP S . ".")) + ) + + +;;;; This is not sufficient +;; You'll end up with a mapping that says a verb phrase can be a +;; third-person-singular verb, like "is" followed by a +;; noun-phrase. But "is" can't be followed by just any noun-phrase. +;; It must be followed by a noun-phrase that starts with a determiner, +;; like "a" or "the". +(defn pos->children-freqs + "Takes a zipper representation of a parsed text. + + Returns a map of parts-of-speech to a map of their constituent parts-of-speech + and the number of times each constituent is seen in a corpus. + + For example, in the sentence 'This is a test and that is not a test.', + there are 4 noun phrases; 'this', 'a test', 'that', 'a test'. + + Twice, the noun phrase is made up of a determiner (namely, 'this' and 'that'). + Twice, the noun phras is made up of a determiner and a noun (namely, 'a test' and 'a test'). + + If the constituent part of speech is an actual word, then the key is the word. + + For the sentence 'This is a test and that is not a test.', this function would include a kv pairs of: + {NP {(DT) 2 (DT NN) 2} + CC {'and' 1} + VBZ {'is' 2} + ,,,} + + This data structure is useful for text generation. If you're generating a sentence and you + know that you need a coordinating conjunction, you can choose one in from the list weighted + by their frequencies. + " + [zipper] + (->> zipper + seq-nodes + (filter phrase-level?) + (map + (fn [[pos xs]] + (vector + pos + (if (string? (first xs)) + (first xs) + (map first xs))))) + (map (fn [[k v]] + {k {v 1}})) + (apply deep-merge-with +))) + +(comment + (let [zipper (treebank-zipper-1 ["This is a test and that is not a test." + "My name is Eric." + "Go to the store." + "Your name is not Eric." + "This is a sample test."]) + freqs (->> zipper + pos->children-freqs)] + freqs) +;; => {NP {(DT) 3, (DT NN) 3, (PRP$ NN) 2, (NNP) 2, (DT NN NN) 1}, +;; VB {"Go" 1}, +;; VBZ {"is" 5}, +;; S {(S CC S .) 1, (NP VP) 2, (NP VP .) 3}, +;; RB {"not" 2}, +;; NNP {"Eric" 2}, +;; TO {"to" 1}, +;; DT {"This" 2, "a" 3, "that" 1, "the" 1}, +;; TOP {(S) 4, (VP) 1}, +;; CC {"and" 1}, +;; NN {"test" 3, "name" 2, "store" 1, "sample" 1}, +;; PP {(TO NP) 1}, +;; VP {(VBZ NP) 3, (VBZ RB NP) 2, (VB PP .) 1}, +;; PRP$ {"My" 1, "Your" 1}, +;; . {"." 5}} + ) + +(defn generate-from-freqs + "freqs is a map of parts-of-speech to a map of their constituent parts-of-speech + and the number of times each constituent is seen in a corpus. + + start is the part-of-speech to start generating for. + " + [freqs start] + (let [zipper (zip/seq-zip (list start))] + (loop [zipper zipper] + (cond + (zip/end? zipper) (zip/root zipper) + + (zip/branch? zipper) (recur (zip/next zipper)) + + (freqs (zip/node zipper)) + (recur + (zip/next + (zip/next + (zip/replace + zipper + (list + (zip/node zipper) + (first (rand-nth (seq (freqs (zip/node zipper)))))))))) + + :else (recur (zip/next zipper)))))) + +(comment + (let [zipper (treebank-zipper-1 ["This is a zipper test in the car." + "And this is another one." + "Here are some sample sentences." + "Let's see what we can generate." + "This is a big adjective." + "That's an ugly sentence." + "The corpus will be simple to start."]) + freqs (->> zipper + pos->children-freqs)] + freqs + (->> (generate-from-freqs freqs 'TOP)) + (->> (repeatedly + (fn [] + (->> (generate-from-freqs freqs 'TOP) + (zip/seq-zip) + (iterate zip/next) + (take-while (complement zip/end?)) + (filter (complement zip/branch?)) + (map zip/node) + (filter string?) + (string/join " ")))) + (filter valid-sentence?) + (take 10))) + ;; => ("Here be what start big And This test see what be That one start . . That one ." + ;; "Here are This ugly zipper ." + ;; "And an is The sample ." + ;; "generate simple see This corpus test to Let And a Let what an corpus adjective to be And another ugly car can see ugly start what this one to are . . . . . . ." + ;; "This sample sentences Let big 's a sample adjective in we ." + ;; "will generate what Let" + ;; "Here be ugly That sentence generate a big test ." + ;; "Here is this corpus sentences in a test this sample sentences ." + ;; "And some ugly zipper 's 's ." + ;; "Here generate big the adjective 's another corpus an .") + ;; => ((TOP + ;; ((SINV + ;; ((ADVP ((RB "Here"))) + ;; (VP + ;; ((TO "to") + ;; (VP + ;; ((TO "to") + ;; (VP + ;; ((VB "Let") + ;; (SBAR + ;; ((WHNP ((WP "what"))) + ;; (S + ;; ((NP ((DT "this") (CD "one"))) + ;; (VP + ;; ((VBZ "is") + ;; (NP ((DT "The") (NN "adjective") (NN "car"))) + ;; (PP + ;; ((IN "in") + ;; (NP + ;; ((DT "The") (NN "test") (NNS "sentences"))))))))))))))))) + ;; (NP ((DT "another") (CD "one"))) + ;; (. ".")))))) + + + + ) + + +(defn pos-freq [pos-path] + (loop [remaining-path (rest pos-path) + current-path [(first pos-path)] + result {}] + (cond + (string? (first remaining-path)) + (-> result + (update-in + (conj current-path :words) + #(merge-with + % {(first remaining-path) 1})) + (update-in + current-path + #(merge-with + % {:freq 1}))) + + :else + (recur (rest remaining-path) + (conj current-path (first remaining-path)) + (update-in + result + current-path + #(merge-with + % {:freq 1})))))) + +(comment + (let [pos-path '(TOP S NP DT "This")] + (pos-freq pos-path)) + ;; => {TOP {:freq 1, S {:freq 1, NP {:freq 1, DT {:words {"This" 1}, :freq 1}}}}} + ) + + + +(defn pos-freqs [pos-paths] + (apply deep-merge-with + pos-paths)) + +(comment + (let [zipper (treebank-zipper-1 ["This is a zipper test." + "And this is another one." + "This is not a test."])] + (->> zipper + leaf-pos-paths + (map pos-freq) + (apply deep-merge-with +))) +;; => {TOP +;; {:freq 18, +;; S +;; {:freq 18, +;; NP {:freq 3, DT {:words {"This" 2, "this" 1}, :freq 3}}, +;; VP +;; {:freq 11, +;; VBZ {:words {"is" 3}, :freq 3}, +;; NP +;; {:freq 7, +;; DT {:words {"a" 2, "another" 1}, :freq 3}, +;; NN {:words {"zipper" 1, "test" 2}, :freq 3}, +;; CD {:words {"one" 1}, :freq 1}}, +;; RB {:words {"not" 1}, :freq 1}}, +;; . {:words {"." 3}, :freq 3}, +;; CC {:words {"And" 1}, :freq 1}}}} + ) +(defn node-constituents + "Given a node of a parse tree, like ('NP (('PRP$ (\"my\" 'NN (\"name\"))))), + returns a list of the top-level node tag and its first-level child tags. + " + [node] + (list + (first node) + (if (every? string? (map first (rest node))) + nil + (map first (first (rest node)))))) + +(defn node-constituents-with-tokens + "Given a node of a parse tree, like ('NP (('PRP$ (\"my\" 'NN (\"name\"))))), + returns a list of the top-level node tag and its first-level child tags. + + Like the above, but instead of converting leaf tokens to nil, leave them + as-is. This can be used to merge a tree keeping the leaf tokens in a set + and that tree can be used for a kind of markov generation of new text. + " + [node] + (list + (first node) + (if (every? string? (map first (rest node))) + (first (first (rest node))) + (map first (first (rest node)))))) + +(comment + (let [text "My name is Eric." + parse-tree (->> (treebank-zipper text) + (iterate zip/next) + (take-while (complement zip/end?)) + (filter (complement zip/branch?)) + (map zip/path) + (map last) + (map node-constituents-with-tokens))] + parse-tree) + ) + +(defn phrase-constituents + "Given a bit of text that can be parsed into a treebank tree, + Get a sequence of the tags and their chunks. + For example: + My name is Eric. + Returns the sequence: + At the TOP tag, we have a 'S part-of-speech (a clause). + At the 'S tag, we have a 'NP, 'VP, '. (noun-phrase + verb-phrase + period) + At the 'NP tag, we have a 'PRP$, 'NN (personal-pronoun + singular-noun) + ... + " + [text] + (->> (treebank-zipper text) + (iterate zip/next) + (take-while (complement zip/end?)) + (filter (complement zip/branch?)) + (map zip/path) + (map last) + (map node-constituents) + (remove #(string? (first %))))) + +(comment + (phrase-constituents "My name is Eric.") + ;; => ((TOP (S)) (S (NP VP .)) (NP (PRP$ NN)) (VP (VBZ NP)) (NP (NNP))) + (phrase-constituents "How are you?") + ;; => ((TOP (SBARQ)) (SBARQ (WHADVP SQ .)) (WHADVP (WRB)) (SQ (VBP NP)) (NP (PRP))) + ) + + +(defn pos-constituent-frequencies + "Frequencies of the parts of speech that make up phrases. + Example: + Clauses are made up of: + NounPhrase + VerbPhrase 2 times + Clause + CoordinatingConjuction + Clause 1 times + NounPhrases are made up of: + ProperNouns 2 times + PersonalPronoun + SingularNoun 3 times + + Does not include frequencies for leaf words. By that I mean: A SingularNoun might + appear 5 times all together, 3 times as part of a PersonalPronoun + SingularNoun pair + and 2 times as part of an Adjective + SingularNoun pair, but the data structure returned + by this function won't include that 5 anywhere. This is due to the (remove #(nil? (second %))) + line. This data structure is used as a kind of markov selection process and we don't really + care how often the leafs are used. We just care about the ratio at which we should pick each + leaf from a given parent. + " + [texts] + (reduce + (fn [acc text] + (let [constituents (->> text + phrase-constituents + (remove #(nil? (second %))))] + (reduce + (fn [acc constituent] + (let [k1 (first constituent) + k2 (second constituent)] + (update-in acc [k1 k2] (fnil inc 0)))) + acc + constituents))) + {} + texts)) + +(comment + (pos-constituent-frequencies + ["My name is Eric." + "My hat is blue and I like cake." + "Your name is Taylor." + "How are you?"]) + ;; => {TOP {(S) 3, (SBARQ) 1}, + ;; S {(NP VP .) 2, (S CC S .) 1, (NP VP) 2}, + ;; NP {(PRP$ NN) 3, (NNP) 2, (PRP) 2, (NN) 1}, + ;; VP {(VBZ NP) 2, (VBZ ADJP) 1, (VBP NP) 1}, + ;; ADJP {(JJ) 1}, + ;; SBARQ {(WHADVP SQ .) 1}, + ;; WHADVP {(WRB) 1}, + ;; SQ {(VBP NP) 1}} + + (let [phrase "How are you today?"] + (->> phrase + tokenize + (string/join " ") + vector + parse + (map tb/make-tree))) + + (let [phrase "I gave the cake to John at the store."] + (parse (tokenize phrase))) + + (let [phrase "I've got a good feeling"] + (pos-tagger (tokenize phrase))) + ) + +(comment + (let [text "My name is Eric." + zipper (treebank-zipper text)] + (map identity (seq-zip zipper))) + + ) + +(defn zipper->freqs [zipper] + (let [coll ()])) + +(defn reduce-to-freqs [tree] + (reduce + (fn [acc [pos [children]]] + (let [ks (map first children) + vs (map rest children)] + (-> acc + (assoc pos (merge-with + {children 1})) + ))) + {} + tree)) + +(defn text->parsed-pos-freqs + [text] + (->> text + (map tokenize) + (map (partial string/join " ")) + parse + (map tb/make-tree) + (map unmake-tree))) + +(comment + (let [text ["My name is Eric and my style is wild." + "Your name is unknown and where are you going?"]] + (text->parsed-pos-freqs text)) + + ) + +(defn text->pos + "'My name is Eric.' -> (['my' 'PRP$'] ['name' 'NN'] ,,,)" + [text] + (->> text + tokenize + pos-tagger + (map #(vector (string/lower-case (first %)) (second %))))) + +(comment + (text->pos "My name is Eric.") + ;; => (["my" "PRP$"] ["name" "NN"] ["is" "VBZ"] ["eric" "NNP"] ["." "."]) + ) + +(defn text->pos-freqs + "Turn a line of text into a map of parts of speech to word frequencies. + Useful if you have a corpus and need to sample a particular part-of-speech." + [text] + (let [pos (text->pos text)] + (into + {} + (->> pos + (map reverse) + (reduce + (fn [acc [pos word]] + (update + acc + (symbol pos) + #(update % word (fnil inc 0)))) + {}))))) + +(comment + (text->pos-freqs "My name is Eric and my style is wild.") + ;; => {PRP$ {"my" 2}, + ;; NN {"name" 1, "style" 1}, + ;; VBZ {"is" 2}, + ;; NNP {"eric" 1}, + ;; CC {"and" 1}, + ;; RB {"wild" 1}, + ;; . {"." 1}} + ) + +(defn texts->pos-freqs + "Convenience method for turning multiple lines of texts into a + tree with their pos-word-freqs merged." + [texts] + (->> texts + (map text->pos-freqs) + (apply + merge-with + (fn [old new] + (merge-with + old new))))) + +(comment + (let [texts ["My name is Eric and my style is wildish." + "Your name is unknown and your style is childish."]] + (texts->pos-freqs texts)) + ;; => {PRP$ {"my" 2, "your" 2}, + ;; NN {"name" 2, "style" 2}, + ;; VBZ {"is" 4}, + ;; NNP {"eric" 1}, + ;; CC {"and" 2}, + ;; JJ {"wildish" 1, "unknown" 1, "childish" 1}, + ;; . {"." 2}} + ) + +(defn pos-freqs->reverse + "For getting a map of words to their pos frequencies" + [tree] + (reduce + (fn [acc [pos word-freqs]] + (reduce + (fn [acc [word freq]] + (update acc word (fnil conj {}) {pos freq})) + acc + word-freqs)) + {} + tree)) + +(comment + (let [pos-freqs {'NN {"name" 2 "style" 2} + 'VBZ {"is" 4 "name" 1}}] + (pos-freqs->reverse pos-freqs)) + ;; => {"name" {NN 2, VBZ 1}, "style" {NN 2}, "is" {VBZ 4}} + ) + +(defn pos-tree->str [pos-tree] + (->> pos-tree + (map (fn [[k v]] + (cons (str k) (seq v)))) + (map #(string/join " " %)) + (string/join "\n"))) + +(defn str->pos-tree [s] + (let [lines (string/split s #"\n")] + (into + {} + (map + (fn [line] + (->> line + (#(string/split % #" " 2)) + ((fn [[k v]] + (vector (symbol k) (into #{} (string/split v #" "))))))) + lines)))) + +(comment + (text->pos-freqs "My name is Eric.") + + (->> ["My name is Eric." + "Your name is Taylor." + "Is your feeling hurt?" + "How are you feeling?" + "It is a lovely feeling." + "Your name is not Eric." + "Who is your mother and what does she do?"] + (pos-constituent-frequencies) + #_#_(apply + merge-with + (fn [a b] + (merge-with + a b))) + (pos-freqs->reverse)) + + ) + +(defn reverse-children-to-parents + "We want to work from the reverse, or backwards, representation of pos-constituents-frequencies. + Pos-constituent-frequencies is good for generating text top-down. But we want + to generate text bottom-up for rhyming." + [pos-freqs] + (reduce + (fn [acc [k v]] + (reduce + (fn [acc [children freq]] + (update acc children assoc k freq)) + acc + v)) + {} + pos-freqs)) + +(defn ways-to-make-a [pos pos-freqs] + (let [children (pos-freqs pos)] + (->> children + ((juxt keys vals)) + (apply map vector) + (into {})))) + +(comment + (let [pos-freqs (->> ["My name is Eric." + "Your name is Taylor." + "Is your feeling hurt?" + "How are you feeling?" + "It is a lovely feeling." + "Your name is not Eric." + "Who is your mother and what does she do?"] + (pos-constituent-frequencies))] + (->> pos-freqs + #_(ways-to-make-a 'S) + (#(get % 'NN)))) + + (let [pos-freqs (->> ["My name is Eric." + "Your name is Taylor." + "Is your feeling hurt?" + "How are you feeling?" + "It is a lovely feeling." + "Your name is not Eric." + "Who is your mother and what does she do?"] + (pos-constituent-frequencies))] + (->> (reverse-children-to-parents pos-freqs) + (remove (fn [[k v]] (and (= 1 (count k)) + (pos-freqs (first k))))) + (into {}))) + + )