From dd51e0fdcaa66e87a6132c301e5d5f46c6358953 Mon Sep 17 00:00:00 2001 From: Eric Ihli Date: Mon, 26 Apr 2021 17:45:32 -0500 Subject: [PATCH] Generate with some grammar rules --- dev/examples/tpt.clj | 6 +- src/com/owoga/prhyme/data_transform.clj | 156 ++++- .../prhyme/generation/markov_example.clj | 544 ++++++++++++++++++ src/com/owoga/prhyme/nlp/core.clj | 57 +- src/com/owoga/prhyme/syllabify.clj | 1 + src/com/owoga/prhyme/util/math.clj | 47 +- 6 files changed, 788 insertions(+), 23 deletions(-) create mode 100644 src/com/owoga/prhyme/generation/markov_example.clj diff --git a/dev/examples/tpt.clj b/dev/examples/tpt.clj index 89ed2d2..5cfffcf 100644 --- a/dev/examples/tpt.clj +++ b/dev/examples/tpt.clj @@ -857,9 +857,9 @@ generated-text (let [children (loop [i 4] (let [node (p :lookup - (trie/lookup - loaded-tightly-packed-trie - (vec (take-last i generated-text)))) + (trie/lookup + loaded-tightly-packed-trie + (vec (take-last i generated-text)))) children (p :seq-children (and node (trie/children node)))] (cond (nil? node) (recur (dec i)) diff --git a/src/com/owoga/prhyme/data_transform.clj b/src/com/owoga/prhyme/data_transform.clj index 378434e..a45e2fb 100644 --- a/src/com/owoga/prhyme/data_transform.clj +++ b/src/com/owoga/prhyme/data_transform.clj @@ -2,16 +2,18 @@ (:require [clojure.string :as string] [clojure.java.io :as io] [com.owoga.prhyme.data.dictionary :as dict] + [com.owoga.prhyme.nlp.core :as nlp] [com.owoga.trie :as trie] [com.owoga.tightly-packed-trie :as tpt] [com.owoga.tightly-packed-trie.encoding :as encoding] - [taoensso.nippy :as nippy])) + [taoensso.nippy :as nippy] + [com.owoga.prhyme.nlp.tag-sets.treebank-ii :as tb2])) (def re-word "Regex for tokenizing a string into words (including contractions and hyphenations), commas, periods, and newlines." - #"(?s).*?([a-zA-Z\d]+(?:['\-]?[a-zA-Z]+)?|,|\.|\n)") + #"(?s).*?([a-zA-Z\d]+(?:['\-]?[a-zA-Z]+)?|,|\.|\?|\n)") (defn xf-file-seq [start end] (comp (remove #(.isDirectory %)) @@ -38,6 +40,23 @@ (map (partial map second)) (map (partial mapv string/lower-case)))) +(def xf-untokenize + (comp + (map #(string/join " " %)) + (map #(string/replace % #" (['\-,\?\.] ?)" "$1")))) + +(comment + (let [tokens (transduce + xf-tokenize + conj + ["Eric's name, is Bond." "James, bond? Yes."])] + [tokens + (map #(string/join " " %) tokens) + (transduce + xf-untokenize + conj + tokens)])) + (def xf-filter-english (let [word? (fn [x] (or (#{"." "?" ","} x) (dict/cmu-with-stress-map x)))] @@ -75,6 +94,34 @@ k)] [k' 1]))) +(defn xf-part-of-speech-database + [database] + (fn [sentence] + (let [leafs (->> sentence + nlp/treebank-zipper + nlp/leaf-pos-path-word-freqs)] + (run! + (fn [[k v]] + (swap! + database + assoc + k + (merge-with + (@database k) v))) + leafs) + sentence))) + +(comment + (let [database (atom {})] + (transduce + (map (partial mapv (part-of-speech-database database))) + conj + [] + [["this test is difficult"] + ["this foot is sore"]]) + @database) + + ) + (def encode-fn "Encodes a number as a variable-length encoded value. nil gets encoded as 0." @@ -159,3 +206,108 @@ (map (fn [[k v]] [k (map database k) v]))))) ) + +(defn xf-grammar-database + [database] + (fn [sentence] + (let [leafs (->> sentence + nlp/treebank-zipper + nlp/leaf-pos-path-word-freqs)] + (run! + (fn [[k v]] + (swap! 
+ database + assoc + k + (merge-with + (@database k) v))) + leafs) + sentence))) + +(defn file-seq->grammar-tree + [files] + (transduce + (comp + (xf-file-seq 0 1000) + (map slurp) + (map #(string/split % #"[\n+\?\.]")) + (map (partial transduce xf-tokenize conj)) + (map (partial transduce xf-filter-english conj)) + (map (partial remove empty?)) + (remove empty?) + (map (partial transduce xf-untokenize conj)) + (map nlp/grammar-tree-frequencies) + (map (partial into {}))) + (fn + ([acc] + (sort-by (comp - second) acc)) + ([acc m] + (merge-with + acc m))) + {} + files)) + +(comment + (time + (->> (file-seq->grammar-tree + (file-seq (io/file "dark-corpus"))) + (take 100) + (nippy/freeze-to-file "/tmp/grammar-freqs-top-100.bin"))) + + (def grammar-freqs (nippy/thaw-from-file "/tmp/grammar-freqs-top-100.bin")) + (take 10 grammar-freqs) + + ) + +(defn file-seq->part-of-speech-freqs + [files] + (transduce + (comp + (xf-file-seq 0 1000) + (map slurp) + (map #(string/split % #"[\n+\?\.]")) + (map (partial transduce xf-tokenize conj)) + (map (partial transduce xf-filter-english conj)) + (map (partial remove empty?)) + (remove empty?) + (map (partial transduce xf-untokenize conj)) + (map (partial map nlp/treebank-zipper)) + (map (partial map nlp/leaf-pos-path-word-freqs)) + (map (partial reduce (fn [acc m] + (nlp/deep-merge-with + acc m)) {}))) + (completing + (fn [result input] + (nlp/deep-merge-with + result input))) + {} + files)) + +(comment + (time (->> (file-seq->part-of-speech-freqs + (file-seq (io/file "dark-corpus"))) + (nippy/freeze-to-file "/tmp/part-of-speech-freqs.bin"))) + + (def parts-of-speech-freqs + (nippy/thaw-from-file "/tmp/part-of-speech-freqs.bin")) + (take 20 parts-of-speech-freqs) + ) + + +(defn file-seq->parts-of-speech-trie + [files] + (transduce + (comp + (xf-file-seq 0 1000) + (map slurp) + (map #(string/split % #"[\n+\?\.]")) + (map (partial transduce xf-tokenize conj)) + (map (partial transduce xf-filter-english conj)) + (map (partial remove empty?)) + (remove empty?) + (map (partial transduce xf-untokenize conj)) + (map nlp/grammar-tree-frequencies) + (map (partial into {}))) + (fn + ([acc] + (sort-by (comp - second) acc)) + ([acc m] + (merge-with + acc m))) + {} + files)) diff --git a/src/com/owoga/prhyme/generation/markov_example.clj b/src/com/owoga/prhyme/generation/markov_example.clj new file mode 100644 index 0000000..04cf239 --- /dev/null +++ b/src/com/owoga/prhyme/generation/markov_example.clj @@ -0,0 +1,544 @@ +(ns com.owoga.prhyme.generation.markov-example + (:require [clojure.string :as string] + [clojure.java.io :as io] + [com.owoga.prhyme.util.math :as math] + [com.owoga.phonetics :as phonetics] + [com.owoga.phonetics.syllabify :as syllabify] + [cljol.dig9 :as d] + [clojure.zip :as zip] + [com.owoga.tightly-packed-trie.bit-manip :as bm] + [com.owoga.trie :as trie] + [com.owoga.tightly-packed-trie.encoding :as encoding] + [com.owoga.tightly-packed-trie :as tpt] + [taoensso.nippy :as nippy] + [com.owoga.prhyme.nlp.tag-sets.treebank-ii :as tb2] + [com.owoga.prhyme.nlp.core :as nlp])) + +(def corpus (slurp (io/resource "cask_of_amontillado.txt"))) + +;; For better generation of text, you'll probably want to pad the starts +;; of sentences with n-1 "start-of-sentence" tokens. +(defn prep-punctuation-for-tokenization + "Puts spaces around punctuation so that they aren't + tokenized with the words they are attached to. + + Might add extraneous whitespace, but presumedly that will be ignored/removed + during tokenization." 
+ [text] + (string/replace text #"([\.,!?])" " $1 ")) + +(defn remove-quotes + "...and hyphens" + [text] + (string/replace text #"[\"-]" "")) + +(defn remove-formatting-characters + "Input has underscores, presumably because the text + might be rendered by something that can italicize or bold text. + We'll just ignore them for now." + [text] + (string/replace text #"[_*]" "")) + +(defn tokenize [text] + (-> text + remove-formatting-characters + prep-punctuation-for-tokenization + remove-quotes + string/lower-case + (string/split #"[\n ]+"))) + +(defn interleave-all + "Like interleave, but instead of ending the interleave when the shortest collection + has been consumed, continues to interleave the remaining collections." + {:added "1.0" + :static true} + ([] ()) + ([c1] (lazy-seq c1)) + ([c1 c2] + (lazy-seq + (let [s1 (seq c1) s2 (seq c2)] + (if (and s1 s2) + (cons (first s1) (cons (first s2) + (interleave-all (rest s1) (rest s2)))) + (lazy-seq (or s1 s2)))))) + ([c1 c2 & colls] + (lazy-seq + (let [ss (->> (map seq (conj colls c2 c1)) + (remove nil?))] + (when ss + (concat (map first ss) (apply interleave-all (map rest ss)))))))) + +(comment + (let [tokens [1 2 3 4 5] + p1 (partition 1 1 tokens) + p2 (partition 2 1 tokens) + p3 (partition 3 1 tokens)] + (interleave-all p1 p2 p3))) + +(defn ngramify-tokens [n m tokens] + (let [partition-colls (map #(partition % 1 tokens) (range n m)) + ngrams (apply interleave-all partition-colls)] + ngrams)) + +(comment + (->> (tokenize corpus) + (take 5) + (ngramify-tokens 1 4)) + ;; => (("the") + ;; ("the" "thousand") + ;; ("the" "thousand" "injuries") + ;; ("thousand") + ;; ("thousand" "injuries") + ;; ("thousand" "injuries" "of") + ;; ("injuries") + ;; ("injuries" "of") + ;; ("injuries" "of" "fortunato") + ;; ("of") + ;; ("of" "fortunato") + ;; ("fortunato")) + ) + + + +(defn add-terminal-value-to-ngram + "The Trie expects entries to be of the form '(k1 k2 k3 value). + The ngrams generated above are just '(k1 k2 k3). + This adds a value that is simply the ngram itself: + '(k1 k2 k3 '(k1 k2 k3))." + [ngram] + (concat ngram (list ngram))) + + +(defn trie->frequency-of-frequencies-map + "The second argument to this function specifies which rank you + want to get the map for." + [trie n] + (->> trie + (trie/children-at-depth n) + (map (comp :count second first seq)) + frequencies + (into (sorted-map)))) + +(comment + (trie->frequency-of-frequencies-map trie 1) + ;; => {1 558, + ;; 2 110, + ;; ,,, + ;; 167 1, + ;; 177 1} + + ) + +;; The frequency of a thus-far unseen species is the number of species seen once over the +;; total number of species. +;; That's commonly referred to as P0 +;; There will be a different P0 for each rank of N-gram. + +(defn P0 [trie n] + (let [freq-map (trie->frequency-of-frequencies-map trie n)] + (/ (freq-map 1) (apply + (vals freq-map))))) + +(comment + (P0 trie 1) + ;; => 31/45 + ) + +;; From here on out, we follow a similar procedure. +;; What we just did, P0, is the probability of seeing something +;; that has been previously unseen. +;; We found that by using what we know about P1 (how many times +;; things have been seen once). +;; +;; Now, we need to adjust our P1 number since we just gave some probability +;; to P0, which previously had no probability since it wasn't in our +;; frequency table. +;; +;; What's the new probability that the next thing we see is from the group of +;; n-grams that we've seen once? +;; +;; The same way P0 was based off P1, P1 will be based off P2. 
+;;
+;; It's basically 2 * the number of times we've seen things twice divided
+;; by the total number of things we've seen.
+;;
+;; P0 was 1 * number of 1-time things / total number of n-time things.
+;; P1 is 2 * number of 2-time things / total number of n-time things.
+;; P2 is 3 * number of 3-time things / total number of n-time things.
+;;
+;; With a slight adjustment. The frequency of frequencies needs to be smoothed
+;; so there are no 0-values. When you get up to P14, P15, etc... there might be gaps
+;; where you'll see something 14 times, then won't see anything 15 or 16 times, so P15 and P16 will
+;; be 0, then you'll see something 17 times twice.
+;;
+;; This is just noise from having limited data. The noise needs to be smoothed out.
+
+(defn simple-good-turing-map [trie n]
+  (let [freq-map (trie->frequency-of-frequencies-map trie n)
+        xs (->> freq-map keys (map #(Math/log %)))
+        ys (->> freq-map vals (map #(Math/log %)))
+        sgt (math/sgt (keys freq-map) (vals freq-map))
+        sgt-map (into (sorted-map) (apply map vector sgt))]
+    sgt-map))
+
+(comment
+  (let [freq-map (trie->frequency-of-frequencies-map trie 2)
+        xs (->> freq-map keys (map #(Math/log %)))
+        ys (->> freq-map vals (map #(Math/log %)))
+        sgt (math/sgt (keys freq-map) (vals freq-map))
+        sgt-map (into (sorted-map) (apply map vector sgt))
+        sgt-with-counts (math/sgt-with-counts (keys freq-map)
+                                              (vals freq-map))
+        c1 (freq-map 1)
+        c1* (sgt-map 1)]
+    [c1 c1* sgt-with-counts])
+
+  )
+
+;; Maximum Likelihood Estimate
+;;
+;; It was about dusk, one evening during the supreme madness of the
+;; carnival season, that I encountered my friend. He accosted me with
+;; excessive warmth, for he had been drinking much. The man wore motley.
+;; He had on a tight-fitting parti-striped dress, and his head was
+;; surmounted by the conical cap and bells. I was so pleased to see him,
+;; that I thought I should never have done wringing his hand.
+;;
+;; Consider 3-grams...
+;;
+;; it was about
+;; it was there
+;;
+;; Let `N` be a sample text size and `nr` be the number of
+;; m-grams which occurred in the text exactly `r` times.
+;;
+;; So that `N` = (apply + (map #(* r nr) frequency-of-frequencies))
+;; `N` = sum for all seen-counts ("number of things seen 'count' times" * 'count')
+;; 10 things seen 5 times
+;; 4 things seen 4 times
+;; 2 things seen 1 time
+;;
+;; 10 things seen 5 times each makes up 50 "things"
+;; 4 things seen 4 times each makes up 16 "things"
+;; 2 things seen once each makes up 2 "things"
+;;
+;; Makes for `N` = 50 + 16 + 2 things... 68 things (m-grams).
+;;
+;; Consider the m-gram "it was about", which occurred 4 times.
+;; And in total we saw 60 3-grams. Then the MLE
+;; is 4 / 60.
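+;; A quick sanity check of the arithmetic above, using the toy counts from the
+;; comment rather than real corpus data. `freq-of-freqs` (an illustrative local
+;; name, not part of the library) maps "times seen" -> "number of n-grams seen
+;; that many times"; `N` is the total number of observed n-grams, and the MLE
+;; of an n-gram seen r times is r / N, so the thing seen 4 times in the N = 68
+;; example above gets 4/68.
+(comment
+  (let [freq-of-freqs {5 10, 4 4, 1 2} ; 10 things seen 5x, 4 seen 4x, 2 seen once
+        N (apply + (map (fn [[r nr]] (* r nr)) freq-of-freqs))
+        mle (fn [r] (/ r N))]
+    [N (mle 4)])
+  ;; => [68 1/17]
+  )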
+;; +;; +;;;; Base MLE +;; +;; Disount of the n-gram +;; * +;; Count of n-gram +;; / +;; Count of n-1-gram + + +(defn maximum-likelihood-estimate [trie trie-database n-gram] + (/ (get-in trie-database [n-gram :count]) + (get-in trie-database [(butlast n-gram) :count]))) + + +(comment + (maximum-likelihood-estimate trie trie-database '("," "the")) + + (maximum-likelihood-estimate trie trie-database '(",")) + + (let [[rs nrs ests lgts] + (apply + math/sgt-with-counts + (apply map vector (seq (trie->frequency-of-frequencies-map trie 2))))] + [rs nrs ests lgts]) + + ) + +;;;; KATZ ;;;; + +;; (defn N [trie n-gram-rank] +;; (let [r->Nr (trie->frequency-of-frequencies-map trie n-gram-rank)] +;; (apply + (map (fn [[r nr]] (* r nr)) r->Nr)))) + +;; (defn r* [trie n-gram-rank] +;; (let [r->Nr (trie->frequency-of-frequencies-map trie n-gram-rank) +;; _ _ _ r*s])) + + +;;;; + +(defn zipper-leaf-path-seq + [zipper] + (->> zipper + (iterate zip/next) + (take-while (complement zip/end?)) + (filter (complement zip/branch?)) + (map zip/path) + (map (partial map first)) + (filter (comp tb2/words last)))) + +(comment + (def target-grammar-structure + '(TOP (S (NP (WDT)) (VP (VBD) (NP (DT) (NN)))))) + + (reverse (zipper-leaf-path-seq (zip/seq-zip target-grammar-structure))) + + (defn decode-fn + "Decodes a variable-length encoded number from a byte-buffer. + Zero gets decoded to nil." + [byte-buffer] + (let [value (encoding/decode byte-buffer)] + (if (zero? value) + nil + value))) + + (def tpt (tpt/load-tightly-packed-trie-from-file + (io/resource "dark-corpus-4-gram-backwards-tpt.bin") + decode-fn)) + + (def database (nippy/thaw-from-file (io/resource "dark-corpus-4-gram-backwards-db.bin"))) + + (def example-story + (loop [generated-text (vec (repeat 3 (get database ""))) + i 0] + (if (> i 20) + generated-text + (let [children (loop [i 4] + (let [node + (trie/lookup + tpt + (vec (take-last i generated-text))) + children + (and node (trie/children node))] + (cond + (nil? node) (recur (dec i)) + (< i 0) (throw (Exception. "Error")) + (seq children) children + :else (recur (dec i)))))] + (recur + (conj + generated-text + (->> children + (map #(vector (.key %) (get % []))) + (remove (comp nil? second)) + (#(math/weighted-selection + (fn [[_ c]] c) + %)) + first)) + (inc i)))))) + + (map database example-story) + + ) + +(defn syllabify-phrase + [phrase] + (->> phrase + (#(string/split % #" ")) + (map phonetics/get-phones) + (map first) + (map syllabify/syllabify) + (reduce into []))) + +(defn markov-choice + [trie generated-text k xf-filter] + (let [node (trie/lookup trie k) + children (and node + (->> node + trie/children + (map #(vector (.key %) (get % []))) + (remove (comp nil? second)))) + choices (transduce + (comp + (map (fn [child] + (vector generated-text child))) + xf-filter) conj children)] + (cond + (nil? node) (recur trie generated-text (butlast k) xf-filter) + + (seq children) + (if (< (rand) (/ (apply max (map second children)) + (apply + (map second children)))) + (recur trie generated-text (butlast k) xf-filter) + (first + (math/weighted-selection + (fn [[_ c]] c) + choices))) + + (> (count k) 0) (recur trie generated-text (butlast k) xf-filter) + + :else (throw (Exception. 
"Error"))))) + +(defn syllable-count-pred + [syllable-count database] + (fn [node] + (let [syllables (syllabify-phrase (database (get node [])))] + (= syllable-count + (count syllables))))) + +(defn markov-select + [n] + (fn [{:keys [trie database xf-filter tokens] :as context}] + (loop [n n] + (if (= n 0) + ;; Unable to find a selection + nil + (let [key (take-last n tokens) + node (trie/lookup trie key) + children (and node (->> (trie/children node) + (remove nil?))) + choices (transduce + (comp + (map #(vector (.key %) (get % []))) + (map (fn [child] [context child])) + xf-filter) + conj + children)] + (let [freqs (map #(get % []) children)] + (cond + (or (empty? choices) (empty? freqs)) + (recur (dec n)) + + (and + (> n 1) + (< (rand) + (/ (apply max freqs) + (apply + freqs)))) + (recur + (dec n)) + + :else + (let [result (second (math/weighted-selection + (comp second second) + choices))] + (first result))))))))) + +(defn generate-sentence + [{:keys [trie database stop? xf-filter tokens] :as context}] + (let [markov-fn (markov-select 4)] + (loop [context (assoc context :i 0)] + (let [tokens (:tokens context)] + (cond + (stop? context) + tokens + + :else + (let [selection (markov-fn context)] + (if (nil? selection) + (do + (println tokens) + (throw (Exception. "No possible selection"))) + (recur (update + context + :tokens + conj + selection))))))))) + +(comment + (trie/lookup tpt '(1 1 1)) + (let [context {:tokens (vec (repeat 3 (database ""))) + :trie tpt + :database database + :stop? (fn [{:keys [tokens] :as context}] + (let [sentence (->> tokens + (map database) + (remove #{""}) + (string/join " "))] + (<= 10 (count (syllabify-phrase sentence))))) + :xf-filter (comp + (remove + (fn [[context [k v]]] + (= k 7))) + (filter + (fn [[context [k v]]] + (let [current-sentence + (->> (:tokens context) + (map database) + (remove #{""}) + (string/join " ")) + current-syllable-count + (count (syllabify-phrase current-sentence)) + current-word (database k) + current-word-syllable-count (count (syllabify-phrase current-word))] + (>= (- 10 current-syllable-count) + current-word-syllable-count)))))}] + (->> (generate-sentence context) + (map database))) + + + (database "") + ) +#_(defn generate-sentence + [trie database stop? filters] + (loop [generated-text (vec (repeat 3 (get database ""))) + i 0] + (cond + (> i 400) + nil + + (stop? generated-text) + generated-text + + ;; reset + (or (zero? (mod i 40)) + (> syllable-count target-syllable-count)) + (recur (vec (repeat 3 (get database ""))) 0) + + :else + (let [choice (markov-choice + trie + (take-last 4 generated-text) + filters)] + (recur + (conj + generated-text + choice) + (inc i)))))) + +(comment + (let [disallow-sentence-start-xf + (remove (= (database (first %)) "") children) + ]) + + (map database (generate-sentence tpt database 10)) + + (def grammar-freqs (nippy/thaw-from-file "/tmp/grammar-freqs-top-100.bin")) + (def part-of-speech-freqs (nippy/thaw-from-file "/tmp/part-of-speech-freqs.bin")) + (take 100 part-of-speech-freqs) + + (loop [generated-text (vec (repeat 3 (get database ""))) + i 0] + (let [current-sentence + (take-while + (complement (into #{} (map database ["" ""]))) + (reverse generated-text))] + (if (> i 20) + generated-text + (let [children (loop [i 4] + (let [node + (trie/lookup + tpt + (vec (take-last i generated-text))) + children + (and node (trie/children node))] + (cond + (nil? node) (recur (dec i)) + (< i 0) (throw (Exception. 
"Error")) + (seq children) children + :else (recur (dec i)))))] + (recur + (conj + generated-text + (->> children + (map #(vector (.key %) (get % []))) + (remove (comp nil? second)) + (#(math/weighted-selection + (fn [[_ c]] c) + %)) + first)) + (inc i)))))) + + ) diff --git a/src/com/owoga/prhyme/nlp/core.clj b/src/com/owoga/prhyme/nlp/core.clj index e7261fa..a7d0557 100644 --- a/src/com/owoga/prhyme/nlp/core.clj +++ b/src/com/owoga/prhyme/nlp/core.clj @@ -232,29 +232,34 @@ Porcelain. If you have the simple tree data structure returned by `parse-to-simple-tree`, then you can just pass that directly to `zip/seq-zip`." - [texts] - (let [tree (->> texts - (map tokenize) - (map (partial string/join " ")) + [text] + (let [tree (->> text + tokenize + (string/join " ") + vector parse - (map tb/make-tree) + first + tb/make-tree unmake-tree)] (zip/seq-zip tree))) (comment - (let [texts ["Eric's test is difficult."]] - (loop [zipper (treebank-zipper texts)] + ;; Here is a demo of zipping through a parse tree and changing + ;; all adjectives to "thorough". + (let [text "Eric's test is difficult."] + (loop [zipper (treebank-zipper text)] (cond (zip/end? zipper) (zip/root zipper) (= 'JJ (zip/node zipper)) (recur (-> zipper zip/next (zip/replace '("thorough")))) :else (recur (zip/next zipper))))) - ;; => ((TOP - ;; ((S - ;; ((NP ((NP ((NNP ("Eric")) (POS ("'s")))) (NN ("test")))) - ;; (VP ((VBZ ("is")) (ADJP ((JJ ("thorough")))))) - ;; (. ("."))))))) + ;; => (TOP + ;; ((S + ;; ((NP ((NP ((NNP ("Eric")) (POS ("'s")))) (NN ("test")))) + ;; (VP ((VBZ ("is")) (ADJP ((JJ ("thorough")))))) + ;; (. (".")))))) + ) (defn iter-zip @@ -341,6 +346,9 @@ (leaf-pos-path-word-freqs zipper)) (comment + (treebank-zipper ["Eric's test is difficult." + "Eric's test is thorough." + "Eric's testing."]) (let [zipper (treebank-zipper ["Eric's test is difficult." "Eric's test is thorough." "Eric's testing."])] @@ -406,6 +414,9 @@ "you are a test"]] (grammar-tree-frequencies document)) + + (grammar-tree-frequencies ["this is a test."]) + (parse-to-simple-tree ["this is a test."]) ;; => {(TOP (S (NP (WDT)) (VP (VBD) (NP (DT) (NN))))) 1, ;; (TOP (S (NP (DT)) (VP (VBZ) (NP (DT) (NN))))) 2, ;; (TOP (S (NP (PRP)) (VP (VBP) (NP (DT) (NN))))) 1} @@ -786,7 +797,7 @@ (remove #(string? (first %))))) (comment - (phrase-constituents "My name is Eric.") + (phrase-constituents ["My name is Eric."]) ;; => ((TOP (S)) (S (NP VP .)) (NP (PRP$ NN)) (VP (VBZ NP)) (NP (NNP))) (phrase-constituents "How are you?") ;; => ((TOP (SBARQ)) (SBARQ (WHADVP SQ .)) (WHADVP (WRB)) (SQ (VBP NP)) (NP (PRP))) @@ -833,6 +844,7 @@ "My hat is blue and I like cake." "Your name is Taylor." "How are you?"]) + ;; => {TOP {(S) 3, (SBARQ) 1}, ;; S {(NP VP .) 2, (S CC S .) 1, (NP VP) 2}, ;; NP {(PRP$ NN) 3, (NNP) 2, (PRP) 2, (NN) 1}, @@ -1002,6 +1014,7 @@ "Your name is not Eric." 
"Who is your mother and what does she do?"] (pos-constituent-frequencies) + #_#_(apply merge-with (fn [a b] @@ -1061,10 +1074,20 @@ ) +(defn most-likely-parts-of-speech + [phrase] + (top-k-sequences prhyme-pos-tagger (tokenize phrase))) + (comment - (let [text ["bother me"]] - (->> text - (map tokenize) - (map #(top-k-sequences prhyme-pos-tagger %)))) + (let [text "a dog"] + (first + (map #(.getOutcomes %) + (most-likely-parts-of-speech text)))) + ;; => ["PRP" "VBP" "DT" "NN" "."] + + + (map (juxt #(.getOutcomes %) + #(map float (.getProbs %))) + (top-k-sequences prhyme-pos-tagger (tokenize ""))) ) diff --git a/src/com/owoga/prhyme/syllabify.clj b/src/com/owoga/prhyme/syllabify.clj index 8958e80..647bfa4 100644 --- a/src/com/owoga/prhyme/syllabify.clj +++ b/src/com/owoga/prhyme/syllabify.clj @@ -29,6 +29,7 @@ ;; of a word. So it should be e.lip.sis ;; As an alternative to handling the isolated "s"-at-the-end-of-internal-coda case, ;; it works well-enough for me to treat all fricatives as lowest priority. + (def ^clojure.lang.PersistentVector sonority-hierarchy ["vowel" "liquid" "affricate" "fricative" "nasal" "stop" "semivowel" "aspirate"]) diff --git a/src/com/owoga/prhyme/util/math.clj b/src/com/owoga/prhyme/util/math.clj index f530552..7bafe82 100644 --- a/src/com/owoga/prhyme/util/math.clj +++ b/src/com/owoga/prhyme/util/math.clj @@ -286,6 +286,50 @@ [lgt-estimate lgt?] [turing-estimate lgt?])))))))) + +(defn smoothed-frequencies + [rs nrs] + (let [l (count rs) + N (apply + (map #(apply * %) (map vector rs nrs))) + p0 (/ (first nrs) N) + zrs (average-consecutives rs nrs) + log-rs (map #(Math/log %) rs) + log-zrs (map #(Math/log %) zrs) + lm (least-squares-linear-regression log-rs log-zrs) + lgts (map lm rs) + estimations (loop [coll rs + lgt? false + e (estimator lm rs zrs) + estimations []] + (cond + (empty? coll) estimations + :else + (let [[estimation lgt?] (e (first coll) lgt?)] + (recur + (rest coll) + lgt? + e + (conj estimations estimation))))) + N* (apply + (map #(apply * %) (map vector nrs estimations))) + probs (cons + (float p0) + (map #(* (- 1 p0) (/ % N*)) estimations)) + sum-probs (apply + probs)] + [lgts + (map + (fn [r] + (* (inc r) (/ (lm (inc r)) (lm r)))) + (partition 2 1 (conj rs (inc (peek rs)))))])) + +(comment + (let [rs [ 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26] + nrs [32 20 10 3 1 2 1 1 1 2 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1] + rs [1 2 3 4 5 6 7 8 9 10 12 26] + nrs [32 20 10 3 1 2 1 1 1 2 1 1]] + (smoothed-frequencies rs nrs)) + + ) + (defn sgt [rs nrs] (assert (and (not-empty nrs) (not-empty rs)) "frequencies and frequency-of-frequencies can't be empty") @@ -316,7 +360,8 @@ (map #(* (- 1 p0) (/ % N*)) estimations)) sum-probs (apply + probs)] [(cons 0 rs) - (map #(/ % sum-probs) probs)])) + (map #(/ % sum-probs) probs) + estimations])) (comment (let [rs [ 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26]