diff --git a/dev/examples/tpt.clj b/dev/examples/tpt.clj
index 89ed2d2..5cfffcf 100644
--- a/dev/examples/tpt.clj
+++ b/dev/examples/tpt.clj
@@ -857,9 +857,9 @@
generated-text
(let [children (loop [i 4]
(let [node (p :lookup
- (trie/lookup
- loaded-tightly-packed-trie
- (vec (take-last i generated-text))))
+ (trie/lookup
+ loaded-tightly-packed-trie
+ (vec (take-last i generated-text))))
children (p :seq-children (and node (trie/children node)))]
(cond
(nil? node) (recur (dec i))
diff --git a/src/com/owoga/prhyme/data_transform.clj b/src/com/owoga/prhyme/data_transform.clj
index 378434e..a45e2fb 100644
--- a/src/com/owoga/prhyme/data_transform.clj
+++ b/src/com/owoga/prhyme/data_transform.clj
@@ -2,16 +2,18 @@
(:require [clojure.string :as string]
[clojure.java.io :as io]
[com.owoga.prhyme.data.dictionary :as dict]
+ [com.owoga.prhyme.nlp.core :as nlp]
[com.owoga.trie :as trie]
[com.owoga.tightly-packed-trie :as tpt]
[com.owoga.tightly-packed-trie.encoding :as encoding]
- [taoensso.nippy :as nippy]))
+ [taoensso.nippy :as nippy]
+ [com.owoga.prhyme.nlp.tag-sets.treebank-ii :as tb2]))
(def re-word
"Regex for tokenizing a string into words
(including contractions and hyphenations),
commas, periods, and newlines."
- #"(?s).*?([a-zA-Z\d]+(?:['\-]?[a-zA-Z]+)?|,|\.|\n)")
+ #"(?s).*?([a-zA-Z\d]+(?:['\-]?[a-zA-Z]+)?|,|\.|\?|\n)")
(defn xf-file-seq [start end]
(comp (remove #(.isDirectory %))
@@ -38,6 +40,23 @@
(map (partial map second))
(map (partial mapv string/lower-case))))
+(def xf-untokenize
+ (comp
+ (map #(string/join " " %))
+ (map #(string/replace % #" (['\-,\?\.] ?)" "$1"))))
+
+(comment
+ (let [tokens (transduce
+ xf-tokenize
+ conj
+ ["Eric's name, is Bond." "James, bond? Yes."])]
+ [tokens
+ (map #(string/join " " %) tokens)
+ (transduce
+ xf-untokenize
+ conj
+ tokens)]))
+
(def xf-filter-english
(let [word? (fn [x] (or (#{"." "?" ","} x)
(dict/cmu-with-stress-map x)))]
@@ -75,6 +94,34 @@
k)]
[k' 1])))
+(defn xf-part-of-speech-database
+ [database]
+ (fn [sentence]
+ (let [leafs (->> sentence
+ nlp/treebank-zipper
+ nlp/leaf-pos-path-word-freqs)]
+ (run!
+ (fn [[k v]]
+ (swap!
+ database
+ assoc
+ k
+ (merge-with + (@database k) v)))
+ leafs)
+ sentence)))
+
+(comment
+ (let [database (atom {})]
+ (transduce
+     (map (partial mapv (xf-part-of-speech-database database)))
+ conj
+ []
+ [["this test is difficult"]
+ ["this foot is sore"]])
+ @database)
+
+ )
+
(def encode-fn
"Encodes a number as a variable-length encoded value.
nil gets encoded as 0."
@@ -159,3 +206,108 @@
(map (fn [[k v]] [k (map database k) v])))))
)
+
+(defn xf-grammar-database
+ [database]
+ (fn [sentence]
+ (let [leafs (->> sentence
+ nlp/treebank-zipper
+ nlp/leaf-pos-path-word-freqs)]
+ (run!
+ (fn [[k v]]
+ (swap!
+ database
+ assoc
+ k
+ (merge-with + (@database k) v)))
+ leafs)
+ sentence)))
+
+(defn file-seq->grammar-tree
+ [files]
+ (transduce
+ (comp
+ (xf-file-seq 0 1000)
+ (map slurp)
+ (map #(string/split % #"[\n+\?\.]"))
+ (map (partial transduce xf-tokenize conj))
+ (map (partial transduce xf-filter-english conj))
+ (map (partial remove empty?))
+ (remove empty?)
+ (map (partial transduce xf-untokenize conj))
+ (map nlp/grammar-tree-frequencies)
+ (map (partial into {})))
+ (fn
+ ([acc]
+ (sort-by (comp - second) acc))
+ ([acc m]
+ (merge-with + acc m)))
+ {}
+ files))
+
+(comment
+ (time
+ (->> (file-seq->grammar-tree
+ (file-seq (io/file "dark-corpus")))
+ (take 100)
+ (nippy/freeze-to-file "/tmp/grammar-freqs-top-100.bin")))
+
+ (def grammar-freqs (nippy/thaw-from-file "/tmp/grammar-freqs-top-100.bin"))
+ (take 10 grammar-freqs)
+
+ )
+
+(defn file-seq->part-of-speech-freqs
+ [files]
+ (transduce
+ (comp
+ (xf-file-seq 0 1000)
+ (map slurp)
+ (map #(string/split % #"[\n+\?\.]"))
+ (map (partial transduce xf-tokenize conj))
+ (map (partial transduce xf-filter-english conj))
+ (map (partial remove empty?))
+ (remove empty?)
+ (map (partial transduce xf-untokenize conj))
+ (map (partial map nlp/treebank-zipper))
+ (map (partial map nlp/leaf-pos-path-word-freqs))
+ (map (partial reduce (fn [acc m]
+ (nlp/deep-merge-with + acc m)) {})))
+ (completing
+ (fn [result input]
+ (nlp/deep-merge-with + result input)))
+ {}
+ files))
+
+(comment
+ (time (->> (file-seq->part-of-speech-freqs
+ (file-seq (io/file "dark-corpus")))
+ (nippy/freeze-to-file "/tmp/part-of-speech-freqs.bin")))
+
+ (def parts-of-speech-freqs
+ (nippy/thaw-from-file "/tmp/part-of-speech-freqs.bin"))
+ (take 20 parts-of-speech-freqs)
+ )
+
+
+(defn file-seq->parts-of-speech-trie
+ [files]
+ (transduce
+ (comp
+ (xf-file-seq 0 1000)
+ (map slurp)
+ (map #(string/split % #"[\n+\?\.]"))
+ (map (partial transduce xf-tokenize conj))
+ (map (partial transduce xf-filter-english conj))
+ (map (partial remove empty?))
+ (remove empty?)
+ (map (partial transduce xf-untokenize conj))
+ (map nlp/grammar-tree-frequencies)
+ (map (partial into {})))
+ (fn
+ ([acc]
+ (sort-by (comp - second) acc))
+ ([acc m]
+ (merge-with + acc m)))
+ {}
+ files))
diff --git a/src/com/owoga/prhyme/generation/markov_example.clj b/src/com/owoga/prhyme/generation/markov_example.clj
new file mode 100644
index 0000000..04cf239
--- /dev/null
+++ b/src/com/owoga/prhyme/generation/markov_example.clj
@@ -0,0 +1,544 @@
+(ns com.owoga.prhyme.generation.markov-example
+ (:require [clojure.string :as string]
+ [clojure.java.io :as io]
+ [com.owoga.prhyme.util.math :as math]
+ [com.owoga.phonetics :as phonetics]
+ [com.owoga.phonetics.syllabify :as syllabify]
+ [cljol.dig9 :as d]
+ [clojure.zip :as zip]
+ [com.owoga.tightly-packed-trie.bit-manip :as bm]
+ [com.owoga.trie :as trie]
+ [com.owoga.tightly-packed-trie.encoding :as encoding]
+ [com.owoga.tightly-packed-trie :as tpt]
+ [taoensso.nippy :as nippy]
+ [com.owoga.prhyme.nlp.tag-sets.treebank-ii :as tb2]
+ [com.owoga.prhyme.nlp.core :as nlp]))
+
+(def corpus (slurp (io/resource "cask_of_amontillado.txt")))
+
+;; For better generation of text, you'll probably want to pad the starts
+;; of sentences with n-1 "start-of-sentence" tokens.
+(defn prep-punctuation-for-tokenization
+ "Puts spaces around punctuation so that they aren't
+ tokenized with the words they are attached to.
+
+  Might add extraneous whitespace, but presumably that will be ignored/removed
+ during tokenization."
+ [text]
+ (string/replace text #"([\.,!?])" " $1 "))
+
+(defn remove-quotes
+ "...and hyphens"
+ [text]
+ (string/replace text #"[\"-]" ""))
+
+(defn remove-formatting-characters
+ "Input has underscores, presumably because the text
+ might be rendered by something that can italicize or bold text.
+ We'll just ignore them for now."
+ [text]
+ (string/replace text #"[_*]" ""))
+
+(defn tokenize [text]
+ (-> text
+ remove-formatting-characters
+ prep-punctuation-for-tokenization
+ remove-quotes
+ string/lower-case
+ (string/split #"[\n ]+")))
+
+(defn interleave-all
+ "Like interleave, but instead of ending the interleave when the shortest collection
+ has been consumed, continues to interleave the remaining collections."
+ {:added "1.0"
+ :static true}
+ ([] ())
+ ([c1] (lazy-seq c1))
+ ([c1 c2]
+ (lazy-seq
+ (let [s1 (seq c1) s2 (seq c2)]
+ (if (and s1 s2)
+ (cons (first s1) (cons (first s2)
+ (interleave-all (rest s1) (rest s2))))
+ (lazy-seq (or s1 s2))))))
+ ([c1 c2 & colls]
+ (lazy-seq
+ (let [ss (->> (map seq (conj colls c2 c1))
+ (remove nil?))]
+ (when ss
+ (concat (map first ss) (apply interleave-all (map rest ss))))))))
+
+(comment
+ (let [tokens [1 2 3 4 5]
+ p1 (partition 1 1 tokens)
+ p2 (partition 2 1 tokens)
+ p3 (partition 3 1 tokens)]
+ (interleave-all p1 p2 p3)))
+
+(defn ngramify-tokens [n m tokens]
+ (let [partition-colls (map #(partition % 1 tokens) (range n m))
+ ngrams (apply interleave-all partition-colls)]
+ ngrams))
+
+(comment
+ (->> (tokenize corpus)
+ (take 5)
+ (ngramify-tokens 1 4))
+ ;; => (("the")
+ ;; ("the" "thousand")
+ ;; ("the" "thousand" "injuries")
+ ;; ("thousand")
+ ;; ("thousand" "injuries")
+ ;; ("thousand" "injuries" "of")
+ ;; ("injuries")
+ ;; ("injuries" "of")
+ ;; ("injuries" "of" "fortunato")
+ ;; ("of")
+ ;; ("of" "fortunato")
+ ;; ("fortunato"))
+ )
+
+
+
+(defn add-terminal-value-to-ngram
+ "The Trie expects entries to be of the form '(k1 k2 k3 value).
+ The ngrams generated above are just '(k1 k2 k3).
+ This adds a value that is simply the ngram itself:
+ '(k1 k2 k3 '(k1 k2 k3))."
+ [ngram]
+ (concat ngram (list ngram)))
+
+
+(defn trie->frequency-of-frequencies-map
+ "The second argument to this function specifies which rank you
+ want to get the map for."
+ [trie n]
+ (->> trie
+ (trie/children-at-depth n)
+ (map (comp :count second first seq))
+ frequencies
+ (into (sorted-map))))
+
+(comment
+ (trie->frequency-of-frequencies-map trie 1)
+ ;; => {1 558,
+ ;; 2 110,
+ ;; ,,,
+ ;; 167 1,
+ ;; 177 1}
+
+ )
+
+;; The frequency of a thus-far unseen species is the number of species seen once over the
+;; total number of species.
+;; That's commonly referred to as P0
+;; There will be a different P0 for each rank of N-gram.
+
+(defn P0 [trie n]
+ (let [freq-map (trie->frequency-of-frequencies-map trie n)]
+ (/ (freq-map 1) (apply + (vals freq-map)))))
+
+(comment
+ (P0 trie 1)
+ ;; => 31/45
+ )
+
+;; From here on out, we follow a similar procedure.
+;; What we just did, P0, is the probability of seeing something
+;; that has been previously unseen.
+;; We found that by using what we know about P1 (how many times
+;; things have been seen once).
+;;
+;; Now, we need to adjust our P1 number since we just gave some probability
+;; to P0, which previously had no probability since it wasn't in our
+;; frequency table.
+;;
+;; What's the new probability that the next thing we see is from the group of
+;; n-grams that we've seen once?
+;;
+;; The same way P0 was based off P1, P1 will be based off P2.
+;;
+;; It's basically 2 * the number of times we've seen things twice divided
+;; by the total number of things we've seen.
+;;
+;; P0 was 1 * number of 1-time things / total number of n-time things.
+;; P1 is 2 * number of 2-time things / total number of n-time things.
+;; P2 is 3 * number of 3-time things / total number of n-time things.
+;;
+;; With a slight adjustment. The frequency of frequencies needs to be smoothed
+;; so there are no 0-values. When you get up to P14, P15, etc... there might be gaps
+;; where you'll see P14 1-time, then won't see anything 15 or 16 times, so P15 and P16 will
+;; be 0, then you'll see something 17 times twice.
+;;
+;; This is just noise from having limited data. The noise needs to be smoothed out.
+
+(defn simple-good-turing-map [trie n]
+ (let [freq-map (trie->frequency-of-frequencies-map trie n)
+ xs (->> freq-map keys (map #(Math/log %)))
+ ys (->> freq-map vals (map #(Math/log %)))
+ sgt (math/sgt (keys freq-map) (vals freq-map))
+ sgt-map (into (sorted-map) (apply map vector sgt))]
+ sgt-map))
+
+
+(comment
+ (let [freq-map (trie->frequency-of-frequencies-map trie 2)
+ xs (->> freq-map keys (map #(Math/log %)))
+ ys (->> freq-map vals (map #(Math/log %)))
+ sgt (math/sgt (keys freq-map) (vals freq-map))
+ sgt-map (into (sorted-map) (apply map vector sgt))
+ sgt-with-counts (math/sgt-with-counts (keys freq-map)
+ (vals freq-map))
+ c1 (freq-map 1)
+ c1* (sgt-map 1)]
+ [c1 c1* sgt-with-counts])
+
+ )
+
+
+;; Maximum Likelihood Estimate
+;;
+;; It was about dusk, one evening during the supreme madness of the
+;; carnival season, that I encountered my friend. He accosted me with
+;; excessive warmth, for he had been drinking much. The man wore motley.
+;; He had on a tight-fitting parti-striped dress, and his head was
+;; surmounted by the conical cap and bells. I was so pleased to see him,
+;; that I thought I should never have done wringing his hand.
+;;
+;; Consider 3-grams...
+;;
+;; it was about
+;; it was there
+;;
+;; Let `N` be a sample text size and `nr` be the number of
+;; m-grams which occurred in the text exactly `r` times.
+;;
+;; So that `N` = (apply + (map #(* r nr) frequency-of-frequencies)
+;; `N` = sum for all seen-counts ("number of things seen 'count' times" * 'count')
+;; 10 things seen 5 times
+;; 4 things seen 4 times
+;; 2 things seen 1 time
+;;
+;; 10 things seen 5 times each makes up 50 "things"
+;; 4 things seen 4 times each makes up 16 "things"
+;; 2 things seen once each makes up 2 "things"
+;;
+;; Makes for `N` = 50 + 16 + 2 things... 68 things (m-grams).
+;;
+;; Consider the m-gram "it was about" occurred 4 times.
+;; And in total we saw 60 3-grams. Then the MLE
+;; is 4 / 60.
+;;
+;;
+;;;; Base MLE
+;;
+;; Discount of the n-gram
+;; *
+;; Count of n-gram
+;; /
+;; Count of n-1-gram
+
+
+(defn maximum-likelihood-estimate [trie trie-database n-gram]
+ (/ (get-in trie-database [n-gram :count])
+ (get-in trie-database [(butlast n-gram) :count])))
+
+
+(comment
+ (maximum-likelihood-estimate trie trie-database '("," "the"))
+
+ (maximum-likelihood-estimate trie trie-database '(","))
+
+ (let [[rs nrs ests lgts]
+ (apply
+ math/sgt-with-counts
+ (apply map vector (seq (trie->frequency-of-frequencies-map trie 2))))]
+ [rs nrs ests lgts])
+
+ )
+
+;;;; KATZ ;;;;
+
+;; (defn N [trie n-gram-rank]
+;; (let [r->Nr (trie->frequency-of-frequencies-map trie n-gram-rank)]
+;; (apply + (map (fn [[r nr]] (* r nr)) r->Nr))))
+
+;; (defn r* [trie n-gram-rank]
+;; (let [r->Nr (trie->frequency-of-frequencies-map trie n-gram-rank)
+;; _ _ _ r*s]))
+
+
+;;;;
+
+(defn zipper-leaf-path-seq
+ [zipper]
+ (->> zipper
+ (iterate zip/next)
+ (take-while (complement zip/end?))
+ (filter (complement zip/branch?))
+ (map zip/path)
+ (map (partial map first))
+ (filter (comp tb2/words last))))
+
+(comment
+ (def target-grammar-structure
+ '(TOP (S (NP (WDT)) (VP (VBD) (NP (DT) (NN))))))
+
+ (reverse (zipper-leaf-path-seq (zip/seq-zip target-grammar-structure)))
+
+ (defn decode-fn
+ "Decodes a variable-length encoded number from a byte-buffer.
+ Zero gets decoded to nil."
+ [byte-buffer]
+ (let [value (encoding/decode byte-buffer)]
+ (if (zero? value)
+ nil
+ value)))
+
+ (def tpt (tpt/load-tightly-packed-trie-from-file
+ (io/resource "dark-corpus-4-gram-backwards-tpt.bin")
+ decode-fn))
+
+ (def database (nippy/thaw-from-file (io/resource "dark-corpus-4-gram-backwards-db.bin")))
+
+ (def example-story
+ (loop [generated-text (vec (repeat 3 (get database "")))
+ i 0]
+ (if (> i 20)
+ generated-text
+ (let [children (loop [i 4]
+ (let [node
+ (trie/lookup
+ tpt
+ (vec (take-last i generated-text)))
+ children
+ (and node (trie/children node))]
+ (cond
+ (nil? node) (recur (dec i))
+ (< i 0) (throw (Exception. "Error"))
+ (seq children) children
+ :else (recur (dec i)))))]
+ (recur
+ (conj
+ generated-text
+ (->> children
+ (map #(vector (.key %) (get % [])))
+ (remove (comp nil? second))
+ (#(math/weighted-selection
+ (fn [[_ c]] c)
+ %))
+ first))
+ (inc i))))))
+
+ (map database example-story)
+
+ )
+
+(defn syllabify-phrase
+ [phrase]
+ (->> phrase
+ (#(string/split % #" "))
+ (map phonetics/get-phones)
+ (map first)
+ (map syllabify/syllabify)
+ (reduce into [])))
+
+(defn markov-choice
+ [trie generated-text k xf-filter]
+ (let [node (trie/lookup trie k)
+ children (and node
+ (->> node
+ trie/children
+ (map #(vector (.key %) (get % [])))
+ (remove (comp nil? second))))
+ choices (transduce
+ (comp
+ (map (fn [child]
+ (vector generated-text child)))
+ xf-filter) conj children)]
+ (cond
+ (nil? node) (recur trie generated-text (butlast k) xf-filter)
+
+ (seq children)
+ (if (< (rand) (/ (apply max (map second children))
+ (apply + (map second children))))
+ (recur trie generated-text (butlast k) xf-filter)
+ (first
+ (math/weighted-selection
+ (fn [[_ c]] c)
+ choices)))
+
+ (> (count k) 0) (recur trie generated-text (butlast k) xf-filter)
+
+ :else (throw (Exception. "Error")))))
+
+(defn syllable-count-pred
+ [syllable-count database]
+ (fn [node]
+ (let [syllables (syllabify-phrase (database (get node [])))]
+ (= syllable-count
+ (count syllables)))))
+
+(defn markov-select
+ [n]
+ (fn [{:keys [trie database xf-filter tokens] :as context}]
+ (loop [n n]
+ (if (= n 0)
+ ;; Unable to find a selection
+ nil
+ (let [key (take-last n tokens)
+ node (trie/lookup trie key)
+ children (and node (->> (trie/children node)
+ (remove nil?)))
+ choices (transduce
+ (comp
+ (map #(vector (.key %) (get % [])))
+ (map (fn [child] [context child]))
+ xf-filter)
+ conj
+ children)]
+ (let [freqs (map #(get % []) children)]
+ (cond
+ (or (empty? choices) (empty? freqs))
+ (recur (dec n))
+
+ (and
+ (> n 1)
+ (< (rand)
+ (/ (apply max freqs)
+ (apply + freqs))))
+ (recur
+ (dec n))
+
+ :else
+ (let [result (second (math/weighted-selection
+ (comp second second)
+ choices))]
+ (first result)))))))))
+
+(defn generate-sentence
+ [{:keys [trie database stop? xf-filter tokens] :as context}]
+ (let [markov-fn (markov-select 4)]
+ (loop [context (assoc context :i 0)]
+ (let [tokens (:tokens context)]
+ (cond
+ (stop? context)
+ tokens
+
+ :else
+ (let [selection (markov-fn context)]
+ (if (nil? selection)
+ (do
+ (println tokens)
+ (throw (Exception. "No possible selection")))
+ (recur (update
+ context
+ :tokens
+ conj
+ selection)))))))))
+
+(comment
+ (trie/lookup tpt '(1 1 1))
+ (let [context {:tokens (vec (repeat 3 (database "")))
+ :trie tpt
+ :database database
+ :stop? (fn [{:keys [tokens] :as context}]
+ (let [sentence (->> tokens
+ (map database)
+ (remove #{""})
+ (string/join " "))]
+ (<= 10 (count (syllabify-phrase sentence)))))
+ :xf-filter (comp
+ (remove
+ (fn [[context [k v]]]
+ (= k 7)))
+ (filter
+ (fn [[context [k v]]]
+ (let [current-sentence
+ (->> (:tokens context)
+ (map database)
+ (remove #{""})
+ (string/join " "))
+ current-syllable-count
+ (count (syllabify-phrase current-sentence))
+ current-word (database k)
+ current-word-syllable-count (count (syllabify-phrase current-word))]
+ (>= (- 10 current-syllable-count)
+ current-word-syllable-count)))))}]
+ (->> (generate-sentence context)
+ (map database)))
+
+
+ (database "")
+ )
+#_(defn generate-sentence
+ [trie database stop? filters]
+ (loop [generated-text (vec (repeat 3 (get database "")))
+ i 0]
+ (cond
+ (> i 400)
+ nil
+
+ (stop? generated-text)
+ generated-text
+
+ ;; reset
+ (or (zero? (mod i 40))
+ (> syllable-count target-syllable-count))
+ (recur (vec (repeat 3 (get database ""))) 0)
+
+ :else
+ (let [choice (markov-choice
+ trie
+ (take-last 4 generated-text)
+ filters)]
+ (recur
+ (conj
+ generated-text
+ choice)
+ (inc i))))))
+
+(comment
+ (let [disallow-sentence-start-xf
+        (remove #(= (database (first %)) "") children)
+ ])
+
+ (map database (generate-sentence tpt database 10))
+
+ (def grammar-freqs (nippy/thaw-from-file "/tmp/grammar-freqs-top-100.bin"))
+ (def part-of-speech-freqs (nippy/thaw-from-file "/tmp/part-of-speech-freqs.bin"))
+ (take 100 part-of-speech-freqs)
+
+ (loop [generated-text (vec (repeat 3 (get database "")))
+ i 0]
+ (let [current-sentence
+ (take-while
+ (complement (into #{} (map database ["" ""])))
+ (reverse generated-text))]
+ (if (> i 20)
+ generated-text
+ (let [children (loop [i 4]
+ (let [node
+ (trie/lookup
+ tpt
+ (vec (take-last i generated-text)))
+ children
+ (and node (trie/children node))]
+ (cond
+ (nil? node) (recur (dec i))
+ (< i 0) (throw (Exception. "Error"))
+ (seq children) children
+ :else (recur (dec i)))))]
+ (recur
+ (conj
+ generated-text
+ (->> children
+ (map #(vector (.key %) (get % [])))
+ (remove (comp nil? second))
+ (#(math/weighted-selection
+ (fn [[_ c]] c)
+ %))
+ first))
+ (inc i))))))
+
+ )
diff --git a/src/com/owoga/prhyme/nlp/core.clj b/src/com/owoga/prhyme/nlp/core.clj
index e7261fa..a7d0557 100644
--- a/src/com/owoga/prhyme/nlp/core.clj
+++ b/src/com/owoga/prhyme/nlp/core.clj
@@ -232,29 +232,34 @@
Porcelain. If you have the simple tree data structure
returned by `parse-to-simple-tree`, then you can just
pass that directly to `zip/seq-zip`."
- [texts]
- (let [tree (->> texts
- (map tokenize)
- (map (partial string/join " "))
+ [text]
+ (let [tree (->> text
+ tokenize
+ (string/join " ")
+ vector
parse
- (map tb/make-tree)
+ first
+ tb/make-tree
unmake-tree)]
(zip/seq-zip tree)))
(comment
- (let [texts ["Eric's test is difficult."]]
- (loop [zipper (treebank-zipper texts)]
+ ;; Here is a demo of zipping through a parse tree and changing
+ ;; all adjectives to "thorough".
+ (let [text "Eric's test is difficult."]
+ (loop [zipper (treebank-zipper text)]
(cond
(zip/end? zipper) (zip/root zipper)
(= 'JJ (zip/node zipper)) (recur (-> zipper
zip/next
(zip/replace '("thorough"))))
:else (recur (zip/next zipper)))))
- ;; => ((TOP
- ;; ((S
- ;; ((NP ((NP ((NNP ("Eric")) (POS ("'s")))) (NN ("test"))))
- ;; (VP ((VBZ ("is")) (ADJP ((JJ ("thorough"))))))
- ;; (. (".")))))))
+ ;; => (TOP
+ ;; ((S
+ ;; ((NP ((NP ((NNP ("Eric")) (POS ("'s")))) (NN ("test"))))
+ ;; (VP ((VBZ ("is")) (ADJP ((JJ ("thorough"))))))
+ ;; (. ("."))))))
+
)
(defn iter-zip
@@ -341,6 +346,9 @@
(leaf-pos-path-word-freqs zipper))
(comment
+ (treebank-zipper ["Eric's test is difficult."
+ "Eric's test is thorough."
+ "Eric's testing."])
(let [zipper (treebank-zipper ["Eric's test is difficult."
"Eric's test is thorough."
"Eric's testing."])]
@@ -406,6 +414,9 @@
"you are a test"]]
(grammar-tree-frequencies
document))
+
+ (grammar-tree-frequencies ["this is a test."])
+ (parse-to-simple-tree ["this is a test."])
;; => {(TOP (S (NP (WDT)) (VP (VBD) (NP (DT) (NN))))) 1,
;; (TOP (S (NP (DT)) (VP (VBZ) (NP (DT) (NN))))) 2,
;; (TOP (S (NP (PRP)) (VP (VBP) (NP (DT) (NN))))) 1}
@@ -786,7 +797,7 @@
(remove #(string? (first %)))))
(comment
- (phrase-constituents "My name is Eric.")
+ (phrase-constituents ["My name is Eric."])
;; => ((TOP (S)) (S (NP VP .)) (NP (PRP$ NN)) (VP (VBZ NP)) (NP (NNP)))
(phrase-constituents "How are you?")
;; => ((TOP (SBARQ)) (SBARQ (WHADVP SQ .)) (WHADVP (WRB)) (SQ (VBP NP)) (NP (PRP)))
@@ -833,6 +844,7 @@
"My hat is blue and I like cake."
"Your name is Taylor."
"How are you?"])
+
;; => {TOP {(S) 3, (SBARQ) 1},
;; S {(NP VP .) 2, (S CC S .) 1, (NP VP) 2},
;; NP {(PRP$ NN) 3, (NNP) 2, (PRP) 2, (NN) 1},
@@ -1002,6 +1014,7 @@
"Your name is not Eric."
"Who is your mother and what does she do?"]
(pos-constituent-frequencies)
+
#_#_(apply
merge-with
(fn [a b]
@@ -1061,10 +1074,20 @@
)
+(defn most-likely-parts-of-speech
+ [phrase]
+ (top-k-sequences prhyme-pos-tagger (tokenize phrase)))
+
(comment
- (let [text ["bother me"]]
- (->> text
- (map tokenize)
- (map #(top-k-sequences prhyme-pos-tagger %))))
+ (let [text "a dog"]
+ (first
+ (map #(.getOutcomes %)
+ (most-likely-parts-of-speech text))))
+ ;; => ["PRP" "VBP" "DT" "NN" "."]
+
+
+ (map (juxt #(.getOutcomes %)
+ #(map float (.getProbs %)))
+ (top-k-sequences prhyme-pos-tagger (tokenize "")))
)
diff --git a/src/com/owoga/prhyme/syllabify.clj b/src/com/owoga/prhyme/syllabify.clj
index 8958e80..647bfa4 100644
--- a/src/com/owoga/prhyme/syllabify.clj
+++ b/src/com/owoga/prhyme/syllabify.clj
@@ -29,6 +29,7 @@
;; of a word. So it should be e.lip.sis
;; As an alternative to handling the isolated "s"-at-the-end-of-internal-coda case,
;; it works well-enough for me to treat all fricatives as lowest priority.
+
(def ^clojure.lang.PersistentVector sonority-hierarchy
["vowel" "liquid" "affricate" "fricative" "nasal" "stop" "semivowel" "aspirate"])
diff --git a/src/com/owoga/prhyme/util/math.clj b/src/com/owoga/prhyme/util/math.clj
index f530552..7bafe82 100644
--- a/src/com/owoga/prhyme/util/math.clj
+++ b/src/com/owoga/prhyme/util/math.clj
@@ -286,6 +286,50 @@
[lgt-estimate lgt?]
[turing-estimate lgt?]))))))))
+
+(defn smoothed-frequencies
+ [rs nrs]
+ (let [l (count rs)
+ N (apply + (map #(apply * %) (map vector rs nrs)))
+ p0 (/ (first nrs) N)
+ zrs (average-consecutives rs nrs)
+ log-rs (map #(Math/log %) rs)
+ log-zrs (map #(Math/log %) zrs)
+ lm (least-squares-linear-regression log-rs log-zrs)
+ lgts (map lm rs)
+ estimations (loop [coll rs
+ lgt? false
+ e (estimator lm rs zrs)
+ estimations []]
+ (cond
+ (empty? coll) estimations
+ :else
+ (let [[estimation lgt?] (e (first coll) lgt?)]
+ (recur
+ (rest coll)
+ lgt?
+ e
+ (conj estimations estimation)))))
+ N* (apply + (map #(apply * %) (map vector nrs estimations)))
+ probs (cons
+ (float p0)
+ (map #(* (- 1 p0) (/ % N*)) estimations))
+ sum-probs (apply + probs)]
+ [lgts
+ (map
+ (fn [r]
+ (* (inc r) (/ (lm (inc r)) (lm r))))
+ (partition 2 1 (conj rs (inc (peek rs)))))]))
+
+(comment
+ (let [rs [ 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26]
+ nrs [32 20 10 3 1 2 1 1 1 2 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1]
+ rs [1 2 3 4 5 6 7 8 9 10 12 26]
+ nrs [32 20 10 3 1 2 1 1 1 2 1 1]]
+ (smoothed-frequencies rs nrs))
+
+ )
+
(defn sgt [rs nrs]
(assert (and (not-empty nrs) (not-empty rs))
"frequencies and frequency-of-frequencies can't be empty")
@@ -316,7 +360,8 @@
(map #(* (- 1 p0) (/ % N*)) estimations))
sum-probs (apply + probs)]
[(cons 0 rs)
- (map #(/ % sum-probs) probs)]))
+ (map #(/ % sum-probs) probs)
+ estimations]))
(comment
(let [rs [ 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26]