;; Generate with some grammar rules
;; parent dc5340be7f — commit dd51e0fdca
(ns com.owoga.prhyme.generation.markov-example
  (:require [clojure.string :as string]
            [clojure.java.io :as io]
            [com.owoga.prhyme.util.math :as math]
            [com.owoga.phonetics :as phonetics]
            [com.owoga.phonetics.syllabify :as syllabify]
            [cljol.dig9 :as d]
            [clojure.zip :as zip]
            [com.owoga.tightly-packed-trie.bit-manip :as bm]
            [com.owoga.trie :as trie]
            [com.owoga.tightly-packed-trie.encoding :as encoding]
            [com.owoga.tightly-packed-trie :as tpt]
            [taoensso.nippy :as nippy]
            [com.owoga.prhyme.nlp.tag-sets.treebank-ii :as tb2]
            [com.owoga.prhyme.nlp.core :as nlp]))
;; Full text of the training corpus, read from the bundled resource at load time.
(def corpus (-> "cask_of_amontillado.txt" io/resource slurp))
;; For better generation of text, you'll probably want to pad the starts
;; of sentences with n-1 "start-of-sentence" tokens.
(defn prep-punctuation-for-tokenization
  "Surrounds each of . , ! ? with single spaces so the tokenizer
  doesn't glue punctuation onto the neighboring words.

  May introduce extra whitespace; the tokenizer's split regex is
  expected to swallow it."
  [text]
  (string/replace text #"([\.,!?])" " $1 "))
(defn remove-quotes
  "...and hyphens — drops every double-quote and hyphen character from `text`."
  [text]
  (apply str (remove #{\" \-} text)))
(defn remove-formatting-characters
  "Input has underscores and asterisks, presumably because the text might
  be rendered by something that can italicize or bold text. Strip them."
  [text]
  (apply str (remove #{\_ \*} text)))
(defn tokenize
  "Normalizes `text` into a vector of lowercase tokens: strips formatting
  characters, pads punctuation so it tokenizes separately, drops quotes
  and hyphens, lower-cases, then splits on runs of spaces and newlines."
  [text]
  (string/split
   (string/lower-case
    (remove-quotes
     (prep-punctuation-for-tokenization
      (remove-formatting-characters text))))
   #"[\n ]+"))
(defn interleave-all
  "Like interleave, but instead of ending the interleave when the shortest
  collection has been consumed, continues to interleave the remaining
  collections."
  {:added "1.0"
   :static true}
  ([] ())
  ([c1] (lazy-seq c1))
  ([c1 c2]
   (lazy-seq
    (let [s1 (seq c1) s2 (seq c2)]
      (if (and s1 s2)
        (cons (first s1) (cons (first s2)
                               (interleave-all (rest s1) (rest s2))))
        (lazy-seq (or s1 s2))))))
  ([c1 c2 & colls]
   (lazy-seq
    (let [ss (->> (map seq (conj colls c2 c1))
                  (remove nil?))]
      ;; `remove` always returns a (possibly empty) seq, never nil, so the
      ;; previous `(when ss …)` was always truthy and termination relied
      ;; accidentally on `(apply interleave-all ())` bottoming out.
      ;; Test emptiness explicitly with `seq`.
      (when (seq ss)
        (concat (map first ss) (apply interleave-all (map rest ss))))))))
(comment
  ;; Interleaving 1-, 2-, and 3-element partitions of the same token stream.
  (let [tokens [1 2 3 4 5]
        p1 (partition 1 1 tokens)
        p2 (partition 2 1 tokens)
        p3 (partition 3 1 tokens)]
    (interleave-all p1 p2 p3)))
(defn ngramify-tokens
  "Returns every n-gram of `tokens` for sizes n (inclusive) through m
  (exclusive), interleaved so the n-grams starting at one position come
  before those starting at the next (see the comment block below)."
  [n m tokens]
  (apply interleave-all
         (map #(partition % 1 tokens)
              (range n m))))
(comment
  (->> (tokenize corpus)
       (take 5)
       (ngramify-tokens 1 4))
  ;; => (("the")
  ;;     ("the" "thousand")
  ;;     ("the" "thousand" "injuries")
  ;;     ("thousand")
  ;;     ("thousand" "injuries")
  ;;     ("thousand" "injuries" "of")
  ;;     ("injuries")
  ;;     ("injuries" "of")
  ;;     ("injuries" "of" "fortunato")
  ;;     ("of")
  ;;     ("of" "fortunato")
  ;;     ("fortunato"))
  )
(defn add-terminal-value-to-ngram
  "The Trie expects entries to be of the form '(k1 k2 k3 value).
  The ngrams generated above are just '(k1 k2 k3). Appends the ngram
  itself as the value: '(k1 k2 k3 (k1 k2 k3))."
  [ngram]
  (concat ngram [ngram]))
(defn trie->frequency-of-frequencies-map
  "Sorted map from an observed :count to how many rank-`n` trie entries
  occur with that count. The second argument specifies which rank you
  want the map for."
  [trie n]
  (into (sorted-map)
        (frequencies
         (map (comp :count second first seq)
              (trie/children-at-depth n trie)))))
(comment
  (trie->frequency-of-frequencies-map trie 1)
  ;; => {1 558,
  ;;     2 110,
  ;;     ,,,
  ;;     167 1,
  ;;     177 1}
  )
;; The frequency of a thus-far unseen species is the number of species seen
;; once over the total number of species.
;; That's commonly referred to as P0.
;; There will be a different P0 for each rank of n-gram.
(defn P0
  "Estimated probability of a previously unseen rank-`n` n-gram: the
  number of distinct n-grams seen exactly once divided by the total
  number of distinct n-grams."
  [trie n]
  (let [freq-map (trie->frequency-of-frequencies-map trie n)]
    ;; Default to 0 when nothing occurred exactly once — the bare
    ;; `(freq-map 1)` returned nil and made the division throw.
    (/ (get freq-map 1 0) (apply + (vals freq-map)))))
(comment
  (P0 trie 1)
  ;; => 31/45
  )
;; From here on out, we follow a similar procedure.
;; What we just did, P0, is the probability of seeing something
;; that has been previously unseen.
;; We found that by using what we know about P1 (how many times
;; things have been seen once).
;;
;; Now, we need to adjust our P1 number since we just gave some probability
;; to P0, which previously had no probability since it wasn't in our
;; frequency table.
;;
;; What's the new probability that the next thing we see is from the group of
;; n-grams that we've seen once?
;;
;; The same way P0 was based off P1, P1 will be based off P2.
;;
;; It's basically 2 * the number of times we've seen things twice divided
;; by the total number of things we've seen.
;;
;; P0 was 1 * number of 1-time things / total number of n-time things.
;; P1 is 2 * number of 2-time things / total number of n-time things.
;; P2 is 3 * number of 3-time things / total number of n-time things.
;;
;; With a slight adjustment: the frequency of frequencies needs to be smoothed
;; so there are no 0-values. When you get up to P14, P15, etc., there might be
;; gaps where you'll see something 14 times once, then won't see anything 15 or
;; 16 times, so N15 and N16 will be 0, then you'll see something 17 times twice.
;;
;; This is just noise from having limited data. The noise needs to be smoothed out.
(defn simple-good-turing-map
  "Sorted map from an observed count to its Simple Good-Turing smoothed
  count, for the rank-`n` n-grams of `trie`."
  [trie n]
  ;; The previous `xs`/`ys` log-log bindings were never used — the
  ;; regression happens inside math/sgt — so they are removed.
  (let [freq-map (trie->frequency-of-frequencies-map trie n)
        sgt (math/sgt (keys freq-map) (vals freq-map))]
    (into (sorted-map) (apply map vector sgt))))
(comment
  (let [freq-map (trie->frequency-of-frequencies-map trie 2)
        xs (->> freq-map keys (map #(Math/log %)))
        ys (->> freq-map vals (map #(Math/log %)))
        sgt (math/sgt (keys freq-map) (vals freq-map))
        sgt-map (into (sorted-map) (apply map vector sgt))
        sgt-with-counts (math/sgt-with-counts (keys freq-map)
                                              (vals freq-map))
        c1 (freq-map 1)
        c1* (sgt-map 1)]
    [c1 c1* sgt-with-counts]))
;; Maximum Likelihood Estimate
;;
;; It was about dusk, one evening during the supreme madness of the
;; carnival season, that I encountered my friend. He accosted me with
;; excessive warmth, for he had been drinking much. The man wore motley.
;; He had on a tight-fitting parti-striped dress, and his head was
;; surmounted by the conical cap and bells. I was so pleased to see him,
;; that I thought I should never have done wringing his hand.
;;
;; Consider 3-grams...
;;
;; it was about
;; it was there
;;
;; Let `N` be a sample text size and `nr` be the number of
;; m-grams which occurred in the text exactly `r` times.
;;
;; So that `N` = (apply + (map #(* r nr) frequency-of-frequencies))
;; `N` = sum for all seen-counts ("number of things seen 'count' times" * 'count')
;; 10 things seen 5 times
;; 4 things seen 4 times
;; 2 things seen 1 time
;;
;; 10 things seen 5 times each makes up 50 "things"
;; 4 things seen 4 times each makes up 16 "things"
;; 2 things seen once each makes up 2 "things"
;;
;; Makes for `N` = 50 + 16 + 2 things... 68 things (m-grams).
;;
;; Consider the m-gram "it was about" occurred 4 times.
;; And in total we saw 60 3-grams. Then the MLE
;; is 4 / 60.
;;
;;
;;;; Base MLE
;;
;; Discount of the n-gram
;; *
;; Count of n-gram
;; /
;; Count of n-1-gram
(defn maximum-likelihood-estimate
  "MLE of `n-gram`: its :count in `trie-database` divided by the :count
  of its (n-1)-gram prefix. `trie` is accepted but unused here."
  [trie trie-database n-gram]
  (let [ngram-count (get-in trie-database [n-gram :count])
        prefix-count (get-in trie-database [(butlast n-gram) :count])]
    (/ ngram-count prefix-count)))
(comment
  (maximum-likelihood-estimate trie trie-database '("," "the"))

  (maximum-likelihood-estimate trie trie-database '(","))

  (let [[rs nrs ests lgts]
        (apply
         math/sgt-with-counts
         (apply map vector (seq (trie->frequency-of-frequencies-map trie 2))))]
    [rs nrs ests lgts]))
;;;; KATZ ;;;;

;; (defn N [trie n-gram-rank]
;;   (let [r->Nr (trie->frequency-of-frequencies-map trie n-gram-rank)]
;;     (apply + (map (fn [[r nr]] (* r nr)) r->Nr))))

;; (defn r* [trie n-gram-rank]
;;   (let [r->Nr (trie->frequency-of-frequencies-map trie n-gram-rank)
;;         _ _ _ r*s]))

;;;;
(defn zipper-leaf-path-seq
  "Walks `zipper` depth-first and returns, for each leaf node, the seq of
  first-elements along its path — keeping only paths whose last element
  is in tb2/words."
  [zipper]
  (->> (iterate zip/next zipper)
       (take-while #(not (zip/end? %)))
       (remove zip/branch?)
       (map zip/path)
       (map #(map first %))
       (filter #(tb2/words (last %)))))
(comment
  (def target-grammar-structure
    '(TOP (S (NP (WDT)) (VP (VBD) (NP (DT) (NN))))))

  (reverse (zipper-leaf-path-seq (zip/seq-zip target-grammar-structure)))

  (defn decode-fn
    "Decodes a variable-length encoded number from a byte-buffer.
    Zero gets decoded to nil."
    [byte-buffer]
    (let [value (encoding/decode byte-buffer)]
      (if (zero? value)
        nil
        value)))

  (def tpt (tpt/load-tightly-packed-trie-from-file
            (io/resource "dark-corpus-4-gram-backwards-tpt.bin")
            decode-fn))

  (def database (nippy/thaw-from-file (io/resource "dark-corpus-4-gram-backwards-db.bin")))

  (def example-story
    (loop [generated-text (vec (repeat 3 (get database "</s>")))
           i 0]
      (if (> i 20)
        generated-text
        (let [children (loop [i 4]
                         (let [node
                               (trie/lookup
                                tpt
                                (vec (take-last i generated-text)))
                               children
                               (and node (trie/children node))]
                           (cond
                             (nil? node) (recur (dec i))
                             (< i 0) (throw (Exception. "Error"))
                             (seq children) children
                             :else (recur (dec i)))))]
          (recur
           (conj
            generated-text
            (->> children
                 (map #(vector (.key %) (get % [])))
                 (remove (comp nil? second))
                 (#(math/weighted-selection
                    (fn [[_ c]] c)
                    %))
                 first))
           (inc i))))))

  (map database example-story))
(defn syllabify-phrase
  "Splits `phrase` on single spaces, syllabifies the first
  phonetics/get-phones result for each word, and concatenates every
  word's syllables into one vector."
  [phrase]
  (reduce into []
          (map (comp syllabify/syllabify first phonetics/get-phones)
               (string/split phrase #" "))))
(defn markov-choice
  "Picks the next token by looking `k` up in `trie`. Backs off to shorter
  keys (via `butlast`) when the key misses, when a node has no usable
  children, or — at random, with probability max-weight/total-weight —
  even when children exist. `xf-filter` is a transducer over
  [generated-text [key value]] pairs. Throws when nothing can be chosen."
  [trie generated-text k xf-filter]
  (let [looked-up (trie/lookup trie k)
        child-pairs (and looked-up
                         (->> (trie/children looked-up)
                              (map #(vector (.key %) (get % [])))
                              (remove (comp nil? second))))
        filtered (transduce
                  (comp (map (fn [child] (vector generated-text child)))
                        xf-filter)
                  conj
                  child-pairs)]
    (cond
      ;; Key not present: back off to a shorter key.
      (nil? looked-up)
      (recur trie generated-text (butlast k) xf-filter)

      (seq child-pairs)
      ;; Randomly back off in proportion to how dominant the single
      ;; heaviest child is; otherwise sample by weight.
      (if (< (rand) (/ (apply max (map second child-pairs))
                       (apply + (map second child-pairs))))
        (recur trie generated-text (butlast k) xf-filter)
        (first
         (math/weighted-selection
          (fn [[_ c]] c)
          filtered)))

      ;; Node exists but had no usable children: back off.
      (pos? (count k))
      (recur trie generated-text (butlast k) xf-filter)

      :else (throw (Exception. "Error")))))
(defn syllable-count-pred
  "Returns a predicate over trie nodes that is true when the node's
  phrase (its value looked up through `database`) has exactly
  `syllable-count` syllables."
  [syllable-count database]
  (fn [node]
    (= syllable-count
       (count (syllabify-phrase (database (get node [])))))))
(defn markov-select
  "Returns a selection function over a generation context
  (:trie :database :xf-filter :tokens). The returned fn looks up the last
  `n` tokens, backing off to shorter suffixes until a filtered child can
  be chosen; yields the chosen child's key, or nil when even a 1-token
  suffix produces no viable choice."
  [n]
  (fn [{:keys [trie database xf-filter tokens] :as context}]
    (loop [n n]
      (if (= n 0)
        ;; Backed off all the way — no possible selection.
        nil
        (let [lookup-key (take-last n tokens)
              node (trie/lookup trie lookup-key)
              children (and node (->> (trie/children node)
                                      (remove nil?)))
              choices (transduce
                       (comp
                        (map #(vector (.key %) (get % [])))
                        (map (fn [child] [context child]))
                        xf-filter)
                       conj
                       children)
              freqs (map #(get % []) children)]
          (cond
            (or (empty? choices) (empty? freqs))
            (recur (dec n))

            ;; Randomly back off in proportion to how dominant the
            ;; heaviest child is (only while n > 1).
            (and (> n 1)
                 (< (rand)
                    (/ (apply max freqs)
                       (apply + freqs))))
            (recur (dec n))

            :else
            ;; Each choice is [context [k v]]: weight by v, return k.
            (let [[k _] (second (math/weighted-selection
                                 (comp second second)
                                 choices))]
              k)))))))
(defn generate-sentence
  "Repeatedly appends markov-selected tokens (suffix order 4) to
  (:tokens context) until (:stop? context) is satisfied, then returns
  the token vector. Throws when no selection is possible."
  [{:keys [trie database stop? xf-filter tokens] :as context}]
  (let [select-next (markov-select 4)]
    (loop [context (assoc context :i 0)]
      (let [tokens (:tokens context)]
        (if (stop? context)
          tokens
          (let [selection (select-next context)]
            (when (nil? selection)
              (println tokens)
              (throw (Exception. "No possible selection")))
            (recur (update context :tokens conj selection))))))))
(comment
  (trie/lookup tpt '(1 1 1))

  (let [context {:tokens (vec (repeat 3 (database "</s>")))
                 :trie tpt
                 :database database
                 :stop? (fn [{:keys [tokens] :as context}]
                          (let [sentence (->> tokens
                                              (map database)
                                              (remove #{"</s>"})
                                              (string/join " "))]
                            (<= 10 (count (syllabify-phrase sentence)))))
                 :xf-filter (comp
                             (remove
                              (fn [[context [k v]]]
                                (= k 7)))
                             (filter
                              (fn [[context [k v]]]
                                (let [current-sentence
                                      (->> (:tokens context)
                                           (map database)
                                           (remove #{"</s>"})
                                           (string/join " "))
                                      current-syllable-count
                                      (count (syllabify-phrase current-sentence))
                                      current-word (database k)
                                      current-word-syllable-count
                                      (count (syllabify-phrase current-word))]
                                  (>= (- 10 current-syllable-count)
                                      current-word-syllable-count)))))}]
    (->> (generate-sentence context)
         (map database)))

  (database "<s>"))
;; Reader-ignored older draft, kept for reference. Note it references
;; `syllable-count`/`target-syllable-count` that are not defined here and
;; calls markov-choice with only three args — it is never evaluated.
#_(defn generate-sentence
    [trie database stop? filters]
    (loop [generated-text (vec (repeat 3 (get database "</s>")))
           i 0]
      (cond
        (> i 400)
        nil

        (stop? generated-text)
        generated-text

        ;; reset
        (or (zero? (mod i 40))
            (> syllable-count target-syllable-count))
        (recur (vec (repeat 3 (get database "</s>"))) 0)

        :else
        (let [choice (markov-choice
                      trie
                      (take-last 4 generated-text)
                      filters)]
          (recur
           (conj
            generated-text
            choice)
           (inc i))))))
(comment
  (let [disallow-sentence-start-xf
        (remove (= (database (first %)) "<s>") children)
        ])

  (map database (generate-sentence tpt database 10))

  (def grammar-freqs (nippy/thaw-from-file "/tmp/grammar-freqs-top-100.bin"))
  (def part-of-speech-freqs (nippy/thaw-from-file "/tmp/part-of-speech-freqs.bin"))
  (take 100 part-of-speech-freqs)

  (loop [generated-text (vec (repeat 3 (get database "</s>")))
         i 0]
    (let [current-sentence
          (take-while
           (complement (into #{} (map database ["<s>" "</s>"])))
           (reverse generated-text))]
      (if (> i 20)
        generated-text
        (let [children (loop [i 4]
                         (let [node
                               (trie/lookup
                                tpt
                                (vec (take-last i generated-text)))
                               children
                               (and node (trie/children node))]
                           (cond
                             (nil? node) (recur (dec i))
                             (< i 0) (throw (Exception. "Error"))
                             (seq children) children
                             :else (recur (dec i)))))]
          (recur
           (conj
            generated-text
            (->> children
                 (map #(vector (.key %) (get % [])))
                 (remove (comp nil? second))
                 (#(math/weighted-selection
                    (fn [[_ c]] c)
                    %))
                 first))
           (inc i)))))))
;; (page-scrape artifacts removed)