Remove stale code, add comments, cleanup

main
Eric Ihli 3 years ago
parent 9b8fec9fc7
commit 62a1ff3a3b

@ -61,6 +61,24 @@ words, then you're stuck trying to rhyme with the single syllable
Implementation Implementation
-------------- --------------
2021-06-09
++++++++++
Most generation tasks are going to require some big data structures, like a Trie of n-grams.
A ``context`` is an atom that gets updated with those data structures.
Loading some of these data structures can take a long time, so only load what you need.
An example of the different data structures you might load:
Alliterations - From the database of n-grams, convert each n-gram to syllables then create a trie of the alliterations.
Perfect rhymes - Again, from the database of n-grams, convert n-gram to syllables and create trie of reverse of syllables.
Imperfect rhymes - Perform some manipulation of the syllables so that you can be more flexible with your rhymes.
One key that is probably always required is the ``database``. This maps words to their IDs and IDs to their words. The integer
IDs are necessary for tightly packed tries.
2020-10-20 2020-10-20
++++++++++ ++++++++++

@ -21,7 +21,7 @@
com.taoensso/nippy {:mvn/version "3.0.0"} com.taoensso/nippy {:mvn/version "3.0.0"}
com.taoensso/timbre {:mvn/version "4.10.0"} com.taoensso/timbre {:mvn/version "4.10.0"}
com.owoga/tightly-packed-trie com.owoga/tightly-packed-trie
{:local/root "/home/eihli/code/tightly-packed-trie"} {:local/root "/home/eihli/src/clj-tightly-packed-trie"}
com.owoga/phonetics {:mvn/version "0.1.1"}} com.owoga/phonetics {:mvn/version "0.1.1"}}
:aliases {:dev {:extra-paths ["test" "examples" "dev"] :aliases {:dev {:extra-paths ["test" "examples" "dev"]
:extra-deps {}}}} :extra-deps {}}}}

@ -18,6 +18,7 @@
[com.owoga.prhyme.syllabify :as syllabify] [com.owoga.prhyme.syllabify :as syllabify]
[taoensso.nippy :as nippy])) [taoensso.nippy :as nippy]))
(set! *warn-on-reflection* true)
(tufte/add-basic-println-handler! {}) (tufte/add-basic-println-handler! {})
(defn xf-file-seq [start end] (defn xf-file-seq [start end]
@ -505,7 +506,7 @@
(trie/make-trie) (trie/make-trie)
(@context :database))) (@context :database)))
#_(swap! (swap!
context context
assoc assoc
:flex-rhyme-trie :flex-rhyme-trie
@ -546,6 +547,7 @@
nil) nil)
(take 5 (@context :flex-rhyme-trie)) (take 5 (@context :flex-rhyme-trie))
) )
(comment (comment
@ -553,17 +555,31 @@
(take 20) (take 20)
(map first) (map first)
(map (partial map (@context :database)))) (map (partial map (@context :database))))
(trie/children (trie/lookup (@context :trie) [13393])) (trie/children (trie/lookup (@context :trie) [13393]))
((@context :database) "desk") ;; => 13393 ((@context :database) "desk") ;; => 13393
((@context :database) "wobbly") ;; => 152750 ((@context :database) "wobbly") ;; => 152750
(get (@context :trie) [13393 152750])) (get (@context :trie) [13393 152750]))
(defn rhyme-choices (defn rhyme-choices
[{:keys [flex-rhyme-trie database] :as context} phrase] [{:keys [rhyme-trie database] :as context} phrase]
(if (string? phrase) (if (string? phrase)
(let [phones (phrase->flex-rhyme-phones phrase)] (let [phones (phrase->phones phrase)]
(get flex-rhyme-trie phones)) (get rhyme-trie phones))
(get flex-rhyme-trie phrase))) (get rhyme-trie phrase)))
(comment
(get (:rhyme-trie @context) (phrase->phones "fall"))
(->> (rhyme-choices
@context
"fall")
#_(map (comp (:database @context) first first)))
(take 20 (:flex-rhyme-trie @context))
(take 20 (:rhyme-trie @context))
)
(defn exclude-non-rhymes-from-choices (defn exclude-non-rhymes-from-choices
"Removes any choice that includes the last "Removes any choice that includes the last
@ -633,8 +649,11 @@
(comment (comment
(get-flex-rhyme @context "bother me") (get-flex-rhyme @context "bother me")
(phrase->flex-rhyme-phones "bother me") (phrase->flex-rhyme-phones "bother me")
(get-flex-rhyme @context ["IY" "ER" "AA"]) (get-flex-rhyme @context ["IY" "ER" "AA"])
) )
(defn get-next-markov (defn get-next-markov
@ -644,7 +663,9 @@
children (and node children (and node
(->> node (->> node
trie/children trie/children
(map #(vector (.key %) (get % []))) (map (fn [^com.owoga.trie.ITrie child]
[(.key child)
(get child [])]))
(remove (comp nil? second)) (remove (comp nil? second))
(remove (remove
(fn [[k v]] (fn [[k v]]
@ -670,7 +691,14 @@
(database (get-next-markov context word-ids)))) (database (get-next-markov context word-ids))))
(comment (comment
(initialize)
(get (@context :database) "</s>")
(keys @context)
((@context :database) (get-next-markov @context [1]))
(get-next-markov @context [222]) (get-next-markov @context [222])
(get-next-markov-from-phrase-backwards @context "will strike you down" 3) (get-next-markov-from-phrase-backwards @context "will strike you down" 3)
(get (@context :database) 7982) (get (@context :database) 7982)
@ -693,6 +721,12 @@
(map database seed)))) (map database seed))))
(defn take-words-amounting-to-at-least-n-syllables (defn take-words-amounting-to-at-least-n-syllables
"This function is nice to grab the tail end of a sentence for making a good rhyme.
If the sentence ends with a single-syllable word, like 'me', but a more
interesting n-gram like 'bother me', then you might want to explore the rhymes
available for the last N syllables. Sure, a word like 'poverty' would show up if you
got all rhymes for 'me'. But you'd have to filter through a lot more less-than-great
rhymes before you see it."
[phrase n] [phrase n]
(letfn [(phones [word] (letfn [(phones [word]
[word (first (owoga.phonetics/get-phones word))]) [word (first (owoga.phonetics/get-phones word))])
@ -711,7 +745,16 @@
(map first) (map first)
(string/join " ")))) (string/join " "))))
(comment
(take-words-amounting-to-at-least-n-syllables
"police can bother me" 3);; => "police can"
(take-words-amounting-to-at-least-n-syllables
"police can bother me" 4);; => "police can bother"
)
(defn take-n-syllables (defn take-n-syllables
"Returns the vowel sounds that make up the last n syllables.
Doesn't return stress."
[phrase n] [phrase n]
(if (string? phrase) (if (string? phrase)
(->> phrase (->> phrase
@ -720,7 +763,9 @@
(reverse)) (reverse))
(take-last n phrase))) (take-last n phrase)))
(take-n-syllables "bother me" 2) (comment
(take-n-syllables "bother me" 2);; => ("ER" "IY")
)
(defn valid-english-sentence? (defn valid-english-sentence?
[phrase] [phrase]
@ -747,7 +792,8 @@
(defn rhyming-n-gram-choices (defn rhyming-n-gram-choices
[context target-rhyme] [context target-rhyme]
(loop [target-rhyme target-rhyme] (loop [target-rhyme target-rhyme]
(let [choices (->> target-rhyme (let [context @context
choices (->> target-rhyme
(rhyme-choices context) (rhyme-choices context)
(exclude-non-rhymes-from-choices context target-rhyme) (exclude-non-rhymes-from-choices context target-rhyme)
(exclude-non-english-phrases-from-choices context))] (exclude-non-english-phrases-from-choices context))]
@ -758,11 +804,12 @@
choices)))) choices))))
(comment (comment
(->> (rhyming-n-gram-choices @context "fall") (->> (rhyming-n-gram-choices context "fall")
(map (comp (@context :database) first first))) (map (comp (@context :database) first first)))
) (rhyme-choices @context "tall")
)
(defn generate-n-syllable-sentence-rhyming-with (defn generate-n-syllable-sentence-rhyming-with
[context target-phrase n-gram-rank target-rhyme-syllable-count target-sentence-syllable-count] [context target-phrase n-gram-rank target-rhyme-syllable-count target-sentence-syllable-count]
@ -801,6 +848,16 @@
" " " "
phrase))))))) phrase)))))))
(comment
(generate-n-syllable-sentence-rhyming-with
context
"war on poverty"
3
3
8)
)
(defn generate-haiku (defn generate-haiku
[seed] [seed]
(let [haiku (cons (let [haiku (cons
@ -829,7 +886,9 @@
(map last) (map last)
(apply distinct?)))) (apply distinct?))))
(->> (generate-haiku "football") (println (first (generate-haiku "fall")))
(->> (generate-haiku "</s>")
(filter valid-haiku) (filter valid-haiku)
(map (partial string/join "\n")) (map (partial string/join "\n"))
(map #(vector % (sha256 %))) (map #(vector % (sha256 %)))
@ -837,7 +896,7 @@
(println haiku) (println haiku)
(println sha) (println sha)
(println))) (println)))
(take 10)) (take 1))
) )
@ -850,7 +909,9 @@
@context @context
(take 3 (phrase->flex-rhyme-phones "player unknown battlegrounds")) (take 3 (phrase->flex-rhyme-phones "player unknown battlegrounds"))
3 4 %) 3 4 %)
[9 6 6 9 6 6 9 6 6])))) [9 6 6 9 6 6 9 6 6])))
)
" "
another day a battleground another day a battleground
@ -878,7 +939,7 @@ witness sky is blackened now
(defn continuously-amulate (defn continuously-amulate
[seed] [seed]
(let [next-sentence (amul8 seed) (let [next-sentence (generate-haiku seed)
next-seed (->> next-sentence next-seed (->> next-sentence
(#(string/split % #" ")) (#(string/split % #" "))
(reverse) (reverse)
@ -898,6 +959,8 @@ witness sky is blackened now
(cons next-sentence (continuously-amulate next-seed))))) (cons next-sentence (continuously-amulate next-seed)))))
(comment (comment
(generate-haiku "technology")
(take 5 (continuously-amulate "technology")) (take 5 (continuously-amulate "technology"))
(->> (amul8 "technology" 1) (->> (amul8 "technology" 1)
@ -1034,7 +1097,16 @@ witness sky is blackened now
(defn choose-next-word (defn choose-next-word
"Given an n-gram of [[word1 freq1] [word2 freq2]] chooses "Given an n-gram of [[word1 freq1] [word2 freq2]] chooses
the next word based on markove data in trie." the next word based on markov data in trie.
Could be improved by taking into account grammar and/or bidirectional context.
The n-gram parameter is a list of trie entries
For trie entries that are word/frequency pairs, it might look something like this.
`[[sunshine 38] [</s> 509]]`
But note that nothing in this function uses the frequency count from the passed in n-gram.
It's just easier for the calling functions to pass them in like that."
[{:keys [database trie] :as context} n-gram] [{:keys [database trie] :as context} n-gram]
(let [n-gram-ids (->> n-gram (map first) (map database)) (let [n-gram-ids (->> n-gram (map first) (map database))
node (trie/lookup trie n-gram-ids)] node (trie/lookup trie n-gram-ids)]
@ -1053,6 +1125,7 @@ witness sky is blackened now
n-minus-1-gram-odds (/ (second (first children-freqs)) n-minus-1-gram-odds (/ (second (first children-freqs))
(+ (second (get node [])) (+ (second (get node []))
(second (first children-freqs)))) (second (first children-freqs))))
;; Good-turing smoothing, take unseen ngram?
take-n-minus-1-gram? (and (< 1 (count n-gram-ids)) take-n-minus-1-gram? (and (< 1 (count n-gram-ids))
(< (rand) n-minus-1-gram-odds))] (< (rand) n-minus-1-gram-odds))]
(if take-n-minus-1-gram? (if take-n-minus-1-gram?
@ -1137,22 +1210,19 @@ witness sky is blackened now
(get trie ids)) (get trie ids))
(choose-next-word @context (take 3 [["</s>" 509]])) (choose-next-word @context (take 3 [["</s>" 509]]))
(generate-sentence-backwards @context ["</s>"]) (generate-sentence-backwards @context ["kill" "</s>"])
(valid-sentences? (generate-phrase @context '(["bitter" 41]))) (valid-sentences? (generate-phrase @context '(["bitter" 41])))
(choose-next-word @context (take 3 [["theology" 41]])) (choose-next-word @context (take 3 [["theology" 41]]))
(choose-next-word @context [["and" 5] ["theology" 41]]) (choose-next-word @context [["and" 5] ["theology" 41]])
(find-rhymes (@context :perfect-rhyme-trie) "theology") (find-rhymes (@context :perfect-rhyme-trie) "theology")
(trie/chil(trie/lookup (@context :trie) '(57 2477))) (trie/lookup (@context :trie) '(57 2477))
(take 5 (@context :trie)) (take 5 (@context :trie))
(->> (find-rhymes (@context :perfect-rhyme-trie) "technology") (->> (find-rhymes (@context :perfect-rhyme-trie) "technology")
(map (fn [[word frq]] (map (fn [[word frq]]
(let [n+1grams (word->n+1grams (let [n+1grams (word->n+1grams
@ -1162,101 +1232,55 @@ witness sky is blackened now
(map vector n+1grams (repeat [word frq]))))) (map vector n+1grams (repeat [word frq])))))
(reduce into [])) (reduce into []))
(def loaded-backwards-trie
(tpt/load-tightly-packed-trie-from-file
"resources/dark-corpus-backwards-tpt.bin"
(decode-fn @trie-database)))
(def loaded-backwards-database
(with-open [rdr (clojure.java.io/reader "resources/backwards-database.bin")]
(into {} (map read-string (line-seq rdr)))))
(def rhyme-database (atom {}))
(def db
(nippy/thaw-from-file (io/resource "dark-corpus-4-gram-backwards-db.bin")))
(def perfect-rhyme-trie
(transduce
(comp
(map first)
(filter string?)
(map #(vector % (reverse (word->phones %))))
(map reverse))
(completing
(fn [trie [k v]]
(update trie k (fnil #(update % 1 inc) [v 0]))))
(trie/make-trie)
@loaded-backwards-database))
(def vowel-rhyme-trie
(transduce
(comp
(map first)
(filter string?)
(map #(vector % (reverse (word->phones %))))
(map reverse)
(map (fn [[phones v]]
[(map #(if (owoga.phonetics/vowel
(string/replace % #"\d" ""))
%
"?")
phones)
v])))
(completing
(fn [trie [k v]]
(update trie k (fnil #(update % 1 inc) [v 0]))))
(trie/make-trie)
(take 1000 db)))
(take 20 vowel-rhyme-trie)
(do
#_(time
(def backwards-trie
(transduce (comp (xf-file-seq 0 250000)
(map slurp)
(map (partial n-to-m-backwards-grams 1 4))
(map (fn [ngrams] (map #(prep-ngram-for-trie %) ngrams)))
stateful-transducer)
conj
(file-seq (io/file "dark-corpus")))))
#_(time
(def tightly-packed-backwards-trie
(tpt/tightly-packed-trie
backwards-trie
encode-fn
(decode-fn @trie-database))))
#_(tpt/save-tightly-packed-trie-to-file
"resources/dark-corpus-backwards-tpt.bin"
tightly-packed-backwards-trie)
#_(with-open [wtr (clojure.java.io/writer "resources/backwards-database.bin")]
(let [lines (->> (seq @trie-database)
(map pr-str)
(map #(str % "\n")))]
(doseq [line lines]
(.write wtr line))))
(def loaded-backwards-trie
(tpt/load-tightly-packed-trie-from-file
"resources/dark-corpus-backwards-tpt.bin"
(decode-fn @trie-database)))
(def loaded-backwards-database
(with-open [rdr (clojure.java.io/reader "resources/backwards-database.bin")]
(into {} (map read-string (line-seq rdr)))))
(def rhyme-database (atom {}))
(def db
(nippy/thaw-from-file (io/resource "dark-corpus-4-gram-backwards-db.bin")))
(def perfect-rhyme-trie
(transduce
(comp
(map first)
(filter string?)
(map #(vector % (reverse (word->phones %))))
(map reverse))
(completing
(fn [trie [k v]]
(update trie k (fnil #(update % 1 inc) [v 0]))))
(trie/make-trie)
@loaded-backwards-database))
(def vowel-rhyme-trie
(transduce
(comp
(map first)
(filter string?)
(map #(vector % (reverse (word->phones %))))
(map reverse)
(map (fn [[phones v]]
[(map #(if (owoga.phonetics/vowel
(string/replace % #"\d" ""))
%
"?")
phones)
v])))
(completing
(fn [trie [k v]]
(update trie k (fnil #(update % 1 inc) [v 0]))))
(trie/make-trie)
(take 1000 db)))
(take 20 vowel-rhyme-trie)
)
#_(with-open [wtr (clojure.java.io/writer "database.bin")] #_(with-open [wtr (clojure.java.io/writer "database.bin")]
(let [lines (->> (seq @trie-database) (let [lines (->> (seq @trie-database)
(map pr-str) (map pr-str)
@ -1310,9 +1334,9 @@ witness sky is blackened now
(id-get-in-tpt (id-get-in-tpt
tightly-packed-trie tightly-packed-trie
trie-database trie-database
'(2 2 3))) '(2 2 3))
;; => {("<s>" "<s>" "the") {:value ("<s>" "<s>" "the"), :count 462}} ;; => {("<s>" "<s>" "the") {:value ("<s>" "<s>" "the"), :count 462}}
)
(comment (comment
(->> (perfect-rhymes perfect-rhyme-trie (->> (perfect-rhymes perfect-rhyme-trie
@ -1417,3 +1441,9 @@ witness sky is blackened now
#(= (last %) \2) #(= (last %) \2)
phones))] phones))]
(trie/lookup trie rhyme-suffix))) (trie/lookup trie rhyme-suffix)))
(comment
(keys @context)
;; => (:flex-rhyme-trie :database :trie :perfect-rhyme-trie :rhyme-trie)
)

@ -0,0 +1,81 @@
(ns com.owoga.corpus.util
(:require [taoensso.tufte :as tufte :refer (defnp p profiled profile)]
[clojure.string :as string]))
(set! *warn-on-reflection* true)
(tufte/add-basic-println-handler! {})
(defn clean-text
"Removes all non-alphabetical characters and lowercases everything.
Very spartan way of cleaning."
[text]
(string/lower-case (string/replace text #"[^a-zA-Z'\-\s]+" "")))
(defn xf-file-seq [start end]
(comp (remove #(.isDirectory %))
(drop start)
(take end)))
(def re-word
"Regex for tokenizing a string into words
(including contractions and hyphenations),
commas, periods, and newlines."
#"(?s).*?([a-zA-Z\d]+(?:['\-]?[a-zA-Z]+)?|,|\.|\n)")
(defn pad-tokens
"Pads the beginning with n - 1 <s> tokens and
the end with 1 </s> token."
[beginning-pad
number-of-beginning-pad
ending-pad
number-of-ending-pad
tokens]
(vec
(concat
(vec (repeat number-of-beginning-pad beginning-pad))
tokens
(vec (repeat number-of-ending-pad ending-pad)))))
(defn padder
[beg beg-n end end-n]
(partial pad-tokens beg beg-n end end-n))
(comment
(let [p (padder "<s>" 1 "</s>" 3)]
(p [1 2 3]));; => ["<s>" 1 2 3 "</s>" "</s>" "</s>"]
)
(defn tokenize-line
[line]
(->> line
(string/trim)
(re-seq re-word)
(mapv second)
(mapv string/lower-case)))
(comment
(tokenize-line "The lazy fox jumps over the moon.")
;; => ["the" "lazy" "fox" "jumps" "over" "the" "moon" "."]
)
(defn text->ngrams
"Takes text from a file, including newlines."
[text n]
(->> text
clean-text
(#(string/split % #"\n+"))
(remove empty?)
(mapv tokenize-line)
(mapv #(partition n 1 %))
(mapv #(mapv vec %))
(reduce #(into %1 %2) [])))
(comment
(text->ngrams "The lazy fox jumps.\nOver the 5th full moon." 3)
;; => [["the" "lazy" "fox"]
;; ["lazy" "fox" "jumps"]
;; ["over" "the" "th"]
;; ["the" "th" "full"]
;; ["th" "full" "moon"]]
)

@ -237,6 +237,8 @@
;; => [[(TOP S NP) (NP PP)] [(S NP) (NP PP)] [(NP) (NP PP)]] ;; => [[(TOP S NP) (NP PP)] [(S NP) (NP PP)] [(NP) (NP PP)]]
) )
(comment (comment
(->> (first texts) (->> (first texts)
(split-text-into-sentences) (split-text-into-sentences)
@ -406,11 +408,11 @@
(recur (zip/next zipper))))) (recur (zip/next zipper)))))
(defn grammar-children (defn grammar-children
[k] [database trie k]
(sort-by (sort-by
(comp - last) (comp - last)
(map #(vector (.key %) (@test-database (.key %)) (get % [])) (map #(vector (.key %) (database (.key %)) (get % []))
(remove (comp nil? #(get % [])) (trie/children (trie/lookup test-trie k)))))) (remove (comp nil? #(get % [])) (trie/children (trie/lookup trie k))))))
(defn grammar-branch? (defn grammar-branch?
[trie database k] [trie database k]

@ -1,9 +1,18 @@
(ns com.owoga.prhyme.limerick (ns com.owoga.prhyme.limerick
(:require [com.owoga.prhyme.gen :as gen] (:require [com.owoga.prhyme.gen :as gen]
[com.owoga.prhyme.generation.weighted-selection :as weighted-selection] [com.owoga.prhyme.generation.weighted-selection :as weighted-selection]
[com.owoga.prhyme.util.math :as math]
[com.owoga.prhyme.nlp.core :as nlp]
[clojure.string :as string] [clojure.string :as string]
[com.owoga.phonetics :as phonetics]
[com.owoga.phonetics.syllabify :as syllabify]
[com.owoga.prhyme.core :as prhyme] [com.owoga.prhyme.core :as prhyme]
[com.owoga.prhyme.util :as util])) [com.owoga.prhyme.util :as util]
[com.owoga.prhyme.data.dictionary :as dict]
[com.owoga.trie :as trie]
[com.owoga.tightly-packed-trie :as tpt]
[com.owoga.tightly-packed-trie.encoding :as encoding]
[clojure.java.io :as io]))
(defn rhyme-from-scheme (defn rhyme-from-scheme
"scheme of format [[A 9] [A 9] [B 5] [B 5] [A 9]]" "scheme of format [[A 9] [A 9] [B 5] [B 5] [A 9]]"
@ -42,6 +51,7 @@
(require '[com.owoga.prhyme.data.dictionary :as dict] (require '[com.owoga.prhyme.data.dictionary :as dict]
'[com.owoga.prhyme.data.darklyrics :refer [darklyrics-markov-2]] '[com.owoga.prhyme.data.darklyrics :refer [darklyrics-markov-2]]
'[clojure.java.io :as io]) '[clojure.java.io :as io])
(rhyme-from-scheme dict/prhyme-dict darklyrics-markov-2 '((A 8) (A 8) (B 5) (B 5) (A 8))) (rhyme-from-scheme dict/prhyme-dict darklyrics-markov-2 '((A 8) (A 8) (B 5) (B 5) (A 8)))
) )
@ -74,3 +84,204 @@
"war we await the afterlife"]) "war we await the afterlife"])
;;;; Generating limericks with a markov model
(defn phrase->flex-rhyme-phones
"Takes a space-seperated string of words
and returns the concatenation of the words
vowel phones.
Returns them in reversed order so they
are ready to be used in a lookup of a rhyme trie.
"
[phrase]
(->> phrase
(#(string/split % #" "))
(map (comp syllabify/syllabify first phonetics/get-phones))
(map (partial reduce into []))
(map #(filter (partial re-find #"\d") %))
(flatten)
(map #(string/replace % #"\d" ""))
(reverse)))
(defn word->phones [word]
(or (dict/word->cmu-phones word)
(util/get-phones-with-stress word)))
(defonce context (atom {}))
(defn decode-fn [db]
(fn [byte-buffer]
(let [value (encoding/decode byte-buffer)]
(if (zero? value)
nil
[value (encoding/decode byte-buffer)]))))
(defn initialize []
(swap!
context
assoc
:database
(with-open [rdr (clojure.java.io/reader "resources/backwards-database.bin")]
(into {} (map read-string (line-seq rdr)))))
(swap!
context
assoc
:trie
(tpt/load-tightly-packed-trie-from-file
"resources/dark-corpus-backwards-tpt.bin"
(decode-fn (@context :database))))
(swap!
context
assoc
:perfect-rhyme-trie
(transduce
(comp
(map first)
(filter string?)
(map #(vector % (reverse (word->phones %))))
(map reverse))
(completing
(fn [trie [k v]]
(update trie k (fnil #(update % 1 inc) [v 0]))))
(trie/make-trie)
(@context :database)))
(swap!
context
assoc
:rhyme-trie
(transduce
(comp
(map first)
(filter string?)
(map #(vector % (reverse (word->phones %))))
(map reverse))
(completing
(fn [trie [k v]]
(update trie k (fnil #(update % 1 inc) [v 0]))))
(trie/make-trie)
(@context :database)))
(swap!
context
assoc
:flex-rhyme-trie
(transduce
(comp
(map (fn [[k v]]
[(string/join " " (map (@context :database) k))
[k v]]))
(map (fn [[phrase [k v]]]
[(phrase->flex-rhyme-phones phrase)
[k v]])))
(completing
(fn [trie [k v]]
(update trie k (fnil conj [v]) v)))
(trie/make-trie)
(->> (trie/children-at-depth (@context :trie) 0 1))))
nil)
(comment
(time (initialize))
(println 2)
(take 5 (:flex-rhyme-trie @context))
)
(defn choose-next-word
"Given an n-gram of [[word1 freq1] [word2 freq2]] chooses
the next word based on markov data in trie.
Could be improved by taking into account grammar and/or bidirectional context.
The n-gram parameter is a list of trie entries
For trie entries that are word/frequency pairs, it might look something like this.
`[[sunshine 38] [</s> 509]]`
But note that nothing in this function uses the frequency count from the passed in n-gram.
It's just easier for the calling functions to pass them in like that."
[{:keys [database trie] :as context} n-gram]
(let [n-gram-ids (->> n-gram (map first) (map database))
node (trie/lookup trie n-gram-ids)]
(cond
(= 0 (count n-gram-ids))
(let [children (->> (trie/children trie)
(map #(get % [])))
choice (math/weighted-selection second children)]
[(database (first choice)) (second choice)])
node
(let [children (->> (trie/children node)
(map #(get % []))
(remove (fn [[id f]] (= id (first n-gram-ids)))))]
(if (seq children)
(let [children-freqs (into (sorted-map) (frequencies (map second children)))
n-minus-1-gram-odds (/ (second (first children-freqs))
(+ (second (get node []))
(second (first children-freqs))))
;; Good-turing smoothing, take unseen ngram?
take-n-minus-1-gram? (and (< 1 (count n-gram-ids))
(< (rand) n-minus-1-gram-odds))]
(if take-n-minus-1-gram?
(choose-next-word context (butlast n-gram))
(let [choice (math/weighted-selection second children)]
[(database (first choice)) (second choice)])))
(choose-next-word context (butlast n-gram))))
:else
(choose-next-word context (butlast n-gram)))))
(defn valid-sentence? [phrase]
(->> phrase
(map first)
(string/join " ")
(#(string/replace % #"(<s>|</s>)" ""))
(nlp/valid-sentence?)))
(defn generate-sentence-backwards
"Given a phrase of [w1 w2 w3] generates a sentence
using a backwards markov."
([{:keys [database trie] :as context} phrase]
(let [phrase (map (fn [w]
(let [id (database w)]
[w (second (get trie [id]))]))
phrase)]
(loop [phrase' (loop [phrase phrase]
(if (= "<s>" (first (first phrase)))
phrase
(recur (cons (choose-next-word context (take 3 phrase))
phrase))))]
(if (valid-sentence? phrase')
phrase'
(recur (loop [phrase phrase]
(if (= "<s>" (first (first phrase)))
phrase
(recur (cons (choose-next-word context (take 3 phrase))
phrase)))))))))
)
(comment
(take 5 (:database @context))
(map (:database @context) ["me" "bother"])
(map (:database @context) ["bother me"])
(first
(filter
valid-sentence?
(repeatedly
(fn []
(generate-sentence-backwards
@context
["bother" "me" "</s>"])))))
(keys @context)
(time (initialize))
)
(defn rhyme-from-scheme-2
"Generate rhyme without the use of `weighted-selection/adjust-for-markov`."
[])

@ -6,7 +6,6 @@
[com.owoga.phonetics.syllabify :as owoga.syllabify] [com.owoga.phonetics.syllabify :as owoga.syllabify]
[com.owoga.phonetics :as owoga.phonetics] [com.owoga.phonetics :as owoga.phonetics]
[com.owoga.tightly-packed-trie.encoding :as encoding] [com.owoga.tightly-packed-trie.encoding :as encoding]
[examples.core :as examples]
[taoensso.nippy :as nippy] [taoensso.nippy :as nippy]
[com.owoga.prhyme.nlp.core :as nlp] [com.owoga.prhyme.nlp.core :as nlp]
[examples.tpt :as examples.tpt] [examples.tpt :as examples.tpt]
@ -18,6 +17,78 @@
[com.owoga.tightly-packed-trie :as tpt] [com.owoga.tightly-packed-trie :as tpt]
[com.owoga.prhyme.nlp.tag-sets.treebank-ii :as tb2])) [com.owoga.prhyme.nlp.tag-sets.treebank-ii :as tb2]))
;;;; Utilities
;;
;;
(defn phrase->flex-rhyme-phones
"Takes a space-seperated string of words
and returns the concatenation of the words
vowel phones.
Returns them in reversed order so they
are ready to be used in a lookup of a rhyme trie.
"
[phrase]
(->> phrase
(#(string/split % #" "))
(map (comp owoga.syllabify/syllabify first owoga.phonetics/get-phones))
(map (partial reduce into []))
(map #(filter (partial re-find #"\d") %))
(flatten)
(map #(string/replace % #"\d" ""))
(reverse)))
(defn take-words-amounting-to-at-least-n-syllables
"This function is nice to grab the tail end of a sentence for making a good rhyme.
If the sentence ends with a single-syllable word, like 'me', but a more
interesting n-gram like 'bother me', then you might want to explore the rhymes
available for the last N syllables. Sure, a word like 'poverty' would show up if you
got all rhymes for 'me'. But you'd have to filter through a lot more less-than-great
rhymes before you see it."
[phrase n]
(letfn [(phones [word]
[word (first (owoga.phonetics/get-phones word))])
(syllables [[word phones]]
[word (owoga.syllabify/syllabify phones)])]
(->> phrase
(#(string/split % #" "))
(map phones)
(map syllables)
(reduce
(fn [result [word syllables]]
(if (<= n (count (mapcat second result)))
(reduced result)
(conj result [word syllables])))
[])
(map first)
(string/join " "))))
(comment
(take-words-amounting-to-at-least-n-syllables
"police can bother me" 3);; => "police can"
(take-words-amounting-to-at-least-n-syllables
"police can bother me" 4);; => "police can bother"
)
(defn take-n-syllables
"Returns the vowel sounds that make up the last n syllables.
Doesn't return stress."
[phrase n]
(if (string? phrase)
(->> phrase
(phrase->flex-rhyme-phones)
(take n)
(reverse))
(take-last n phrase)))
(comment
(take-n-syllables "bother me" 2);; => ("ER" "IY")
)
;;;; Much of the code below is related to grammar generation.
(defn update-values [m f & args] (defn update-values [m f & args]
(reduce (reduce
(fn [acc [k v]] (fn [acc [k v]]
@ -298,27 +369,6 @@
(recur (next parse-zipper))))) (recur (next parse-zipper)))))
(comment (comment
(let [structure '(TOP (S (NP (DT) (JJ) (NN))
(VP (RB) (VBZ))
(NP (DT) (JJ) (NN))))
structure (-> structure
zip/seq-zip
nlp/iter-zip
last)
pos-freqs (examples/pos-paths->pos-freqs
examples/t1)]
(repeatedly
10
(fn []
(->> (generate-with-markov-with-custom-progression-n-2-pos-freqs
zip/prev
zip/next
nil?
zip/end?
examples/pos-freqs-data-2
structure
examples/darkov-2)))))
(timbre/set-level! :info) (timbre/set-level! :info)
(timbre/set-level! :error) (timbre/set-level! :error)
@ -334,67 +384,6 @@
pos-path->word-freqs pos-path->word-freqs
pos->word-freqs pos->word-freqs
target-parse-tree))) target-parse-tree)))
(time (def example-pos-freqs examples/example-pos-freqs))
(nippy/thaw)
(nippy/freeze-to-file "resources/1000-pos-path-freqs.nip" example-pos-freqs)
(time (def example-structures examples/example-structures))
(weighted-rand/weighted-selection-from-map
example-structures)
(take 5 examples/t2)
(let [structure (weighted-rand/weighted-selection-from-map
examples/popular-structure-freq-data)
structure (-> structure
zip/seq-zip
nlp/iter-zip
last)
pos-freqs examples/pos-freqs-data-2]
(repeatedly
10
(fn []
(->> (generate-with-markov-with-custom-progression-n-2-pos-freqs
zip/prev
zip/next
nil?
zip/end?
pos-freqs
structure
examples/darkov-2)
nlp/leaf-nodes
(string/join " ")))))
(repeatedly
10
(fn []
(let [structure (weighted-rand/weighted-selection-from-map
(->> examples/t2
(sort-by second)
(reverse)
(take 20)))
structure (-> structure
zip/seq-zip
nlp/iter-zip
last)
pos-freqs (examples/pos-paths->pos-freqs
examples/t1)]
(repeatedly
10
(fn []
(->> (generate-with-markov-with-custom-progression
zip/prev
zip/next
nil?
zip/end?
examples/t1
pos-freqs
structure
examples/darkov-2)
nlp/leaf-nodes
(string/join " ")))))))
) )
@ -442,12 +431,15 @@
(update trie k (fnil inc 0))) (update trie k (fnil inc 0)))
trie trie
entries))) entries)))
#_(trie/make-trie) (trie/make-trie)
test-trie
(->> texts (->> texts
(drop 4000) (drop 4000)
(take 4000))))) (take 4000)))))
(def test-trie (into (trie/make-trie) (nippy/thaw-from-file "resources/grammar-trie-take-8000.bin")))
(def test-database (atom (nippy/thaw-from-file "resources/grammar-database-take-8000.bin")))
) )
(defn children (defn children
@ -531,25 +523,63 @@
(nippy/freeze-to-file "resources/grammar-database-take-8000.bin" @test-database) (nippy/freeze-to-file "resources/grammar-database-take-8000.bin" @test-database)
(->> (take 20 test-trie)
(map (comp (partial map @test-database) first)))
(->> (take 20 (reverse (sort-by second test-trie)))
(map (fn [[a b]]
[(map @test-database a) b])))
;; A sampling of the words that have been seen in the [TOP S NP NN] position.
(->> (trie/lookup test-trie (map @test-database '[TOP S NP NN]))
(map (comp @test-database first first))
(drop 100)
(take 5))
;; => ("sink" "lose" "deep" "well" "help")
) )
(defn phrase->flex-rhyme-phones (defn zipper-last
"Takes a space-seperated string of words [zipper]
and returns the concatenation of the words (->> zipper
vowel phones. (iterate zip/next)
(take-while (complement zip/end?))
last))
Returns them in reversed order so they (defn previous-leaf-loc
are ready to be used in a lookup of a rhyme trie. [zipper]
" (->> zipper
[phrase] (iterate zip/prev)
(->> phrase (take-while (complement nil?))
(#(string/split % #" ")) (filter #(and (symbol? (zip/node %))
(map (comp owoga.syllabify/syllabify first owoga.phonetics/get-phones)) (zip/up %)
(map (partial reduce into [])) (= 1 (count (zip/node (zip/up %))))))
(map #(filter (partial re-find #"\d") %)) (first)))
(flatten)
(map #(string/replace % #"\d" "")) (defn previous-leaf-part-of-speech
(reverse))) [zipper]
(->> zipper
previous-leaf-loc
(zip/path)
(map first)
(filter symbol?)))
(defn nearest-ancestor-phrase
[loc]
(->> loc
(iterate zip/prev)
(take-while (complement nil?))
(filter (comp tb2/phrases zip/node))
(first)))
(comment
(nearest-ancestor-phrase
(->> (zip/vector-zip
'[NP [NN]])
zip/down
zip/right
zip/down)))
(defn markov-generate-grammar-with-rhyming-tail (defn markov-generate-grammar-with-rhyming-tail
[grammar-trie grammar-database rhyme-trie rhyme-database rhyme-target zipper] [grammar-trie grammar-database rhyme-trie rhyme-database rhyme-target zipper]
@ -645,6 +675,11 @@
@test-database @test-database
(zip/vector-zip [1])) (zip/vector-zip [1]))
(def test-sentence (markov-generate-sentence
test-trie
@test-database
(zip/vector-zip [1])))
(repeatedly (repeatedly
20 20
#(->> (generate test-trie @test-database (zip/vector-zip [1])) #(->> (generate test-trie @test-database (zip/vector-zip [1]))
@ -666,13 +701,6 @@
(zip/root (apply-fn loc))) (zip/root (apply-fn loc)))
(recur (next-fn (apply-fn loc)))))) (recur (next-fn (apply-fn loc))))))
(defn zipper-last
[zipper]
(->> zipper
(iterate zip/next)
(take-while (complement zip/end?))
last))
(defn decode-fn (defn decode-fn
"Decodes a variable-length encoded number from a byte-buffer. "Decodes a variable-length encoded number from a byte-buffer.
Zero gets decoded to nil." Zero gets decoded to nil."
@ -688,6 +716,13 @@
(filter (complement zip/branch?)) (filter (complement zip/branch?))
(map zip/node))) (map zip/node)))
(def tpt (tpt/load-tightly-packed-trie-from-file
(io/resource "dark-corpus-4-gram-backwards-tpt.bin")
decode-fn))
(def tpt-db (nippy/thaw-from-file (io/resource "dark-corpus-4-gram-backwards-db.bin")))
(defn choose-with-n-gram-markov (defn choose-with-n-gram-markov
"Hard-coded to work with 4-gram. That's the </s> at the end." "Hard-coded to work with 4-gram. That's the </s> at the end."
[zipper [zipper
@ -735,39 +770,7 @@
choice] choice]
(first choice))) (first choice)))
(defn previous-leaf-loc
[zipper]
(->> zipper
(iterate zip/prev)
(take-while (complement nil?))
(filter #(and (symbol? (zip/node %))
(zip/up %)
(= 1 (count (zip/node (zip/up %))))))
(first)))
(defn previous-leaf-part-of-speech
[zipper]
(->> zipper
previous-leaf-loc
(zip/path)
(map first)
(filter symbol?)))
(defn nearest-ancestor-phrase
[loc]
(->> loc
(iterate zip/prev)
(take-while (complement nil?))
(filter (comp tb2/phrases zip/node))
(first)))
(comment
(nearest-ancestor-phrase
(->> (zip/vector-zip
'[NP [NN]])
zip/down
zip/right
zip/down)))
(comment (comment
;; Working backwards from a completed grammar tree that has ;; Working backwards from a completed grammar tree that has
@ -796,8 +799,45 @@
(trie/lookup tpt) (trie/lookup tpt)
(trie/children) (trie/children)
(map #(vector (tpt-db (.key %)) (get % []))))] (map #(vector (tpt-db (.key %)) (get % []))))]
(choose-with-n-gram-markov [(zip/node loc)
loc test-trie @test-database tpt tpt-db)) prev-pos
prev-pos'
n-gram
n-gram'
(choose-with-n-gram-markov
loc test-trie @test-database tpt tpt-db)])
(let [zipper (zip/vector-zip
'[[TOP
[[VP
[[[VBN]]
[PP [[[TO]] [NP [[[NN]]]]]]
[PP [[[IN ["into"]]] [NP [[[PRP$ ["my"]]] [[NNS ["answers"]]]]]]]]]]]])
loc (->> zipper
(iterate zip/next)
(filter #(= "into" (zip/node %)))
(first))
prev-pos (previous-leaf-part-of-speech loc)
prev-pos' (map @test-database prev-pos)
n-gram (filter string? (rest-leafs loc))
n-gram' (mapv tpt-db n-gram)
grammar-children (->> (children test-trie @test-database prev-pos')
(map first)
(map @test-database))
n-gram-children (->> n-gram'
(take 2)
(reverse)
(trie/lookup tpt)
(trie/children)
(map #(vector (tpt-db (.key %)) (get % []))))]
[(zip/node loc)
prev-pos
prev-pos'
n-gram
n-gram'
(choose-with-n-gram-markov
loc test-trie @test-database tpt tpt-db)])
(let [zipper (zip/vector-zip (let [zipper (zip/vector-zip
'[[TOP '[[TOP
@ -840,6 +880,7 @@
choice])) choice]))
(trie/lookup test-trie [1 59 3 5 5 17]) (trie/lookup test-trie [1 59 3 5 5 17])
(@test-database 1911) (@test-database 1911)
(def tpt (tpt/load-tightly-packed-trie-from-file (def tpt (tpt/load-tightly-packed-trie-from-file
@ -847,6 +888,7 @@
decode-fn)) decode-fn))
(def tpt-db (nippy/thaw-from-file (io/resource "dark-corpus-4-gram-backwards-db.bin"))) (def tpt-db (nippy/thaw-from-file (io/resource "dark-corpus-4-gram-backwards-db.bin")))
(markov-generate-grammar test-trie @test-database (zip/vector-zip [1])) (markov-generate-grammar test-trie @test-database (zip/vector-zip [1]))
(-> (markov-generate-grammar test-trie @test-database (zip/vector-zip [1])) (-> (markov-generate-grammar test-trie @test-database (zip/vector-zip [1]))
@ -1044,6 +1086,7 @@
(comment (comment
(generate-grammar-from [[NN ["taylor"]]]) (generate-grammar-from [[NN ["taylor"]]])
(map (comp (partial map @test-database) first) (take 5 test-trie)) (map (comp (partial map @test-database) first) (take 5 test-trie))
(@test-database 1) (@test-database 1)

@ -15,7 +15,11 @@
ParserFactory) ParserFactory)
(opennlp.tools.cmdline.parser ParserTool))) (opennlp.tools.cmdline.parser ParserTool)))
(comment tb2/phrases) (comment
tb2/phrases
(.exists (io/file (io/resource "models/en-token.bin")))
)
(def tokenize (nlp/make-tokenizer (io/resource "models/en-token.bin"))) (def tokenize (nlp/make-tokenizer (io/resource "models/en-token.bin")))
(def get-sentences (nlp/make-sentence-detector (io/resource "models/en-sent.bin"))) (def get-sentences (nlp/make-sentence-detector (io/resource "models/en-sent.bin")))
(def parse (tb/make-treebank-parser (io/resource "models/en-parser-chunking.bin"))) (def parse (tb/make-treebank-parser (io/resource "models/en-parser-chunking.bin")))
@ -68,8 +72,8 @@
(ParserFactory/create (ParserFactory/create
(ParserModel. (ParserModel.
(io/input-stream (io/resource "models/en-parser-chunking.bin"))) (io/input-stream (io/resource "models/en-parser-chunking.bin")))
5 50
0.95)) 0.90))
(defn parse-probs [parses] (defn parse-probs [parses]
(map #(.getProb %) parses)) (map #(.getProb %) parses))
@ -86,10 +90,32 @@
(- (Math/log 0.001) (Math/log 0.01)) (- (Math/log 0.001) (Math/log 0.01))
(Math/E) (Math/E)
(tokenize "Eric's testing.") (tokenize "Eric's testing.")
(Math/log 0.9999)
(Math/pow Math/E -0.5)
(let [results (StringBuffer.) (let [results (StringBuffer.)
parses (ParserTool/parseLine "The dog ran fast ." custom-parser 1)] parses (ParserTool/parseLine "The dog ran fast ." custom-parser 1)]
((juxt parse-probs parse-strs) parses)) ((juxt parse-probs parse-strs) parses))
(let [results (StringBuffer.)
parses (ParserTool/parseLine "Eric is testing." custom-parser 1)]
[((juxt parse-probs parse-strs) parses)
(count parses)])
(let [results (StringBuffer.)
parses (ParserTool/parseLine "Eric is testing." custom-parser 2)]
[((juxt parse-probs parse-strs) parses)
(count parses)])
(let [results (StringBuffer.)
parses (ParserTool/parseLine "This is a good day." custom-parser 1)]
[((juxt parse-probs parse-strs) parses)
(count parses)])
(let [results (StringBuffer.)
parses (ParserTool/parseLine "The do dog run drive a." custom-parser 1)]
((juxt parse-probs parse-strs) parses))
(let [results (StringBuffer.) (let [results (StringBuffer.)
parses (ParserTool/parseLine "Eric 's testing ." custom-parser 1)] parses (ParserTool/parseLine "Eric 's testing ." custom-parser 1)]
(meta parses)) (meta parses))
@ -146,6 +172,12 @@
tb2/clauses tb2/clauses
boolean)) boolean))
(comment
(->> "the lazy fox"
vector
parse)
)
(defn unmake-tree (defn unmake-tree
"Tokenizing and then parsing a sentence returns a string "Tokenizing and then parsing a sentence returns a string
representation of the parse tree. This is a helper function representation of the parse tree. This is a helper function

@ -27,7 +27,10 @@
(map str (.getPhones cmu-lexicon "two" nil))) (map str (.getPhones cmu-lexicon "two" nil)))
(defn get-phones (defn get-phones
"String must be lowercase." "Gets phones for known or unknonwn words from the CMULexicon,
removes the stress, and converts them to a format that matches the CMU Sphinx
dictionary (capitalizes and replaces 'ax' with 'ah') String must be
lowercase."
[word] [word]
(->> (map str (.getPhones cmu-lexicon word nil)) (->> (map str (.getPhones cmu-lexicon word nil))
(map remove-stress) (map remove-stress)
@ -35,9 +38,9 @@
(map string/upper-case))) (map string/upper-case)))
(defn get-phones-with-stress (defn get-phones-with-stress
"String must be lowercase. "Same as `get-phones` but leaves stress.
.getPhones only. Note that this might not be the same stress that you'd see
Might be different from stress in cmu-dict" in the CMU pronouncing dictionary."
[word] [word]
(->> (map str (.getPhones cmu-lexicon word nil)) (->> (map str (.getPhones cmu-lexicon word nil))
(map convert-to-sphinx) (map convert-to-sphinx)

Loading…
Cancel
Save