You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1410 lines
41 KiB
Clojure

(ns examples.tpt
(:require [clojure.string :as string]
[clojure.java.io :as io]
[com.owoga.phonetics :as owoga.phonetics]
[com.owoga.phonetics.syllabify :as owoga.syllabify]
[com.owoga.prhyme.core :as prhyme]
[com.owoga.prhyme.nlp.core :as nlp]
[taoensso.tufte :as tufte :refer (defnp p profiled profile)]
[com.owoga.trie :as trie]
[com.owoga.tightly-packed-trie :as tpt]
[com.owoga.trie.math :as math]
[com.owoga.tightly-packed-trie.encoding :as encoding]
[com.owoga.prhyme.util :as util]
[com.owoga.prhyme.data.dictionary :as dict]
[clojure.zip :as zip]
[cljol.dig9 :as d]
[com.owoga.prhyme.data.phonetics :as phonetics]
[com.owoga.prhyme.syllabify :as syllabify]
[taoensso.nippy :as nippy]))
(set! *warn-on-reflection* true)
(tufte/add-basic-println-handler! {})
(defn xf-file-seq [start end]
(comp (remove #(.isDirectory %))
(drop start)
(take end)))
(def re-word
"Regex for tokenizing a string into words
(including contractions and hyphenations),
commas, periods, and newlines."
#"(?s).*?([a-zA-Z\d]+(?:['\-]?[a-zA-Z]+)?|,|\.|\n)")
(defn pad-tokens
"Pads the beginning with n - 1 <s> tokens and
the end with 1 </s> token."
[tokens n]
(vec (concat (vec (repeat (max 1 (dec n)) "<s>")) tokens ["</s>"])))
(defn tokenize-line
[line]
(->> line
(string/trim)
(re-seq re-word)
(mapv second)
(mapv string/lower-case)))
(defn text->ngrams
"Takes text from a file, including newlines.
Pads lines with <s> and </s> for start/end of line.
Pads beginning with n - 1 <s>s"
[text n]
(->> text
util/clean-text
(#(string/split % #"\n+"))
(remove empty?)
(mapv tokenize-line)
(mapv #(pad-tokens % n))
(mapv #(partition n 1 %))
(mapv #(mapv vec %))
(reduce #(into %1 %2) [])))
(defn text->backwards-ngrams
"Takes text from a file, including newlines.
Pads lines with <s> and </s> for start/end of line.
Pads beginning with n - 1 <s>s"
[text n]
(->> text
util/clean-text
(#(string/split % #"\n+"))
(remove empty?)
(mapv tokenize-line)
(mapv #(pad-tokens % n))
reverse
(mapv reverse)
(mapv #(partition n 1 %))
(mapv #(mapv vec %))
(reduce #(into %1 %2) [])))
(defn n-to-m-grams
"Exclusive of m, similar to range."
[n m text]
(loop [i n
r []]
(cond
(= i m)
r
:else
(recur (inc i)
(into r (text->ngrams text i))))))
(defn n-to-m-backwards-grams
"Exclusive of m, similar to range."
[n m text]
(loop [i n
r []]
(cond
(= i m)
r
:else
(recur (inc i)
(into r (text->backwards-ngrams text i))))))
(declare ->TrieKey)
(deftype TrieKey [key]
clojure.lang.IPersistentStack
(peek [self]
(let [x (last (seq self))]
(if (.equals "" x)
nil
(Integer/parseInt x))))
(pop [self]
(TrieKey. (string/replace key #"(.*):.*$" "$1")))
clojure.lang.ISeq
(first [self]
(let [x (first (seq self))]
(if (.equals x "")
nil
(Integer/parseInt x))))
(next [self]
(TrieKey. (string/replace key #".*?:(.*)" "$1")))
(more [self]
(let [xs (string/split key #":")]
(if (.equals xs "") '() (into (->TrieKey "") (rest xs)))))
(cons [self o]
(TrieKey.
(cond
(.equals key "") ":"
(.equals key ":") (str key o)
:else (str key ":" o))))
clojure.lang.IPersistentCollection
(count [self]
(count (seq self)))
(empty [self]
(TrieKey. ""))
(equiv [self o]
(.equals self o))
clojure.lang.Seqable
(seq [self]
(if (.equals "" key)
nil
(seq (string/split key #":")))))
(defmethod print-method TrieKey [trie-key ^java.io.Writer w]
(print-method (.key trie-key) w))
(defmethod print-dup TrieKey [trie-key ^java.io.Writer w]
(print-ctor trie-key (fn [o w] (print-dup (.key trie-key) w)) w))
(defn trie-key
([]
(->TrieKey ""))
([coll]
(->TrieKey (string/join ":" coll))))
(def trie-database (atom nil))
(defn stateful-transducer [xf]
(let [trie (volatile! (trie/make-trie))
database (atom {})
next-id (volatile! 1)]
(fn
([] (xf))
([result]
(reset! trie-database @database)
(xf result))
([result input]
(let [ngrams-ids
(mapv
(fn [ngrams]
(mapv
(fn [ngram]
(let [gram-ids (mapv
(fn [gram]
(let [gram-id (get @database gram @next-id)]
(when (.equals gram-id @next-id)
(swap! database
#(-> %
(assoc gram gram-id)
(assoc gram-id gram)))
(vswap! next-id inc))
gram-id))
ngram)
ngram-id (get database gram-ids @next-id)]
gram-ids))
ngrams))
input)]
(vswap!
trie
(fn [trie ngrams-ids]
(reduce
(fn [trie [ngram-ids _]]
(update trie ngram-ids (fnil #(update % 1 inc) [(peek ngram-ids) 0])))
trie
ngrams-ids))
ngrams-ids))))))
(defn prep-ngram-for-trie
"The tpt/trie expects values conjed into an ngram
to be of format '(k1 k2 k3 value)."
[ngram]
(clojure.lang.MapEntry. (vec ngram) ngram))
(defn seq-of-nodes->sorted-by-count
"Sorted first by the rank of the ngram, lowest ranks first.
Sorted second by the frequency of the ngram, highest frequencies first.
This is the order that you'd populate a mapping of keys to IDs."
[trie]
(->> trie
trie/children
(map #(get % []))
(sort-by :count)
reverse))
(comment
(transduce (comp (xf-file-seq 0 10)
(map slurp)
(map (partial n-to-m-grams 1 5))
#_#_(map (fn [ngrams] (map #(prep-ngram-for-trie %) ngrams)))
stateful-transducer)
conj
(file-seq (io/file "dark-corpus")))
(time
(def trie
(transduce (comp (xf-file-seq 0 250000)
(map slurp)
(map (partial n-to-m-grams 1 4))
(map (fn [ngrams] (map #(prep-ngram-for-trie %) ngrams)))
stateful-transducer)
conj
(file-seq (io/file "dark-corpus")))))
(time
(def backwards-trie
(transduce (comp (xf-file-seq 0 1000)
(map slurp)
(map (partial n-to-m-backwards-grams 1 4))
(map (fn [ngrams] (map #(prep-ngram-for-trie %) ngrams)))
stateful-transducer)
conj
(file-seq (io/file "dark-corpus")))))
)
(defn encode-fn [v]
(let [[value count] (if (seqable? v) v [nil nil])]
(if (nil? value)
(encoding/encode 0)
(byte-array
(concat (encoding/encode value)
(encoding/encode count))))))
(defn decode-fn [db]
(fn [byte-buffer]
(let [value (encoding/decode byte-buffer)]
(if (zero? value)
nil
[value (encoding/decode byte-buffer)]))))
(comment
(time
(def tightly-packed-trie
(tpt/tightly-packed-trie
trie
encode-fn
(decode-fn @trie-database))))
(time
(def tightly-packed-backwards-trie
(tpt/tightly-packed-trie
backwards-trie
encode-fn
(decode-fn @trie-database))))
)
(defn key-get-in-tpt [tpt db ks]
(let [id (map #(get-in db [(list %) :id]) ks)
v (get tpt id)]
{id v}))
(defn id-get-in-tpt [tpt db ids]
(let [ks (apply concat (map #(get db %) ids))
v (get tpt ids)
id (get-in db [ks :id])]
{ks (assoc v :value (get db id))}))
(defn clone-consonants [phones]
(map
#(if (phonetics/vowel (string/replace % #"\d" ""))
%
"?")
phones))
(defn word->phones [word]
(or (dict/word->cmu-phones word)
(util/get-phones-with-stress word)))
(defn perfect-rhymes [rhyme-trie phones]
(let [rhyme-suffix (first
(util/take-through
#(= (last %) \1)
(reverse phones)))]
(trie/lookup rhyme-trie rhyme-suffix)))
(defn vowel-rhymes [rhyme-trie phones]
(let [rhyme-suffix (->> (reverse phones)
(clone-consonants)
(util/take-through #(= (last %) \1))
(first))]
(trie/lookup rhyme-trie rhyme-suffix)))
(defn n+1grams [trie k]
(->> (trie/lookup trie k)
(trie/children)
(map #(get % []))))
(defn word->n+1grams [trie database word]
(->> word
database
(#(trie/lookup trie [%]))
trie/children
(map #(get % []))
(map (fn [[id fr]] [(database id) fr]))
(sort-by (comp - #(nth % 1)))
(remove #({"<s>" "</s>"} (nth % 0)))))
(comment
(let [trie (@context :trie)
db (@context :database)]
(word->n+1grams trie db "technology"))
)
(defn phrase->phones [phrase]
(let [words (string/split phrase #"[ -]")]
(->> words
(map word->phones)
(map syllabify/syllabify))))
(defn syllabify-with-stress [word]
(let [phones (word->phones word)
phones-without-stress (map #(string/replace % #"\d" "") phones)
syllables (first (owoga.syllabify/syllabify phones-without-stress))]
(loop [phones phones
syllables syllables
result [[]]]
(cond
(empty? syllables)
(map seq (pop result))
(empty? (first syllables))
(recur
phones
(rest syllables)
(conj result []))
:else
(recur
(rest phones)
(cons (rest (first syllables))
(rest syllables))
(conj (pop result)
(conj (peek result) (first phones))))))))
(defn syllabify-phrase-with-stress [phrase]
(reduce
into
[]
(map
(comp owoga.syllabify/syllabify
first
owoga.phonetics/get-phones)
(string/split phrase #"[ -]"))))
(comment
(syllabify-phrase-with-stress "bother me")
(word->phones "bother me")
(map (comp owoga.syllabify/syllabify first owoga.phonetics/get-phones) ["bother" "me"])
[(syllabify-phrase-with-stress "on poverty")
(syllabify-phrase-with-stress "can bother me")]
)
(defn phrase->flex-rhyme-phones
"Takes a space-seperated string of words
and returns the concatenation of the words
vowel phones.
Returns them in reversed order so they
are ready to be used in a lookup of a rhyme trie.
"
[phrase]
(->> phrase
(#(string/split % #" "))
(map (comp owoga.syllabify/syllabify first owoga.phonetics/get-phones))
(map (partial reduce into []))
(map #(filter (partial re-find #"\d") %))
(flatten)
(map #(string/replace % #"\d" ""))
(reverse)))
(comment
(phrase->flex-rhyme-phones "bother hello")
;; => ("OW" "AH" "ER" "AA")
)
(defonce context (atom {}))
(defn initialize []
(swap!
context
assoc
:database
(with-open [rdr (clojure.java.io/reader "resources/backwards-database.bin")]
(into {} (map read-string (line-seq rdr)))))
(swap!
context
assoc
:trie
(tpt/load-tightly-packed-trie-from-file
"resources/dark-corpus-backwards-tpt.bin"
(decode-fn (@context :database))))
(swap!
context
assoc
:perfect-rhyme-trie
(transduce
(comp
(map first)
(filter string?)
(map #(vector % (reverse (word->phones %))))
(map reverse))
(completing
(fn [trie [k v]]
(update trie k (fnil #(update % 1 inc) [v 0]))))
(trie/make-trie)
(@context :database)))
(swap!
context
assoc
:rhyme-trie
(transduce
(comp
(map first)
(filter string?)
(map #(vector % (reverse (word->phones %))))
(map reverse))
(completing
(fn [trie [k v]]
(update trie k (fnil #(update % 1 inc) [v 0]))))
(trie/make-trie)
(@context :database)))
(swap!
context
assoc
:flex-rhyme-trie
(transduce
(comp
(map first)
(filter string?)
(map #(vector (reverse (phrase->flex-rhyme-phones %)) %)))
(completing
(fn [trie [k v]]
(update trie k (fnil conj [v]) v)))
(trie/make-trie)
(@context :database)))
nil)
;; From a tightly-packed-trie and a database, build a trie
;; of phones of n-grams
(comment
(do
(time
(swap!
context
assoc
:flex-rhyme-trie
(transduce
(comp
(map (fn [[k v]]
[(string/join " " (map (@context :database) k))
[k v]]))
(map (fn [[phrase [k v]]]
[(phrase->flex-rhyme-phones phrase)
[k v]])))
(completing
(fn [trie [k v]]
(update trie k (fnil conj [v]) v)))
(trie/make-trie)
(->> (trie/children-at-depth (@context :trie) 0 1)))))
nil)
(take 5 (@context :flex-rhyme-trie))
)
(comment
(->> (get (@context :flex-rhyme-trie) ["EH" "OW" "IY" "EH"])
(take 20)
(map first)
(map (partial map (@context :database))))
(trie/children (trie/lookup (@context :trie) [13393]))
((@context :database) "desk") ;; => 13393
((@context :database) "wobbly") ;; => 152750
(get (@context :trie) [13393 152750]))
(defn rhyme-choices
[{:keys [rhyme-trie database] :as context} phrase]
(if (string? phrase)
(let [phones (phrase->phones phrase)]
(get rhyme-trie phones))
(get rhyme-trie phrase)))
(comment
(get (:rhyme-trie @context) (phrase->phones "fall"))
(->> (rhyme-choices
@context
"fall")
#_(map (comp (:database @context) first first)))
(take 20 (:flex-rhyme-trie @context))
(take 20 (:rhyme-trie @context))
)
(defn exclude-non-rhymes-from-choices
"Removes any choice that includes the last
word of the rhyming phrase as the last word of the choice.
Also removes beginning and end of sentence markers (1 and 38 in the database)."
[{:keys [database]} phrase choices]
(if (string? phrase)
(let [word-id (database (last (string/split phrase #" ")))]
(remove
(fn [child]
(or (= ((comp first second) child) word-id)
(#{1 38} ((comp first first) child))))
choices))
(remove
(fn [child] (#{1 38} ((comp first first) child)))
choices)))
(defn exclude-non-english-phrases-from-choices
[{:keys [database]} choices]
(filter
(fn [choice]
(->> (first choice)
(map database)
(every? dict/cmu-with-stress-map)))
choices))
(defn weighted-selection-from-choices
[choices]
(math/weighted-selection
(comp second second)
choices))
(defn choice->n-gram
[{:keys [database]} choice]
(map database (first choice)))
(defn generate-rhyming-n-gram
[phrase]
(->> (rhyme-choices @context phrase)
(exclude-non-rhymes-from-choices @context phrase)
(weighted-selection-from-choices)
(choice->n-gram @context)))
(defn get-flex-rhyme
"Gets from a rhyme-trie a rhyming n-gram based on the
weighted selection from their frequencies."
[{:keys [flex-rhyme-trie database] :as context} phrase]
(if (string? phrase)
(let [phones (phrase->flex-rhyme-phones phrase)
;; Exclude the last word. Don't rhyme kodak with kodak.
word-id (database (first (string/split phrase #" ")))
choices (remove
(fn [child]
(= (first child) word-id))
(get flex-rhyme-trie phones))
choice (math/weighted-selection
(comp second second)
choices)]
(map database (first choice)))
(let [phones phrase
choices (get flex-rhyme-trie phones)
choice (math/weighted-selection
(comp second second)
choices)]
(map database (first choice)))))
(comment
(get-flex-rhyme @context "bother me")
(phrase->flex-rhyme-phones "bother me")
(get-flex-rhyme @context ["IY" "ER" "AA"])
)
(defn get-next-markov
[{:keys [trie database] :as context} seed]
(let [seed (take-last 3 seed)
node (trie/lookup trie seed)
children (and node
(->> node
trie/children
(map (fn [^com.owoga.trie.ITrie child]
[(.key child)
(get child [])]))
(remove (comp nil? second))
(remove
(fn [[k v]]
(#{1 38} k)))))]
(cond
(nil? node) (recur context (rest seed))
(seq children)
(if (< (rand) (/ (apply max (map (comp second second) children))
(apply + (map (comp second second) children))))
(recur context (rest seed))
(first (math/weighted-selection (comp second second) children)))
(> (count seed) 0)
(recur context (rest seed))
:else (throw (Exception. "Error")))))
(defn get-next-markov-from-phrase-backwards
[{:keys [database trie] :as context} phrase n]
(let [word-ids (->> phrase
(#(string/split % #" "))
(take n)
(reverse)
(map database))]
(database (get-next-markov context word-ids))))
(comment
(initialize)
(get (@context :database) "</s>")
(keys @context)
((@context :database) (get-next-markov @context [1]))
(get-next-markov @context [222])
(get-next-markov-from-phrase-backwards @context "will strike you down" 3)
(get (@context :database) 7982)
)
(defn ids->words
[{:keys [database] :as context} ids]
(map database ids))
(defn words->syllables
[words]
(->> words
(string/join " ")
(reverse (phrase->flex-rhyme-phones))))
(defn generate-sentence-with-n-words
[{:keys [database] :as context} seed n]
(loop [seed seed]
(if (>= (dec n) (count seed))
(recur (conj seed (get-next-markov context seed)))
(map database seed))))
(defn take-words-amounting-to-at-least-n-syllables
"This function is nice to grab the tail end of a sentence for making a good rhyme.
If the sentence ends with a single-syllable word, like 'me', but a more
interesting n-gram like 'bother me', then you might want to explore the rhymes
available for the last N syllables. Sure, a word like 'poverty' would show up if you
got all rhymes for 'me'. But you'd have to filter through a lot more less-than-great
rhymes before you see it."
[phrase n]
(letfn [(phones [word]
[word (first (owoga.phonetics/get-phones word))])
(syllables [[word phones]]
[word (owoga.syllabify/syllabify phones)])]
(->> phrase
(#(string/split % #" "))
(map phones)
(map syllables)
(reduce
(fn [result [word syllables]]
(if (<= n (count (mapcat second result)))
(reduced result)
(conj result [word syllables])))
[])
(map first)
(string/join " "))))
(comment
(take-words-amounting-to-at-least-n-syllables
"police can bother me" 3);; => "police can"
(take-words-amounting-to-at-least-n-syllables
"police can bother me" 4);; => "police can bother"
)
(defn take-n-syllables
"Returns the vowel sounds that make up the last n syllables.
Doesn't return stress."
[phrase n]
(if (string? phrase)
(->> phrase
(phrase->flex-rhyme-phones)
(take n)
(reverse))
(take-last n phrase)))
(comment
(take-n-syllables "bother me" 2);; => ("ER" "IY")
)
(defn valid-english-sentence?
[phrase]
(let [words (string/split #" " phrase)]
(and (nlp/valid-sentence? phrase)
(every? dict/cmu-with-stress-map words))))
(defn sha256 [text]
(let [digest (java.security.MessageDigest/getInstance "SHA-256")]
(->> (.digest digest (.getBytes text "UTF-8"))
(#(BigInteger. 1 %))
(#(.toString % 16)))))
(defn syllable-count-phrase
[phrase]
(->> phrase
(#(string/split % #" "))
(map owoga.phonetics/get-phones)
(map first)
(mapcat owoga.syllabify/syllabify)
count))
(defn rhyming-n-gram-choices
[context target-rhyme]
(loop [target-rhyme target-rhyme]
(let [context @context
choices (->> target-rhyme
(rhyme-choices context)
(exclude-non-rhymes-from-choices context target-rhyme)
(exclude-non-english-phrases-from-choices context))]
(if (empty? choices)
(recur (if (string? target-rhyme)
(butlast (phrase->flex-rhyme-phones target-rhyme))
(butlast target-rhyme)))
choices))))
(comment
(->> (rhyming-n-gram-choices context "fall")
(map (comp (@context :database) first first)))
(rhyme-choices @context "tall")
)
(defn generate-n-syllable-sentence-rhyming-with
[context target-phrase n-gram-rank target-rhyme-syllable-count target-sentence-syllable-count]
(if (string? target-phrase)
(let [target-phrase-words (string/split target-phrase #" ")
reversed-target-phrase (string/join " " (reverse target-phrase-words))
target-rhyme
(->> (take-words-amounting-to-at-least-n-syllables
reversed-target-phrase
target-rhyme-syllable-count)
(#(string/split % #" "))
reverse
(string/join " "))
rhyming-n-gram (->> (rhyming-n-gram-choices context target-rhyme)
(weighted-selection-from-choices)
(choice->n-gram context)
(string/join " "))]
(loop [phrase rhyming-n-gram]
(if (<= target-sentence-syllable-count (syllable-count-phrase phrase))
phrase
(recur
(str (get-next-markov-from-phrase-backwards context phrase n-gram-rank)
" "
phrase)))))
(let [target-rhyme
(->> (take-n-syllables target-phrase target-rhyme-syllable-count))
rhyming-n-gram (->> (rhyming-n-gram-choices context target-rhyme)
(weighted-selection-from-choices)
(choice->n-gram context)
(string/join " "))]
(loop [phrase rhyming-n-gram]
(if (<= target-sentence-syllable-count (syllable-count-phrase phrase))
phrase
(recur
(str (get-next-markov-from-phrase-backwards context phrase n-gram-rank)
" "
phrase)))))))
(comment
(generate-n-syllable-sentence-rhyming-with
context
"war on poverty"
3
3
8)
)
(defn generate-haiku
[seed]
(let [haiku (cons
seed
(map
#(generate-n-syllable-sentence-rhyming-with
@context
(take 3 (phrase->flex-rhyme-phones seed))
3 3 %)
[5 3]))]
(lazy-seq
(cons
haiku
(generate-haiku
(last haiku))))))
(comment
(defn valid-haiku [haiku]
(and
(or (every? nlp/valid-sentence? haiku)
(->> haiku
(mapcat #(string/split % #" "))
(every? dict/cmu-with-stress-map)))
(->> haiku
(map #(string/split % #" "))
(map last)
(apply distinct?))))
(println (first (generate-haiku "fall")))
(->> (generate-haiku "</s>")
(filter valid-haiku)
(map (partial string/join "\n"))
(map #(vector % (sha256 %)))
(map (fn [[haiku sha]]
(println haiku)
(println sha)
(println)))
(take 1))
)
(comment
(println
(string/join
"\n"
(map
#(generate-n-syllable-sentence-rhyming-with
@context
(take 3 (phrase->flex-rhyme-phones "player unknown battlegrounds"))
3 4 %)
[9 6 6 9 6 6 9 6 6])))
)
"
another day a battleground
contorts the fragile sound
that no one gives a damn about
what is the chaos all about
have we really been trampled down
cause they'll pay be blasted now
weeping to absent cow
die die lightning all around
calling for santa's now
hours of lifes battleground
just how much more could a man about
trample on and as without
i'm just like so so fragile how
killing and i'll be close damage wow
witness sky is blackened now
"
(defn amulate?
[text]
(let [digest (sha256 text)]
(re-matches #"8{4}" digest)))
(defn continuously-amulate
[seed]
(let [next-sentence (generate-haiku seed)
next-seed (->> next-sentence
(#(string/split % #" "))
(reverse)
(map
(fn [word]
[word (phrase->flex-rhyme-phones word)]))
((fn [word-phones]
(loop [word-phones word-phones
seed []]
(if (< 2 (count (mapcat second seed)))
(string/join
" "
(reverse (map first seed)))
(recur (rest word-phones)
(conj seed (first word-phones))))))))]
(lazy-seq
(cons next-sentence (continuously-amulate next-seed)))))
(comment
(generate-haiku "technology")
(take 5 (continuously-amulate "technology"))
(->> (amul8 "technology" 1)
(map second)
(partition 2 1)
(map
(fn [pair]
(string/join "\n" pair)))
(map #(vector % (sha256 %)))
(map
(fn [[text sha]]
[text sha (re-matches #"8{4}" sha)])))
(dict/cmu-with-stress-map )
(repeatedly
3
#(amulate (reverse ["pleasure" "of" "the" "arcane" "technology"])))
(phrase->flex-rhyme-phones "bother hello")
(phrase->flex-rhyme-phones "snow-covered on")
(get-flex-rhyme @context (reverse ["AA" "ER" "AH" "OW"]))
((@context :database) "<s>")
(get-next-markov @context [1 503])
(take 20
(repeatedly #(reverse (get-flex-rhyme @context
(reverse (phrase->flex-rhyme-phones "technology"))
"technology"))))
(amulate)
(get (@context :database) "</s>")
(get (@context :database) "technology")
(phrase->flex-rhyme-phones "able") ;; => ("EY" "AH")
(phrase->flex-rhyme-phones "away") ;; => ("AH" "EY")
(take 20 (@context :flex-rhyme-trie))
(get-flex-rhyme @context '("AA" "IY" "AE"))
(map #(get (@context :database) %) [1 503])
(time (count (tpt/children-at-depth (@context :trie) 0 2)))
(->> (trie/children-at-depth (@context :flex-rhyme-trie') 0 5)
(take 500))
(trie/children (trie/lookup (@context :flex-rhyme-trie')
(reverse (rest (phrase->flex-rhyme-phones "i love you")))))
(trie/lookup (@context :flex-rhyme-trie') '("IY" "AH" "AA"))
(map (@context :database) '())
(take 5 (@context :flex-rhyme-trie'))
(map #(get (@context :database) %) [21 8953])
(map #(get (@context :database) %) [410 48670])
(get (@context :trie) [1 2 2])
(trie/children (trie/lookup (@context :trie) [1 2]))
(first (@context :trie))
;; 448351
;; 4388527
(time (initialize))
)
(defn flex-rhymes->phrases [flex-rhymes database]
(->> flex-rhymes
(map second)
(map
(fn [rhymes]
(reduce
(fn [acc [k [v fr]]]
(update acc k (fnil #(+ % fr) 0)))
{}
rhymes)))
(map (partial sort-by (comp - second)))
(map
(fn [rhymes]
(map
(fn [[k fr]]
[(map database k) fr])
rhymes)))))
(comment
(->> (trie/lookup
(@context :flex-rhyme-trie3')
(reverse (phrase->flex-rhyme-phones "taylor my dear")))
(#(flex-rhymes->phrases % (@context :database)))
(apply concat)
(sort-by (comp - second))
(remove
(fn [[k fr]]
(or (= 1 (count k))
(= "</s>" (first k))
(= "<s>" (second k))))))
(filter
dict/english?
(flatten
(map #(get % [])
(trie/children
(trie/lookup
(@context :flex-rhyme-trie)
'("IY" "AH" "AA"))))))
(->> (take 5 (drop 500 (@context :flex-rhyme-trie')))
(#(flex-rhymes->phrases % (@context :database))))
(let [key (reverse (phrase->flex-rhyme-phones "technology"))]
[key
(reverse (phrase->flex-rhyme-phones "sociology"))
(get (@context :flex-rhyme-trie) key)
(get (@context :flex-rhyme-trie) (rest key))])
)
(defn find-rhymes
"Takes a rhyme-trie (perfect or vowel only, for example)
and a word. Returns list of rhyming words."
[trie word]
(->> (perfect-rhymes trie (or (dict/cmu-with-stress-map word)
(util/get-phones-with-stress word)))
(map (comp first second))
(remove nil?)
(map (@context :database))
(map #(get (@context :trie) [%]))
(sort-by #(nth % 1))
(reverse)
(map
(fn [[word-id freq]]
[((@context :database) word-id)
freq]))
(remove #(= word (first %)))))
(defn choose-next-word
"Given an n-gram of [[word1 freq1] [word2 freq2]] chooses
the next word based on markov data in trie.
Could be improved by taking into account grammar and/or bidirectional context.
The n-gram parameter is a list of trie entries
For trie entries that are word/frequency pairs, it might look something like this.
`[[sunshine 38] [</s> 509]]`
But note that nothing in this function uses the frequency count from the passed in n-gram.
It's just easier for the calling functions to pass them in like that."
[{:keys [database trie] :as context} n-gram]
(let [n-gram-ids (->> n-gram (map first) (map database))
node (trie/lookup trie n-gram-ids)]
(cond
(= 0 (count n-gram-ids))
(let [children (->> (trie/children trie)
(map #(get % [])))
choice (math/weighted-selection second children)]
[(database (first choice)) (second choice)])
node
(let [children (->> (trie/children node)
(map #(get % []))
(remove (fn [[id f]] (= id (first n-gram-ids)))))]
(if (seq children)
(let [children-freqs (into (sorted-map) (frequencies (map second children)))
n-minus-1-gram-odds (/ (second (first children-freqs))
(+ (second (get node []))
(second (first children-freqs))))
;; Good-turing smoothing, take unseen ngram?
take-n-minus-1-gram? (and (< 1 (count n-gram-ids))
(< (rand) n-minus-1-gram-odds))]
(if take-n-minus-1-gram?
(choose-next-word context (butlast n-gram))
(let [choice (math/weighted-selection second children)]
[(database (first choice)) (second choice)])))
(choose-next-word context (butlast n-gram))))
:else
(choose-next-word context (butlast n-gram)))))
(defn remove-sentence-markers [phrase]
(remove (fn [[word _]] (#{"<s>" "</s>"} word)) phrase))
(defn valid-sentence? [phrase]
(->> phrase
(map first)
(string/join " ")
(#(string/replace % #"(<s>|</s>)" ""))
(nlp/valid-sentence?)))
(defn valid-sentences? [phrase]
(let [sentences (->> (util/take-through
#(= (first %) "</s>")
phrase)
(map remove-sentence-markers))]
sentences))
(defn generate-phrase [{:keys [database trie] :as context} phrase]
(loop [phrase' (loop [phrase phrase]
(if (< 5 (count phrase))
phrase
(recur (cons (choose-next-word context (take 3 phrase))
phrase))))]
(if (valid-sentence? phrase')
phrase'
(recur (loop [phrase phrase]
(if (< 5 (count phrase))
phrase
(recur (cons (choose-next-word context (take 3 phrase))
phrase))))))))
(defn generate-sentence-backwards
"Given a phrase of [w1 w2 w3] generates a sentence
using a backwards markov."
([{:keys [database trie] :as context} phrase]
(let [phrase (map (fn [w]
(let [id (database w)]
[w (second (get trie [id]))]))
phrase)]
(loop [phrase' (loop [phrase phrase]
(if (= "<s>" (first (first phrase)))
phrase
(recur (cons (choose-next-word context (take 3 phrase))
phrase))))]
(if (valid-sentence? phrase')
phrase'
(recur (loop [phrase phrase]
(if (= "<s>" (first (first phrase)))
phrase
(recur (cons (choose-next-word context (take 3 phrase))
phrase)))))))))
)
(defn generate-rhyme
([context]
(generate-rhyme context ["</s>"]))
([{:keys [perfect-rhyme-trie] :as context} phrase]
(let [phrase1 (generate-sentence-backwards context phrase)
rhyme (second (find-rhymes perfect-rhyme-trie (first (first (take-last 2 phrase1)))))
phrase2 (generate-sentence-backwards context [(first rhyme) "</s>"])]
[phrase1 phrase2])))
(comment
(initialize)
(generate-rhyme @context)
(find-rhymes (@context :perfect-rhyme-trie) "technology")
(let [{:keys [database trie rhyme-trie]} @context
phrase ["</s>"]
ids (map database phrase)]
(get trie ids))
(choose-next-word @context (take 3 [["</s>" 509]]))
(generate-sentence-backwards @context ["kill" "</s>"])
(valid-sentences? (generate-phrase @context '(["bitter" 41])))
(choose-next-word @context (take 3 [["theology" 41]]))
(choose-next-word @context [["and" 5] ["theology" 41]])
(find-rhymes (@context :perfect-rhyme-trie) "theology")
(trie/lookup (@context :trie) '(57 2477))
(take 5 (@context :trie))
(->> (find-rhymes (@context :perfect-rhyme-trie) "technology")
(map (fn [[word frq]]
(let [n+1grams (word->n+1grams
(@context :trie)
(@context :database)
word)]
(map vector n+1grams (repeat [word frq])))))
(reduce into []))
(def loaded-backwards-trie
(tpt/load-tightly-packed-trie-from-file
"resources/dark-corpus-backwards-tpt.bin"
(decode-fn @trie-database)))
(def loaded-backwards-database
(with-open [rdr (clojure.java.io/reader "resources/backwards-database.bin")]
(into {} (map read-string (line-seq rdr)))))
(def rhyme-database (atom {}))
(def db
(nippy/thaw-from-file (io/resource "dark-corpus-4-gram-backwards-db.bin")))
(def perfect-rhyme-trie
(transduce
(comp
(map first)
(filter string?)
(map #(vector % (reverse (word->phones %))))
(map reverse))
(completing
(fn [trie [k v]]
(update trie k (fnil #(update % 1 inc) [v 0]))))
(trie/make-trie)
@loaded-backwards-database))
(def vowel-rhyme-trie
(transduce
(comp
(map first)
(filter string?)
(map #(vector % (reverse (word->phones %))))
(map reverse)
(map (fn [[phones v]]
[(map #(if (owoga.phonetics/vowel
(string/replace % #"\d" ""))
%
"?")
phones)
v])))
(completing
(fn [trie [k v]]
(update trie k (fnil #(update % 1 inc) [v 0]))))
(trie/make-trie)
(take 1000 db)))
(take 20 vowel-rhyme-trie)
#_(with-open [wtr (clojure.java.io/writer "database.bin")]
(let [lines (->> (seq @trie-database)
(map pr-str)
(map #(str % "\n")))]
(doseq [line lines]
(.write wtr line))))
(profile
{}
(def example-story
(loop [generated-text [(get @trie-database "<s>")]
i 0]
(if (> i 20)
generated-text
(let [children (loop [i 4]
(let [node (p :lookup
(trie/lookup
loaded-tightly-packed-trie
(vec (take-last i generated-text))))
children (p :seq-children (and node (trie/children node)))]
(cond
(nil? node) (recur (dec i))
(< i 0) (throw (Exception. "Error"))
(seq children) children
:else (recur (dec i)))))]
(recur
(conj
generated-text
(->> children
(map #(get % []))
(remove nil?)
(#(p :weighted-selection (math/weighted-selection
(fn [[_ c]] c)
%)))
first))
(inc i)))))))
(->> example-story
(map (fn [v] (get-in @trie-database [v])))
(string/join " ")
(#(string/replace % #" ([\.,\?])" "$1"))
((fn [txt]
(string/replace txt #"(^|\. |\? )([a-z])" (fn [[a b c]]
(str b (.toUpperCase c)))))))
(key-get-in-tpt
tightly-packed-trie
trie-database
'("<s>" "<s>" "the"))
;; => {(2 2 3) {:value 3263, :count 462}}
(id-get-in-tpt
tightly-packed-trie
trie-database
'(2 2 3))
;; => {("<s>" "<s>" "the") {:value ("<s>" "<s>" "the"), :count 462}}
)
(comment
(->> (perfect-rhymes perfect-rhyme-trie
(or (dict/cmu-with-stress-map "technology")
(util/get-phones-with-stress "technology")))
(map (comp first second))
(remove nil?)
#_#_#_#_(map @loaded-backwards-database)
(map #(vector [%] (n+1grams
loaded-backwards-trie
[%])))
(map (fn [[w1 w2s]]
(mapv #(into w1 [(nth % 0)]) w2s)))
(take 10))
(->> (perfect-rhymes perfect-rhyme-trie
(or (dict/cmu-with-stress-map "technology")
(util/get-phones-with-stress "technology")))
(map (comp first second))
(remove nil?)
(map @loaded-backwards-database)
(map #(vector [%] (n+1grams
loaded-backwards-trie
[%])))
(map (fn [[w1 w2s]]
(mapv #(into w1 [(nth % 0)]) w2s)))
(reduce into [])
(map (fn [k]
(let [children (->> (n+1grams loaded-backwards-trie k)
(mapv first))]
(mapv #(into k [%]) children))))
(reduce into [])
#_#_#_#_(map #(map @loaded-backwards-database %))
(filter (partial every? dict/english?))
(take 100)
(map reverse))
(util/get-phones-with-stress "you") ;; => ("B" "AA1" "DH" "ER" "M")
(def phones (or (dict/cmu-with-stress-map "sandman")
(util/get-phones-with-stress "sandman")))
(take 20 vowel-rhyme-trie)
(->> (vowel-rhymes vowel-rhyme-trie phones)
(map (comp first second))
(remove nil?)
(take 20))
;; Bigrams of rhyme
(->> (perfect-rhymes perfect-rhyme-trie
(or (dict/cmu-with-stress-map "technology")
(util/get-phones-with-stress "technology")))
(map (comp first second))
(remove nil?)
(map @loaded-backwards-database)
(map #(vector [%] (n+1grams
loaded-backwards-trie
[%])))
(map (fn [[w1 w2s]]
(mapv #(into w1 [(nth % 0)]) w2s)))
(reduce into [])
(map (fn [k]
(let [children (->> (n+1grams loaded-backwards-trie k)
(mapv first))]
(mapv #(into k [%]) children))))
(reduce into [])
(map #(map @loaded-backwards-database %))
(filter (partial every? dict/english?))
(take 100)
(map reverse))
)
(defn perfect-rhymes [rhyme-trie phones]
(let [rhyme-suffix (first
(util/take-through
#(= (last %) \1)
(reverse phones)))]
(trie/lookup rhyme-trie rhyme-suffix)))
(defn vowel-rhymes [rhyme-trie phones]
(let [rhyme-suffix (->> (reverse phones)
(clone-consonants)
(util/take-through #(= (last %) \1))
(first))]
(trie/lookup rhyme-trie rhyme-suffix)))
(defn rhymes-rank-1
"Phones match from primary stress to the end."
[trie phones]
(let [rhyme-suffix (first
(util/take-through
#(= (last %) \1)
phones))]
(trie/lookup trie rhyme-suffix)))
(defn rhymes-rank-2
"Phones match from secondary stress to the end."
[trie phones]
(let [rhyme-suffix (first
(util/take-through
#(= (last %) \2)
phones))]
(trie/lookup trie rhyme-suffix)))
(comment
(keys @context)
;; => (:flex-rhyme-trie :database :trie :perfect-rhyme-trie :rhyme-trie)
)