You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
486 lines
16 KiB
Clojure
486 lines
16 KiB
Clojure
(ns com.darklimericks.linguistics.core
|
|
(:require [com.owoga.prhyme.data.dictionary :as dict]
|
|
[com.owoga.prhyme.core :as prhyme]
|
|
[com.owoga.prhyme.util :as util]
|
|
[com.darklimericks.server.models :as models]
|
|
[com.owoga.corpus.markov :as markov]
|
|
[clojure.string :as string]
|
|
[com.owoga.phonetics :as phonetics]
|
|
[com.owoga.phonetics.syllabify :as syllabify]
|
|
[com.owoga.phonetics.stress-manip :as stress-manip]
|
|
[clojure.math.combinatorics :as combinatorics]
|
|
[com.owoga.prhyme.nlp.core :as nlp]
|
|
[com.owoga.prhyme.data-transform :as data-transform]
|
|
[com.owoga.trie :as trie]))
|
|
|
|
(defn gen-artist []
|
|
(->> [(rand-nth (seq dict/adjectives))
|
|
(rand-nth (seq dict/nouns))]
|
|
(map string/capitalize)
|
|
(string/join " ")))
|
|
|
|
(defn gen-album []
|
|
(->> [(rand-nth (seq dict/adverbs))
|
|
(rand-nth (seq dict/verbs))]
|
|
(map string/capitalize)
|
|
(string/join " ")))
|
|
|
|
(defn perfect-rhyme
|
|
[phones]
|
|
(->> phones
|
|
reverse
|
|
(util/take-through stress-manip/primary-stress?)
|
|
first
|
|
reverse
|
|
(#(cons (first %)
|
|
(stress-manip/remove-any-stress-signifiers (rest %))))))
|
|
|
|
(comment
|
|
(perfect-rhyme (first (phonetics/get-phones "technology")))
|
|
;; => ("AA1" "L" "AH" "JH" "IY")
|
|
)
|
|
|
|
(defn perfect-rhyme-sans-consonants
|
|
[phones]
|
|
(->> phones
|
|
perfect-rhyme
|
|
(remove phonetics/consonant)))
|
|
|
|
(comment
|
|
(perfect-rhyme-sans-consonants (first (phonetics/get-phones "technology")))
|
|
;; => ("AA1" "AH" "IY")
|
|
)
|
|
|
|
(defn perfect-rhyme?
|
|
[phones1 phones2]
|
|
(apply = (map perfect-rhyme [phones1 phones2])))
|
|
|
|
(defn perfect-rhyme-sans-consonants?
|
|
[phones1 phones2]
|
|
(apply = (map perfect-rhyme-sans-consonants [phones1 phones2])))
|
|
|
|
(comment
|
|
(apply perfect-rhyme? (map (comp first phonetics/get-phones) ["technology" "ecology"]));; => true
|
|
(apply perfect-rhyme? (map (comp first phonetics/get-phones) ["technology" "economy"]));; => false
|
|
(apply perfect-rhyme-sans-consonants? (map (comp first phonetics/get-phones) ["technology" "economy"]));; => true
|
|
(apply perfect-rhyme-sans-consonants? (map (comp first phonetics/get-phones) ["technology" "trilogy"]));; => false
|
|
(apply perfect-rhyme? (map (comp first phonetics/get-phones) ["bother me" "poverty"]))
|
|
(apply perfect-rhyme? (map (comp first phonetics/phrase-phones) ["bother me" "poverty"]))
|
|
(phonetics/phrase-phones "bother me");; => [["B" "AA1" "DH" "ER0" "M" "IY1"]]
|
|
(phonetics/phrase-phones "poverty");; => [["P" "AA1" "V" "ER0" "T" "IY0"]]
|
|
)
|
|
|
|
(defn number-of-matching-vowels-with-stress
|
|
[phones1 phones2]
|
|
(let [[vowels1 vowels2] (map (partial filter phonetics/vowel?) [phones1 phones2])]
|
|
(->> [vowels1 vowels2]
|
|
(map reverse)
|
|
(apply map vector)
|
|
(filter (partial apply =))
|
|
(filter (comp (partial re-find #"1") first))
|
|
count)))
|
|
|
|
(comment
|
|
(apply
|
|
number-of-matching-vowels-with-stress
|
|
(map first (map phonetics/get-phones ["technology" "ecology"])))
|
|
|
|
(apply
|
|
number-of-matching-vowels-with-stress
|
|
(map first (map phonetics/get-phones ["biology" "ecology"])))
|
|
|
|
)
|
|
|
|
(defn number-of-matching-vowels-any-stress
|
|
[phones1 phones2]
|
|
(let [[vowels1 vowels2] (map (partial filter phonetics/vowel?) [phones1 phones2])]
|
|
(->> [vowels1 vowels2]
|
|
(map (partial map phonetics/remove-stress))
|
|
(map reverse)
|
|
(apply map vector)
|
|
(filter (partial apply =))
|
|
count)))
|
|
|
|
(comment
|
|
(apply
|
|
number-of-matching-vowels-any-stress
|
|
(map first (map phonetics/get-phones ["economy" "ecology"])))
|
|
|
|
(apply
|
|
number-of-matching-vowels-any-stress
|
|
(map first (map phonetics/get-phones ["biology" "ecology"])))
|
|
)
|
|
|
|
(defn same-number-of-syllables?
|
|
[phones1 phones2]
|
|
(apply = (map (comp count syllabify/syllabify) [phones1 phones2])))
|
|
|
|
(comment
|
|
(apply
|
|
same-number-of-syllables?
|
|
(map first (map phonetics/get-phones ["economy" "ecology"])))
|
|
|
|
(apply
|
|
same-number-of-syllables?
|
|
(map first (map phonetics/get-phones ["numerology" "ecology"])))
|
|
)
|
|
|
|
(defn quality-of-rhyme-phones
|
|
"Points for:
|
|
- Perfect rhyme
|
|
- Perfect rhyme sans consonants
|
|
- Number of matching stressed vowels
|
|
- Number of matching any-stressed vowels
|
|
- Same number of syllables
|
|
"
|
|
[phones1 phones2]
|
|
(let [perfect? (if (perfect-rhyme? phones1 phones2) 1 0)
|
|
perfect-sans-consonants? (if (perfect-rhyme-sans-consonants? phones1 phones2) 1 0)
|
|
num-matching-stressed (number-of-matching-vowels-with-stress phones1 phones2)
|
|
num-matching-any-stress (number-of-matching-vowels-any-stress phones1 phones2)
|
|
same-number-of-syllables (if (same-number-of-syllables? phones1 phones2) 1 0)]
|
|
(+ perfect?
|
|
perfect-sans-consonants?
|
|
num-matching-stressed
|
|
num-matching-any-stress
|
|
same-number-of-syllables)))
|
|
|
|
(comment
|
|
(->> [["economy" "ecology"]
|
|
["biology" "ecology"]
|
|
["bother me" "poverty"]
|
|
["property" "properly"]
|
|
["bother me" "invincibility"]
|
|
["invincibility" "bother me"]]
|
|
(map (partial map (comp first phonetics/phrase-phones)))
|
|
(map (partial apply quality-of-rhyme-phones)))
|
|
|
|
(phonetics/phrase-phones "bother me")
|
|
(phonetics/phrase-phones "invincibility")
|
|
|
|
(let [phones1 ["B" "AA1" "DH" "ER0" "M" "IY1"]
|
|
phones2 ["IH2" "N" "V" "IH2" "N" "S" "AH0" "B" "IH1" "L" "IH0" "T" "IY0"]]
|
|
(perfect-rhyme-sans-consonants? phones1 phones2))
|
|
|
|
)
|
|
|
|
(defn rhymes
|
|
"All rhymes. Slightly flexible. Ordered by number of rhyming syllables.
|
|
Most generic and likely desired rhyming algorithm."
|
|
[target]
|
|
(->> target
|
|
(phonetics/phrase-phones)
|
|
(mapcat (partial
|
|
markov/rhymes
|
|
models/rhyme-trie-unstressed-trailing-consonants))
|
|
(mapcat second)))
|
|
|
|
(comment
|
|
(rhymes "bother me")
|
|
|
|
)
|
|
|
|
(defn rhymes-with-phones
|
|
"All rhymes. Slightly flexible. Ordered by number of rhyming syllables.
|
|
Most generic and likely desired rhyming algorithm."
|
|
[target]
|
|
(let [pronunciations (phonetics/phrase-phones target)]
|
|
(->> pronunciations
|
|
(mapcat
|
|
(partial markov/rhymes models/rhyme-trie-unstressed-trailing-consonants))
|
|
(mapcat second)
|
|
(mapcat
|
|
(fn [word]
|
|
(map #(vector word %) (phonetics/phrase-phones word)))))))
|
|
|
|
(comment
|
|
(rhymes-with-phones
|
|
"technology")
|
|
|
|
)
|
|
(defn rhymes-with-frequencies
|
|
[target trie database]
|
|
(let [rhymes- (rhymes target)
|
|
freqs (map
|
|
(comp
|
|
(fnil int 0)
|
|
second
|
|
(partial get models/markov-trie)
|
|
(partial conj [1 1 1])
|
|
database)
|
|
rhymes-)]
|
|
(distinct (sort-by (comp - second) (map vector rhymes- freqs)))))
|
|
|
|
(defn assoc-phrases-with-phones
|
|
[phrases]
|
|
(mapcat
|
|
(fn [phrase]
|
|
(map #(vector phrase %) (phonetics/get-phones phrase)))
|
|
phrases))
|
|
|
|
(comment
|
|
(assoc-phrases-with-phones ["foo" "bar"]);; => (["foo" ["F" "UW1"]] ["bar" ["B" "AA1" "R"]])
|
|
)
|
|
|
|
(defn append-freqs
|
|
[database trie prefix phrase-phones]
|
|
(map
|
|
(fn [phrase-phone]
|
|
(let [id (database (first phrase-phone))]
|
|
(conj phrase-phone (second (get trie (conj prefix id) ['_ 0])))))
|
|
phrase-phones))
|
|
|
|
(comment
|
|
(->> ["top" "bar"]
|
|
(assoc-phrases-with-phones)
|
|
(append-freqs models/database models/markov-trie [1 1 1]))
|
|
;; => (["top" ["T" "AA1" "P"] 888]
|
|
;; ["top" ["T" "AO1" "P"] 888]
|
|
;; ["bar" ["B" "AA1" "R"] 220])
|
|
)
|
|
|
|
(defn append-rhyme-quality
|
|
[target-phrase phrase-phones]
|
|
(mapcat
|
|
(fn [phrase-phone]
|
|
(map
|
|
(fn [phones]
|
|
(into phrase-phone [target-phrase phones (quality-of-rhyme-phones
|
|
(second phrase-phone)
|
|
phones)]))
|
|
(phonetics/phrase-phones target-phrase)))
|
|
phrase-phones))
|
|
|
|
(comment
|
|
(->> ["hog" "bog"]
|
|
(assoc-phrases-with-phones)
|
|
(append-freqs models/database models/markov-trie [1 1 1])
|
|
(append-rhyme-quality "log"))
|
|
;; => (["hog" ["HH" "AA1" "G"] 18 "log" ["L" "AO1" "G"] 0]
|
|
;; ["bog" ["B" "AA1" "G"] 42 "log" ["L" "AO1" "G"] 0]
|
|
;; ["bog" ["B" "AO1" "G"] 42 "log" ["L" "AO1" "G"] 4])
|
|
)
|
|
|
|
|
|
(defn distinct-by
|
|
"Returns a stateful transducer that removes elements by calling f on each step as a uniqueness key.
|
|
Returns a lazy sequence when provided with a collection."
|
|
([f]
|
|
(fn [rf]
|
|
(let [seen (volatile! #{})]
|
|
(fn
|
|
([] (rf))
|
|
([result] (rf result))
|
|
([result input]
|
|
(let [v (f input)]
|
|
(if (contains? @seen v)
|
|
result
|
|
(do (vswap! seen conj v)
|
|
(rf result input)))))))))
|
|
([f xs]
|
|
(sequence (distinct-by f) xs)))
|
|
|
|
(defn rhymes-with-frequencies-and-rhyme-quality
|
|
[target trie database]
|
|
(let [rhymes- (rhymes target)
|
|
rhymes-with-freqs-and-qualities
|
|
(->> rhymes-
|
|
(assoc-phrases-with-phones)
|
|
(append-freqs models/database models/markov-trie [1 1 1])
|
|
(append-rhyme-quality target))]
|
|
(into
|
|
[]
|
|
(distinct-by first)
|
|
(sort-by
|
|
(fn [[w1 p1 f w2 p2 q]]
|
|
[(- q) (- f)])
|
|
rhymes-with-freqs-and-qualities))))
|
|
|
|
(comment
|
|
(time
|
|
(rhymes-with-frequencies-and-rhyme-quality
|
|
"bother me"
|
|
models/markov-trie
|
|
models/database))
|
|
|
|
(apply quality-of-rhyme-phones
|
|
'(["B" "AA1" "DH" "ER0" "M" "IY1"]
|
|
["IH2" "N" "V" "IH2" "N" "S" "AH0" "B" "IH1" "L" "IH0" "T" "IY0"]))
|
|
|
|
(apply quality-of-rhyme-phones
|
|
'(["B" "AA1" "DH" "ER0" "M" "IY1"]
|
|
["IH2" "N" "V" "IH2" "Z" "AH0" "B" "IH1" "L" "AH0" "T" "IY0"]))
|
|
|
|
)
|
|
|
|
(comment
|
|
(markov/rhymes models/rhyme-trie-unstressed-trailing-consonants (phonetics/get-phones "food"))
|
|
(rhymes "food")
|
|
|
|
(get models/markov-trie [(models/database "food")])
|
|
(rhymes-with-frequencies "technology" models/markov-trie models/database)
|
|
|
|
)
|
|
|
|
(defn rhymes-by-quality
|
|
"Returns rhyming word, pronunciation, rhyme quality, and n-gram frequency."
|
|
[seed-phrase]
|
|
(->> seed-phrase
|
|
(prhyme/phrase->all-phones)
|
|
(map first)
|
|
(map
|
|
(fn [phones]
|
|
[phones (markov/rhymes
|
|
models/rhyme-trie-unstressed-trailing-consonants
|
|
phones)]))
|
|
(map (fn append-quality-of-rhyme [[phones1 words]]
|
|
[phones1 (->> (mapcat
|
|
prhyme/phrase->all-phones
|
|
(reduce into #{} (map second words)))
|
|
(map (fn [[phones2 word]]
|
|
[phones2
|
|
word
|
|
(prhyme/quality-of-rhyme-phones
|
|
phones1
|
|
phones2)])))]))
|
|
(map (fn sort-by-quality-of-rhyme [[phones1 words]]
|
|
[phones1 (sort-by (fn [[_ _ quality]]
|
|
(- quality))
|
|
words)]))
|
|
(reduce
|
|
(fn convert-to-hashmap [acc [pronunciation rhyming-words]]
|
|
(reduce (fn [acc [phones word rhyme-quality]]
|
|
(assoc-in acc [pronunciation phones] {:word word
|
|
:rhyme-quality rhyme-quality
|
|
:pronunciation phones}))
|
|
acc
|
|
rhyming-words))
|
|
{})))
|
|
|
|
(comment
|
|
(rhymes-by-quality "boss hog")
|
|
|
|
)
|
|
|
|
(defn add-frequency-to-rhymes
|
|
[rhymes trie database]
|
|
(reduce
|
|
(fn [acc [pronunciation rhyming-words]]
|
|
(reduce
|
|
(fn [acc [phones word]]
|
|
(assoc-in acc [pronunciation phones :freq] (second (get trie [(database (:word word))]))))
|
|
acc
|
|
rhyming-words))
|
|
rhymes
|
|
rhymes))
|
|
|
|
(defn rhymes-with-quality-and-frequency
|
|
[phrase]
|
|
(-> (rhymes-by-quality phrase)
|
|
(add-frequency-to-rhymes models/markov-trie models/database)))
|
|
|
|
(comment
|
|
(-> (rhymes-by-quality "boss hog")
|
|
(add-frequency-to-rhymes models/markov-trie models/database))
|
|
|
|
{("B" "AA1" "S" "HH" "AA1" "G")
|
|
{("D" "EH1" "M" "AH0" "G" "AA2" "G")
|
|
{:word "demagogue", :rhyme-quality 1, :freq 20},
|
|
("B" "AE1" "K" "L" "AA2" "G") {:word "backlog", :rhyme-quality 2, :freq 1},
|
|
("HH" "EH1" "JH" "HH" "AA2" "G")
|
|
{:word "hedgehog", :rhyme-quality 2, :freq 2},
|
|
,,,
|
|
("AA1" "G") {:word "og", :rhyme-quality 4, :freq 134},
|
|
("P" "R" "OW1" "L" "AA0" "G")
|
|
{:word "prologue", :rhyme-quality 2, :freq 25}},
|
|
("B" "AO1" "S" "HH" "AA1" "G")
|
|
{("D" "EH1" "M" "AH0" "G" "AA2" "G")
|
|
{:word "demagogue", :rhyme-quality 1, :freq 20},
|
|
("B" "AE1" "K" "L" "AA2" "G") {:word "backlog", :rhyme-quality 2, :freq 1},
|
|
("HH" "EH1" "JH" "HH" "AA2" "G")
|
|
{:word "hedgehog", :rhyme-quality 2, :freq 2},
|
|
,,,
|
|
("P" "R" "OW1" "L" "AA0" "G") {:word "prologue", :rhyme-quality 2, :freq 25}}}
|
|
)
|
|
|
|
(defn open-nlp-perplexity
|
|
"Returns the perplexity of the parse tree using OpenNLP.
|
|
This is an alternative to the perplexity of the Markov model.
|
|
Normalized per word because long sentences are naturally more perplex."
|
|
[phrase]
|
|
(->> phrase
|
|
nlp/tokenize
|
|
(string/join " ")
|
|
(nlp/most-likely-parse)
|
|
((fn [[line perplexity]]
|
|
[line (/ perplexity (count (string/split line #" ")))]))
|
|
second))
|
|
|
|
(defn lyric-suggestions [seed-phrase trie database]
|
|
(let [realize-seed (fn [seed]
|
|
(data-transform/untokenize
|
|
(-> (map database (reverse seed))
|
|
butlast
|
|
rest)))]
|
|
(loop [seed (vec (reverse (map #(get database % 0) (string/split seed-phrase #" "))))]
|
|
(cond
|
|
(< 20 (count seed)) (realize-seed seed)
|
|
(= (database prhyme/BOS) (peek seed)) (realize-seed seed)
|
|
:else (recur (conj seed (markov/get-next-markov
|
|
trie
|
|
seed
|
|
(partial remove (fn [child]
|
|
(= (.key child) (database prhyme/EOS)))))))))))
|
|
|
|
(defn phrase->quality-of-rhyme
|
|
"Gets the quality of rhyme of the thie highest quality pronunciation of all
|
|
combinations of phrases."
|
|
[phrase1 phrase2]
|
|
(let [phones1 (prhyme/phrase->all-phones phrase1)
|
|
phones2 (prhyme/phrase->all-phones phrase2)
|
|
all-possible-rhyme-combinations (combinatorics/cartesian-product
|
|
phones1
|
|
phones2)]
|
|
(->> all-possible-rhyme-combinations
|
|
(map (partial map first))
|
|
(map (juxt identity
|
|
(partial apply quality-of-rhyme-phones)))
|
|
(sort-by (comp - second))
|
|
first)))
|
|
|
|
(defn wgu-lyric-suggestions
|
|
[phrase]
|
|
(let [rhymes (rhymes-by-quality phrase)
|
|
seeds (map vector rhymes (repeat "</s>"))
|
|
lyrics (map #(lyric-suggestions
|
|
(string/join " " %)
|
|
models/markov-trie
|
|
models/database)
|
|
seeds)]
|
|
(->> lyrics
|
|
(map (juxt identity open-nlp-perplexity))
|
|
(sort-by (comp - second)))))
|
|
|
|
(comment
|
|
(wgu-lyric-suggestions "technology")
|
|
|
|
(phrase->quality-of-rhyme "boss hog" "brain fog")
|
|
|
|
(let [rhymes (rhymes-by-quality "bother me")
|
|
seeds (map vector rhymes (repeat "</s>"))
|
|
lyrics (map #(lyric-suggestions
|
|
(string/join " " %)
|
|
models/markov-trie
|
|
models/database)
|
|
seeds)]
|
|
(->> lyrics
|
|
(map (juxt identity open-nlp-perplexity))
|
|
(sort-by (comp - second))))
|
|
|
|
(->> #(lyric-suggestions "bother me </s>" models/markov-trie models/database)
|
|
repeatedly
|
|
(take 5)
|
|
(map (juxt identity open-nlp-perplexity (partial phrase->quality-of-rhyme "bother me"))))
|
|
|
|
)
|