From f30686e95e844d24f8251ee54522194bf75e02ad Mon Sep 17 00:00:00 2001 From: Eric Ihli Date: Fri, 23 Oct 2020 13:40:32 -0700 Subject: [PATCH] Cleanup and organize namespaces --- src/com/owoga/corpus/darklyrics.clj | 1 - src/com/owoga/prhyme/core.clj | 26 +++++ src/com/owoga/prhyme/gen.clj | 25 ++++ .../prhyme/generation/weighted_selection.clj | 110 ++++++++++++++++++ src/com/owoga/prhyme/lymeric.clj | 37 +++--- src/com/owoga/prhyme/util/math.clj | 101 ++++++++++++++++ src/com/owoga/prhyme/util/nlp.clj | 8 +- 7 files changed, 284 insertions(+), 24 deletions(-) create mode 100644 src/com/owoga/prhyme/generation/weighted_selection.clj create mode 100644 src/com/owoga/prhyme/util/math.clj diff --git a/src/com/owoga/corpus/darklyrics.clj b/src/com/owoga/corpus/darklyrics.clj index ccd2229..6c2eb3d 100644 --- a/src/com/owoga/corpus/darklyrics.clj +++ b/src/com/owoga/corpus/darklyrics.clj @@ -168,7 +168,6 @@ artist-album-texts))) (comment - (def darkov-2 (util/read-markov "dark-corpus-2.edn")) (get darkov-2 '(nil nil)) (take 3 (scrape base-url)) diff --git a/src/com/owoga/prhyme/core.clj b/src/com/owoga/prhyme/core.clj index 5bf5f9d..5a7c4d6 100644 --- a/src/com/owoga/prhyme/core.clj +++ b/src/com/owoga/prhyme/core.clj @@ -159,6 +159,32 @@ :else (= (last (:rimes a)) (last (:rimes b))))) +(defn rhymes? + "What does it mean for something to rhyme?" + [a b] + (cond + (and (= 1 (count (last (:rimes a)))) + (= 1 (count (last (:rimes b)))) + (or (= (last (:rimes a)) '("ER")) + (= (last (:rimes a)) '("AA")) + (= (last (:rimes a)) '("AE")) + (= (last (:rimes a)) '("AO")) + (= (last (:rimes a)) '("AW")) + (= (last (:rimes a)) '("EH")) + (= (last (:rimes a)) '("IH")) + (= (last (:rimes a)) '("UH")) + (= (last (:rimes a)) '("AH")))) + (= (list (first (take-last 2 (:nuclei a))) + (last (:onsets a))) + (list (first (take-last 2 (:nuclei b))) + (last (:onsets b)))) + + (and (= 1 (count (last (:rimes a)))) + (= 1 (count (last (:rimes b))))) + (= (last (:onsets a)) (last (:onsets b))) + + :else (= (last (:rimes a)) (last (:rimes b))))) + (defn onset+nucleus [syllables] (->> syllables (map #(first (u/take-through u/vowel %))))) diff --git a/src/com/owoga/prhyme/gen.clj b/src/com/owoga/prhyme/gen.clj index bd330b1..2470f62 100644 --- a/src/com/owoga/prhyme/gen.clj +++ b/src/com/owoga/prhyme/gen.clj @@ -1,5 +1,6 @@ (ns com.owoga.prhyme.gen (:require [clojure.string :as string] + [com.owoga.prhyme.util.math :as math] [com.owoga.prhyme.util :as util] [com.owoga.prhyme.util.weighted-rand :as weighted-rand] [com.owoga.prhyme.util.nlp :as nlp] @@ -225,6 +226,30 @@ (filter nlp/valid-sentence?) first)) +(defn remove-selection-from-target [target selection] + (->> target + (#(assoc % :syllables (drop-last + (:syllable-count + selection) + (:syllables + target)))) + (#(assoc % :rimes (prhyme/rimes (:syllables %)))) + (#(assoc % :onsets (prhyme/onset+nucleus (:syllables %)))) + (#(assoc % :nuclei (prhyme/nucleus (:syllables %)))))) + +(defn selection-seq + ([words adjust target] + (selection-seq words adjust target '())) + ([words adjust target result] + (let [[weighted-words _ _] (adjust [words target result]) + selection (math/weighted-selection :weight weighted-words) + new-target (remove-selection-from-target target selection) + new-result (cons selection result)] + (cons selection + (lazy-seq (selection-seq words adjust new-target new-result)))))) + +(defn generate-prhyme [words adjust target stop?] + (loop [result '()])) (defn prhyme "2020-10-21 iteration" diff --git a/src/com/owoga/prhyme/generation/weighted_selection.clj b/src/com/owoga/prhyme/generation/weighted_selection.clj new file mode 100644 index 0000000..02c32d4 --- /dev/null +++ b/src/com/owoga/prhyme/generation/weighted_selection.clj @@ -0,0 +1,110 @@ +(ns com.owoga.prhyme.generation.weighted-selection + "Utilities for generation of rhymes by assigning weights to a collection of + words and randomly choosing words based on their weights. + + For example, we might want the first word of the rhyme of a target phrase to + be selected from words that are highly weighted by their rhymeness. It's not + as important for subsequent words to rhyme, so we might want subsequent words + to be selected weighted by whether or not they are synonyms to some other + word, by how many syllables they have, by whether they are associated with a + markov value, etc..." + (:require [com.owoga.prhyme.core :as prhyme])) + +;;;; Signature of "weight adjustment" functions +;;; +;;; A weight adjustment function gets called every time a decision needs to be +;;; made for which word to choose, so I think it's important to not be too slow. +;;; +;;; The function will receive: +;;; - a collection of the words from which to weight. +;;; - the target phrase we're rhyming for +;;; - the current result that we have so far +;;; +;;; By default, the weights of the passed in words will probably be 1. If you +;;; want future weights to be adjusted from past weights, that's up to you. +;;; +;;; The target phrase will change as words are chosen for the result. +;;; A good and strategy will be to chop off syllables from the target phrase +;;; for each syllable of a matching result. + +(defn adjust-for-markov + "Works with a markov data structure that was generated taking into account + sentence boundaries (represented as nils). + + A key in the markov structure of '(nil) would have a value that represents all + words that have occurred in position 1 of the raw data. + + A key of '(nil \"foo\") would have a value that represents all words + that occurred in position 2 following \"foo\" + + Automatically detects the order (window size) of the markov model. Does this + by counting the length of the first key. + " + [markov percent] + (let [markov-n (count (first (first markov)))] + (fn [[words target result]] + (let [key (let [k (map :norm-word (take markov-n result))] + (reverse + (if (> markov-n (count k)) + (concat k (repeat (- markov-n (count k)) nil)) + k))) + markov-options (markov key) + markov-option-avg (/ (apply + (vals markov-options)) + (max 1 (count markov-options)))] + (if (nil? markov-options) + [words target result] + (let [[markovs non-markovs] + ((juxt filter remove) + #(markov-options (:norm-word %)) + words) + weight-non-markovs (apply + (map :weight non-markovs)) + target-weight-markovs (- (/ weight-non-markovs (- 1 percent)) + weight-non-markovs) + count-markovs (count markovs) + adjustment-markovs (if (= 0 count-markovs) 1 (/ target-weight-markovs count-markovs))] + [(concat + (map + (fn [m] + (let [option (markov-options (:norm-word m))] + (as-> m m + (assoc m :weight (* (/ option markov-option-avg) adjustment-markovs (:weight m))) + (assoc m :adjustment-for-markov (* (/ option markov-option-avg) adjustment-markovs))))) + markovs) + non-markovs) + target + result])))))) + +(defn adjust-for-rhymes + "Weights words by whether or not they rhyme. + Once result contains something, becomes inactive. If you want to try to rhyme + every selection, you'll need a different function. This one will only rhyme + the tail of a target." + [percent] + (fn [[words target result]] + (if (empty? result) + (let [words-with-rime-count + (map + (fn [word] + (assoc word :num-matching (if (prhyme/rhymes? target word) 1 0))) + words) + + [rhyming non-rhyming] + ((juxt filter remove) + #(< 0 (:num-matching %)) + words-with-rime-count) + + weight-non-rhyming (apply + (map :weight non-rhyming)) + target-weight-rhyming (* 100 percent weight-non-rhyming) + count-rhyming (count rhyming) + adjustment-rhyming (if (= 0 count-rhyming) 1 (/ target-weight-rhyming count-rhyming))] + [(concat + (map + (fn [word] + (as-> word word + (assoc word :weight (* adjustment-rhyming (:weight word))) + (assoc word :adjustment-for-rimes adjustment-rhyming))) + rhyming) + non-rhyming) + target + result]) + [words target result]))) diff --git a/src/com/owoga/prhyme/lymeric.clj b/src/com/owoga/prhyme/lymeric.clj index 5bd59e1..bb2e238 100644 --- a/src/com/owoga/prhyme/lymeric.clj +++ b/src/com/owoga/prhyme/lymeric.clj @@ -1,7 +1,7 @@ (ns com.owoga.prhyme.lymeric (:require [com.owoga.prhyme.gen :as gen] [com.owoga.prhyme.util :as util] - [com.owoga.prhyme.util.weighted-rand :as weighted-rand] + [com.owoga.prhyme.generation.weighted-selection :as weighted-selection] [clojure.string :as string] [com.owoga.prhyme.frp :as frp] [com.owoga.prhyme.util.nlp :as nlp] @@ -40,7 +40,6 @@ (remove #(banned-words (:norm-word %)) base-words) (frp/phrase->word frp/words (get rhymes pattern))))] - (println banned-words) (recur (rest scheme) (assoc rhymes pattern rhyme) (conj result rhyme))))))) @@ -48,25 +47,22 @@ (comment (rhyme-from-scheme nil '((A 8) (A 8) (B 5) (B 5) (A 8))) - - (first (filter #(= "abba" (:norm-word %)) frp/words)) - - (gen/gen-sentence-with-syllable-count darklyrics/darkov-2 8 (map #(assoc % :weight 1) frp/popular)) - (def adj - (comp (gen/adjust-for-markov-with-boundaries darklyrics/darkov-2 0.9) - (gen/adjust-for-tail-rimes util/words-map 0.99))) - (repeatedly - 10 - (fn [] - (gen/gen-rhyme-with-syllable-count - adj - 8 - frp/popular - (frp/phrase->word frp/words "famed watched waterloo")))) - ) (comment + (rhyme-from-scheme nil '((A 7) (A 7) (B 5) (B 5) (A 7))) + + (->> (gen/selection-seq + (map #(assoc % :weight 1) frp/words) + (weighted-selection/adjust-for-rhymes 0.99) + (frp/phrase->word frp/words "hi there my boy")) + (take 3)) + + ["bishop larch smitten us dwell" + "solely first week in hell" + "and take that for three" + "come wrapped in glory" + "you ever leave it so well"] ["romancing realized too late" "my crown revive my withered state" "reign is obsolete" @@ -86,8 +82,7 @@ "it wither away with this knife" "hate is my virtue" "my feelings are well overdue" - "war we await the afterlife"] - ) + "war we await the afterlife"]) (->> (repeatedly (fn [] (gen/gen-target-by-syllable-count darklyrics/darkov-2 8 (map #(assoc % :weight 1) frp/popular)))) @@ -102,6 +97,4 @@ (let [adj (comp (gen/adjust-for-markov darklyrics/darkov-2) (gen/adjust-for-tail-rimes util/words-map))])) -(apply (fnil + 0) '()) - (map :syllable-count '()) diff --git a/src/com/owoga/prhyme/util/math.clj b/src/com/owoga/prhyme/util/math.clj new file mode 100644 index 0000000..735edba --- /dev/null +++ b/src/com/owoga/prhyme/util/math.clj @@ -0,0 +1,101 @@ +;; Fast weighted random selection thanks to the Vose algorithm. +;; https://gist.github.com/ghadishayban/a26cc402958ef3c7ce61 + +(ns com.owoga.prhyme.util.math + (:import clojure.lang.PersistentQueue)) + + +;; Vose's alias method +;; http://www.keithschwarz.com/darts-dice-coins/ +(defprotocol Rand + (nextr [_ rng])) + +(deftype Vose [n ^ints alias ^doubles prob] + Rand + ;; returns the index of the chosen weight + (nextr [_ rng] ;; not using the rng for now + (let [i (rand-int n) + p (aget prob i)] + (if (or (= p 1.0) + (< (rand) p)) + i + (aget alias i))))) + +(defn ^:private make-vose [dist] + (let [N (count dist) + alias (int-array N) + prob (double-array N)] + (if (zero? N) + (->Vose N alias prob) + (let [^doubles ps (->> dist + (map (partial * N)) + (into-array Double/TYPE)) + + [small large] (loop [i 0 + [small large] [PersistentQueue/EMPTY + PersistentQueue/EMPTY] + ps (seq ps)] + (if (seq ps) + (let [p (first ps)] + (if (< p 1) + (recur (inc i) + [(conj small i) large] + (rest ps)) + (recur (inc i) + [small (conj large i)] + (rest ps)))) + [small large])) + + [small large] (loop [small small + large large] + (if (and (seq small) (seq large)) + (let [l (first small) + g (first large) + small (pop small) + large (pop large)] + (aset-double prob l (aget ps l)) + (aset-int alias l g) + (let [pg (- (+ (aget ps g) (aget ps l)) + 1.0)] + (aset-double ps g pg) + (if (< pg 1) + (recur (conj small g) large) + (recur small (conj large g))))) + [small large]))] + (doseq [g (concat large small)] + (aset-double prob g 1)) + (->Vose N alias prob))))) + +(defn from-weights [ws] + (let [N (count ws) + tot (reduce + 0.0 ws) + dist (if (zero? tot) + (repeat N (/ 1 tot)) + (map #(/ % tot) ws))] + (make-vose (vec dist)))) + +(comment + (let [ws [1 2 1 3 3] + rng (from-weights ws) + chosen (repeatedly 1000000 #(nextr rng nil)) + accuracy (mapv (comp float + #(/ % 100000) + (frequencies chosen)) + (range (count ws)))] + accuracy)) + +(defn weighted-selection + "If given a coll, assumes the coll is weights and returns the selected index by + weighted random selection. + + If given a key function and a collection, uses the key function to get a + collection of weights and returns the value at the randomly selected index." + ([coll] + (let [rng (from-weights coll) + index (nextr rng nil)] + index)) + ([key-fn coll] + (let [rng (from-weights (map key-fn coll)) + index (nextr rng nil) + selection (nth coll index)] + selection))) diff --git a/src/com/owoga/prhyme/util/nlp.clj b/src/com/owoga/prhyme/util/nlp.clj index fe2ccd5..f163e9b 100644 --- a/src/com/owoga/prhyme/util/nlp.clj +++ b/src/com/owoga/prhyme/util/nlp.clj @@ -7,7 +7,13 @@ (def get-sentences (nlp/make-sentence-detector "models/en-sent.bin")) (def parse (tb/make-treebank-parser "en-parser-chunking.bin")) -(defn valid-sentence? [phrase] +(defn valid-sentence? + "Tokenizes and parses the phrase using OpenNLP models from + http://opennlp.sourceforge.net/models-1.5/ + + If the parse tree has an 'S as the top-level tag, then + we consider it a valid English sentence." + [phrase] (->> phrase tokenize (string/join " ")