diff --git a/src/com/owoga/corpus/darklyrics.clj b/src/com/owoga/corpus/darklyrics.clj index 9163a01..ffb6814 100644 --- a/src/com/owoga/corpus/darklyrics.clj +++ b/src/com/owoga/corpus/darklyrics.clj @@ -139,7 +139,8 @@ {} ((util/window (inc n)) tokens)))) -(def darkov-2 (util/read-markov "dark-corpus-2.edn")) +(defn read-darkov-2 [] + (util/read-markov (io/resource "dark-corpus-2.edn"))) (defn norm-filepath [text] (-> text @@ -167,7 +168,8 @@ artist-album-texts))) (comment - (def darkov-2 (util/read-markov "dark-corpus-2.edn")) + (def darkov-2 (util/read-markov (io/resource "dark-corpus-2.edn"))) + (take 10 darkov-2) (get darkov-2 '(nil nil)) (take 3 (scrape base-url)) (-main) @@ -200,11 +202,4 @@ (make-markov (slurp "darklyrics.txt") 1)))) (run! write-scrape (take 4 (scrape base-url))) - (take 100 darkov) - (util/write-markov "darklyrics.edn" darkov) - (spit "test.txt" (pr-str {:foo "1"})) - (def lyrics (scrape base-url)) - (with-open [writer (io/writer "darklyrics.txt")] - (run! - #(.write writer %) - (take 200 (filter english? lyrics))))) + (def lyrics (scrape base-url))) diff --git a/src/com/owoga/corpus/markov.clj b/src/com/owoga/corpus/markov.clj index 26c85fa..d11d1ea 100644 --- a/src/com/owoga/corpus/markov.clj +++ b/src/com/owoga/corpus/markov.clj @@ -15,25 +15,6 @@ {} ((util/window (inc n)) tokens))) -(defn -main [directory] - (let [directory (io/file "dark-corpus") - files (file-seq directory) - lines (->> files - (remove #(.isDirectory %)) - (map #(slurp %)) - (map #(string/split % #"\n+")) - #_(map #(util/extend-coll % nil 1))) - markovs (->> lines - (map #(make-markov % 1)))] - (take 1 markovs))) - -(merge-with - (fn [a b] - (+ (if (nil? a) 0 a) - b)) - {:foo 1} - {:bar 2 :foo 3}) - (defn merge-markov [& maps] (apply merge-with @@ -56,8 +37,8 @@ "them" 50 "baz" 99}})) -(defn gen-markov [] - (->> (file-seq (io/file "dark-corpus")) +(defn gen-markov [directory] + (->> (file-seq (io/file directory)) (remove #(.isDirectory %)) (map #(slurp %)) (map clean-text) @@ -69,27 +50,4 @@ (map #(util/extend-coll % nil 2)) (map #(make-markov % 2)) (apply merge-markov) - (util/write-markov "dark-corpus-2.edn"))) - -(comment - (gen-markov) - (->> (file-seq (io/file "dark-corpus")) - (remove #(.isDirectory %)) - (map #(slurp %)) - (map clean-text) - (filter util/english?) - (map #(string/split % #"\n+")) - (flatten) - (map #(string/split % #"\s+")) - (map reverse) - (map #(util/extend-coll % nil 2)) - (map #(make-markov % 2)) - (apply merge-markov) - (util/write-markov "dark-corpus-2.edn")) - - (def darkov-2 (util/read-markov "dark-corpus-2.edn")) - (def darkov-1 (util/read-markov "dark-corpus-1.edn")) - (get darkov-2 '(nil nil)) - (darkov-1 '("london")) - - (-main "dark-lyrics")) + (util/write-markov "resources/dark-corpus-2.edn"))) diff --git a/src/com/owoga/prhyme/core.clj b/src/com/owoga/prhyme/core.clj index 5a7c4d6..9fba6bf 100644 --- a/src/com/owoga/prhyme/core.clj +++ b/src/com/owoga/prhyme/core.clj @@ -25,6 +25,75 @@ (def nouns (set/intersection popular (set (line-seq (io/reader (io/resource "nouns.txt")))))) +(defrecord Word [word syllables syllable-count rimes onsets nuclei]) + +;;; Typical rhyme model (explanation of following 3 functions) +;; +;; In the typical theory of syllable structure, the general structure of a +;; syllable (σ) consists of three segments. These segments are grouped into two +;; components: +;; +;; Onset (ω) +;; a consonant or consonant cluster, obligatory in some languages, +;; optional or even restricted in others +;; +;; Rime (ρ) +;; right branch, contrasts with onset, splits into nucleus and coda +;; +;; Nucleus (ν) +;; a vowel or syllabic consonant, obligatory in most languages +;; Coda (κ) +;; consonant, optional in some languages, highly restricted or prohibited in others + +(defn rimes [syllables] + (->> syllables + (map reverse) + (map #(first (u/take-through u/vowel %))) + (map reverse))) + +(defn onset+nucleus [syllables] + (->> syllables + (map #(first (u/take-through u/vowel %))))) + +(defn nucleus [syllables] + (map #(list (last (first (u/take-through u/vowel %)))) syllables)) + +(defn make-word [word] + (let [syllables (s/syllabify (rest word)) + rimes (rimes syllables) + onsets (onset+nucleus syllables) + nuclei (nucleus syllables)] + (->> (->Word + (first word) + syllables + (count syllables) + rimes + onsets + nuclei) + (#(assoc % :norm-word (string/lower-case + (string/replace + (:word %) + #"\(\d+\)" + ""))))))) + +(defn make-word-1 [word phonemes] + (let [syllables (s/syllabify phonemes) + rimes (rimes syllables) + onsets (onset+nucleus syllables) + nuclei (nucleus syllables)] + (->> (->Word + (string/lower-case word) + syllables + (count syllables) + rimes + onsets + nuclei) + ;; CMU dict has multiple pronounciations for some words. + ;; foobar(1), foobar(2), etc... + ;; it's useful to have the normalized word for situations + ;; when you don't care how it's pronounced. + (#(assoc % :normalized-word (string/replace (:word %) #"\(\d+\)" "")))))) + (defn words-by-rime* [words] (let [words-with-rime (->> words (map rest) @@ -129,35 +198,6 @@ (defn filter-to-syllable-count [n words] (filter (fn [word] (= n (count (s/syllabify (rest word))))) words)) -(defn rimes [syllables] - (->> syllables - (map reverse) - (map #(first (u/take-through u/vowel %))) - (map reverse))) - -(defn rimes? [a b] - (cond - (and (= 1 (count (last (:rimes a)))) - (= 1 (count (last (:rimes b)))) - (or (= (last (:rimes a)) '("ER")) - (= (last (:rimes a)) '("AA")) - (= (last (:rimes a)) '("AE")) - (= (last (:rimes a)) '("AO")) - (= (last (:rimes a)) '("AW")) - (= (last (:rimes a)) '("EH")) - (= (last (:rimes a)) '("IH")) - (= (last (:rimes a)) '("UH")) - (= (last (:rimes a)) '("AH")))) - (= (list (first (take-last 2 (:nuclei a))) - (last (:onsets a))) - (list (first (take-last 2 (:nuclei b))) - (last (:onsets b)))) - - (and (= 1 (count (last (:rimes a)))) - (= 1 (count (last (:rimes b))))) - (= (take-last 2 (:nuclei a)) (take-last 2 (:nuclei b))) - - :else (= (last (:rimes a)) (last (:rimes b))))) (defn rhymes? "What does it mean for something to rhyme?" @@ -185,13 +225,6 @@ :else (= (last (:rimes a)) (last (:rimes b))))) -(defn onset+nucleus [syllables] - (->> syllables - (map #(first (u/take-through u/vowel %))))) - -(defn nucleus [syllables] - (map #(list (last (first (u/take-through u/vowel %)))) syllables)) - (defn rhyming-word "Simple lookup in data. Data is a tree of syllables to words. diff --git a/src/com/owoga/prhyme/frp.clj b/src/com/owoga/prhyme/frp.clj index e9b9a3e..9221181 100644 --- a/src/com/owoga/prhyme/frp.clj +++ b/src/com/owoga/prhyme/frp.clj @@ -1,6 +1,5 @@ (ns com.owoga.prhyme.frp (:require [clojure.java.io :as io] - [clojure.pprint :as pprint] [clojure.string :as string] [clojure.set :as set] [com.owoga.prhyme.core :as p] diff --git a/src/com/owoga/prhyme/generation/weighted_selection.clj b/src/com/owoga/prhyme/generation/weighted_selection.clj index 02c32d4..67149f1 100644 --- a/src/com/owoga/prhyme/generation/weighted_selection.clj +++ b/src/com/owoga/prhyme/generation/weighted_selection.clj @@ -81,30 +81,35 @@ the tail of a target." [percent] (fn [[words target result]] - (if (empty? result) - (let [words-with-rime-count - (map - (fn [word] - (assoc word :num-matching (if (prhyme/rhymes? target word) 1 0))) - words) - - [rhyming non-rhyming] - ((juxt filter remove) - #(< 0 (:num-matching %)) - words-with-rime-count) - - weight-non-rhyming (apply + (map :weight non-rhyming)) - target-weight-rhyming (* 100 percent weight-non-rhyming) - count-rhyming (count rhyming) - adjustment-rhyming (if (= 0 count-rhyming) 1 (/ target-weight-rhyming count-rhyming))] - [(concat + (let [words-with-rime-count (map (fn [word] - (as-> word word - (assoc word :weight (* adjustment-rhyming (:weight word))) - (assoc word :adjustment-for-rimes adjustment-rhyming))) - rhyming) - non-rhyming) - target - result]) + (assoc word :num-matching (if (prhyme/rhymes? target word) 1 0))) + words) + + [rhyming non-rhyming] + ((juxt filter remove) + #(< 0 (:num-matching %)) + words-with-rime-count) + + weight-non-rhyming (apply + (map :weight non-rhyming)) + target-weight-rhyming (* 100 percent weight-non-rhyming) + count-rhyming (count rhyming) + adjustment-rhyming (if (= 0 count-rhyming) 1 (/ target-weight-rhyming count-rhyming))] + [(concat + (map + (fn [word] + (as-> word word + (assoc word :weight (* adjustment-rhyming (:weight word))) + (assoc word :adjustment-for-rimes adjustment-rhyming))) + rhyming) + non-rhyming) + target + result]))) + +(defn adjust-for-tail-rhyme + [percent] + (fn [[words target result]] + (if (empty? result) + ((adjust-for-rhymes percent) [words target result]) [words target result]))) diff --git a/src/com/owoga/prhyme/lymeric.clj b/src/com/owoga/prhyme/lymeric.clj index 8717781..bab3f99 100644 --- a/src/com/owoga/prhyme/lymeric.clj +++ b/src/com/owoga/prhyme/lymeric.clj @@ -3,12 +3,13 @@ [com.owoga.prhyme.generation.weighted-selection :as weighted-selection] [clojure.string :as string] [com.owoga.prhyme.frp :as frp] - [com.owoga.corpus.darklyrics :as darklyrics])) + [taoensso.nippy :as nippy] + [clojure.java.io :as io])) (defn rhyme-from-scheme "scheme of format [[A 9] [A 9] [B 5] [B 5] [A 9]]" - [rhymer scheme] + [rhymer markov scheme] (let [base-words (map #(assoc % :weight 1) frp/popular)] (loop [scheme scheme rhymes {} @@ -25,7 +26,7 @@ (remove nil? [(weighted-selection/adjust-for-markov - darklyrics/darkov-2 + markov 0.99) (when (rhymes pattern) (weighted-selection/adjust-for-tail-rhyme 0.99))])) @@ -45,16 +46,12 @@ (conj result rhyme))))))) (comment - (rhyme-from-scheme nil '((A 8) (A 8) (B 5) (B 5) (A 8))) + (time (def darkov-2 (nippy/thaw-from-file (io/resource "darkov-2.bin")))) + (rhyme-from-scheme nil darkov-2 '((A 8) (A 8) (B 5) (B 5) (A 8))) + ) (comment - (->> (repeatedly - (fn [] - (rhyme-from-scheme nil '((A 7) (A 7) (B 5) (B 5) (A 7))))) - (take 2)) - - (apply map vector (list '(1 2 3) '(4 5 6))) (->> (gen/selection-seq (map #(assoc % :weight 1) frp/words) (weighted-selection/adjust-for-rhymes 0.99) @@ -86,3 +83,5 @@ "hate is my virtue" "my feelings are well overdue" "war we await the afterlife"]) + + diff --git a/src/com/owoga/prhyme/syllabify.clj b/src/com/owoga/prhyme/syllabify.clj index 6f378f6..e286ff4 100644 --- a/src/com/owoga/prhyme/syllabify.clj +++ b/src/com/owoga/prhyme/syllabify.clj @@ -58,8 +58,7 @@ (empty? onset) (recur (rest phones) [(first phones)]) (not (>sonorous (first phones) (last onset))) [onset phones] :else (recur (rest phones) (conj onset (first phones))))))) -(comment - (slurp-onset (reverse ["B" "W"]))) + (defn fix-lax "https://www.reddit.com/r/phonetics/comments/i7hp5f/what_is_the_alaska_rule_in_reference_to/ @@ -103,22 +102,35 @@ (comment (syllabify ["AH" "L" "AE" "S" "K" "AH"]) + ;; => (("AH") ("L" "AE" "S") ("K" "AH")) (syllabify ["H" "ER" "AH" "L" "D"]) + ;; => (("H" "ER") ("AH" "L" "D")) (syllabify ["H" "EH" "R" "AH" "L" "D"]) + ;; => (("H" "EH") ("R" "AH" "L" "D")) (syllabify ["B" "OY" "N" "K"]) - (syllabify ["H" "ER" "AH" "L" "D"]) + ;; => (("B" "OY" "N" "K")) (syllabify ["G" "L" "IH" "M" "P" "S" "T"]) + ;; => (("G" "L" "IH" "M" "P" "S" "T")) (syllabify ["B" "IY" "G" "L" "IH" "M" "P" "S" "T"]) + ;; => (("B" "IY") ("G" "L" "IH" "M" "P" "S" "T")) (syllabify ["G" "L" "IH" "M" "P" "S" "T" "R" "EH" "D"]) + ;; => (("G" "L" "IH" "M" "P" "S") ("T" "R" "EH" "D")) (syllabify ["UH" "P" "R" "AY" "S" "IY" "NG"]) + ;; => (("UH") ("P" "R" "AY") ("S" "IY" "NG")) (syllabify ["UH" "L" "AE" "S" "K" "UH"]) + ;; => (("UH") ("L" "AE" "S") ("K" "UH")) (syllabify ["R" "OY" "AH" "L"]) + ;; => (("R" "OY") ("AH" "L")) (syllabify ["R" "AY" "AH" "L"]) + ;; => (("R" "AY") ("AH" "L")) + ;; TODO: Fix below wi-thcheeze (syllabify ["R" "OY" "AH" "L" "W" "IH" "TH" "CH" "IY" "Z"]) + ;; => (("R" "OY") ("AH" "L") ("W" "IH") ("TH" "CH" "IY" "Z")) + ;; + ;; ["GLIMPSED" "G" "L" "IH" "M" "P" "S" "T"] + ;; ["BEGLIMPSED" "B" "IY" "G" "L" "IH" "M" "P" "S" "T"] + ;; ["BEGLIMPSED" "B" "EH" "G" "L" "IH" "M" "P" "S" "T"] + ;; ["GLIMSTEST" "G" "L" "IH" "M" "S" "T" "EH" "S" "T"] + ;; ["GLIMPSTRED" "G" "L" "IH" "M" "P" "S" "T" "R" "EH" "D"] + ;; ["GLIMSTRED" "G" "L" "IH" "M" "S" "T" "R" "EH" "D"] ) - ;; ["GLIMPSED" "G" "L" "IH" "M" "P" "S" "T"] - ;; ["BEGLIMPSED" "B" "IY" "G" "L" "IH" "M" "P" "S" "T"] - ;; ["BEGLIMPSED" "B" "EH" "G" "L" "IH" "M" "P" "S" "T"] - ;; ["GLIMSTEST" "G" "L" "IH" "M" "S" "T" "EH" "S" "T"] - ;; ["GLIMPSTRED" "G" "L" "IH" "M" "P" "S" "T" "R" "EH" "D"] - ;; ["GLIMSTRED" "G" "L" "IH" "M" "S" "T" "R" "EH" "D"]