Cleanup

5 years ago · 94f32d0a2f
parent c2d3882854
commit 94f32d0a2f
7 changed files with 136 additions and 135 deletions
--- a/src/com/owoga/corpus/darklyrics.clj
+++ b/src/com/owoga/corpus/darklyrics.clj
@ -139,7 +139,8 @@
     {}
     ((util/window (inc n)) tokens))))

-(def darkov-2 (util/read-markov "dark-corpus-2.edn"))
+(defn read-darkov-2 []
+  (util/read-markov (io/resource "dark-corpus-2.edn")))

 (defn norm-filepath [text]
  (-> text
@ -167,7 +168,8 @@
     artist-album-texts)))

 (comment
-  (def darkov-2 (util/read-markov "dark-corpus-2.edn"))
+  (def darkov-2 (util/read-markov (io/resource "dark-corpus-2.edn")))
+  (take 10 darkov-2)
  (get darkov-2 '(nil nil))
  (take 3 (scrape base-url))
  (-main)
@ -200,11 +202,4 @@
          (make-markov (slurp "darklyrics.txt") 1))))
  (run! write-scrape (take 4 (scrape base-url)))

-  (take 100 darkov)
-  (util/write-markov "darklyrics.edn" darkov)
-  (spit "test.txt" (pr-str {:foo "1"}))
-  (def lyrics (scrape base-url))
-  (with-open [writer (io/writer "darklyrics.txt")]
-    (run!
-     #(.write writer %)
-     (take 200 (filter english? lyrics)))))
+  (def lyrics (scrape base-url)))
--- a/src/com/owoga/corpus/markov.clj
+++ b/src/com/owoga/corpus/markov.clj
@ -15,25 +15,6 @@
   {}
   ((util/window (inc n)) tokens)))

-(defn -main [directory]
-  (let [directory (io/file "dark-corpus")
-        files (file-seq directory)
-        lines (->> files
-                   (remove #(.isDirectory %))
-                   (map #(slurp %))
-                   (map #(string/split % #"\n+"))
-                   #_(map #(util/extend-coll % nil 1)))
-        markovs (->> lines
-                     (map #(make-markov % 1)))]
-    (take 1 markovs)))
-
-(merge-with
- (fn [a b]
-   (+ (if (nil? a) 0 a)
-      b))
- {:foo 1}
- {:bar 2 :foo 3})
-
 (defn merge-markov [& maps]
  (apply
   merge-with
@ -56,8 +37,8 @@
               "them" 50
               "baz" 99}}))

-(defn gen-markov []
-  (->> (file-seq (io/file "dark-corpus"))
+(defn gen-markov [directory]
+  (->> (file-seq (io/file directory))
       (remove #(.isDirectory %))
       (map #(slurp %))
       (map clean-text)
@ -69,27 +50,4 @@
       (map #(util/extend-coll % nil 2))
       (map #(make-markov % 2))
       (apply merge-markov)
-       (util/write-markov "dark-corpus-2.edn")))
-
-(comment
-  (gen-markov)
-  (->> (file-seq (io/file "dark-corpus"))
-       (remove #(.isDirectory %))
-       (map #(slurp %))
-       (map clean-text)
-       (filter util/english?)
-       (map #(string/split % #"\n+"))
-       (flatten)
-       (map #(string/split % #"\s+"))
-       (map reverse)
-       (map #(util/extend-coll % nil 2))
-       (map #(make-markov % 2))
-       (apply merge-markov)
-       (util/write-markov "dark-corpus-2.edn"))
-
-  (def darkov-2 (util/read-markov "dark-corpus-2.edn"))
-  (def darkov-1 (util/read-markov "dark-corpus-1.edn"))
-  (get darkov-2 '(nil nil))
-  (darkov-1 '("london"))
-
-  (-main "dark-lyrics"))
+       (util/write-markov "resources/dark-corpus-2.edn")))
--- a/src/com/owoga/prhyme/core.clj
+++ b/src/com/owoga/prhyme/core.clj
@ -25,6 +25,75 @@
 (def nouns
  (set/intersection popular (set (line-seq (io/reader (io/resource "nouns.txt"))))))

+(defrecord Word [word syllables syllable-count rimes onsets nuclei])
+
+;;; Typical rhyme model (explanation of following 3 functions)
+;;
+;; In the typical theory of syllable structure, the general structure of a
+;; syllable (σ) consists of three segments. These segments are grouped into two
+;; components:
+;;
+;; Onset (ω)
+;;     a consonant or consonant cluster, obligatory in some languages,
+;;     optional or even restricted in others
+;;    
+;; Rime (ρ)
+;;     right branch, contrasts with onset, splits into nucleus and coda
+;;
+;;     Nucleus (ν)
+;;         a vowel or syllabic consonant, obligatory in most languages
+;;     Coda (κ)
+;;         consonant, optional in some languages, highly restricted or prohibited in others
+
+(defn rimes [syllables]
+  (->> syllables
+       (map reverse)
+       (map #(first (u/take-through u/vowel %)))
+       (map reverse)))
+
+(defn onset+nucleus [syllables]
+  (->> syllables
+       (map #(first (u/take-through u/vowel %)))))
+
+(defn nucleus [syllables]
+  (map #(list (last (first (u/take-through u/vowel %)))) syllables))
+
+(defn make-word [word]
+  (let [syllables (s/syllabify (rest word))
+        rimes     (rimes syllables)
+        onsets    (onset+nucleus syllables)
+        nuclei    (nucleus syllables)]
+    (->> (->Word
+          (first word)
+          syllables
+          (count syllables)
+          rimes
+          onsets
+          nuclei)
+         (#(assoc % :norm-word (string/lower-case
+                                (string/replace
+                                 (:word %)
+                                 #"\(\d+\)"
+                                 "")))))))
+
+(defn make-word-1 [word phonemes]
+  (let [syllables (s/syllabify phonemes)
+        rimes     (rimes syllables)
+        onsets    (onset+nucleus syllables)
+        nuclei    (nucleus syllables)]
+    (->> (->Word
+          (string/lower-case word)
+          syllables
+          (count syllables)
+          rimes
+          onsets
+          nuclei)
+         ;; CMU dict has multiple pronounciations for some words.
+         ;; foobar(1), foobar(2), etc...
+         ;; it's useful to have the normalized word for situations
+         ;; when you don't care how it's pronounced.
+         (#(assoc % :normalized-word (string/replace (:word %) #"\(\d+\)" ""))))))
+
 (defn words-by-rime* [words]
  (let [words-with-rime (->> words
                             (map rest)
@ -129,35 +198,6 @@
 (defn filter-to-syllable-count [n words]
  (filter (fn [word] (= n (count (s/syllabify (rest word))))) words))

-(defn rimes [syllables]
-  (->> syllables
-       (map reverse)
-       (map #(first (u/take-through u/vowel %)))
-       (map reverse)))
-
-(defn rimes? [a b]
-  (cond
-    (and (= 1 (count (last (:rimes a))))
-         (= 1 (count (last (:rimes b))))
-         (or (= (last (:rimes a)) '("ER"))
-             (= (last (:rimes a)) '("AA"))
-             (= (last (:rimes a)) '("AE"))
-             (= (last (:rimes a)) '("AO"))
-             (= (last (:rimes a)) '("AW"))
-             (= (last (:rimes a)) '("EH"))
-             (= (last (:rimes a)) '("IH"))
-             (= (last (:rimes a)) '("UH"))
-             (= (last (:rimes a)) '("AH"))))
-    (= (list (first (take-last 2 (:nuclei a)))
-             (last (:onsets a)))
-       (list (first (take-last 2 (:nuclei b)))
-             (last (:onsets b))))
-
-    (and (= 1 (count (last (:rimes a))))
-         (= 1 (count (last (:rimes b)))))
-    (= (take-last 2 (:nuclei a)) (take-last 2 (:nuclei b)))
-
-    :else (= (last (:rimes a)) (last (:rimes b)))))

 (defn rhymes?
  "What does it mean for something to rhyme?"
@ -185,13 +225,6 @@

    :else (= (last (:rimes a)) (last (:rimes b)))))

-(defn onset+nucleus [syllables]
-  (->> syllables
-       (map #(first (u/take-through u/vowel %)))))
-
-(defn nucleus [syllables]
-  (map #(list (last (first (u/take-through u/vowel %)))) syllables))
-
 (defn rhyming-word
  "Simple lookup in data.
  Data is a tree of syllables to words.
--- a/src/com/owoga/prhyme/frp.clj
+++ b/src/com/owoga/prhyme/frp.clj
@ -1,6 +1,5 @@
 (ns com.owoga.prhyme.frp
  (:require [clojure.java.io :as io]
-            [clojure.pprint :as pprint]
            [clojure.string :as string]
            [clojure.set :as set]
            [com.owoga.prhyme.core :as p]
--- a/src/com/owoga/prhyme/generation/weighted_selection.clj
+++ b/src/com/owoga/prhyme/generation/weighted_selection.clj
@ -81,30 +81,35 @@
  the tail of a target."
  [percent]
  (fn [[words target result]]
-    (if (empty? result)
-      (let [words-with-rime-count
-            (map
-             (fn [word]
-               (assoc word :num-matching (if (prhyme/rhymes? target word) 1 0)))
-             words)
-
-            [rhyming non-rhyming]
-            ((juxt filter remove)
-             #(< 0 (:num-matching %))
-             words-with-rime-count)
-
-            weight-non-rhyming (apply + (map :weight non-rhyming))
-            target-weight-rhyming (* 100 percent weight-non-rhyming)
-            count-rhyming (count rhyming)
-            adjustment-rhyming (if (= 0 count-rhyming) 1 (/ target-weight-rhyming count-rhyming))]
-        [(concat
+    (let [words-with-rime-count
          (map
           (fn [word]
-             (as-> word word
-               (assoc word :weight (* adjustment-rhyming (:weight word)))
-               (assoc word :adjustment-for-rimes adjustment-rhyming)))
-           rhyming)
-          non-rhyming)
-         target
-         result])
+             (assoc word :num-matching (if (prhyme/rhymes? target word) 1 0)))
+           words)
+
+          [rhyming non-rhyming]
+          ((juxt filter remove)
+           #(< 0 (:num-matching %))
+           words-with-rime-count)
+
+          weight-non-rhyming (apply + (map :weight non-rhyming))
+          target-weight-rhyming (* 100 percent weight-non-rhyming)
+          count-rhyming (count rhyming)
+          adjustment-rhyming (if (= 0 count-rhyming) 1 (/ target-weight-rhyming count-rhyming))]
+      [(concat
+        (map
+         (fn [word]
+           (as-> word word
+             (assoc word :weight (* adjustment-rhyming (:weight word)))
+             (assoc word :adjustment-for-rimes adjustment-rhyming)))
+         rhyming)
+        non-rhyming)
+       target
+       result])))
+
+(defn adjust-for-tail-rhyme
+  [percent]
+  (fn [[words target result]]
+    (if (empty? result)
+      ((adjust-for-rhymes percent) [words target result])
      [words target result])))
--- a/src/com/owoga/prhyme/lymeric.clj
+++ b/src/com/owoga/prhyme/lymeric.clj
@ -3,12 +3,13 @@
            [com.owoga.prhyme.generation.weighted-selection :as weighted-selection]
            [clojure.string :as string]
            [com.owoga.prhyme.frp :as frp]
-            [com.owoga.corpus.darklyrics :as darklyrics]))
+            [taoensso.nippy :as nippy]
+            [clojure.java.io :as io]))


 (defn rhyme-from-scheme
  "scheme of format [[A 9] [A 9] [B 5] [B 5] [A 9]]"
-  [rhymer scheme]
+  [rhymer markov scheme]
  (let [base-words (map #(assoc % :weight 1) frp/popular)]
    (loop [scheme scheme
           rhymes {}
@ -25,7 +26,7 @@
                   (remove
                    nil?
                    [(weighted-selection/adjust-for-markov
-                      darklyrics/darkov-2
+                      markov
                      0.99)
                     (when (rhymes pattern)
                       (weighted-selection/adjust-for-tail-rhyme 0.99))]))
@ -45,16 +46,12 @@
                 (conj result rhyme)))))))

 (comment
-  (rhyme-from-scheme nil '((A 8) (A 8) (B 5) (B 5) (A 8)))
+  (time (def darkov-2 (nippy/thaw-from-file (io/resource "darkov-2.bin"))))
+  (rhyme-from-scheme nil darkov-2 '((A 8) (A 8) (B 5) (B 5) (A 8)))
+
  )

 (comment
-  (->> (repeatedly
-        (fn []
-          (rhyme-from-scheme nil '((A 7) (A 7) (B 5) (B 5) (A 7)))))
-       (take 2))
-
-  (apply map vector (list '(1 2 3) '(4 5 6)))
  (->> (gen/selection-seq
        (map #(assoc % :weight 1) frp/words)
        (weighted-selection/adjust-for-rhymes 0.99)
@ -86,3 +83,5 @@
   "hate is my virtue"
   "my feelings are well overdue"
   "war we await the afterlife"])
+
+
--- a/src/com/owoga/prhyme/syllabify.clj
+++ b/src/com/owoga/prhyme/syllabify.clj
@ -58,8 +58,7 @@
       (empty? onset) (recur (rest phones) [(first phones)])
       (not (>sonorous (first phones) (last onset))) [onset phones]
       :else (recur (rest phones) (conj onset (first phones)))))))
-(comment
-  (slurp-onset (reverse ["B" "W"])))
+
 (defn fix-lax
  "https://www.reddit.com/r/phonetics/comments/i7hp5f/what_is_the_alaska_rule_in_reference_to/

@ -103,22 +102,35 @@

 (comment
  (syllabify ["AH" "L" "AE" "S" "K" "AH"])
+  ;; => (("AH") ("L" "AE" "S") ("K" "AH"))
  (syllabify ["H" "ER" "AH" "L" "D"])
+  ;; => (("H" "ER") ("AH" "L" "D"))
  (syllabify ["H" "EH" "R" "AH" "L" "D"])
+  ;; => (("H" "EH") ("R" "AH" "L" "D"))
  (syllabify ["B" "OY" "N" "K"])
-  (syllabify ["H" "ER" "AH" "L" "D"])
+  ;; => (("B" "OY" "N" "K"))
  (syllabify ["G" "L" "IH" "M" "P" "S" "T"])
+  ;; => (("G" "L" "IH" "M" "P" "S" "T"))
  (syllabify ["B" "IY" "G" "L" "IH" "M" "P" "S" "T"])
+  ;; => (("B" "IY") ("G" "L" "IH" "M" "P" "S" "T"))
  (syllabify ["G" "L" "IH" "M" "P" "S" "T" "R" "EH" "D"])
+  ;; => (("G" "L" "IH" "M" "P" "S") ("T" "R" "EH" "D"))
  (syllabify ["UH" "P" "R" "AY" "S" "IY" "NG"])
+  ;; => (("UH") ("P" "R" "AY") ("S" "IY" "NG"))
  (syllabify ["UH" "L" "AE" "S" "K" "UH"])
+  ;; => (("UH") ("L" "AE" "S") ("K" "UH"))
  (syllabify ["R" "OY" "AH" "L"])
+  ;; => (("R" "OY") ("AH" "L"))
  (syllabify ["R" "AY" "AH" "L"])
+  ;; => (("R" "AY") ("AH" "L"))
+  ;; TODO: Fix below wi-thcheeze
  (syllabify ["R" "OY" "AH" "L" "W" "IH" "TH" "CH" "IY" "Z"])
+  ;; => (("R" "OY") ("AH" "L") ("W" "IH") ("TH" "CH" "IY" "Z"))
+  ;;
+  ;; ["GLIMPSED" "G" "L" "IH" "M" "P" "S" "T"]
+  ;; ["BEGLIMPSED" "B" "IY" "G" "L" "IH" "M" "P" "S" "T"]
+  ;; ["BEGLIMPSED" "B" "EH" "G" "L" "IH" "M" "P" "S" "T"]
+  ;; ["GLIMSTEST" "G" "L" "IH" "M" "S" "T" "EH" "S" "T"]
+  ;; ["GLIMPSTRED" "G" "L" "IH" "M" "P" "S" "T" "R" "EH" "D"]
+  ;; ["GLIMSTRED" "G" "L" "IH" "M" "S" "T" "R" "EH" "D"]
  )
- ;; ["GLIMPSED" "G" "L" "IH" "M" "P" "S" "T"]
- ;; ["BEGLIMPSED" "B" "IY" "G" "L" "IH" "M" "P" "S" "T"]
- ;; ["BEGLIMPSED" "B" "EH" "G" "L" "IH" "M" "P" "S" "T"]
- ;; ["GLIMSTEST" "G" "L" "IH" "M" "S" "T" "EH" "S" "T"]
- ;; ["GLIMPSTRED" "G" "L" "IH" "M" "P" "S" "T" "R" "EH" "D"]
- ;; ["GLIMSTRED" "G" "L" "IH" "M" "S" "T" "R" "EH" "D"]