From 7a4f22970e800ba0f3d24b8263a7447af0284711 Mon Sep 17 00:00:00 2001 From: Eric Ihli Date: Fri, 23 Oct 2020 08:16:29 -0700 Subject: [PATCH] Better markov/corpus handling --- src/com/owoga/corpus/darklyrics.clj | 74 ++++++++++++------- src/com/owoga/corpus/markov.clj | 80 +++++++++++++++++++++ src/com/owoga/prhyme/gen.clj | 108 ++++++++++++++++++++++++---- src/com/owoga/prhyme/util.clj | 22 ++++++ 4 files changed, 247 insertions(+), 37 deletions(-) create mode 100644 src/com/owoga/corpus/markov.clj diff --git a/src/com/owoga/corpus/darklyrics.clj b/src/com/owoga/corpus/darklyrics.clj index ec932d6..991de9d 100644 --- a/src/com/owoga/corpus/darklyrics.clj +++ b/src/com/owoga/corpus/darklyrics.clj @@ -10,8 +10,16 @@ (def base-url "http://www.darklyrics.com/a.html") (def data-dir "dark-corpus") +(defn fix-url [url] + (string/replace url #".*(http://.*(?!http://).*$)" "$1")) + (defn fetch-url- [url] - (html/html-resource (java.net.URL. url))) + (let [url (fix-url url)] + (try + (html/html-resource (java.net.URL. url)) + (catch Exception e + (prn "Exception during fetch " e) + {})))) (def fetch-url (memoize fetch-url-)) @@ -70,33 +78,44 @@ (let [words (string/split text #"\s+") english-words (->> words (filter #(util/words-map (string/lower-case %))))] - (< 0.5 (/ (count english-words) (count words))))) + (< 0.7 (/ (count english-words) (count words))))) (defn scrape ([base-url] - (scrape (parse-letters-urls (fetch-url base-url)) '() '())) + (scrape (drop 3 (parse-letters-urls (fetch-url base-url))) '() '())) ([letters-urls artists-urls [artist-name albums-urls]] (cond (not-empty albums-urls) - (let [album-html (fetch-url (first albums-urls)) - album-name (->> (html/select album-html [:div.albumlyrics :h2]) - (map html/text) - first - (#(string/replace % #"(album: |\s+)" " ")) - (string/trim))] - (cons [artist-name album-name (parse-album-songs album-html)] - (lazy-seq (scrape letters-urls artists-urls [artist-name (rest albums-urls)])))) + (try + (let [album-html (fetch-url (first albums-urls)) + album-name (->> (html/select album-html [:div.albumlyrics :h2]) + (map html/text) + first + (#(string/replace % #"(album: |\s+)" " ")) + (string/trim))] + (cons [artist-name album-name (parse-album-songs album-html)] + (lazy-seq (scrape letters-urls artists-urls [artist-name (rest albums-urls)])))) + (catch Exception e + (prn "album exception" e) + (scrape letters-urls artists-urls [artist-name (rest albums-urls)]))) (not-empty artists-urls) - (let [artist-html (fetch-url (first artists-urls)) - artist-name (->> (html/select artist-html [:h1]) - (map html/text) - first - (#(string/replace % #" LYRICS" "")))] - (scrape - letters-urls - (rest artists-urls) - [artist-name (parse-artists-albums artist-html)])) + (try + (let [artist-html (fetch-url (first artists-urls)) + artist-name (->> (html/select artist-html [:h1]) + (map html/text) + first + (#(string/replace % #" LYRICS" "")))] + (scrape + letters-urls + (rest artists-urls) + [artist-name (parse-artists-albums artist-html)])) + (catch Exception e + (prn "artist exception" e) + (scrape + letters-urls + (rest artists-urls) + ["unknown" '()]))) (not-empty letters-urls) (scrape @@ -140,18 +159,23 @@ (spit file lyrics))) songs)) -(defn do-scrape [] +(defn -main [] (let [artist-album-texts (scrape base-url)] (run! (fn [x] - (Thread/sleep (rand 3000)) - (println (str "Writing songs for " (second x))) - (write-scrape x)) + (try + (println (str "Writing songs for " (second x))) + (write-scrape x) + (catch Exception e + (prn "Exception: " e)))) artist-album-texts))) (comment + + (def darkov-2 (util/read-markov "dark-corpus-2.edn")) + (get darkov-2 '(nil nil)) (take 3 (scrape base-url)) - (do-scrape) + (-main) (def letters-urls (parse-letters-urls (fetch-url base-url))) (def artists-urls (parse-artists-urls (fetch-url (first letters-urls)))) (def artist-html (fetch-url (first artists-urls))) diff --git a/src/com/owoga/corpus/markov.clj b/src/com/owoga/corpus/markov.clj new file mode 100644 index 0000000..f8b57b4 --- /dev/null +++ b/src/com/owoga/corpus/markov.clj @@ -0,0 +1,80 @@ +(ns com.owoga.corpus.markov + (:require [com.owoga.prhyme.util :as util] + [clojure.string :as string] + [clojure.java.io :as io])) + +(defn clean-text [text] + (string/lower-case (string/replace text #"[^a-zA-Z'\-\s]" ""))) + +(defn make-markov [tokens n] + (reduce + (fn [a w] + (let [k (butlast w) + v (last w)] + (update-in a [k v] (fnil inc 0)))) + {} + ((util/window (inc n)) tokens))) + +(defn -main [directory] + (let [directory (io/file "dark-corpus") + files (file-seq directory) + lines (->> files + (remove #(.isDirectory %)) + (map #(slurp %)) + (map #(string/split % #"\n+")) + #_(map #(util/extend-coll % nil 1))) + markovs (->> lines + (map #(make-markov % 1)))] + (take 1 markovs))) + +(merge-with + (fn [a b] + (+ (if (nil? a) 0 a) + b)) + {:foo 1} + {:bar 2 :foo 3}) + +(defn merge-markov [& maps] + (apply + merge-with + (fn [a-possibilities b-possibilities] + (apply + merge-with + (fn [a b] + ((fnil + 0) a b)) + a-possibilities + b-possibilities)) + maps)) + +(comment + (merge-markov + {'("away") {"her" 1 + "foo" 7}} + {'("away") {"her" 2 + "them" 1 + "bar" 8}} + {'("away") {"her" 10 + "them" 50 + "baz" 99}})) + +(comment + (->> (file-seq (io/file "dark-corpus")) + (remove #(.isDirectory %)) + (map #(slurp %)) + (map clean-text) + (filter util/english?) + (map #(string/split % #"\n+")) + (flatten) + (map #(string/split % #"\s+")) + (map reverse) + (map #(util/extend-coll % nil 2)) + (map #(make-markov % 2)) + (apply merge-markov) + (util/write-markov "dark-corpus-2.edn")) + + (def darkov-2 (util/read-markov "dark-corpus-2.edn")) + (def darkov-1 (util/read-markov "dark-corpus-1.edn")) + (get darkov-2 '(nil nil)) + (darkov-1 '("london")) + + (-main "dark-lyrics")) diff --git a/src/com/owoga/prhyme/gen.clj b/src/com/owoga/prhyme/gen.clj index 7731ad0..268889b 100644 --- a/src/com/owoga/prhyme/gen.clj +++ b/src/com/owoga/prhyme/gen.clj @@ -86,6 +86,66 @@ result]))) [words target result])))) +(defn adjust-for-markov-with-boundaries + [markov percent] + (let [markov-n (count (first (first markov)))] + (fn [[words target result]] + (let [key (let [k (map :norm-word (take markov-n result))] + (reverse + (if (> markov-n (count k)) + (concat k (repeat (- markov-n (count k)) nil)) + k))) + markov-options (markov key) + markov-option-avg (/ (apply + (vals markov-options)) + (max 1 (count markov-options)))] + (if (nil? markov-options) + [words target result] + (let [[markovs non-markovs] + ((juxt filter remove) + #(markov-options (:norm-word %)) + words) + weight-non-markovs (apply + (map :weight non-markovs)) + target-weight-markovs (- (/ weight-non-markovs (- 1 percent)) + weight-non-markovs) + count-markovs (count markovs) + adjustment-markovs (if (= 0 count-markovs) 1 (/ target-weight-markovs count-markovs))] + [(concat + (map + (fn [m] + (let [option (markov-options (:norm-word m))] + (as-> m m + (assoc m :weight (* (/ option markov-option-avg) adjustment-markovs (:weight m))) + (assoc m :adjustment-for-markov (* (/ option markov-option-avg) adjustment-markovs))))) + markovs) + non-markovs) + target + result])))))) + +(comment + (let [markov-1-example + {'("dream") {"a" 1} + '("a") {"me" 1}} + markov-2-example + {'(nil nil) {"dream" 1} + '(nil "dream") {"a" 1} + '("dream" "a") {"me" 1} + '("a" "me") {"give" 1} + '("give" nil) {nil 1}} + result-a '() + result-b '({:norm-word "dream", + :weight 9.000000000000002, + :adjustment-for-markov 9.000000000000002}) + words [{:norm-word "dream" :weight 1} + {:norm-word "foo" :weight 1} + {:norm-word "a" :weight 1} + {:norm-word "me" :weight 1} + {:norm-word "give" :weight 1}] + adj (adjust-for-markov-with-boundaries markov-2-example 0.9)] + (adj [words 'target result-b])) + + ((adjust-for-markov-with-boundaries {'("foo" "bar") {}} 0.5) + ['() '() '("hi" "bye" "there")])) + (defn adjust-for-rimes [dictionary percent] (fn [[words target result]] @@ -223,10 +283,7 @@ target (phrase->word words phrase)] (prhymer words adjust target (syllable-stop target)))) -(def adj (comp (adjust-for-markov dr/darkov 0.25) - (adjust-for-markov dr/darkov-2 0.9) - (adjust-for-tail-rimes words-map 0.9))) -(defn generate-prhymes [poem] +#_(defn generate-prhymes [poem] (let [r (partial generate-rhyme-for-phrase frp/popular adj)] (fn [] (->> poem @@ -243,15 +300,42 @@ (map (fn [line] (map #(:norm-word %) line))) (map #(string/join " " %)))))) +(defn generate-prhymes-darkov [words adj phrase] + (let [target (phrase->word words phrase) + r (generate-rhyme-for-phrase words adj target)] + (first + (filter + #(and + (or (< 0.9 (rand)) + (nlp/valid-sentence? (string/join " " (map :norm-word %)))) + (= (:syllable-count target) + (apply + (map :syllable-count %)))) + r)) + (map (fn [line] (map #(:norm-word %) line))) + (map #(string/join " " %)))) + (comment - (apply map vector - (->> ["mister sandman give me a dream" - "make him the cutest that i've ever seen" - "give him two lips like roses in clover" - "then tell him that his lonesome nights are over"] - (generate-prhymes) - (repeatedly) - (take 5))) + (let [adj (comp (adjust-for-markov-with-boundaries dr/darkov-2 0.9) + (adjust-for-tail-rimes words-map 0.9))] + (->> (generate-rhyme-for-phrase frp/popular adj "make him the cutest that i've ever seen") + (take 20) + (map #(map :norm-word %)) + (map #(string/join " " %)))) + + (let [adj (comp (adjust-for-markov-with-boundaries dr/darkov-2 0.9) + (adjust-for-tail-rimes words-map 0.9))] + (->> (generate-rhyme-for-phrase frp/popular adj "mister sandman give me a dream") + (take 20) + (map #(map :norm-word %)) + (map #(string/join " " %)))) + + (let [adj (adjust-for-markov-with-boundaries dr/darkov-2 0.9)] + (apply map vector + (->> ["mister sandman give me a dream" + "make him the cutest that i've ever seen" + "give him two lips like roses in clover" + "then tell him that his lonesome nights are over"] + (map #(generate-prhymes-darkov util/popular adj %))))) (apply map vector (->> ["taylor is my star" "she brightens my day"] diff --git a/src/com/owoga/prhyme/util.clj b/src/com/owoga/prhyme/util.clj index 95c2246..0abda38 100644 --- a/src/com/owoga/prhyme/util.clj +++ b/src/com/owoga/prhyme/util.clj @@ -73,6 +73,19 @@ (def single-sound-bigram #{"TH" "SH" "PH" "WH" "CH"}) (defn window [n] + (fn [coll] + (cond + (empty? coll) [] + (< (count coll) n) [] + :else (cons (take n coll) + (lazy-seq ((window n) (rest coll))))))) + +(defn extend-coll [coll val n] + (concat (repeat n val) + coll + (repeat n val))) + +(defn window-with-nil [n] (fn [coll] (cond (empty? coll) [] @@ -83,6 +96,15 @@ (defn clean-text [text] (string/lower-case (string/replace text #"[^a-zA-Z'\-\s]" ""))) +(defn english? [text] + (let [words (string/split text #"\s+") + english-words + (->> words (filter #(words-map (string/lower-case %))))] + (< 0.7 (/ (count english-words) (max 1 (count words)))))) + +(defn padr [val n coll] + (concat coll (repeat n val))) + (defn make-markov [text] (let [tokens (reverse (string/split (clean-text text) #"\s+"))] (reduce