Better markov/corpus handling

main
Eric Ihli 4 years ago
parent 3809ab1f09
commit 7a4f22970e

@ -10,8 +10,16 @@
(def base-url "http://www.darklyrics.com/a.html") (def base-url "http://www.darklyrics.com/a.html")
(def data-dir "dark-corpus") (def data-dir "dark-corpus")
(defn fix-url [url]
(string/replace url #".*(http://.*(?!http://).*$)" "$1"))
(defn fetch-url- [url] (defn fetch-url- [url]
(html/html-resource (java.net.URL. url))) (let [url (fix-url url)]
(try
(html/html-resource (java.net.URL. url))
(catch Exception e
(prn "Exception during fetch " e)
{}))))
(def fetch-url (memoize fetch-url-)) (def fetch-url (memoize fetch-url-))
@ -70,33 +78,44 @@
(let [words (string/split text #"\s+") (let [words (string/split text #"\s+")
english-words english-words
(->> words (filter #(util/words-map (string/lower-case %))))] (->> words (filter #(util/words-map (string/lower-case %))))]
(< 0.5 (/ (count english-words) (count words))))) (< 0.7 (/ (count english-words) (count words)))))
(defn scrape (defn scrape
([base-url] ([base-url]
(scrape (parse-letters-urls (fetch-url base-url)) '() '())) (scrape (drop 3 (parse-letters-urls (fetch-url base-url))) '() '()))
([letters-urls artists-urls [artist-name albums-urls]] ([letters-urls artists-urls [artist-name albums-urls]]
(cond (cond
(not-empty albums-urls) (not-empty albums-urls)
(let [album-html (fetch-url (first albums-urls)) (try
album-name (->> (html/select album-html [:div.albumlyrics :h2]) (let [album-html (fetch-url (first albums-urls))
(map html/text) album-name (->> (html/select album-html [:div.albumlyrics :h2])
first (map html/text)
(#(string/replace % #"(album: |\s+)" " ")) first
(string/trim))] (#(string/replace % #"(album: |\s+)" " "))
(cons [artist-name album-name (parse-album-songs album-html)] (string/trim))]
(lazy-seq (scrape letters-urls artists-urls [artist-name (rest albums-urls)])))) (cons [artist-name album-name (parse-album-songs album-html)]
(lazy-seq (scrape letters-urls artists-urls [artist-name (rest albums-urls)]))))
(catch Exception e
(prn "album exception" e)
(scrape letters-urls artists-urls [artist-name (rest albums-urls)])))
(not-empty artists-urls) (not-empty artists-urls)
(let [artist-html (fetch-url (first artists-urls)) (try
artist-name (->> (html/select artist-html [:h1]) (let [artist-html (fetch-url (first artists-urls))
(map html/text) artist-name (->> (html/select artist-html [:h1])
first (map html/text)
(#(string/replace % #" LYRICS" "")))] first
(scrape (#(string/replace % #" LYRICS" "")))]
letters-urls (scrape
(rest artists-urls) letters-urls
[artist-name (parse-artists-albums artist-html)])) (rest artists-urls)
[artist-name (parse-artists-albums artist-html)]))
(catch Exception e
(prn "artist exception" e)
(scrape
letters-urls
(rest artists-urls)
["unknown" '()])))
(not-empty letters-urls) (not-empty letters-urls)
(scrape (scrape
@ -140,18 +159,23 @@
(spit file lyrics))) (spit file lyrics)))
songs)) songs))
(defn do-scrape [] (defn -main []
(let [artist-album-texts (scrape base-url)] (let [artist-album-texts (scrape base-url)]
(run! (run!
(fn [x] (fn [x]
(Thread/sleep (rand 3000)) (try
(println (str "Writing songs for " (second x))) (println (str "Writing songs for " (second x)))
(write-scrape x)) (write-scrape x)
(catch Exception e
(prn "Exception: " e))))
artist-album-texts))) artist-album-texts)))
(comment (comment
(def darkov-2 (util/read-markov "dark-corpus-2.edn"))
(get darkov-2 '(nil nil))
(take 3 (scrape base-url)) (take 3 (scrape base-url))
(do-scrape) (-main)
(def letters-urls (parse-letters-urls (fetch-url base-url))) (def letters-urls (parse-letters-urls (fetch-url base-url)))
(def artists-urls (parse-artists-urls (fetch-url (first letters-urls)))) (def artists-urls (parse-artists-urls (fetch-url (first letters-urls))))
(def artist-html (fetch-url (first artists-urls))) (def artist-html (fetch-url (first artists-urls)))

@ -0,0 +1,80 @@
(ns com.owoga.corpus.markov
(:require [com.owoga.prhyme.util :as util]
[clojure.string :as string]
[clojure.java.io :as io]))
(defn clean-text [text]
(string/lower-case (string/replace text #"[^a-zA-Z'\-\s]" "")))
(defn make-markov [tokens n]
(reduce
(fn [a w]
(let [k (butlast w)
v (last w)]
(update-in a [k v] (fnil inc 0))))
{}
((util/window (inc n)) tokens)))
(defn -main [directory]
(let [directory (io/file "dark-corpus")
files (file-seq directory)
lines (->> files
(remove #(.isDirectory %))
(map #(slurp %))
(map #(string/split % #"\n+"))
#_(map #(util/extend-coll % nil 1)))
markovs (->> lines
(map #(make-markov % 1)))]
(take 1 markovs)))
(merge-with
(fn [a b]
(+ (if (nil? a) 0 a)
b))
{:foo 1}
{:bar 2 :foo 3})
(defn merge-markov [& maps]
(apply
merge-with
(fn [a-possibilities b-possibilities]
(apply
merge-with
(fn [a b]
((fnil + 0) a b))
a-possibilities
b-possibilities))
maps))
(comment
(merge-markov
{'("away") {"her" 1
"foo" 7}}
{'("away") {"her" 2
"them" 1
"bar" 8}}
{'("away") {"her" 10
"them" 50
"baz" 99}}))
(comment
(->> (file-seq (io/file "dark-corpus"))
(remove #(.isDirectory %))
(map #(slurp %))
(map clean-text)
(filter util/english?)
(map #(string/split % #"\n+"))
(flatten)
(map #(string/split % #"\s+"))
(map reverse)
(map #(util/extend-coll % nil 2))
(map #(make-markov % 2))
(apply merge-markov)
(util/write-markov "dark-corpus-2.edn"))
(def darkov-2 (util/read-markov "dark-corpus-2.edn"))
(def darkov-1 (util/read-markov "dark-corpus-1.edn"))
(get darkov-2 '(nil nil))
(darkov-1 '("london"))
(-main "dark-lyrics"))

@ -86,6 +86,66 @@
result]))) result])))
[words target result])))) [words target result]))))
(defn adjust-for-markov-with-boundaries
[markov percent]
(let [markov-n (count (first (first markov)))]
(fn [[words target result]]
(let [key (let [k (map :norm-word (take markov-n result))]
(reverse
(if (> markov-n (count k))
(concat k (repeat (- markov-n (count k)) nil))
k)))
markov-options (markov key)
markov-option-avg (/ (apply + (vals markov-options))
(max 1 (count markov-options)))]
(if (nil? markov-options)
[words target result]
(let [[markovs non-markovs]
((juxt filter remove)
#(markov-options (:norm-word %))
words)
weight-non-markovs (apply + (map :weight non-markovs))
target-weight-markovs (- (/ weight-non-markovs (- 1 percent))
weight-non-markovs)
count-markovs (count markovs)
adjustment-markovs (if (= 0 count-markovs) 1 (/ target-weight-markovs count-markovs))]
[(concat
(map
(fn [m]
(let [option (markov-options (:norm-word m))]
(as-> m m
(assoc m :weight (* (/ option markov-option-avg) adjustment-markovs (:weight m)))
(assoc m :adjustment-for-markov (* (/ option markov-option-avg) adjustment-markovs)))))
markovs)
non-markovs)
target
result]))))))
(comment
(let [markov-1-example
{'("dream") {"a" 1}
'("a") {"me" 1}}
markov-2-example
{'(nil nil) {"dream" 1}
'(nil "dream") {"a" 1}
'("dream" "a") {"me" 1}
'("a" "me") {"give" 1}
'("give" nil) {nil 1}}
result-a '()
result-b '({:norm-word "dream",
:weight 9.000000000000002,
:adjustment-for-markov 9.000000000000002})
words [{:norm-word "dream" :weight 1}
{:norm-word "foo" :weight 1}
{:norm-word "a" :weight 1}
{:norm-word "me" :weight 1}
{:norm-word "give" :weight 1}]
adj (adjust-for-markov-with-boundaries markov-2-example 0.9)]
(adj [words 'target result-b]))
((adjust-for-markov-with-boundaries {'("foo" "bar") {}} 0.5)
['() '() '("hi" "bye" "there")]))
(defn adjust-for-rimes (defn adjust-for-rimes
[dictionary percent] [dictionary percent]
(fn [[words target result]] (fn [[words target result]]
@ -223,10 +283,7 @@
target (phrase->word words phrase)] target (phrase->word words phrase)]
(prhymer words adjust target (syllable-stop target)))) (prhymer words adjust target (syllable-stop target))))
(def adj (comp (adjust-for-markov dr/darkov 0.25) #_(defn generate-prhymes [poem]
(adjust-for-markov dr/darkov-2 0.9)
(adjust-for-tail-rimes words-map 0.9)))
(defn generate-prhymes [poem]
(let [r (partial generate-rhyme-for-phrase frp/popular adj)] (let [r (partial generate-rhyme-for-phrase frp/popular adj)]
(fn [] (fn []
(->> poem (->> poem
@ -243,15 +300,42 @@
(map (fn [line] (map #(:norm-word %) line))) (map (fn [line] (map #(:norm-word %) line)))
(map #(string/join " " %)))))) (map #(string/join " " %))))))
(defn generate-prhymes-darkov [words adj phrase]
(let [target (phrase->word words phrase)
r (generate-rhyme-for-phrase words adj target)]
(first
(filter
#(and
(or (< 0.9 (rand))
(nlp/valid-sentence? (string/join " " (map :norm-word %))))
(= (:syllable-count target)
(apply + (map :syllable-count %))))
r))
(map (fn [line] (map #(:norm-word %) line)))
(map #(string/join " " %))))
(comment (comment
(apply map vector (let [adj (comp (adjust-for-markov-with-boundaries dr/darkov-2 0.9)
(->> ["mister sandman give me a dream" (adjust-for-tail-rimes words-map 0.9))]
"make him the cutest that i've ever seen" (->> (generate-rhyme-for-phrase frp/popular adj "make him the cutest that i've ever seen")
"give him two lips like roses in clover" (take 20)
"then tell him that his lonesome nights are over"] (map #(map :norm-word %))
(generate-prhymes) (map #(string/join " " %))))
(repeatedly)
(take 5))) (let [adj (comp (adjust-for-markov-with-boundaries dr/darkov-2 0.9)
(adjust-for-tail-rimes words-map 0.9))]
(->> (generate-rhyme-for-phrase frp/popular adj "mister sandman give me a dream")
(take 20)
(map #(map :norm-word %))
(map #(string/join " " %))))
(let [adj (adjust-for-markov-with-boundaries dr/darkov-2 0.9)]
(apply map vector
(->> ["mister sandman give me a dream"
"make him the cutest that i've ever seen"
"give him two lips like roses in clover"
"then tell him that his lonesome nights are over"]
(map #(generate-prhymes-darkov util/popular adj %)))))
(apply map vector (->> ["taylor is my star" (apply map vector (->> ["taylor is my star"
"she brightens my day"] "she brightens my day"]

@ -73,6 +73,19 @@
(def single-sound-bigram #{"TH" "SH" "PH" "WH" "CH"}) (def single-sound-bigram #{"TH" "SH" "PH" "WH" "CH"})
(defn window [n] (defn window [n]
(fn [coll]
(cond
(empty? coll) []
(< (count coll) n) []
:else (cons (take n coll)
(lazy-seq ((window n) (rest coll)))))))
(defn extend-coll [coll val n]
(concat (repeat n val)
coll
(repeat n val)))
(defn window-with-nil [n]
(fn [coll] (fn [coll]
(cond (cond
(empty? coll) [] (empty? coll) []
@ -83,6 +96,15 @@
(defn clean-text [text] (defn clean-text [text]
(string/lower-case (string/replace text #"[^a-zA-Z'\-\s]" ""))) (string/lower-case (string/replace text #"[^a-zA-Z'\-\s]" "")))
(defn english? [text]
(let [words (string/split text #"\s+")
english-words
(->> words (filter #(words-map (string/lower-case %))))]
(< 0.7 (/ (count english-words) (max 1 (count words))))))
(defn padr [val n coll]
(concat coll (repeat n val)))
(defn make-markov [text] (defn make-markov [text]
(let [tokens (reverse (string/split (clean-text text) #"\s+"))] (let [tokens (reverse (string/split (clean-text text) #"\s+"))]
(reduce (reduce

Loading…
Cancel
Save