Better markov/corpus handling

main
Eric Ihli 4 years ago
parent 3809ab1f09
commit 7a4f22970e

@ -10,8 +10,16 @@
(def base-url "http://www.darklyrics.com/a.html") (def base-url "http://www.darklyrics.com/a.html")
(def data-dir "dark-corpus") (def data-dir "dark-corpus")
(defn fix-url [url]
(string/replace url #".*(http://.*(?!http://).*$)" "$1"))
(defn fetch-url- [url] (defn fetch-url- [url]
(html/html-resource (java.net.URL. url))) (let [url (fix-url url)]
(try
(html/html-resource (java.net.URL. url))
(catch Exception e
(prn "Exception during fetch " e)
{}))))
(def fetch-url (memoize fetch-url-)) (def fetch-url (memoize fetch-url-))
@ -70,14 +78,15 @@
(let [words (string/split text #"\s+") (let [words (string/split text #"\s+")
english-words english-words
(->> words (filter #(util/words-map (string/lower-case %))))] (->> words (filter #(util/words-map (string/lower-case %))))]
(< 0.5 (/ (count english-words) (count words))))) (< 0.7 (/ (count english-words) (count words)))))
(defn scrape (defn scrape
([base-url] ([base-url]
(scrape (parse-letters-urls (fetch-url base-url)) '() '())) (scrape (drop 3 (parse-letters-urls (fetch-url base-url))) '() '()))
([letters-urls artists-urls [artist-name albums-urls]] ([letters-urls artists-urls [artist-name albums-urls]]
(cond (cond
(not-empty albums-urls) (not-empty albums-urls)
(try
(let [album-html (fetch-url (first albums-urls)) (let [album-html (fetch-url (first albums-urls))
album-name (->> (html/select album-html [:div.albumlyrics :h2]) album-name (->> (html/select album-html [:div.albumlyrics :h2])
(map html/text) (map html/text)
@ -86,8 +95,12 @@
(string/trim))] (string/trim))]
(cons [artist-name album-name (parse-album-songs album-html)] (cons [artist-name album-name (parse-album-songs album-html)]
(lazy-seq (scrape letters-urls artists-urls [artist-name (rest albums-urls)])))) (lazy-seq (scrape letters-urls artists-urls [artist-name (rest albums-urls)]))))
(catch Exception e
(prn "album exception" e)
(scrape letters-urls artists-urls [artist-name (rest albums-urls)])))
(not-empty artists-urls) (not-empty artists-urls)
(try
(let [artist-html (fetch-url (first artists-urls)) (let [artist-html (fetch-url (first artists-urls))
artist-name (->> (html/select artist-html [:h1]) artist-name (->> (html/select artist-html [:h1])
(map html/text) (map html/text)
@ -97,6 +110,12 @@
letters-urls letters-urls
(rest artists-urls) (rest artists-urls)
[artist-name (parse-artists-albums artist-html)])) [artist-name (parse-artists-albums artist-html)]))
(catch Exception e
(prn "artist exception" e)
(scrape
letters-urls
(rest artists-urls)
["unknown" '()])))
(not-empty letters-urls) (not-empty letters-urls)
(scrape (scrape
@ -140,18 +159,23 @@
(spit file lyrics))) (spit file lyrics)))
songs)) songs))
(defn do-scrape [] (defn -main []
(let [artist-album-texts (scrape base-url)] (let [artist-album-texts (scrape base-url)]
(run! (run!
(fn [x] (fn [x]
(Thread/sleep (rand 3000)) (try
(println (str "Writing songs for " (second x))) (println (str "Writing songs for " (second x)))
(write-scrape x)) (write-scrape x)
(catch Exception e
(prn "Exception: " e))))
artist-album-texts))) artist-album-texts)))
(comment (comment
(def darkov-2 (util/read-markov "dark-corpus-2.edn"))
(get darkov-2 '(nil nil))
(take 3 (scrape base-url)) (take 3 (scrape base-url))
(do-scrape) (-main)
(def letters-urls (parse-letters-urls (fetch-url base-url))) (def letters-urls (parse-letters-urls (fetch-url base-url)))
(def artists-urls (parse-artists-urls (fetch-url (first letters-urls)))) (def artists-urls (parse-artists-urls (fetch-url (first letters-urls))))
(def artist-html (fetch-url (first artists-urls))) (def artist-html (fetch-url (first artists-urls)))

@ -0,0 +1,80 @@
(ns com.owoga.corpus.markov
(:require [com.owoga.prhyme.util :as util]
[clojure.string :as string]
[clojure.java.io :as io]))
(defn clean-text [text]
(string/lower-case (string/replace text #"[^a-zA-Z'\-\s]" "")))
(defn make-markov [tokens n]
(reduce
(fn [a w]
(let [k (butlast w)
v (last w)]
(update-in a [k v] (fnil inc 0))))
{}
((util/window (inc n)) tokens)))
(defn -main [directory]
(let [directory (io/file "dark-corpus")
files (file-seq directory)
lines (->> files
(remove #(.isDirectory %))
(map #(slurp %))
(map #(string/split % #"\n+"))
#_(map #(util/extend-coll % nil 1)))
markovs (->> lines
(map #(make-markov % 1)))]
(take 1 markovs)))
(merge-with
(fn [a b]
(+ (if (nil? a) 0 a)
b))
{:foo 1}
{:bar 2 :foo 3})
(defn merge-markov [& maps]
(apply
merge-with
(fn [a-possibilities b-possibilities]
(apply
merge-with
(fn [a b]
((fnil + 0) a b))
a-possibilities
b-possibilities))
maps))
(comment
(merge-markov
{'("away") {"her" 1
"foo" 7}}
{'("away") {"her" 2
"them" 1
"bar" 8}}
{'("away") {"her" 10
"them" 50
"baz" 99}}))
(comment
(->> (file-seq (io/file "dark-corpus"))
(remove #(.isDirectory %))
(map #(slurp %))
(map clean-text)
(filter util/english?)
(map #(string/split % #"\n+"))
(flatten)
(map #(string/split % #"\s+"))
(map reverse)
(map #(util/extend-coll % nil 2))
(map #(make-markov % 2))
(apply merge-markov)
(util/write-markov "dark-corpus-2.edn"))
(def darkov-2 (util/read-markov "dark-corpus-2.edn"))
(def darkov-1 (util/read-markov "dark-corpus-1.edn"))
(get darkov-2 '(nil nil))
(darkov-1 '("london"))
(-main "dark-lyrics"))

@ -86,6 +86,66 @@
result]))) result])))
[words target result])))) [words target result]))))
(defn adjust-for-markov-with-boundaries
[markov percent]
(let [markov-n (count (first (first markov)))]
(fn [[words target result]]
(let [key (let [k (map :norm-word (take markov-n result))]
(reverse
(if (> markov-n (count k))
(concat k (repeat (- markov-n (count k)) nil))
k)))
markov-options (markov key)
markov-option-avg (/ (apply + (vals markov-options))
(max 1 (count markov-options)))]
(if (nil? markov-options)
[words target result]
(let [[markovs non-markovs]
((juxt filter remove)
#(markov-options (:norm-word %))
words)
weight-non-markovs (apply + (map :weight non-markovs))
target-weight-markovs (- (/ weight-non-markovs (- 1 percent))
weight-non-markovs)
count-markovs (count markovs)
adjustment-markovs (if (= 0 count-markovs) 1 (/ target-weight-markovs count-markovs))]
[(concat
(map
(fn [m]
(let [option (markov-options (:norm-word m))]
(as-> m m
(assoc m :weight (* (/ option markov-option-avg) adjustment-markovs (:weight m)))
(assoc m :adjustment-for-markov (* (/ option markov-option-avg) adjustment-markovs)))))
markovs)
non-markovs)
target
result]))))))
(comment
(let [markov-1-example
{'("dream") {"a" 1}
'("a") {"me" 1}}
markov-2-example
{'(nil nil) {"dream" 1}
'(nil "dream") {"a" 1}
'("dream" "a") {"me" 1}
'("a" "me") {"give" 1}
'("give" nil) {nil 1}}
result-a '()
result-b '({:norm-word "dream",
:weight 9.000000000000002,
:adjustment-for-markov 9.000000000000002})
words [{:norm-word "dream" :weight 1}
{:norm-word "foo" :weight 1}
{:norm-word "a" :weight 1}
{:norm-word "me" :weight 1}
{:norm-word "give" :weight 1}]
adj (adjust-for-markov-with-boundaries markov-2-example 0.9)]
(adj [words 'target result-b]))
((adjust-for-markov-with-boundaries {'("foo" "bar") {}} 0.5)
['() '() '("hi" "bye" "there")]))
(defn adjust-for-rimes (defn adjust-for-rimes
[dictionary percent] [dictionary percent]
(fn [[words target result]] (fn [[words target result]]
@ -223,10 +283,7 @@
target (phrase->word words phrase)] target (phrase->word words phrase)]
(prhymer words adjust target (syllable-stop target)))) (prhymer words adjust target (syllable-stop target))))
(def adj (comp (adjust-for-markov dr/darkov 0.25) #_(defn generate-prhymes [poem]
(adjust-for-markov dr/darkov-2 0.9)
(adjust-for-tail-rimes words-map 0.9)))
(defn generate-prhymes [poem]
(let [r (partial generate-rhyme-for-phrase frp/popular adj)] (let [r (partial generate-rhyme-for-phrase frp/popular adj)]
(fn [] (fn []
(->> poem (->> poem
@ -243,15 +300,42 @@
(map (fn [line] (map #(:norm-word %) line))) (map (fn [line] (map #(:norm-word %) line)))
(map #(string/join " " %)))))) (map #(string/join " " %))))))
(defn generate-prhymes-darkov [words adj phrase]
(let [target (phrase->word words phrase)
r (generate-rhyme-for-phrase words adj target)]
(first
(filter
#(and
(or (< 0.9 (rand))
(nlp/valid-sentence? (string/join " " (map :norm-word %))))
(= (:syllable-count target)
(apply + (map :syllable-count %))))
r))
(map (fn [line] (map #(:norm-word %) line)))
(map #(string/join " " %))))
(comment (comment
(let [adj (comp (adjust-for-markov-with-boundaries dr/darkov-2 0.9)
(adjust-for-tail-rimes words-map 0.9))]
(->> (generate-rhyme-for-phrase frp/popular adj "make him the cutest that i've ever seen")
(take 20)
(map #(map :norm-word %))
(map #(string/join " " %))))
(let [adj (comp (adjust-for-markov-with-boundaries dr/darkov-2 0.9)
(adjust-for-tail-rimes words-map 0.9))]
(->> (generate-rhyme-for-phrase frp/popular adj "mister sandman give me a dream")
(take 20)
(map #(map :norm-word %))
(map #(string/join " " %))))
(let [adj (adjust-for-markov-with-boundaries dr/darkov-2 0.9)]
(apply map vector (apply map vector
(->> ["mister sandman give me a dream" (->> ["mister sandman give me a dream"
"make him the cutest that i've ever seen" "make him the cutest that i've ever seen"
"give him two lips like roses in clover" "give him two lips like roses in clover"
"then tell him that his lonesome nights are over"] "then tell him that his lonesome nights are over"]
(generate-prhymes) (map #(generate-prhymes-darkov util/popular adj %)))))
(repeatedly)
(take 5)))
(apply map vector (->> ["taylor is my star" (apply map vector (->> ["taylor is my star"
"she brightens my day"] "she brightens my day"]

@ -73,6 +73,19 @@
(def single-sound-bigram #{"TH" "SH" "PH" "WH" "CH"}) (def single-sound-bigram #{"TH" "SH" "PH" "WH" "CH"})
(defn window [n] (defn window [n]
(fn [coll]
(cond
(empty? coll) []
(< (count coll) n) []
:else (cons (take n coll)
(lazy-seq ((window n) (rest coll)))))))
(defn extend-coll [coll val n]
(concat (repeat n val)
coll
(repeat n val)))
(defn window-with-nil [n]
(fn [coll] (fn [coll]
(cond (cond
(empty? coll) [] (empty? coll) []
@ -83,6 +96,15 @@
(defn clean-text [text] (defn clean-text [text]
(string/lower-case (string/replace text #"[^a-zA-Z'\-\s]" ""))) (string/lower-case (string/replace text #"[^a-zA-Z'\-\s]" "")))
(defn english? [text]
(let [words (string/split text #"\s+")
english-words
(->> words (filter #(words-map (string/lower-case %))))]
(< 0.7 (/ (count english-words) (max 1 (count words))))))
(defn padr [val n coll]
(concat coll (repeat n val)))
(defn make-markov [text] (defn make-markov [text]
(let [tokens (reverse (string/split (clean-text text) #"\s+"))] (let [tokens (reverse (string/split (clean-text text) #"\s+"))]
(reduce (reduce

Loading…
Cancel
Save