More scraping utilities and better rhyming

main
Eric Ihli 4 years ago
parent 218d665c65
commit 3809ab1f09

@ -8,10 +8,13 @@
(def root-url "http://www.darklyrics.com") (def root-url "http://www.darklyrics.com")
(def base-url "http://www.darklyrics.com/a.html") (def base-url "http://www.darklyrics.com/a.html")
(def data-dir "dark-corpus")
(defn fetch-url [url] (defn fetch-url- [url]
(html/html-resource (java.net.URL. url))) (html/html-resource (java.net.URL. url)))
(def fetch-url (memoize fetch-url-))
(defn parse-letters-urls [index] (defn parse-letters-urls [index]
(-> index (-> index
(html/select [:div.listrow]) (html/select [:div.listrow])
@ -48,6 +51,21 @@
((partial apply str)) ((partial apply str))
(string/replace #"\s+" " "))) (string/replace #"\s+" " ")))
(defn parse-album-songs [album-html]
(->> album-html
(#(html/select % [:div.lyrics]))
first
:content
(util/take-between #(= :h3 (:tag %)))
(map
(fn [[title & lyrics]]
[(html/text (first (html/select title [:a])))
(->> lyrics
(filter string?)
(map string/trim)
(string/join "\n")
(string/trim))]))))
(defn english? [text] (defn english? [text]
(let [words (string/split text #"\s+") (let [words (string/split text #"\s+")
english-words english-words
@ -57,17 +75,34 @@
(defn scrape (defn scrape
([base-url] ([base-url]
(scrape (parse-letters-urls (fetch-url base-url)) '() '())) (scrape (parse-letters-urls (fetch-url base-url)) '() '()))
([letters-urls artists-urls albums-urls] ([letters-urls artists-urls [artist-name albums-urls]]
(cond (cond
(not-empty albums-urls) (not-empty albums-urls)
(cons (parse-album-lyrics (fetch-url (first albums-urls))) (let [album-html (fetch-url (first albums-urls))
(lazy-seq (scrape letters-urls artists-urls (rest albums-urls)))) album-name (->> (html/select album-html [:div.albumlyrics :h2])
(map html/text)
first
(#(string/replace % #"(album: |\s+)" " "))
(string/trim))]
(cons [artist-name album-name (parse-album-songs album-html)]
(lazy-seq (scrape letters-urls artists-urls [artist-name (rest albums-urls)]))))
(not-empty artists-urls) (not-empty artists-urls)
(scrape letters-urls (rest artists-urls) (parse-artists-albums (fetch-url (first artists-urls)))) (let [artist-html (fetch-url (first artists-urls))
artist-name (->> (html/select artist-html [:h1])
(map html/text)
first
(#(string/replace % #" LYRICS" "")))]
(scrape
letters-urls
(rest artists-urls)
[artist-name (parse-artists-albums artist-html)]))
(not-empty letters-urls) (not-empty letters-urls)
(scrape (rest letters-urls) (parse-artists-urls (fetch-url (first letters-urls))) albums-urls) (scrape
(rest letters-urls)
(parse-artists-urls (fetch-url (first letters-urls)))
[nil nil])
:else :else
nil))) nil)))
@ -91,12 +126,61 @@
(map (fn [[k v]] (vector (list k) v)) (map (fn [[k v]] (vector (list k) v))
(make-markov (slurp "darklyrics.txt") 2)))) (make-markov (slurp "darklyrics.txt") 2))))
(defn norm-filepath [text]
(-> text
string/lower-case
(string/replace #"\s+" "-")
(string/replace #"[\(\)\"',\.]" "")))
(defn write-scrape [[artist album songs]]
(run!
(fn [[song lyrics]]
(let [file (io/file (string/join "/" (map norm-filepath [data-dir artist album song])))]
(io/make-parents file)
(spit file lyrics)))
songs))
(defn do-scrape []
(let [artist-album-texts (scrape base-url)]
(run!
(fn [x]
(Thread/sleep (rand 3000))
(println (str "Writing songs for " (second x)))
(write-scrape x))
artist-album-texts)))
(comment (comment
(take 3 (scrape base-url))
(do-scrape)
(def letters-urls (parse-letters-urls (fetch-url base-url)))
(def artists-urls (parse-artists-urls (fetch-url (first letters-urls))))
(def artist-html (fetch-url (first artists-urls)))
(def album-urls (parse-artists-albums artist-html))
(def album-html (fetch-url (first album-urls)))
(->> album-html
(#(html/select % [:div.lyrics]))
first
:content
(util/take-between #(= :h3 (:tag %)))
(map
(fn [[title & lyrics]]
[(html/text (first (html/select title [:a])))
(->> lyrics
(filter string?)
(map string/trim)
(string/join "\n")
(string/trim))])))
(->> (html/select artist-html [:h1])
(map html/text)
(first ))
(def darkov (def darkov
(into (into
{} {}
(map (fn [[k v]] (vector (list k) v)) (map (fn [[k v]] (vector (list k) v))
(make-markov (slurp "darklyrics.txt"))))) (make-markov (slurp "darklyrics.txt") 1))))
(run! write-scrape (take 4 (scrape base-url)))
(take 100 darkov) (take 100 darkov)
(util/write-markov "darklyrics.edn" darkov) (util/write-markov "darklyrics.edn" darkov)
(spit "test.txt" (pr-str {:foo "1"})) (spit "test.txt" (pr-str {:foo "1"}))

@ -136,8 +136,18 @@
(map reverse))) (map reverse)))
(defn rimes? [a b] (defn rimes? [a b]
(if (= 1 (count (last (:rimes a)))) (if (and (= 1 (count (last (:rimes a))))
(= (take-last 2 (:rimes a)) (take-last 2 (:rimes b))) (= 1 (count (last (:rimes b))))
(or (= (last (:rimes a)) '("ER"))
(= (last (:rimes a)) '("AA"))
(= (last (:rimes a)) '("AE"))
(= (last (:rimes a)) '("AO"))
(= (last (:rimes a)) '("AW"))
(= (last (:rimes a)) '("EH"))
(= (last (:rimes a)) '("IH"))
(= (last (:rimes a)) '("UH"))
(= (last (:rimes a)) '("AH"))))
(= (last (:onsets a)) (last (:onsets b)))
(= (last (:rimes a)) (last (:rimes b))))) (= (last (:rimes a)) (last (:rimes b)))))
(defn onset+nucleus [syllables] (defn onset+nucleus [syllables]

@ -253,13 +253,21 @@
(repeatedly) (repeatedly)
(take 5))) (take 5)))
(apply map vector (->> ["taylor is my star"
"she brightens my day"]
(generate-prhymes)
(repeatedly)
(take 10)))
(frp/phrase->word frp/popular "homer")
(frp/phrase->word frp/popular "")
(apply map vector (->> ["mister sandman" (apply map vector (->> ["mister sandman"
"give me a dream" "give me a dream"
"make him the cutest" "make him the cutest"
"that i've ever seen"] "that i've ever seen"]
(generate-prhymes) (generate-prhymes)
(repeatedly) (repeatedly)
(take 2))) (take 10)))
(def adj (comp (adjust-for-markov dr/darkov 0.9) (def adj (comp (adjust-for-markov dr/darkov 0.9)
(adjust-for-tail-rimes words-map 0.9))) (adjust-for-tail-rimes words-map 0.9)))

@ -115,6 +115,17 @@
(recur (rest coll) (recur (rest coll)
(cons (first coll) acc))))) (cons (first coll) acc)))))
(defn take-between
"Seq [) of pred."
[pred coll]
(->> coll
(drop-while #(not (pred %)))
((fn [coll]
(if (empty? coll)
nil
(cons (cons (first coll) (take-while #(not (pred %)) (rest coll)))
(lazy-seq (take-between pred (rest coll)))))))))
(defn max-consecutive [pred coll] (defn max-consecutive [pred coll]
(loop [coll coll (loop [coll coll
cur-count 0 cur-count 0

Loading…
Cancel
Save