Add scraper for lyrics
parent
6d4eba92aa
commit
1228739fd2
File diff suppressed because one or more lines are too long
@ -0,0 +1,171 @@
|
||||
(ns com.owoga.corpus.darklyrics
|
||||
(:require [net.cgrand.enlive-html :as html]
|
||||
[clojure.string :as string]
|
||||
[clojure.java.io :as io]))
|
||||
|
||||
(def root-url "http://www.darklyrics.com")
|
||||
(def base-url "http://www.darklyrics.com/a.html")
|
||||
|
||||
(defn fetch-url [url]
|
||||
(html/html-resource (java.net.URL. url)))
|
||||
|
||||
(defn pages-urls [index]
|
||||
(-> index
|
||||
(html/select [:div.listrow])
|
||||
(first)
|
||||
(html/select [:a])
|
||||
((partial map #(get-in % [:attrs :href])))
|
||||
((partial map #(apply str root-url %)))))
|
||||
|
||||
(defn parse-letters-urls [index]
|
||||
(-> index
|
||||
(html/select [:div.listrow])
|
||||
(first)
|
||||
(html/select [:a])
|
||||
((partial map #(get-in % [:attrs :href])))
|
||||
((partial map #(apply str root-url %)))))
|
||||
|
||||
(defn artists-urls [page]
|
||||
(-> page
|
||||
(html/select [:div.artists :a])
|
||||
((partial map #(get-in % [:attrs :href])))
|
||||
((partial map #(apply str root-url "/" %)))))
|
||||
|
||||
(defn parse-artists-urls [page]
|
||||
(-> page
|
||||
(html/select [:div.artists :a])
|
||||
((partial map #(get-in % [:attrs :href])))
|
||||
((partial map #(apply str root-url "/" %)))))
|
||||
|
||||
(defn artists-names [page]
|
||||
(-> page
|
||||
(html/select [:div.artists :a])
|
||||
((partial map #(get-in % [:content])))))
|
||||
|
||||
(defn artists-albums [page]
|
||||
(-> page
|
||||
(html/select [:div.album])
|
||||
((partial
|
||||
map
|
||||
(fn [album]
|
||||
(cons
|
||||
(first (map html/text (html/select album [:h2 :strong])))
|
||||
(list
|
||||
(map
|
||||
#(str root-url (string/replace (get-in % [:attrs :href]) #"\.\." ""))
|
||||
(html/select album [:a])))))))))
|
||||
|
||||
(defn parse-artists-albums [page]
|
||||
(-> page
|
||||
(html/select [:div.album])
|
||||
((partial
|
||||
map
|
||||
(fn [album]
|
||||
(first
|
||||
(map
|
||||
#(str root-url (string/replace (get-in % [:attrs :href]) #"\.\." ""))
|
||||
(html/select album [:a]))))))))
|
||||
|
||||
(defn album-lyrics [page]
|
||||
(-> page
|
||||
(html/select [:div.lyrics])
|
||||
first
|
||||
:content
|
||||
((partial partition-by #(and (map? %) (= :h3 (:tag %)))))
|
||||
flatten
|
||||
((partial filter string?))
|
||||
((partial apply str))
|
||||
(string/replace #"\s+" " ")))
|
||||
|
||||
(defn parse-album-lyrics [page]
|
||||
(-> page
|
||||
(html/select [:div.lyrics])
|
||||
first
|
||||
:content
|
||||
((partial partition-by #(and (map? %) (= :h3 (:tag %)))))
|
||||
flatten
|
||||
((partial filter string?))
|
||||
((partial apply str))
|
||||
(string/replace #"\s+" " ")))
|
||||
|
||||
(defn lazy-artists
|
||||
([urls]
|
||||
(lazy-artists urls '()))
|
||||
([urls artists]
|
||||
(cond
|
||||
(empty? urls)
|
||||
nil
|
||||
|
||||
(empty? artists)
|
||||
(lazy-artists (rest urls)
|
||||
(artists-urls (fetch-url (first urls))))
|
||||
|
||||
:else
|
||||
(cons (fetch-url (first artists))
|
||||
(lazy-seq (lazy-artists urls (rest artists)))))))
|
||||
|
||||
(defn lazy-lyrics
|
||||
([page]
|
||||
(let [album-urls (->> (artists-albums page)
|
||||
(map #(vector (first %) (first (second %)))))]
|
||||
(lazy-lyrics page album-urls)))
|
||||
([page albums]
|
||||
(cond
|
||||
(empty? albums) nil
|
||||
:else
|
||||
(cons (album-lyrics (fetch-url (second (first albums))))
|
||||
(lazy-seq (lazy-lyrics page (rest albums)))))))
|
||||
|
||||
(defn lazy-scrape
|
||||
([base-url]
|
||||
(let [response (fetch-url base-url)
|
||||
alphabetical (pages-urls response)
|
||||
artists (lazy-artists alphabetical)]))
|
||||
([response artists albums]
|
||||
(cond
|
||||
(empty? artists) nil
|
||||
)))
|
||||
|
||||
(defn scrape
|
||||
([base-url]
|
||||
(scrape (parse-letters-urls (fetch-url base-url)) '() '()))
|
||||
([letters-urls artists-urls albums-urls]
|
||||
(cond
|
||||
(not-empty albums-urls)
|
||||
(cons (parse-album-lyrics (fetch-url (first albums-urls)))
|
||||
(lazy-seq (scrape letters-urls artists-urls (rest albums-urls))))
|
||||
|
||||
(not-empty artists-urls)
|
||||
(scrape letters-urls (rest artists-urls) (parse-artists-albums (fetch-url (first artists-urls))))
|
||||
|
||||
(not-empty letters-urls)
|
||||
(scrape (rest letters-urls) (parse-artists-urls (fetch-url (first letters-urls))) albums-urls)
|
||||
|
||||
:else
|
||||
nil)))
|
||||
|
||||
(comment
|
||||
(parse-letters-urls (fetch-url base-url))
|
||||
(def lyrics (scrape base-url))
|
||||
(with-open [writer (io/writer "darklyrics.txt")]
|
||||
(run!
|
||||
#(.write writer %)
|
||||
(take 20 lyrics)))
|
||||
(def response (fetch-url base-url))
|
||||
(def a (fetch-url (first (pages-urls response))))
|
||||
(artists-urls (fetch-url (second (pages-urls response))))
|
||||
(def la (lazy-artists (pages-urls response)))
|
||||
(first la)
|
||||
(def first-artists-page (first la))
|
||||
(def first-artists-album-url (first (second (first (artists-albums first-artists-page)))))
|
||||
(album-lyrics (fetch-url first-artists-album-url))
|
||||
(first (lazy-albums (first la)))
|
||||
(def artist-1 (first (artists-urls a)))
|
||||
(def artist-1-page (fetch-url artist-1))
|
||||
(-> artist-1-page
|
||||
(html/select [:div.album]))
|
||||
(def artists-albums-1 (artists-albums artist-1-page))
|
||||
(def artist-album (first (second (first artists-albums-1))))
|
||||
artist-album
|
||||
(def album (fetch-url artist-album))
|
||||
)
|
Loading…
Reference in New Issue