diff --git a/src/com/owoga/corpus/darklyrics.clj b/src/com/owoga/corpus/darklyrics.clj index a17cad9..ec932d6 100644 --- a/src/com/owoga/corpus/darklyrics.clj +++ b/src/com/owoga/corpus/darklyrics.clj @@ -8,10 +8,13 @@ (def root-url "http://www.darklyrics.com") (def base-url "http://www.darklyrics.com/a.html") +(def data-dir "dark-corpus") -(defn fetch-url [url] +(defn fetch-url- [url] (html/html-resource (java.net.URL. url))) +(def fetch-url (memoize fetch-url-)) + (defn parse-letters-urls [index] (-> index (html/select [:div.listrow]) @@ -48,6 +51,21 @@ ((partial apply str)) (string/replace #"\s+" " "))) +(defn parse-album-songs [album-html] + (->> album-html + (#(html/select % [:div.lyrics])) + first + :content + (util/take-between #(= :h3 (:tag %))) + (map + (fn [[title & lyrics]] + [(html/text (first (html/select title [:a]))) + (->> lyrics + (filter string?) + (map string/trim) + (string/join "\n") + (string/trim))])))) + (defn english? [text] (let [words (string/split text #"\s+") english-words @@ -57,17 +75,34 @@ (defn scrape ([base-url] (scrape (parse-letters-urls (fetch-url base-url)) '() '())) - ([letters-urls artists-urls albums-urls] + ([letters-urls artists-urls [artist-name albums-urls]] (cond (not-empty albums-urls) - (cons (parse-album-lyrics (fetch-url (first albums-urls))) - (lazy-seq (scrape letters-urls artists-urls (rest albums-urls)))) + (let [album-html (fetch-url (first albums-urls)) + album-name (->> (html/select album-html [:div.albumlyrics :h2]) + (map html/text) + first + (#(string/replace % #"(album: |\s+)" " ")) + (string/trim))] + (cons [artist-name album-name (parse-album-songs album-html)] + (lazy-seq (scrape letters-urls artists-urls [artist-name (rest albums-urls)])))) (not-empty artists-urls) - (scrape letters-urls (rest artists-urls) (parse-artists-albums (fetch-url (first artists-urls)))) + (let [artist-html (fetch-url (first artists-urls)) + artist-name (->> (html/select artist-html [:h1]) + (map html/text) + first + (#(string/replace % #" LYRICS" "")))] + (scrape + letters-urls + (rest artists-urls) + [artist-name (parse-artists-albums artist-html)])) (not-empty letters-urls) - (scrape (rest letters-urls) (parse-artists-urls (fetch-url (first letters-urls))) albums-urls) + (scrape + (rest letters-urls) + (parse-artists-urls (fetch-url (first letters-urls))) + [nil nil]) :else nil))) @@ -91,12 +126,61 @@ (map (fn [[k v]] (vector (list k) v)) (make-markov (slurp "darklyrics.txt") 2)))) +(defn norm-filepath [text] + (-> text + string/lower-case + (string/replace #"\s+" "-") + (string/replace #"[\(\)\"',\.]" ""))) + +(defn write-scrape [[artist album songs]] + (run! + (fn [[song lyrics]] + (let [file (io/file (string/join "/" (map norm-filepath [data-dir artist album song])))] + (io/make-parents file) + (spit file lyrics))) + songs)) + +(defn do-scrape [] + (let [artist-album-texts (scrape base-url)] + (run! + (fn [x] + (Thread/sleep (rand 3000)) + (println (str "Writing songs for " (second x))) + (write-scrape x)) + artist-album-texts))) + (comment + (take 3 (scrape base-url)) + (do-scrape) + (def letters-urls (parse-letters-urls (fetch-url base-url))) + (def artists-urls (parse-artists-urls (fetch-url (first letters-urls)))) + (def artist-html (fetch-url (first artists-urls))) + (def album-urls (parse-artists-albums artist-html)) + (def album-html (fetch-url (first album-urls))) + (->> album-html + (#(html/select % [:div.lyrics])) + first + :content + (util/take-between #(= :h3 (:tag %))) + (map + (fn [[title & lyrics]] + [(html/text (first (html/select title [:a]))) + (->> lyrics + (filter string?) + (map string/trim) + (string/join "\n") + (string/trim))]))) + + (->> (html/select artist-html [:h1]) + (map html/text) + (first )) (def darkov (into {} (map (fn [[k v]] (vector (list k) v)) - (make-markov (slurp "darklyrics.txt"))))) + (make-markov (slurp "darklyrics.txt") 1)))) + (run! write-scrape (take 4 (scrape base-url))) + (take 100 darkov) (util/write-markov "darklyrics.edn" darkov) (spit "test.txt" (pr-str {:foo "1"})) diff --git a/src/com/owoga/prhyme/core.clj b/src/com/owoga/prhyme/core.clj index c19cc51..b01e022 100644 --- a/src/com/owoga/prhyme/core.clj +++ b/src/com/owoga/prhyme/core.clj @@ -136,8 +136,18 @@ (map reverse))) (defn rimes? [a b] - (if (= 1 (count (last (:rimes a)))) - (= (take-last 2 (:rimes a)) (take-last 2 (:rimes b))) + (if (and (= 1 (count (last (:rimes a)))) + (= 1 (count (last (:rimes b)))) + (or (= (last (:rimes a)) '("ER")) + (= (last (:rimes a)) '("AA")) + (= (last (:rimes a)) '("AE")) + (= (last (:rimes a)) '("AO")) + (= (last (:rimes a)) '("AW")) + (= (last (:rimes a)) '("EH")) + (= (last (:rimes a)) '("IH")) + (= (last (:rimes a)) '("UH")) + (= (last (:rimes a)) '("AH")))) + (= (last (:onsets a)) (last (:onsets b))) (= (last (:rimes a)) (last (:rimes b))))) (defn onset+nucleus [syllables] diff --git a/src/com/owoga/prhyme/gen.clj b/src/com/owoga/prhyme/gen.clj index f0aab0c..7731ad0 100644 --- a/src/com/owoga/prhyme/gen.clj +++ b/src/com/owoga/prhyme/gen.clj @@ -253,13 +253,21 @@ (repeatedly) (take 5))) + (apply map vector (->> ["taylor is my star" + "she brightens my day"] + (generate-prhymes) + (repeatedly) + (take 10))) + + (frp/phrase->word frp/popular "homer") + (frp/phrase->word frp/popular "") (apply map vector (->> ["mister sandman" "give me a dream" "make him the cutest" "that i've ever seen"] (generate-prhymes) (repeatedly) - (take 2))) + (take 10))) (def adj (comp (adjust-for-markov dr/darkov 0.9) (adjust-for-tail-rimes words-map 0.9))) diff --git a/src/com/owoga/prhyme/util.clj b/src/com/owoga/prhyme/util.clj index 17a4738..95c2246 100644 --- a/src/com/owoga/prhyme/util.clj +++ b/src/com/owoga/prhyme/util.clj @@ -115,6 +115,17 @@ (recur (rest coll) (cons (first coll) acc))))) +(defn take-between + "Seq [) of pred." + [pred coll] + (->> coll + (drop-while #(not (pred %))) + ((fn [coll] + (if (empty? coll) + nil + (cons (cons (first coll) (take-while #(not (pred %)) (rest coll))) + (lazy-seq (take-between pred (rest coll))))))))) + (defn max-consecutive [pred coll] (loop [coll coll cur-count 0