Move url helpers to utils namespace

main
Eric Ihli 3 years ago
parent 185bcfb6df
commit d0a8202b63

@ -2,29 +2,13 @@
(:require [net.cgrand.enlive-html :as html] (:require [net.cgrand.enlive-html :as html]
[com.owoga.prhyme.util :as util] [com.owoga.prhyme.util :as util]
[clojure.string :as string] [clojure.string :as string]
[clojure.java.io :as io])) [clojure.java.io :as io]
[com.owoga.corpus.util :refer [fetch-url]]))
(def root-url "http://www.darklyrics.com") (def root-url "http://www.darklyrics.com")
(def base-url "http://www.darklyrics.com/a.html") (def base-url "http://www.darklyrics.com/a.html")
(def data-dir "dark-corpus") (def data-dir "dark-corpus")
(defn fix-url
"Some hrefs are relative and some are absolute.

Keeps only the part of `url` that starts at its last `http://`; a string
with no `http://` in it is returned unchanged. Presumably this repairs
absolute hrefs that had a site root prepended to them -- confirm against
the callers."
[url]
;; The leading greedy `.*` is what selects the last `http://`; the
;; `(?!http://)` lookahead does not change what is captured.
(string/replace url #".*(http://.*(?!http://).*$)" "$1"))
(defn fetch-url-
"Fetch `url` and parse it with enlive's `html/html-resource`.

`url` is first passed through `fix-url`. Any Exception raised while
building the `java.net.URL` or loading the resource is printed with `prn`
and an empty map is returned instead.
This function is not memoized itself; `fetch-url` is the memoized wrapper."
[url]
;; `fix-url` runs outside the `try`, so an exception it throws is not caught here.
(let [url (fix-url url)]
(try
(html/html-resource (java.net.URL. url))
(catch Exception e
(prn "Exception during fetch " e)
{}))))
;; Memoized for faster iterations in development. The `{}` returned after a
;; failed fetch is cached like any other result.
(def fetch-url (memoize fetch-url-))
(defn parse-letters-urls [index] (defn parse-letters-urls [index]
(-> index (-> index
(html/select [:div.listrow]) (html/select [:div.listrow])

@ -1,10 +1,29 @@
(ns com.owoga.corpus.util (ns com.owoga.corpus.util
(:require [taoensso.tufte :as tufte :refer (defnp p profiled profile)] (:require [taoensso.tufte :as tufte :refer (defnp p profiled profile)]
[clojure.string :as string])) [clojure.string :as string]
[net.cgrand.enlive-html :as html]))
(set! *warn-on-reflection* true) (set! *warn-on-reflection* true)
(tufte/add-basic-println-handler! {}) (tufte/add-basic-println-handler! {})
(defn fix-url
  "This is specific to some non-conformity on DarkLyrics.com.
  Some hrefs are relative and some are absolute.

  Returns the part of `url` that starts at its last `http://` or `https://`
  scheme marker, dropping anything in front of it (for example a site root
  that was prepended to an href that was already absolute). A string that
  contains no scheme marker is returned unchanged.

  `url` must be a string; nil is not accepted."
  [url]
  ;; The leading `.*` is greedy, so the capture group starts at the LAST
  ;; scheme marker in the string. `https?` also covers absolute hrefs that
  ;; use TLS.
  (string/replace url #".*(https?://.*$)" "$1"))
(defn fetch-url-
  "Fetch `url` and parse it with enlive's `html/html-resource`.

  `url` is normalized with `fix-url` first. This is the raw, uncached fetch;
  `fetch-url` is the caching wrapper around it.

  Returns the parsed resource, or an empty map when anything goes wrong
  (nil or malformed URL, failed fetch). The exception is printed with `prn`
  rather than rethrown."
  [url]
  (try
    ;; `fix-url` sits inside the `try` so that a nil or otherwise unusable
    ;; href takes the same print-and-return-{} path as a failed fetch.
    (html/html-resource (java.net.URL. (fix-url url)))
    (catch Exception e
      (prn "Exception during fetch " e)
      {})))

(def fetch-url
  "Caching version of `fetch-url-`, for faster iterations in development.

  Successful results are cached per `url` for the lifetime of the process.
  The empty-map failure value is NOT cached, so a URL that failed once
  (e.g. a transient network error) is fetched again on the next call."
  (let [cache (atom {})]
    (fn [url]
      (if-let [hit (find @cache url)]
        (val hit)
        (let [result (fetch-url- url)]
          ;; `{}` is the failure sentinel returned by `fetch-url-`.
          (when-not (= {} result)
            (swap! cache assoc url result))
          result)))))
(defn clean-text (defn clean-text
"Removes all non-alphabetical characters and lowercases everything. "Removes all non-alphabetical characters and lowercases everything.
Very spartan way of cleaning." Very spartan way of cleaning."

Loading…
Cancel
Save