From d0a8202b6355ff6cb446d08fa459828965324452 Mon Sep 17 00:00:00 2001
From: Eric Ihli
Date: Thu, 16 Dec 2021 07:33:17 -0600
Subject: [PATCH] Move url helpers to utils namespace

---
 src/com/owoga/corpus/darklyrics.clj | 20 ++------------------
 src/com/owoga/corpus/util.clj       | 21 ++++++++++++++++++++-
 2 files changed, 22 insertions(+), 19 deletions(-)

diff --git a/src/com/owoga/corpus/darklyrics.clj b/src/com/owoga/corpus/darklyrics.clj
index f8b4a08..80e4d2c 100644
--- a/src/com/owoga/corpus/darklyrics.clj
+++ b/src/com/owoga/corpus/darklyrics.clj
@@ -2,29 +2,13 @@
   (:require [net.cgrand.enlive-html :as html]
             [com.owoga.prhyme.util :as util]
             [clojure.string :as string]
-            [clojure.java.io :as io]))
+            [clojure.java.io :as io]
+            [com.owoga.corpus.util :refer [fetch-url]]))
 
 (def root-url "http://www.darklyrics.com")
 (def base-url "http://www.darklyrics.com/a.html")
 (def data-dir "dark-corpus")
 
-(defn fix-url
-  "Some hrefs are relative and some are absolute."
-  [url]
-  (string/replace url #".*(http://.*(?!http://).*$)" "$1"))
-
-(defn fetch-url-
-  "Memoized for faster iterations in development."
-  [url]
-  (let [url (fix-url url)]
-    (try
-      (html/html-resource (java.net.URL. url))
-      (catch Exception e
-        (prn "Exception during fetch " e)
-        {}))))
-
-(def fetch-url (memoize fetch-url-))
-
 (defn parse-letters-urls [index]
   (-> index
       (html/select [:div.listrow])
diff --git a/src/com/owoga/corpus/util.clj b/src/com/owoga/corpus/util.clj
index 66beef2..d8c15ba 100644
--- a/src/com/owoga/corpus/util.clj
+++ b/src/com/owoga/corpus/util.clj
@@ -1,10 +1,29 @@
 (ns com.owoga.corpus.util
   (:require [taoensso.tufte :as tufte :refer (defnp p profiled profile)]
-            [clojure.string :as string]))
+            [clojure.string :as string]
+            [net.cgrand.enlive-html :as html]))
 
 (set! *warn-on-reflection* true)
 (tufte/add-basic-println-handler! {})
 
+(defn fix-url
+  "This is specific to some non-conformity on DarkLyrics.com.
+  Some hrefs are relative and some are absolute."
+  [url]
+  (string/replace url #".*(http://.*(?!http://).*$)" "$1"))
+
+(defn fetch-url-
+  "Memoized for faster iterations in development."
+  [url]
+  (let [url (fix-url url)]
+    (try
+      (html/html-resource (java.net.URL. url))
+      (catch Exception e
+        (prn "Exception during fetch " e)
+        {}))))
+
+(def fetch-url (memoize fetch-url-))
+
 (defn clean-text
   "Removes all non-alphabetical characters and lowercases everything.
   Very spartan way of cleaning."