diff --git a/README_WGU.org b/README_WGU.org deleted file mode 100644 index 644d226..0000000 --- a/README_WGU.org +++ /dev/null @@ -1,86 +0,0 @@ -#+TITLE: Capstone Documentation - -* C - -Design and develop a fully functional data product that addresses your identified business problem or organizational need. Include each of the following attributes as they are the minimum required elements for the product: - -** one descriptive method and one non-descriptive (predictive or prescriptive) method - -*** Descriptive Method - -- 10 most common sentence structures -- Average lyric length -- 5-number analysis of lyric length - -** collected or available datasets - -** decision-support functionality - -** ability to support featurizing, parsing, cleaning, and wrangling datasets - -** methods and algorithms supporting data exploration and preparation - -** data visualization functionalities for data exploration and inspection - -** implementation of interactive queries - -** implementation of machine-learning methods and algorithms - -** functionalities to evaluate the accuracy of the data product - -** industry-appropriate security features - -** tools to monitor and maintain the product - -** a user-friendly, functional dashboard that includes at least three visualization types - - -* Documentation - -D. Create each of the following forms of documentation for the product you have developed: - -** Business Vision - -Provide rhyming lyric suggestions optionally constrained by syllable count. - -** Data Sets - -See ~resources/darklyrics-markov.tpt~ - -** Data Analysis - -See ~src/com/owoga/darklyrics/core.clj~ - -See https://github.com/eihli/prhyme - -** Assessment - -See visualization of rhyme suggestion in action. - -See perplexity? - -** Visualizations - -See visualization of smoothing technique. 
- -See wordcloud - -** Accuracy - -• assessment of the product’s accuracy - -** Testing - -• the results from the data product testing, revisions, and optimization based on the provided plans, including screenshots - -** Source - -• source code and executable file(s) - -** Quick Start - -• a quick start guide summarizing the steps necessary to install and use the product - -* Notes - -http-kit doesn't support https so no need to bother with keystore stuff like you would with jetty. Just proxy from haproxy. diff --git a/web/README_WGU.org b/web/README_WGU.org new file mode 100644 index 0000000..58388b5 --- /dev/null +++ b/web/README_WGU.org @@ -0,0 +1,369 @@ +#+TITLE: Capstone Documentation + +:PROPERTIES: +:END: + +* C + +Design and develop a fully functional data product that addresses your identified business problem or organizational need. Include each of the following attributes as they are the minimum required elements for the product: + +** one descriptive method and one non-descriptive (predictive or prescriptive) method + +*** Descriptive Method + +**** Most common sentence structures + +Here is the code to generate a report on the most common sentence structures given a directory of lyrics files. 
+ +#+begin_src clojure :results value +(require '[com.owoga.corpus.markov :as markov] + '[com.owoga.prhyme.nlp.core :as nlp] + '[clojure.string :as string] + '[clojure.java.io :as io]) + +(let [lines (transduce + (comp + (map slurp) + (map #(string/split % #"\n")) + (map (partial remove empty?)) + (map nlp/structure-freqs)) + merge + {} + (eduction (markov/xf-file-seq 0 10) (file-seq (io/file "/home/eihli/src/prhyme/dark-corpus"))))] + (take 5 (sort-by (comp - second) lines))) +#+end_src + +#+RESULTS: +| (TOP (NP (NNP) (.))) | 6 | +| (TOP (S (NP (PRP)) (VP (VBP) (ADJP (JJ))) (.))) | 6 | +| (INC (NP (JJ) (NN)) nil (IN) (NP (DT)) (NP (PRP)) (VBP)) | 4 | +| (TOP (NP (NP (JJ) (NN)) nil (NP (NN) (CC) (NN)))) | 4 | +| (TOP (S (NP (JJ) (NN)) nil (VP (VBG) (ADJP (JJ))))) | 4 | + +*** Prescriptive Method + +**** Most likely next words + +#+begin_src clojure +(require '[com.darklimericks.server.models :as models] + '[com.owoga.trie :as trie]) + +(let [seed ["bother" "me"] + seed-ids (map models/database seed) + lookup (reverse seed-ids) + results (trie/children (trie/lookup models/markov-trie lookup))] + (->> results + (map #(get % [])) + (sort-by (comp - second)) + (map #(update % 0 models/database)) + (take 10))) +#+end_src + +#+RESULTS: +| don't | 36 | +| doesn't | 21 | +| to | 14 | +| won't | 9 | +| really | 5 | +| not | 4 | +| you | 4 | +| it | 3 | +| even | 3 | +| shouldn't | 3 | + +** collected or available datasets + +The dataset currently in use is in ~/dark-corpus~. Further datasets will need to be provided by the end-user. 
+ +** Decision support functionality + +*** Choosing words for a lyric based on markov likelihood + +*** Choosing words to complete a lyric based on rhyme quality + +#+begin_src clojure :results value table :colnames yes +(require '[com.darklimericks.linguistics.core :as linguistics]) + +(let [results + (linguistics/rhymes-with-frequencies-and-rhyme-quality + "bother me" + models/markov-trie + models/database)] + (->> results + (map + (fn [[rhyming-word + rhyming-word-phones + frequency-count-of-rhyming-word + target-word + target-word-phones + rhyme-quality]] + [rhyming-word frequency-count-of-rhyming-word rhyme-quality])) + (take 10) + (vec) + (into [["rhyme" "frequency count" "rhyme quality"]]))) +#+end_src + +#+RESULTS: +| rhyme | frequency count | rhyme quality | +| honoree | 2 | 7 | +| referee | 3 | 6 | +| repartee | 2 | 6 | +| nominee | 2 | 6 | +| undersea | 1 | 6 | +| oversea | 1 | 6 | +| rosemarie | 0 | 6 | +| disagree | 180 | 5 | +| poverty | 175 | 5 | +| mockery | 122 | 5 | + +** Ability to support featurizing, parsing, cleaning, and wrangling datasets + +The data processing code is in ~prhyme~ + +Each line gets tokenized using a regular expression to split the string into tokens. + +#+begin_src clojure +(def re-word + "Regex for tokenizing a string into words + (including contractions and hyphenations), + commas, periods, and newlines." + #"(?s).*?([a-zA-Z\d]+(?:['\-]?[a-zA-Z]+)?|,|\.|\?|\n)") +#+end_src + +Along with tokenization, the lines get stripped of whitespace and converted to lowercase. This conversion is done so that +words can be compared: "Foo" is the same as "foo". 
+ +#+begin_src clojure +(def xf-tokenize + (comp + (map string/trim) + (map (partial re-seq re-word)) + (map (partial map second)) + (map (partial mapv string/lower-case)))) +#+end_src + + +** methods and algorithms supporting data exploration and preparation + +The primary data structure and algorithms supporting exploration of the data are a Markov Trie. + +The Trie data structure supports a ~lookup~ function that returns the child trie at a certain lookup key and a ~children~ function that returns all of the immediate children of a particular Trie. + +#+begin_src clojure +(defprotocol ITrie + (children [self] "Immediate children of a node.") + (lookup [self ^clojure.lang.PersistentList ks] "Return node at key.")) + +(deftype Trie [key value ^clojure.lang.PersistentTreeMap children-] + ITrie + (children [trie] + (map + (fn [[k ^Trie child]] + (Trie. k + (.value child) + (.children- child))) + children-)) + + (lookup [trie k] + (loop [k k + trie trie] + (cond + ;; Allows `update` to work the same as with maps... can use `fnil`. + ;; (nil? trie') (throw (Exception. (format "Key not found: %s" k))) + (nil? trie) nil + (empty? k) + (Trie. (.key trie) + (.value trie) + (.children- trie)) + :else (recur + (rest k) + (get (.children- trie) (first k)))))) +#+end_src + +** data visualization functionalities for data exploration and inspection + +** implementation of interactive queries + +Interactive query capability at [[https://darklimericks.com/wgu]]. + +** implementation of machine-learning methods and algorithms + +Functions for training both forwards and backwards + +#+begin_src clojure +(defn file-seq->markov-trie + "For forwards markov." 
+ [database files n m] + (transduce + (comp + (map slurp) + (map #(string/split % #"[\n+\?\.]")) + (map (partial transduce data-transform/xf-tokenize conj)) + (map (partial transduce data-transform/xf-filter-english conj)) + (map (partial remove empty?)) + (map (partial into [] (data-transform/xf-pad-tokens (dec m) "" 1 ""))) + (map (partial mapcat (partial data-transform/n-to-m-partitions n (inc m)))) + (mapcat (partial mapv (data-transform/make-database-processor database)))) + (completing + (fn [trie lookup] + (update trie lookup (fnil #(update % 1 inc) [lookup 0])))) + (trie/make-trie) + files)) + +(comment + (let [files (->> "dark-corpus" + io/file + file-seq + (eduction (xf-file-seq 501 2))) + database (atom {:next-id 1}) + trie (file-seq->markov-trie database files 1 3)] + [(take 20 trie) + (map (comp (partial map @database) first) (take 20 (drop 105 trie))) + (take 10 @database)]) + ;; [([(1 1 2) [[1 1 2] 1]] + ;; [(1 1 3) [[1 1 3] 1]] + ;; [(1 1 7) [[1 1 7] 2]] + ;; [(1 1 9) [[1 1 9] 3]] + ;; [(1 1 16) [[1 1 16] 4]]) + ;; (("" "call" "me") + ;; ("" "call") + ;; ("" "right" "") + ;; ("" "right") + ;; ("" "that's" "proportional") + ;; ("" "that's") + ;; ("" "don't" "") + ;; ("" "don't") + ;; ("" "yourself" "in") + ;; ("" "yourself") + ;; ("" "transformation" "") + ;; ("" "transformation") + ;; ("") + ;; ("them" "from" "their") + ;; ("them" "from") + ;; ("them") + ;; ("from" "their" "pain") + ;; ("from" "their") + ;; ("from" "your" "side") + ;; ("from" "your")) + ;; (["come" 92] + ;; ["summer" 17] + ;; ["more" 101] + ;; [121 "that's"] + ;; [65 "by"] + ;; ["dust" 133] + ;; [70 "said"] + ;; ["misery" 128] + ;; [62 "get"] + ;; [74 "gone"])] + ) +#+end_src + +#+begin_src clojure +(defn train-backwards + "For building lines backwards so they can be seeded with a target rhyme." 
+ [files n m trie-filepath database-filepath tightly-packed-trie-filepath] + (let [database (atom {:next-id 1}) + trie (file-seq->backwards-markov-trie database files n m)] + (nippy/freeze-to-file trie-filepath (seq trie)) + (println "Froze" trie-filepath) + (nippy/freeze-to-file database-filepath @database) + (println "Froze" database-filepath) + (save-tightly-packed-trie trie database tightly-packed-trie-filepath) + (let [loaded-trie (->> trie-filepath + nippy/thaw-from-file + (into (trie/make-trie))) + loaded-db (->> database-filepath + nippy/thaw-from-file) + loaded-tightly-packed-trie (tpt/load-tightly-packed-trie-from-file + tightly-packed-trie-filepath + (decode-fn loaded-db))] + (println "Loaded trie:" (take 5 loaded-trie)) + (println "Loaded database:" (take 5 loaded-db)) + (println "Loaded tightly-packed-trie:" (take 5 loaded-tightly-packed-trie)) + (println "Successfully loaded trie and database.")))) + +(comment + (time + (let [files (->> "dark-corpus" + io/file + file-seq + (eduction (xf-file-seq 0 250000))) + [trie database] (train-backwards + files + 1 + 5 + "/home/eihli/.models/markov-trie-4-gram-backwards.bin" + "/home/eihli/.models/markov-database-4-gram-backwards.bin" + "/home/eihli/.models/markov-tightly-packed-trie-4-gram-backwards.bin")])) + + (time + (def markov-trie (into (trie/make-trie) (nippy/thaw-from-file "/home/eihli/.models/markov-trie-4-gram-backwards.bin")))) + (time + (def database (nippy/thaw-from-file "/home/eihli/.models/markov-database-4-gram-backwards.bin"))) + (time + (def markov-tight-trie + (tpt/load-tightly-packed-trie-from-file + "/home/eihli/.models/markov-tightly-packed-trie-4-gram-backwards.bin" + (decode-fn database)))) + (take 20 markov-tight-trie) + ) +#+end_src + +** functionalities to evaluate the accuracy of the data product + +** industry-appropriate security features + +** tools to monitor and maintain the product + +** a user-friendly, functional dashboard that includes at least three visualization types + + 
+* Documentation + +D. Create each of the following forms of documentation for the product you have developed: + +** Business Vision + +Provide rhyming lyric suggestions optionally constrained by syllable count. + +** Data Sets + +See ~resources/darklyrics-markov.tpt~ + +** Data Analysis + +See ~src/com/owoga/darklyrics/core.clj~ + +See https://github.com/eihli/prhyme + +** Assessment + +See visualization of rhyme suggestion in action. + +See perplexity? + +** Visualizations + +See visualization of smoothing technique. + +See wordcloud + +** Accuracy + +• assessment of the product’s accuracy + +** Testing + +• the results from the data product testing, revisions, and optimization based on the provided plans, including screenshots + +** Source + +• source code and executable file(s) + +** Quick Start + +• a quick start guide summarizing the steps necessary to install and use the product + +* Notes + +http-kit doesn't support https so no need to bother with keystore stuff like you would with jetty. Just proxy from haproxy. 
diff --git a/web/src/com/darklimericks/server/views.clj b/web/src/com/darklimericks/server/views.clj index 554d56d..421b892 100644 --- a/web/src/com/darklimericks/server/views.clj +++ b/web/src/com/darklimericks/server/views.clj @@ -93,7 +93,7 @@ css :css :or {title "DarkLimericks" css ["/assets/tachyons.css"] - js ["/assets/wgu-main.js"]} + js ["/assets/wgu/main.js"]} :as opts} :opts} & body] (println (keys request)) @@ -293,7 +293,9 @@ "rhyme-target") (form/submit-button {:class "ml2"} - "Show rhyme suggestions"))]) + "Show rhyme suggestions")) + [:div + [:canvas#myChart {:width 400 :height 400}]]]) (defn show-rhyme-suggestion [request suggestions] diff --git a/web/wgu-app/package-lock.json b/web/wgu-app/package-lock.json index 9694aa4..191b5cd 100644 --- a/web/wgu-app/package-lock.json +++ b/web/wgu-app/package-lock.json @@ -178,6 +178,11 @@ "integrity": "sha1-hZgoeOIbmOHGZCXgPQF0eI9Wnug=", "dev": true }, + "chart.js": { + "version": "3.4.0", + "resolved": "https://registry.npmjs.org/chart.js/-/chart.js-3.4.0.tgz", + "integrity": "sha512-mJsRm2apQm5mwz2OgYqGNG4erZh/qljcRZkWSa0kLkFr3UC3e1wKRMgnIh6WdhUrNu0w/JT9PkjLyylqEqHXEQ==" + }, "cipher-base": { "version": "1.0.4", "resolved": "https://registry.npmjs.org/cipher-base/-/cipher-base-1.0.4.tgz", diff --git a/web/wgu-app/package.json b/web/wgu-app/package.json index 942e6dd..73ec362 100644 --- a/web/wgu-app/package.json +++ b/web/wgu-app/package.json @@ -5,5 +5,7 @@ "devDependencies": { "shadow-cljs": "2.14.5" }, - "dependencies": {} + "dependencies": { + "chart.js": "^3.4.0" + } } diff --git a/web/wgu-app/shadow-cljs.edn b/web/wgu-app/shadow-cljs.edn index 9541996..15a8914 100644 --- a/web/wgu-app/shadow-cljs.edn +++ b/web/wgu-app/shadow-cljs.edn @@ -10,4 +10,6 @@ :builds {:frontend {:target :browser + :output-dir "/home/eihli/src/darklimericks/web/resources/public/wgu/" + :assets-path "/assets/" :modules {:main {:init-fn wgu.app/init}}}}} diff --git a/web/wgu-app/src/main/wgu/app.cljs 
b/web/wgu-app/src/main/wgu/app.cljs index 187b6ee..19c79b1 100644 --- a/web/wgu-app/src/main/wgu/app.cljs +++ b/web/wgu-app/src/main/wgu/app.cljs @@ -1,3 +1,35 @@ -(ns wgu.app) +(ns wgu.app + (:require ["chart.js/auto" :as chart])) -(defn init [] (println "Hello world")) + +(defn init-chart [] + (let [ctx (. js/document getElementById "myChart") + data {:type "bar", + :data {:labels ["Red", "Blue", "Yellow", "Green", "Purple", "Orange"], + :datasets [{ + :label "# of Votes", + :data [12, 19, 3, 5, 2, 3], + :backgroundColor [ + "rgba(255, 99, 132, 0.2)", + "rgba(54, 162, 235, 0.2)", + "rgba(255, 206, 86, 0.2)", + "rgba(75, 192, 192, 0.2)", + "rgba(153, 102, 255, 0.2)", + "rgba(255, 159, 64, 0.2)" + ], + :borderColor [ + "rgba(255, 99, 132, 1)", + "rgba(54, 162, 235, 1)", + "rgba(255, 206, 86, 1)", + "rgba(75, 192, 192, 1)", + "rgba(153, 102, 255, 1)", + "rgba(255, 159, 64, 1)" + ], + :borderWidth 1}]}, + :options {:scales {:y {:beginAtZero true}}}} + chart (new chart/Chart ctx, (clj->js data))] + (.log js/console chart))) + +(defn init [] + (println "Hello world") + (.addEventListener js/window "DOMContentLoaded" init-chart))