(ns examples.core (:require [clojure.string :as string] [clojure.set] [clojure.java.io :as io] [taoensso.nippy :as nippy] [taoensso.timbre :as timbre] [com.owoga.prhyme.frp :as frp] [com.owoga.prhyme.util :as util] [com.owoga.prhyme.core :as prhyme] [com.owoga.prhyme.data.bigrams :as bigrams] [com.owoga.prhyme.gen :as gen] [com.owoga.prhyme.nlp.core :as nlp] [com.owoga.prhyme.nlg.core :as nlg] [com.owoga.prhyme.data.dictionary :as dict] [com.owoga.prhyme.data.thesaurus :as thesaurus] [com.owoga.prhyme.data.darklyrics :as darklyrics] [com.owoga.prhyme.util.weighted-rand :as weighted-rand] [com.owoga.prhyme.generation.weighted-selection :as weighted] [clojure.set :as set] [clojure.zip :as zip] [clojure.walk :as walk])) (defn weight-fn [word target result] (let [rimes (frp/consecutive-matching word target :rimes) nuclei (frp/consecutive-matching word target :nuclei) onsets (frp/consecutive-matching word target :onsets) total (apply + (map count [rimes nuclei onsets]))] total)) (defn pred-fn [word target result] (< 0 (weight-fn word target result))) (defn weight-popular [word target result] (if (dict/popular (:normalized-word word)) 10 1)) (defn pred-popular [word target result] (< 1 (weight-popular word target result))) (def words-by-rime (prhyme/words-by-rime* (filter (fn [[word & _]] (get dict/popular (string/lower-case word))) dict/cmu-dict))) (defn rime-1 [target] (let [rime (last (:rimes target))] (fn [x] (= rime (last (:rimes x)))))) (defn rime-2 [target] (let [rime (last (butlast (:rimes target)))] (fn [x] (= rime (last (butlast (:rimes x))))))) (defn rime-member? [coll] (let [coll (into #{} coll)] (fn [x] (coll (:normalized-word x))))) (defn rime-compare [& comparators] (let [juxtcomp (apply juxt comparators)] (fn [a b] (let [a (juxtcomp a) b (juxtcomp b)] (compare a b))))) (def c (fn [a b] ((rime-compare (rime-1 {:rimes '(1 2)}) (rime-2 {:rimes '(1 2)}) (rime-member? ["foo" "bar"])) b a))) (comment (let [coll [{:rimes '(3 2) :normalized-word "foo"} {:rimes '(1 2) :normalized-word "foo"} {:rimes '(4 5) :normalized-word "foo"} {:rimes '(1 2) :normalized-word "buzz"}]] (sort c coll)) (let [coll '("woman" "union" "passion" "infatuation" "emotion" "disposition" "communion" "attraction" "affection" "adoration" "admiration") coll (map #(prhyme/phrase->word dict/popular %) coll) target (prhyme/phrase->word dict/popular "devotion") synonyms (thesaurus/synonyms "love" "heart") comparisons (fn [target] (fn [a b] ((rime-compare (rime-1 target) (rime-2 target) (rime-member? synonyms)) b a)))] (sort (comparisons target) coll)) ) (defn rhymestorm [& words] (let [synonyms (->> (apply thesaurus/synonyms words) (filter #(get dict/popular %)) (into #{})) comparisons (fn [target] (fn [a b] ((rime-compare (rime-1 target) (rime-2 target) (rime-member? synonyms)) b a)))] (->> synonyms (map (fn [synonym] (let [word (prhyme/phrase->word dict/prhyme-dict synonym) rhymes (get words-by-rime (last (:rimes word)))] (when rhymes (let [rhyming-words (map string/lower-case (prhyme/flatten-node rhymes)) rhyming-synonyms (remove #{(:normalized-word word)} (filter synonyms rhyming-words))] [(:normalized-word word) rhyming-synonyms]))))) (remove (fn [[_ rhymes]] (empty? rhymes))) (map (fn [[target rhymes]] [target (->> rhymes (map prhyme/phrase->word dict/popular) (sort (comparisons (prhyme/phrase->word dict/popular target))) (map :normalized-word))])) (into {})))) (comment (rhymestorm "love") (take 3 (drop 500 dict/prhyme-dict)) (take 3 dict/cmu-dict) (take 3 dict/popular) (let [node (get-in words-by-rime ['("AH" "V")])] (->> (prhyme/flatten-node node))) (let [love-synonyms (thesaurus/thesaurus "love") heart-synonyms (thesaurus/thesaurus "heart")] (->> (clojure.set/intersection (into #{} love-synonyms) (into #{} heart-synonyms)) (map string/lower-case) (filter #(dict/popular %)))) (let [synonyms (thesaurus/synonyms "love" "heart")] synonyms) (def love-rhymes (let [synonyms (->> (thesaurus/synonyms "love" "heart") (filter #(get dict/popular %)) (into #{}))] (->> (map (fn [synonym] (let [word (prhyme/phrase->word dict/prhyme-dict synonym) rhymes (get words-by-rime (last (:rimes word)))] (when rhymes (let [rhyming-words (map string/lower-case (prhyme/flatten-node rhymes)) rhyming-synonyms (filter synonyms rhyming-words)] [(:normalized-word word) rhyming-synonyms])))) synonyms) (into {})))) (count love-rhymes) (get-in words-by-rime ['("AH" "V")]) (weight-fn (first (filter #(= (:normalized-word %) "gotshal's") dict/prhyme-dict)) (prhyme/phrase->word dict/prhyme-dict "bye bye") nil) (take 10 darklyrics/darklyrics-markov-2) (get darklyrics/darklyrics-markov-2 '("memory" "my")) (repeatedly 5 (fn [] (let [rhymes (gen/selection-seq dict/prhyme-dict (comp (weighted/adjust-for-tail-rhyme 0.90) #_(weighted/adjust-for-rhymes 0.50) #_(weighted/adjust-for-fn :adj-rimes 0.80 pred-fn weight-fn) (weighted/adjust-for-fn :adj-popular 0.95 pred-popular weight-popular) (weighted/adjust-for-markov darklyrics/darklyrics-markov-2 0.99)) (prhyme/phrase->word dict/prhyme-dict "happy birthday taylor my love"))] (->> rhymes (take 5) (map :normalized-word))))) ) (defn remove-sentences-with-words-not-in-dictionary [dictionary] (let [dictionary (into #{} dictionary)] (fn [sentences] (->> sentences (map #(string/split % #" ")) (remove #(some (complement dictionary) %)) (remove #(some string/blank? %)) (map #(string/join " " %)))))) (defn pathed-part-of-speech-word-frequencies "Seq of pathed part-of-speech to word frequencies of each document. {(TOP NP NN) {'test' 2 'sample' 4 ,,,} (TOP VP VBZ) {'is' 5 'runs' 2 ,,,} ,,,} To reduce, deep merge with +." [documents] (->> documents (map slurp) (map util/clean-text) (filter dict/english?) (map #(string/split % #"\n+")) (map (remove-sentences-with-words-not-in-dictionary dict/popular)) (remove empty?) (remove #(some empty? %)) (map nlp/treebank-zipper) (map nlp/leaf-pos-path-word-freqs))) (defn grammar-tree-frequencies "Seq of grammar tree frequencies of each document. {(TOP (NP (NN)) (VP (VBZ))) 23 (TOP (NP (DT) (NN)) (VP (VBZ))) 18 ,,,} To reduce, merge with +." [documents] (->> documents (map slurp) (map util/clean-text) (filter dict/english?) (map #(string/split % #"\n+")) (map (remove-sentences-with-words-not-in-dictionary dict/popular)) (remove empty?) (remove #(some empty? %)) (map nlp/parse-to-simple-tree) (map nlp/parse-tree-sans-leaf-words) (map (fn [lines] (map #(hash-map % 1) lines))) (map (partial apply merge-with +)))) (defn weighted-selection-from-map [m] (first (weighted-rand/weighted-selection second (seq m)))) (defn chunked-writing-pos-path-freqs [documents chunk-size] (let [chunks (range 0 (count documents) chunk-size)] (run! (fn [chunk] (let [structure (->> documents (drop chunk) (take chunk-size) pathed-part-of-speech-word-frequencies (reduce (fn [a v] (nlp/deep-merge-with + a v)) {})) filepath (format "resources/pos-freqs/%s.nip" chunk)] (timbre/info (format "Writing to %s." filepath)) (nippy/freeze-to-file filepath structure))) chunks))) (defn chunked-writing-structure-freqs [documents chunk-size] (let [chunks (range 0 (count documents) chunk-size)] (run! (fn [chunk] (let [structure (->> documents (drop chunk) (take chunk-size) structures (reduce (fn [a v] (nlp/deep-merge-with + a v)) {})) filepath (format "resources/structure-freqs/%s.nip" chunk)] (timbre/info (format "Writing to %s." filepath)) (nippy/freeze-to-file filepath structure))) chunks))) (defn pos-paths->pos-freqs "Convert pos paths, like {(TOP S NP NN) {'test' 5 'car' 3 ,,,}} into a top-level pos freq map like {NN {'test' 25 'car' 8 ,,,}}.is" [pos-paths] (->> pos-paths (map (fn [[k v]] (hash-map (last k) v))) (reduce (fn [a v] (nlp/deep-merge-with + a v)) {}))) (comment (take 5 darklyrics/darklyrics-markov-2) (darklyrics/darklyrics-markov-2 '("time" "is")) (def darkov-2 darklyrics/darklyrics-markov-2) ;; => ([("profanity" "unholy") {"its" 2}] ;; [("ants" "triumph") {nil 1}] ;; [("hiding" "our") {"of" 1, "expose" 3, "above" 1}] ;; [("won't" "intervention") {"divine" 1, "an" 1}] ;; [("pines" "weeping") {"the" 1}]) ;; Merge pos paths (def pos-freqs-data (let [documents (->> "resources/pos-freqs" io/file file-seq (remove #(.isDirectory %)))] (reduce (fn [accum document] (let [data (nippy/thaw-from-file document)] (nlp/deep-merge-with + accum data))) {} documents))) (nippy/freeze-to-file "resources/corpus/darklyrics/pos-word-freqs.nippy" pos-freqs-data) (count pos-freqs-data) (take 20 pos-freqs-data) (time (def pos-freqs-data-3 (reduce (fn [acc [k v]] (let [new-map (hash-map (take-last 3 k) v)] (nlp/deep-merge-with + acc new-map))) {} pos-freqs-data))) (count pos-freqs-data-3) (take 2 (reverse (sort-by #(count (second %)) pos-freqs-data-3))) (time (def pos-freqs-data-2 (reduce (fn [acc [k v]] (let [new-map (hash-map (take-last 2 k) v)] (nlp/deep-merge-with + acc new-map))) {} pos-freqs-data-3))) (def structure-freq-data (let [documents (->> "resources/structure-freqs" io/file file-seq (remove #(.isDirectory %)))] (reduce (fn [accum document] (let [data (nippy/thaw-from-file document)] (nlp/deep-merge-with + accum data))) {} documents))) (def popular-structure-freq-data (into {} (take 500 (reverse (sort-by #(second %) structure-freq-data))))) (take 100 popular-structure-freq-data) (nippy/freeze-to-file "resources/corpus/darklyrics/grammar-tree-freqs.nippy" structure-freq-data) (def t1 (nippy/thaw-from-file "resources/structure-freqs/0.nip")) structures (take 100 (reverse (sort-by second structures))) (do (let [documents (->> "dark-corpus" io/file file-seq (remove #(.isDirectory %)) (drop 5000)) chunk-size 5000] (chunked-writing-pos-path-freqs documents chunk-size)) (let [documents (->> "dark-corpus" io/file file-seq (remove #(.isDirectory %)) (drop 50000)) chunk-size 5000] (chunked-writing-structure-freqs documents chunk-size))) (def t1 (nippy/thaw-from-file "resources/pos-freqs/0.nip")) (take 10 t1) (let [path-freqs (pos-paths->pos-freqs t1)] (take 10 path-freqs)) (take 5 t1) (take 10 (reverse (sort-by #(count (second %)) t1))) (def t3 (nippy/thaw-from-file "resources/pos-freqs/400.nip")) (def t2 (nippy/thaw-from-file "resources/pos-freqs/800.nip")) (count (merge-with + t1 t2 t3)) ;; => 2353 (count t3) ;; => 1013 (count t1) ;; => 871 (count t2) ;; => 676 (def corpus (->> "dark-corpus" io/file file-seq (remove #(.isDirectory %))) (time (def example-pos-freqs (->> corpus (take 100) pos-path-freqs (reduce (fn [a v] (nlp/deep-merge-with + a v)) {})))) (time (def example-structures (->> corpus (take 100) structures (reduce (fn [a v] (merge-with + a v)) {})))) (let [structure (weighted-selection-from-map example-structures)] (repeatedly 10 (fn [] (->> (nlp/generate-from-structure-and-pos-freqs structure example-pos-freqs) nlp/leaf-nodes (string/join " "))))) ;; => ("then get your life" ;; "sometimes lie my hand" ;; "still become your chapter" ;; "alright fade our surfing" ;; "far care my band" ;; "all fake my fallow" ;; "here gimme our head" ;; "long back my guide" ;; "never stop their seed" ;; "never consume our tomorrow") ;; => ("now scarred towards the future" ;; "never gone among the side" ;; "ill removed with the end" ;; "well filled in the life" ;; "again torn towards the world" ;; "desperately matched in the love" ;; "nowadays matched in the ark" ;; "awhile needed through all night" ;; "so torn in the darkness" ;; "first erased on the land") ;; => ("pictures of the destiny" ;; "tears on the pain" ;; "lights in the disaster" ;; "corpses on the fire" ;; "castles on the universe" ;; "efforts for the king" ;; "visions of the night" ;; "retreats into the darker" ;; "tales into the attack" ;; "pictures into the play") (get-in {:a 1} '()) (let [zipper (zip/seq-zip '(TOP (S (NP) (VB))))] (-> zipper zip/down zip/right zip/node)))