diff --git a/dev/examples/core.clj b/dev/examples/core.clj index f388b1e..3438f80 100644 --- a/dev/examples/core.clj +++ b/dev/examples/core.clj @@ -207,8 +207,13 @@ (remove #(some string/blank? %)) (map #(string/join " " %)))))) -(defn pos-path-freqs - "Seq of pos-path frequencies of each document. +(defn pathed-part-of-speech-word-frequencies + "Seq of pathed part-of-speech to word frequencies of each document. + + {(TOP NP NN) {'test' 2 'sample' 4 ,,,} + (TOP VP VBZ) {'is' 5 'runs' 2 ,,,} + ,,,} + To reduce, deep merge with +." [documents] (->> documents @@ -222,8 +227,13 @@ (map nlp/treebank-zipper) (map nlp/leaf-pos-path-word-freqs))) -(defn structures - "Seq of structure frequencies of each document. +(defn grammar-tree-frequencies + "Seq of grammar tree frequencies of each document. + + {(TOP (NP (NN)) (VP (VBZ))) 23 + (TOP (NP (DT) (NN)) (VP (VBZ))) 18 + ,,,} + To reduce, merge with +." [documents] (->> documents @@ -252,7 +262,7 @@ (let [structure (->> documents (drop chunk) (take chunk-size) - pos-path-freqs + pathed-part-of-speech-word-frequencies (reduce (fn [a v] (nlp/deep-merge-with + a v)) @@ -303,18 +313,82 @@ ;; [("won't" "intervention") {"divine" 1, "an" 1}] ;; [("pines" "weeping") {"the" 1}]) - (def structures (nippy/thaw-from-file "resources/structure-freqs/0.nip")) + + ;; Merge pos paths + (def pos-freqs-data + (let [documents (->> "resources/pos-freqs" + io/file + file-seq + (remove #(.isDirectory %)))] + (reduce + (fn [accum document] + (let [data (nippy/thaw-from-file document)] + (nlp/deep-merge-with + accum data))) + {} + documents))) + + (nippy/freeze-to-file "resources/corpus/darklyrics/pos-word-freqs.nippy" pos-freqs-data) + (count pos-freqs-data) + (take 20 pos-freqs-data) + + (time + (def pos-freqs-data-3 + (reduce + (fn [acc [k v]] + (let [new-map (hash-map (take-last 3 k) v)] + (nlp/deep-merge-with + acc new-map))) + {} + pos-freqs-data))) + (count pos-freqs-data-3) + (take 2 (reverse (sort-by #(count (second %)) pos-freqs-data-3))) + + (time + (def + pos-freqs-data-2 + (reduce + (fn [acc [k v]] + (let [new-map (hash-map (take-last 2 k) v)] + (nlp/deep-merge-with + acc new-map))) + {} + pos-freqs-data-3))) + + (def structure-freq-data + (let [documents (->> "resources/structure-freqs" + io/file + file-seq + (remove #(.isDirectory %)))] + (reduce + (fn [accum document] + (let [data (nippy/thaw-from-file document)] + (nlp/deep-merge-with + accum data))) + {} + documents))) + (def popular-structure-freq-data (into {} (take 500 (reverse (sort-by #(second %) structure-freq-data))))) + (take 100 popular-structure-freq-data) + (nippy/freeze-to-file "resources/corpus/darklyrics/grammar-tree-freqs.nippy" structure-freq-data) + + (def t1 (nippy/thaw-from-file "resources/structure-freqs/0.nip")) + structures (take 100 (reverse (sort-by second structures))) - (let [documents (->> "dark-corpus" - io/file - file-seq - (remove #(.isDirectory %)) - (drop 5000) - (take 10000)) - chunk-size 5000] - (chunked-writing-pos-path-freqs - documents - chunk-size)) + (do + (let [documents (->> "dark-corpus" + io/file + file-seq + (remove #(.isDirectory %)) + (drop 5000)) + chunk-size 5000] + (chunked-writing-pos-path-freqs + documents + chunk-size)) + (let [documents (->> "dark-corpus" + io/file + file-seq + (remove #(.isDirectory %)) + (drop 50000)) + chunk-size 5000] + (chunked-writing-structure-freqs + documents + chunk-size))) (def t1 (nippy/thaw-from-file "resources/pos-freqs/0.nip")) (take 10 t1) @@ -336,7 +410,7 @@ (->> "dark-corpus" io/file file-seq - (remove #(.isDirectory %)))) + (remove #(.isDirectory %))) (time (def example-pos-freqs @@ -406,5 +480,4 @@ (-> zipper zip/down zip/right - zip/node)) - ) + zip/node))) diff --git a/src/com/owoga/corpus/darklyrics.clj b/src/com/owoga/corpus/darklyrics.clj index ffb6814..7c33b2f 100644 --- a/src/com/owoga/corpus/darklyrics.clj +++ b/src/com/owoga/corpus/darklyrics.clj @@ -1,5 +1,3 @@ -;; TODO: Filter out non-English lyrics. - (ns com.owoga.corpus.darklyrics (:require [net.cgrand.enlive-html :as html] [com.owoga.prhyme.util :as util] @@ -10,10 +8,14 @@ (def base-url "http://www.darklyrics.com/a.html") (def data-dir "dark-corpus") -(defn fix-url [url] +(defn fix-url + "Some hrefs are relative and some are absolute." + [url] (string/replace url #".*(http://.*(?!http://).*$)" "$1")) -(defn fetch-url- [url] +(defn fetch-url- + "Memoized for faster iterations in development." + [url] (let [url (fix-url url)] (try (html/html-resource (java.net.URL. url)) @@ -139,9 +141,6 @@ {} ((util/window (inc n)) tokens)))) -(defn read-darkov-2 [] - (util/read-markov (io/resource "dark-corpus-2.edn"))) - (defn norm-filepath [text] (-> text string/lower-case @@ -168,11 +167,6 @@ artist-album-texts))) (comment - (def darkov-2 (util/read-markov (io/resource "dark-corpus-2.edn"))) - (take 10 darkov-2) - (get darkov-2 '(nil nil)) - (take 3 (scrape base-url)) - (-main) (def letters-urls (parse-letters-urls (fetch-url base-url))) (def artists-urls (parse-artists-urls (fetch-url (first letters-urls)))) (def artist-html (fetch-url (first artists-urls))) @@ -190,16 +184,4 @@ (filter string?) (map string/trim) (string/join "\n") - (string/trim))]))) - - (->> (html/select artist-html [:h1]) - (map html/text) - (first )) - (def darkov - (into - {} - (map (fn [[k v]] (vector (list k) v)) - (make-markov (slurp "darklyrics.txt") 1)))) - (run! write-scrape (take 4 (scrape base-url))) - - (def lyrics (scrape base-url))) + (string/trim))])))) diff --git a/src/com/owoga/prhyme/data/darklyrics.clj b/src/com/owoga/prhyme/data/darklyrics.clj index 0ead430..f2623a9 100644 --- a/src/com/owoga/prhyme/data/darklyrics.clj +++ b/src/com/owoga/prhyme/data/darklyrics.clj @@ -1,7 +1,12 @@ (ns com.owoga.prhyme.data.darklyrics (:require [clojure.java.io :as io] - [taoensso.nippy :as nippy]) - (:import [java.io DataInputStream ByteArrayOutputStream])) + [clojure.edn :as edn] + [taoensso.nippy :as nippy] + [next.jdbc :as jdbc] + [next.jdbc.sql :as sql] + [com.owoga.prhyme.data.dictionary :as dict]) + (:import (java.io ByteArrayOutputStream ByteArrayInputStream + DataOutputStream DataInputStream))) (defn thaw-from-file "Convenience util: like `thaw`, but reads from `(clojure.java.io/file )`. @@ -17,23 +22,52 @@ (io/copy xin xout) (nippy/thaw (.toByteArray xout) thaw-opts)))) -;; (thaw-from-file (io/resource "test.bin")) -;; (bytes (byte-array (take 20 (range)))) +(def darklyrics-markov-2 + (thaw-from-file (io/resource "dark-corpus-2.bin"))) -;; (byte-array (map (comp byte int) "ascii")) -;; ;; => [97, 115, 99, 105, 105] -;; (bytes (byte-array (map (comp byte int) "ascii"))) -;; ;; => [97, 115, 99, 105, 105] +(comment + (def words (map #(vector (hash %) %) + (map :normalized-word dict/prhyme-dict))) + (count words) + (count dict/prhyme-dict) + (count (into #{} (map first words))) + (take 5 words) -;; (let [xin (io/input-stream (io/resource "test.bin")) -;; xout (ByteArrayOutputStream.)] -;; (io/copy xin xout) -;; (nippy/thaw (.toByteArray xout))) + (def ds "jdbc:sqlite:resources/darklyrics.db") -;; (.fullyRead (io/input-stream (io/resource "test.bin"))) + (def hashes + (into + {} + (map + (fn [[k v]] + [(hash k) k]) + darklyrics-markov-2))) -;; (.getSize (io/input-stream (io/resource "dark-corpus-2.bin"))) -;; (def data (into {} (map vec (partition 2 (range 20))))) -;; (nippy/freeze-to-file "resources/test.bin" data) -(def darklyrics-markov-2 - (thaw-from-file (io/resource "dark-corpus-2.bin"))) + (nippy/freeze-to-file + "resources/dark-corpus-hashes.nip" + hashes) + + (run! + (fn [c] + (sql/insert-multi! + ds + :markov + [:hash :words] + c)) + (partition (int 1e5) hashes)) + + (run! + (fn [c] + (sql/insert-multi! + ds + :dict + [:hash :word] + c)) + (partition (int 1e5) words)) + + (println (+ 2 2)) + + (keyword "won't") + (get darklyrics-markov-2 '("hiding" "our")) + (count darklyrics-markov-2) + ) diff --git a/src/com/owoga/prhyme/data/dictionary.clj b/src/com/owoga/prhyme/data/dictionary.clj index 0e3ff48..4728255 100644 --- a/src/com/owoga/prhyme/data/dictionary.clj +++ b/src/com/owoga/prhyme/data/dictionary.clj @@ -5,6 +5,13 @@ [com.owoga.prhyme.util :as util] [com.owoga.prhyme.core :as prhyme])) +(def cmu-with-stress + (->> (io/resource "cmudict-0.7b") + io/reader + line-seq + (drop-while #(= \; (first %))) + (map #(string/split % #"\s+")))) + (def cmu-dict (->> (io/reader (io/resource "cmudict_SPHINX_40")) (line-seq) @@ -62,3 +69,15 @@ (->> words (filter #(word-set (string/lower-case %))))] (< 0.7 (/ (count english-words) (max 1 (count words)))))) + +(comment + (let [phoneme-lookup (into + {} + (map + (fn [[word & phonemes]] + [(string/lower-case word) + phonemes]) + cmu-with-stress))] + (phoneme-lookup "zhirinovsky")) + + ) diff --git a/src/com/owoga/prhyme/nlg/core.clj b/src/com/owoga/prhyme/nlg/core.clj index c57c045..fcf5bb5 100644 --- a/src/com/owoga/prhyme/nlg/core.clj +++ b/src/com/owoga/prhyme/nlg/core.clj @@ -221,13 +221,50 @@ ) +(defmethod create-element '(TOP (PP (IN) (NP (DT) (NN)))) + [tree] + (let [zipper (zip/seq-zip tree) + noun (->> tree + leaf-filter + (filter (fn [z] + (let [[pos word] (zip/node z)] + (= pos 'NN)))) + (map zip/node) + first) + preposition (->> tree + leaf-filter + (filter (fn [z] + (let [[pos word] (zip/node z)] + (= pos 'IN)))) + (map zip/node) + first) + determiner (->> tree + leaf-filter + (filter (fn [z] + (let [[pos word] (zip/node z)] + (= pos 'DT)))) + (map zip/node) + first) + noun-phrase (.createNounPhrase nlg-factory (second noun)) + prepositional-phrase (.createPrepositionPhrase nlg-factory) + clause (.createClause nlg-factory)] + (.setDeterminer noun-phrase (second determiner)) + (.addComplement prepositional-phrase noun-phrase) + (.setPreposition prepositional-phrase (second preposition)) + prepositional-phrase)) + +(comment + (.realise + realiser + (create-element '(TOP (PP (IN "in") (NP (DT "the") (NN "park")))))) + ) + (defmethod create-element '(PRP$) [[[_ child]]] (.createNounPhrase nlg-factory child)) (defmethod create-element 'NN [clause [pos child]] - (let [noun-phrase (.createNounPhrase nlg-factory child)] - (.setNoun))) + (let [noun-phrase (.createNounPhrase nlg-factory child)])) (comment (let [clause (.createClause nlg-factory) @@ -270,7 +307,6 @@ (.setNoun clause nn2) clause)) -(realise (create-element '((DT "a") (NN "sample") (NN "test")))) (defmethod create-element '(PRP$ NN) [[prp$ nn]] @@ -288,18 +324,11 @@ [[[_ child]]] (create-element child)) -(realise (create-element '((NP ((PRP$ "Eric") (NN "test")))))) - -(realise (create-element '((DT "This")))) -(create-element '((NN "test"))) -(realise (create-element '((PRP$ "Eric") (NN "test")))) -(realise (create-element '((NNP "tests")))) (defmethod create-element '(VB) [[[_ child]]] (.createVerbPhrase nlg-factory child)) -(realise (create-element '((VB "run")))) (defmethod create-element '(VBZ) [[[_ child]]] @@ -315,7 +344,15 @@ clause)) (comment + (realise (create-element '((NP ((PRP$ "Eric") (NN "test")))))) (realise (create-element '((VBZ "is") (NP ((NN "test")))))) + + (realise (create-element '((VB "run")))) + (realise (create-element '((DT "a") (NN "sample") (NN "test")))) + (create-element '((NN "test"))) + (realise (create-element '((PRP$ "Eric") (NN "test")))) + (realise (create-element '((NNP "tests")))) + ) (comment diff --git a/src/com/owoga/prhyme/nlg/prhyme_nlg.clj b/src/com/owoga/prhyme/nlg/prhyme_nlg.clj index b677da4..aa9580e 100644 --- a/src/com/owoga/prhyme/nlg/prhyme_nlg.clj +++ b/src/com/owoga/prhyme/nlg/prhyme_nlg.clj @@ -225,9 +225,71 @@ :else (recur (next parse-zipper))))) +(defn generate-with-markov-with-custom-progression-n-2-pos-freqs + "Sams as above, but with next/prev and stop fns" + [next + prev + next-stop? + prev-stop? + pos-path->word-freqs + parse-zipper + markov] + (loop [parse-zipper parse-zipper] + (cond + (nil? (next parse-zipper)) (zip/root parse-zipper) + + (next-stop? parse-zipper) (zip/root parse-zipper) + + (zip/branch? parse-zipper) + (recur (next parse-zipper)) + + (string? (zip/node parse-zipper)) + (recur (next parse-zipper)) + + (and (symbol? (zip/node parse-zipper)) + (pos-path->word-freqs (take-last 2 (seq (map first (zip/path parse-zipper)))))) + (let [target-path (take-last 2 (seq (map first (zip/path parse-zipper)))) + target-pos (zip/node parse-zipper) + pos-path-word (pos-path->word-freqs target-path) + pos-map pos-path-word + markov-options (markov (reverse + (next-two-words (nlp/iter-zip + parse-zipper + prev + prev-stop?)))) + selection-possibilities (merge-with + (fn [a b] + (let [max-pos (apply max (vals pos-map))] + (+ a b max-pos))) + pos-map + markov-options)] + (timbre/info "Markov options are" + (apply list (next-two-words (nlp/iter-zip + parse-zipper + prev + prev-stop?))) + (apply list (take 10 markov-options))) + (timbre/info "Choosing POS for" target-path) + (let [selection (weighted-rand/weighted-selection-from-map + selection-possibilities)] + (timbre/info + "Most likely selection possibilities" + (apply list (take 5 (reverse (sort-by second selection-possibilities))))) + (timbre/info "Chose " selection) + (recur + (-> parse-zipper + zip/up + (#(zip/replace % (list (zip/node (zip/down %)) selection))) + zip/down + next + next)))) + + :else + (recur (next parse-zipper))))) + (comment (let [structure '(TOP (S (NP (DT) (JJ) (NN)) - (VP (VBZ)) + (VP (RB) (VBZ)) (NP (DT) (JJ) (NN)))) structure (-> structure zip/seq-zip @@ -238,17 +300,14 @@ (repeatedly 10 (fn [] - (->> (generate-with-markov-with-custom-progression + (->> (generate-with-markov-with-custom-progression-n-2-pos-freqs zip/prev zip/next nil? zip/end? - examples/t1 - pos-freqs + examples/pos-freqs-data-2 structure - examples/darkov-2) - nlp/leaf-nodes - (string/join " "))))) + examples/darkov-2))))) (timbre/set-level! :info) (timbre/set-level! :error) @@ -276,24 +335,21 @@ (take 5 examples/t2) - (let [structure '(TOP (S (NP (DT) (JJ) (NN)) - (VP (VBZ)) - (NP (DT) (JJ) (NN)))) + (let [structure (weighted-rand/weighted-selection-from-map + examples/popular-structure-freq-data) structure (-> structure zip/seq-zip nlp/iter-zip last) - pos-freqs (examples/pos-paths->pos-freqs - examples/t1)] + pos-freqs examples/pos-freqs-data-2] (repeatedly 10 (fn [] - (->> (generate-with-markov-with-custom-progression + (->> (generate-with-markov-with-custom-progression-n-2-pos-freqs zip/prev zip/next nil? zip/end? - examples/t1 pos-freqs structure examples/darkov-2) diff --git a/src/com/owoga/prhyme/nlp/core.clj b/src/com/owoga/prhyme/nlp/core.clj index 54f33d0..e7261fa 100644 --- a/src/com/owoga/prhyme/nlp/core.clj +++ b/src/com/owoga/prhyme/nlp/core.clj @@ -335,6 +335,11 @@ (map #(hash-map (butlast %) {(last %) 1})) (apply deep-merge-with +))) +(defn pathed-part-of-speech-word-frequencies + "I like this name better." + [zipper] + (leaf-pos-path-word-freqs zipper)) + (comment (let [zipper (treebank-zipper ["Eric's test is difficult." "Eric's test is thorough." @@ -379,6 +384,32 @@ ;; (TOP (S (NP (NNP)) (VP (VBZ) (VP (VBG))) (.)))) ) +(defn grammar-tree-frequencies + "Seq of grammar tree frequencies of each document. + + {(TOP (NP (NN)) (VP (VBZ))) 23 + (TOP (NP (DT) (NN)) (VP (VBZ))) 18 + ,,,} + + To reduce, merge with +." + [document] + (->> document + parse-to-simple-tree + parse-tree-sans-leaf-words + (map #(hash-map % 1)) + (apply merge-with +))) + +(comment + (let [document ["this is a test" + "this is a sample" + "that was a test" + "you are a test"]] + (grammar-tree-frequencies + document)) + ;; => {(TOP (S (NP (WDT)) (VP (VBD) (NP (DT) (NN))))) 1, + ;; (TOP (S (NP (DT)) (VP (VBZ) (NP (DT) (NN))))) 2, + ;; (TOP (S (NP (PRP)) (VP (VBP) (NP (DT) (NN))))) 1} + ) ;;;; This is not sufficient ;; You'll end up with a mapping that says a verb phrase can be a @@ -826,13 +857,6 @@ (pos-tagger (tokenize phrase))) ) -(comment - (let [text "My name is Eric." - zipper (treebank-zipper text)] - (map identity (seq-zip zipper))) - - ) - (defn zipper->freqs [zipper] (let [coll ()])) @@ -987,7 +1011,8 @@ ) (defn reverse-children-to-parents - "We want to work from the reverse, or backwards, representation of pos-constituents-frequencies. + "We want to work from the reverse, or backwards, + representation of pos-constituents-frequencies. Pos-constituent-frequencies is good for generating text top-down. But we want to generate text bottom-up for rhyming." [pos-freqs] @@ -1018,7 +1043,7 @@ "Who is your mother and what does she do?"] (pos-constituent-frequencies))] (->> pos-freqs - #_(ways-to-make-a 'S) + (ways-to-make-a 'S) (#(get % 'NN)))) (let [pos-freqs (->> ["My name is Eric." diff --git a/src/com/owoga/prhyme/util.clj b/src/com/owoga/prhyme/util.clj index 737c8aa..6f7526f 100644 --- a/src/com/owoga/prhyme/util.clj +++ b/src/com/owoga/prhyme/util.clj @@ -30,6 +30,22 @@ (map convert-to-sphinx) (map string/upper-case))) +(defn get-phones-with-stress + ".getPhones only " + [word] + (->> (map str (.getPhones cmu-lexicon word nil)) + (map convert-to-sphinx) + (map string/upper-case))) + +(comment + + (require '[com.owoga.prhyme.data.dictionary :as dict]) + (rest (first (filter #(= (first %) "ZHIRINOVSKY") dict/cmu-with-stress))) + ;; => ("ZH" "IH2" "R" "AH0" "N" "AA1" "V" "S" "K" "IY2") + (get-phones-with-stress "zhirinovsky") + ;; => ("ZH" "IH1" "R" "AH" "N" "AA1" "F" "S" "K" "IY") + ) + (defn window [n] (fn [coll] (cond