From d5bcd096069a8cab116559d2cd1226ef1f5350bf Mon Sep 17 00:00:00 2001 From: Eric Ihli Date: Tue, 22 Jun 2021 20:43:49 -0500 Subject: [PATCH] Remove references to dark-corpus --- .gitattributes | 3 - dev/examples/core.clj | 25 +- resources/dark-corpus-1.edn | 3 - resources/dark-corpus-2.bin | 3 - src/com/owoga/corpus/markov.clj | 12 +- src/com/owoga/prhyme/data/darklyrics.clj | 15 +- src/com/owoga/prhyme/limerick.clj | 286 ----------------------- 7 files changed, 15 insertions(+), 332 deletions(-) delete mode 100644 resources/dark-corpus-1.edn delete mode 100644 resources/dark-corpus-2.bin delete mode 100644 src/com/owoga/prhyme/limerick.clj diff --git a/.gitattributes b/.gitattributes index 66a3a6c..de3d06e 100644 --- a/.gitattributes +++ b/.gitattributes @@ -3,10 +3,7 @@ test.bin filter=lfs diff=lfs merge=lfs -text dark-fress.bin filter=lfs diff=lfs merge=lfs -text resources/models/en-sent.bin filter=lfs diff=lfs merge=lfs -text resources/models/en-token.bin filter=lfs diff=lfs merge=lfs -text -resources/dark-corpus-2.bin filter=lfs diff=lfs merge=lfs -text resources/models/en-parser-chunking.bin filter=lfs diff=lfs merge=lfs -text -resources/dark-corpus-2.edn filter=lfs diff=lfs merge=lfs -text -resources/dark-corpus-1.edn filter=lfs diff=lfs merge=lfs -text resources/models filter=lfs diff=lfs merge=lfs -text resources/pos-freqs filter=lfs diff=lfs merge=lfs -text resources/structure-freqs filter=lfs diff=lfs merge=lfs -text diff --git a/dev/examples/core.clj b/dev/examples/core.clj index 6e60b3c..42857aa 100644 --- a/dev/examples/core.clj +++ b/dev/examples/core.clj @@ -169,22 +169,9 @@ (prhyme/phrase->word dict/prhyme-dict "bye bye") nil) - (take 10 darklyrics/darklyrics-markov-2) - (get darklyrics/darklyrics-markov-2 '("memory" "my")) - (repeatedly - 5 - (fn [] - (let [rhymes (gen/selection-seq - dict/prhyme-dict - (comp (weighted/adjust-for-tail-rhyme 0.90) - #_(weighted/adjust-for-rhymes 0.50) - #_(weighted/adjust-for-fn :adj-rimes 0.80 pred-fn weight-fn) - (weighted/adjust-for-fn :adj-popular 0.95 pred-popular weight-popular) - (weighted/adjust-for-markov darklyrics/darklyrics-markov-2 0.99)) - (prhyme/phrase->word dict/prhyme-dict "happy birthday taylor my love"))] - (->> rhymes - (take 5) - (map :normalized-word))))) + + + ) (defn remove-sentences-with-words-not-in-dictionary [dictionary] @@ -293,9 +280,9 @@ {}))) (comment - (take 5 darklyrics/darklyrics-markov-2) - (darklyrics/darklyrics-markov-2 '("time" "is")) - (def darkov-2 darklyrics/darklyrics-markov-2) + + + ;; => ([("profanity" "unholy") {"its" 2}] ;; [("ants" "triumph") {nil 1}] ;; [("hiding" "our") {"of" 1, "expose" 3, "above" 1}] diff --git a/resources/dark-corpus-1.edn b/resources/dark-corpus-1.edn deleted file mode 100644 index 4e896c8..0000000 --- a/resources/dark-corpus-1.edn +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e1e27da97d1d0a876026d0c220854dd1f23c12c282bb0ce5b9fc40b12ff9abf6 -size 4281246 diff --git a/resources/dark-corpus-2.bin b/resources/dark-corpus-2.bin deleted file mode 100644 index d95e18c..0000000 --- a/resources/dark-corpus-2.bin +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:533c32eceec147964e505724eeef32968206a1d8df87ba51a6a2432409aca0b6 -size 87008392 diff --git a/src/com/owoga/corpus/markov.clj b/src/com/owoga/corpus/markov.clj index ca2abab..ee9003a 100644 --- a/src/com/owoga/corpus/markov.clj +++ b/src/com/owoga/corpus/markov.clj @@ -200,7 +200,7 @@ (let [files (->> "dark-corpus" io/file file-seq - (eduction (xf-file-seq 0 5000))) + (eduction (xf-file-seq 0 10000))) [trie database] (train-backwards files 1 4 "/tmp/trie.bin" "/tmp/database.bin" "/tmp/tpt.bin")])) (def markov-trie (into (trie/make-trie) (nippy/thaw-from-file "/tmp/trie.bin"))) @@ -709,10 +709,10 @@ (:current-best @context) false))))))) -(def best-of-10 (valid-or-best-sentence? 10)) +(def best-of-20 (valid-or-best-sentence? 20)) (comment - (take-until (best-of-10) (constantly "my name sky does eat")) + (take-until (best-of-20) (constantly "my name sky does eat")) ) (comment @@ -792,7 +792,7 @@ ; Here, we need to make a choice about which pronunciation ; we want to use to build line-phones. Choose randomly. (take-until - (best-of-10) + (best-of-20) #(tightly-generate-n-syllable-sentence database markov-trie @@ -800,7 +800,7 @@ syllable-count (make-markov-filter (map database [prhyme/BOS prhyme/EOS])))) (take-until - (best-of-10) + (best-of-20) #(tightly-generate-n-syllable-sentence-rhyming-with database markov-trie @@ -828,7 +828,7 @@ 10) (repeatedly - 5 + 2 #(->> (rhyme-from-scheme '[[A 9] [A 9] [B 5] [B 5] [A 9]] database diff --git a/src/com/owoga/prhyme/data/darklyrics.clj b/src/com/owoga/prhyme/data/darklyrics.clj index f2623a9..5534462 100644 --- a/src/com/owoga/prhyme/data/darklyrics.clj +++ b/src/com/owoga/prhyme/data/darklyrics.clj @@ -22,8 +22,7 @@ (io/copy xin xout) (nippy/thaw (.toByteArray xout) thaw-opts)))) -(def darklyrics-markov-2 - (thaw-from-file (io/resource "dark-corpus-2.bin"))) + (comment (def words (map #(vector (hash %) %) @@ -35,14 +34,6 @@ (def ds "jdbc:sqlite:resources/darklyrics.db") - (def hashes - (into - {} - (map - (fn [[k v]] - [(hash k) k]) - darklyrics-markov-2))) - (nippy/freeze-to-file "resources/dark-corpus-hashes.nip" hashes) @@ -68,6 +59,6 @@ (println (+ 2 2)) (keyword "won't") - (get darklyrics-markov-2 '("hiding" "our")) - (count darklyrics-markov-2) + + ) diff --git a/src/com/owoga/prhyme/limerick.clj b/src/com/owoga/prhyme/limerick.clj deleted file mode 100644 index 71ae818..0000000 --- a/src/com/owoga/prhyme/limerick.clj +++ /dev/null @@ -1,286 +0,0 @@ -(ns com.owoga.prhyme.limerick - (:require [com.owoga.prhyme.generation.weighted-selection :as weighted-selection] - [com.owoga.prhyme.util.math :as math] - [com.owoga.prhyme.nlp.core :as nlp] - [clojure.string :as string] - [com.owoga.phonetics :as phonetics] - [com.owoga.phonetics.syllabify :as syllabify] - [com.owoga.prhyme.core :as prhyme] - [com.owoga.prhyme.util :as util] - [com.owoga.prhyme.data.dictionary :as dict] - [com.owoga.trie :as trie] - [com.owoga.tightly-packed-trie :as tpt] - [com.owoga.tightly-packed-trie.encoding :as encoding] - [clojure.java.io :as io])) - -(defn rhyme-from-scheme - "scheme of format [[A 9] [A 9] [B 5] [B 5] [A 9]]" - [words markov scheme] - (loop [scheme scheme - rhymes {} - result []] - (cond - (empty? scheme) result - :else - (let [[pattern syllable-count] (first scheme) - banned-words (into #{} (->> result - (map #(string/split % #" ")) - (map #(last %)))) - adj (util/comp-rnil - (weighted-selection/adjust-for-markov - markov - 0.99) - (when (rhymes pattern) - (weighted-selection/adjust-for-tail-rhyme 0.99))) - rhyme (if (nil? (get rhymes pattern)) - (gen/gen-sentence-with-syllable-count - adj - syllable-count - words) - (gen/gen-rhyme-with-syllable-count - adj - syllable-count - (remove #(banned-words (:normalized-word %)) words) - (prhyme/phrase->word words (get rhymes pattern))))] - (recur (rest scheme) - (assoc rhymes pattern rhyme) - (conj result rhyme)))))) - -(comment - (require '[com.owoga.prhyme.data.dictionary :as dict] - '[com.owoga.prhyme.data.darklyrics :refer [darklyrics-markov-2]] - '[clojure.java.io :as io]) - - (rhyme-from-scheme dict/prhyme-dict darklyrics-markov-2 '((A 8) (A 8) (B 5) (B 5) (A 8))) - - ) - -(comment - ["bishop larch smitten us dwell" - "solely first week in hell" - "and take that for three" - "come wrapped in glory" - "you ever leave it so well"] - ["romancing realized too late" - "my crown revive my withered state" - "reign is obsolete" - "i sit in the street" - "but nobody cares of my fate"] - ["flesh is hacked to get me sedate" - "demonstration obsessed with hate" - "justice will be written in stone" - "and you will be shown" - "bedrooms of icons suffocate"] - ["you will bow to their hungry gods" - "come will come whatever the odds" - "now we see the light" - "you can't put it right" - "recklessly chopping firing squads"] - ["untimely they fool their poor life" - "it wither away with this knife" - "hate is my virtue" - "my feelings are well overdue" - "war we await the afterlife"]) - - -;;;; Generating limericks with a markov model - -(defn phrase->flex-rhyme-phones - "Takes a space-seperated string of words - and returns the concatenation of the words - vowel phones. - - Returns them in reversed order so they - are ready to be used in a lookup of a rhyme trie. - " - [phrase] - (->> phrase - (#(string/split % #" ")) - (map (comp syllabify/syllabify first phonetics/get-phones)) - (map (partial reduce into [])) - (map #(filter (partial re-find #"\d") %)) - (flatten) - (map #(string/replace % #"\d" "")) - (reverse))) - -(defn word->phones [word] - (or (dict/word->cmu-phones word) - (util/get-phones-with-stress word))) - -(defonce context (atom {})) - -(defn decode-fn [db] - (fn [byte-buffer] - (let [value (encoding/decode byte-buffer)] - (if (zero? value) - nil - [value (encoding/decode byte-buffer)])))) - -(defn initialize [] - (swap! - context - assoc - :database - (with-open [rdr (clojure.java.io/reader "resources/backwards-database.bin")] - (into {} (map read-string (line-seq rdr))))) - - (swap! - context - assoc - :trie - (tpt/load-tightly-packed-trie-from-file - "resources/dark-corpus-backwards-tpt.bin" - (decode-fn (@context :database)))) - - (swap! - context - assoc - :perfect-rhyme-trie - (transduce - (comp - (map first) - (filter string?) - (map #(vector % (reverse (word->phones %)))) - (map reverse)) - (completing - (fn [trie [k v]] - (update trie k (fnil #(update % 1 inc) [v 0])))) - (trie/make-trie) - (@context :database))) - - (swap! - context - assoc - :rhyme-trie - (transduce - (comp - (map first) - (filter string?) - (map #(vector % (reverse (word->phones %)))) - (map reverse)) - (completing - (fn [trie [k v]] - (update trie k (fnil #(update % 1 inc) [v 0])))) - (trie/make-trie) - (@context :database))) - - (swap! - context - assoc - :flex-rhyme-trie - (transduce - (comp - (map (fn [[k v]] - [(string/join " " (map (@context :database) k)) - [k v]])) - (map (fn [[phrase [k v]]] - [(phrase->flex-rhyme-phones phrase) - [k v]]))) - (completing - (fn [trie [k v]] - (update trie k (fnil conj [v]) v))) - (trie/make-trie) - (->> (trie/children-at-depth (@context :trie) 0 1)))) - nil) - - -(comment - (time (initialize)) - - (println 2) - - (take 5 (:flex-rhyme-trie @context)) - - ) - -(defn choose-next-word - "Given an n-gram of [[word1 freq1] [word2 freq2]] chooses - the next word based on markov data in trie. - - Could be improved by taking into account grammar and/or bidirectional context. - - The n-gram parameter is a list of trie entries - For trie entries that are word/frequency pairs, it might look something like this. - `[[sunshine 38] [ 509]]` - - But note that nothing in this function uses the frequency count from the passed in n-gram. - It's just easier for the calling functions to pass them in like that." - [{:keys [database trie] :as context} n-gram] - (let [n-gram-ids (->> n-gram (map first) (map database)) - node (trie/lookup trie n-gram-ids)] - (cond - (= 0 (count n-gram-ids)) - (let [children (->> (trie/children trie) - (map #(get % []))) - choice (math/weighted-selection second children)] - [(database (first choice)) (second choice)]) - node - (let [children (->> (trie/children node) - (map #(get % [])) - (remove (fn [[id f]] (= id (first n-gram-ids)))))] - (if (seq children) - (let [children-freqs (into (sorted-map) (frequencies (map second children))) - n-minus-1-gram-odds (/ (second (first children-freqs)) - (+ (second (get node [])) - (second (first children-freqs)))) - ;; Good-turing smoothing, take unseen ngram? - take-n-minus-1-gram? (and (< 1 (count n-gram-ids)) - (< (rand) n-minus-1-gram-odds))] - (if take-n-minus-1-gram? - (choose-next-word context (butlast n-gram)) - (let [choice (math/weighted-selection second children)] - [(database (first choice)) (second choice)]))) - (choose-next-word context (butlast n-gram)))) - :else - (choose-next-word context (butlast n-gram))))) - -(defn valid-sentence? [phrase] - (->> phrase - (map first) - (string/join " ") - (#(string/replace % #"(|)" "")) - (nlp/valid-sentence?))) - -(defn generate-sentence-backwards - "Given a phrase of [w1 w2 w3] generates a sentence - using a backwards markov." - ([{:keys [database trie] :as context} phrase] - (let [phrase (map (fn [w] - (let [id (database w)] - [w (second (get trie [id]))])) - phrase)] - (loop [phrase' (loop [phrase phrase] - (if (= "" (first (first phrase))) - phrase - (recur (cons (choose-next-word context (take 3 phrase)) - phrase))))] - (if (valid-sentence? phrase') - phrase' - (recur (loop [phrase phrase] - (if (= "" (first (first phrase))) - phrase - (recur (cons (choose-next-word context (take 3 phrase)) - phrase))))))))) - ) - -(comment - (take 5 (:database @context)) - - (map (:database @context) ["me" "bother"]) - (map (:database @context) ["bother me"]) - (first - (filter - valid-sentence? - (repeatedly - (fn [] - (generate-sentence-backwards - @context - ["bother" "me" ""]))))) - - (keys @context) - (time (initialize)) - ) - -(defn rhyme-from-scheme-2 - "Generate rhyme without the use of `weighted-selection/adjust-for-markov`." - [])