diff --git a/.gitattributes b/.gitattributes
index 66a3a6c..de3d06e 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -3,10 +3,7 @@ test.bin filter=lfs diff=lfs merge=lfs -text
dark-fress.bin filter=lfs diff=lfs merge=lfs -text
resources/models/en-sent.bin filter=lfs diff=lfs merge=lfs -text
resources/models/en-token.bin filter=lfs diff=lfs merge=lfs -text
-resources/dark-corpus-2.bin filter=lfs diff=lfs merge=lfs -text
resources/models/en-parser-chunking.bin filter=lfs diff=lfs merge=lfs -text
-resources/dark-corpus-2.edn filter=lfs diff=lfs merge=lfs -text
-resources/dark-corpus-1.edn filter=lfs diff=lfs merge=lfs -text
resources/models filter=lfs diff=lfs merge=lfs -text
resources/pos-freqs filter=lfs diff=lfs merge=lfs -text
resources/structure-freqs filter=lfs diff=lfs merge=lfs -text
diff --git a/dev/examples/core.clj b/dev/examples/core.clj
index 6e60b3c..42857aa 100644
--- a/dev/examples/core.clj
+++ b/dev/examples/core.clj
@@ -169,22 +169,9 @@
(prhyme/phrase->word dict/prhyme-dict "bye bye")
nil)
- (take 10 darklyrics/darklyrics-markov-2)
- (get darklyrics/darklyrics-markov-2 '("memory" "my"))
- (repeatedly
- 5
- (fn []
- (let [rhymes (gen/selection-seq
- dict/prhyme-dict
- (comp (weighted/adjust-for-tail-rhyme 0.90)
- #_(weighted/adjust-for-rhymes 0.50)
- #_(weighted/adjust-for-fn :adj-rimes 0.80 pred-fn weight-fn)
- (weighted/adjust-for-fn :adj-popular 0.95 pred-popular weight-popular)
- (weighted/adjust-for-markov darklyrics/darklyrics-markov-2 0.99))
- (prhyme/phrase->word dict/prhyme-dict "happy birthday taylor my love"))]
- (->> rhymes
- (take 5)
- (map :normalized-word)))))
+
+
+
)
(defn remove-sentences-with-words-not-in-dictionary [dictionary]
@@ -293,9 +280,9 @@
{})))
(comment
- (take 5 darklyrics/darklyrics-markov-2)
- (darklyrics/darklyrics-markov-2 '("time" "is"))
- (def darkov-2 darklyrics/darklyrics-markov-2)
+
+
+
;; => ([("profanity" "unholy") {"its" 2}]
;; [("ants" "triumph") {nil 1}]
;; [("hiding" "our") {"of" 1, "expose" 3, "above" 1}]
diff --git a/resources/dark-corpus-1.edn b/resources/dark-corpus-1.edn
deleted file mode 100644
index 4e896c8..0000000
--- a/resources/dark-corpus-1.edn
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:e1e27da97d1d0a876026d0c220854dd1f23c12c282bb0ce5b9fc40b12ff9abf6
-size 4281246
diff --git a/resources/dark-corpus-2.bin b/resources/dark-corpus-2.bin
deleted file mode 100644
index d95e18c..0000000
--- a/resources/dark-corpus-2.bin
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:533c32eceec147964e505724eeef32968206a1d8df87ba51a6a2432409aca0b6
-size 87008392
diff --git a/src/com/owoga/corpus/markov.clj b/src/com/owoga/corpus/markov.clj
index ca2abab..ee9003a 100644
--- a/src/com/owoga/corpus/markov.clj
+++ b/src/com/owoga/corpus/markov.clj
@@ -200,7 +200,7 @@
(let [files (->> "dark-corpus"
io/file
file-seq
- (eduction (xf-file-seq 0 5000)))
+ (eduction (xf-file-seq 0 10000)))
[trie database] (train-backwards files 1 4 "/tmp/trie.bin" "/tmp/database.bin" "/tmp/tpt.bin")]))
(def markov-trie (into (trie/make-trie) (nippy/thaw-from-file "/tmp/trie.bin")))
@@ -709,10 +709,10 @@
(:current-best @context)
false)))))))
-(def best-of-10 (valid-or-best-sentence? 10))
+(def best-of-20 (valid-or-best-sentence? 20))
(comment
- (take-until (best-of-10) (constantly "my name sky does eat"))
+ (take-until (best-of-20) (constantly "my name sky does eat"))
)
(comment
@@ -792,7 +792,7 @@
; Here, we need to make a choice about which pronunciation
; we want to use to build line-phones. Choose randomly.
(take-until
- (best-of-10)
+ (best-of-20)
#(tightly-generate-n-syllable-sentence
database
markov-trie
@@ -800,7 +800,7 @@
syllable-count
(make-markov-filter (map database [prhyme/BOS prhyme/EOS]))))
(take-until
- (best-of-10)
+ (best-of-20)
#(tightly-generate-n-syllable-sentence-rhyming-with
database
markov-trie
@@ -828,7 +828,7 @@
10)
(repeatedly
- 5
+ 2
#(->> (rhyme-from-scheme
'[[A 9] [A 9] [B 5] [B 5] [A 9]]
database
diff --git a/src/com/owoga/prhyme/data/darklyrics.clj b/src/com/owoga/prhyme/data/darklyrics.clj
index f2623a9..5534462 100644
--- a/src/com/owoga/prhyme/data/darklyrics.clj
+++ b/src/com/owoga/prhyme/data/darklyrics.clj
@@ -22,8 +22,7 @@
(io/copy xin xout)
(nippy/thaw (.toByteArray xout) thaw-opts))))
-(def darklyrics-markov-2
- (thaw-from-file (io/resource "dark-corpus-2.bin")))
+
(comment
(def words (map #(vector (hash %) %)
@@ -35,14 +34,6 @@
(def ds "jdbc:sqlite:resources/darklyrics.db")
- (def hashes
- (into
- {}
- (map
- (fn [[k v]]
- [(hash k) k])
- darklyrics-markov-2)))
-
(nippy/freeze-to-file
"resources/dark-corpus-hashes.nip"
hashes)
@@ -68,6 +59,6 @@
(println (+ 2 2))
(keyword "won't")
- (get darklyrics-markov-2 '("hiding" "our"))
- (count darklyrics-markov-2)
+
+
)
diff --git a/src/com/owoga/prhyme/limerick.clj b/src/com/owoga/prhyme/limerick.clj
deleted file mode 100644
index 71ae818..0000000
--- a/src/com/owoga/prhyme/limerick.clj
+++ /dev/null
@@ -1,286 +0,0 @@
-(ns com.owoga.prhyme.limerick
- (:require [com.owoga.prhyme.generation.weighted-selection :as weighted-selection]
- [com.owoga.prhyme.util.math :as math]
- [com.owoga.prhyme.nlp.core :as nlp]
- [clojure.string :as string]
- [com.owoga.phonetics :as phonetics]
- [com.owoga.phonetics.syllabify :as syllabify]
- [com.owoga.prhyme.core :as prhyme]
- [com.owoga.prhyme.util :as util]
- [com.owoga.prhyme.data.dictionary :as dict]
- [com.owoga.trie :as trie]
- [com.owoga.tightly-packed-trie :as tpt]
- [com.owoga.tightly-packed-trie.encoding :as encoding]
- [clojure.java.io :as io]))
-
-(defn rhyme-from-scheme
- "scheme of format [[A 9] [A 9] [B 5] [B 5] [A 9]]"
- [words markov scheme]
- (loop [scheme scheme
- rhymes {}
- result []]
- (cond
- (empty? scheme) result
- :else
- (let [[pattern syllable-count] (first scheme)
- banned-words (into #{} (->> result
- (map #(string/split % #" "))
- (map #(last %))))
- adj (util/comp-rnil
- (weighted-selection/adjust-for-markov
- markov
- 0.99)
- (when (rhymes pattern)
- (weighted-selection/adjust-for-tail-rhyme 0.99)))
- rhyme (if (nil? (get rhymes pattern))
- (gen/gen-sentence-with-syllable-count
- adj
- syllable-count
- words)
- (gen/gen-rhyme-with-syllable-count
- adj
- syllable-count
- (remove #(banned-words (:normalized-word %)) words)
- (prhyme/phrase->word words (get rhymes pattern))))]
- (recur (rest scheme)
- (assoc rhymes pattern rhyme)
- (conj result rhyme))))))
-
-(comment
- (require '[com.owoga.prhyme.data.dictionary :as dict]
- '[com.owoga.prhyme.data.darklyrics :refer [darklyrics-markov-2]]
- '[clojure.java.io :as io])
-
- (rhyme-from-scheme dict/prhyme-dict darklyrics-markov-2 '((A 8) (A 8) (B 5) (B 5) (A 8)))
-
- )
-
-(comment
- ["bishop larch smitten us dwell"
- "solely first week in hell"
- "and take that for three"
- "come wrapped in glory"
- "you ever leave it so well"]
- ["romancing realized too late"
- "my crown revive my withered state"
- "reign is obsolete"
- "i sit in the street"
- "but nobody cares of my fate"]
- ["flesh is hacked to get me sedate"
- "demonstration obsessed with hate"
- "justice will be written in stone"
- "and you will be shown"
- "bedrooms of icons suffocate"]
- ["you will bow to their hungry gods"
- "come will come whatever the odds"
- "now we see the light"
- "you can't put it right"
- "recklessly chopping firing squads"]
- ["untimely they fool their poor life"
- "it wither away with this knife"
- "hate is my virtue"
- "my feelings are well overdue"
- "war we await the afterlife"])
-
-
-;;;; Generating limericks with a markov model
-
-(defn phrase->flex-rhyme-phones
- "Takes a space-seperated string of words
- and returns the concatenation of the words
- vowel phones.
-
- Returns them in reversed order so they
- are ready to be used in a lookup of a rhyme trie.
- "
- [phrase]
- (->> phrase
- (#(string/split % #" "))
- (map (comp syllabify/syllabify first phonetics/get-phones))
- (map (partial reduce into []))
- (map #(filter (partial re-find #"\d") %))
- (flatten)
- (map #(string/replace % #"\d" ""))
- (reverse)))
-
-(defn word->phones [word]
- (or (dict/word->cmu-phones word)
- (util/get-phones-with-stress word)))
-
-(defonce context (atom {}))
-
-(defn decode-fn [db]
- (fn [byte-buffer]
- (let [value (encoding/decode byte-buffer)]
- (if (zero? value)
- nil
- [value (encoding/decode byte-buffer)]))))
-
-(defn initialize []
- (swap!
- context
- assoc
- :database
- (with-open [rdr (clojure.java.io/reader "resources/backwards-database.bin")]
- (into {} (map read-string (line-seq rdr)))))
-
- (swap!
- context
- assoc
- :trie
- (tpt/load-tightly-packed-trie-from-file
- "resources/dark-corpus-backwards-tpt.bin"
- (decode-fn (@context :database))))
-
- (swap!
- context
- assoc
- :perfect-rhyme-trie
- (transduce
- (comp
- (map first)
- (filter string?)
- (map #(vector % (reverse (word->phones %))))
- (map reverse))
- (completing
- (fn [trie [k v]]
- (update trie k (fnil #(update % 1 inc) [v 0]))))
- (trie/make-trie)
- (@context :database)))
-
- (swap!
- context
- assoc
- :rhyme-trie
- (transduce
- (comp
- (map first)
- (filter string?)
- (map #(vector % (reverse (word->phones %))))
- (map reverse))
- (completing
- (fn [trie [k v]]
- (update trie k (fnil #(update % 1 inc) [v 0]))))
- (trie/make-trie)
- (@context :database)))
-
- (swap!
- context
- assoc
- :flex-rhyme-trie
- (transduce
- (comp
- (map (fn [[k v]]
- [(string/join " " (map (@context :database) k))
- [k v]]))
- (map (fn [[phrase [k v]]]
- [(phrase->flex-rhyme-phones phrase)
- [k v]])))
- (completing
- (fn [trie [k v]]
- (update trie k (fnil conj [v]) v)))
- (trie/make-trie)
- (->> (trie/children-at-depth (@context :trie) 0 1))))
- nil)
-
-
-(comment
- (time (initialize))
-
- (println 2)
-
- (take 5 (:flex-rhyme-trie @context))
-
- )
-
-(defn choose-next-word
- "Given an n-gram of [[word1 freq1] [word2 freq2]] chooses
- the next word based on markov data in trie.
-
- Could be improved by taking into account grammar and/or bidirectional context.
-
- The n-gram parameter is a list of trie entries
- For trie entries that are word/frequency pairs, it might look something like this.
- `[[sunshine 38] [ 509]]`
-
- But note that nothing in this function uses the frequency count from the passed in n-gram.
- It's just easier for the calling functions to pass them in like that."
- [{:keys [database trie] :as context} n-gram]
- (let [n-gram-ids (->> n-gram (map first) (map database))
- node (trie/lookup trie n-gram-ids)]
- (cond
- (= 0 (count n-gram-ids))
- (let [children (->> (trie/children trie)
- (map #(get % [])))
- choice (math/weighted-selection second children)]
- [(database (first choice)) (second choice)])
- node
- (let [children (->> (trie/children node)
- (map #(get % []))
- (remove (fn [[id f]] (= id (first n-gram-ids)))))]
- (if (seq children)
- (let [children-freqs (into (sorted-map) (frequencies (map second children)))
- n-minus-1-gram-odds (/ (second (first children-freqs))
- (+ (second (get node []))
- (second (first children-freqs))))
- ;; Good-turing smoothing, take unseen ngram?
- take-n-minus-1-gram? (and (< 1 (count n-gram-ids))
- (< (rand) n-minus-1-gram-odds))]
- (if take-n-minus-1-gram?
- (choose-next-word context (butlast n-gram))
- (let [choice (math/weighted-selection second children)]
- [(database (first choice)) (second choice)])))
- (choose-next-word context (butlast n-gram))))
- :else
- (choose-next-word context (butlast n-gram)))))
-
-(defn valid-sentence? [phrase]
- (->> phrase
- (map first)
- (string/join " ")
- (#(string/replace % #"(|)" ""))
- (nlp/valid-sentence?)))
-
-(defn generate-sentence-backwards
- "Given a phrase of [w1 w2 w3] generates a sentence
- using a backwards markov."
- ([{:keys [database trie] :as context} phrase]
- (let [phrase (map (fn [w]
- (let [id (database w)]
- [w (second (get trie [id]))]))
- phrase)]
- (loop [phrase' (loop [phrase phrase]
- (if (= "" (first (first phrase)))
- phrase
- (recur (cons (choose-next-word context (take 3 phrase))
- phrase))))]
- (if (valid-sentence? phrase')
- phrase'
- (recur (loop [phrase phrase]
- (if (= "" (first (first phrase)))
- phrase
- (recur (cons (choose-next-word context (take 3 phrase))
- phrase)))))))))
- )
-
-(comment
- (take 5 (:database @context))
-
- (map (:database @context) ["me" "bother"])
- (map (:database @context) ["bother me"])
- (first
- (filter
- valid-sentence?
- (repeatedly
- (fn []
- (generate-sentence-backwards
- @context
- ["bother" "me" ""])))))
-
- (keys @context)
- (time (initialize))
- )
-
-(defn rhyme-from-scheme-2
- "Generate rhyme without the use of `weighted-selection/adjust-for-markov`."
- [])