diff --git a/dev/examples/tpt.clj b/dev/examples/tpt.clj
index 6f9161c..bd382a6 100644
--- a/dev/examples/tpt.clj
+++ b/dev/examples/tpt.clj
@@ -9,7 +9,8 @@
[com.owoga.prhyme.util :as util]
[com.owoga.prhyme.data.dictionary :as dict]
[clojure.zip :as zip]
- [cljol.dig9 :as d]))
+ [cljol.dig9 :as d]
+ [com.owoga.prhyme.data.phonetics :as phonetics]))
(tufte/add-basic-println-handler! {})
@@ -324,6 +325,12 @@
{ks (assoc v :value (get db id))}))
+(defn clone-consonants [phones]
+ (map
+ #(if (phonetics/vowel (string/replace % #"\d" ""))
+ %
+ "?")
+ phones))
(defn word->phones [word]
(or (dict/word->cmu-phones word)
@@ -336,70 +343,231 @@
(reverse phones)))]
(trie/lookup rhyme-trie rhyme-suffix)))
+(defn vowel-rhymes [rhyme-trie phones]
+ (let [rhyme-suffix (->> (reverse phones)
+ (clone-consonants)
+ (util/take-through #(= (last %) \1))
+ (first))]
+ (trie/lookup rhyme-trie rhyme-suffix)))
+
(defn n+1grams [trie k]
(->> (trie/lookup trie k)
(trie/children)
(map #(get % []))))
+(defn word->n+1grams [trie database word]
+ (->> word
+ database
+ (#(trie/lookup trie [%]))
+ trie/children
+ (map #(get % []))
+ (map (fn [[id fr]] [(database id) fr]))
+ (sort-by (comp - #(nth % 1)))
+ (remove #({"" ""} (nth % 0)))))
+
(comment
- ;; Bigrams of rhyme
- (->> (perfect-rhymes rhyme-trie (or (dict/cmu-with-stress-map "pleasing")
- (util/get-phones-with-stress "pleasing")))
+ (let [trie (@context :trie)
+ db (@context :database)]
+ (word->n+1grams trie db "technology"))
+
+ )
+
+(defonce context (atom {}))
+
+(defn initialize []
+ (swap!
+ context
+ assoc
+ :database
+ (with-open [rdr (clojure.java.io/reader "resources/backwards-database.bin")]
+ (into {} (map read-string (line-seq rdr)))))
+ (swap!
+ context
+ assoc
+ :trie
+ (tpt/load-tightly-packed-trie-from-file
+ "resources/dark-corpus-backwards-tpt.bin"
+ (decode-fn (@context :database))))
+ (swap!
+ context
+ assoc
+ :perfect-rhyme-trie
+ (transduce
+ (comp
+ (map first)
+ (filter string?)
+ (map #(vector % (reverse (word->phones %))))
+ (map reverse))
+ (completing
+ (fn [trie [k v]]
+ (update trie k (fnil #(update % 1 inc) [v 0]))))
+ (trie/make-trie)
+ (@context :database)))
+ (swap!
+ context
+ assoc
+ :vowel-rhyme-trie
+ (transduce
+ (comp
+ (map first)
+ (filter string?)
+ (map #(vector % (reverse (word->phones %))))
+ (map reverse)
+ (map (fn [[phones v]]
+ [(map #(if (phonetics/vowel
+ (string/replace % #"\d" ""))
+ %
+ "?")
+ phones)
+ v])))
+ (completing
+ (fn [trie [k v]]
+ (update trie k (fnil #(update % 1 inc) [v 0]))))
+ (trie/make-trie)
+ (@context :database)))
+ nil)
+
+(defn find-rhymes [trie word]
+ (->> (perfect-rhymes trie (or (dict/cmu-with-stress-map word)
+ (util/get-phones-with-stress word)))
(map (comp first second))
(remove nil?)
- (map @trie-database)
- (map #(vector [%] (n+1grams
- tightly-packed-backwards-trie
- [%])))
- (map (fn [[w1 w2s]]
- (mapv #(into w1 [(nth % 0)]) w2s)))
- (reduce into [])
- (map (fn [k]
- (let [children (->> (n+1grams tightly-packed-backwards-trie k)
- (mapv first))]
- (mapv #(into k [%]) children))))
- (reduce into [])
- (map #(map @trie-database %)))
+ (map (@context :database))
+ (map #(get (@context :trie) [%]))
+ (sort-by #(nth % 1))
+ (reverse)
+ (map
+ (fn [[word-id freq]]
+ [((@context :database) word-id)
+ freq]))
+ (remove #(= word (first %)))))
+
+(defn choose-next-word [{:keys [database trie] :as context} n-gram]
+ (let [n-gram-ids (->> n-gram (map first) (map database))
+ node (trie/lookup trie n-gram-ids)]
+ (cond
+ (= 0 (count n-gram-ids))
+ (let [children (map #(get % []) (trie/children trie))
+ choice (math/weighted-selection second children)]
+ [(database (first choice)) (second choice)])
+ node
+ (let [children (->> node (trie/children) (map #(get % [])))]
+ (if (seq children)
+ (let [children-freqs (into (sorted-map) (frequencies (map second children)))
+ n-minus-1-gram-odds (/ (second (first children-freqs))
+ (+ (second (get node []))
+ (second (first children-freqs))))
+ take-n-minus-1-gram? (and (< 1 (count n-gram-ids))
+ (< (rand) n-minus-1-gram-odds))]
+ (if take-n-minus-1-gram?
+ (choose-next-word context (butlast n-gram))
+ (let [choice (math/weighted-selection second children)]
+ [(database (first choice)) (second choice)])))
+ (choose-next-word context (butlast n-gram))))
+ :else
+ (choose-next-word context (butlast n-gram)))))
+
+(defn generate-phrase [{:keys [database trie] :as context} phrase]
+ (loop [phrase phrase]
+ (if (< 10 (count phrase))
+ phrase
+ (recur (cons (choose-next-word context (take 3 phrase))
+ phrase)))))
+
+(defn generate-rhyme [{:keys [perfect-rhyme-trie] :as context} phrase]
+ (let [phrase1 (generate-phrase context phrase)
+ rhyme (first (find-rhymes perfect-rhyme-trie (first (last phrase1))))
+ phrase2 (generate-phrase context (list (last phrase1)))]
+ [phrase1 phrase2]))
- )
(comment
+ (initialize)
+
+ (generate-phrase @context '(["bitter" 41]))
+ (generate-rhyme @context '(["bitter" 41]))
+
+ (choose-next-word @context (take 3 [["theology" 41]]))
+
+ (choose-next-word @context [["and" 5] ["theology" 41]])
+
+ (find-rhymes (@context :perfect-rhyme-trie) "theology")
+
+ (trie/chil(trie/lookup (@context :trie) '(57 2477)))
+ (take 5 (@context :trie))
+
+
+ (->> (find-rhymes (@context :perfect-rhyme-trie) "technology")
+ (map (fn [[word frq]]
+ (let [n+1grams (word->n+1grams
+ (@context :trie)
+ (@context :database)
+ word)]
+ (map vector n+1grams (repeat [word frq])))))
+ (reduce into []))
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
(do
- (time
- (def backwards-trie
- (transduce (comp (xf-file-seq 0 250000)
- (map slurp)
- (map (partial n-to-m-backwards-grams 1 4))
- (map (fn [ngrams] (map #(prep-ngram-for-trie %) ngrams)))
- stateful-transducer)
- conj
- (file-seq (io/file "dark-corpus")))))
- (time
- (def tightly-packed-backwards-trie
- (tpt/tightly-packed-trie
- backwards-trie
- encode-fn
- (decode-fn @trie-database))))
- (tpt/save-tightly-packed-trie-to-file
- "resources/dark-corpus-backwards-tpt.bin"
- tightly-packed-backwards-trie)
- (with-open [wtr (clojure.java.io/writer "resources/backwards-database.bin")]
- (let [lines (->> (seq @trie-database)
- (map pr-str)
- (map #(str % "\n")))]
- (doseq [line lines]
- (.write wtr line))))
+ #_(time
+ (def backwards-trie
+ (transduce (comp (xf-file-seq 0 250000)
+ (map slurp)
+ (map (partial n-to-m-backwards-grams 1 4))
+ (map (fn [ngrams] (map #(prep-ngram-for-trie %) ngrams)))
+ stateful-transducer)
+ conj
+ (file-seq (io/file "dark-corpus")))))
+
+ #_(time
+ (def tightly-packed-backwards-trie
+ (tpt/tightly-packed-trie
+ backwards-trie
+ encode-fn
+ (decode-fn @trie-database))))
+
+ #_(tpt/save-tightly-packed-trie-to-file
+ "resources/dark-corpus-backwards-tpt.bin"
+ tightly-packed-backwards-trie)
+ #_(with-open [wtr (clojure.java.io/writer "resources/backwards-database.bin")]
+ (let [lines (->> (seq @trie-database)
+ (map pr-str)
+ (map #(str % "\n")))]
+ (doseq [line lines]
+ (.write wtr line))))
+
(def loaded-backwards-trie
(tpt/load-tightly-packed-trie-from-file
"resources/dark-corpus-backwards-tpt.bin"
(decode-fn @trie-database)))
- (def loaded-database
- (atom (with-open [rdr (clojure.java.io/reader "resources/backwards-database.bin")]
- (into {} (map read-string (line-seq rdr))))))
- (->> (take 20 loaded-backwards-trie)
- (map first)
- (map (partial map @loaded-database)))
+
+ (def loaded-backwards-database
+ (with-open [rdr (clojure.java.io/reader "resources/backwards-database.bin")]
+ (into {} (map read-string (line-seq rdr)))))
+
(def rhyme-database (atom {}))
- (def rhyme-trie
+
+ (def perfect-rhyme-trie
(transduce
(comp
(map first)
@@ -410,67 +578,35 @@
(fn [trie [k v]]
(update trie k (fnil #(update % 1 inc) [v 0]))))
(trie/make-trie)
- @loaded-database))
-
- (trie/lookup rhyme-trie '("IY0" "JH"))
+ @loaded-backwards-database))
+ (def vowel-rhyme-trie
+ (transduce
+ (comp
+ (map first)
+ (filter string?)
+ (map #(vector % (reverse (word->phones %))))
+ (map reverse)
+ (map (fn [[phones v]]
+ [(map #(if (phonetics/vowel
+ (string/replace % #"\d" ""))
+ %
+ "?")
+ phones)
+ v])))
+ (completing
+ (fn [trie [k v]]
+ (update trie k (fnil #(update % 1 inc) [v 0]))))
+ (trie/make-trie)
+ @loaded-backwards-database))
)
- (tpt/save-tightly-packed-trie-to-file "dark-corpus-tpt.bin" tightly-packed-trie)
-
- (def loaded-tightly-packed-trie (tpt/load-tightly-packed-trie-from-file
- "dark-corpus-tpt.bin"
- (decode-fn @trie-database)))
-
- [(first tightly-packed-trie)
- (first loaded-tightly-packed-trie)]
-
- (take-last 10 (.array (.byte-buffer loaded-tightly-packed-trie)))
- ;; => (-127 -124 -42 -23 28 -127 -124 -41 -90 9)
- ;; => (0 0 0 0 0 37 0 6 -124 -56 -128 -121 1 -17 -128 -118 -117 -128 -115 2)
-
- (take-last 10 (.array (.byte-buffer tightly-packed-trie)))
- ;; => (-127 -124 -42 -23 28 -127 -124 -41 -90 9)
- ;; => (0 0 0 0 0 37 0 6 -124 -56 -128 -121 1 -17 -128 -118 -117 -128 -115 2)
- (.byte-buffer loaded-tightly-packed-trie)
- ;; => #object[java.nio.HeapByteBuffer 0x21b8291a "java.nio.HeapByteBuffer[pos=8 lim=2548630 cap=2548630]"]
-
- (.byte-buffer tightly-packed-trie)
- ;; => #object[java.nio.HeapByteBuffer 0x7dc15357 "java.nio.HeapByteBuffer[pos=8 lim=2548630 cap=2548630]"]
-
- [(.key loaded-tightly-packed-trie)
- (.address loaded-tightly-packed-trie)
- (.limit loaded-tightly-packed-trie)]
- ;; => [0 2424838 2548630]
-
- [(.key tightly-packed-trie)
- (.address tightly-packed-trie)
- (.limit tightly-packed-trie)]
- ;; => [0 2424838 2548630]
-
- (->> (trie/lookup tightly-packed-trie [1])
- (trie/children)
- (map #(get % []))
- (remove nil?)
- (math/weighted-selection #(nth % 1))
- first
- (@trie-database))
-
- (with-open [wtr (clojure.java.io/writer "database.bin")]
- (let [lines (->> (seq @trie-database)
- (map pr-str)
- (map #(str % "\n")))]
- (doseq [line lines]
- (.write wtr line))))
-
- (def trie-database
- (atom (with-open [rdr (clojure.java.io/reader "database.bin")]
- (into {} (map read-string (line-seq rdr))))))
-
- (->> loaded-tightly-packed-trie
- (take 20)
- (map first)
- (map (partial map @trie-database)))
+ #_(with-open [wtr (clojure.java.io/writer "database.bin")]
+ (let [lines (->> (seq @trie-database)
+ (map pr-str)
+ (map #(str % "\n")))]
+ (doseq [line lines]
+ (.write wtr line))))
(profile
{}
@@ -522,3 +658,74 @@
;; => {("" "" "the") {:value ("" "" "the"), :count 462}}
+(comment
+ (->> (perfect-rhymes perfect-rhyme-trie
+ (or (dict/cmu-with-stress-map "technology")
+ (util/get-phones-with-stress "technology")))
+ (map (comp first second))
+ (remove nil?)
+ #_#_#_#_(map @loaded-backwards-database)
+ (map #(vector [%] (n+1grams
+ loaded-backwards-trie
+ [%])))
+ (map (fn [[w1 w2s]]
+ (mapv #(into w1 [(nth % 0)]) w2s)))
+ (take 10))
+
+ (->> (perfect-rhymes perfect-rhyme-trie
+ (or (dict/cmu-with-stress-map "technology")
+ (util/get-phones-with-stress "technology")))
+ (map (comp first second))
+ (remove nil?)
+ (map @loaded-backwards-database)
+ (map #(vector [%] (n+1grams
+ loaded-backwards-trie
+ [%])))
+ (map (fn [[w1 w2s]]
+ (mapv #(into w1 [(nth % 0)]) w2s)))
+ (reduce into [])
+ (map (fn [k]
+ (let [children (->> (n+1grams loaded-backwards-trie k)
+ (mapv first))]
+ (mapv #(into k [%]) children))))
+ (reduce into [])
+ #_#_#_#_(map #(map @loaded-backwards-database %))
+ (filter (partial every? dict/english?))
+ (take 100)
+ (map reverse))
+
+
+ (util/get-phones-with-stress "you") ;; => ("B" "AA1" "DH" "ER" "M")
+ (def phones (or (dict/cmu-with-stress-map "sandman")
+ (util/get-phones-with-stress "sandman")))
+
+ (take 20 vowel-rhyme-trie)
+ (->> (vowel-rhymes vowel-rhyme-trie phones)
+ (map (comp first second))
+ (remove nil?)
+ (take 20))
+
+ ;; Bigrams of rhyme
+ (->> (perfect-rhymes perfect-rhyme-trie
+ (or (dict/cmu-with-stress-map "technology")
+ (util/get-phones-with-stress "technology")))
+ (map (comp first second))
+ (remove nil?)
+ (map @loaded-backwards-database)
+ (map #(vector [%] (n+1grams
+ loaded-backwards-trie
+ [%])))
+ (map (fn [[w1 w2s]]
+ (mapv #(into w1 [(nth % 0)]) w2s)))
+ (reduce into [])
+ (map (fn [k]
+ (let [children (->> (n+1grams loaded-backwards-trie k)
+ (mapv first))]
+ (mapv #(into k [%]) children))))
+ (reduce into [])
+ (map #(map @loaded-backwards-database %))
+ (filter (partial every? dict/english?))
+ (take 100)
+ (map reverse))
+
+ )
diff --git a/src/com/owoga/prhyme/data/tpt.clj b/src/com/owoga/prhyme/data/tpt.clj
index 8ddca01..98d78e3 100644
--- a/src/com/owoga/prhyme/data/tpt.clj
+++ b/src/com/owoga/prhyme/data/tpt.clj
@@ -1,7 +1,13 @@
(ns com.owoga.prhyme.data.tpt
(:require [com.owoga.tightly-packed-trie :as tpt]
[com.owoga.trie :as trie]
+ [com.owoga.prhyme.util :as util]
[clojure.string :as string]
[clojure.java.io :as io])
(:import (java.nio ByteBuffer)
(java.lang.reflect Array)))
+
+(comment
+ (util/get-phones-with-stress "It's phil's coffee")
+
+ )