diff --git a/.gitattributes b/.gitattributes
index 7d75ef1..0b1d71a 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -13,3 +13,5 @@ resources/structure-freqs filter=lfs diff=lfs merge=lfs -text
research filter=lfs diff=lfs merge=lfs -text
resources/dark-corpus-tpt.bin filter=lfs diff=lfs merge=lfs -text
resources/database.bin filter=lfs diff=lfs merge=lfs -text
+resources/backwards-database.bin filter=lfs diff=lfs merge=lfs -text
+resources/dark-corpus-backwards-tpt.bin filter=lfs diff=lfs merge=lfs -text
diff --git a/dev/examples/tpt.clj b/dev/examples/tpt.clj
index f711e3f..6f9161c 100644
--- a/dev/examples/tpt.clj
+++ b/dev/examples/tpt.clj
@@ -60,6 +60,23 @@
(mapv #(mapv vec %))
(reduce #(into %1 %2) [])))
+(defn text->backwards-ngrams
+ "Takes text from a file, including newlines.
+ Pads lines with and for start/end of line.
+ Pads beginning with n - 1 s"
+ [text n]
+ (->> text
+ util/clean-text
+ (#(string/split % #"\n+"))
+ (remove empty?)
+ (mapv tokenize-line)
+ (mapv #(pad-tokens % n))
+ reverse
+ (mapv reverse)
+ (mapv #(partition n 1 %))
+ (mapv #(mapv vec %))
+ (reduce #(into %1 %2) [])))
+
(defn n-to-m-grams
"Exclusive of m, similar to range."
[n m text]
@@ -72,6 +89,18 @@
(recur (inc i)
(into r (text->ngrams text i))))))
+(defn n-to-m-backwards-grams
+ "Exclusive of m, similar to range."
+ [n m text]
+ (loop [i n
+ r []]
+ (cond
+ (= i m)
+ r
+ :else
+ (recur (inc i)
+ (into r (text->backwards-ngrams text i))))))
+
(declare ->TrieKey)
(deftype TrieKey [key]
@@ -188,6 +217,46 @@
(sort-by :count)
reverse))
+(defn rhyme-trie-transducer [xf]
+ (let [trie (volatile! (trie/make-trie))
+ database (atom {})
+ next-id (volatile! 1)]
+ (fn
+ ([] (xf))
+ ([result]
+ (reset! trie-database @database)
+ (xf result))
+ ([result input]
+ (let [ngrams-ids
+ (mapv
+ (fn [ngrams]
+ (mapv
+ (fn [ngram]
+ (let [gram-ids (mapv
+ (fn [gram]
+ (let [gram-id (get @database gram @next-id)]
+ (when (.equals gram-id @next-id)
+ (swap! database
+ #(-> %
+ (assoc gram gram-id)
+ (assoc gram-id gram)))
+ (vswap! next-id inc))
+ gram-id))
+ ngram)
+ ngram-id (get database gram-ids @next-id)]
+ gram-ids))
+ ngrams))
+ input)]
+ (vswap!
+ trie
+ (fn [trie ngrams-ids]
+ (reduce
+ (fn [trie [ngram-ids _]]
+ (update trie ngram-ids (fnil #(update % 1 inc) [(peek ngram-ids) 0])))
+ trie
+ ngrams-ids))
+ ngrams-ids))))))
+
(comment
(time
(def trie
@@ -199,7 +268,16 @@
conj
(file-seq (io/file "dark-corpus")))))
- (take 20 trie)
+ (time
+ (def backwards-trie
+ (transduce (comp (xf-file-seq 0 1000)
+ (map slurp)
+ (map (partial n-to-m-backwards-grams 1 4))
+ (map (fn [ngrams] (map #(prep-ngram-for-trie %) ngrams)))
+ stateful-transducer)
+ conj
+ (file-seq (io/file "dark-corpus")))))
+
)
(defn encode-fn [v]
@@ -225,6 +303,13 @@
encode-fn
(decode-fn @trie-database))))
+ (time
+ (def tightly-packed-backwards-trie
+ (tpt/tightly-packed-trie
+ backwards-trie
+ encode-fn
+ (decode-fn @trie-database))))
+
)
(defn key-get-in-tpt [tpt db ks]
@@ -240,7 +325,97 @@
+(defn word->phones [word]
+ (or (dict/word->cmu-phones word)
+ (util/get-phones-with-stress word)))
+
+(defn perfect-rhymes [rhyme-trie phones]
+ (let [rhyme-suffix (first
+ (util/take-through
+ #(= (last %) \1)
+ (reverse phones)))]
+ (trie/lookup rhyme-trie rhyme-suffix)))
+
+(defn n+1grams [trie k]
+ (->> (trie/lookup trie k)
+ (trie/children)
+ (map #(get % []))))
+
+(comment
+ ;; Bigrams of rhyme
+ (->> (perfect-rhymes rhyme-trie (or (dict/cmu-with-stress-map "pleasing")
+ (util/get-phones-with-stress "pleasing")))
+ (map (comp first second))
+ (remove nil?)
+ (map @trie-database)
+ (map #(vector [%] (n+1grams
+ tightly-packed-backwards-trie
+ [%])))
+ (map (fn [[w1 w2s]]
+ (mapv #(into w1 [(nth % 0)]) w2s)))
+ (reduce into [])
+ (map (fn [k]
+ (let [children (->> (n+1grams tightly-packed-backwards-trie k)
+ (mapv first))]
+ (mapv #(into k [%]) children))))
+ (reduce into [])
+ (map #(map @trie-database %)))
+
+ )
(comment
+ (do
+ (time
+ (def backwards-trie
+ (transduce (comp (xf-file-seq 0 250000)
+ (map slurp)
+ (map (partial n-to-m-backwards-grams 1 4))
+ (map (fn [ngrams] (map #(prep-ngram-for-trie %) ngrams)))
+ stateful-transducer)
+ conj
+ (file-seq (io/file "dark-corpus")))))
+ (time
+ (def tightly-packed-backwards-trie
+ (tpt/tightly-packed-trie
+ backwards-trie
+ encode-fn
+ (decode-fn @trie-database))))
+ (tpt/save-tightly-packed-trie-to-file
+ "resources/dark-corpus-backwards-tpt.bin"
+ tightly-packed-backwards-trie)
+ (with-open [wtr (clojure.java.io/writer "resources/backwards-database.bin")]
+ (let [lines (->> (seq @trie-database)
+ (map pr-str)
+ (map #(str % "\n")))]
+ (doseq [line lines]
+ (.write wtr line))))
+ (def loaded-backwards-trie
+ (tpt/load-tightly-packed-trie-from-file
+ "resources/dark-corpus-backwards-tpt.bin"
+ (decode-fn @trie-database)))
+ (def loaded-database
+ (atom (with-open [rdr (clojure.java.io/reader "resources/backwards-database.bin")]
+ (into {} (map read-string (line-seq rdr))))))
+ (->> (take 20 loaded-backwards-trie)
+ (map first)
+ (map (partial map @loaded-database)))
+ (def rhyme-database (atom {}))
+ (def rhyme-trie
+ (transduce
+ (comp
+ (map first)
+ (filter string?)
+ (map #(vector % (reverse (word->phones %))))
+ (map reverse))
+ (completing
+ (fn [trie [k v]]
+ (update trie k (fnil #(update % 1 inc) [v 0]))))
+ (trie/make-trie)
+ @loaded-database))
+
+ (trie/lookup rhyme-trie '("IY0" "JH"))
+
+ )
+
(tpt/save-tightly-packed-trie-to-file "dark-corpus-tpt.bin" tightly-packed-trie)
(def loaded-tightly-packed-trie (tpt/load-tightly-packed-trie-from-file
@@ -292,6 +467,11 @@
(atom (with-open [rdr (clojure.java.io/reader "database.bin")]
(into {} (map read-string (line-seq rdr))))))
+ (->> loaded-tightly-packed-trie
+ (take 20)
+ (map first)
+ (map (partial map @trie-database)))
+
(profile
{}
(def example-story
diff --git a/resources/backwards-database.bin b/resources/backwards-database.bin
new file mode 100644
index 0000000..c2c1c3d
--- /dev/null
+++ b/resources/backwards-database.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5b5fb5f328f0c7cbac924c54943aa3dc05b42fe0388b2a1dbf752f4fb89bc743
+size 17713048
diff --git a/resources/dark-corpus-backwards-tpt.bin b/resources/dark-corpus-backwards-tpt.bin
new file mode 100644
index 0000000..7aad424
--- /dev/null
+++ b/resources/dark-corpus-backwards-tpt.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:13c4b4d690a955a052e06b7abbd55776f82b8f57ad0c0b309d59d629372dbab7
+size 109432963
diff --git a/src/com/owoga/prhyme/corpus/db.clj b/src/com/owoga/prhyme/corpus/db.clj
deleted file mode 100644
index 26ff7a9..0000000
--- a/src/com/owoga/prhyme/corpus/db.clj
+++ /dev/null
@@ -1,6 +0,0 @@
-(ns com.owoga.prhyme.corpus.db
- (:require [integrant.core :as ig]))
-
-(defn tokens->db
- [tokens]
- )
diff --git a/src/com/owoga/prhyme/data/dictionary.clj b/src/com/owoga/prhyme/data/dictionary.clj
index 4728255..c8b714b 100644
--- a/src/com/owoga/prhyme/data/dictionary.clj
+++ b/src/com/owoga/prhyme/data/dictionary.clj
@@ -12,6 +12,18 @@
(drop-while #(= \; (first %)))
(map #(string/split % #"\s+"))))
+(def cmu-with-stress-map
+ (->> cmu-with-stress
+ (map (partial split-at 1))
+ (map #(vector
+ (string/lower-case
+ (first (first %)))
+ (second %)))
+ (into {})))
+
+(defn word->cmu-phones [word]
+ (get cmu-with-stress-map word))
+
(def cmu-dict
(->> (io/reader (io/resource "cmudict_SPHINX_40"))
(line-seq)
@@ -27,6 +39,15 @@
:else (recur (rest words)
(update accum key (fnil conj []) (rest word)))))))
+(comment
+ (take 5 (seq spelling->phonemes))
+ ;; => (["daphnis" [("D" "AE" "F" "N" "AH" "S")]]
+ ;; ["dammam" [("D" "AE" "M" "AH" "M")]]
+ ;; ["kirschenmann" [("K" "ER" "SH" "AH" "N" "M" "AH" "N")]]
+ ;; ["baumgart" [("B" "AW" "M" "G" "AA" "R" "T") ("B" "AA" "M" "G" "AA" "R" "T")]]
+ ;; ["probasco" [("P" "R" "OW" "B" "AA" "S" "K" "OW")]])
+ )
+
(def phrase->Word
(into
{}
@@ -36,6 +57,15 @@
phonemes])
cmu-dict)))
+(comment
+ (take 5 (seq phrase->Word))
+ ;; => (["daphnis" ("D" "AE" "F" "N" "AH" "S")]
+ ;; ["dammam" ("D" "AE" "M" "AH" "M")]
+ ;; ["kirschenmann" ("K" "ER" "SH" "AH" "N" "M" "AH" "N")]
+ ;; ["baumgart" ("B" "AW" "M" "G" "AA" "R" "T")]
+ ;; ["probasco" ("P" "R" "OW" "B" "AA" "S" "K" "OW")])
+ )
+
(def prhyme-dict
(into [] (map prhyme/cmu->prhyme cmu-dict)))
diff --git a/src/com/owoga/prhyme/data/phonetics.clj b/src/com/owoga/prhyme/data/phonetics.clj
index a014bc1..c01773e 100644
--- a/src/com/owoga/prhyme/data/phonetics.clj
+++ b/src/com/owoga/prhyme/data/phonetics.clj
@@ -9,6 +9,11 @@
(map #(string/split % #"\t"))
(into {})))
+(comment
+ (take 5 (seq phonemap))
+ ;; => (["T" "stop"] ["CH" "affricate"] ["K" "stop"] ["HH" "aspirate"] ["UH" "vowel"])
+ )
+
(def long-vowel #{"EY" "IY" "AY" "OW" "UW"})
(def short-vowel #{"AA" "AE" "AH" "AO" "AW" "EH" "ER" "IH" "OY" "UH"})
diff --git a/src/com/owoga/prhyme/data/tpt.clj b/src/com/owoga/prhyme/data/tpt.clj
index cc19591..8ddca01 100644
--- a/src/com/owoga/prhyme/data/tpt.clj
+++ b/src/com/owoga/prhyme/data/tpt.clj
@@ -1,295 +1,7 @@
(ns com.owoga.prhyme.data.tpt
- (:require [clojure.string :as string]
+ (:require [com.owoga.tightly-packed-trie :as tpt]
+ [com.owoga.trie :as trie]
+ [clojure.string :as string]
[clojure.java.io :as io])
(:import (java.nio ByteBuffer)
(java.lang.reflect Array)))
-
-(defn ubyte [b]
- (bit-and 0xff b))
-
-(defn sbyte [b]
- (bit-or -0x100 b))
-
-(defn nth-bit [n b]
- (let [mask (bit-shift-left 1 n)]
- (bit-and
- 1
- (bit-shift-right
- (bit-and mask b)
- n))))
-(reduce (fn [a _] (bit-or 1 (bit-shift-left a 1))) 0 (range 2))
-
-(defn ones-mask [n]
- (reduce (fn [a _] (bit-or 1 (bit-shift-left a 1))) 0 (range n)))
-
-(defn bit-slice
- "Start is least-significant bit.
- (bit-slice 2 6 10101010)
- -> ,1010,
- "
- [start end b]
- (let [mask (bit-shift-left (ones-mask (- end start)) start)]
- (bit-shift-right (bit-and b mask) start)))
-
-(defn as-binary-string [b]
- (string/replace
- (format "%8s" (Integer/toBinaryString b))
- #" "
- "0"))
-
-(defn bits [binary-string]
- (Integer/parseInt binary-string 2))
-
-(defn bit-on [i b]
- (bit-or b (int (Math/pow 2 i))))
-
-;; I've got a dictionary of ~120,000 words.
-;; I need to reference them in a compact way.
-;; It will be a little bit of a waste, but we can use a 32 bit index.
-
-(/ (Math/log 1.2e5) (Math/log 2))
-
-
-
-(defn vb-encode [n]
- (loop [n n
- b '()]
- (cond
- (< n 128) (let [ba (byte-array (cons n b))
- i (dec (count ba))]
- (aset-byte ba i (sbyte (+ 128 (aget ba i))))
- ba)
- :else (recur (quot n 128) (cons (mod n 128) b)))))
-
-(defn vb-decode-1 [ba]
- (loop [n 0
- i 0]
- (let [b (aget ba i)]
- (cond
- (> (bit-and b 0x80) 0)
- [(bit-or (bit-and 0x7f b)
- (bit-shift-left n 7))
- (inc i)]
- :else
- (recur (bit-or (bit-and 0x7f b)
- (bit-shift-left n 7))
- (inc i))))))
-
-(defn vb-decode
- ([ba]
- (vb-decode ba 0))
- ([ba i]
- (if (>= i (count ba))
- (cons (first (vb-decode-1 ba))
- nil)
- (let [[value byte-count] (vb-decode-1 ba)]
- (lazy-seq
- (cons
- value
- (vb-decode (byte-array (drop byte-count ba))
- (+ i byte-count))))))))
-
-(comment
- (let [n1 0
- n2 1
- n3 127
- n4 128
- n5 257
- n6 9876543210
- baos (java.io.ByteArrayOutputStream.)]
- (->> [n1 n2 n3 n4 n5 n6]
- (map vb-encode)
- (run! #(.writeBytes baos %)))
- (let [ba (.toByteArray baos)]
- (vb-decode ba)))
- ;; => ([0 1] [1 1] [127 1] [128 2] [257 2] [9876543210 5])
- )
-
-(defn combine-significant-bits [num-significant-bits & bytes]
- (reduce
- (fn [a b]
- (bit-or b (bit-shift-left a num-significant-bits)))
- bytes))
-
-(comment
- (let [b1 (bits "0110110")
- b2 (bits "1001001")
- ;; remove 2 flag bits
- slice (partial bit-slice 0 6)
- b1' (slice b1)
- b2' (slice b2)]
- (map
- as-binary-string
- [b1
- b2
- b1'
- b2'
- (combine-significant-bits 6 b1' b2' )]))
- ;; => ("00110110" "01001001" "00110110" "00001001" "110110001001")
- )
-
-(defn byte-buffer-variable-length-decode
- [bb]
- (let [combine (fn [n b]
- (bit-or (bit-and 0x7f b)
- (bit-shift-left n 7)))]
- (loop [n 0 i 0]
- (let [b (.get bb)]
- (if (zero? (bit-and b 0x80))
- (recur
- (combine n b)
- (inc i))
- (combine n b))))))
-
-(comment
- (let [bb (java.nio.ByteBuffer/wrap (vb-encode 9876543210))]
- (println (.limit bb))
- (byte-buffer-variable-length-decode bb))
-
- )
-
-(defn encode-with-flag-bits
- "Flag is a binary string.
- Returns byte array."
- [flag n]
- (let [flag-len (count flag)
- data-len (- 8 flag-len)
- max-data-val (int (Math/pow 2 data-len))
- flag-val (bit-shift-left (bits flag) data-len)]
- (loop [n n r '()]
- (if (< n max-data-val)
- (byte-array (cons (bit-or flag-val n) r))
- (recur
- (quot n max-data-val)
- (cons
- (bit-or flag-val (mod n max-data-val))
- r))))))
-
-(comment
- (let [ns [0 1 127 128 257 9876543210]
- encoded (map
- (partial encode-with-flag-bits "101")
- ns)
- as-binary (->> encoded
- (map (partial map as-binary-string))
- (map (partial map (partial take-last 8)))
- (map (partial map (partial apply str))))]
- as-binary)
- ;; => (("10100000")
- ;; ("10100001")
- ;; ("10100011" "10111111")
- ;; ("10100100" "10100000")
- ;; ("10101000" "10100001")
- ;; ("10101001" "10100110" "10101011" "10100000" "10100101" "10110111" "10101010"))
- )
-
-(defn decode-with-flag-bits
- "Flag is only used for length. It could be refactored to take an int.
- Or it could be refactored to take a function that does work based on the flags.
- For now though, flag bits must be checked external to this. This is just parity
- decoding of the encoding above.
- Returns [decoded-number number-of-bytes-read]."
- [flag ba]
- (let [flag-len (count flag)
- data-len (- 8 flag-len)
- mask-val (int (dec (Math/pow 2 data-len)))]
- (loop [ba ba r 0 i 0]
- (if (empty? ba)
- [r i]
- (let [data-val (bit-and mask-val (first ba))]
- (recur
- (rest ba)
- (bit-or
- (bit-shift-left r data-len)
- data-val)
- (inc i)))))))
-
-(comment
- (let [ns [0 1 127 128 257 9876543210]
- encoded (map
- (partial encode-with-flag-bits "101")
- ns)
- decoded (->> encoded
- (map (partial decode-with-flag-bits "101")))]
- decoded)
- ;; => ([0 1] [1 1] [127 2] [128 2] [257 2] [9876543210 7])
- (let [ns [0 1 127 128 257 9876543210]
- encoded (map
- (partial encode-with-flag-bits "001")
- ns)
- decoded (->> encoded
- (map (partial decode-with-flag-bits "101")))]
- decoded)
- ;; => ([0 1] [1 1] [127 2] [128 2] [257 2] [9876543210 7])
- )
-
-(defn encode-byte-with-n-flag-bits [n b]
- (let [max-size-with-flag (int (Math/pow 2 (- 8 n)))]
- (loop [b b r '()]
- (if (< b max-size-with-flag)
- (byte-array (cons b r))
- (recur (quot b max-size-with-flag)
- (cons (mod b max-size-with-flag) r))))))
-
-(defn decode-byte-with-n-flag-bits [n ba]
- (let [max-size-with-flag (int (Math/pow 2 (- 8 n)))]
- (loop [ba ba r 0]
- (if (nil? ba)
- r
- (recur (rest ba)
- (+ r ))))))
-
-(def dictionary ["hi" "my" "name" "is" "what"])
-
-(defn slurp-bytes [x]
- (with-open [out (java.io.ByteArrayOutputStream.)]
- (io/copy (io/input-stream x) out)
- (.toByteArray out)))
-
-(def markov
- {"hello" {:freq 5
- :children {"world" {:freq 2}
- "eric" {:freq 1}}}
- "goodbye" {:freq 2
- :children {"cruel" {:freq 1
- :children {"world" {:freq 1}}}}}
- "world" {:freq 3}
- "eric" {:freq 1}
- "cruel" {:freq 1
- :children {"world" {:freq 1}}}})
-
-(def markov
- {"world"
- {:freq 3
- :children {"hello" {:freq 2}
- "goodbye" {:freq 2}
- "cruel" {:freq 1
- :children {"goodbye" {:freq 1}}}}}
- "hello"
- {:freq 2}
- "goodbye"
- {:freq 3}
- "cruel"
- {:freq 1
- :children {"goodbye" {:freq 1}}}})
-
-(comment
- (with-open [o (io/output-stream "test.bin")]
- (.write o (byte-array 8) 0 8))
-
- (let [ba (slurp-bytes "test.bin")
- len (count ba)]
- )
-
- (with-open [o (io/output-stream "test.bin")]
- (.write o (count dictionary))
- (run!
- (fn [[i w]]
- (let [b (vb-encode i)]
- (.write o b 0 (count b)))
- (let [b (.getBytes w)]
- (.write o b 0 (count b))))
- (map vector (range) dictionary))
- )
-
- )
diff --git a/src/com/owoga/prhyme/util.clj b/src/com/owoga/prhyme/util.clj
index 6f7526f..31160b4 100644
--- a/src/com/owoga/prhyme/util.clj
+++ b/src/com/owoga/prhyme/util.clj
@@ -24,14 +24,18 @@
(comment
(map str (.getPhones cmu-lexicon "two" nil)))
-(defn get-phones [word]
+(defn get-phones
+ "String must be lowercase."
+ [word]
(->> (map str (.getPhones cmu-lexicon word nil))
(map remove-stress)
(map convert-to-sphinx)
(map string/upper-case)))
(defn get-phones-with-stress
- ".getPhones only "
+ "String must be lowercase.
+ .getPhones only.
+ Might be different from stress in cmu-dict"
[word]
(->> (map str (.getPhones cmu-lexicon word nil))
(map convert-to-sphinx)