diff --git a/.gitattributes b/.gitattributes index 7d75ef1..0b1d71a 100644 --- a/.gitattributes +++ b/.gitattributes @@ -13,3 +13,5 @@ resources/structure-freqs filter=lfs diff=lfs merge=lfs -text research filter=lfs diff=lfs merge=lfs -text resources/dark-corpus-tpt.bin filter=lfs diff=lfs merge=lfs -text resources/database.bin filter=lfs diff=lfs merge=lfs -text +resources/backwards-database.bin filter=lfs diff=lfs merge=lfs -text +resources/dark-corpus-backwards-tpt.bin filter=lfs diff=lfs merge=lfs -text diff --git a/dev/examples/tpt.clj b/dev/examples/tpt.clj index f711e3f..6f9161c 100644 --- a/dev/examples/tpt.clj +++ b/dev/examples/tpt.clj @@ -60,6 +60,23 @@ (mapv #(mapv vec %)) (reduce #(into %1 %2) []))) +(defn text->backwards-ngrams + "Takes text from a file, including newlines. + Pads lines with and for start/end of line. + Pads beginning with n - 1 s" + [text n] + (->> text + util/clean-text + (#(string/split % #"\n+")) + (remove empty?) + (mapv tokenize-line) + (mapv #(pad-tokens % n)) + reverse + (mapv reverse) + (mapv #(partition n 1 %)) + (mapv #(mapv vec %)) + (reduce #(into %1 %2) []))) + (defn n-to-m-grams "Exclusive of m, similar to range." [n m text] @@ -72,6 +89,18 @@ (recur (inc i) (into r (text->ngrams text i)))))) +(defn n-to-m-backwards-grams + "Exclusive of m, similar to range." + [n m text] + (loop [i n + r []] + (cond + (= i m) + r + :else + (recur (inc i) + (into r (text->backwards-ngrams text i)))))) + (declare ->TrieKey) (deftype TrieKey [key] @@ -188,6 +217,46 @@ (sort-by :count) reverse)) +(defn rhyme-trie-transducer [xf] + (let [trie (volatile! (trie/make-trie)) + database (atom {}) + next-id (volatile! 1)] + (fn + ([] (xf)) + ([result] + (reset! trie-database @database) + (xf result)) + ([result input] + (let [ngrams-ids + (mapv + (fn [ngrams] + (mapv + (fn [ngram] + (let [gram-ids (mapv + (fn [gram] + (let [gram-id (get @database gram @next-id)] + (when (.equals gram-id @next-id) + (swap! database + #(-> % + (assoc gram gram-id) + (assoc gram-id gram))) + (vswap! next-id inc)) + gram-id)) + ngram) + ngram-id (get database gram-ids @next-id)] + gram-ids)) + ngrams)) + input)] + (vswap! + trie + (fn [trie ngrams-ids] + (reduce + (fn [trie [ngram-ids _]] + (update trie ngram-ids (fnil #(update % 1 inc) [(peek ngram-ids) 0]))) + trie + ngrams-ids)) + ngrams-ids)))))) + (comment (time (def trie @@ -199,7 +268,16 @@ conj (file-seq (io/file "dark-corpus"))))) - (take 20 trie) + (time + (def backwards-trie + (transduce (comp (xf-file-seq 0 1000) + (map slurp) + (map (partial n-to-m-backwards-grams 1 4)) + (map (fn [ngrams] (map #(prep-ngram-for-trie %) ngrams))) + stateful-transducer) + conj + (file-seq (io/file "dark-corpus"))))) + ) (defn encode-fn [v] @@ -225,6 +303,13 @@ encode-fn (decode-fn @trie-database)))) + (time + (def tightly-packed-backwards-trie + (tpt/tightly-packed-trie + backwards-trie + encode-fn + (decode-fn @trie-database)))) + ) (defn key-get-in-tpt [tpt db ks] @@ -240,7 +325,97 @@ +(defn word->phones [word] + (or (dict/word->cmu-phones word) + (util/get-phones-with-stress word))) + +(defn perfect-rhymes [rhyme-trie phones] + (let [rhyme-suffix (first + (util/take-through + #(= (last %) \1) + (reverse phones)))] + (trie/lookup rhyme-trie rhyme-suffix))) + +(defn n+1grams [trie k] + (->> (trie/lookup trie k) + (trie/children) + (map #(get % [])))) + +(comment + ;; Bigrams of rhyme + (->> (perfect-rhymes rhyme-trie (or (dict/cmu-with-stress-map "pleasing") + (util/get-phones-with-stress "pleasing"))) + (map (comp first second)) + (remove nil?) + (map @trie-database) + (map #(vector [%] (n+1grams + tightly-packed-backwards-trie + [%]))) + (map (fn [[w1 w2s]] + (mapv #(into w1 [(nth % 0)]) w2s))) + (reduce into []) + (map (fn [k] + (let [children (->> (n+1grams tightly-packed-backwards-trie k) + (mapv first))] + (mapv #(into k [%]) children)))) + (reduce into []) + (map #(map @trie-database %))) + + ) (comment + (do + (time + (def backwards-trie + (transduce (comp (xf-file-seq 0 250000) + (map slurp) + (map (partial n-to-m-backwards-grams 1 4)) + (map (fn [ngrams] (map #(prep-ngram-for-trie %) ngrams))) + stateful-transducer) + conj + (file-seq (io/file "dark-corpus"))))) + (time + (def tightly-packed-backwards-trie + (tpt/tightly-packed-trie + backwards-trie + encode-fn + (decode-fn @trie-database)))) + (tpt/save-tightly-packed-trie-to-file + "resources/dark-corpus-backwards-tpt.bin" + tightly-packed-backwards-trie) + (with-open [wtr (clojure.java.io/writer "resources/backwards-database.bin")] + (let [lines (->> (seq @trie-database) + (map pr-str) + (map #(str % "\n")))] + (doseq [line lines] + (.write wtr line)))) + (def loaded-backwards-trie + (tpt/load-tightly-packed-trie-from-file + "resources/dark-corpus-backwards-tpt.bin" + (decode-fn @trie-database))) + (def loaded-database + (atom (with-open [rdr (clojure.java.io/reader "resources/backwards-database.bin")] + (into {} (map read-string (line-seq rdr)))))) + (->> (take 20 loaded-backwards-trie) + (map first) + (map (partial map @loaded-database))) + (def rhyme-database (atom {})) + (def rhyme-trie + (transduce + (comp + (map first) + (filter string?) + (map #(vector % (reverse (word->phones %)))) + (map reverse)) + (completing + (fn [trie [k v]] + (update trie k (fnil #(update % 1 inc) [v 0])))) + (trie/make-trie) + @loaded-database)) + + (trie/lookup rhyme-trie '("IY0" "JH")) + + ) + (tpt/save-tightly-packed-trie-to-file "dark-corpus-tpt.bin" tightly-packed-trie) (def loaded-tightly-packed-trie (tpt/load-tightly-packed-trie-from-file @@ -292,6 +467,11 @@ (atom (with-open [rdr (clojure.java.io/reader "database.bin")] (into {} (map read-string (line-seq rdr)))))) + (->> loaded-tightly-packed-trie + (take 20) + (map first) + (map (partial map @trie-database))) + (profile {} (def example-story diff --git a/resources/backwards-database.bin b/resources/backwards-database.bin new file mode 100644 index 0000000..c2c1c3d --- /dev/null +++ b/resources/backwards-database.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b5fb5f328f0c7cbac924c54943aa3dc05b42fe0388b2a1dbf752f4fb89bc743 +size 17713048 diff --git a/resources/dark-corpus-backwards-tpt.bin b/resources/dark-corpus-backwards-tpt.bin new file mode 100644 index 0000000..7aad424 --- /dev/null +++ b/resources/dark-corpus-backwards-tpt.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13c4b4d690a955a052e06b7abbd55776f82b8f57ad0c0b309d59d629372dbab7 +size 109432963 diff --git a/src/com/owoga/prhyme/corpus/db.clj b/src/com/owoga/prhyme/corpus/db.clj deleted file mode 100644 index 26ff7a9..0000000 --- a/src/com/owoga/prhyme/corpus/db.clj +++ /dev/null @@ -1,6 +0,0 @@ -(ns com.owoga.prhyme.corpus.db - (:require [integrant.core :as ig])) - -(defn tokens->db - [tokens] - ) diff --git a/src/com/owoga/prhyme/data/dictionary.clj b/src/com/owoga/prhyme/data/dictionary.clj index 4728255..c8b714b 100644 --- a/src/com/owoga/prhyme/data/dictionary.clj +++ b/src/com/owoga/prhyme/data/dictionary.clj @@ -12,6 +12,18 @@ (drop-while #(= \; (first %))) (map #(string/split % #"\s+")))) +(def cmu-with-stress-map + (->> cmu-with-stress + (map (partial split-at 1)) + (map #(vector + (string/lower-case + (first (first %))) + (second %))) + (into {}))) + +(defn word->cmu-phones [word] + (get cmu-with-stress-map word)) + (def cmu-dict (->> (io/reader (io/resource "cmudict_SPHINX_40")) (line-seq) @@ -27,6 +39,15 @@ :else (recur (rest words) (update accum key (fnil conj []) (rest word))))))) +(comment + (take 5 (seq spelling->phonemes)) + ;; => (["daphnis" [("D" "AE" "F" "N" "AH" "S")]] + ;; ["dammam" [("D" "AE" "M" "AH" "M")]] + ;; ["kirschenmann" [("K" "ER" "SH" "AH" "N" "M" "AH" "N")]] + ;; ["baumgart" [("B" "AW" "M" "G" "AA" "R" "T") ("B" "AA" "M" "G" "AA" "R" "T")]] + ;; ["probasco" [("P" "R" "OW" "B" "AA" "S" "K" "OW")]]) + ) + (def phrase->Word (into {} @@ -36,6 +57,15 @@ phonemes]) cmu-dict))) +(comment + (take 5 (seq phrase->Word)) + ;; => (["daphnis" ("D" "AE" "F" "N" "AH" "S")] + ;; ["dammam" ("D" "AE" "M" "AH" "M")] + ;; ["kirschenmann" ("K" "ER" "SH" "AH" "N" "M" "AH" "N")] + ;; ["baumgart" ("B" "AW" "M" "G" "AA" "R" "T")] + ;; ["probasco" ("P" "R" "OW" "B" "AA" "S" "K" "OW")]) + ) + (def prhyme-dict (into [] (map prhyme/cmu->prhyme cmu-dict))) diff --git a/src/com/owoga/prhyme/data/phonetics.clj b/src/com/owoga/prhyme/data/phonetics.clj index a014bc1..c01773e 100644 --- a/src/com/owoga/prhyme/data/phonetics.clj +++ b/src/com/owoga/prhyme/data/phonetics.clj @@ -9,6 +9,11 @@ (map #(string/split % #"\t")) (into {}))) +(comment + (take 5 (seq phonemap)) + ;; => (["T" "stop"] ["CH" "affricate"] ["K" "stop"] ["HH" "aspirate"] ["UH" "vowel"]) + ) + (def long-vowel #{"EY" "IY" "AY" "OW" "UW"}) (def short-vowel #{"AA" "AE" "AH" "AO" "AW" "EH" "ER" "IH" "OY" "UH"}) diff --git a/src/com/owoga/prhyme/data/tpt.clj b/src/com/owoga/prhyme/data/tpt.clj index cc19591..8ddca01 100644 --- a/src/com/owoga/prhyme/data/tpt.clj +++ b/src/com/owoga/prhyme/data/tpt.clj @@ -1,295 +1,7 @@ (ns com.owoga.prhyme.data.tpt - (:require [clojure.string :as string] + (:require [com.owoga.tightly-packed-trie :as tpt] + [com.owoga.trie :as trie] + [clojure.string :as string] [clojure.java.io :as io]) (:import (java.nio ByteBuffer) (java.lang.reflect Array))) - -(defn ubyte [b] - (bit-and 0xff b)) - -(defn sbyte [b] - (bit-or -0x100 b)) - -(defn nth-bit [n b] - (let [mask (bit-shift-left 1 n)] - (bit-and - 1 - (bit-shift-right - (bit-and mask b) - n)))) -(reduce (fn [a _] (bit-or 1 (bit-shift-left a 1))) 0 (range 2)) - -(defn ones-mask [n] - (reduce (fn [a _] (bit-or 1 (bit-shift-left a 1))) 0 (range n))) - -(defn bit-slice - "Start is least-significant bit. - (bit-slice 2 6 10101010) - -> ,1010, - " - [start end b] - (let [mask (bit-shift-left (ones-mask (- end start)) start)] - (bit-shift-right (bit-and b mask) start))) - -(defn as-binary-string [b] - (string/replace - (format "%8s" (Integer/toBinaryString b)) - #" " - "0")) - -(defn bits [binary-string] - (Integer/parseInt binary-string 2)) - -(defn bit-on [i b] - (bit-or b (int (Math/pow 2 i)))) - -;; I've got a dictionary of ~120,000 words. -;; I need to reference them in a compact way. -;; It will be a little bit of a waste, but we can use a 32 bit index. - -(/ (Math/log 1.2e5) (Math/log 2)) - - - -(defn vb-encode [n] - (loop [n n - b '()] - (cond - (< n 128) (let [ba (byte-array (cons n b)) - i (dec (count ba))] - (aset-byte ba i (sbyte (+ 128 (aget ba i)))) - ba) - :else (recur (quot n 128) (cons (mod n 128) b))))) - -(defn vb-decode-1 [ba] - (loop [n 0 - i 0] - (let [b (aget ba i)] - (cond - (> (bit-and b 0x80) 0) - [(bit-or (bit-and 0x7f b) - (bit-shift-left n 7)) - (inc i)] - :else - (recur (bit-or (bit-and 0x7f b) - (bit-shift-left n 7)) - (inc i)))))) - -(defn vb-decode - ([ba] - (vb-decode ba 0)) - ([ba i] - (if (>= i (count ba)) - (cons (first (vb-decode-1 ba)) - nil) - (let [[value byte-count] (vb-decode-1 ba)] - (lazy-seq - (cons - value - (vb-decode (byte-array (drop byte-count ba)) - (+ i byte-count)))))))) - -(comment - (let [n1 0 - n2 1 - n3 127 - n4 128 - n5 257 - n6 9876543210 - baos (java.io.ByteArrayOutputStream.)] - (->> [n1 n2 n3 n4 n5 n6] - (map vb-encode) - (run! #(.writeBytes baos %))) - (let [ba (.toByteArray baos)] - (vb-decode ba))) - ;; => ([0 1] [1 1] [127 1] [128 2] [257 2] [9876543210 5]) - ) - -(defn combine-significant-bits [num-significant-bits & bytes] - (reduce - (fn [a b] - (bit-or b (bit-shift-left a num-significant-bits))) - bytes)) - -(comment - (let [b1 (bits "0110110") - b2 (bits "1001001") - ;; remove 2 flag bits - slice (partial bit-slice 0 6) - b1' (slice b1) - b2' (slice b2)] - (map - as-binary-string - [b1 - b2 - b1' - b2' - (combine-significant-bits 6 b1' b2' )])) - ;; => ("00110110" "01001001" "00110110" "00001001" "110110001001") - ) - -(defn byte-buffer-variable-length-decode - [bb] - (let [combine (fn [n b] - (bit-or (bit-and 0x7f b) - (bit-shift-left n 7)))] - (loop [n 0 i 0] - (let [b (.get bb)] - (if (zero? (bit-and b 0x80)) - (recur - (combine n b) - (inc i)) - (combine n b)))))) - -(comment - (let [bb (java.nio.ByteBuffer/wrap (vb-encode 9876543210))] - (println (.limit bb)) - (byte-buffer-variable-length-decode bb)) - - ) - -(defn encode-with-flag-bits - "Flag is a binary string. - Returns byte array." - [flag n] - (let [flag-len (count flag) - data-len (- 8 flag-len) - max-data-val (int (Math/pow 2 data-len)) - flag-val (bit-shift-left (bits flag) data-len)] - (loop [n n r '()] - (if (< n max-data-val) - (byte-array (cons (bit-or flag-val n) r)) - (recur - (quot n max-data-val) - (cons - (bit-or flag-val (mod n max-data-val)) - r)))))) - -(comment - (let [ns [0 1 127 128 257 9876543210] - encoded (map - (partial encode-with-flag-bits "101") - ns) - as-binary (->> encoded - (map (partial map as-binary-string)) - (map (partial map (partial take-last 8))) - (map (partial map (partial apply str))))] - as-binary) - ;; => (("10100000") - ;; ("10100001") - ;; ("10100011" "10111111") - ;; ("10100100" "10100000") - ;; ("10101000" "10100001") - ;; ("10101001" "10100110" "10101011" "10100000" "10100101" "10110111" "10101010")) - ) - -(defn decode-with-flag-bits - "Flag is only used for length. It could be refactored to take an int. - Or it could be refactored to take a function that does work based on the flags. - For now though, flag bits must be checked external to this. This is just parity - decoding of the encoding above. - Returns [decoded-number number-of-bytes-read]." - [flag ba] - (let [flag-len (count flag) - data-len (- 8 flag-len) - mask-val (int (dec (Math/pow 2 data-len)))] - (loop [ba ba r 0 i 0] - (if (empty? ba) - [r i] - (let [data-val (bit-and mask-val (first ba))] - (recur - (rest ba) - (bit-or - (bit-shift-left r data-len) - data-val) - (inc i))))))) - -(comment - (let [ns [0 1 127 128 257 9876543210] - encoded (map - (partial encode-with-flag-bits "101") - ns) - decoded (->> encoded - (map (partial decode-with-flag-bits "101")))] - decoded) - ;; => ([0 1] [1 1] [127 2] [128 2] [257 2] [9876543210 7]) - (let [ns [0 1 127 128 257 9876543210] - encoded (map - (partial encode-with-flag-bits "001") - ns) - decoded (->> encoded - (map (partial decode-with-flag-bits "101")))] - decoded) - ;; => ([0 1] [1 1] [127 2] [128 2] [257 2] [9876543210 7]) - ) - -(defn encode-byte-with-n-flag-bits [n b] - (let [max-size-with-flag (int (Math/pow 2 (- 8 n)))] - (loop [b b r '()] - (if (< b max-size-with-flag) - (byte-array (cons b r)) - (recur (quot b max-size-with-flag) - (cons (mod b max-size-with-flag) r)))))) - -(defn decode-byte-with-n-flag-bits [n ba] - (let [max-size-with-flag (int (Math/pow 2 (- 8 n)))] - (loop [ba ba r 0] - (if (nil? ba) - r - (recur (rest ba) - (+ r )))))) - -(def dictionary ["hi" "my" "name" "is" "what"]) - -(defn slurp-bytes [x] - (with-open [out (java.io.ByteArrayOutputStream.)] - (io/copy (io/input-stream x) out) - (.toByteArray out))) - -(def markov - {"hello" {:freq 5 - :children {"world" {:freq 2} - "eric" {:freq 1}}} - "goodbye" {:freq 2 - :children {"cruel" {:freq 1 - :children {"world" {:freq 1}}}}} - "world" {:freq 3} - "eric" {:freq 1} - "cruel" {:freq 1 - :children {"world" {:freq 1}}}}) - -(def markov - {"world" - {:freq 3 - :children {"hello" {:freq 2} - "goodbye" {:freq 2} - "cruel" {:freq 1 - :children {"goodbye" {:freq 1}}}}} - "hello" - {:freq 2} - "goodbye" - {:freq 3} - "cruel" - {:freq 1 - :children {"goodbye" {:freq 1}}}}) - -(comment - (with-open [o (io/output-stream "test.bin")] - (.write o (byte-array 8) 0 8)) - - (let [ba (slurp-bytes "test.bin") - len (count ba)] - ) - - (with-open [o (io/output-stream "test.bin")] - (.write o (count dictionary)) - (run! - (fn [[i w]] - (let [b (vb-encode i)] - (.write o b 0 (count b))) - (let [b (.getBytes w)] - (.write o b 0 (count b)))) - (map vector (range) dictionary)) - ) - - ) diff --git a/src/com/owoga/prhyme/util.clj b/src/com/owoga/prhyme/util.clj index 6f7526f..31160b4 100644 --- a/src/com/owoga/prhyme/util.clj +++ b/src/com/owoga/prhyme/util.clj @@ -24,14 +24,18 @@ (comment (map str (.getPhones cmu-lexicon "two" nil))) -(defn get-phones [word] +(defn get-phones + "String must be lowercase." + [word] (->> (map str (.getPhones cmu-lexicon word nil)) (map remove-stress) (map convert-to-sphinx) (map string/upper-case))) (defn get-phones-with-stress - ".getPhones only " + "String must be lowercase. + .getPhones only. + Might be different from stress in cmu-dict" [word] (->> (map str (.getPhones cmu-lexicon word nil)) (map convert-to-sphinx)