diff --git a/dev/examples/tpt.clj b/dev/examples/tpt.clj index 30a5dff..367fa31 100644 --- a/dev/examples/tpt.clj +++ b/dev/examples/tpt.clj @@ -218,46 +218,6 @@ (sort-by :count) reverse)) -(defn rhyme-trie-transducer [xf] - (let [trie (volatile! (trie/make-trie)) - database (atom {}) - next-id (volatile! 1)] - (fn - ([] (xf)) - ([result] - (reset! trie-database @database) - (xf result)) - ([result input] - (let [ngrams-ids - (mapv - (fn [ngrams] - (mapv - (fn [ngram] - (let [gram-ids (mapv - (fn [gram] - (let [gram-id (get @database gram @next-id)] - (when (.equals gram-id @next-id) - (swap! database - #(-> % - (assoc gram gram-id) - (assoc gram-id gram))) - (vswap! next-id inc)) - gram-id)) - ngram) - ngram-id (get database gram-ids @next-id)] - gram-ids)) - ngrams)) - input)] - (vswap! - trie - (fn [trie ngrams-ids] - (reduce - (fn [trie [ngram-ids _]] - (update trie ngram-ids (fnil #(update % 1 inc) [(peek ngram-ids) 0]))) - trie - ngrams-ids)) - ngrams-ids)))))) - (comment (transduce (comp (xf-file-seq 0 10) (map slurp) diff --git a/src/com/owoga/corpus/markov.clj b/src/com/owoga/corpus/markov.clj index e378009..6333a21 100644 --- a/src/com/owoga/corpus/markov.clj +++ b/src/com/owoga/corpus/markov.clj @@ -1,9 +1,13 @@ (ns com.owoga.corpus.markov - (:require [com.owoga.prhyme.util :as util] + (:require [com.owoga.prhyme.core :as prhyme] + [com.owoga.prhyme.util :as util] [com.owoga.prhyme.data.dictionary :as dict] [com.owoga.prhyme.nlp.core :as nlp] + [com.owoga.trie :as trie] + [com.owoga.tightly-packed-trie :as tpt] [clojure.string :as string] - [clojure.java.io :as io])) + [clojure.java.io :as io] + [com.owoga.phonetics :as phonetics])) (defn clean-text [text] (string/lower-case (string/replace text #"[^a-zA-Z'\-\s]" ""))) @@ -37,4 +41,212 @@ "bar" 8}} {'("away") {"her" 10 "them" 50 - "baz" 99}})) + "baz" 99}}) + + ) + + +(defn slurp-file-to-read-string + "Returns the value of read-string of the contents of the file. + Useful for reading into memory a saved database of n-grams to identifiers + and identifiers to n-grams." + [filepath] + (read-string (slurp filepath))) + +(defn spit-edn-to-file + [filepath data] + (spit filepath (pr-str data))) + +(comment + (do + (spit-edn-to-file + "/tmp/spit-edn-test.txt" + {:a {:b :c}}) + (slurp-file-to-read-string "/tmp/spit-edn-test.txt"));; => {:a {:b :c}} + ) + +(defn xf-file-seq [start end] + (comp (remove #(.isDirectory %)) + (drop start) + (take end))) + +(defn stateful-transducer + "Stateful transform that crates a trie. + " + [xf] + (let [trie (volatile! (trie/make-trie)) + database (atom {}) + next-id (volatile! 1)] + (fn + ([] (xf)) + ([result] + (xf result)) + ([result map-entries-in] + (let [map-entries-out + (mapv + (fn [[lookup v]] + (mapv + (fn [key] + (let [key-id (get @database key @next-id)] + (when (.equals key-id @next-id) + (swap! database + #(-> % + (assoc key key-id) + (assoc key-id key))) + (vswap! next-id inc)) + + [(mapv @database lookup) v])) + lookup)) + map-entries-in)] + (vswap! + trie + (fn [trie map-entries-out] + (reduce + (fn [trie [lookup _]] + (update trie lookup (fnil #(update % 1 inc) [(peek lookup) 0]))) + trie + map-entries-out)) + map-entries-out)))))) + +(defn pad-tokens + "Pads the beginning with n - 1 tokens and + the end with 1 token." + [tokens n] + (vec (concat (vec (repeat (max 1 (dec n)) "")) tokens [""]))) + +(def re-word + "Regex for tokenizing a string into words + (including contractions and hyphenations), + commas, periods, and newlines." + #"(?s).*?([a-zA-Z\d]+(?:['\-]?[a-zA-Z]+)?|,|\.|\n)") + +(defn tokenize-line + [line] + (->> line + (string/trim) + (re-seq re-word) + (mapv second) + (mapv string/lower-case))) + +(defn text->ngrams + "Takes text from a file, including newlines. + Pads lines with and for start/end of line. + Pads beginning with n - 1 s" + [text n] + (->> text + util/clean-text + (#(string/split % #"\n+")) + (remove empty?) + (mapv tokenize-line) + (mapv #(pad-tokens % n)) + (mapv #(partition n 1 %)) + (mapv #(mapv vec %)) + (reduce #(into %1 %2) []))) + +(defn n-to-m-grams + "Exclusive of m, similar to range." + [n m text] + (loop [i n + r []] + (cond + (= i m) + r + :else + (recur (inc i) + (into r (text->ngrams text i)))))) + +(comment + (n-to-m-grams 1 3 "The quick brown fox jumps over the lazy dog.") + ;; => [[""] + ;; ["the"] + ;; ["quick"] + ;; ,,, + ;; ["the" "lazy"] + ;; ["lazy" "dog"] + ;; ["dog" ""]] + + ) +(defn prep-ngram-for-trie + "The tpt/trie expects values conjed into an ngram + to be of format '[[k1 k2 k3] value]." + [ngram] + (clojure.lang.MapEntry. (vec ngram) ngram)) + +(comment + (transduce (comp (xf-file-seq 501 2) + (map slurp) + (map (partial n-to-m-grams 1 4)) + (map (fn [ngrams] (mapv #(prep-ngram-for-trie %) ngrams))) + stateful-transducer) + conj + (file-seq (io/file "dark-corpus"))) + + ) +(defn initialize + "Takes an atom as a context. Swaps in :database, :trie, :rhyme-trie" + [context] + (swap! + context + assoc + :database + (with-open [rdr (io/reader "resources/backwards-database.bin")] + (into {} (map read-string (line-seq rdr))))) + + (swap! + context + assoc + :trie + (tpt/load-tightly-packed-trie-from-file + "resources/dark-corpus-backwards-tpt.bin" + (decode-fn (@context :database)))) + + (swap! + context + assoc + :perfect-rhyme-trie + (transduce + (comp + (map first) + (filter string?) + (map #(vector % (reverse (phonetics/get-phones %)))) + (map reverse)) + (completing + (fn [trie [k v]] + (update trie k (fnil #(update % 1 inc) [v 0])))) + (trie/make-trie) + (@context :database))) + + (swap! + context + assoc + :rhyme-trie + (transduce + (comp + (map first) + (filter string?) + (map (fn [word] + (let [phones-coll (phonetics/get-phones)] + (map + #(vector (reverse (phonetics/get-phones %)) word) + phones-coll))))) + (completing + (fn [trie [k v]] + (update trie k (fnil #(update % 1 inc) [v 0])))) + (trie/make-trie) + (@context :database))) + + (swap! + context + assoc + :flex-rhyme-trie + (transduce + (comp + (map first) + (filter string?) + (map #(vector (reverse (prhyme/phrase->flex-rhyme-phones %)) %))) + (completing + (fn [trie [k v]] + (update trie k (fnil conj [v]) v))) + (trie/make-trie) + (@context :database))) + nil) diff --git a/src/com/owoga/prhyme/core.clj b/src/com/owoga/prhyme/core.clj index 1811738..f468c0f 100644 --- a/src/com/owoga/prhyme/core.clj +++ b/src/com/owoga/prhyme/core.clj @@ -1,9 +1,12 @@ (ns com.owoga.prhyme.core (:require [clojure.zip :as zip] [clojure.string :as string] + [com.owoga.prhyme.data.dictionary :as dict] + [com.owoga.prhyme.util :as util] + [com.owoga.phonetics :as phonetics] + [com.owoga.phonetics.syllabify :as syllabify] [com.owoga.prhyme.util :as u] - [com.owoga.prhyme.syllabify :as s] - [com.owoga.prhyme.data.phonetics :as phonetics])) + [com.owoga.prhyme.syllabify :as s])) ;;; Typical rhyme model (explanation of following 3 functions) ;; @@ -128,6 +131,28 @@ word)))) (merge-phrase-words phrase)))) +(defn phrase->flex-rhyme-phones + "Takes a space-seperated string of words + and returns the concatenation of the words + vowel phones. + + Returns them in reversed order so they + are ready to be used in a lookup of a rhyme trie. + " + [phrase] + (->> phrase + (#(string/split % #" ")) + (map (comp syllabify/syllabify first phonetics/get-phones)) + (map (partial reduce into [])) + (map #(filter (partial re-find #"\d") %)) + (flatten) + (map #(string/replace % #"\d" "")) + (reverse))) + +(comment + (phrase->flex-rhyme-phones "bother me");; => ("IY" "ER" "AA") + ) + (defn words-by-rime* [words] (let [words-with-rime (->> words (map rest)