Cleanup and move markov-code to markov-ns

main
Eric Ihli 3 years ago
parent 62a1ff3a3b
commit 60e1c4fd73

@ -218,46 +218,6 @@
(sort-by :count) (sort-by :count)
reverse)) reverse))
(defn rhyme-trie-transducer [xf]
(let [trie (volatile! (trie/make-trie))
database (atom {})
next-id (volatile! 1)]
(fn
([] (xf))
([result]
(reset! trie-database @database)
(xf result))
([result input]
(let [ngrams-ids
(mapv
(fn [ngrams]
(mapv
(fn [ngram]
(let [gram-ids (mapv
(fn [gram]
(let [gram-id (get @database gram @next-id)]
(when (.equals gram-id @next-id)
(swap! database
#(-> %
(assoc gram gram-id)
(assoc gram-id gram)))
(vswap! next-id inc))
gram-id))
ngram)
ngram-id (get database gram-ids @next-id)]
gram-ids))
ngrams))
input)]
(vswap!
trie
(fn [trie ngrams-ids]
(reduce
(fn [trie [ngram-ids _]]
(update trie ngram-ids (fnil #(update % 1 inc) [(peek ngram-ids) 0])))
trie
ngrams-ids))
ngrams-ids))))))
(comment (comment
(transduce (comp (xf-file-seq 0 10) (transduce (comp (xf-file-seq 0 10)
(map slurp) (map slurp)

@ -1,9 +1,13 @@
(ns com.owoga.corpus.markov (ns com.owoga.corpus.markov
(:require [com.owoga.prhyme.util :as util] (:require [com.owoga.prhyme.core :as prhyme]
[com.owoga.prhyme.util :as util]
[com.owoga.prhyme.data.dictionary :as dict] [com.owoga.prhyme.data.dictionary :as dict]
[com.owoga.prhyme.nlp.core :as nlp] [com.owoga.prhyme.nlp.core :as nlp]
[com.owoga.trie :as trie]
[com.owoga.tightly-packed-trie :as tpt]
[clojure.string :as string] [clojure.string :as string]
[clojure.java.io :as io])) [clojure.java.io :as io]
[com.owoga.phonetics :as phonetics]))
(defn clean-text [text] (defn clean-text [text]
(string/lower-case (string/replace text #"[^a-zA-Z'\-\s]" ""))) (string/lower-case (string/replace text #"[^a-zA-Z'\-\s]" "")))
@ -37,4 +41,212 @@
"bar" 8}} "bar" 8}}
{'("away") {"her" 10 {'("away") {"her" 10
"them" 50 "them" 50
"baz" 99}})) "baz" 99}})
)
(defn slurp-file-to-read-string
"Returns the value of read-string of the contents of the file.
Useful for reading into memory a saved database of n-grams to identifiers
and identifiers to n-grams."
[filepath]
(read-string (slurp filepath)))
(defn spit-edn-to-file
[filepath data]
(spit filepath (pr-str data)))
(comment
(do
(spit-edn-to-file
"/tmp/spit-edn-test.txt"
{:a {:b :c}})
(slurp-file-to-read-string "/tmp/spit-edn-test.txt"));; => {:a {:b :c}}
)
(defn xf-file-seq [start end]
(comp (remove #(.isDirectory %))
(drop start)
(take end)))
(defn stateful-transducer
"Stateful transform that crates a trie.
"
[xf]
(let [trie (volatile! (trie/make-trie))
database (atom {})
next-id (volatile! 1)]
(fn
([] (xf))
([result]
(xf result))
([result map-entries-in]
(let [map-entries-out
(mapv
(fn [[lookup v]]
(mapv
(fn [key]
(let [key-id (get @database key @next-id)]
(when (.equals key-id @next-id)
(swap! database
#(-> %
(assoc key key-id)
(assoc key-id key)))
(vswap! next-id inc))
[(mapv @database lookup) v]))
lookup))
map-entries-in)]
(vswap!
trie
(fn [trie map-entries-out]
(reduce
(fn [trie [lookup _]]
(update trie lookup (fnil #(update % 1 inc) [(peek lookup) 0])))
trie
map-entries-out))
map-entries-out))))))
(defn pad-tokens
"Pads the beginning with n - 1 <s> tokens and
the end with 1 </s> token."
[tokens n]
(vec (concat (vec (repeat (max 1 (dec n)) "<s>")) tokens ["</s>"])))
(def re-word
"Regex for tokenizing a string into words
(including contractions and hyphenations),
commas, periods, and newlines."
#"(?s).*?([a-zA-Z\d]+(?:['\-]?[a-zA-Z]+)?|,|\.|\n)")
(defn tokenize-line
[line]
(->> line
(string/trim)
(re-seq re-word)
(mapv second)
(mapv string/lower-case)))
(defn text->ngrams
"Takes text from a file, including newlines.
Pads lines with <s> and </s> for start/end of line.
Pads beginning with n - 1 <s>s"
[text n]
(->> text
util/clean-text
(#(string/split % #"\n+"))
(remove empty?)
(mapv tokenize-line)
(mapv #(pad-tokens % n))
(mapv #(partition n 1 %))
(mapv #(mapv vec %))
(reduce #(into %1 %2) [])))
(defn n-to-m-grams
"Exclusive of m, similar to range."
[n m text]
(loop [i n
r []]
(cond
(= i m)
r
:else
(recur (inc i)
(into r (text->ngrams text i))))))
(comment
(n-to-m-grams 1 3 "The quick brown fox jumps over the lazy dog.")
;; => [["<s>"]
;; ["the"]
;; ["quick"]
;; ,,,
;; ["the" "lazy"]
;; ["lazy" "dog"]
;; ["dog" "</s>"]]
)
(defn prep-ngram-for-trie
"The tpt/trie expects values conjed into an ngram
to be of format '[[k1 k2 k3] value]."
[ngram]
(clojure.lang.MapEntry. (vec ngram) ngram))
(comment
(transduce (comp (xf-file-seq 501 2)
(map slurp)
(map (partial n-to-m-grams 1 4))
(map (fn [ngrams] (mapv #(prep-ngram-for-trie %) ngrams)))
stateful-transducer)
conj
(file-seq (io/file "dark-corpus")))
)
(defn initialize
"Takes an atom as a context. Swaps in :database, :trie, :rhyme-trie"
[context]
(swap!
context
assoc
:database
(with-open [rdr (io/reader "resources/backwards-database.bin")]
(into {} (map read-string (line-seq rdr)))))
(swap!
context
assoc
:trie
(tpt/load-tightly-packed-trie-from-file
"resources/dark-corpus-backwards-tpt.bin"
(decode-fn (@context :database))))
(swap!
context
assoc
:perfect-rhyme-trie
(transduce
(comp
(map first)
(filter string?)
(map #(vector % (reverse (phonetics/get-phones %))))
(map reverse))
(completing
(fn [trie [k v]]
(update trie k (fnil #(update % 1 inc) [v 0]))))
(trie/make-trie)
(@context :database)))
(swap!
context
assoc
:rhyme-trie
(transduce
(comp
(map first)
(filter string?)
(map (fn [word]
(let [phones-coll (phonetics/get-phones)]
(map
#(vector (reverse (phonetics/get-phones %)) word)
phones-coll)))))
(completing
(fn [trie [k v]]
(update trie k (fnil #(update % 1 inc) [v 0]))))
(trie/make-trie)
(@context :database)))
(swap!
context
assoc
:flex-rhyme-trie
(transduce
(comp
(map first)
(filter string?)
(map #(vector (reverse (prhyme/phrase->flex-rhyme-phones %)) %)))
(completing
(fn [trie [k v]]
(update trie k (fnil conj [v]) v)))
(trie/make-trie)
(@context :database)))
nil)

@ -1,9 +1,12 @@
(ns com.owoga.prhyme.core (ns com.owoga.prhyme.core
(:require [clojure.zip :as zip] (:require [clojure.zip :as zip]
[clojure.string :as string] [clojure.string :as string]
[com.owoga.prhyme.data.dictionary :as dict]
[com.owoga.prhyme.util :as util]
[com.owoga.phonetics :as phonetics]
[com.owoga.phonetics.syllabify :as syllabify]
[com.owoga.prhyme.util :as u] [com.owoga.prhyme.util :as u]
[com.owoga.prhyme.syllabify :as s] [com.owoga.prhyme.syllabify :as s]))
[com.owoga.prhyme.data.phonetics :as phonetics]))
;;; Typical rhyme model (explanation of following 3 functions) ;;; Typical rhyme model (explanation of following 3 functions)
;; ;;
@ -128,6 +131,28 @@
word)))) word))))
(merge-phrase-words phrase)))) (merge-phrase-words phrase))))
(defn phrase->flex-rhyme-phones
"Takes a space-seperated string of words
and returns the concatenation of the words
vowel phones.
Returns them in reversed order so they
are ready to be used in a lookup of a rhyme trie.
"
[phrase]
(->> phrase
(#(string/split % #" "))
(map (comp syllabify/syllabify first phonetics/get-phones))
(map (partial reduce into []))
(map #(filter (partial re-find #"\d") %))
(flatten)
(map #(string/replace % #"\d" ""))
(reverse)))
(comment
(phrase->flex-rhyme-phones "bother me");; => ("IY" "ER" "AA")
)
(defn words-by-rime* [words] (defn words-by-rime* [words]
(let [words-with-rime (->> words (let [words-with-rime (->> words
(map rest) (map rest)

Loading…
Cancel
Save