diff --git a/src/com/owoga/prhyme/data/dictionary.clj b/src/com/owoga/prhyme/data/dictionary.clj index c8b714b..45a8bc6 100644 --- a/src/com/owoga/prhyme/data/dictionary.clj +++ b/src/com/owoga/prhyme/data/dictionary.clj @@ -98,7 +98,7 @@ english-words (->> words (filter #(word-set (string/lower-case %))))] - (< 0.7 (/ (count english-words) (max 1 (count words)))))) + (< 0.8 (/ (count english-words) (max 1 (count words)))))) (comment (let [phoneme-lookup (into diff --git a/src/com/owoga/prhyme/data_transform.clj b/src/com/owoga/prhyme/data_transform.clj index 5412caa..cd08aa9 100644 --- a/src/com/owoga/prhyme/data_transform.clj +++ b/src/com/owoga/prhyme/data_transform.clj @@ -2,7 +2,6 @@ (:require [clojure.string :as string] [clojure.java.io :as io] [com.owoga.prhyme.data.dictionary :as dict] - [com.owoga.prhyme.nlp.core :as nlp] [com.owoga.trie :as trie] [com.owoga.tightly-packed-trie :as tpt] [com.owoga.tightly-packed-trie.encoding :as encoding] @@ -97,21 +96,23 @@ k)] [k' 1]))) -(defn xf-part-of-speech-database - [database] - (fn [sentence] - (let [leafs (->> sentence - nlp/treebank-zipper - nlp/leaf-pos-path-word-freqs)] - (run! - (fn [[k v]] - (swap! - database - assoc - k - (merge-with + (@database k) v))) - leafs) - sentence))) +(comment + ;; TODO: Move to nlp.core + (defn xf-part-of-speech-database + [database] + (fn [sentence] + (let [leafs (->> sentence + nlp/treebank-zipper + nlp/leaf-pos-path-word-freqs)] + (run! + (fn [[k v]] + (swap! + database + assoc + k + (merge-with + (@database k) v))) + leafs) + sentence)))) (comment (let [database (atom {})] @@ -206,32 +207,31 @@ (recur (conj result [k v]) (rest k))))) -(defn process-text - "Processes text into key value pairs where +(comment + ;; TODO: Move to nlp.core + (defn process-text + "Processes text into key value pairs where the keys are parts-of-speech paths and the values are the children at that path. Ready to be inserted into a trie." - [text] - (->> text - (split-text-into-sentences) - (map string/trim) - (remove empty?) - (mapv nlp/treebank-zipper) - (remove nil?) - (map nlp/parts-of-speech-trie-entries) - (mapv (fn [file] - (mapv (fn [line] - (mapv vec line)) - file))) - (reduce into []) - (map flatten-trie-entry-to-all-subkeys) - (reduce into []) - (mapv normalize-text) - (mapv (fn [[k v]] - (clojure.lang.MapEntry. (into (vec k) [v]) v))))) + [text] + (->> text + (split-text-into-sentences) + (map string/trim) + (remove empty?) + (mapv nlp/treebank-zipper) + (remove nil?) + (map nlp/parts-of-speech-trie-entries) + (reduce into []) + (map flatten-trie-entry-to-all-subkeys) + (reduce into []) + (mapv normalize-text) + (mapv (fn [[k v]] + (clojure.lang.MapEntry. (into (vec k) [v]) v)))))) (comment + (process-text (first texts)) (flatten-trie-entry-to-all-subkeys '[(TOP S NP) (NP PP)]) ;; => [[(TOP S NP) (NP PP)] [(S NP) (NP PP)] [(NP) (NP PP)]] @@ -304,7 +304,7 @@ trie entries))) (trie/make-trie) - (take 300 texts)))) + (take 3000 texts)))) (nippy/freeze-to-file "/tmp/test-trie.bin" (seq test-trie)) (time @@ -441,32 +441,6 @@ (#(zip/insert-right % (zip/node z2))) (zip/root)))) -(defn generate - [trie database zipper] - (let [k (map first (zip/path zipper))] - (do (Thread/sleep 10) (println k)) - (if (vector? (database (last k))) - (loop [zipper zipper] - (let [children (last (map first (zip/path zipper)))] - (Thread/sleep 50) (println children (zip/root zipper)) - (if (empty? children) - zipper - (recur - (-> zipper - zip/up - (zip/append-child [(first children)]) - (zip/down) - (zip/rightmost) - (zip/down) - (#(generate trie database %)) - (zip/up) - (zip/up) - (zip/down) - (zip/replace (subvec 1 children))))))) - (zip/insert-right - zipper - (choose trie database k))))) - (defn generate [trie database zipper] (cond @@ -526,12 +500,14 @@ (comment (trie/lookup test-trie [1]) - (->> (generate test-trie @test-database (zip/vector-zip [1])) - (zip/vector-zip) - (iterate zip/next) - (take-while (complement zip/end?)) - (map zip/node) - (filter string?)) + (repeatedly + 20 + #(->> (generate test-trie @test-database (zip/vector-zip [1])) + (zip/vector-zip) + (iterate zip/next) + (take-while (complement zip/end?)) + (map zip/node) + (filter string?))) (-> [:a [:b] [:b]] zip/vector-zip @@ -649,43 +625,47 @@ ) -(defn xf-grammar-database - [database] - (fn [sentence] - (let [leafs (->> sentence - nlp/treebank-zipper - nlp/leaf-pos-path-word-freqs)] - (run! - (fn [[k v]] - (swap! - database - assoc - k - (merge-with + (@database k) v))) - leafs) - sentence))) - -(defn file-seq->grammar-tree - [files] - (transduce - (comp - (xf-file-seq 0 1000) - (map slurp) - (map #(string/split % #"[\n+\?\.]")) - (map (partial transduce xf-tokenize conj)) - (map (partial transduce xf-filter-english conj)) - (map (partial remove empty?)) - (remove empty?) - (map (partial transduce xf-untokenize conj)) - (map nlp/grammar-tree-frequencies) - (map (partial into {}))) - (fn - ([acc] - (sort-by (comp - second) acc)) - ([acc m] - (merge-with + acc m))) - {} - files)) +(comment + ;; TODO: Move to nlp.core + (defn xf-grammar-database + [database] + (fn [sentence] + (let [leafs (->> sentence + nlp/treebank-zipper + nlp/leaf-pos-path-word-freqs)] + (run! + (fn [[k v]] + (swap! + database + assoc + k + (merge-with + (@database k) v))) + leafs) + sentence)))) + +(comment + ;; TODO: remove or move to nlp.core + (defn file-seq->grammar-tree + [files] + (transduce + (comp + (xf-file-seq 0 1000) + (map slurp) + (map #(string/split % #"[\n+\?\.]")) + (map (partial transduce xf-tokenize conj)) + (map (partial transduce xf-filter-english conj)) + (map (partial remove empty?)) + (remove empty?) + (map (partial transduce xf-untokenize conj)) + (map nlp/grammar-tree-frequencies) + (map (partial into {}))) + (fn + ([acc] + (sort-by (comp - second) acc)) + ([acc m] + (merge-with + acc m))) + {} + files))) (comment (time @@ -699,27 +679,29 @@ ) -(defn file-seq->part-of-speech-freqs - [files] - (transduce - (comp - (xf-file-seq 0 1000) - (map slurp) - (map #(string/split % #"[\n+\?\.]")) - (map (partial transduce xf-tokenize conj)) - (map (partial transduce xf-filter-english conj)) - (map (partial remove empty?)) - (remove empty?) - (map (partial transduce xf-untokenize conj)) - (map (partial map nlp/treebank-zipper)) - (map (partial map nlp/leaf-pos-path-word-freqs)) - (map (partial reduce (fn [acc m] - (nlp/deep-merge-with + acc m)) {}))) - (completing - (fn [result input] - (nlp/deep-merge-with + result input))) - {} - files)) +(comment + ;; TODO: Remove or move to nlp.core + (defn file-seq->part-of-speech-freqs + [files] + (transduce + (comp + (xf-file-seq 0 1000) + (map slurp) + (map #(string/split % #"[\n+\?\.]")) + (map (partial transduce xf-tokenize conj)) + (map (partial transduce xf-filter-english conj)) + (map (partial remove empty?)) + (remove empty?) + (map (partial transduce xf-untokenize conj)) + (map (partial map nlp/treebank-zipper)) + (map (partial map nlp/leaf-pos-path-word-freqs)) + (map (partial reduce (fn [acc m] + (nlp/deep-merge-with + acc m)) {}))) + (completing + (fn [result input] + (nlp/deep-merge-with + result input))) + {} + files))) (comment (time (->> (file-seq->part-of-speech-freqs @@ -732,24 +714,26 @@ ) -(defn file-seq->parts-of-speech-trie - [files] - (transduce - (comp - (xf-file-seq 0 1000) - (map slurp) - (map #(string/split % #"[\n+\?\.]")) - (map (partial transduce xf-tokenize conj)) - (map (partial transduce xf-filter-english conj)) - (map (partial remove empty?)) - (remove empty?) - (map (partial transduce xf-untokenize conj)) - (map nlp/grammar-tree-frequencies) - (map (partial into {}))) - (fn - ([acc] - (sort-by (comp - second) acc)) - ([acc m] - (merge-with + acc m))) - {} - files)) +(comment + ;; TODO: Remove or move to nlp.core + (defn file-seq->parts-of-speech-trie + [files] + (transduce + (comp + (xf-file-seq 0 1000) + (map slurp) + (map #(string/split % #"[\n+\?\.]")) + (map (partial transduce xf-tokenize conj)) + (map (partial transduce xf-filter-english conj)) + (map (partial remove empty?)) + (remove empty?) + (map (partial transduce xf-untokenize conj)) + (map nlp/grammar-tree-frequencies) + (map (partial into {}))) + (fn + ([acc] + (sort-by (comp - second) acc)) + ([acc m] + (merge-with + acc m))) + {} + files))) diff --git a/src/com/owoga/prhyme/nlg/prhyme_nlg.clj b/src/com/owoga/prhyme/nlg/prhyme_nlg.clj index aa9580e..f00b547 100644 --- a/src/com/owoga/prhyme/nlg/prhyme_nlg.clj +++ b/src/com/owoga/prhyme/nlg/prhyme_nlg.clj @@ -2,9 +2,13 @@ (:require [clojure.zip :as zip] [clojure.string :as string] [taoensso.timbre :as timbre] + [com.owoga.prhyme.util.math :as math] [examples.core :as examples] [taoensso.nippy :as nippy] [com.owoga.prhyme.nlp.core :as nlp] + [clojure.java.io :as io] + [com.owoga.prhyme.data-transform :as df] + [com.owoga.trie :as trie] [com.owoga.prhyme.util.weighted-rand :as weighted-rand] [clojure.set :as set])) @@ -403,3 +407,181 @@ [(TOP (NP (NP (NN)) (PP (IN) (NP (PRP$) (NN))))) 218] [(TOP (NP (JJ) (NNS))) 211] [(TOP (VB)) 204])) + + +(comment + (def test-database (atom {::nlp/next-id 1})) + + (def texts + (eduction + (comp (df/xf-file-seq 0 250000) + (map slurp)) + (file-seq (io/file "dark-corpus")))) + + (time + (def test-trie + (transduce + (comp + (map + (fn [text] + (try + (nlp/text->grammar-trie-map-entry text) + (catch Exception e + (throw e))))) + (map (partial map (nlp/make-database-stateful-xf test-database)))) + (completing + (fn [trie entries] + (reduce + (fn [trie [k v]] + (update trie k (fnil inc 0))) + trie + entries))) + (trie/make-trie) + (take 300 texts)))) + + ) + +(defn children + [trie database k] + (->> (trie/lookup trie k) + (trie/children) + (map #(vector (.key %) (get % []))) + (remove (comp nil? second)) + (sort-by (comp - second)))) + +(defn choose + [trie database k] + (math/weighted-selection + second + (children trie database k))) + +(defn markov-generate-grammar + [trie database zipper] + (cond + (zip/end? zipper) + (zip/root zipper) + + (seqable? (zip/node zipper)) + (recur trie database (zip/next zipper)) + + (symbol? (zip/node zipper)) + (recur trie database (zip/next zipper)) + + (symbol? (database (zip/node zipper))) + (let [sym (database (zip/node zipper)) + sym-path (->> (map first (zip/path zipper)) + butlast + (filter symbol?) + (#(concat % (list sym)))) + path (map database sym-path) + choice (first (choose trie database path))] + (recur + trie + database + (-> zipper + (zip/replace + [sym choice]) + (zip/root) + (zip/vector-zip)))) + + (string? (database (zip/node zipper))) + (let [terminal (database (zip/node zipper)) + path (->> (map first (zip/path zipper)) + butlast + (filter symbol?))] + (recur + trie + database + (-> zipper + zip/remove + zip/root + zip/vector-zip))) + + :else + (recur + trie + database + (-> zipper + (zip/replace + (mapv + database + (database (zip/node zipper)))) + (zip/next) + (zip/root) + (zip/vector-zip))))) + +(comment + (markov-generate-grammar test-trie @test-database (zip/vector-zip [1])) + + ) + +(defn markov-generate-sentence + [trie database zipper] + (cond + (zip/end? zipper) + (zip/root zipper) + + (seqable? (zip/node zipper)) + (recur trie database (zip/next zipper)) + + (symbol? (zip/node zipper)) + (recur trie database (zip/next zipper)) + + (symbol? (database (zip/node zipper))) + (let [sym (database (zip/node zipper)) + sym-path (->> (map first (zip/path zipper)) + butlast + (filter symbol?) + (#(concat % (list sym)))) + path (map database sym-path) + choice (first (choose trie database path))] + (recur + trie + database + (-> zipper + (zip/replace + [sym choice]) + (zip/root) + (zip/vector-zip)))) + + (string? (database (zip/node zipper))) + (let [terminal (database (zip/node zipper)) + path (->> (map first (zip/path zipper)) + butlast + (filter symbol?))] + (recur + trie + database + (-> zipper + (zip/replace + terminal) + (zip/next) + (zip/root) + (zip/vector-zip)))) + + :else + (recur + trie + database + (-> zipper + (zip/replace + (mapv + database + (database (zip/node zipper)))) + (zip/next) + (zip/root) + (zip/vector-zip))))) + +(comment + (generate test-trie @test-database (zip/vector-zip [1])) + + (repeatedly + 20 + #(->> (generate test-trie @test-database (zip/vector-zip [1])) + (zip/vector-zip) + (iterate zip/next) + (take-while (complement zip/end?)) + (map zip/node) + (filter string?))) + + ) diff --git a/src/com/owoga/prhyme/nlp/core.clj b/src/com/owoga/prhyme/nlp/core.clj index a88629b..47d440f 100644 --- a/src/com/owoga/prhyme/nlp/core.clj +++ b/src/com/owoga/prhyme/nlp/core.clj @@ -2,11 +2,14 @@ (:require [opennlp.nlp :as nlp] [opennlp.treebank :as tb] [clojure.string :as string] + [com.owoga.prhyme.data-transform :as df] + [com.owoga.trie :as trie] [clojure.java.io :as io] [clojure.zip :as zip] [com.owoga.prhyme.nlp.tag-sets.treebank-ii :as tb2] [com.owoga.prhyme.util.weighted-rand :as weighted-rand] - [clojure.walk :as walk]) + [clojure.walk :as walk] + [com.owoga.prhyme.data.dictionary :as dict]) (:import (opennlp.tools.postag POSModel POSTaggerME) (opennlp.tools.parser Parse ParserModel ParserFactory) @@ -1209,3 +1212,118 @@ ;; [(TOP S NP PP NP NN) ("today")]) ) + + +;;;; Grammar Trie +;; +;; Create a trie from treebank parsed grammar trees. + +(defn -split-text-into-sentences + "Splits text on newlines, periods, exclamation and question marks." + [text] + (->> text + (#(string/replace % #"([\.\?\!\n]+)" "$1\n")) + (string/split-lines))) + +(defn -flatten-trie-entry-to-all-subkeys + "Turns + + [[k1 k2 k3] v] + + into + + [[[k1 k2 k3] v]] + [[k2 k3] v]] + [[k3] v]]] + + This is useful for creating a trie from a grammar tree. It's + nice to know that k3 is a child of both [k1 k2] and [k2] so + if you need to generate a [k2] in isolation, you have + acces to [k1 k2] and [k4 k2] and [kn k2] etc... all under the + top-level key [k2]. + " + [[k v]] + (loop [result [] + k k] + (if (empty? k) + result + (recur (conj result [k v]) + (rest k))))) + +(defn -normalize-text + [[k v]] + (if (string? (first v)) + [k (string/lower-case (first v))] + [k v])) + +(defn english? + [text] + (->> text + (#(string/replace % #"\W" " ")) + (#(string/replace % #" +" " ")) + (#(string/split % #" ")) + (every? #(dict/cmu-with-stress-map (string/lower-case %))))) + +(defn text->grammar-trie-map-entry + "Processes text into key value pairs where + the keys are parts-of-speech paths and the values + are the children at that path. + + Ready to be inserted into a trie." + [text] + (->> text + (-split-text-into-sentences) + (map string/trim) + (remove empty?) + (mapv treebank-zipper) + (remove nil?) + (map parts-of-speech-trie-entries) + (reduce into []) + (map -flatten-trie-entry-to-all-subkeys) + (reduce into []) + (mapv -normalize-text) + (mapv (fn [[k v]] + (clojure.lang.MapEntry. (into (vec k) [v]) v))))) + +(defn -new-key + "Associates key with an auto-incrementing ID + and the ID with the key. + + This 'database' is an atom that maps + keys to integer ids and integer ids to keys. + + This lets us use integers throughout the trie data structure, + which ends up being a lot more efficient and prepares the trie + for being turned into a tightly-packed-trie." + [database k] + (let [next-id (@database ::next-id)] + (swap! + database + #(-> % + (assoc k next-id) + (assoc next-id k) + (update ::next-id inc))) + next-id)) + +(defn make-database-stateful-xf + "This 'database' is an atom that maps + keys to integer ids and integer ids to keys. + + This lets us use integers throughout the trie data structure, + which ends up being a lot more efficient and prepares the trie + for being turned into a tightly-packed-trie. + + Takes an atom and returns a function that takes a Trie key/value. + When the returned function is called, it checks to see + if the key is in the database and if so it returns the associated id. + If not, it increments the id (which is stored in the database + under :next-id) and returns that new id." + [database] + (fn [[k v]] + (let [k' (mapv (fn [kn] + (if-let [id (get @database kn)] + id + (-new-key database kn))) + k)] + [k' 1]))) +