From 78b9977fe0250d2e332246d2da983af1931c1b65 Mon Sep 17 00:00:00 2001 From: Eric Ihli Date: Wed, 28 Apr 2021 19:38:38 -0500 Subject: [PATCH] Working part-of-speech markov trie --- src/com/owoga/prhyme/data_transform.clj | 158 ++++++++++++++---------- src/com/owoga/prhyme/nlp/core.clj | 130 +++++++++++-------- 2 files changed, 170 insertions(+), 118 deletions(-) diff --git a/src/com/owoga/prhyme/data_transform.clj b/src/com/owoga/prhyme/data_transform.clj index b90f34e..b07bfdb 100644 --- a/src/com/owoga/prhyme/data_transform.clj +++ b/src/com/owoga/prhyme/data_transform.clj @@ -88,11 +88,11 @@ under :next-id) and returns that new id." [database] (fn [[k v]] - (let [k' (map (fn [kn] - (if-let [id (get @database kn)] - id - (new-key database kn))) - k)] + (let [k' (mapv (fn [kn] + (if-let [id (get @database kn)] + id + (new-key database kn))) + k)] [k' 1]))) (defn xf-part-of-speech-database @@ -175,89 +175,119 @@ (tpt/tightly-packed-trie trie encode-fn decode-fn)) (def texts (eduction - (comp (xf-file-seq 0 10) + (comp (xf-file-seq 0 100) (map slurp)) (file-seq (io/file "dark-corpus")))) (defn split-text-into-sentences [text] (->> text - (#(string/replace % #"([\.\?\!])" "$1\n")) + (#(string/replace % #"([\.\?\!\n]+)" "$1\n")) (string/split-lines))) (defn mapmap [fn & body] (apply map (partial map fn) body)) -(defn treebank-zipper->trie-map-entries - [treebank-zipper] - (let [leaf-paths (nlp/leaf-pos-paths treebank-zipper)] - leaf-paths)) - -(comment - (treebank-zipper->trie-map-entries - (zip/seq-zip - '(TOP - ((S - ((NP - ((NP ((NN ("Everything")))) (PP ((IN ("of")) (NP ((NN ("today")))))))) - (VP ((VBZ ("is")) (VP ((VBG ("falling")))))) - (. (".")))))))) - - (defn breadth-first-search [z] - (letfn [(zip-children [loc] - (when-let [first-child (zip/down loc)] - (take-while (comp not nil?) - (iterate zip/right first-child))))] - (loop [ret [] - queue (conj clojure.lang.PersistentQueue/EMPTY z)] - (if (seq queue) - (let [[node children] ((juxt zip/node zip-children) (peek queue))] - (recur (conj ret node) (into (pop queue) children))) - ret)))) - - (filter - symbol? - (breadth-first-search - (zip/seq-zip - '(TOP - ((S - ((NP - ((NP ((NN ("Everything")))) (PP ((IN ("of")) (NP ((NN ("today")))))))) - (VP ((VBZ ("is")) (VP ((VBG ("falling")))))) - (. ("."))))))))) - - (->> (zip/seq-zip - '(TOP - ((S - ((NP - ((NP ((NN ("Everything")))) (PP ((IN ("of")) (NP ((NN ("today")))))))) - (VP ((VBZ ("is")) (VP ((VBG ("falling")))))) - (. ("."))))))) - (iterate zip/next) - (take 10) - last - (zip/path) - (map first) - (filter symbol?)) - - ) +(defn normalize-text + [[k v]] + (if (string? (first v)) + [k (string/lower-case (first v))] + [k v])) (defn process-text + "Processes text into key value pairs where + the keys are parts-of-speech paths and the values + are the children at that path. + + Ready to be inserted into a trie." [text] (->> text (split-text-into-sentences) (map string/trim) + (remove empty?) (map nlp/treebank-zipper) - (map nlp/leaf-pos-paths))) + (remove nil?) + (map nlp/parts-of-speech-trie-entries) + (mapv (fn [file] + (mapv (fn [line] + (mapv vec line)) + file))) + (reduce into []) + (mapv normalize-text) + (mapv (fn [[k v]] + (clojure.lang.MapEntry. (into k [v]) v))))) (comment - (into - #_(trie/make-trie) + (map process-text texts) + + (def test-database (atom {:next-id 1})) + + (transduce + (comp + (map process-text)) + conj [] - (map process-text) texts) + (take 20 @test-database) + ;; => ([[DT JJR] 394] + ;; [558 "progress"] + ;; [453 "peace"] + ;; [584 "rather"] + ;; [487 "avoid"] + ;; ["teaches" 315] + ;; [519 [NP NP]] + ;; [[VB ADJP] 482] + ;; [357 INTJ] + ;; [275 [VBP NP S]] + ;; [NP 10] + ;; [[NN .] 358] + ;; ["skin" 384] + ;; [530 "yourself"] + ;; [[VBD NP] 173] + ;; ["strikes" 101] + ;; [389 "his"] + ;; ["look" 259] + ;; [[RB JJ] 196] + ;; ["products" 179]) + (def test-trie + (transduce + (comp + (map (fn [text] + (try + (process-text text) + (catch Exception e + (println text) + (throw e))))) + (map (partial map (make-database-processor test-database)))) + (completing + (fn [trie entries] + (reduce + (fn [trie [k v]] + (update trie k (fnil #(update % 1 inc) [k 0]))) + trie + entries))) + (trie/make-trie) + texts)) + + (->> test-trie + (take 20) + (map (fn [[k v]] + [(map @test-database k) + (last v)]))) + + (->> (take 100 test-trie)) + + + (@test-database 16) + + (update + (conj (assoc (trie/make-trie) '[top s [np vp .]] '[np]) + '[[top s [s]] [s]]) + + '[[top s]] (fnil #(update % 1 inc) [:freq 0])) + (update {['top] 1} ['top] inc) ) (comment diff --git a/src/com/owoga/prhyme/nlp/core.clj b/src/com/owoga/prhyme/nlp/core.clj index cc30c8a..a40a080 100644 --- a/src/com/owoga/prhyme/nlp/core.clj +++ b/src/com/owoga/prhyme/nlp/core.clj @@ -12,6 +12,7 @@ ParserFactory) (opennlp.tools.cmdline.parser ParserTool))) +(comment tb2/phrases) (def tokenize (nlp/make-tokenizer (io/resource "models/en-token.bin"))) (def get-sentences (nlp/make-sentence-detector (io/resource "models/en-sent.bin"))) (def parse (tb/make-treebank-parser (io/resource "models/en-parser-chunking.bin"))) @@ -231,17 +232,23 @@ Porcelain. If you have the simple tree data structure returned by `parse-to-simple-tree`, then you can just - pass that directly to `zip/seq-zip`." + pass that directly to `zip/seq-zip`. + + Returns nil if something can't be parsed. This will be + the case for empty strings." [text] - (let [tree (->> text - tokenize - (string/join " ") - vector - parse - first - tb/make-tree - unmake-tree)] - (zip/seq-zip tree))) + (try + (let [tree (->> text + tokenize + (string/join " ") + vector + parse + first + tb/make-tree + unmake-tree)] + (doall (zip/seq-zip tree))) + (catch Exception e + nil))) (comment ;; Here is a demo of zipping through a parse tree and changing @@ -1102,10 +1109,14 @@ (defn breadth-first [zipper] (letfn [(zip-children [loc] - (when-let [first-child (zip/down loc)] - (take-while - (comp not nil?) - (iterate zip/right first-child))))] + (try + (when-let [first-child (zip/down loc)] + (take-while + (complement nil?) + (iterate zip/right first-child))) + (catch Exception e + (println (zip/root loc)) + (throw e))))] (loop [result [] queue (conj clojure.lang.PersistentQueue/EMPTY zipper)] (if (seq queue) @@ -1135,27 +1146,44 @@ (map first))]))) (remove (comp nil? second)))) +(defn parts-of-speech-trie-entries + "Given a zipper of a treebank parse tree, returns a sequence of + key-value pairs where the key is a sequence of parts-of-speech + to traverse down the tree and the values are the children + in the parse tree at that path. + + This can be plugged into a Trie with frequency data to + give you the following kind of info: + + {'(TOP) + {'(S) {:freq 534 + '(NP VB) {:freq 233} + '(NP ADJP VB {:freq 210}) + ,,,} + '(SBARQ) {:freq 110} + '(SQ) {:freq 23}}} + " + [zipper] + (try + (->> (breadth-first zipper) + (filter (comp symbol? zip/node)) + (map zip/prev) + (filter zip/branch?) + (mapv (fn [loc] + [(->> (zip/next loc) + (zip/path) + (map first) + (filter symbol?)) + (let [child (zip/next (zip/next loc))] + (if (zip/branch? (zip/next child)) + (map first (zip/node child)) + (zip/node child)))]))) + (catch Exception e + (println (zip/node zipper)) + (throw e)))) + (comment - (->> (zip/vector-zip [1 [2 [3]]]) - (iterate zip/next) - (take 6) - last - zip/path - (map first)) - - (->> (breadth-first - (zip/seq-zip - '(TOP - ((S - ((NP - ((NP ((NN ("Everything")))) (PP ((IN ("of")) (NP ((NN ("today")))))))) - (VP ((VBZ ("is")) (VP ((VBG ("falling")))))) - (. (".")))))))) - (map loc-children) - (filter seq?) - ) - - (part-of-speech-children + (parts-of-speech-trie-entries (zip/seq-zip '(TOP ((S @@ -1163,25 +1191,19 @@ ((NP ((NN ("Everything")))) (PP ((IN ("of")) (NP ((NN ("today")))))))) (VP ((VBZ ("is")) (VP ((VBG ("falling")))))) (. (".")))))))) - - (->> (zip/seq-zip - '(TOP - ((S - ((NP - ((NP ((NN ("Everything")))) (PP ((IN ("of")) (NP ((NN ("today")))))))) - (VP ((VBZ ("is")) (VP ((VBG ("falling")))))) - (. ("."))))))) - (zip/next) - (zip/next) - (zip/next) - (zip/next) - (zip/next) - (zip/node) - #_#_(loc-children) - (map first)) + ;; => ([(TOP) (S)] + ;; [(TOP S) (NP VP .)] + ;; [(TOP S NP) (NP PP)] + ;; [(TOP S VP) (VBZ VP)] + ;; [(TOP S .) (".")] + ;; [(TOP S NP NP) (NN)] + ;; [(TOP S NP PP) (IN NP)] + ;; [(TOP S VP VBZ) ("is")] + ;; [(TOP S VP VP) (VBG)] + ;; [(TOP S NP NP NN) ("Everything")] + ;; [(TOP S NP PP IN) ("of")] + ;; [(TOP S NP PP NP) (NN)] + ;; [(TOP S VP VP VBG) ("falling")] + ;; [(TOP S NP PP NP NN) ("today")]) ) -(comment - (defn part-of-speech-n-grams - [zipper] - (letfn [(fn step [path []])])))