Working part-of-speech markov trie

main
Eric Ihli 3 years ago
parent b766f836d3
commit 78b9977fe0

@ -88,11 +88,11 @@
under :next-id) and returns that new id." under :next-id) and returns that new id."
[database] [database]
(fn [[k v]] (fn [[k v]]
(let [k' (map (fn [kn] (let [k' (mapv (fn [kn]
(if-let [id (get @database kn)] (if-let [id (get @database kn)]
id id
(new-key database kn))) (new-key database kn)))
k)] k)]
[k' 1]))) [k' 1])))
(defn xf-part-of-speech-database (defn xf-part-of-speech-database
@ -175,89 +175,119 @@
(tpt/tightly-packed-trie trie encode-fn decode-fn)) (tpt/tightly-packed-trie trie encode-fn decode-fn))
(def texts (eduction (def texts (eduction
(comp (xf-file-seq 0 10) (comp (xf-file-seq 0 100)
(map slurp)) (map slurp))
(file-seq (io/file "dark-corpus")))) (file-seq (io/file "dark-corpus"))))
(defn split-text-into-sentences (defn split-text-into-sentences
[text] [text]
(->> text (->> text
(#(string/replace % #"([\.\?\!])" "$1\n")) (#(string/replace % #"([\.\?\!\n]+)" "$1\n"))
(string/split-lines))) (string/split-lines)))
(defn mapmap (defn mapmap
[fn & body] [fn & body]
(apply map (partial map fn) body)) (apply map (partial map fn) body))
(defn treebank-zipper->trie-map-entries (defn normalize-text
[treebank-zipper] [[k v]]
(let [leaf-paths (nlp/leaf-pos-paths treebank-zipper)] (if (string? (first v))
leaf-paths)) [k (string/lower-case (first v))]
[k v]))
(comment
(treebank-zipper->trie-map-entries
(zip/seq-zip
'(TOP
((S
((NP
((NP ((NN ("Everything")))) (PP ((IN ("of")) (NP ((NN ("today"))))))))
(VP ((VBZ ("is")) (VP ((VBG ("falling"))))))
(. ("."))))))))
(defn breadth-first-search [z]
(letfn [(zip-children [loc]
(when-let [first-child (zip/down loc)]
(take-while (comp not nil?)
(iterate zip/right first-child))))]
(loop [ret []
queue (conj clojure.lang.PersistentQueue/EMPTY z)]
(if (seq queue)
(let [[node children] ((juxt zip/node zip-children) (peek queue))]
(recur (conj ret node) (into (pop queue) children)))
ret))))
(filter
symbol?
(breadth-first-search
(zip/seq-zip
'(TOP
((S
((NP
((NP ((NN ("Everything")))) (PP ((IN ("of")) (NP ((NN ("today"))))))))
(VP ((VBZ ("is")) (VP ((VBG ("falling"))))))
(. (".")))))))))
(->> (zip/seq-zip
'(TOP
((S
((NP
((NP ((NN ("Everything")))) (PP ((IN ("of")) (NP ((NN ("today"))))))))
(VP ((VBZ ("is")) (VP ((VBG ("falling"))))))
(. (".")))))))
(iterate zip/next)
(take 10)
last
(zip/path)
(map first)
(filter symbol?))
)
(defn process-text (defn process-text
"Processes text into key value pairs where
the keys are parts-of-speech paths and the values
are the children at that path.
Ready to be inserted into a trie."
[text] [text]
(->> text (->> text
(split-text-into-sentences) (split-text-into-sentences)
(map string/trim) (map string/trim)
(remove empty?)
(map nlp/treebank-zipper) (map nlp/treebank-zipper)
(map nlp/leaf-pos-paths))) (remove nil?)
(map nlp/parts-of-speech-trie-entries)
(mapv (fn [file]
(mapv (fn [line]
(mapv vec line))
file)))
(reduce into [])
(mapv normalize-text)
(mapv (fn [[k v]]
(clojure.lang.MapEntry. (into k [v]) v)))))
(comment (comment
(into (map process-text texts)
#_(trie/make-trie)
(def test-database (atom {:next-id 1}))
(transduce
(comp
(map process-text))
conj
[] []
(map process-text)
texts) texts)
(take 20 @test-database)
;; => ([[DT JJR] 394]
;; [558 "progress"]
;; [453 "peace"]
;; [584 "rather"]
;; [487 "avoid"]
;; ["teaches" 315]
;; [519 [NP NP]]
;; [[VB ADJP] 482]
;; [357 INTJ]
;; [275 [VBP NP S]]
;; [NP 10]
;; [[NN .] 358]
;; ["skin" 384]
;; [530 "yourself"]
;; [[VBD NP] 173]
;; ["strikes" 101]
;; [389 "his"]
;; ["look" 259]
;; [[RB JJ] 196]
;; ["products" 179])
(def test-trie
(transduce
(comp
(map (fn [text]
(try
(process-text text)
(catch Exception e
(println text)
(throw e)))))
(map (partial map (make-database-processor test-database))))
(completing
(fn [trie entries]
(reduce
(fn [trie [k v]]
(update trie k (fnil #(update % 1 inc) [k 0])))
trie
entries)))
(trie/make-trie)
texts))
(->> test-trie
(take 20)
(map (fn [[k v]]
[(map @test-database k)
(last v)])))
(->> (take 100 test-trie))
(@test-database 16)
(update
(conj (assoc (trie/make-trie) '[top s [np vp .]] '[np])
'[[top s [s]] [s]])
'[[top s]] (fnil #(update % 1 inc) [:freq 0]))
(update {['top] 1} ['top] inc)
) )
(comment (comment

@ -12,6 +12,7 @@
ParserFactory) ParserFactory)
(opennlp.tools.cmdline.parser ParserTool))) (opennlp.tools.cmdline.parser ParserTool)))
(comment tb2/phrases)
(def tokenize (nlp/make-tokenizer (io/resource "models/en-token.bin"))) (def tokenize (nlp/make-tokenizer (io/resource "models/en-token.bin")))
(def get-sentences (nlp/make-sentence-detector (io/resource "models/en-sent.bin"))) (def get-sentences (nlp/make-sentence-detector (io/resource "models/en-sent.bin")))
(def parse (tb/make-treebank-parser (io/resource "models/en-parser-chunking.bin"))) (def parse (tb/make-treebank-parser (io/resource "models/en-parser-chunking.bin")))
@ -231,17 +232,23 @@
Porcelain. If you have the simple tree data structure Porcelain. If you have the simple tree data structure
returned by `parse-to-simple-tree`, then you can just returned by `parse-to-simple-tree`, then you can just
pass that directly to `zip/seq-zip`." pass that directly to `zip/seq-zip`.
Returns nil if something can't be parsed. This will be
the case for empty strings."
[text] [text]
(let [tree (->> text (try
tokenize (let [tree (->> text
(string/join " ") tokenize
vector (string/join " ")
parse vector
first parse
tb/make-tree first
unmake-tree)] tb/make-tree
(zip/seq-zip tree))) unmake-tree)]
(doall (zip/seq-zip tree)))
(catch Exception e
nil)))
(comment (comment
;; Here is a demo of zipping through a parse tree and changing ;; Here is a demo of zipping through a parse tree and changing
@ -1102,10 +1109,14 @@
(defn breadth-first (defn breadth-first
[zipper] [zipper]
(letfn [(zip-children [loc] (letfn [(zip-children [loc]
(when-let [first-child (zip/down loc)] (try
(take-while (when-let [first-child (zip/down loc)]
(comp not nil?) (take-while
(iterate zip/right first-child))))] (complement nil?)
(iterate zip/right first-child)))
(catch Exception e
(println (zip/root loc))
(throw e))))]
(loop [result [] (loop [result []
queue (conj clojure.lang.PersistentQueue/EMPTY zipper)] queue (conj clojure.lang.PersistentQueue/EMPTY zipper)]
(if (seq queue) (if (seq queue)
@ -1135,27 +1146,44 @@
(map first))]))) (map first))])))
(remove (comp nil? second)))) (remove (comp nil? second))))
(defn parts-of-speech-trie-entries
"Given a zipper of a treebank parse tree, returns a sequence of
key-value pairs where the key is a sequence of parts-of-speech
to traverse down the tree and the values are the children
in the parse tree at that path.
This can be plugged into a Trie with frequency data to
give you the following kind of info:
{'(TOP)
{'(S) {:freq 534
'(NP VB) {:freq 233}
'(NP ADJP VB {:freq 210})
,,,}
'(SBARQ) {:freq 110}
'(SQ) {:freq 23}}}
"
[zipper]
(try
(->> (breadth-first zipper)
(filter (comp symbol? zip/node))
(map zip/prev)
(filter zip/branch?)
(mapv (fn [loc]
[(->> (zip/next loc)
(zip/path)
(map first)
(filter symbol?))
(let [child (zip/next (zip/next loc))]
(if (zip/branch? (zip/next child))
(map first (zip/node child))
(zip/node child)))])))
(catch Exception e
(println (zip/node zipper))
(throw e))))
(comment (comment
(->> (zip/vector-zip [1 [2 [3]]]) (parts-of-speech-trie-entries
(iterate zip/next)
(take 6)
last
zip/path
(map first))
(->> (breadth-first
(zip/seq-zip
'(TOP
((S
((NP
((NP ((NN ("Everything")))) (PP ((IN ("of")) (NP ((NN ("today"))))))))
(VP ((VBZ ("is")) (VP ((VBG ("falling"))))))
(. ("."))))))))
(map loc-children)
(filter seq?)
)
(part-of-speech-children
(zip/seq-zip (zip/seq-zip
'(TOP '(TOP
((S ((S
@ -1163,25 +1191,19 @@
((NP ((NN ("Everything")))) (PP ((IN ("of")) (NP ((NN ("today")))))))) ((NP ((NN ("Everything")))) (PP ((IN ("of")) (NP ((NN ("today"))))))))
(VP ((VBZ ("is")) (VP ((VBG ("falling")))))) (VP ((VBZ ("is")) (VP ((VBG ("falling"))))))
(. (".")))))))) (. ("."))))))))
;; => ([(TOP) (S)]
(->> (zip/seq-zip ;; [(TOP S) (NP VP .)]
'(TOP ;; [(TOP S NP) (NP PP)]
((S ;; [(TOP S VP) (VBZ VP)]
((NP ;; [(TOP S .) (".")]
((NP ((NN ("Everything")))) (PP ((IN ("of")) (NP ((NN ("today")))))))) ;; [(TOP S NP NP) (NN)]
(VP ((VBZ ("is")) (VP ((VBG ("falling")))))) ;; [(TOP S NP PP) (IN NP)]
(. ("."))))))) ;; [(TOP S VP VBZ) ("is")]
(zip/next) ;; [(TOP S VP VP) (VBG)]
(zip/next) ;; [(TOP S NP NP NN) ("Everything")]
(zip/next) ;; [(TOP S NP PP IN) ("of")]
(zip/next) ;; [(TOP S NP PP NP) (NN)]
(zip/next) ;; [(TOP S VP VP VBG) ("falling")]
(zip/node) ;; [(TOP S NP PP NP NN) ("today")])
#_#_(loc-children)
(map first))
) )
(comment
(defn part-of-speech-n-grams
[zipper]
(letfn [(fn step [path []])])))

Loading…
Cancel
Save