Working part-of-speech markov trie

main
Eric Ihli 3 years ago
parent b766f836d3
commit 78b9977fe0

@ -88,11 +88,11 @@
under :next-id) and returns that new id."
[database]
(fn [[k v]]
(let [k' (map (fn [kn]
(if-let [id (get @database kn)]
id
(new-key database kn)))
k)]
(let [k' (mapv (fn [kn]
(if-let [id (get @database kn)]
id
(new-key database kn)))
k)]
[k' 1])))
(defn xf-part-of-speech-database
@ -175,89 +175,119 @@
(tpt/tightly-packed-trie trie encode-fn decode-fn))
(def texts (eduction
(comp (xf-file-seq 0 10)
(comp (xf-file-seq 0 100)
(map slurp))
(file-seq (io/file "dark-corpus"))))
(defn split-text-into-sentences
[text]
(->> text
(#(string/replace % #"([\.\?\!])" "$1\n"))
(#(string/replace % #"([\.\?\!\n]+)" "$1\n"))
(string/split-lines)))
(defn mapmap
[fn & body]
(apply map (partial map fn) body))
(defn treebank-zipper->trie-map-entries
[treebank-zipper]
(let [leaf-paths (nlp/leaf-pos-paths treebank-zipper)]
leaf-paths))
(comment
(treebank-zipper->trie-map-entries
(zip/seq-zip
'(TOP
((S
((NP
((NP ((NN ("Everything")))) (PP ((IN ("of")) (NP ((NN ("today"))))))))
(VP ((VBZ ("is")) (VP ((VBG ("falling"))))))
(. ("."))))))))
(defn breadth-first-search [z]
(letfn [(zip-children [loc]
(when-let [first-child (zip/down loc)]
(take-while (comp not nil?)
(iterate zip/right first-child))))]
(loop [ret []
queue (conj clojure.lang.PersistentQueue/EMPTY z)]
(if (seq queue)
(let [[node children] ((juxt zip/node zip-children) (peek queue))]
(recur (conj ret node) (into (pop queue) children)))
ret))))
(filter
symbol?
(breadth-first-search
(zip/seq-zip
'(TOP
((S
((NP
((NP ((NN ("Everything")))) (PP ((IN ("of")) (NP ((NN ("today"))))))))
(VP ((VBZ ("is")) (VP ((VBG ("falling"))))))
(. (".")))))))))
(->> (zip/seq-zip
'(TOP
((S
((NP
((NP ((NN ("Everything")))) (PP ((IN ("of")) (NP ((NN ("today"))))))))
(VP ((VBZ ("is")) (VP ((VBG ("falling"))))))
(. (".")))))))
(iterate zip/next)
(take 10)
last
(zip/path)
(map first)
(filter symbol?))
)
(defn normalize-text
[[k v]]
(if (string? (first v))
[k (string/lower-case (first v))]
[k v]))
(defn process-text
"Processes text into key value pairs where
the keys are parts-of-speech paths and the values
are the children at that path.
Ready to be inserted into a trie."
[text]
(->> text
(split-text-into-sentences)
(map string/trim)
(remove empty?)
(map nlp/treebank-zipper)
(map nlp/leaf-pos-paths)))
(remove nil?)
(map nlp/parts-of-speech-trie-entries)
(mapv (fn [file]
(mapv (fn [line]
(mapv vec line))
file)))
(reduce into [])
(mapv normalize-text)
(mapv (fn [[k v]]
(clojure.lang.MapEntry. (into k [v]) v)))))
(comment
(into
#_(trie/make-trie)
(map process-text texts)
(def test-database (atom {:next-id 1}))
(transduce
(comp
(map process-text))
conj
[]
(map process-text)
texts)
(take 20 @test-database)
;; => ([[DT JJR] 394]
;; [558 "progress"]
;; [453 "peace"]
;; [584 "rather"]
;; [487 "avoid"]
;; ["teaches" 315]
;; [519 [NP NP]]
;; [[VB ADJP] 482]
;; [357 INTJ]
;; [275 [VBP NP S]]
;; [NP 10]
;; [[NN .] 358]
;; ["skin" 384]
;; [530 "yourself"]
;; [[VBD NP] 173]
;; ["strikes" 101]
;; [389 "his"]
;; ["look" 259]
;; [[RB JJ] 196]
;; ["products" 179])
(def test-trie
(transduce
(comp
(map (fn [text]
(try
(process-text text)
(catch Exception e
(println text)
(throw e)))))
(map (partial map (make-database-processor test-database))))
(completing
(fn [trie entries]
(reduce
(fn [trie [k v]]
(update trie k (fnil #(update % 1 inc) [k 0])))
trie
entries)))
(trie/make-trie)
texts))
(->> test-trie
(take 20)
(map (fn [[k v]]
[(map @test-database k)
(last v)])))
(->> (take 100 test-trie))
(@test-database 16)
(update
(conj (assoc (trie/make-trie) '[top s [np vp .]] '[np])
'[[top s [s]] [s]])
'[[top s]] (fnil #(update % 1 inc) [:freq 0]))
(update {['top] 1} ['top] inc)
)
(comment

@ -12,6 +12,7 @@
ParserFactory)
(opennlp.tools.cmdline.parser ParserTool)))
(comment tb2/phrases)
(def tokenize (nlp/make-tokenizer (io/resource "models/en-token.bin")))
(def get-sentences (nlp/make-sentence-detector (io/resource "models/en-sent.bin")))
(def parse (tb/make-treebank-parser (io/resource "models/en-parser-chunking.bin")))
@ -231,17 +232,23 @@
Porcelain. If you have the simple tree data structure
returned by `parse-to-simple-tree`, then you can just
pass that directly to `zip/seq-zip`."
pass that directly to `zip/seq-zip`.
Returns nil if something can't be parsed. This will be
the case for empty strings."
[text]
(let [tree (->> text
tokenize
(string/join " ")
vector
parse
first
tb/make-tree
unmake-tree)]
(zip/seq-zip tree)))
(try
(let [tree (->> text
tokenize
(string/join " ")
vector
parse
first
tb/make-tree
unmake-tree)]
(doall (zip/seq-zip tree)))
(catch Exception e
nil)))
(comment
;; Here is a demo of zipping through a parse tree and changing
@ -1102,10 +1109,14 @@
(defn breadth-first
[zipper]
(letfn [(zip-children [loc]
(when-let [first-child (zip/down loc)]
(take-while
(comp not nil?)
(iterate zip/right first-child))))]
(try
(when-let [first-child (zip/down loc)]
(take-while
(complement nil?)
(iterate zip/right first-child)))
(catch Exception e
(println (zip/root loc))
(throw e))))]
(loop [result []
queue (conj clojure.lang.PersistentQueue/EMPTY zipper)]
(if (seq queue)
@ -1135,27 +1146,44 @@
(map first))])))
(remove (comp nil? second))))
(defn parts-of-speech-trie-entries
"Given a zipper of a treebank parse tree, returns a sequence of
key-value pairs where the key is a sequence of parts-of-speech
to traverse down the tree and the values are the children
in the parse tree at that path.
This can be plugged into a Trie with frequency data to
give you the following kind of info:
{'(TOP)
{'(S) {:freq 534
'(NP VB) {:freq 233}
'(NP ADJP VB {:freq 210})
,,,}
'(SBARQ) {:freq 110}
'(SQ) {:freq 23}}}
"
[zipper]
(try
(->> (breadth-first zipper)
(filter (comp symbol? zip/node))
(map zip/prev)
(filter zip/branch?)
(mapv (fn [loc]
[(->> (zip/next loc)
(zip/path)
(map first)
(filter symbol?))
(let [child (zip/next (zip/next loc))]
(if (zip/branch? (zip/next child))
(map first (zip/node child))
(zip/node child)))])))
(catch Exception e
(println (zip/node zipper))
(throw e))))
(comment
(->> (zip/vector-zip [1 [2 [3]]])
(iterate zip/next)
(take 6)
last
zip/path
(map first))
(->> (breadth-first
(zip/seq-zip
'(TOP
((S
((NP
((NP ((NN ("Everything")))) (PP ((IN ("of")) (NP ((NN ("today"))))))))
(VP ((VBZ ("is")) (VP ((VBG ("falling"))))))
(. ("."))))))))
(map loc-children)
(filter seq?)
)
(part-of-speech-children
(parts-of-speech-trie-entries
(zip/seq-zip
'(TOP
((S
@ -1163,25 +1191,19 @@
((NP ((NN ("Everything")))) (PP ((IN ("of")) (NP ((NN ("today"))))))))
(VP ((VBZ ("is")) (VP ((VBG ("falling"))))))
(. ("."))))))))
(->> (zip/seq-zip
'(TOP
((S
((NP
((NP ((NN ("Everything")))) (PP ((IN ("of")) (NP ((NN ("today"))))))))
(VP ((VBZ ("is")) (VP ((VBG ("falling"))))))
(. (".")))))))
(zip/next)
(zip/next)
(zip/next)
(zip/next)
(zip/next)
(zip/node)
#_#_(loc-children)
(map first))
;; => ([(TOP) (S)]
;; [(TOP S) (NP VP .)]
;; [(TOP S NP) (NP PP)]
;; [(TOP S VP) (VBZ VP)]
;; [(TOP S .) (".")]
;; [(TOP S NP NP) (NN)]
;; [(TOP S NP PP) (IN NP)]
;; [(TOP S VP VBZ) ("is")]
;; [(TOP S VP VP) (VBG)]
;; [(TOP S NP NP NN) ("Everything")]
;; [(TOP S NP PP IN) ("of")]
;; [(TOP S NP PP NP) (NN)]
;; [(TOP S VP VP VBG) ("falling")]
;; [(TOP S NP PP NP NN) ("today")])
)
(comment
(defn part-of-speech-n-grams
[zipper]
(letfn [(fn step [path []])])))

Loading…
Cancel
Save