|
|
@ -2,7 +2,6 @@
|
|
|
|
(:require [clojure.string :as string]
|
|
|
|
(:require [clojure.string :as string]
|
|
|
|
[clojure.java.io :as io]
|
|
|
|
[clojure.java.io :as io]
|
|
|
|
[com.owoga.prhyme.data.dictionary :as dict]
|
|
|
|
[com.owoga.prhyme.data.dictionary :as dict]
|
|
|
|
[com.owoga.prhyme.nlp.core :as nlp]
|
|
|
|
|
|
|
|
[com.owoga.trie :as trie]
|
|
|
|
[com.owoga.trie :as trie]
|
|
|
|
[com.owoga.tightly-packed-trie :as tpt]
|
|
|
|
[com.owoga.tightly-packed-trie :as tpt]
|
|
|
|
[com.owoga.tightly-packed-trie.encoding :as encoding]
|
|
|
|
[com.owoga.tightly-packed-trie.encoding :as encoding]
|
|
|
@ -97,7 +96,9 @@
|
|
|
|
k)]
|
|
|
|
k)]
|
|
|
|
[k' 1])))
|
|
|
|
[k' 1])))
|
|
|
|
|
|
|
|
|
|
|
|
(defn xf-part-of-speech-database
|
|
|
|
(comment
|
|
|
|
|
|
|
|
;; TODO: Move to nlp.core
|
|
|
|
|
|
|
|
(defn xf-part-of-speech-database
|
|
|
|
[database]
|
|
|
|
[database]
|
|
|
|
(fn [sentence]
|
|
|
|
(fn [sentence]
|
|
|
|
(let [leafs (->> sentence
|
|
|
|
(let [leafs (->> sentence
|
|
|
@ -111,7 +112,7 @@
|
|
|
|
k
|
|
|
|
k
|
|
|
|
(merge-with + (@database k) v)))
|
|
|
|
(merge-with + (@database k) v)))
|
|
|
|
leafs)
|
|
|
|
leafs)
|
|
|
|
sentence)))
|
|
|
|
sentence))))
|
|
|
|
|
|
|
|
|
|
|
|
(comment
|
|
|
|
(comment
|
|
|
|
(let [database (atom {})]
|
|
|
|
(let [database (atom {})]
|
|
|
@ -206,7 +207,9 @@
|
|
|
|
(recur (conj result [k v])
|
|
|
|
(recur (conj result [k v])
|
|
|
|
(rest k)))))
|
|
|
|
(rest k)))))
|
|
|
|
|
|
|
|
|
|
|
|
(defn process-text
|
|
|
|
(comment
|
|
|
|
|
|
|
|
;; TODO: Move to nlp.core
|
|
|
|
|
|
|
|
(defn process-text
|
|
|
|
"Processes text into key value pairs where
|
|
|
|
"Processes text into key value pairs where
|
|
|
|
the keys are parts-of-speech paths and the values
|
|
|
|
the keys are parts-of-speech paths and the values
|
|
|
|
are the children at that path.
|
|
|
|
are the children at that path.
|
|
|
@ -220,18 +223,15 @@
|
|
|
|
(mapv nlp/treebank-zipper)
|
|
|
|
(mapv nlp/treebank-zipper)
|
|
|
|
(remove nil?)
|
|
|
|
(remove nil?)
|
|
|
|
(map nlp/parts-of-speech-trie-entries)
|
|
|
|
(map nlp/parts-of-speech-trie-entries)
|
|
|
|
(mapv (fn [file]
|
|
|
|
|
|
|
|
(mapv (fn [line]
|
|
|
|
|
|
|
|
(mapv vec line))
|
|
|
|
|
|
|
|
file)))
|
|
|
|
|
|
|
|
(reduce into [])
|
|
|
|
(reduce into [])
|
|
|
|
(map flatten-trie-entry-to-all-subkeys)
|
|
|
|
(map flatten-trie-entry-to-all-subkeys)
|
|
|
|
(reduce into [])
|
|
|
|
(reduce into [])
|
|
|
|
(mapv normalize-text)
|
|
|
|
(mapv normalize-text)
|
|
|
|
(mapv (fn [[k v]]
|
|
|
|
(mapv (fn [[k v]]
|
|
|
|
(clojure.lang.MapEntry. (into (vec k) [v]) v)))))
|
|
|
|
(clojure.lang.MapEntry. (into (vec k) [v]) v))))))
|
|
|
|
|
|
|
|
|
|
|
|
(comment
|
|
|
|
(comment
|
|
|
|
|
|
|
|
(process-text (first texts))
|
|
|
|
(flatten-trie-entry-to-all-subkeys
|
|
|
|
(flatten-trie-entry-to-all-subkeys
|
|
|
|
'[(TOP S NP) (NP PP)])
|
|
|
|
'[(TOP S NP) (NP PP)])
|
|
|
|
;; => [[(TOP S NP) (NP PP)] [(S NP) (NP PP)] [(NP) (NP PP)]]
|
|
|
|
;; => [[(TOP S NP) (NP PP)] [(S NP) (NP PP)] [(NP) (NP PP)]]
|
|
|
@ -304,7 +304,7 @@
|
|
|
|
trie
|
|
|
|
trie
|
|
|
|
entries)))
|
|
|
|
entries)))
|
|
|
|
(trie/make-trie)
|
|
|
|
(trie/make-trie)
|
|
|
|
(take 300 texts))))
|
|
|
|
(take 3000 texts))))
|
|
|
|
|
|
|
|
|
|
|
|
(nippy/freeze-to-file "/tmp/test-trie.bin" (seq test-trie))
|
|
|
|
(nippy/freeze-to-file "/tmp/test-trie.bin" (seq test-trie))
|
|
|
|
(time
|
|
|
|
(time
|
|
|
@ -441,32 +441,6 @@
|
|
|
|
(#(zip/insert-right % (zip/node z2)))
|
|
|
|
(#(zip/insert-right % (zip/node z2)))
|
|
|
|
(zip/root))))
|
|
|
|
(zip/root))))
|
|
|
|
|
|
|
|
|
|
|
|
(defn generate
  "Expands the zipper location `zipper` and returns the resulting location.

  `k` is the first element of every node on the path from the root down to
  `zipper`. When `(database (last k))` is a vector, the parent node's first
  element (`children`) is treated as a queue of pending items: each pass
  appends `[(first children)]` to the parent, recursively generates under
  that new child, then replaces the queue with its remainder, until the
  queue is empty. Otherwise the result of `(choose trie database k)` is
  inserted to the right of `zipper`.

  NOTE(review): `children` is assumed to be a vector (required by `subvec`);
  confirm against the shape of the trees this is called with."
  [trie database zipper]
  (let [k (map first (zip/path zipper))]
    ;; Debug tracing retained from the original implementation.
    (do (Thread/sleep 10) (println k))
    (if (vector? (database (last k)))
      (loop [zipper zipper]
        (let [children (last (map first (zip/path zipper)))]
          ;; Debug tracing retained from the original implementation.
          (Thread/sleep 50) (println children (zip/root zipper))
          (if (empty? children)
            zipper
            (recur
             (-> zipper
                 zip/up
                 ;; Add a fresh child node holding the next pending item.
                 (zip/append-child [(first children)])
                 (zip/down)
                 (zip/rightmost)
                 (zip/down)
                 (#(generate trie database %))
                 (zip/up)
                 (zip/up)
                 (zip/down)
                 ;; Drop the item just consumed. `subvec` takes the vector
                 ;; first and the start index second; the original call had
                 ;; the arguments swapped and threw on the first iteration.
                 (zip/replace (subvec children 1)))))))
      (zip/insert-right
       zipper
       (choose trie database k)))))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
(defn generate
|
|
|
|
(defn generate
|
|
|
|
[trie database zipper]
|
|
|
|
[trie database zipper]
|
|
|
|
(cond
|
|
|
|
(cond
|
|
|
@ -526,12 +500,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
(comment
|
|
|
|
(comment
|
|
|
|
(trie/lookup test-trie [1])
|
|
|
|
(trie/lookup test-trie [1])
|
|
|
|
(->> (generate test-trie @test-database (zip/vector-zip [1]))
|
|
|
|
(repeatedly
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
|
#(->> (generate test-trie @test-database (zip/vector-zip [1]))
|
|
|
|
(zip/vector-zip)
|
|
|
|
(zip/vector-zip)
|
|
|
|
(iterate zip/next)
|
|
|
|
(iterate zip/next)
|
|
|
|
(take-while (complement zip/end?))
|
|
|
|
(take-while (complement zip/end?))
|
|
|
|
(map zip/node)
|
|
|
|
(map zip/node)
|
|
|
|
(filter string?))
|
|
|
|
(filter string?)))
|
|
|
|
|
|
|
|
|
|
|
|
(-> [:a [:b] [:b]]
|
|
|
|
(-> [:a [:b] [:b]]
|
|
|
|
zip/vector-zip
|
|
|
|
zip/vector-zip
|
|
|
@ -649,7 +625,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
)
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
(defn xf-grammar-database
|
|
|
|
(comment
|
|
|
|
|
|
|
|
;; TODO: Move to nlp.core
|
|
|
|
|
|
|
|
(defn xf-grammar-database
|
|
|
|
[database]
|
|
|
|
[database]
|
|
|
|
(fn [sentence]
|
|
|
|
(fn [sentence]
|
|
|
|
(let [leafs (->> sentence
|
|
|
|
(let [leafs (->> sentence
|
|
|
@ -663,9 +641,11 @@
|
|
|
|
k
|
|
|
|
k
|
|
|
|
(merge-with + (@database k) v)))
|
|
|
|
(merge-with + (@database k) v)))
|
|
|
|
leafs)
|
|
|
|
leafs)
|
|
|
|
sentence)))
|
|
|
|
sentence))))
|
|
|
|
|
|
|
|
|
|
|
|
(defn file-seq->grammar-tree
|
|
|
|
(comment
|
|
|
|
|
|
|
|
;; TODO: remove or move to nlp.core
|
|
|
|
|
|
|
|
(defn file-seq->grammar-tree
|
|
|
|
[files]
|
|
|
|
[files]
|
|
|
|
(transduce
|
|
|
|
(transduce
|
|
|
|
(comp
|
|
|
|
(comp
|
|
|
@ -685,7 +665,7 @@
|
|
|
|
([acc m]
|
|
|
|
([acc m]
|
|
|
|
(merge-with + acc m)))
|
|
|
|
(merge-with + acc m)))
|
|
|
|
{}
|
|
|
|
{}
|
|
|
|
files))
|
|
|
|
files)))
|
|
|
|
|
|
|
|
|
|
|
|
(comment
|
|
|
|
(comment
|
|
|
|
(time
|
|
|
|
(time
|
|
|
@ -699,7 +679,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
)
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
(defn file-seq->part-of-speech-freqs
|
|
|
|
(comment
|
|
|
|
|
|
|
|
;; TODO: Remove or move to nlp.core
|
|
|
|
|
|
|
|
(defn file-seq->part-of-speech-freqs
|
|
|
|
[files]
|
|
|
|
[files]
|
|
|
|
(transduce
|
|
|
|
(transduce
|
|
|
|
(comp
|
|
|
|
(comp
|
|
|
@ -719,7 +701,7 @@
|
|
|
|
(fn [result input]
|
|
|
|
(fn [result input]
|
|
|
|
(nlp/deep-merge-with + result input)))
|
|
|
|
(nlp/deep-merge-with + result input)))
|
|
|
|
{}
|
|
|
|
{}
|
|
|
|
files))
|
|
|
|
files)))
|
|
|
|
|
|
|
|
|
|
|
|
(comment
|
|
|
|
(comment
|
|
|
|
(time (->> (file-seq->part-of-speech-freqs
|
|
|
|
(time (->> (file-seq->part-of-speech-freqs
|
|
|
@ -732,7 +714,9 @@
|
|
|
|
)
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
(defn file-seq->parts-of-speech-trie
|
|
|
|
(comment
|
|
|
|
|
|
|
|
;; TODO: Remove or move to nlp.core
|
|
|
|
|
|
|
|
(defn file-seq->parts-of-speech-trie
|
|
|
|
[files]
|
|
|
|
[files]
|
|
|
|
(transduce
|
|
|
|
(transduce
|
|
|
|
(comp
|
|
|
|
(comp
|
|
|
@ -752,4 +736,4 @@
|
|
|
|
([acc m]
|
|
|
|
([acc m]
|
|
|
|
(merge-with + acc m)))
|
|
|
|
(merge-with + acc m)))
|
|
|
|
{}
|
|
|
|
{}
|
|
|
|
files))
|
|
|
|
files)))
|
|
|
|