Fix bugs in file-seq->markov-trie

main
Eric Ihli 3 years ago
parent d84b2a0204
commit f6b1150431

@ -3,6 +3,7 @@
[com.owoga.prhyme.util :as util] [com.owoga.prhyme.util :as util]
[com.owoga.prhyme.data.dictionary :as dict] [com.owoga.prhyme.data.dictionary :as dict]
[com.owoga.prhyme.nlp.core :as nlp] [com.owoga.prhyme.nlp.core :as nlp]
[com.owoga.prhyme.data-transform :as data-transform]
[com.owoga.trie :as trie] [com.owoga.trie :as trie]
[com.owoga.tightly-packed-trie :as tpt] [com.owoga.tightly-packed-trie :as tpt]
[clojure.string :as string] [clojure.string :as string]
@ -233,6 +234,8 @@
(file-seq (io/file "dark-corpus")))] (file-seq (io/file "dark-corpus")))]
[trie database])) [trie database]))
(comment (comment
(take 20 trie) (take 20 trie)
@ -242,6 +245,39 @@
(map @trie-database)) (map @trie-database))
) )
(defn file-seq->markov-trie
  "Reduces `files` into a trie (seeded with `(trie/make-trie)`) of lookup
  keys and their occurrence counts.

  Each file is slurped, split into chunks on the regex below, and run
  through `data-transform/xf-tokenize` and
  `data-transform/xf-filter-english`. Empty results are removed, the
  remainder is padded via `data-transform/xf-pad-tokens` (called with
  `(dec m)`, \"<s>\", 1, \"</s>\"), expanded with
  `data-transform/n-to-m-partitions` (called with `n` and `(inc m)`), and
  every partition is passed to the function returned by
  `(data-transform/make-database-processor database)`.

  Every value coming out of that pipeline is used as a trie key. The entry
  at that key starts as `[key 0]` when absent, and its element at index 1
  is incremented on each occurrence.

  `database` is handed to `make-database-processor` unchanged; the REPL
  example in this file passes an atom holding `{:next-id 1}`."
  [database files n m]
  (let [;; Helper-produced pieces are built once, in the same order as the
        ;; pipeline steps that use them.
        pad-xf       (data-transform/xf-pad-tokens (dec m) "<s>" 1 "</s>")
        ->partitions (partial data-transform/n-to-m-partitions n (inc m))
        ->lookup     (data-transform/make-database-processor database)
        pipeline
        (comp
         (map slurp)
         ;; NOTE(review): inside a character class `+` is a literal plus
         ;; sign, so this also splits on "+" -- confirm that is intended.
         (map (fn [text] (string/split text #"[\n+\?\.]")))
         (map (fn [chunks] (transduce data-transform/xf-tokenize conj chunks)))
         (map (fn [chunks] (transduce data-transform/xf-filter-english conj chunks)))
         (map (fn [chunks] (remove empty? chunks)))
         (map (fn [chunks] (into [] pad-xf chunks)))
         (map (fn [chunks] (mapcat ->partitions chunks)))
         (mapcat (fn [partitions] (mapv ->lookup partitions))))
        ;; Reducing step: bump the counter stored at index 1 of the entry
        ;; for `lookup`, creating `[lookup 0]` first when there is none.
        tally
        (fn [acc lookup]
          (update acc
                  lookup
                  (fn [entry]
                    (update (if (nil? entry) [lookup 0] entry) 1 inc))))]
    (transduce pipeline (completing tally) (trie/make-trie) files)))
(comment
;; REPL scratch for `file-seq->markov-trie`; nothing in this form is
;; evaluated when the namespace loads.
;;
;; `files` is every entry under "dark-corpus" passed through
;; `(xf-file-seq 501 2)`.
;; NOTE(review): `xf-file-seq` is not shown here -- presumably it selects a
;; window of the files; confirm what 501 and 2 mean.
(let [files (->> "dark-corpus"
io/file
file-seq
(eduction (xf-file-seq 501 2)))
;; Fresh database atom; :next-id starts at 1.
database (atom {:next-id 1})
;; Build the trie with n = 1 and m = 3.
trie (file-seq->markov-trie database files 1 3)]
;; Quick inspection: a sample of trie entries, the trie size, the database
;; value stored under key 1, and a sample of database entries.
[(take 20 trie)
(count trie)
(get @database 1)
(take 10 @database)])
)
(defn initialize (defn initialize
"Takes an atom as a context. Swaps in :database, :trie, :rhyme-trie" "Takes an atom as a context. Swaps in :database, :trie, :rhyme-trie"
[context] [context]

@ -77,6 +77,7 @@
(comment (comment
(n-to-m-partitions 1 4 (range 6)) (n-to-m-partitions 1 4 (range 6))
;; => ((0) ;; => ((0)
;; (1) ;; (1)
;; ,,, ;; ,,,
@ -109,13 +110,13 @@
If not, it increments the id (which is stored in the database If not, it increments the id (which is stored in the database
under :next-id) and returns that new id." under :next-id) and returns that new id."
[database] [database]
(fn [[k v]] (fn [lookup ]
(let [k' (mapv (fn [kn] (let [lookup' (mapv (fn [key]
(if-let [id (get @database kn)] (if-let [id (get @database key)]
id id
(new-key database kn))) (new-key database key)))
k)] lookup)]
[k' 1]))) [lookup' v])))
(comment (comment
;; TODO: Move to nlp.core ;; TODO: Move to nlp.core
@ -184,7 +185,7 @@
(remove empty?) (remove empty?)
(map (partial transduce (xf-pad-tokens 1 "<s>" (dec m) "</s>") conj)) (map (partial transduce (xf-pad-tokens 1 "<s>" (dec m) "</s>") conj))
(map (partial map reverse)) (map (partial map reverse))
(mapcat (partial map (partial n-to-m-partitions n (inc m)))) (map (partial n-to-m-partitions n (inc m)))
(mapcat (partial into [])) (mapcat (partial into []))
(map #(clojure.lang.MapEntry. (vec %) %)) (map #(clojure.lang.MapEntry. (vec %) %))
(map (make-database-processor database))) (map (make-database-processor database)))

Loading…
Cancel
Save