Fix bugs in file-seq->markov-trie

4 years ago · f6b1150431
parent d84b2a0204
commit f6b1150431
2 changed files with 45 additions and 8 deletions
--- a/src/com/owoga/corpus/markov.clj
+++ b/src/com/owoga/corpus/markov.clj
@ -3,6 +3,7 @@
            [com.owoga.prhyme.util :as util]
            [com.owoga.prhyme.data.dictionary :as dict]
            [com.owoga.prhyme.nlp.core :as nlp]
+            [com.owoga.prhyme.data-transform :as data-transform]
            [com.owoga.trie :as trie]
            [com.owoga.tightly-packed-trie :as tpt]
            [clojure.string :as string]
@ -233,6 +234,8 @@
                        (file-seq (io/file "dark-corpus")))]
    [trie database]))

+
+
 (comment

  (take 20 trie)
@ -242,6 +245,39 @@
       (map @trie-database))

  )
+
+
+(defn file-seq->markov-trie ; Folds the text of every file in `files` into a trie of [lookup count] entries.
+  [database files n m] ; database: passed to make-database-processor; files: slurp-able items; n, m: partition-size bounds
+  (transduce
+   (comp
+    (map slurp) ; read each file fully into one string
+    (map #(string/split % #"[\n+\?\.]")) ; character class: splits on any single newline, literal `+`, `?` or `.`
+    (map (partial transduce data-transform/xf-tokenize conj)) ; per file: collect xf-tokenize output into a vector (presumably one token seq per line -- confirm)
+    (map (partial transduce data-transform/xf-filter-english conj)) ; per file: collect xf-filter-english output (presumably drops non-English tokens -- confirm)
+    (map (partial remove empty?)) ; discard elements left empty by the previous steps
+    (map (partial into [] (data-transform/xf-pad-tokens (dec m) "<s>" 1 "</s>"))) ; presumably (dec m) leading "<s>" and 1 trailing "</s>" per line -- confirm in xf-pad-tokens
+    (map (partial mapcat (partial data-transform/n-to-m-partitions n (inc m)))) ; per file: concatenate the partitions produced for every line
+    (mapcat (partial mapv (data-transform/make-database-processor database)))) ; run each partition through the processor and flatten all files into one stream
+   (completing
+    (fn [trie lookup]
+      (update trie lookup (fnil #(update % 1 inc) [lookup 0])))) ; missing entry starts as [lookup 0]; index 1 is then incremented
+   (trie/make-trie) ; initial accumulator for the reduction
+   files))
+
+(comment ; REPL scratch form: builds a trie from files under "dark-corpus" and inspects the results
+  (let [files (->> "dark-corpus"
+                   io/file
+                   file-seq
+                   (eduction (xf-file-seq 501 2))) ; xf-file-seq is defined outside this diff; NOTE(review): args presumably select a slice of the files -- confirm
+        database (atom {:next-id 1}) ; fresh database atom seeded with :next-id 1
+        trie (file-seq->markov-trie database files 1 3)] ; called with n = 1 and m = 3
+    [(take 20 trie) ; first 20 trie entries
+     (count trie)
+     (get @database 1) ; whatever the database holds under key 1
+     (take 10 @database)])
+
+  )
 (defn initialize
  "Takes an atom as a context. Swaps in :database, :trie, :rhyme-trie"
  [context]
--- a/src/com/owoga/prhyme/data_transform.clj
+++ b/src/com/owoga/prhyme/data_transform.clj
@ -77,6 +77,7 @@

 (comment
  (n-to-m-partitions 1 4 (range 6))
+
  ;; => ((0)
  ;;     (1)
  ;;     ,,,
@ -109,13 +110,13 @@
  If not, it increments the id (which is stored in the database
  under :next-id) and returns that new id."
  [database]
-  (fn [[k v]]
-    (let [k' (mapv (fn [kn]
-                     (if-let [id (get @database kn)]
-                       id
-                       (new-key database kn)))
-                   k)]
-      [k' 1])))
+  (fn [lookup ]
+    (let [lookup' (mapv (fn [key]
+                          (if-let [id (get @database key)]
+                            id
+                            (new-key database key)))
+                        lookup)]
+      [lookup' v])))

 (comment
  ;; TODO: Move to nlp.core
@ -184,7 +185,7 @@
    (remove empty?)
    (map (partial transduce (xf-pad-tokens 1 "<s>" (dec m) "</s>") conj))
    (map (partial map reverse))
-    (mapcat (partial map (partial n-to-m-partitions n (inc m))))
+    (map (partial n-to-m-partitions n (inc m)))
    (mapcat (partial into []))
    (map #(clojure.lang.MapEntry. (vec %) %))
    (map (make-database-processor database)))