Scratch code related to grammar nlg

main
Eric Ihli 4 years ago
parent 0be5be0074
commit 0bd7683020

@ -207,8 +207,13 @@
(remove #(some string/blank? %))
(map #(string/join " " %))))))
(defn pos-path-freqs
"Seq of pos-path frequencies of each document.
(defn pathed-part-of-speech-word-frequencies
"Seq of pathed part-of-speech to word frequencies of each document.
{(TOP NP NN) {'test' 2 'sample' 4 ,,,}
(TOP VP VBZ) {'is' 5 'runs' 2 ,,,}
,,,}
To reduce, deep merge with +."
[documents]
(->> documents
@ -222,8 +227,13 @@
(map nlp/treebank-zipper)
(map nlp/leaf-pos-path-word-freqs)))
(defn structures
"Seq of structure frequencies of each document.
(defn grammar-tree-frequencies
"Seq of grammar tree frequencies of each document.
{(TOP (NP (NN)) (VP (VBZ))) 23
(TOP (NP (DT) (NN)) (VP (VBZ))) 18
,,,}
To reduce, merge with +."
[documents]
(->> documents
@ -252,7 +262,7 @@
(let [structure (->> documents
(drop chunk)
(take chunk-size)
pos-path-freqs
pathed-part-of-speech-word-frequencies
(reduce
(fn [a v]
(nlp/deep-merge-with + a v))
@ -303,18 +313,82 @@
;; [("won't" "intervention") {"divine" 1, "an" 1}]
;; [("pines" "weeping") {"the" 1}])
(def structures (nippy/thaw-from-file "resources/structure-freqs/0.nip"))
;; Merge pos paths
(def pos-freqs-data
(let [documents (->> "resources/pos-freqs"
io/file
file-seq
(remove #(.isDirectory %)))]
(reduce
(fn [accum document]
(let [data (nippy/thaw-from-file document)]
(nlp/deep-merge-with + accum data)))
{}
documents)))
(nippy/freeze-to-file "resources/corpus/darklyrics/pos-word-freqs.nippy" pos-freqs-data)
(count pos-freqs-data)
(take 20 pos-freqs-data)
(time
(def pos-freqs-data-3
(reduce
(fn [acc [k v]]
(let [new-map (hash-map (take-last 3 k) v)]
(nlp/deep-merge-with + acc new-map)))
{}
pos-freqs-data)))
(count pos-freqs-data-3)
(take 2 (reverse (sort-by #(count (second %)) pos-freqs-data-3)))
(time
(def
pos-freqs-data-2
(reduce
(fn [acc [k v]]
(let [new-map (hash-map (take-last 2 k) v)]
(nlp/deep-merge-with + acc new-map)))
{}
pos-freqs-data-3)))
(def structure-freq-data
(let [documents (->> "resources/structure-freqs"
io/file
file-seq
(remove #(.isDirectory %)))]
(reduce
(fn [accum document]
(let [data (nippy/thaw-from-file document)]
(nlp/deep-merge-with + accum data)))
{}
documents)))
(def popular-structure-freq-data (into {} (take 500 (reverse (sort-by #(second %) structure-freq-data)))))
(take 100 popular-structure-freq-data)
(nippy/freeze-to-file "resources/corpus/darklyrics/grammar-tree-freqs.nippy" structure-freq-data)
(def t1 (nippy/thaw-from-file "resources/structure-freqs/0.nip"))
structures
(take 100 (reverse (sort-by second structures)))
(let [documents (->> "dark-corpus"
io/file
file-seq
(remove #(.isDirectory %))
(drop 5000)
(take 10000))
chunk-size 5000]
(chunked-writing-pos-path-freqs
documents
chunk-size))
(do
(let [documents (->> "dark-corpus"
io/file
file-seq
(remove #(.isDirectory %))
(drop 5000))
chunk-size 5000]
(chunked-writing-pos-path-freqs
documents
chunk-size))
(let [documents (->> "dark-corpus"
io/file
file-seq
(remove #(.isDirectory %))
(drop 50000))
chunk-size 5000]
(chunked-writing-structure-freqs
documents
chunk-size)))
(def t1 (nippy/thaw-from-file "resources/pos-freqs/0.nip"))
(take 10 t1)
@ -336,7 +410,7 @@
(->> "dark-corpus"
io/file
file-seq
(remove #(.isDirectory %))))
(remove #(.isDirectory %)))
(time
(def example-pos-freqs
@ -406,5 +480,4 @@
(-> zipper
zip/down
zip/right
zip/node))
)
zip/node)))

@ -1,5 +1,3 @@
;; TODO: Filter out non-English lyrics.
(ns com.owoga.corpus.darklyrics
(:require [net.cgrand.enlive-html :as html]
[com.owoga.prhyme.util :as util]
@ -10,10 +8,14 @@
(def base-url "http://www.darklyrics.com/a.html")
(def data-dir "dark-corpus")
(defn fix-url [url]
(defn fix-url
"Some hrefs are relative and some are absolute."
[url]
(string/replace url #".*(http://.*(?!http://).*$)" "$1"))
(defn fetch-url- [url]
(defn fetch-url-
"Memoized for faster iterations in development."
[url]
(let [url (fix-url url)]
(try
(html/html-resource (java.net.URL. url))
@ -139,9 +141,6 @@
{}
((util/window (inc n)) tokens))))
(defn read-darkov-2 []
(util/read-markov (io/resource "dark-corpus-2.edn")))
(defn norm-filepath [text]
(-> text
string/lower-case
@ -168,11 +167,6 @@
artist-album-texts)))
(comment
(def darkov-2 (util/read-markov (io/resource "dark-corpus-2.edn")))
(take 10 darkov-2)
(get darkov-2 '(nil nil))
(take 3 (scrape base-url))
(-main)
(def letters-urls (parse-letters-urls (fetch-url base-url)))
(def artists-urls (parse-artists-urls (fetch-url (first letters-urls))))
(def artist-html (fetch-url (first artists-urls)))
@ -190,16 +184,4 @@
(filter string?)
(map string/trim)
(string/join "\n")
(string/trim))])))
(->> (html/select artist-html [:h1])
(map html/text)
(first ))
(def darkov
(into
{}
(map (fn [[k v]] (vector (list k) v))
(make-markov (slurp "darklyrics.txt") 1))))
(run! write-scrape (take 4 (scrape base-url)))
(def lyrics (scrape base-url)))
(string/trim))]))))

@ -1,7 +1,12 @@
(ns com.owoga.prhyme.data.darklyrics
(:require [clojure.java.io :as io]
[taoensso.nippy :as nippy])
(:import [java.io DataInputStream ByteArrayOutputStream]))
[clojure.edn :as edn]
[taoensso.nippy :as nippy]
[next.jdbc :as jdbc]
[next.jdbc.sql :as sql]
[com.owoga.prhyme.data.dictionary :as dict])
(:import (java.io ByteArrayOutputStream ByteArrayInputStream
DataOutputStream DataInputStream)))
(defn thaw-from-file
"Convenience util: like `thaw`, but reads from `(clojure.java.io/file <file>)`.
@ -17,23 +22,52 @@
(io/copy xin xout)
(nippy/thaw (.toByteArray xout) thaw-opts))))
;; (thaw-from-file (io/resource "test.bin"))
;; (bytes (byte-array (take 20 (range))))
(def darklyrics-markov-2
(thaw-from-file (io/resource "dark-corpus-2.bin")))
;; (byte-array (map (comp byte int) "ascii"))
;; ;; => [97, 115, 99, 105, 105]
;; (bytes (byte-array (map (comp byte int) "ascii")))
;; ;; => [97, 115, 99, 105, 105]
(comment
(def words (map #(vector (hash %) %)
(map :normalized-word dict/prhyme-dict)))
(count words)
(count dict/prhyme-dict)
(count (into #{} (map first words)))
(take 5 words)
;; (let [xin (io/input-stream (io/resource "test.bin"))
;; xout (ByteArrayOutputStream.)]
;; (io/copy xin xout)
;; (nippy/thaw (.toByteArray xout)))
(def ds "jdbc:sqlite:resources/darklyrics.db")
;; (.fullyRead (io/input-stream (io/resource "test.bin")))
(def hashes
(into
{}
(map
(fn [[k v]]
[(hash k) k])
darklyrics-markov-2)))
;; (.getSize (io/input-stream (io/resource "dark-corpus-2.bin")))
;; (def data (into {} (map vec (partition 2 (range 20)))))
;; (nippy/freeze-to-file "resources/test.bin" data)
(def darklyrics-markov-2
(thaw-from-file (io/resource "dark-corpus-2.bin")))
(nippy/freeze-to-file
"resources/dark-corpus-hashes.nip"
hashes)
(run!
(fn [c]
(sql/insert-multi!
ds
:markov
[:hash :words]
c))
(partition (int 1e5) hashes))
(run!
(fn [c]
(sql/insert-multi!
ds
:dict
[:hash :word]
c))
(partition (int 1e5) words))
(println (+ 2 2))
(keyword "won't")
(get darklyrics-markov-2 '("hiding" "our"))
(count darklyrics-markov-2)
)

@ -5,6 +5,13 @@
[com.owoga.prhyme.util :as util]
[com.owoga.prhyme.core :as prhyme]))
(def cmu-with-stress
(->> (io/resource "cmudict-0.7b")
io/reader
line-seq
(drop-while #(= \; (first %)))
(map #(string/split % #"\s+"))))
(def cmu-dict
(->> (io/reader (io/resource "cmudict_SPHINX_40"))
(line-seq)
@ -62,3 +69,15 @@
(->> words
(filter #(word-set (string/lower-case %))))]
(< 0.7 (/ (count english-words) (max 1 (count words))))))
(comment
(let [phoneme-lookup (into
{}
(map
(fn [[word & phonemes]]
[(string/lower-case word)
phonemes])
cmu-with-stress))]
(phoneme-lookup "zhirinovsky"))
)

@ -221,13 +221,50 @@
)
(defmethod create-element '(TOP (PP (IN) (NP (DT) (NN))))
[tree]
(let [zipper (zip/seq-zip tree)
noun (->> tree
leaf-filter
(filter (fn [z]
(let [[pos word] (zip/node z)]
(= pos 'NN))))
(map zip/node)
first)
preposition (->> tree
leaf-filter
(filter (fn [z]
(let [[pos word] (zip/node z)]
(= pos 'IN))))
(map zip/node)
first)
determiner (->> tree
leaf-filter
(filter (fn [z]
(let [[pos word] (zip/node z)]
(= pos 'DT))))
(map zip/node)
first)
noun-phrase (.createNounPhrase nlg-factory (second noun))
prepositional-phrase (.createPrepositionPhrase nlg-factory)
clause (.createClause nlg-factory)]
(.setDeterminer noun-phrase (second determiner))
(.addComplement prepositional-phrase noun-phrase)
(.setPreposition prepositional-phrase (second preposition))
prepositional-phrase))
(comment
(.realise
realiser
(create-element '(TOP (PP (IN "in") (NP (DT "the") (NN "park"))))))
)
(defmethod create-element '(PRP$)
[[[_ child]]] (.createNounPhrase nlg-factory child))
(defmethod create-element 'NN
[clause [pos child]]
(let [noun-phrase (.createNounPhrase nlg-factory child)]
(.setNoun)))
(let [noun-phrase (.createNounPhrase nlg-factory child)]))
(comment
(let [clause (.createClause nlg-factory)
@ -270,7 +307,6 @@
(.setNoun clause nn2)
clause))
(realise (create-element '((DT "a") (NN "sample") (NN "test"))))
(defmethod create-element '(PRP$ NN)
[[prp$ nn]]
@ -288,18 +324,11 @@
[[[_ child]]]
(create-element child))
(realise (create-element '((NP ((PRP$ "Eric") (NN "test"))))))
(realise (create-element '((DT "This"))))
(create-element '((NN "test")))
(realise (create-element '((PRP$ "Eric") (NN "test"))))
(realise (create-element '((NNP "tests"))))
(defmethod create-element '(VB)
[[[_ child]]]
(.createVerbPhrase nlg-factory child))
(realise (create-element '((VB "run"))))
(defmethod create-element '(VBZ)
[[[_ child]]]
@ -315,7 +344,15 @@
clause))
(comment
(realise (create-element '((NP ((PRP$ "Eric") (NN "test"))))))
(realise (create-element '((VBZ "is") (NP ((NN "test"))))))
(realise (create-element '((VB "run"))))
(realise (create-element '((DT "a") (NN "sample") (NN "test"))))
(create-element '((NN "test")))
(realise (create-element '((PRP$ "Eric") (NN "test"))))
(realise (create-element '((NNP "tests"))))
)
(comment

@ -225,9 +225,71 @@
:else
(recur (next parse-zipper)))))
(defn generate-with-markov-with-custom-progression-n-2-pos-freqs
"Sams as above, but with next/prev and stop fns"
[next
prev
next-stop?
prev-stop?
pos-path->word-freqs
parse-zipper
markov]
(loop [parse-zipper parse-zipper]
(cond
(nil? (next parse-zipper)) (zip/root parse-zipper)
(next-stop? parse-zipper) (zip/root parse-zipper)
(zip/branch? parse-zipper)
(recur (next parse-zipper))
(string? (zip/node parse-zipper))
(recur (next parse-zipper))
(and (symbol? (zip/node parse-zipper))
(pos-path->word-freqs (take-last 2 (seq (map first (zip/path parse-zipper))))))
(let [target-path (take-last 2 (seq (map first (zip/path parse-zipper))))
target-pos (zip/node parse-zipper)
pos-path-word (pos-path->word-freqs target-path)
pos-map pos-path-word
markov-options (markov (reverse
(next-two-words (nlp/iter-zip
parse-zipper
prev
prev-stop?))))
selection-possibilities (merge-with
(fn [a b]
(let [max-pos (apply max (vals pos-map))]
(+ a b max-pos)))
pos-map
markov-options)]
(timbre/info "Markov options are"
(apply list (next-two-words (nlp/iter-zip
parse-zipper
prev
prev-stop?)))
(apply list (take 10 markov-options)))
(timbre/info "Choosing POS for" target-path)
(let [selection (weighted-rand/weighted-selection-from-map
selection-possibilities)]
(timbre/info
"Most likely selection possibilities"
(apply list (take 5 (reverse (sort-by second selection-possibilities)))))
(timbre/info "Chose " selection)
(recur
(-> parse-zipper
zip/up
(#(zip/replace % (list (zip/node (zip/down %)) selection)))
zip/down
next
next))))
:else
(recur (next parse-zipper)))))
(comment
(let [structure '(TOP (S (NP (DT) (JJ) (NN))
(VP (VBZ))
(VP (RB) (VBZ))
(NP (DT) (JJ) (NN))))
structure (-> structure
zip/seq-zip
@ -238,17 +300,14 @@
(repeatedly
10
(fn []
(->> (generate-with-markov-with-custom-progression
(->> (generate-with-markov-with-custom-progression-n-2-pos-freqs
zip/prev
zip/next
nil?
zip/end?
examples/t1
pos-freqs
examples/pos-freqs-data-2
structure
examples/darkov-2)
nlp/leaf-nodes
(string/join " ")))))
examples/darkov-2)))))
(timbre/set-level! :info)
(timbre/set-level! :error)
@ -276,24 +335,21 @@
(take 5 examples/t2)
(let [structure '(TOP (S (NP (DT) (JJ) (NN))
(VP (VBZ))
(NP (DT) (JJ) (NN))))
(let [structure (weighted-rand/weighted-selection-from-map
examples/popular-structure-freq-data)
structure (-> structure
zip/seq-zip
nlp/iter-zip
last)
pos-freqs (examples/pos-paths->pos-freqs
examples/t1)]
pos-freqs examples/pos-freqs-data-2]
(repeatedly
10
(fn []
(->> (generate-with-markov-with-custom-progression
(->> (generate-with-markov-with-custom-progression-n-2-pos-freqs
zip/prev
zip/next
nil?
zip/end?
examples/t1
pos-freqs
structure
examples/darkov-2)

@ -335,6 +335,11 @@
(map #(hash-map (butlast %) {(last %) 1}))
(apply deep-merge-with +)))
(defn pathed-part-of-speech-word-frequencies
"I like this name better."
[zipper]
(leaf-pos-path-word-freqs zipper))
(comment
(let [zipper (treebank-zipper ["Eric's test is difficult."
"Eric's test is thorough."
@ -379,6 +384,32 @@
;; (TOP (S (NP (NNP)) (VP (VBZ) (VP (VBG))) (.))))
)
(defn grammar-tree-frequencies
"Seq of grammar tree frequencies of each document.
{(TOP (NP (NN)) (VP (VBZ))) 23
(TOP (NP (DT) (NN)) (VP (VBZ))) 18
,,,}
To reduce, merge with +."
[document]
(->> document
parse-to-simple-tree
parse-tree-sans-leaf-words
(map #(hash-map % 1))
(apply merge-with +)))
(comment
(let [document ["this is a test"
"this is a sample"
"that was a test"
"you are a test"]]
(grammar-tree-frequencies
document))
;; => {(TOP (S (NP (WDT)) (VP (VBD) (NP (DT) (NN))))) 1,
;; (TOP (S (NP (DT)) (VP (VBZ) (NP (DT) (NN))))) 2,
;; (TOP (S (NP (PRP)) (VP (VBP) (NP (DT) (NN))))) 1}
)
;;;; This is not sufficient
;; You'll end up with a mapping that says a verb phrase can be a
@ -826,13 +857,6 @@
(pos-tagger (tokenize phrase)))
)
(comment
(let [text "My name is Eric."
zipper (treebank-zipper text)]
(map identity (seq-zip zipper)))
)
(defn zipper->freqs [zipper]
(let [coll ()]))
@ -987,7 +1011,8 @@
)
(defn reverse-children-to-parents
"We want to work from the reverse, or backwards, representation of pos-constituents-frequencies.
"We want to work from the reverse, or backwards,
representation of pos-constituents-frequencies.
Pos-constituent-frequencies is good for generating text top-down. But we want
to generate text bottom-up for rhyming."
[pos-freqs]
@ -1018,7 +1043,7 @@
"Who is your mother and what does she do?"]
(pos-constituent-frequencies))]
(->> pos-freqs
#_(ways-to-make-a 'S)
(ways-to-make-a 'S)
(#(get % 'NN))))
(let [pos-freqs (->> ["My name is Eric."

@ -30,6 +30,22 @@
(map convert-to-sphinx)
(map string/upper-case)))
(defn get-phones-with-stress
".getPhones only "
[word]
(->> (map str (.getPhones cmu-lexicon word nil))
(map convert-to-sphinx)
(map string/upper-case)))
(comment
(require '[com.owoga.prhyme.data.dictionary :as dict])
(rest (first (filter #(= (first %) "ZHIRINOVSKY") dict/cmu-with-stress)))
;; => ("ZH" "IH2" "R" "AH0" "N" "AA1" "V" "S" "K" "IY2")
(get-phones-with-stress "zhirinovsky")
;; => ("ZH" "IH1" "R" "AH" "N" "AA1" "F" "S" "K" "IY")
)
(defn window [n]
(fn [coll]
(cond

Loading…
Cancel
Save