Generate with some grammar rules

main
Eric Ihli 3 years ago
parent dc5340be7f
commit dd51e0fdca

@ -857,9 +857,9 @@
generated-text
(let [children (loop [i 4]
(let [node (p :lookup
(trie/lookup
loaded-tightly-packed-trie
(vec (take-last i generated-text))))
(trie/lookup
loaded-tightly-packed-trie
(vec (take-last i generated-text))))
children (p :seq-children (and node (trie/children node)))]
(cond
(nil? node) (recur (dec i))

@ -2,16 +2,18 @@
(:require [clojure.string :as string]
[clojure.java.io :as io]
[com.owoga.prhyme.data.dictionary :as dict]
[com.owoga.prhyme.nlp.core :as nlp]
[com.owoga.trie :as trie]
[com.owoga.tightly-packed-trie :as tpt]
[com.owoga.tightly-packed-trie.encoding :as encoding]
[taoensso.nippy :as nippy]))
[taoensso.nippy :as nippy]
[com.owoga.prhyme.nlp.tag-sets.treebank-ii :as tb2]))
(def re-word
"Regex for tokenizing a string into words
(including contractions and hyphenations),
commas, periods, and newlines."
#"(?s).*?([a-zA-Z\d]+(?:['\-]?[a-zA-Z]+)?|,|\.|\n)")
#"(?s).*?([a-zA-Z\d]+(?:['\-]?[a-zA-Z]+)?|,|\.|\?|\n)")
(defn xf-file-seq [start end]
(comp (remove #(.isDirectory %))
@ -38,6 +40,23 @@
(map (partial map second))
(map (partial mapv string/lower-case))))
(def xf-untokenize
(comp
(map #(string/join " " %))
(map #(string/replace % #" (['\-,\?\.] ?)" "$1"))))
(comment
(let [tokens (transduce
xf-tokenize
conj
["Eric's name, is Bond." "James, bond? Yes."])]
[tokens
(map #(string/join " " %) tokens)
(transduce
xf-untokenize
conj
tokens)]))
(def xf-filter-english
(let [word? (fn [x] (or (#{"." "?" ","} x)
(dict/cmu-with-stress-map x)))]
@ -75,6 +94,34 @@
k)]
[k' 1])))
(defn xf-part-of-speech-database
[database]
(fn [sentence]
(let [leafs (->> sentence
nlp/treebank-zipper
nlp/leaf-pos-path-word-freqs)]
(run!
(fn [[k v]]
(swap!
database
assoc
k
(merge-with + (@database k) v)))
leafs)
sentence)))
(comment
(let [database (atom {})]
(transduce
(map (partial mapv (part-of-speech-database database)))
conj
[]
[["this test is difficult"]
["this foot is sore"]])
@database)
)
(def encode-fn
"Encodes a number as a variable-length encoded value.
nil gets encoded as 0."
@ -159,3 +206,108 @@
(map (fn [[k v]] [k (map database k) v])))))
)
(defn xf-grammar-database
[database]
(fn [sentence]
(let [leafs (->> sentence
nlp/treebank-zipper
nlp/leaf-pos-path-word-freqs)]
(run!
(fn [[k v]]
(swap!
database
assoc
k
(merge-with + (@database k) v)))
leafs)
sentence)))
(defn file-seq->grammar-tree
[files]
(transduce
(comp
(xf-file-seq 0 1000)
(map slurp)
(map #(string/split % #"[\n+\?\.]"))
(map (partial transduce xf-tokenize conj))
(map (partial transduce xf-filter-english conj))
(map (partial remove empty?))
(remove empty?)
(map (partial transduce xf-untokenize conj))
(map nlp/grammar-tree-frequencies)
(map (partial into {})))
(fn
([acc]
(sort-by (comp - second) acc))
([acc m]
(merge-with + acc m)))
{}
files))
(comment
(time
(->> (file-seq->grammar-tree
(file-seq (io/file "dark-corpus")))
(take 100)
(nippy/freeze-to-file "/tmp/grammar-freqs-top-100.bin")))
(def grammar-freqs (nippy/thaw-from-file "/tmp/grammar-freqs-top-100.bin"))
(take 10 grammar-freqs)
)
(defn file-seq->part-of-speech-freqs
[files]
(transduce
(comp
(xf-file-seq 0 1000)
(map slurp)
(map #(string/split % #"[\n+\?\.]"))
(map (partial transduce xf-tokenize conj))
(map (partial transduce xf-filter-english conj))
(map (partial remove empty?))
(remove empty?)
(map (partial transduce xf-untokenize conj))
(map (partial map nlp/treebank-zipper))
(map (partial map nlp/leaf-pos-path-word-freqs))
(map (partial reduce (fn [acc m]
(nlp/deep-merge-with + acc m)) {})))
(completing
(fn [result input]
(nlp/deep-merge-with + result input)))
{}
files))
(comment
(time (->> (file-seq->part-of-speech-freqs
(file-seq (io/file "dark-corpus")))
(nippy/freeze-to-file "/tmp/part-of-speech-freqs.bin")))
(def parts-of-speech-freqs
(nippy/thaw-from-file "/tmp/part-of-speech-freqs.bin"))
(take 20 parts-of-speech-freqs)
)
(defn file-seq->parts-of-speech-trie
[files]
(transduce
(comp
(xf-file-seq 0 1000)
(map slurp)
(map #(string/split % #"[\n+\?\.]"))
(map (partial transduce xf-tokenize conj))
(map (partial transduce xf-filter-english conj))
(map (partial remove empty?))
(remove empty?)
(map (partial transduce xf-untokenize conj))
(map nlp/grammar-tree-frequencies)
(map (partial into {})))
(fn
([acc]
(sort-by (comp - second) acc))
([acc m]
(merge-with + acc m)))
{}
files))

@ -0,0 +1,544 @@
(ns com.owoga.prhyme.generation.markov-example
(:require [clojure.string :as string]
[clojure.java.io :as io]
[com.owoga.prhyme.util.math :as math]
[com.owoga.phonetics :as phonetics]
[com.owoga.phonetics.syllabify :as syllabify]
[cljol.dig9 :as d]
[clojure.zip :as zip]
[com.owoga.tightly-packed-trie.bit-manip :as bm]
[com.owoga.trie :as trie]
[com.owoga.tightly-packed-trie.encoding :as encoding]
[com.owoga.tightly-packed-trie :as tpt]
[taoensso.nippy :as nippy]
[com.owoga.prhyme.nlp.tag-sets.treebank-ii :as tb2]
[com.owoga.prhyme.nlp.core :as nlp]))
(def corpus (slurp (io/resource "cask_of_amontillado.txt")))
;; For better generation of text, you'll probably want to pad the starts
;; of sentences with n-1 "start-of-sentence" tokens.
(defn prep-punctuation-for-tokenization
"Puts spaces around punctuation so that they aren't
tokenized with the words they are attached to.
Might add extraneous whitespace, but presumedly that will be ignored/removed
during tokenization."
[text]
(string/replace text #"([\.,!?])" " $1 "))
(defn remove-quotes
"...and hyphens"
[text]
(string/replace text #"[\"-]" ""))
(defn remove-formatting-characters
"Input has underscores, presumably because the text
might be rendered by something that can italicize or bold text.
We'll just ignore them for now."
[text]
(string/replace text #"[_*]" ""))
(defn tokenize [text]
(-> text
remove-formatting-characters
prep-punctuation-for-tokenization
remove-quotes
string/lower-case
(string/split #"[\n ]+")))
(defn interleave-all
"Like interleave, but instead of ending the interleave when the shortest collection
has been consumed, continues to interleave the remaining collections."
{:added "1.0"
:static true}
([] ())
([c1] (lazy-seq c1))
([c1 c2]
(lazy-seq
(let [s1 (seq c1) s2 (seq c2)]
(if (and s1 s2)
(cons (first s1) (cons (first s2)
(interleave-all (rest s1) (rest s2))))
(lazy-seq (or s1 s2))))))
([c1 c2 & colls]
(lazy-seq
(let [ss (->> (map seq (conj colls c2 c1))
(remove nil?))]
(when ss
(concat (map first ss) (apply interleave-all (map rest ss))))))))
(comment
(let [tokens [1 2 3 4 5]
p1 (partition 1 1 tokens)
p2 (partition 2 1 tokens)
p3 (partition 3 1 tokens)]
(interleave-all p1 p2 p3)))
(defn ngramify-tokens [n m tokens]
(let [partition-colls (map #(partition % 1 tokens) (range n m))
ngrams (apply interleave-all partition-colls)]
ngrams))
(comment
(->> (tokenize corpus)
(take 5)
(ngramify-tokens 1 4))
;; => (("the")
;; ("the" "thousand")
;; ("the" "thousand" "injuries")
;; ("thousand")
;; ("thousand" "injuries")
;; ("thousand" "injuries" "of")
;; ("injuries")
;; ("injuries" "of")
;; ("injuries" "of" "fortunato")
;; ("of")
;; ("of" "fortunato")
;; ("fortunato"))
)
(defn add-terminal-value-to-ngram
"The Trie expects entries to be of the form '(k1 k2 k3 value).
The ngrams generated above are just '(k1 k2 k3).
This adds a value that is simply the ngram itself:
'(k1 k2 k3 '(k1 k2 k3))."
[ngram]
(concat ngram (list ngram)))
(defn trie->frequency-of-frequencies-map
"The second argument to this function specifies which rank you
want to get the map for."
[trie n]
(->> trie
(trie/children-at-depth n)
(map (comp :count second first seq))
frequencies
(into (sorted-map))))
(comment
(trie->frequency-of-frequencies-map trie 1)
;; => {1 558,
;; 2 110,
;; ,,,
;; 167 1,
;; 177 1}
)
;; The frequency of a thus-far unseen species is the number of species seen once over the
;; total number of species.
;; That's commonly referred to as P0
;; There will be a different P0 for each rank of N-gram.
(defn P0 [trie n]
(let [freq-map (trie->frequency-of-frequencies-map trie n)]
(/ (freq-map 1) (apply + (vals freq-map)))))
(comment
(P0 trie 1)
;; => 31/45
)
;; From here on out, we follow a similar procedure.
;; What we just did, P0, is the probability of seeing something
;; that has been previously unseen.
;; We found that by using what we know about P1 (how many times
;; things have been seen once).
;;
;; Now, we need to adjust our P1 number since we just gave some probability
;; to P0, which previously had no probability since it wasn't in our
;; frequency table.
;;
;; What's the new probability that the next thing we see is from the group of
;; n-grams that we've seen once?
;;
;; The same way P0 was based off P1, P1 will be based off P2.
;;
;; It's basically 2 * the number of times we've seen things twice divided
;; by the total number of things we've seen.
;;
;; P0 was 1 * number of 1-time things / total number of n-time things.
;; P1 is 2 * number of 2-time things / total number of n-time things.
;; P2 is 3 * number of 3-time things / total number of n-time things.
;;
;; With a slight adjustment. The frequency of frequencies needs to be smoothed
;; so there are no 0-values. When you get up to P14, P15, etc... there might be gaps
;; where you'll see P14 1-time, then won't see anything 15 or 16 times, so P15 and P16 will
;; be 0, then you'll see something 17 times twice.
;;
;; This is just noise from having limited data. The noise needs to be smoothed out.
(defn simple-good-turing-map [trie n]
(let [freq-map (trie->frequency-of-frequencies-map trie n)
xs (->> freq-map keys (map #(Math/log %)))
ys (->> freq-map vals (map #(Math/log %)))
sgt (math/sgt (keys freq-map) (vals freq-map))
sgt-map (into (sorted-map) (apply map vector sgt))]
sgt-map))
(comment
(let [freq-map (trie->frequency-of-frequencies-map trie 2)
xs (->> freq-map keys (map #(Math/log %)))
ys (->> freq-map vals (map #(Math/log %)))
sgt (math/sgt (keys freq-map) (vals freq-map))
sgt-map (into (sorted-map) (apply map vector sgt))
sgt-with-counts (math/sgt-with-counts (keys freq-map)
(vals freq-map))
c1 (freq-map 1)
c1* (sgt-map 1)]
[c1 c1* sgt-with-counts])
)
;; Maximum Likelihood Estimate
;;
;; It was about dusk, one evening during the supreme madness of the
;; carnival season, that I encountered my friend. He accosted me with
;; excessive warmth, for he had been drinking much. The man wore motley.
;; He had on a tight-fitting parti-striped dress, and his head was
;; surmounted by the conical cap and bells. I was so pleased to see him,
;; that I thought I should never have done wringing his hand.
;;
;; Consider 3-grams...
;;
;; it was about
;; it was there
;;
;; Let `N` be a sample text size and `nr` be the number of
;; m-grams which occurred in the text exactly `r` times.
;;
;; So that `N` = (apply + (map #(* r nr) frequency-of-frequencies)
;; `N` = sum for all seen-counts ("number of things seen 'count' times" * 'count')
;; 10 things seen 5 times
;; 4 things seen 4 times
;; 2 things seen 1 time
;;
;; 10 things seen 5 times each makes up 50 "things"
;; 4 things seen 4 times each makes up 16 "things
;; 2 things seen once each makes up 2 "things"
;;
;; Makes for `N` = 50 + 16 + 2 things... 68 things (m-grams).
;;
;; Consider the m-gram "it was about" occurred 4 times.
;; And in total we saw 60 3-grams. Then the MLE
;; is 4 / 60.
;;
;;
;;;; Base MLE
;;
;; Disount of the n-gram
;; *
;; Count of n-gram
;; /
;; Count of n-1-gram
(defn maximum-likelihood-estimate [trie trie-database n-gram]
(/ (get-in trie-database [n-gram :count])
(get-in trie-database [(butlast n-gram) :count])))
(comment
(maximum-likelihood-estimate trie trie-database '("," "the"))
(maximum-likelihood-estimate trie trie-database '(","))
(let [[rs nrs ests lgts]
(apply
math/sgt-with-counts
(apply map vector (seq (trie->frequency-of-frequencies-map trie 2))))]
[rs nrs ests lgts])
)
;;;; KATZ ;;;;
;; (defn N [trie n-gram-rank]
;; (let [r->Nr (trie->frequency-of-frequencies-map trie n-gram-rank)]
;; (apply + (map (fn [[r nr]] (* r nr)) r->Nr))))
;; (defn r* [trie n-gram-rank]
;; (let [r->Nr (trie->frequency-of-frequencies-map trie n-gram-rank)
;; _ _ _ r*s]))
;;;;
(defn zipper-leaf-path-seq
[zipper]
(->> zipper
(iterate zip/next)
(take-while (complement zip/end?))
(filter (complement zip/branch?))
(map zip/path)
(map (partial map first))
(filter (comp tb2/words last))))
(comment
(def target-grammar-structure
'(TOP (S (NP (WDT)) (VP (VBD) (NP (DT) (NN))))))
(reverse (zipper-leaf-path-seq (zip/seq-zip target-grammar-structure)))
(defn decode-fn
"Decodes a variable-length encoded number from a byte-buffer.
Zero gets decoded to nil."
[byte-buffer]
(let [value (encoding/decode byte-buffer)]
(if (zero? value)
nil
value)))
(def tpt (tpt/load-tightly-packed-trie-from-file
(io/resource "dark-corpus-4-gram-backwards-tpt.bin")
decode-fn))
(def database (nippy/thaw-from-file (io/resource "dark-corpus-4-gram-backwards-db.bin")))
(def example-story
(loop [generated-text (vec (repeat 3 (get database "</s>")))
i 0]
(if (> i 20)
generated-text
(let [children (loop [i 4]
(let [node
(trie/lookup
tpt
(vec (take-last i generated-text)))
children
(and node (trie/children node))]
(cond
(nil? node) (recur (dec i))
(< i 0) (throw (Exception. "Error"))
(seq children) children
:else (recur (dec i)))))]
(recur
(conj
generated-text
(->> children
(map #(vector (.key %) (get % [])))
(remove (comp nil? second))
(#(math/weighted-selection
(fn [[_ c]] c)
%))
first))
(inc i))))))
(map database example-story)
)
(defn syllabify-phrase
[phrase]
(->> phrase
(#(string/split % #" "))
(map phonetics/get-phones)
(map first)
(map syllabify/syllabify)
(reduce into [])))
(defn markov-choice
[trie generated-text k xf-filter]
(let [node (trie/lookup trie k)
children (and node
(->> node
trie/children
(map #(vector (.key %) (get % [])))
(remove (comp nil? second))))
choices (transduce
(comp
(map (fn [child]
(vector generated-text child)))
xf-filter) conj children)]
(cond
(nil? node) (recur trie generated-text (butlast k) xf-filter)
(seq children)
(if (< (rand) (/ (apply max (map second children))
(apply + (map second children))))
(recur trie generated-text (butlast k) xf-filter)
(first
(math/weighted-selection
(fn [[_ c]] c)
choices)))
(> (count k) 0) (recur trie generated-text (butlast k) xf-filter)
:else (throw (Exception. "Error")))))
(defn syllable-count-pred
[syllable-count database]
(fn [node]
(let [syllables (syllabify-phrase (database (get node [])))]
(= syllable-count
(count syllables)))))
(defn markov-select
[n]
(fn [{:keys [trie database xf-filter tokens] :as context}]
(loop [n n]
(if (= n 0)
;; Unable to find a selection
nil
(let [key (take-last n tokens)
node (trie/lookup trie key)
children (and node (->> (trie/children node)
(remove nil?)))
choices (transduce
(comp
(map #(vector (.key %) (get % [])))
(map (fn [child] [context child]))
xf-filter)
conj
children)]
(let [freqs (map #(get % []) children)]
(cond
(or (empty? choices) (empty? freqs))
(recur (dec n))
(and
(> n 1)
(< (rand)
(/ (apply max freqs)
(apply + freqs))))
(recur
(dec n))
:else
(let [result (second (math/weighted-selection
(comp second second)
choices))]
(first result)))))))))
(defn generate-sentence
[{:keys [trie database stop? xf-filter tokens] :as context}]
(let [markov-fn (markov-select 4)]
(loop [context (assoc context :i 0)]
(let [tokens (:tokens context)]
(cond
(stop? context)
tokens
:else
(let [selection (markov-fn context)]
(if (nil? selection)
(do
(println tokens)
(throw (Exception. "No possible selection")))
(recur (update
context
:tokens
conj
selection)))))))))
(comment
(trie/lookup tpt '(1 1 1))
(let [context {:tokens (vec (repeat 3 (database "</s>")))
:trie tpt
:database database
:stop? (fn [{:keys [tokens] :as context}]
(let [sentence (->> tokens
(map database)
(remove #{"</s>"})
(string/join " "))]
(<= 10 (count (syllabify-phrase sentence)))))
:xf-filter (comp
(remove
(fn [[context [k v]]]
(= k 7)))
(filter
(fn [[context [k v]]]
(let [current-sentence
(->> (:tokens context)
(map database)
(remove #{"</s>"})
(string/join " "))
current-syllable-count
(count (syllabify-phrase current-sentence))
current-word (database k)
current-word-syllable-count (count (syllabify-phrase current-word))]
(>= (- 10 current-syllable-count)
current-word-syllable-count)))))}]
(->> (generate-sentence context)
(map database)))
(database "<s>")
)
#_(defn generate-sentence
[trie database stop? filters]
(loop [generated-text (vec (repeat 3 (get database "</s>")))
i 0]
(cond
(> i 400)
nil
(stop? generated-text)
generated-text
;; reset
(or (zero? (mod i 40))
(> syllable-count target-syllable-count))
(recur (vec (repeat 3 (get database "</s>"))) 0)
:else
(let [choice (markov-choice
trie
(take-last 4 generated-text)
filters)]
(recur
(conj
generated-text
choice)
(inc i))))))
(comment
(let [disallow-sentence-start-xf
(remove (= (database (first %)) "<s>") children)
])
(map database (generate-sentence tpt database 10))
(def grammar-freqs (nippy/thaw-from-file "/tmp/grammar-freqs-top-100.bin"))
(def part-of-speech-freqs (nippy/thaw-from-file "/tmp/part-of-speech-freqs.bin"))
(take 100 part-of-speech-freqs)
(loop [generated-text (vec (repeat 3 (get database "</s>")))
i 0]
(let [current-sentence
(take-while
(complement (into #{} (map database ["<s>" "</s>"])))
(reverse generated-text))]
(if (> i 20)
generated-text
(let [children (loop [i 4]
(let [node
(trie/lookup
tpt
(vec (take-last i generated-text)))
children
(and node (trie/children node))]
(cond
(nil? node) (recur (dec i))
(< i 0) (throw (Exception. "Error"))
(seq children) children
:else (recur (dec i)))))]
(recur
(conj
generated-text
(->> children
(map #(vector (.key %) (get % [])))
(remove (comp nil? second))
(#(math/weighted-selection
(fn [[_ c]] c)
%))
first))
(inc i))))))
)

@ -232,29 +232,34 @@
Porcelain. If you have the simple tree data structure
returned by `parse-to-simple-tree`, then you can just
pass that directly to `zip/seq-zip`."
[texts]
(let [tree (->> texts
(map tokenize)
(map (partial string/join " "))
[text]
(let [tree (->> text
tokenize
(string/join " ")
vector
parse
(map tb/make-tree)
first
tb/make-tree
unmake-tree)]
(zip/seq-zip tree)))
(comment
(let [texts ["Eric's test is difficult."]]
(loop [zipper (treebank-zipper texts)]
;; Here is a demo of zipping through a parse tree and changing
;; all adjectives to "thorough".
(let [text "Eric's test is difficult."]
(loop [zipper (treebank-zipper text)]
(cond
(zip/end? zipper) (zip/root zipper)
(= 'JJ (zip/node zipper)) (recur (-> zipper
zip/next
(zip/replace '("thorough"))))
:else (recur (zip/next zipper)))))
;; => ((TOP
;; ((S
;; ((NP ((NP ((NNP ("Eric")) (POS ("'s")))) (NN ("test"))))
;; (VP ((VBZ ("is")) (ADJP ((JJ ("thorough"))))))
;; (. (".")))))))
;; => (TOP
;; ((S
;; ((NP ((NP ((NNP ("Eric")) (POS ("'s")))) (NN ("test"))))
;; (VP ((VBZ ("is")) (ADJP ((JJ ("thorough"))))))
;; (. ("."))))))
)
(defn iter-zip
@ -341,6 +346,9 @@
(leaf-pos-path-word-freqs zipper))
(comment
(treebank-zipper ["Eric's test is difficult."
"Eric's test is thorough."
"Eric's testing."])
(let [zipper (treebank-zipper ["Eric's test is difficult."
"Eric's test is thorough."
"Eric's testing."])]
@ -406,6 +414,9 @@
"you are a test"]]
(grammar-tree-frequencies
document))
(grammar-tree-frequencies ["this is a test."])
(parse-to-simple-tree ["this is a test."])
;; => {(TOP (S (NP (WDT)) (VP (VBD) (NP (DT) (NN))))) 1,
;; (TOP (S (NP (DT)) (VP (VBZ) (NP (DT) (NN))))) 2,
;; (TOP (S (NP (PRP)) (VP (VBP) (NP (DT) (NN))))) 1}
@ -786,7 +797,7 @@
(remove #(string? (first %)))))
(comment
(phrase-constituents "My name is Eric.")
(phrase-constituents ["My name is Eric."])
;; => ((TOP (S)) (S (NP VP .)) (NP (PRP$ NN)) (VP (VBZ NP)) (NP (NNP)))
(phrase-constituents "How are you?")
;; => ((TOP (SBARQ)) (SBARQ (WHADVP SQ .)) (WHADVP (WRB)) (SQ (VBP NP)) (NP (PRP)))
@ -833,6 +844,7 @@
"My hat is blue and I like cake."
"Your name is Taylor."
"How are you?"])
;; => {TOP {(S) 3, (SBARQ) 1},
;; S {(NP VP .) 2, (S CC S .) 1, (NP VP) 2},
;; NP {(PRP$ NN) 3, (NNP) 2, (PRP) 2, (NN) 1},
@ -1002,6 +1014,7 @@
"Your name is not Eric."
"Who is your mother and what does she do?"]
(pos-constituent-frequencies)
#_#_(apply
merge-with
(fn [a b]
@ -1061,10 +1074,20 @@
)
(defn most-likely-parts-of-speech
[phrase]
(top-k-sequences prhyme-pos-tagger (tokenize phrase)))
(comment
(let [text ["bother me"]]
(->> text
(map tokenize)
(map #(top-k-sequences prhyme-pos-tagger %))))
(let [text "a dog"]
(first
(map #(.getOutcomes %)
(most-likely-parts-of-speech text))))
;; => ["PRP" "VBP" "DT" "NN" "."]
(map (juxt #(.getOutcomes %)
#(map float (.getProbs %)))
(top-k-sequences prhyme-pos-tagger (tokenize "")))
)

@ -29,6 +29,7 @@
;; of a word. So it should be e.lip.sis
;; As an alternative to handling the isolated "s"-at-the-end-of-internal-coda case,
;; it works well-enough for me to treat all fricatives as lowest priority.
(def ^clojure.lang.PersistentVector sonority-hierarchy
["vowel" "liquid" "affricate" "fricative" "nasal" "stop" "semivowel" "aspirate"])

@ -286,6 +286,50 @@
[lgt-estimate lgt?]
[turing-estimate lgt?]))))))))
(defn smoothed-frequencies
[rs nrs]
(let [l (count rs)
N (apply + (map #(apply * %) (map vector rs nrs)))
p0 (/ (first nrs) N)
zrs (average-consecutives rs nrs)
log-rs (map #(Math/log %) rs)
log-zrs (map #(Math/log %) zrs)
lm (least-squares-linear-regression log-rs log-zrs)
lgts (map lm rs)
estimations (loop [coll rs
lgt? false
e (estimator lm rs zrs)
estimations []]
(cond
(empty? coll) estimations
:else
(let [[estimation lgt?] (e (first coll) lgt?)]
(recur
(rest coll)
lgt?
e
(conj estimations estimation)))))
N* (apply + (map #(apply * %) (map vector nrs estimations)))
probs (cons
(float p0)
(map #(* (- 1 p0) (/ % N*)) estimations))
sum-probs (apply + probs)]
[lgts
(map
(fn [r]
(* (inc r) (/ (lm (inc r)) (lm r))))
(partition 2 1 (conj rs (inc (peek rs)))))]))
(comment
(let [rs [ 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26]
nrs [32 20 10 3 1 2 1 1 1 2 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1]
rs [1 2 3 4 5 6 7 8 9 10 12 26]
nrs [32 20 10 3 1 2 1 1 1 2 1 1]]
(smoothed-frequencies rs nrs))
)
(defn sgt [rs nrs]
(assert (and (not-empty nrs) (not-empty rs))
"frequencies and frequency-of-frequencies can't be empty")
@ -316,7 +360,8 @@
(map #(* (- 1 p0) (/ % N*)) estimations))
sum-probs (apply + probs)]
[(cons 0 rs)
(map #(/ % sum-probs) probs)]))
(map #(/ % sum-probs) probs)
estimations]))
(comment
(let [rs [ 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26]

Loading…
Cancel
Save