Generate with some grammar rules

Eric Ihli 4 years ago
parent dc5340be7f
commit dd51e0fdca

@ -2,16 +2,18 @@
(:require [clojure.string :as string] (:require [clojure.string :as string]
[ :as io] [ :as io]
[ :as dict] [ :as dict]
[com.owoga.prhyme.nlp.core :as nlp]
[com.owoga.trie :as trie] [com.owoga.trie :as trie]
[com.owoga.tightly-packed-trie :as tpt] [com.owoga.tightly-packed-trie :as tpt]
[com.owoga.tightly-packed-trie.encoding :as encoding] [com.owoga.tightly-packed-trie.encoding :as encoding]
[taoensso.nippy :as nippy])) [taoensso.nippy :as nippy]
[com.owoga.prhyme.nlp.tag-sets.treebank-ii :as tb2]))
(def re-word (def re-word
"Regex for tokenizing a string into words "Regex for tokenizing a string into words
(including contractions and hyphenations), (including contractions and hyphenations),
commas, periods, and newlines." commas, periods, and newlines."
#"(?s).*?([a-zA-Z\d]+(?:['\-]?[a-zA-Z]+)?|,|\.|\n)") #"(?s).*?([a-zA-Z\d]+(?:['\-]?[a-zA-Z]+)?|,|\.|\?|\n)")
(defn xf-file-seq [start end] (defn xf-file-seq [start end]
(comp (remove #(.isDirectory %)) (comp (remove #(.isDirectory %))
@ -38,6 +40,23 @@
(map (partial map second)) (map (partial map second))
(map (partial mapv string/lower-case)))) (map (partial mapv string/lower-case))))
(def xf-untokenize
(map #(string/join " " %))
(map #(string/replace % #" (['\-,\?\.] ?)" "$1"))))
(let [tokens (transduce
["Eric's name, is Bond." "James, bond? Yes."])]
(map #(string/join " " %) tokens)
(def xf-filter-english (def xf-filter-english
(let [word? (fn [x] (or (#{"." "?" ","} x) (let [word? (fn [x] (or (#{"." "?" ","} x)
(dict/cmu-with-stress-map x)))] (dict/cmu-with-stress-map x)))]
@ -75,6 +94,34 @@
k)] k)]
[k' 1]))) [k' 1])))
(defn xf-part-of-speech-database
(fn [sentence]
(let [leafs (->> sentence
(fn [[k v]]
(merge-with + (@database k) v)))
(let [database (atom {})]
(map (partial mapv (part-of-speech-database database)))
[["this test is difficult"]
["this foot is sore"]])
(def encode-fn (def encode-fn
"Encodes a number as a variable-length encoded value. "Encodes a number as a variable-length encoded value.
nil gets encoded as 0." nil gets encoded as 0."
@ -159,3 +206,108 @@
(map (fn [[k v]] [k (map database k) v]))))) (map (fn [[k v]] [k (map database k) v])))))
) )
(defn xf-grammar-database
(fn [sentence]
(let [leafs (->> sentence
(fn [[k v]]
(merge-with + (@database k) v)))
(defn file-seq->grammar-tree
(xf-file-seq 0 1000)
(map slurp)
(map #(string/split % #"[\n+\?\.]"))
(map (partial transduce xf-tokenize conj))
(map (partial transduce xf-filter-english conj))
(map (partial remove empty?))
(remove empty?)
(map (partial transduce xf-untokenize conj))
(map nlp/grammar-tree-frequencies)
(map (partial into {})))
(sort-by (comp - second) acc))
([acc m]
(merge-with + acc m)))
(->> (file-seq->grammar-tree
(file-seq (io/file "dark-corpus")))
(take 100)
(nippy/freeze-to-file "/tmp/grammar-freqs-top-100.bin")))
(def grammar-freqs (nippy/thaw-from-file "/tmp/grammar-freqs-top-100.bin"))
(take 10 grammar-freqs)
(defn file-seq->part-of-speech-freqs
(xf-file-seq 0 1000)
(map slurp)
(map #(string/split % #"[\n+\?\.]"))
(map (partial transduce xf-tokenize conj))
(map (partial transduce xf-filter-english conj))
(map (partial remove empty?))
(remove empty?)
(map (partial transduce xf-untokenize conj))
(map (partial map nlp/treebank-zipper))
(map (partial map nlp/leaf-pos-path-word-freqs))
(map (partial reduce (fn [acc m]
(nlp/deep-merge-with + acc m)) {})))
(fn [result input]
(nlp/deep-merge-with + result input)))
(time (->> (file-seq->part-of-speech-freqs
(file-seq (io/file "dark-corpus")))
(nippy/freeze-to-file "/tmp/part-of-speech-freqs.bin")))
(def parts-of-speech-freqs
(nippy/thaw-from-file "/tmp/part-of-speech-freqs.bin"))
(take 20 parts-of-speech-freqs)
(defn file-seq->parts-of-speech-trie
(xf-file-seq 0 1000)
(map slurp)
(map #(string/split % #"[\n+\?\.]"))
(map (partial transduce xf-tokenize conj))
(map (partial transduce xf-filter-english conj))
(map (partial remove empty?))
(remove empty?)
(map (partial transduce xf-untokenize conj))
(map nlp/grammar-tree-frequencies)
(map (partial into {})))
(sort-by (comp - second) acc))
([acc m]
(merge-with + acc m)))

@ -0,0 +1,544 @@
(ns com.owoga.prhyme.generation.markov-example
(:require [clojure.string :as string]
[ :as io]
[com.owoga.prhyme.util.math :as math]
[com.owoga.phonetics :as phonetics]
[com.owoga.phonetics.syllabify :as syllabify]
[cljol.dig9 :as d]
[ :as zip]
[com.owoga.tightly-packed-trie.bit-manip :as bm]
[com.owoga.trie :as trie]
[com.owoga.tightly-packed-trie.encoding :as encoding]
[com.owoga.tightly-packed-trie :as tpt]
[taoensso.nippy :as nippy]
[com.owoga.prhyme.nlp.tag-sets.treebank-ii :as tb2]
[com.owoga.prhyme.nlp.core :as nlp]))
(def corpus (slurp (io/resource "cask_of_amontillado.txt")))
;; For better generation of text, you'll probably want to pad the starts
;; of sentences with n-1 "start-of-sentence" tokens.
(defn prep-punctuation-for-tokenization
"Puts spaces around punctuation so that they aren't
tokenized with the words they are attached to.
Might add extraneous whitespace, but presumedly that will be ignored/removed
during tokenization."
(string/replace text #"([\.,!?])" " $1 "))
(defn remove-quotes
"...and hyphens"
(string/replace text #"[\"-]" ""))
(defn remove-formatting-characters
"Input has underscores, presumably because the text
might be rendered by something that can italicize or bold text.
We'll just ignore them for now."
(string/replace text #"[_*]" ""))
;; Splits `text` into a vector of tokens, breaking on runs of newlines
;; and/or spaces.
;; NOTE(review): the cleanup helpers defined earlier in this namespace
;; (punctuation spacing, quote removal, formatting-character removal) are
;; not applied in this body as shown — confirm whether callers are
;; expected to apply them before tokenizing.
(defn tokenize [text]
(-> text
(string/split #"[\n ]+")))
(defn interleave-all
"Like interleave, but instead of ending the interleave when the shortest collection
has been consumed, continues to interleave the remaining collections."
{:added "1.0"
:static true}
([] ())
([c1] (lazy-seq c1))
([c1 c2]
(let [s1 (seq c1) s2 (seq c2)]
(if (and s1 s2)
(cons (first s1) (cons (first s2)
(interleave-all (rest s1) (rest s2))))
(lazy-seq (or s1 s2))))))
([c1 c2 & colls]
(let [ss (->> (map seq (conj colls c2 c1))
(remove nil?))]
(when ss
(concat (map first ss) (apply interleave-all (map rest ss))))))))
(let [tokens [1 2 3 4 5]
p1 (partition 1 1 tokens)
p2 (partition 2 1 tokens)
p3 (partition 3 1 tokens)]
(interleave-all p1 p2 p3)))
(defn ngramify-tokens [n m tokens]
(let [partition-colls (map #(partition % 1 tokens) (range n m))
ngrams (apply interleave-all partition-colls)]
(->> (tokenize corpus)
(take 5)
(ngramify-tokens 1 4))
;; => (("the")
;; ("the" "thousand")
;; ("the" "thousand" "injuries")
;; ("thousand")
;; ("thousand" "injuries")
;; ("thousand" "injuries" "of")
;; ("injuries")
;; ("injuries" "of")
;; ("injuries" "of" "fortunato")
;; ("of")
;; ("of" "fortunato")
;; ("fortunato"))
(defn add-terminal-value-to-ngram
"The Trie expects entries to be of the form '(k1 k2 k3 value).
The ngrams generated above are just '(k1 k2 k3).
This adds a value that is simply the ngram itself:
'(k1 k2 k3 '(k1 k2 k3))."
(concat ngram (list ngram)))
(defn trie->frequency-of-frequencies-map
  "Returns a sorted map of r -> Nr for the entries found at depth `n` of
  `trie`: each key is an occurrence count and each value is the number of
  distinct n-grams that were seen exactly that many times.

  The second argument to this function specifies which rank you
  want to get the map for."
  [trie n]
  (->> trie
       (trie/children-at-depth n)
       ;; Presumably each child is seqable into [key value] entries whose
       ;; value carries a :count — take the :count of the first entry.
       (map (comp :count second first seq))
       ;; Tally how many n-grams share each count. Without this step the
       ;; bare counts would be poured straight into the map, which throws
       ;; because a number is not a map entry.
       frequencies
       (into (sorted-map))))
(trie->frequency-of-frequencies-map trie 1)
;; => {1 558,
;; 2 110,
;; ,,,
;; 167 1,
;; 177 1}
;; The frequency of a thus-far unseen species is the number of species seen once over the
;; total number of species.
;; That's commonly referred to as P0
;; There will be a different P0 for each rank of N-gram.
(defn P0
  "Probability mass reserved for n-grams of rank `n` that have not been
  seen yet: the number of n-grams seen exactly once divided by the sum of
  all frequency-of-frequency values for that rank.

  NOTE(review): the denominator here is the number of distinct n-grams
  (sum of Nr). Good-Turing is usually stated with the total number of
  observations (sum of r * Nr) — confirm which is intended."
  [trie n]
  (let [r->nr     (trie->frequency-of-frequencies-map trie n)
        seen-once (r->nr 1)
        total     (reduce + (vals r->nr))]
    (/ seen-once total)))
(P0 trie 1)
;; => 31/45
;; From here on out, we follow a similar procedure.
;; What we just did, P0, is the probability of seeing something
;; that has been previously unseen.
;; We found that by using what we know about P1 (how many times
;; things have been seen once).
;; Now, we need to adjust our P1 number since we just gave some probability
;; to P0, which previously had no probability since it wasn't in our
;; frequency table.
;; What's the new probability that the next thing we see is from the group of
;; n-grams that we've seen once?
;; The same way P0 was based off P1, P1 will be based off P2.
;; It's basically 2 * the number of times we've seen things twice divided
;; by the total number of things we've seen.
;; P0 was 1 * number of 1-time things / total number of n-time things.
;; P1 is 2 * number of 2-time things / total number of n-time things.
;; P2 is 3 * number of 3-time things / total number of n-time things.
;; With a slight adjustment. The frequency of frequencies needs to be smoothed
;; so there are no 0-values. When you get up to P14, P15, etc... there might be gaps
;; where you'll see P14 1-time, then won't see anything 15 or 16 times, so P15 and P16 will
;; be 0, then you'll see something 17 times twice.
;; This is just noise from having limited data. The noise needs to be smoothed out.
(defn simple-good-turing-map [trie n]
(let [freq-map (trie->frequency-of-frequencies-map trie n)
xs (->> freq-map keys (map #(Math/log %)))
ys (->> freq-map vals (map #(Math/log %)))
sgt (math/sgt (keys freq-map) (vals freq-map))
sgt-map (into (sorted-map) (apply map vector sgt))]
(let [freq-map (trie->frequency-of-frequencies-map trie 2)
xs (->> freq-map keys (map #(Math/log %)))
ys (->> freq-map vals (map #(Math/log %)))
sgt (math/sgt (keys freq-map) (vals freq-map))
sgt-map (into (sorted-map) (apply map vector sgt))
sgt-with-counts (math/sgt-with-counts (keys freq-map)
(vals freq-map))
c1 (freq-map 1)
c1* (sgt-map 1)]
[c1 c1* sgt-with-counts])
;; Maximum Likelihood Estimate
;; It was about dusk, one evening during the supreme madness of the
;; carnival season, that I encountered my friend. He accosted me with
;; excessive warmth, for he had been drinking much. The man wore motley.
;; He had on a tight-fitting parti-striped dress, and his head was
;; surmounted by the conical cap and bells. I was so pleased to see him,
;; that I thought I should never have done wringing his hand.
;; Consider 3-grams...
;; it was about
;; it was there
;; Let `N` be a sample text size and `nr` be the number of
;; m-grams which occurred in the text exactly `r` times.
;; So that `N` = (apply + (map (fn [[r nr]] (* r nr)) frequency-of-frequencies))
;; `N` = sum for all seen-counts ("number of things seen 'count' times" * 'count')
;; 10 things seen 5 times
;; 4 things seen 4 times
;; 2 things seen 1 time
;; 10 things seen 5 times each makes up 50 "things"
;; 4 things seen 4 times each makes up 16 "things"
;; 2 things seen once each makes up 2 "things"
;; Makes for `N` = 50 + 16 + 2 things... 68 things (m-grams).
;; Suppose the m-gram "it was about" occurred 4 times
;; and in total we saw 60 3-grams. Then the MLE
;; is 4 / 60.
;;;; Base MLE
;; Discount of the n-gram
;; *
;; Count of n-gram
;; /
;; Count of n-1-gram
(defn maximum-likelihood-estimate
  "Unsmoothed estimate for `n-gram`: its :count in `trie-database` divided
  by the :count stored for the same n-gram without its last token.

  `trie` is accepted but not consulted. Both counts are looked up with
  `get-in`, so a missing entry yields nil and the division throws."
  [trie trie-database n-gram]
  (let [count-of (fn [k] (get-in trie-database [k :count]))
        observed (count-of n-gram)
        context  (count-of (butlast n-gram))]
    (/ observed context)))
(maximum-likelihood-estimate trie trie-database '("," "the"))
(maximum-likelihood-estimate trie trie-database '(","))
(let [[rs nrs ests lgts]
(apply map vector (seq (trie->frequency-of-frequencies-map trie 2))))]
[rs nrs ests lgts])
;;;; KATZ ;;;;
;; (defn N [trie n-gram-rank]
;; (let [r->Nr (trie->frequency-of-frequencies-map trie n-gram-rank)]
;; (apply + (map (fn [[r nr]] (* r nr)) r->Nr))))
;; (defn r* [trie n-gram-rank]
;; (let [r->Nr (trie->frequency-of-frequencies-map trie n-gram-rank)
;; _ _ _ r*s]))
(defn zipper-leaf-path-seq
(->> zipper
(iterate zip/next)
(take-while (complement zip/end?))
(filter (complement zip/branch?))
(map zip/path)
(map (partial map first))
(filter (comp tb2/words last))))
(def target-grammar-structure
'(TOP (S (NP (WDT)) (VP (VBD) (NP (DT) (NN))))))
(reverse (zipper-leaf-path-seq (zip/seq-zip target-grammar-structure)))
(defn decode-fn
"Decodes a variable-length encoded number from a byte-buffer.
Zero gets decoded to nil."
(let [value (encoding/decode byte-buffer)]
(if (zero? value)
(def tpt (tpt/load-tightly-packed-trie-from-file
(io/resource "dark-corpus-4-gram-backwards-tpt.bin")
(def database (nippy/thaw-from-file (io/resource "dark-corpus-4-gram-backwards-db.bin")))
(def example-story
(loop [generated-text (vec (repeat 3 (get database "</s>")))
i 0]
(if (> i 20)
(let [children (loop [i 4]
(let [node
(vec (take-last i generated-text)))
(and node (trie/children node))]
(nil? node) (recur (dec i))
(< i 0) (throw (Exception. "Error"))
(seq children) children
:else (recur (dec i)))))]
(->> children
(map #(vector (.key %) (get % [])))
(remove (comp nil? second))
(fn [[_ c]] c)
(inc i))))))
(map database example-story)
(defn syllabify-phrase
(->> phrase
(#(string/split % #" "))
(map phonetics/get-phones)
(map first)
(map syllabify/syllabify)
(reduce into [])))
(defn markov-choice
[trie generated-text k xf-filter]
(let [node (trie/lookup trie k)
children (and node
(->> node
(map #(vector (.key %) (get % [])))
(remove (comp nil? second))))
choices (transduce
(map (fn [child]
(vector generated-text child)))
xf-filter) conj children)]
(nil? node) (recur trie generated-text (butlast k) xf-filter)
(seq children)
(if (< (rand) (/ (apply max (map second children))
(apply + (map second children))))
(recur trie generated-text (butlast k) xf-filter)
(fn [[_ c]] c)
(> (count k) 0) (recur trie generated-text (butlast k) xf-filter)
:else (throw (Exception. "Error")))))
(defn syllable-count-pred
  "Builds a predicate over trie nodes. The returned function reads the
  node's value at the empty key path, looks that value up in `database`,
  syllabifies the result with `syllabify-phrase`, and returns true when
  the number of syllables equals `syllable-count`."
  [syllable-count database]
  (fn [node]
    (-> (get node [])
        database
        syllabify-phrase
        count
        (= syllable-count))))
(defn markov-select
(fn [{:keys [trie database xf-filter tokens] :as context}]
(loop [n n]
(if (= n 0)
;; Unable to find a selection
(let [key (take-last n tokens)
node (trie/lookup trie key)
children (and node (->> (trie/children node)
(remove nil?)))
choices (transduce
(map #(vector (.key %) (get % [])))
(map (fn [child] [context child]))
(let [freqs (map #(get % []) children)]
(or (empty? choices) (empty? freqs))
(recur (dec n))
(> n 1)
(< (rand)
(/ (apply max freqs)
(apply + freqs))))
(dec n))
(let [result (second (math/weighted-selection
(comp second second)
(first result)))))))))
(defn generate-sentence
[{:keys [trie database stop? xf-filter tokens] :as context}]
(let [markov-fn (markov-select 4)]
(loop [context (assoc context :i 0)]
(let [tokens (:tokens context)]
(stop? context)
(let [selection (markov-fn context)]
(if (nil? selection)
(println tokens)
(throw (Exception. "No possible selection")))
(recur (update
(trie/lookup tpt '(1 1 1))
(let [context {:tokens (vec (repeat 3 (database "</s>")))
:trie tpt
:database database
:stop? (fn [{:keys [tokens] :as context}]
(let [sentence (->> tokens
(map database)
(remove #{"</s>"})
(string/join " "))]
(<= 10 (count (syllabify-phrase sentence)))))
:xf-filter (comp
(fn [[context [k v]]]
(= k 7)))
(fn [[context [k v]]]
(let [current-sentence
(->> (:tokens context)
(map database)
(remove #{"</s>"})
(string/join " "))
(count (syllabify-phrase current-sentence))
current-word (database k)
current-word-syllable-count (count (syllabify-phrase current-word))]
(>= (- 10 current-syllable-count)
(->> (generate-sentence context)
(map database)))
(database "<s>")
#_(defn generate-sentence
[trie database stop? filters]
(loop [generated-text (vec (repeat 3 (get database "</s>")))
i 0]
(> i 400)
(stop? generated-text)
;; reset
(or (zero? (mod i 40))
(> syllable-count target-syllable-count))
(recur (vec (repeat 3 (get database "</s>"))) 0)
(let [choice (markov-choice
(take-last 4 generated-text)
(inc i))))))
(let [disallow-sentence-start-xf
(remove (= (database (first %)) "<s>") children)
(map database (generate-sentence tpt database 10))
(def grammar-freqs (nippy/thaw-from-file "/tmp/grammar-freqs-top-100.bin"))
(def part-of-speech-freqs (nippy/thaw-from-file "/tmp/part-of-speech-freqs.bin"))
(take 100 part-of-speech-freqs)
(loop [generated-text (vec (repeat 3 (get database "</s>")))
i 0]
(let [current-sentence
(complement (into #{} (map database ["<s>" "</s>"])))
(reverse generated-text))]
(if (> i 20)
(let [children (loop [i 4]
(let [node
(vec (take-last i generated-text)))
(and node (trie/children node))]
(nil? node) (recur (dec i))
(< i 0) (throw (Exception. "Error"))
(seq children) children
:else (recur (dec i)))))]
(->> children
(map #(vector (.key %) (get % [])))
(remove (comp nil? second))
(fn [[_ c]] c)
(inc i))))))

@ -232,29 +232,34 @@
Porcelain. If you have the simple tree data structure Porcelain. If you have the simple tree data structure
returned by `parse-to-simple-tree`, then you can just returned by `parse-to-simple-tree`, then you can just
pass that directly to `zip/seq-zip`." pass that directly to `zip/seq-zip`."
[texts] [text]
(let [tree (->> texts (let [tree (->> text
(map tokenize) tokenize
(map (partial string/join " ")) (string/join " ")
parse parse
(map tb/make-tree) first
unmake-tree)] unmake-tree)]
(zip/seq-zip tree))) (zip/seq-zip tree)))
(comment (comment
(let [texts ["Eric's test is difficult."]] ;; Here is a demo of zipping through a parse tree and changing
(loop [zipper (treebank-zipper texts)] ;; all adjectives to "thorough".
(let [text "Eric's test is difficult."]
(loop [zipper (treebank-zipper text)]
(cond (cond
(zip/end? zipper) (zip/root zipper) (zip/end? zipper) (zip/root zipper)
(= 'JJ (zip/node zipper)) (recur (-> zipper (= 'JJ (zip/node zipper)) (recur (-> zipper
zip/next zip/next
(zip/replace '("thorough")))) (zip/replace '("thorough"))))
:else (recur (zip/next zipper))))) :else (recur (zip/next zipper)))))
;; => ((TOP ;; => (TOP
;; ((S ;; ((S
;; ((NP ((NP ((NNP ("Eric")) (POS ("'s")))) (NN ("test")))) ;; ((NP ((NP ((NNP ("Eric")) (POS ("'s")))) (NN ("test"))))
;; (VP ((VBZ ("is")) (ADJP ((JJ ("thorough")))))) ;; (VP ((VBZ ("is")) (ADJP ((JJ ("thorough"))))))
;; (. ("."))))))) ;; (. ("."))))))
) )
(defn iter-zip (defn iter-zip
@ -341,6 +346,9 @@
(leaf-pos-path-word-freqs zipper)) (leaf-pos-path-word-freqs zipper))
(comment (comment
(treebank-zipper ["Eric's test is difficult."
"Eric's test is thorough."
"Eric's testing."])
(let [zipper (treebank-zipper ["Eric's test is difficult." (let [zipper (treebank-zipper ["Eric's test is difficult."
"Eric's test is thorough." "Eric's test is thorough."
"Eric's testing."])] "Eric's testing."])]
@ -406,6 +414,9 @@
"you are a test"]] "you are a test"]]
(grammar-tree-frequencies (grammar-tree-frequencies
document)) document))
(grammar-tree-frequencies ["this is a test."])
(parse-to-simple-tree ["this is a test."])
;; => {(TOP (S (NP (WDT)) (VP (VBD) (NP (DT) (NN))))) 1, ;; => {(TOP (S (NP (WDT)) (VP (VBD) (NP (DT) (NN))))) 1,
;; (TOP (S (NP (DT)) (VP (VBZ) (NP (DT) (NN))))) 2, ;; (TOP (S (NP (DT)) (VP (VBZ) (NP (DT) (NN))))) 2,
;; (TOP (S (NP (PRP)) (VP (VBP) (NP (DT) (NN))))) 1} ;; (TOP (S (NP (PRP)) (VP (VBP) (NP (DT) (NN))))) 1}
@ -786,7 +797,7 @@
(remove #(string? (first %))))) (remove #(string? (first %)))))
(comment (comment
(phrase-constituents "My name is Eric.") (phrase-constituents ["My name is Eric."])
;; => ((TOP (S)) (S (NP VP .)) (NP (PRP$ NN)) (VP (VBZ NP)) (NP (NNP))) ;; => ((TOP (S)) (S (NP VP .)) (NP (PRP$ NN)) (VP (VBZ NP)) (NP (NNP)))
(phrase-constituents "How are you?") (phrase-constituents "How are you?")
@ -833,6 +844,7 @@
"My hat is blue and I like cake." "My hat is blue and I like cake."
"Your name is Taylor." "Your name is Taylor."
"How are you?"]) "How are you?"])
;; => {TOP {(S) 3, (SBARQ) 1}, ;; => {TOP {(S) 3, (SBARQ) 1},
;; S {(NP VP .) 2, (S CC S .) 1, (NP VP) 2}, ;; S {(NP VP .) 2, (S CC S .) 1, (NP VP) 2},
;; NP {(PRP$ NN) 3, (NNP) 2, (PRP) 2, (NN) 1}, ;; NP {(PRP$ NN) 3, (NNP) 2, (PRP) 2, (NN) 1},
@ -1002,6 +1014,7 @@
"Your name is not Eric." "Your name is not Eric."
"Who is your mother and what does she do?"] "Who is your mother and what does she do?"]
(pos-constituent-frequencies) (pos-constituent-frequencies)
#_#_(apply #_#_(apply
merge-with merge-with
(fn [a b] (fn [a b]
@ -1061,10 +1074,20 @@
) )
(defn most-likely-parts-of-speech
(top-k-sequences prhyme-pos-tagger (tokenize phrase)))
(comment (comment
(let [text ["bother me"]] (let [text "a dog"]
(->> text (first
(map tokenize) (map #(.getOutcomes %)
(map #(top-k-sequences prhyme-pos-tagger %)))) (most-likely-parts-of-speech text))))
;; => ["PRP" "VBP" "DT" "NN" "."]
(map (juxt #(.getOutcomes %)
#(map float (.getProbs %)))
(top-k-sequences prhyme-pos-tagger (tokenize "")))
) )

@ -29,6 +29,7 @@
;; of a word. So it should be e.lip.sis ;; of a word. So it should be e.lip.sis
;; As an alternative to handling the isolated "s"-at-the-end-of-internal-coda case, ;; As an alternative to handling the isolated "s"-at-the-end-of-internal-coda case,
;; it works well-enough for me to treat all fricatives as lowest priority. ;; it works well-enough for me to treat all fricatives as lowest priority.
(def ^clojure.lang.PersistentVector sonority-hierarchy (def ^clojure.lang.PersistentVector sonority-hierarchy
["vowel" "liquid" "affricate" "fricative" "nasal" "stop" "semivowel" "aspirate"]) ["vowel" "liquid" "affricate" "fricative" "nasal" "stop" "semivowel" "aspirate"])

@ -286,6 +286,50 @@
[lgt-estimate lgt?] [lgt-estimate lgt?]
[turing-estimate lgt?])))))))) [turing-estimate lgt?]))))))))
(defn smoothed-frequencies
[rs nrs]
(let [l (count rs)
N (apply + (map #(apply * %) (map vector rs nrs)))
p0 (/ (first nrs) N)
zrs (average-consecutives rs nrs)
log-rs (map #(Math/log %) rs)
log-zrs (map #(Math/log %) zrs)
lm (least-squares-linear-regression log-rs log-zrs)
lgts (map lm rs)
estimations (loop [coll rs
lgt? false
e (estimator lm rs zrs)
estimations []]
(empty? coll) estimations
(let [[estimation lgt?] (e (first coll) lgt?)]
(rest coll)
(conj estimations estimation)))))
N* (apply + (map #(apply * %) (map vector nrs estimations)))
probs (cons
(float p0)
(map #(* (- 1 p0) (/ % N*)) estimations))
sum-probs (apply + probs)]
(fn [r]
(* (inc r) (/ (lm (inc r)) (lm r))))
(partition 2 1 (conj rs (inc (peek rs)))))]))
(let [rs [ 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26]
nrs [32 20 10 3 1 2 1 1 1 2 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1]
rs [1 2 3 4 5 6 7 8 9 10 12 26]
nrs [32 20 10 3 1 2 1 1 1 2 1 1]]
(smoothed-frequencies rs nrs))
(defn sgt [rs nrs] (defn sgt [rs nrs]
(assert (and (not-empty nrs) (not-empty rs)) (assert (and (not-empty nrs) (not-empty rs))
"frequencies and frequency-of-frequencies can't be empty") "frequencies and frequency-of-frequencies can't be empty")
@ -316,7 +360,8 @@
(map #(* (- 1 p0) (/ % N*)) estimations)) (map #(* (- 1 p0) (/ % N*)) estimations))
sum-probs (apply + probs)] sum-probs (apply + probs)]
[(cons 0 rs) [(cons 0 rs)
(map #(/ % sum-probs) probs)])) (map #(/ % sum-probs) probs)
(comment (comment
(let [rs [ 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26] (let [rs [ 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26]
