@@ -99,3 +99,150 @@
        index (nextr rng nil)
        selection (nth coll index)]
    selection)))

;;;; Simple Good-Turing Frequency Estimation
;;
;; Good-Turing Smoothing
;;
;; There are 4 steps to performing GT smoothing:
;; 1. Count the frequency of frequencies (Nr); see the sketch after this list.
;; 2. Average the non-zero counts using Zr = Nr / (0.5 * (t - q)), where q
;;    and t are the observed frequencies neighboring r.
;; 3. Fit a linear regression model log(Zr) = a + b * log(r).
;; 4. Update r to r* using the Katz equation and constant k, with the
;;    smoothed Zr for each r read off the linear regression model.
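
;; A minimal sketch of step 1, counting the frequency of frequencies.
;; The name `tokens` and the sample data are hypothetical: count each
;; token, then count how often each count occurs.
(comment
  (let [tokens ["a" "a" "a" "b" "b" "c" "d"]]
    (frequencies (vals (frequencies tokens))))
  ;; => {3 1, 2 1, 1 2} (one token seen 3 times, one twice, two once)
  )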

(defn least-squares-linear-regression
  "Returns a 'Good-Turing Estimator' as defined on page 4 of
  https://www.csie.ntu.edu.tw/~b92b02053/print/good-turing-smoothing-without.pdf

  A precise statement of the theorem underlying the Good-Turing method is that

  r* = (r + 1) * E(Nᵣ₊₁) / E(Nᵣ)

  where E(x) represents the expectation of random variable x.

  It's not unreasonable to take E to be the identity, simply substituting
  Nᵣ for E(Nᵣ). In fact, that is known as the Turing Estimator.

  However, the larger r is, the less reasonable this substitution is, due
  to how much noise there is in large values of r.

  So, this function acts as a more reasonable E.

  Note! Your raw data is probably exponential. This is linear regression
  and doesn't handle linearization for you. `log` your data before sending
  it here.

  The paper also states that you should use the Turing estimator so long
  as the Turing estimate is significantly different from the Linear Good-Turing
  estimate. It defines significantly different as exceeding 1.65 times the
  standard deviation of the Turing estimate."
  [xs ys]
  (let [n (count xs)
        sum-x (apply + xs)
        sum-y (apply + ys)
        mean-x (/ sum-x n)
        mean-y (/ sum-y n)
        err-x (map #(- % mean-x) xs)
        err-y (map #(- % mean-y) ys)
        err-x-sqr (map #(* % %) err-x)
        m (/ (apply + (map * err-x err-y))
             (apply + err-x-sqr))
        b (/ (- sum-y (* m sum-x)) n)]
    (assert (< m -1)
            "See Good-Turing Without Tears for why the slope must be less than -1.")
    (fn [x]
      (+ b (* m x)))))
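
;; A sketch of step 4 under the assumption that the fitted model `lm`
;; (the E returned above) is queried in log space: with
;; S(r) = exp(lm(log r)), the Linear Good-Turing estimate is
;; r* = (r + 1) * S(r + 1) / S(r). `smoothed-r*` is a hypothetical
;; helper name, not something the paper defines.
(defn smoothed-r*
  [lm r]
  (let [S (fn [r] (Math/exp (lm (Math/log r))))]
    (* (inc r) (/ (S (inc r)) (S r)))))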

(defn average-consecutives
  "Averages the non-zero frequencies of observations (frequencies of
  frequencies) using the equation Zr = Nr / (0.5 * (t - q)),
  where q, r, and t are consecutive observed frequencies.

  An intuitive reason for this: you can't see something a fraction of a time,
  but a fractional count is a truer representation of its expectation.

  For example, in a typical linguistic corpus, you'll see many tokens once, many
  twice, fewer 3 times, fewer 4 times, etc... By the time you get up to tokens
  that have been seen 20 times, you might only see 1 token. Then 0 occurrences
  of 21, 22, and 23 tokens. Then you might once see a token 24 times. Then 0
  occurrences of 25, 26, or 27 tokens, then 1 occurrence of 28 tokens.

  Even though frequencies of 24 and 28 have both occurred once, that doesn't
  mean their expected occurrence is each once. In actuality, 28 is less likely
  than 24. This averaging accomplishes that.

  It's known as Zᵣ in most papers on Good-Turing estimation. It's used in place
  of Nᵣ as soon as possible, since it's more accurate.

  Let's say you observed:

  observation  frequency  Zr
  1            32         32
  2            20         20
  3            9          6
  5            2          1
  7            1          0.5

  If observed frequencies are consecutive, then Zr is unchanged from the
  observed frequency. But if there is a gap in the observed frequencies, then
  this effectively redistributes some of the observations that we did make into
  some of the observations that we didn't make.

  For example, we saw some 3's and 5's, but no 4's. So this algorithm shaves a
  little off of the 3's and 5's so that if we fit a line to the frequencies, the
  line will more accurately pass through the `4` observations that we just so
  happened to miss.

  This assumes a distribution over the data for which that redistribution is
  valid: the observations are independent of each other, and the relationship
  between r and Nr is linear or logarithmic rather than polynomial (monotonic,
  in other words)."
  [freqs Nrs]
  (let [freqs (vec freqs)
        Nrs (vec Nrs)]
    (loop [i 0
           result []]
      (let [q (if (zero? i) 0 (nth freqs (dec i)))
            Nr (nth Nrs i)
            r (nth freqs i)
            ;; The final r has no following frequency, so use t = 2r - q.
            t (if (= (inc i) (count freqs))
                (- (* 2 r) q)
                (nth freqs (inc i)))]
        (if (= (inc i) (count freqs))
          (conj result (/ (* 2 Nr) (- t q)))
          (recur (inc i)
                 (conj result (/ (* 2 Nr) (- t q)))))))))
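
;; A quick check of `average-consecutives` against the docstring's table;
;; the gaps at 4 and 6 shave the counts at 3, 5, and 7 down to fractions.
(comment
  (average-consecutives [1 2 3 5 7] [32 20 9 2 1])
  ;; => [32 20 6 1 1/2]
  )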

(comment
  (let [xs [1 2 3 4 5 6 7 8 9 10 12 26]
        ys [32 20 10 3 1 2 1 1 1 2 1 1]
        ys-avg-cons (average-consecutives xs ys)
        ;; ys-avg-cons => [32 20 10 3 1 2 1 1 1 4/3 1/8 1/14]
        log-xs (map #(Math/log %) xs)
        log-ys (map #(Math/log %) ys-avg-cons)
        lm (least-squares-linear-regression log-xs log-ys)
        zs (map lm log-xs)]
    [log-ys log-xs zs (map #(Math/pow Math/E %) zs)])

  )

(defn stdv-for-turing-estimate
  "The Simple Good-Turing paper suggests using a Turing estimator
  for small values of r and switching to a Linear Good-Turing estimator
  once the differences between the two are no longer significantly different.

  Turing estimates are considered significantly different from LGT estimates
  if their difference exceeds 1.65 times the standard deviation of
  the Turing estimate.

  The variance of the Turing estimate is approximately

  (r + 1)² * (Nᵣ₊₁ / N²ᵣ) * (1 + Nᵣ₊₁ / Nᵣ)

  This function takes r + 1 as `r1`, Nᵣ as `nr`, and Nᵣ₊₁ as `nr1`, and
  returns the square root of that variance."
  [r1 nr nr1]
  (Math/sqrt
   (* (Math/pow r1 2)
      (/ nr1 (Math/pow nr 2))
      (inc (/ nr1 nr)))))
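
;; A sketch of the switching rule described above. `turing` and `lgt` are
;; assumed to be the two candidate estimates of r* for the same r
;; (hypothetical arguments): keep using the Turing estimate while it
;; differs from the LGT estimate by more than 1.65 standard deviations.
(defn significantly-different?
  [turing lgt r1 nr nr1]
  (> (Math/abs (double (- turing lgt)))
     (* 1.65 (stdv-for-turing-estimate r1 nr nr1))))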