Experiments with simple good turing estimation

main
Eric Ihli 4 years ago
parent acfa291e85
commit 56109c3810

@ -327,51 +327,68 @@
(let [n (count xs) (let [n (count xs)
sum-x (apply + xs) sum-x (apply + xs)
sum-y (apply + ys) sum-y (apply + ys)
sum-xy (apply + (map #(apply * %) (map vector xs ys))) mean-x (/ sum-x n)
sum-x-sqr (apply + (map #(* % %) xs)) mean-y (/ sum-y n)
m (/ (- (* n sum-xy) (* sum-x sum-y)) err-x (map #(- % mean-x) xs)
(- (* n sum-x-sqr) (* sum-x sum-x))) err-y (map #(- % mean-y) ys)
err-x-sqr (map #(* % %) err-x)
m (/ (apply + (map #(apply * %) (map vector err-x err-y)))
(apply + err-x-sqr))
b (/ (- sum-y (* m sum-x)) n)] b (/ (- sum-y (* m sum-x)) n)]
(println (format "intercept %f slope %f" b m))
(fn [x] (fn [x]
(+ (* m x) b)))) (+ b (* m x)))))
(comment
(float ((least-squares-linear-regression
[1 2 3 4]
[2 4 5 7])
5))
)
(defn average-consecutives (defn average-consecutives
"Average all the non-zero counts using the equation "Average all the non-zero counts using the equation
Zr = Nr / 0.5 (t - q)" q, r, t
Zr = Nr / 0.5 (t - q)
or
Zr = 2 Nr / (t - q)"
[freqs Nrs] [freqs Nrs]
(let [freqs (vec freqs) (let [freqs (vec freqs)
Nrs (vec Nrs)] Nrs (vec Nrs)]
(loop [i 0 (loop [i 0
result []] result []]
(let [q (nth freqs (max (dec i) 0)) (let [q (if (= i 0) 0 (nth freqs (dec i)))
Nr (nth Nrs (min (dec (count freqs)) i)) Nr (nth Nrs i)
r (nth freqs (min (dec (count freqs)) i)) r (nth freqs i)
t (nth freqs (min (dec (count freqs)) (inc i)))] t (if (= (inc i) (count freqs))
(- (* 2 r) q)
(nth freqs (inc i)))]
(println q Nr r t)
(cond (cond
(= i (count freqs)) result (= (inc i) (count freqs))
(conj result (/ (* 2 Nr) (- t q)))
(zero? i)
(recur (inc i)
(conj result (/ (* 2 Nr) t)))
(= (dec i) (count freqs))
(recur (inc i)
(conj result (/ (* 2 Nr (- t q)))))
:else :else
(recur (inc i) (recur
(conj result (/ Nr (- r q))))))))) (inc i)
(conj result (/ (* 2 Nr) (- t q)))))))))
(comment (comment
(let [xs [1 2 3 4 5 6 7 8 9 10 12 26] (let [xs [1 2 3 4 5 6 7 8 9 10 12 26]
ys [32 20 10 3 1 2 1 1 1 2 1 1] ys [32 20 10 3 1 2 1 1 1 2 1 1]
smoothed (average-consecutives xs ys) ys-avg-cons (average-consecutives xs ys)]
logged (map #(Math/log %) smoothed) (map float ys-avg-cons))
lm (least-squares-linear-regression xs ys)
log-lm (map lm xs)
log-ys (map #(Math/pow % Math/E) log-lm)]
;; => [32 20 10 3 1 2 1 1 1 2 1/2 1/14]
[log-lm log-ys]) ;; y = (r[j] + 1) * smoothed(r[j] + 1) / smoothed(r[j]);
(let [xs [1 2 3 4 5 6 7 8 9 10 12 26]
ys [32 20 10 3 1 2 1 1 1 2 1 1]
ys-avg-cons (average-consecutives xs ys)
log-xs (map #(Math/log %) xs)
log-ys (map #(Math/log %) ys-avg-cons)
lm (least-squares-linear-regression log-xs log-ys)
zs (map lm log-xs)]
;; => [32 20 10 3 1 2 1 1 1 2 1/2 1/14]
[log-ys log-xs zs (map #(Math/pow Math/E %) zs)])
(Math/log 1) (Math/log 1)
) )

@ -228,11 +228,15 @@
{ {
int i; int i;
printf("intercept %f slope %f\n", intercept, slope);
printf("0\t%.4g\n", PZero); printf("0\t%.4g\n", PZero);
for (i = 0; i < rows; ++i) for (i = 0; i < rows; ++i)
printf("%d\t%.4g\n", r[i], p[i]); printf("%d\t%.4g\n", r[i], p[i]);
for (i = 0; i < rows; ++i)
printf("%d\t%.4g\n", r[i], smoothed(r[i]));
} }
void analyseInput(void) void analyseInput(void)
{ {
int i, j, next_n; int i, j, next_n;
@ -260,6 +264,11 @@
log_r[j] = log(r[j]); log_r[j] = log(r[j]);
log_Z[j] = log(Z[j]); log_Z[j] = log(Z[j]);
} }
for (i = 0; i < rows; ++i)
printf("%d\t%.4g\n", r[i], Z[i]);
findBestFit(); findBestFit();
for (j = 0; j < rows; ++j) for (j = 0; j < rows; ++j)
{ {

@ -147,9 +147,15 @@
m (/ (apply + (map #(apply * %) (map vector err-x err-y))) m (/ (apply + (map #(apply * %) (map vector err-x err-y)))
(apply + err-x-sqr)) (apply + err-x-sqr))
b (/ (- sum-y (* m sum-x)) n)] b (/ (- sum-y (* m sum-x)) n)]
(assert (< m -1) "See Good-Turing Without Tears for why slope must be less than -1.") (assert (< m -1)
(format
(str "See Good-Turing Without Tears"
" for why slope must be less than -1."
"\nSlope: %0.2f Intersect %0.2f")
(float m)
(float b)))
(fn [x] (fn [x]
(+ b (* m x))))) (Math/pow Math/E (+ b (* m (Math/log x)))))))
(defn average-consecutives (defn average-consecutives
"Average all the non-zero frequency of observations (frequency of frequencies) "Average all the non-zero frequency of observations (frequency of frequencies)
@ -206,7 +212,6 @@
t (if (= (inc i) (count freqs)) t (if (= (inc i) (count freqs))
(- (* 2 r) q) (- (* 2 r) q)
(nth freqs (inc i)))] (nth freqs (inc i)))]
(println q Nr r t)
(cond (cond
(= (inc i) (count freqs)) (= (inc i) (count freqs))
(conj result (/ (* 2 Nr) (- t q))) (conj result (/ (* 2 Nr) (- t q)))
@ -246,3 +251,67 @@
(/ nr1 (Math/pow nr 2)) (/ nr1 (Math/pow nr 2))
(inc (/ nr1 nr))))) (inc (/ nr1 nr)))))
(defn estimator
([lm rs nrs]
(estimator lm rs nrs false))
([lm rs nrs take-lgt?]
(fn [x]
(let [i (.indexOf x rs)
turing-estimate (nth rs i)
stdv (stdv-for-turing-estimate (nth rs (inc i))
turing-estimate
(nth nrs (inc i)))
lgt-estimate (lm x)]
(assert (>= i 0) (str x " not found"))
(if (or (< (Math/abs (- lgt-estimate turing-estimate))
(* 1.65 stdv))
take-lgt?)
[lgt-estimate (estimator lm rs nrs true)]
[turing-estimate (estimator lm rs nrs false)])))))
(defn sgt [rs nrs]
(assert (and (not-empty nrs) (not-empty rs))
"frequencies and frequency-of-frequencies can't be empty")
(let [l (count rs)
N (apply + (map #(apply * %) (map vector rs nrs)))
p0 (/ (first nrs) N)
zrs (average-consecutives rs nrs)
log-rs (map #(Math/log %) rs)
log-zrs (map #(Math/log %) zrs)
lm (least-squares-linear-regression log-rs log-zrs)
lgts (map lm rs)
sgts (loop [i 0
result []
take-lgt? false]
(cond
(= (inc i) l)
(conj result (last lgts))
:else
(let [x (nth zrs i)
y (nth lgts i)
stdv (stdv-for-turing-estimate
(nth rs (inc i))
(nth zrs i)
(nth zrs (inc i)))
take-lgt? (or take-lgt?
(<= (Math/abs (- x y))
(* 1.65 stdv)))]
(recur (inc i)
(conj
result
(if take-lgt?
(nth lgts i)
(nth zrs i)))
take-lgt?))))
N* (apply + (map #(apply * %) (map vector rs sgts)))]
))
(comment
(let [rs [1 2 3 4 5 6 7 8 9 10 12 26]
nrs [32 20 10 3 1 2 1 1 1 2 1 1]
sgts (sgt rs nrs)]
sgts
)
)

Loading…
Cancel
Save