From 56109c38107642ebe51535297f00aec3c3703ae6 Mon Sep 17 00:00:00 2001 From: Eric Ihli Date: Tue, 22 Dec 2020 16:19:36 -0800 Subject: [PATCH] Experiments with simple good turing estimation --- dev/examples/scratch.clj | 71 +++++++++++++++++----------- simple_good_turing_estimator.c | 11 ++++- src/com/owoga/prhyme/util/math.clj | 75 ++++++++++++++++++++++++++++-- 3 files changed, 126 insertions(+), 31 deletions(-) diff --git a/dev/examples/scratch.clj b/dev/examples/scratch.clj index eb26c49..32987bb 100644 --- a/dev/examples/scratch.clj +++ b/dev/examples/scratch.clj @@ -327,51 +327,68 @@ (let [n (count xs) sum-x (apply + xs) sum-y (apply + ys) - sum-xy (apply + (map #(apply * %) (map vector xs ys))) - sum-x-sqr (apply + (map #(* % %) xs)) - m (/ (- (* n sum-xy) (* sum-x sum-y)) - (- (* n sum-x-sqr) (* sum-x sum-x))) + mean-x (/ sum-x n) + mean-y (/ sum-y n) + err-x (map #(- % mean-x) xs) + err-y (map #(- % mean-y) ys) + err-x-sqr (map #(* % %) err-x) + m (/ (apply + (map #(apply * %) (map vector err-x err-y))) + (apply + err-x-sqr)) b (/ (- sum-y (* m sum-x)) n)] + (println (format "intercept %f slope %f" b m)) (fn [x] - (+ (* m x) b)))) + (+ b (* m x))))) + +(comment + (float ((least-squares-linear-regression + [1 2 3 4] + [2 4 5 7]) + 5)) + ) (defn average-consecutives "Average all the non-zero counts using the equation - Zr = Nr / 0.5 (t - q)" + q, r, t + Zr = Nr / 0.5 (t - q) + or + Zr = 2 Nr / (t - q)" [freqs Nrs] (let [freqs (vec freqs) Nrs (vec Nrs)] (loop [i 0 result []] - (let [q (nth freqs (max (dec i) 0)) - Nr (nth Nrs (min (dec (count freqs)) i)) - r (nth freqs (min (dec (count freqs)) i)) - t (nth freqs (min (dec (count freqs)) (inc i)))] + (let [q (if (= i 0) 0 (nth freqs (dec i))) + Nr (nth Nrs i) + r (nth freqs i) + t (if (= (inc i) (count freqs)) + (- (* 2 r) q) + (nth freqs (inc i)))] + (println q Nr r t) (cond - (= i (count freqs)) result - - (zero? i) - (recur (inc i) - (conj result (/ (* 2 Nr) t))) + (= (inc i) (count freqs)) + (conj result (/ (* 2 Nr) (- t q))) - (= (dec i) (count freqs)) - (recur (inc i) - (conj result (/ (* 2 Nr (- t q))))) :else - (recur (inc i) - (conj result (/ Nr (- r q))))))))) + (recur + (inc i) + (conj result (/ (* 2 Nr) (- t q))))))))) (comment (let [xs [1 2 3 4 5 6 7 8 9 10 12 26] ys [32 20 10 3 1 2 1 1 1 2 1 1] - smoothed (average-consecutives xs ys) - logged (map #(Math/log %) smoothed) - lm (least-squares-linear-regression xs ys) - log-lm (map lm xs) - log-ys (map #(Math/pow % Math/E) log-lm)] - ;; => [32 20 10 3 1 2 1 1 1 2 1/2 1/14] + ys-avg-cons (average-consecutives xs ys)] + (map float ys-avg-cons)) - [log-lm log-ys]) + ;; y = (r[j] + 1) * smoothed(r[j] + 1) / smoothed(r[j]); + (let [xs [1 2 3 4 5 6 7 8 9 10 12 26] + ys [32 20 10 3 1 2 1 1 1 2 1 1] + ys-avg-cons (average-consecutives xs ys) + log-xs (map #(Math/log %) xs) + log-ys (map #(Math/log %) ys-avg-cons) + lm (least-squares-linear-regression log-xs log-ys) + zs (map lm log-xs)] + ;; => [32 20 10 3 1 2 1 1 1 2 1/2 1/14] + [log-ys log-xs zs (map #(Math/pow Math/E %) zs)]) (Math/log 1) ) diff --git a/simple_good_turing_estimator.c b/simple_good_turing_estimator.c index ec5b237..0802161 100644 --- a/simple_good_turing_estimator.c +++ b/simple_good_turing_estimator.c @@ -227,11 +227,15 @@ void showEstimates(void) { int i; - + + printf("intercept %f slope %f\n", intercept, slope); printf("0\t%.4g\n", PZero); for (i = 0; i < rows; ++i) printf("%d\t%.4g\n", r[i], p[i]); + for (i = 0; i < rows; ++i) + printf("%d\t%.4g\n", r[i], smoothed(r[i])); } + void analyseInput(void) { @@ -260,6 +264,11 @@ log_r[j] = log(r[j]); log_Z[j] = log(Z[j]); } + + for (i = 0; i < rows; ++i) + printf("%d\t%.4g\n", r[i], Z[i]); + + findBestFit(); for (j = 0; j < rows; ++j) { diff --git a/src/com/owoga/prhyme/util/math.clj b/src/com/owoga/prhyme/util/math.clj index 96f05d0..2af2774 100644 --- a/src/com/owoga/prhyme/util/math.clj +++ b/src/com/owoga/prhyme/util/math.clj @@ -147,9 +147,15 @@ m (/ (apply + (map #(apply * %) (map vector err-x err-y))) (apply + err-x-sqr)) b (/ (- sum-y (* m sum-x)) n)] - (assert (< m -1) "See Good-Turing Without Tears for why slope must be less than -1.") + (assert (< m -1) + (format + (str "See Good-Turing Without Tears" + " for why slope must be less than -1." + "\nSlope: %0.2f Intersect %0.2f") + (float m) + (float b))) (fn [x] - (+ b (* m x))))) + (Math/pow Math/E (+ b (* m (Math/log x))))))) (defn average-consecutives "Average all the non-zero frequency of observations (frequency of frequencies) @@ -206,7 +212,6 @@ t (if (= (inc i) (count freqs)) (- (* 2 r) q) (nth freqs (inc i)))] - (println q Nr r t) (cond (= (inc i) (count freqs)) (conj result (/ (* 2 Nr) (- t q))) @@ -246,3 +251,67 @@ (/ nr1 (Math/pow nr 2)) (inc (/ nr1 nr))))) +(defn estimator + ([lm rs nrs] + (estimator lm rs nrs false)) + ([lm rs nrs take-lgt?] + (fn [x] + (let [i (.indexOf x rs) + turing-estimate (nth rs i) + stdv (stdv-for-turing-estimate (nth rs (inc i)) + turing-estimate + (nth nrs (inc i))) + lgt-estimate (lm x)] + (assert (>= i 0) (str x " not found")) + (if (or (< (Math/abs (- lgt-estimate turing-estimate)) + (* 1.65 stdv)) + take-lgt?) + [lgt-estimate (estimator lm rs nrs true)] + [turing-estimate (estimator lm rs nrs false)]))))) + +(defn sgt [rs nrs] + (assert (and (not-empty nrs) (not-empty rs)) + "frequencies and frequency-of-frequencies can't be empty") + (let [l (count rs) + N (apply + (map #(apply * %) (map vector rs nrs))) + p0 (/ (first nrs) N) + zrs (average-consecutives rs nrs) + log-rs (map #(Math/log %) rs) + log-zrs (map #(Math/log %) zrs) + lm (least-squares-linear-regression log-rs log-zrs) + lgts (map lm rs) + sgts (loop [i 0 + result [] + take-lgt? false] + (cond + (= (inc i) l) + (conj result (last lgts)) + + :else + (let [x (nth zrs i) + y (nth lgts i) + stdv (stdv-for-turing-estimate + (nth rs (inc i)) + (nth zrs i) + (nth zrs (inc i))) + take-lgt? (or take-lgt? + (<= (Math/abs (- x y)) + (* 1.65 stdv)))] + (recur (inc i) + (conj + result + (if take-lgt? + (nth lgts i) + (nth zrs i))) + take-lgt?)))) + N* (apply + (map #(apply * %) (map vector rs sgts)))] + )) + +(comment + (let [rs [1 2 3 4 5 6 7 8 9 10 12 26] + nrs [32 20 10 3 1 2 1 1 1 2 1 1] + sgts (sgt rs nrs)] + sgts + ) + + )