Working completions

6 years ago · 495ef6c602
parent cbde1996dc
commit 495ef6c602
9 changed files with 619 additions and 13 deletions
--- a/dev/examples/scratch.clj
+++ b/dev/examples/scratch.clj
@ -2,7 +2,8 @@
  (:require [clojure.java.io :as io]
            [clojure.string :as string]
            [clojure.set]
-            [com.owoga.prhyme.nlp.core :as nlp]))
+            [com.owoga.prhyme.nlp.core :as nlp]
+            [com.owoga.prhyme.util.math :as math]))

 (def re-word
  "Regex for tokenizing a string into words
@ -17,12 +18,24 @@
       (re-seq re-word)
       (map second)
       (map string/lower-case)
+       (cons :bol)
       (reverse)
-       (cons :end)))
+       (cons :eol)))
+
+(defn tokenize-line
+  "Splits `line` into lower-cased word tokens.
+
+  The line is trimmed and matched against `re-word`; the second element of
+  each `re-seq` match is kept and lower-cased.  Returns a lazy seq."
+  [line]
+  (let [matches (re-seq re-word (string/trim line))]
+    (map (comp string/lower-case second) matches)))

 (comment
-  (-> (slurp "dev/examples/sandman.txt")
-      tokenize))
+  (->> (slurp "dev/examples/sandman.txt")
+       (#(string/split % #"\n"))
+       (map tokenize-line))
+
+  )

 (defn zero-to-n-seq
  ([coll]
@ -85,7 +98,99 @@
                (zero-to-n-seq (first windows)))
               (rest windows))))))

+(defn add-to-trie-1
+  "Adds every `n`-gram of `tokens` to `trie` and returns the updated trie.
+
+  `tokens` is padded with (dec n) `:bol` markers in front and (dec n) `:eol`
+  markers behind, then walked with a sliding window of size `n` and step 1.
+  Each window is used as a path into the nested map, and the `:count` stored
+  at the end of that path is incremented (starting from 0 when absent)."
+  [trie n tokens]
+  (let [padding (dec n)
+        padded  (concat (repeat padding :bol)
+                        tokens
+                        (repeat padding :eol))
+        bump    (fn [t ngram]
+                  (update-in t (conj (vec ngram) :count) (fnil inc 0)))]
+    (reduce bump trie (partition n 1 padded))))
+
+(defn flatmap
+  "Flattens a nested map into a flat seq that alternates between a key path
+  (a vector) and the non-map value found at that path.
+
+  `prefix` is the path accumulated so far; the one-argument arity starts from
+  an empty path.  Example: (flatmap {1 {2 3}}) => ([1 2] 3)"
+  ([m]
+   (flatmap m []))
+  ([m prefix]
+   (letfn [(expand [[k v]]
+             (let [path (conj prefix k)]
+               (if (map? v)
+                 (flatmap v path)
+                 [path v])))]
+     (mapcat expand m))))
+
+(defn filter-trie-to-ngrams
+  "Returns the (path value) pairs of the flattened `trie` whose path has
+  exactly n + 1 elements: `n` keys followed by one trailing key (`:count` in
+  the tries built by `add-to-trie-1`)."
+  [trie n]
+  (let [path-length (inc n) ; one extra path element for the trailing :count
+        pairs       (partition 2 (flatmap trie))]
+    (filter (fn [[path _]] (= path-length (count path))) pairs)))
+
+(comment
+  (let [trie {}]
+    (-> (add-to-trie-1 trie 2 '("of" "lives" "lost" "at" "sea"))
+        (add-to-trie-1 1 '("of" "lives" "lost" "at" "sea"))))
+  )
+
+(defn wrand
+  "Given a collection of slice sizes, returns the index of a slice chosen by a
+  random spin of a roulette wheel whose compartments are proportional to the
+  slices.
+
+  `slices` may be any sequential collection of non-negative numbers; it is
+  coerced to a vector so that lists and lazy seqs (e.g. the `vals` of a map)
+  work as well as vectors.  Throws IllegalArgumentException when the slices
+  do not sum to a positive number, because then no slice can be selected."
+  [slices]
+  (let [slices (vec slices)
+        total  (reduce + slices)]
+    (when-not (pos? total)
+      (throw (IllegalArgumentException.
+              "wrand requires slices that sum to a positive number")))
+    (let [r (rand total)]
+      ;; Walk the wheel, accumulating slice sizes until the spin falls inside
+      ;; the current compartment.
+      (loop [i 0 sum 0]
+        (let [upper (+ (slices i) sum)]
+          (if (< r upper)
+            i
+            (recur (inc i) upper)))))))
+
+(defn completions
+  "Returns a map from each string-keyed child of the trie node at path `words`
+  to a normalised weight.
+
+  The raw weight of a child is looked up in `probs` under
+  [(inc (count words)) (:count child)]; the weights are then divided by their
+  sum.  Non-string keys of the node (such as `:count`) are ignored."
+  [trie probs words]
+  (let [order (inc (count words))
+        raw   (into {}
+                    (for [[k child] (get-in trie words)
+                          :when (string? k)]
+                      [k (get-in probs [order (:count child)])]))
+        total (apply + (vals raw))]
+    (into {}
+          (for [[k weight] raw]
+            [k (/ weight total)]))))
+
 (comment
+  ;; Turning corpus into a trie.
+  (let [documents (->> "dark-corpus"
+                       io/file
+                       file-seq
+                       (remove #(.isDirectory %))
+                       (drop 500)
+                       (take 5))
+        trie (->> documents
+                  (map slurp)
+                  (mapcat #(string/split % #"\n"))
+                  (map tokenize-line)
+                  (filter #(> (count %) 1))
+                  (take 500)
+                  (reduce
+                   (fn [acc tokens]
+                     (-> (add-to-trie-1 acc 1 tokens)
+                         (add-to-trie-1 2 tokens)
+                         (add-to-trie-1 3 tokens)))
+                   {}))
+        probs (->> (range 1 4)
+                   (map #(vector % (filter-trie-to-ngrams trie %)))
+                   (map (fn [[n v]] [n (map #(second %) v)]))
+                   (map (fn [[n v]] [n (into (sorted-map) (frequencies v))]))
+                   (map (fn [[n v]] [n (math/sgt (keys v) (vals v))]))
+                   (map (fn [[n [rs probs]]]
+                          [n (into {} (map vector  rs probs))]))
+                   (into {}))]
+    (reverse (sort-by second (completions trie probs [:bol "you"]))))
+
+  (into {} (map vector [1 2 3] [4 5 6]))
+  ;;
+  ;; => ([1 (1 2 8 7 3 6 4 23) (85 18 2 2 6 3 1 1)]
+  ;;     [2 (1 2 5 3 4 7) (170 25 2 4 2 2)]
+  ;;     [3 (1 2 3 4 7 5) (213 30 5 1 1 3)])
  (let [last-window '("in" "the" "frat")]
    (concat (zero-to-n-seq last-window)
            (rest (n-to-zero-seq last-window))))
@ -96,6 +201,7 @@
   (string/split
    "the cat in the hat is the rat in the frat"
    #" "))
+
  ;; => {"the"
  ;;     {:count 3,
  ;;      "cat" {:count 1, "in" {:count 1}},
@ -184,7 +290,29 @@
      :else (flat-at-depth (->> m (mapcat second) (remove #(= :count (first %))))
                           (dec depth)))))

+;; NOTE(review): this is a verbatim repeat of the `flatmap` definition added
+;; earlier in this same file by this commit; loading the file redefines the
+;; var with an identical body.
+;;
+;; Flattens a nested map into a flat seq alternating between a key path
+;; (a vector) and the non-map value stored at that path.
+(defn flatmap
+  ([m]
+   (flatmap m []))
+  ([m prefix]
+   (mapcat
+    (fn [[k v]]
+      ;; Recurse into nested maps; emit [path value] for anything else.
+      (if (map? v)
+        (flatmap v (conj prefix k))
+        [(conj prefix k) v]))
+    m)))
+
+;; NOTE(review): verbatim repeat of the `filter-trie-to-ngrams` definition
+;; added earlier in this same file by this commit.
+;;
+;; Keeps the (path value) pairs of the flattened trie whose path holds
+;; exactly n + 1 elements.
+(defn filter-trie-to-ngrams [trie n]
+  (->> trie
+       (flatmap)
+       (partition 2)
+       ;; Inc to account for :count
+       (filter #(= (inc n) (count (first %))))))
+
+(apply hash-map '([1 2] 3 [4 5] 6))
+
 (comment
+  (apply hash-map (flatmap {1 {2 {3 4} 5 {6 7}} 8 {9 10}} []))
+
  (let [trie {"d" {:count 3
                   "o" {:count 3
                        "g" {:count 2}
@ -196,7 +324,8 @@
                        "g" {:count 1}}
                   "i" {:count 1
                        "g" {:count 1}}}}]
-    (->> (flat-at-depth trie 2)))
+    (filter-trie-to-ngrams trie 3))
+
  )


@ -458,7 +587,6 @@
  (def n-gram-freq-map (n-gram-frequency-map trie 3))
  (def unigram-frequencies (n-gram-freq-map 1))
  unigram-frequencies
-
  )

 (defn number-of-n-grams-that-occur-with-count [trie n c]
--- a/sgt/ReadMe.txt
+++ b/sgt/ReadMe.txt
@ -0,0 +1,72 @@
+SGT
+===
+
+The files here contain a C++ class for implementing simple Good-Turing
+re-estimation, as described by Geoff Sampson in the book Empirical Linguistics
+(2001), and on the web at http://www.grsampson.net/RGoodTur.html. The code
+here is a C++ adaptation of the published code by Sampson and Gale, with the
+bug fix issued in 2000. It is encapsulated as a class to allow it to be
+incorporated into other programs. An additional coding change is that the data
+can be presented in any order, whereas the original code required the data to
+be in ascending order.
+
+Sampson's original code was issued with no restrictions on use. In keeping
+with the spirit of this, the code here is issued under an open source licence
+which allows essentially unrestricted use.
+
+LICENCE
+-------
+Copyright (c) David Elworthy 2004.
+All rights reserved.
+
+Redistribution and use in source and binary forms for any purpose, with or
+without modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright notice,
+   this list of conditions, and the following disclaimer.
+ 
+2. Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions, and the disclaimer that follows 
+   these conditions in the documentation and/or other materials 
+   provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN
+NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+Contact details
+---------------
+You may contact me at david@friendlymoose.com. I would be happy to hear of any
+experiences you have with the code; please feel free to send me updated
+versions. The reference site for the code is http://www.friendlymoose.com/.
+
+Files and use
+-------------
+There are three files: this ReadMe plus the two source files listed below:
+sgt.h       SGT header file
+sgttest.cpp A test and example program
+
+There is no source file, as the SGT class is a template over the observation
+type, typically either an int or a double.
+
+Information about using the class is included in the header file. The code has
+been tested with g++ version 3.2 on cygwin and Microsoft Visual Studio version
+6 on Windows 2000. You can compile and link the test program with g++ using
+the command
+     g++ -o sgttest sgttest.cpp
+
+For Visual Studio, from the command line, you can compile and link with
+     cl -GX sgttest.cpp
+
+Version history
+---------------
+Initial version released January 2004.
+Updated to a better implementation April 2004.
--- a/sgt/a.out
+++ b/sgt/a.out
--- a/sgt/freq_freqs.txt
+++ b/sgt/freq_freqs.txt
@ -0,0 +1,12 @@
+1 32
+2 20
+3 10
+4 3
+5 1
+6 2
+7 1
+8 1
+9 1
+10 2
+12 1
+26 1
--- a/sgt/sgt.h
+++ b/sgt/sgt.h
@ -0,0 +1,314 @@
+#ifndef SGT_H
+#define SGT_H
+// Simple Good-Turing estimation
+//
+// Copyright (c) David Elworthy 2004.
+
+// A class for implementing simple Good-Turing re-estimation, as described by
+// Geoff Sampson in the book Empirical Linguistics (2001), and on the web at
+// http://www.grsampson.net/RGoodTur.html. The code here is a C++ adaptation
+// of the published code by Sampson and Gale, with the bug fix issued in
+// 2000. It is encapsulated as a class to allow it to be incorporated into
+// other programs. An additional coding change is that the data can be
+// presented in any order, whereas the original code required the data to be
+// in ascending order.
+//
+// Copyright (c) David Elworthy 2004.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms for any purpose, with or
+// without modification, are permitted provided that the following conditions
+// are met:
+// 
+// 1. Redistributions of source code must retain the above copyright notice,
+//    this list of conditions, and the following disclaimer.
+// 
+// 2. Redistributions in binary form must reproduce the above copyright
+//    notice, this list of conditions, and the disclaimer that follows 
+//    these conditions in the documentation and/or other materials 
+//    provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
+// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN
+// NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// You may contact me at david@friendlymoose.com.
+
+#include <cmath>
+#include <cstdio>
+#include <iostream>
+#include <map>
+#include <vector>
+using namespace std;
+
+// Simple Good-Turing class.
+// To use the class, create an SGT object and add data to it by calling add()
+// with each data point. A data point consists of the observed value and the
+// frequency of the observation (what Sampson and Gale refer to as the
+// frequency, and the frequency of the frequency). When you have added all
+// the data points call analyse(). You can then call estimate() with an
+// observed value as argument to get the estimated frequency for that value,
+// or call iterate() to iterate over the data points. There is one special
+// case to estimate(). If called with an argument of zero, it delivers the
+// estimated frequency for unseen events. This is not delivered from iterate().
+// To get back from the estimate value to the smoothed value of the
+// observation, multiply by total().
+//
+// In the original Sampson and Gale version, the observation was an integer.
+// For this version, we make the code be a template over the observation type.
+// However, it must always be some suitable numeric type, such as int or double.
+//
+// The code is implemented using the Standard Template Library (STL).
+
+template <class ObsType> class SGT
+{
+    private:
+    // Data block, holding the frequency and estimate. The estimate starts at
+    // zero and is set up by analyse().
+    struct Data
+    {
+        Data(unsigned int f) : freq(f), estimate(0) {}
+
+        unsigned int freq;
+        double estimate;
+    };
+
+    // Internal representation, as a map from observations to frequencies.
+    // After calling analyse(), it provides the estimates as well.
+    typedef map<ObsType, Data, less<ObsType> > DataMap;
+
+    // Minimum number of data points for a valid analysis
+#ifdef _WIN32
+#define MinInput (5)
+#else
+    static const unsigned int MinInput = 5;
+#endif
+
+    // Square of d, computed in double precision.
+    template <class T> double sq(T d) { return ((double) d)*d; }
+
+    // Value of the fitted log-log line at observation i:
+    // exp(intercept + slope * log(i)).
+    double smoothed(ObsType i, double intercept, double slope)
+    { return (exp(intercept + slope * log((double) i))); }
+
+    public:
+    // Iterator type for iterate();
+    typedef typename DataMap::const_iterator iterator;
+
+    // Construct a SGT object.
+    // PZero is initialised here so that estimate(0, ...) yields a defined
+    // value (zero) even when analyse() has not been run or returned false.
+    SGT() : PZero(0), totalObs(0) {}
+
+    // Destroy SGT object.
+    ~SGT() {}
+
+    // Add a data point.
+    // If an observation with the same value has already been supplied, this adds
+    // to its frequency.
+    void add(ObsType observation, unsigned int frequency)
+    {
+        typename DataMap::iterator i = data.find(observation);
+        if (i == data.end())
+            data.insert(make_pair(observation, Data(frequency)));
+        else
+            (*i).second.freq += frequency;
+
+        totalObs += observation * frequency;
+    }
+
+    // Get total number of observations (= sum of obs*freq)
+    ObsType total() const { return totalObs; }
+
+    // Analyse the data.
+    // Returns false if there are fewer than MinInput data points. In that
+    // case nothing is computed: every estimate keeps its initial value of
+    // zero and PZero is left unchanged.
+    // NOTE(review): the printf calls in this function look like leftover
+    // debug tracing; they write to stdout on every successful analysis.
+    bool analyse()
+    {
+        if (data.size() < MinInput)
+            return false;
+
+        // The code which follows is based on S and G's analyseInput()
+        ObsType bigN = 0;
+        unsigned int rows = data.size();
+
+        // j could be declared in each for statement, but has to be here for
+        // Visual C++, which disobeys the ANSI standard on variable scope.
+        typename DataMap::iterator j;
+        for (j = data.begin(); j != data.end(); ++j)
+            bigN += (*j).first * (*j).second.freq;
+
+        // Find the frequency for observation of value 1, if any
+        iterator row1 = row(1, data.begin());
+        PZero = (row1 == data.end()) ? 0 : (*row1).second.freq / (double) bigN;
+
+        // Set up internal arrays
+        vector<double> log_obs(rows);
+        vector<double> log_Z(rows);
+        vector<double> rStar(rows);
+
+        double XYs = 0, Xsquares = 0, meanX = 0, meanY = 0;
+        ObsType prevObs = 0;
+        unsigned int r = 0;
+
+        // Note that j is advanced inside the loop body (when computing k),
+        // so the for statement itself only advances r.
+        for (j = data.begin(); j != data.end(); ++r)
+        {
+            ObsType obs = (*j).first;
+            Data &d = (*j).second;
+
+            // k is the next observation, or 2 * obs - prevObs for the last row.
+            double k = (++j == data.end())
+                ? (double) (2 * obs - prevObs) : (double) (*j).first;
+
+            double Z   = 2 * d.freq / (k - prevObs);
+            log_obs[r] = log((double) obs);
+            log_Z[r]   = log(Z);
+
+            meanX += log_obs[r];
+            meanY += log_Z[r];
+
+            prevObs = obs;
+        }
+
+        // Find the line with the best fit.
+        meanX /= rows;
+        meanY /= rows;
+
+        for (r = 0; r < rows; ++r)
+        {
+            XYs += (log_obs[r] - meanX) * (log_Z[r] - meanY);
+            Xsquares += sq(log_obs[r] - meanX);
+        }
+        double slope = XYs / Xsquares;
+        double intercept = meanY - slope * meanX;
+
+        // Now construct the estimates smoothing using the fitted line.
+        // Once indiffValsSeen becomes true, every remaining row uses the
+        // smoothed value y instead of the direct value x.
+        bool indiffValsSeen = false;
+
+        for (j = data.begin(), r = 0; j != data.end(); ++j, ++r)
+        {
+            ObsType obs = (*j).first;
+            Data &d = (*j).second;
+
+            ObsType obs1 = obs + 1;
+            double y = obs1 * smoothed(obs1, intercept, slope)
+                / smoothed(obs, intercept, slope);
+
+            iterator nextRow = row(obs1, j);
+            if (nextRow == data.end())
+            {
+                indiffValsSeen = true;
+            }
+            else if (!indiffValsSeen)
+            {
+                unsigned int next_n = (*nextRow).second.freq;
+                unsigned int freq   = d.freq;
+
+                double x = obs1 * next_n / (double) freq;
+                printf("%0.2f %0.2f %0.2f\n",
+                       (float) obs1, (float) next_n, (float) freq);
+                printf("stdv %0.2f\n",
+                       sqrt(sq(obs1) * next_n
+                        / (sq(freq)) * (1 + next_n / (double) freq)));
+                printf("x %0.2f y %0.2f\n", x, y);
+
+                if (fabs(x - y) <= 1.96 * sqrt(sq(obs1) * next_n
+                        / (sq(freq)) * (1 + next_n / (double) freq)))
+                {
+                    indiffValsSeen = true;
+                }
+                else
+                {
+                    rStar[r] = x;
+                }
+            }
+
+            if (indiffValsSeen)
+            {
+                rStar[r] = y;
+            }
+        }
+
+        // bigNprime is the sum of freq * rStar over all rows; it is used
+        // below to renormalise the estimates.
+        double bigNprime = 0.0;
+        for (j = data.begin(), r = 0; j != data.end(); ++j, ++r) {
+            printf("%f\n", (float) (*j).second.freq);
+            bigNprime += (*j).second.freq * rStar[r];
+        }
+
+        printf("%f %f\n", (float) PZero, (float) bigNprime);
+        for (int i = 0; i < (int) rStar.size(); i++)
+            printf("%f\n", rStar[i]);
+
+        for (j = data.begin(), r = 0; j != data.end(); ++j, ++r)
+            (*j).second.estimate = (1 - PZero) * rStar[r] / bigNprime;
+
+        return true;
+    }
+
+    // Analyze the data.
+    // This just calls analyse(), and is included as a concession to speakers
+    // of debased dialects of English. Note that the result of analyse() is
+    // discarded.
+    void analyze() { analyse(); }
+
+    // Get the estimate for an observation.
+    // If there was no such observation, return false.
+    // Otherwise return true and yield the estimate.
+    // An observation of zero always succeeds and yields PZero.
+    bool estimate(ObsType observation, double &estimate) const
+    {
+        if (observation == 0)
+        {
+            estimate = PZero;
+            return true;
+        }
+
+        iterator rownum = row(observation, data.begin());
+        if (rownum == data.end())
+        {
+            return false;
+        }
+
+        estimate = (*rownum).second.estimate;
+        return true;
+    }
+
+    // Get start and end iterators over the data map.
+    // You do not dereference these iterators directly, but instead use the
+    // access functions, obs, freq and estimate.
+    pair<iterator, iterator> iterate() const
+    { return make_pair(data.begin(), data.end()); }
+
+    // Get the observation from an iterator.
+    ObsType obs(iterator i) const { return (*i).first; }
+
+    // Get the frequency from an iterator (as supplied by add).
+    unsigned int freq(iterator i) const { return (*i).second.freq; }
+
+    // Get the estimated relative frequency from an iterator.
+    double estimate(iterator i) const { return (*i).second.estimate; }
+
+    private:
+    // The data points
+    DataMap data;
+
+    // Zero estimate. Zero until a successful call to analyse().
+    double PZero;
+
+    // Total number of observations
+    ObsType totalObs;
+
+    // Find the row of the data whose value is equal to obs.
+    // If there is no such value, return data.end().
+    // start is a hint about where to start searching; rows before it are
+    // not examined.
+    iterator row(ObsType obs, iterator start) const
+    {
+        iterator j = start;
+
+        while (j != data.end() && (*j).first < obs)
+            ++j;
+
+        return ((j != data.end() && (*j).first == obs) ? j : data.end());
+    }
+};
+
+#endif //SGT_H
--- a/sgt/sgt.h.gch
+++ b/sgt/sgt.h.gch
--- a/sgt/sgt.zip
+++ b/sgt/sgt.zip
--- a/sgt/sgttest.cpp
+++ b/sgt/sgttest.cpp
@ -0,0 +1,54 @@
+// Test program for sgt code
+
+// Reads a file in which each line contains an observed value and the
+// frequency of the value. Prints out a table of the estimates, and also the
+// estimate for value 1 (to test the estimate() function).
+
+#include "sgt.h"
+#include <iostream>
+
+// Set this to the type for the observation
+//typedef double Obs;
+typedef unsigned int Obs;
+
+// Reads (observation, frequency) pairs from standard input, runs the
+// Simple Good-Turing analysis and prints one row per observation:
+// observation, frequency, estimate, and estimate * total. It then prints the
+// estimate for zero and the estimate for observation 1.
+// Returns 0 on success and -1 when the input is incomplete or there are too
+// few data points for analyse() to succeed.
+int main()
+{
+    SGT<Obs> sgt;
+    Obs observation;
+    unsigned int frequency;
+    while (cin >> observation)
+    {
+        if (!(cin >> frequency))
+        {
+            cerr << "Incomplete input" << endl;
+            return -1;
+        }
+
+        sgt.add(observation, frequency);
+    }
+
+    // analyse() returns false when there is not enough data; in that case no
+    // estimates have been computed, so stop here rather than print them.
+    if (!sgt.analyse())
+    {
+        cerr << "Not enough data points for a valid analysis" << endl;
+        return -1;
+    }
+    cout << "Results:" << endl;
+
+    // Use iterators to access the results
+    pair<SGT<Obs>::iterator, SGT<Obs>::iterator> i = sgt.iterate();
+    for (; i.first != i.second; ++i.first)
+    {
+        cout << sgt.obs(i.first)
+            << "\t" << sgt.freq(i.first)
+            << "\t" << sgt.estimate(i.first)
+            << "\t" << sgt.estimate(i.first) * sgt.total()
+            << endl;
+    }
+
+    // An observation of zero yields the estimate for unseen events.
+    double estimate;
+    sgt.estimate(0, estimate);
+    cout << "0\t" << estimate << endl;
+
+    if (sgt.estimate(1, estimate))
+        cout << "Estimate on obs=1: " << estimate << endl;
+    else
+        cout << "No estimate for obs=1" << endl;
+    return 0;
+}
+
--- a/src/com/owoga/prhyme/util/math.clj
+++ b/src/com/owoga/prhyme/util/math.clj
@ -157,6 +157,18 @@
    (fn [x]
      (Math/pow Math/E (+ b (* m (Math/log x)))))))

+(defn averaged-smooth
+  "Extends `rs` and `Nrs` by one extrapolated point and returns [rs Nrs].
+
+  The new r is (inc (last rs)).  The new Nr continues the last step of `Nrs`
+  linearly: last + (last - second-to-last).  Assumes 0 Nrs are included."
+  [rs Nrs]
+  (let [next-r              (inc (last rs))
+        [penultimate final] (take-last 2 Nrs)
+        next-Nr             (+ final (- final penultimate))]
+    [(concat rs [next-r])
+     (concat Nrs [next-Nr])]))
+(comment
+  (averaged-smooth [1 2 3 4] [32 10 0 2])
+
+  )
+
 (defn average-consecutives
  "Average all the non-zero frequency of observations (frequency of frequencies)
  using the equation Zr = Nr / 0.5 (t - q)
@ -312,14 +324,28 @@
                           lgt?
                           e
                           (conj estimations estimation)))))
-        N* (apply + (map #(apply * %) (map vector nrs estimations)))]
-    [(map #(* (- 1 p0) (/ % N*)) estimations)]))
+        N* (apply + (map #(apply * %) (map vector nrs estimations)))
+        probs (cons
+               (float p0)
+               (map #(* (- 1 p0) (/ % N*)) estimations))
+        sum-probs (apply + probs)]
+    [(cons 0 rs) (map #(/ % sum-probs) probs)]))

 (comment
-  (let [rs [1 2 3 4 5 6 7 8 9 10 12 26]
-        nrs [32 20 10 3 1 2 1 1 1 2 1 1]
-        sgts (sgt rs nrs)]
-    sgts
-    )
+  (let [rs  [ 1  2  3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26]
+        nrs [32 20 10 3 1 2 1 1 1  2  0  1  0  0  0  0  0  0  0  0  0  0  0  0  0  1]
+        rs  [1 2 3 4 5 6 7 8 9 10 12 26]
+        nrs [32 20 10 3 1 2 1 1 1 2 1 1]]
+    (map #(apply * %) (map vector rs (sgt rs nrs)))
+    (sgt rs nrs))
+
+  )
+(comment
+  (let [rs [1 2 3 4 5 6 7 8 9 10 12]
+        nrs [120 40 24 13 15 5 11 2 2 1 3]
+        sgts (sgt rs nrs)
+        N0 (apply + nrs)]
+    [(float (/ 120 N0))
+     (apply + sgts)])

  )