Working completions

5 years ago · 495ef6c602
parent cbde1996dc
commit 495ef6c602
9 changed files with 619 additions and 13 deletions
--- a/dev/examples/scratch.clj
+++ b/dev/examples/scratch.clj
@ -2,7 +2,8 @@
  (:require [clojure.java.io :as io]
            [clojure.string :as string]
            [clojure.set]
-            [com.owoga.prhyme.nlp.core :as nlp]))
+            [com.owoga.prhyme.nlp.core :as nlp]
            [com.owoga.prhyme.util.math :as math]))
 (def re-word
  "Regex for tokenizing a string into words
@ -17,12 +18,24 @@
       (re-seq re-word)
       (map second)
       (map string/lower-case)
       (cons :bol)
       (reverse)
-       (cons :end)))
+       (cons :eol)))
 (defn tokenize-line
  [line]
  (->> line
       (string/trim)
       (re-seq re-word)
       (map second)
       (map string/lower-case)))
 (comment
-  (-> (slurp "dev/examples/sandman.txt")
+  (->> (slurp "dev/examples/sandman.txt")
-      tokenize))
+       (#(string/split % #"\n"))
       (map tokenize-line))
  )
 (defn zero-to-n-seq
  ([coll]
@ -85,7 +98,99 @@
                (zero-to-n-seq (first windows)))
               (rest windows))))))
 (defn add-to-trie-1
  [trie n tokens]
  (let [pad-n (dec n)
        tokens (concat (repeat pad-n :bol) tokens (repeat pad-n :eol))
        partitions (partition n 1 tokens)]
    (reduce
     (fn [acc tokens]
       (update-in acc (concat tokens [:count]) (fnil inc 0)))
     trie
     partitions)))
 (defn flatmap
  ([m]
   (flatmap m []))
  ([m prefix]
   (mapcat
    (fn [[k v]]
      (if (map? v)
        (flatmap v (conj prefix k))
        [(conj prefix k) v]))
    m)))
 (defn filter-trie-to-ngrams [trie n]
  (->> trie
       (flatmap)
       (partition 2)
       ;; Inc to account for :count
       (filter #(= (inc n) (count (first %))))))
 (comment
  (let [trie {}]
    (-> (add-to-trie-1 trie 2 '("of" "lives" "lost" "at" "sea"))
        (add-to-trie-1 1 '("of" "lives" "lost" "at" "sea"))))
  )
 (defn wrand
  "given a vector of slice sizes, returns the index of a slice given a
  random spin of a roulette wheel with compartments proportional to
  slices."
  [slices]
  (let [total (reduce + slices)
        r (rand total)]
    (loop [i 0 sum 0]
      (if (< r (+ (slices i) sum))
        i
        (recur (inc i) (+ (slices i) sum))))))
 (defn completions [trie probs words]
  (let [n (inc (count words))
        possibilities (->> (get-in trie words)
                           (filter #(string? (first %)))
                           (map (fn [[k v]]
                                  [k (get-in probs [n (:count v)])]))
                           (into {}))
        sum-probs (apply + (vals possibilities))
        possibilities (into {} (map (fn [[k v]] [k (/ v sum-probs)]) possibilities))]
    possibilities))
 (comment
  ;; Turning corpus into a trie.
  (let [documents (->> "dark-corpus"
                       io/file
                       file-seq
                       (remove #(.isDirectory %))
                       (drop 500)
                       (take 5))
        trie (->> documents
                  (map slurp)
                  (mapcat #(string/split % #"\n"))
                  (map tokenize-line)
                  (filter #(> (count %) 1))
                  (take 500)
                  (reduce
                   (fn [acc tokens]
                     (-> (add-to-trie-1 acc 1 tokens)
                         (add-to-trie-1 2 tokens)
                         (add-to-trie-1 3 tokens)))
                   {}))
        probs (->> (range 1 4)
                   (map #(vector % (filter-trie-to-ngrams trie %)))
                   (map (fn [[n v]] [n (map #(second %) v)]))
                   (map (fn [[n v]] [n (into (sorted-map) (frequencies v))]))
                   (map (fn [[n v]] [n (math/sgt (keys v) (vals v))]))
                   (map (fn [[n [rs probs]]]
                          [n (into {} (map vector  rs probs))]))
                   (into {}))]
    (reverse (sort-by second (completions trie probs [:bol "you"]))))
  (into {} (map vector [1 2 3] [4 5 6]))
  ;;
  ;; => ([1 (1 2 8 7 3 6 4 23) (85 18 2 2 6 3 1 1)]
  ;;     [2 (1 2 5 3 4 7) (170 25 2 4 2 2)]
  ;;     [3 (1 2 3 4 7 5) (213 30 5 1 1 3)])
  (let [last-window '("in" "the" "frat")]
    (concat (zero-to-n-seq last-window)
            (rest (n-to-zero-seq last-window))))
@ -96,6 +201,7 @@
   (string/split
    "the cat in the hat is the rat in the frat"
    #" "))
  ;; => {"the"
  ;;     {:count 3,
  ;;      "cat" {:count 1, "in" {:count 1}},
@ -184,7 +290,29 @@
      :else (flat-at-depth (->> m (mapcat second) (remove #(= :count (first %))))
                           (dec depth)))))
 (defn flatmap
  ([m]
   (flatmap m []))
  ([m prefix]
   (mapcat
    (fn [[k v]]
      (if (map? v)
        (flatmap v (conj prefix k))
        [(conj prefix k) v]))
    m)))
 (defn filter-trie-to-ngrams [trie n]
  (->> trie
       (flatmap)
       (partition 2)
       ;; Inc to account for :count
       (filter #(= (inc n) (count (first %))))))
 (apply hash-map '([1 2] 3 [4 5] 6))
 (comment
  (apply hash-map (flatmap {1 {2 {3 4} 5 {6 7}} 8 {9 10}} []))
  (let [trie {"d" {:count 3
                   "o" {:count 3
                        "g" {:count 2}
@ -196,7 +324,8 @@
                        "g" {:count 1}}
                   "i" {:count 1
                        "g" {:count 1}}}}]
-    (->> (flat-at-depth trie 2)))
+    (filter-trie-to-ngrams trie 3))
  )
@ -458,7 +587,6 @@
  (def n-gram-freq-map (n-gram-frequency-map trie 3))
  (def unigram-frequencies (n-gram-freq-map 1))
  unigram-frequencies
  )
 (defn number-of-n-grams-that-occur-with-count [trie n c]
--- a/sgt/ReadMe.txt
+++ b/sgt/ReadMe.txt
@ -0,0 +1,72 @@
 SGT
 ===
 The files here contain a C++ class for implementing simple Good-Turing
 re-estimation, as described by Geoff Sampson in the book Empirical Linguistics
 (2001), and on the web at http://www.grsampson.net/RGoodTur.html. The code
 here is a C++ adaptation of the published code by Sampson and Gale, with the
 bug fix issued in 2000. It is encapsulated as a class to allow it to be
 incorporated into other programs. An additional coding change is that the data
 can be presented in any order, whereas the original code required the data to
 be in ascending order.
 Sampson's original code was issued with no restrictions on use. In keeping
 with the spirit of this, the code here is issued under an open source licence
 which allows essentially unrestricted use.
 LICENCE
 -------
 Copyright (c) David Elworthy 2004.
 All rights reserved.
 Redistribution and use in source and binary forms for any purpose, with or
 without modification, are permitted provided that the following conditions
 are met:
 1. Redistributions of source code must retain the above copyright notice,
   this list of conditions, and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions, and the disclaimer that follows 
   these conditions in the documentation and/or other materials 
   provided with the distribution.
 THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
 WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN
 NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 Contact details
 ---------------
 You may contact me at david@friendlymoose.com. I would be happy to hear of any
 experiences you have with the code; please feel free to send me updated
 versions. The reference site for the code is http://www.friendlymoose.com/.
 Files and use
 -------------
 There are three files:
 sgt.h       SGT header file
 sgttest.cpp A test and example program
 There is no source file, as the SGT class is a template over the observation
 type, typically either an int or a double.
 Information about using the class is included in the header file. The code has
 been tested with g++ version 3.2 on cygwin and Microsoft Visual Studio version
 6 on Windows 2000. You can compile and link the test program using g++ using
 the command
     g++ -o sgttest sgttest.cpp
 For Visual Studio, from the command line, you can compile and link with
     cl -GX sgttest.cpp
 Version history
 ---------------
 Initial version released January 2004.
 Updated to a better implementation April 2004.
--- a/sgt/a.out
+++ b/sgt/a.out
--- a/sgt/freq_freqs.txt
+++ b/sgt/freq_freqs.txt
@ -0,0 +1,12 @@
 1 32
 2 20
 3 10
 4 3
 5 1
 6 2
 7 1
 8 1
 9 1
 10 2
 12 1
 26 1
--- a/sgt/sgt.h
+++ b/sgt/sgt.h
@ -0,0 +1,314 @@
 #ifndef SGT_H
 #define SGT_H
 // Simple Good-Turing estimation
 //
 // Copyright (c) David Elworthy 2004.
 // A class for implementing simple Good-Turing re-estimation, as described by
 // Geoff Sampson in the book Empirical Linguistics (2001), and on the web at
 // http://www.grsampson.net/RGoodTur.html. The code here is a C++ adaptation
 // of the published code by Sampson and Gale, with the bug fix issued in
 // 2000. It is encapsulated as a class to allow it to be incorporated into
 // other programs. An additional coding change is that the data can be
 // presented in any order, whereas the original code required the data to be
 // in ascending order.
 //
 // Copyright (c) David Elworthy 2004.
 // All rights reserved.
 //
 // Redistribution and use in source and binary forms for any purpose, with or
 // without modification, are permitted provided that the following conditions
 // are met:
 // 
 // 1. Redistributions of source code must retain the above copyright notice,
 //    this list of conditions, and the following disclaimer.
 // 
 // 2. Redistributions in binary form must reproduce the above copyright
 //    notice, this list of conditions, and the disclaimer that follows 
 //    these conditions in the documentation and/or other materials 
 //    provided with the distribution.
 //
 // THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
 // WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 // MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN
 // NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 // TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // You may contact me at david@friendlymoose.com.
 #include <iostream>
 #include <map>
 #include <vector>
 #include <cmath>
 using namespace std;
 // Simple Good-Turing class.
 // To use the class, create an SGT object and data to it by calling add() with
 // each data point. A data point consists of the observed value and the
 // frequency of the the observation (what Sampson and Gale refer to as the
 // frequency, and the frequency of the the frequency). When you have added all
 // the data points call analyse(). You can then call estimate() with an
 // observed value as argument to get the estimated frequency for that value,
 // or call iterate() to iterate over the data points. There is one special
 // case to estimate(). If called with an argument of zero, it delivers the
 // estimated frequency for unseen events. This is not delivered from pair().
 // To get back from the estimate value to the smoothed value of the
 // observation, multiply by total();
 //
 // In the original Sampson and Gale version, the observation was an integer.
 // For this version, we make the code be a template over the observation type.
 // However, it must always be some suitable numeric type, such as int or double.
 //
 // The code is implemented using the Standard Template Library (STL).
 template <class ObsType> class SGT
 {
    private:
    // Data block, holding the frequency and estimate. The estimate is set up
    // by analyse().
    struct Data
    {
        Data(unsigned int f) : freq(f), estimate(0) {}
        unsigned int freq;
        double estimate;
    };
    // Internal representation, as a map from observations to frequencies.
    // After calling analyse(), it provides the estimates as well.
    typedef map<ObsType, Data, less<ObsType> > DataMap;
    // Minimum number of data points for a valid analysis
 #ifdef _WIN32
 #define MinInput (5)
 #else
    static const unsigned int MinInput = 5;
 #endif
    template <class T> double sq(T d) { return ((double) d)*d; }
    double smoothed(ObsType i, double intercept, double slope)
    { return (exp(intercept + slope * log((double) i))); }
    public:
    // Iterator type for iterate();
    typedef typename DataMap::const_iterator iterator;
    // Construct a SGT object.
    SGT() : totalObs(0) {}
    // Destroy SGT object.
    ~SGT() {}
    // Add a data point.
    // If an observation with the same value has already been supplied, this adds
    // to its frequency.
    void add(ObsType observation, unsigned int frequency)
    {
        typename DataMap::iterator i = data.find(observation);
        if (i == data.end())
            data.insert(make_pair(observation, Data(frequency)));
        else
            (*i).second.freq += frequency;
        totalObs += observation * frequency;
    }
    // Get total number of observations (= sum of obs*freq)
    ObsType total() const { return totalObs; }
    // Analyse the data.
    // Returns false if there is not enough data for a valid analysis.
    // In this case, the estimate is set to the original value.
    bool analyse()
    {
        if (data.size() < MinInput)
            return false;
        // The code which follows is based on S and G's analyseInput()
        ObsType bigN = 0;
        unsigned int rows = data.size();
        // j could be declared in each for statement, but has to be here for
        // Visual C++, which disobeys the ANSI standard on variable scope.
        typename DataMap::iterator j;
        for (j = data.begin(); j != data.end(); ++j)
            bigN += (*j).first * (*j).second.freq;
        // Find the frequency for observation of value 1, if any
        iterator row1 = row(1, data.begin());
        PZero = (row1 == data.end()) ? 0 : (*row1).second.freq / (double) bigN;
        // Set up internal arrays
        vector<double> log_obs(rows);
        vector<double> log_Z(rows);
        vector<double> rStar(rows);
        double XYs = 0, Xsquares = 0, meanX = 0, meanY = 0;
        ObsType prevObs = 0;
        unsigned int r = 0;
        for (j = data.begin(); j != data.end(); ++r)
        {
            ObsType obs = (*j).first;
            Data &d = (*j).second;
            double k = (++j == data.end())
                ? (double) (2 * obs - prevObs) : (double) (*j).first;
            double Z   = 2 * d.freq / (k - prevObs);
            log_obs[r] = log((double) obs);
            log_Z[r]   = log(Z);
            meanX += log_obs[r];
            meanY += log_Z[r];
            prevObs = obs;
        }
        // Find the line with the best fit.
        meanX /= rows;
        meanY /= rows;
        for (r = 0; r < rows; ++r)
        {
            XYs += (log_obs[r] - meanX) * (log_Z[r] - meanY);
            Xsquares += sq(log_obs[r] - meanX);
        }
        double slope = XYs / Xsquares;
        double intercept = meanY - slope * meanX;
        // Now construct the estimates smoothing using the fitted line.
        bool indiffValsSeen = false;
        for (j = data.begin(), r = 0; j != data.end(); ++j, ++r)
        {
            ObsType obs = (*j).first;
            Data &d = (*j).second;
            ObsType obs1 = obs + 1;
            double y = obs1 * smoothed(obs1, intercept, slope)
                / smoothed(obs, intercept, slope);
            iterator nextRow = row(obs1, j);
            if (nextRow == data.end())
            {
                indiffValsSeen = true;
            }
            else if (!indiffValsSeen)
            {
                unsigned int next_n = (*nextRow).second.freq;
                unsigned int freq   = d.freq;
                double x = obs1 * next_n / (double) freq;
                printf("%0.2f %0.2f %0.2f\n",
                       (float) obs1, (float) next_n, (float) freq);
                printf("stdv %0.2f\n",
                       sqrt(sq(obs1) * next_n
                        / (sq(freq)) * (1 + next_n / (double) freq)));
                printf("x %0.2f y %0.2f\n", x, y);
                if (fabs(x - y) <= 1.96 * sqrt(sq(obs1) * next_n
                        / (sq(freq)) * (1 + next_n / (double) freq)))
                {
                    indiffValsSeen = true;
                }
                else
                {
                    rStar[r] = x;
                }
            }
            if (indiffValsSeen)
            {
                rStar[r] = y;
            }
        }
        double bigNprime = 0.0;
        for (j = data.begin(), r = 0; j != data.end(); ++j, ++r) {
            printf("%f\n", (float) (*j).second.freq);
            bigNprime += (*j).second.freq * rStar[r];
        }
        printf("%f %f\n", (float) PZero, (float) bigNprime);
        for (int i = 0; i < (int) rStar.size(); i++)
            printf("%f\n", rStar[i]);
        for (j = data.begin(), r = 0; j != data.end(); ++j, ++r)
            (*j).second.estimate = (1 - PZero) * rStar[r] / bigNprime;
        return true;
    }
    // Analyze the data.
    // This just calls analyse(), and is included as a concession to speakers
    // of debased dialects of English.
    void analyze() { analyse(); }
    // Get the estimate for an observation.
    // If there was no such observation, return false.
    // Otherwise return true and yield the estimate.
    bool estimate(ObsType observation, double &estimate) const
    {
        if (observation == 0)
        {
            estimate = PZero;
            return true;
        }
        iterator rownum = row(observation, data.begin());
        if (rownum == data.end())
        {
            return false;
        }
        estimate = (*rownum).second.estimate;
        return true;
    }
    // Get start and end iterators over the data map.
    // You do not derefence these iterators directly, but instead used the
    // access functions, obs, freq and estimate.
    pair<iterator, iterator> iterate() const
    { return make_pair(data.begin(), data.end()); }
    // Get the observation from an iterator.
    ObsType obs(iterator i) const { return (*i).first; }
    // Get the frequency from an iterator (as supplied by add).
    unsigned int freq(iterator i) const { return (*i).second.freq; }
    // Get the estimated relative frequency from an iterator.
    double estimate(iterator i) const { return (*i).second.estimate; }
    private:
    // The data points
    DataMap data;
    // Zero estimate (only valid after a call to analyse()).
    double PZero;
    // Total number of observations
    ObsType totalObs;
    // Find the last row of the data which has a value equals to obs.
    // If there is no such value, return data.end().
    // start is a hint about where to start searching.
    iterator row(ObsType obs, iterator start) const
    {
        iterator j = start;
        while (j != data.end() && (*j).first < obs)
            ++j;
        return ((j != data.end() && (*j).first == obs) ? j : data.end());
    }
 };
 #endif //SGT_H
--- a/sgt/sgt.h.gch
+++ b/sgt/sgt.h.gch
--- a/sgt/sgt.zip
+++ b/sgt/sgt.zip
--- a/sgt/sgttest.cpp
+++ b/sgt/sgttest.cpp
@ -0,0 +1,54 @@
 // Test program for sgt code
 // Reads a file in which each line contains an observed value and the
 // frequency of the value. Prints out a table of the estimates, and also the
 // estimate for value 1 (to test the estimate() function).
 #include "sgt.h"
 #include <iostream>
 // Set this to the type for the observation
 //typedef double Obs;
 typedef unsigned int Obs;
 int main()
 {
    SGT<Obs> sgt;
    Obs observation;
    unsigned int frequency;
    while (cin >> observation)
    {
        if (!(cin >> frequency))
        {
            cerr << "Incomplete input" << endl;
            return -1;
        }
        sgt.add(observation, frequency);
    }
    sgt.analyse();
    cout << "Results:" << endl;
    // Use iterators to access the results
    pair<SGT<Obs>::iterator, SGT<Obs>::iterator> i = sgt.iterate();
    for (; i.first != i.second; ++i.first)
    {
        cout << sgt.obs(i.first)
            << "\t" << sgt.freq(i.first)
            << "\t" << sgt.estimate(i.first)
            << "\t" << sgt.estimate(i.first) * sgt.total()
            << endl;
    }
    double estimate;
    sgt.estimate(0, estimate);
    cout << "0\t" << estimate << endl;
    if (sgt.estimate(1, estimate))
        cout << "Estimate on obs=1: " << estimate << endl;
    else
        cout << "No estimate for obs=1" << endl;
    return 0;
 }
--- a/src/com/owoga/prhyme/util/math.clj
+++ b/src/com/owoga/prhyme/util/math.clj
@ -157,6 +157,18 @@
    (fn [x]
      (Math/pow Math/E (+ b (* m (Math/log x)))))))
 (defn averaged-smooth
  "Assumes 0 Nrs are included."
  [rs Nrs]
  (let [rs (concat rs [(inc (last rs))])
        Nrs (concat Nrs [(+ (last Nrs) (- (last Nrs)
                                          (last (butlast Nrs))))])]
    [rs Nrs]))
 (comment
  (averaged-smooth [1 2 3 4] [32 10 0 2])
  )
 (defn average-consecutives
  "Average all the non-zero frequency of observations (frequency of frequencies)
  using the equation Zr = Nr / 0.5 (t - q)
@ -312,14 +324,28 @@
                           lgt?
                           e
                           (conj estimations estimation)))))
-        N* (apply + (map #(apply * %) (map vector nrs estimations)))]
+        N* (apply + (map #(apply * %) (map vector nrs estimations)))
-    [(map #(* (- 1 p0) (/ % N*)) estimations)]))
+        probs (cons
               (float p0)
               (map #(* (- 1 p0) (/ % N*)) estimations))
        sum-probs (apply + probs)]
    [(cons 0 rs) (map #(/ % sum-probs) probs)]))
 (comment
-  (let [rs [1 2 3 4 5 6 7 8 9 10 12 26]
+  (let [rs  [ 1  2  3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26]
-        nrs [32 20 10 3 1 2 1 1 1 2 1 1]
+        nrs [32 20 10 3 1 2 1 1 1  2  0  1  0  0  0  0  0  0  0  0  0  0  0  0  0  1]
-        sgts (sgt rs nrs)]
+        rs  [1 2 3 4 5 6 7 8 9 10 12 26]
-    sgts
+        nrs [32 20 10 3 1 2 1 1 1 2 1 1]]
-    )
+    (map #(apply * %) (map vector rs (sgt rs nrs)))
    (sgt rs nrs))
  )
 (comment
  (let [rs [1 2 3 4 5 6 7 8 9 10 12]
        nrs [120 40 24 13 15 5 11 2 2 1 3]
        sgts (sgt rs nrs)
        N0 (apply + nrs)]
    [(float (/ 120 N0))
     (apply + sgts)])
  )
+32
+20
+10
+3
+1
+2
+1
+1
+1
+2
+1
+1