Working completions
parent
cbde1996dc
commit
495ef6c602
@ -0,0 +1,72 @@
|
|||||||
|
SGT
|
||||||
|
===
|
||||||
|
|
||||||
|
The files here contain a C++ class for implementing simple Good-Turing
|
||||||
|
re-estimation, as described by Geoff Sampson in the book Empirical Linguistics
|
||||||
|
(2001), and on the web at http://www.grsampson.net/RGoodTur.html. The code
|
||||||
|
here is a C++ adaptation of the published code by Sampson and Gale, with the
|
||||||
|
bug fix issued in 2000. It is encapsulated as a class to allow it to be
|
||||||
|
incorporated into other programs. An additional coding change is that the data
|
||||||
|
can be presented in any order, whereas the original code required the data to
|
||||||
|
be in ascending order.
|
||||||
|
|
||||||
|
Sampson's original code was issued with no restrictions on use. In keeping
|
||||||
|
with the spirit of this, the code here is issued under an open source licence
|
||||||
|
which allows essentially unrestricted use.
|
||||||
|
|
||||||
|
LICENCE
|
||||||
|
-------
|
||||||
|
Copyright (c) David Elworthy 2004.
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms for any purpose, with or
|
||||||
|
without modification, are permitted provided that the following conditions
|
||||||
|
are met:
|
||||||
|
|
||||||
|
1. Redistributions of source code must retain the above copyright notice,
|
||||||
|
this list of conditions, and the following disclaimer.
|
||||||
|
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions, and the disclaimer that follows
|
||||||
|
these conditions in the documentation and/or other materials
|
||||||
|
provided with the distribution.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||||
|
WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
|
||||||
|
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN
|
||||||
|
NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
|
||||||
|
TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||||
|
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||||
|
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||||
|
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||||
|
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
Contact details
|
||||||
|
---------------
|
||||||
|
You may contact me at david@friendlymoose.com. I would be happy to hear of any
|
||||||
|
experiences you have with the code; please feel free to send me updated
|
||||||
|
versions. The reference site for the code is http://www.friendlymoose.com/.
|
||||||
|
|
||||||
|
Files and use
|
||||||
|
-------------
|
||||||
|
There are three files:
|
||||||
|
sgt.h SGT header file
|
||||||
|
sgttest.cpp A test and example program
|
||||||
|
|
||||||
|
There is no source file, as the SGT class is a template over the observation
|
||||||
|
type, typically either an int or a double.
|
||||||
|
|
||||||
|
Information about using the class is included in the header file. The code has
|
||||||
|
been tested with g++ version 3.2 on cygwin and Microsoft Visual Studio version
|
||||||
|
6 on Windows 2000. You can compile and link the test program with g++ using
|
||||||
|
the command
|
||||||
|
g++ -o sgttest sgttest.cpp
|
||||||
|
|
||||||
|
For Visual Studio, from the command line, you can compile and link with
|
||||||
|
cl -GX sgttest.cpp
|
||||||
|
|
||||||
|
Version history
|
||||||
|
---------------
|
||||||
|
Initial version released January 2004.
|
||||||
|
Updated to a better implementation April 2004.
|
@ -0,0 +1,12 @@
|
|||||||
|
1 32
|
||||||
|
2 20
|
||||||
|
3 10
|
||||||
|
4 3
|
||||||
|
5 1
|
||||||
|
6 2
|
||||||
|
7 1
|
||||||
|
8 1
|
||||||
|
9 1
|
||||||
|
10 2
|
||||||
|
12 1
|
||||||
|
26 1
|
@ -0,0 +1,314 @@
|
|||||||
|
#ifndef SGT_H
|
||||||
|
#define SGT_H
|
||||||
|
// Simple Good-Turing estimation
|
||||||
|
//
|
||||||
|
// Copyright (c) David Elworthy 2004.
|
||||||
|
|
||||||
|
// A class for implementing simple Good-Turing re-estimation, as described by
|
||||||
|
// Geoff Sampson in the book Empirical Linguistics (2001), and on the web at
|
||||||
|
// http://www.grsampson.net/RGoodTur.html. The code here is a C++ adaptation
|
||||||
|
// of the published code by Sampson and Gale, with the bug fix issued in
|
||||||
|
// 2000. It is encapsulated as a class to allow it to be incorporated into
|
||||||
|
// other programs. An additional coding change is that the data can be
|
||||||
|
// presented in any order, whereas the original code required the data to be
|
||||||
|
// in ascending order.
|
||||||
|
//
|
||||||
|
// Copyright (c) David Elworthy 2004.
|
||||||
|
// All rights reserved.
|
||||||
|
//
|
||||||
|
// Redistribution and use in source and binary forms for any purpose, with or
|
||||||
|
// without modification, are permitted provided that the following conditions
|
||||||
|
// are met:
|
||||||
|
//
|
||||||
|
// 1. Redistributions of source code must retain the above copyright notice,
|
||||||
|
// this list of conditions, and the following disclaimer.
|
||||||
|
//
|
||||||
|
// 2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
// notice, this list of conditions, and the disclaimer that follows
|
||||||
|
// these conditions in the documentation and/or other materials
|
||||||
|
// provided with the distribution.
|
||||||
|
//
|
||||||
|
// THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||||
|
// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
|
||||||
|
// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN
|
||||||
|
// NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
|
||||||
|
// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||||
|
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||||
|
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||||
|
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||||
|
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
//
|
||||||
|
// You may contact me at david@friendlymoose.com.
|
||||||
|
|
||||||
|
#include <iostream>
|
||||||
|
#include <map>
|
||||||
|
#include <vector>
|
||||||
|
#include <cmath>
|
||||||
|
using namespace std;
|
||||||
|
|
||||||
|
// Simple Good-Turing class.
|
||||||
|
// To use the class, create an SGT object and add data to it by calling add() with
|
||||||
|
// each data point. A data point consists of the observed value and the
|
||||||
|
// frequency of the observation (what Sampson and Gale refer to as the
|
||||||
|
// frequency, and the frequency of the frequency). When you have added all
|
||||||
|
// the data points call analyse(). You can then call estimate() with an
|
||||||
|
// observed value as argument to get the estimated frequency for that value,
|
||||||
|
// or call iterate() to iterate over the data points. There is one special
|
||||||
|
// case to estimate(). If called with an argument of zero, it delivers the
|
||||||
|
// estimated frequency for unseen events. This is not delivered by iterate().
|
||||||
|
// To get back from the estimate value to the smoothed value of the
|
||||||
|
// observation, multiply by total().
|
||||||
|
//
|
||||||
|
// In the original Sampson and Gale version, the observation was an integer.
|
||||||
|
// For this version, we make the code be a template over the observation type.
|
||||||
|
// However, it must always be some suitable numeric type, such as int or double.
|
||||||
|
//
|
||||||
|
// The code is implemented using the Standard Template Library (STL).
|
||||||
|
|
||||||
|
// Simple Good-Turing estimator.
//
// ObsType is the numeric type of an observation (for example unsigned int or
// double). Feed the table in with add(), call analyse() once, then read the
// results back with estimate() or by walking iterate().
template <class ObsType> class SGT
{
private:
    // One row of the table: the frequency supplied through add() and the
    // estimate filled in by analyse() (0 until analyse() has succeeded).
    struct Data
    {
        explicit Data(unsigned int f) : freq(f), estimate(0) {}

        unsigned int freq;
        double estimate;
    };

    // Rows keyed by observation; the map keeps them in ascending order, which
    // is why callers may add data points in any order.
    typedef std::map<ObsType, Data> DataMap;

    // Minimum number of distinct observations for a valid analysis.
    // An enum constant is used instead of a macro or an in-class static
    // initialiser so that it stays scoped to the class and still compiles on
    // old compilers.
    enum { MinInput = 5 };

    // Square of d, computed in double precision.
    template <class T> static double sq(T d)
    { return static_cast<double>(d) * d; }

    // Value of the fitted log-log line at observation i.
    static double smoothed(ObsType i, double intercept, double slope)
    { return std::exp(intercept + slope * std::log(static_cast<double>(i))); }

public:
    // Iterator type for iterate().
    typedef typename DataMap::const_iterator iterator;

    // Construct an empty SGT object. The zero estimate starts at 0 so that
    // estimate(0, ...) is well defined before analyse() has been called.
    SGT() : PZero(0), totalObs(0) {}

    // Add a data point.
    // If an observation with the same value has already been supplied, this
    // adds to its frequency.
    void add(ObsType observation, unsigned int frequency)
    {
        typename DataMap::iterator i = data.find(observation);
        if (i == data.end())
            data.insert(typename DataMap::value_type(observation, Data(frequency)));
        else
            i->second.freq += frequency;

        totalObs += observation * frequency;
    }

    // Get total number of observations (= sum of obs*freq).
    ObsType total() const { return totalObs; }

    // Analyse the data.
    // Returns false, leaving all estimates untouched, if there are fewer than
    // MinInput distinct observations, or if any row cannot be used by the
    // log-log regression (an observation that is not positive, or a frequency
    // of zero). Returns true once every estimate has been filled in.
    bool analyse()
    {
        if (data.size() < static_cast<typename DataMap::size_type>(MinInput))
            return false;

        // The code which follows is based on S and G's analyseInput().
        // j is declared once, outside the for statements, for the benefit of
        // compilers that do not scope for-loop variables correctly.
        typename DataMap::iterator j;

        // Validate the rows and accumulate N = sum of obs*freq. Taking the log
        // of a non-positive observation or of a zero frequency would turn
        // every estimate into NaN, so such input is rejected up front.
        ObsType bigN = 0;
        for (j = data.begin(); j != data.end(); ++j)
        {
            if (!(j->first > 0) || j->second.freq == 0)
                return false;
            bigN += j->first * j->second.freq;
        }

        const unsigned int rows = static_cast<unsigned int>(data.size());

        // Probability mass for unseen events: n1 / N, or 0 if nothing was
        // observed exactly once.
        const iterator row1 = data.find(static_cast<ObsType>(1));
        PZero = (row1 == data.end())
            ? 0.0 : row1->second.freq / static_cast<double>(bigN);

        // Set up internal arrays, indexed by row number.
        std::vector<double> log_obs(rows);
        std::vector<double> log_Z(rows);
        std::vector<double> rStar(rows);

        // Compute log(obs) and log(Z) for each row, where Z averages the
        // frequency over the gap between the neighbouring observations.
        double meanX = 0, meanY = 0;
        ObsType prevObs = 0;
        unsigned int r = 0;
        for (j = data.begin(); j != data.end(); ++j, ++r)
        {
            const ObsType obs = j->first;

            typename DataMap::iterator next = j;
            ++next;

            // For the last row there is no following observation, so the gap
            // is extended symmetrically.
            const double k = (next == data.end())
                ? static_cast<double>(2 * obs - prevObs)
                : static_cast<double>(next->first);

            const double Z = 2 * j->second.freq / (k - prevObs);
            log_obs[r] = std::log(static_cast<double>(obs));
            log_Z[r] = std::log(Z);

            meanX += log_obs[r];
            meanY += log_Z[r];

            prevObs = obs;
        }

        // Find the line with the best (least squares) fit.
        meanX /= rows;
        meanY /= rows;

        double XYs = 0, Xsquares = 0;
        for (r = 0; r < rows; ++r)
        {
            XYs += (log_obs[r] - meanX) * (log_Z[r] - meanY);
            Xsquares += sq(log_obs[r] - meanX);
        }
        const double slope = XYs / Xsquares;
        const double intercept = meanY - slope * meanX;

        // Now construct the estimates. The unsmoothed (Turing) value x is used
        // until it stops being significantly different from the smoothed
        // value y, or until the next observation is missing; from then on the
        // smoothed value is used for every remaining row.
        bool indiffValsSeen = false;
        for (j = data.begin(), r = 0; j != data.end(); ++j, ++r)
        {
            const ObsType obs = j->first;
            const ObsType obs1 = obs + 1;
            const double y = obs1 * smoothed(obs1, intercept, slope)
                / smoothed(obs, intercept, slope);

            const iterator nextRow = row(obs1, j);
            if (nextRow == data.end())
            {
                indiffValsSeen = true;
            }
            else if (!indiffValsSeen)
            {
                const unsigned int next_n = nextRow->second.freq;
                const unsigned int cur_n = j->second.freq;

                const double x = obs1 * next_n / static_cast<double>(cur_n);
                const double sd = std::sqrt(sq(obs1) * next_n / sq(cur_n)
                    * (1 + next_n / static_cast<double>(cur_n)));

                if (std::fabs(x - y) <= 1.96 * sd)
                    indiffValsSeen = true;
                else
                    rStar[r] = x;
            }

            if (indiffValsSeen)
                rStar[r] = y;
        }

        // Renormalise so that the seen events share the mass 1 - PZero.
        double bigNprime = 0.0;
        for (j = data.begin(), r = 0; j != data.end(); ++j, ++r)
            bigNprime += j->second.freq * rStar[r];

        for (j = data.begin(), r = 0; j != data.end(); ++j, ++r)
            j->second.estimate = (1 - PZero) * rStar[r] / bigNprime;

        return true;
    }

    // Analyze the data.
    // Alternative spelling of analyse(); forwards to it and returns its result.
    bool analyze() { return analyse(); }

    // Get the estimate for an observation.
    // An observation of zero yields the estimate for unseen events.
    // If there was no such observation, return false.
    // Otherwise return true and yield the estimate.
    bool estimate(ObsType observation, double &estimate) const
    {
        if (observation == 0)
        {
            estimate = PZero;
            return true;
        }

        const iterator found = data.find(observation);
        if (found == data.end())
            return false;

        estimate = found->second.estimate;
        return true;
    }

    // Get start and end iterators over the data map.
    // Do not dereference these iterators directly; instead use the access
    // functions obs, freq and estimate.
    std::pair<iterator, iterator> iterate() const
    { return std::pair<iterator, iterator>(data.begin(), data.end()); }

    // Get the observation from an iterator.
    ObsType obs(iterator i) const { return i->first; }

    // Get the frequency from an iterator (as supplied by add).
    unsigned int freq(iterator i) const { return i->second.freq; }

    // Get the estimated relative frequency from an iterator.
    double estimate(iterator i) const { return i->second.estimate; }

private:
    // The data points.
    DataMap data;

    // Zero estimate (0 until analyse() has computed it).
    double PZero;

    // Total number of observations.
    ObsType totalObs;

    // Find the row of the data whose observation equals obs, scanning forward
    // from start. If there is no such row, return data.end().
    // start is a hint about where to start searching; it must not lie beyond
    // the row being looked for.
    iterator row(ObsType obs, iterator start) const
    {
        iterator j = start;

        while (j != data.end() && j->first < obs)
            ++j;

        return (j != data.end() && j->first == obs) ? j : iterator(data.end());
    }
};
|
||||||
|
|
||||||
|
#endif //SGT_H
|
Binary file not shown.
Binary file not shown.
@ -0,0 +1,54 @@
|
|||||||
|
// Test program for sgt code
|
||||||
|
|
||||||
|
// Reads a file in which each line contains an observed value and the
|
||||||
|
// frequency of the value. Prints out a table of the estimates, and also the
|
||||||
|
// estimate for value 1 (to test the estimate() function).
|
||||||
|
|
||||||
|
#include "sgt.h"
|
||||||
|
#include <iostream>
|
||||||
|
|
||||||
|
// Set this to the type for the observation
|
||||||
|
//typedef double Obs;
|
||||||
|
typedef unsigned int Obs;
|
||||||
|
|
||||||
|
int main()
|
||||||
|
{
|
||||||
|
SGT<Obs> sgt;
|
||||||
|
Obs observation;
|
||||||
|
unsigned int frequency;
|
||||||
|
while (cin >> observation)
|
||||||
|
{
|
||||||
|
if (!(cin >> frequency))
|
||||||
|
{
|
||||||
|
cerr << "Incomplete input" << endl;
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
sgt.add(observation, frequency);
|
||||||
|
}
|
||||||
|
|
||||||
|
sgt.analyse();
|
||||||
|
cout << "Results:" << endl;
|
||||||
|
|
||||||
|
// Use iterators to access the results
|
||||||
|
pair<SGT<Obs>::iterator, SGT<Obs>::iterator> i = sgt.iterate();
|
||||||
|
for (; i.first != i.second; ++i.first)
|
||||||
|
{
|
||||||
|
cout << sgt.obs(i.first)
|
||||||
|
<< "\t" << sgt.freq(i.first)
|
||||||
|
<< "\t" << sgt.estimate(i.first)
|
||||||
|
<< "\t" << sgt.estimate(i.first) * sgt.total()
|
||||||
|
<< endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
double estimate;
|
||||||
|
sgt.estimate(0, estimate);
|
||||||
|
cout << "0\t" << estimate << endl;
|
||||||
|
|
||||||
|
if (sgt.estimate(1, estimate))
|
||||||
|
cout << "Estimate on obs=1: " << estimate << endl;
|
||||||
|
else
|
||||||
|
cout << "No estimate for obs=1" << endl;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
Loading…
Reference in New Issue