Working completions
parent
cbde1996dc
commit
495ef6c602
@ -0,0 +1,72 @@
|
||||
SGT
|
||||
===
|
||||
|
||||
The files here contain a C++ class for implementing simple Good-Turing
|
||||
re-estimation, as described by Geoff Sampson in the book Empirical Linguistics
|
||||
(2001), and on the web at http://www.grsampson.net/RGoodTur.html. The code
|
||||
here is a C++ adaptation of the published code by Sampson and Gale, with the
|
||||
bug fix issued in 2000. It is encapsulated as a class to allow it to be
|
||||
incorporated into other programs. An additional coding change is that the data
|
||||
can be presented in any order, whereas the original code required the data to
|
||||
be in ascending order.
|
||||
|
||||
Sampson's original code was issued with no restrictions on use. In keeping
|
||||
with the spirit of this, the code here is issued under an open source licence
|
||||
which allows essentially unrestricted use.
|
||||
|
||||
LICENCE
|
||||
-------
|
||||
Copyright (c) David Elworthy 2004.
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms for any purpose, with or
|
||||
without modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
|
||||
1. Redistributions of source code must retain the above copyright notice,
|
||||
this list of conditions, and the following disclaimer.
|
||||
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions, and the disclaimer that follows
|
||||
these conditions in the documentation and/or other materials
|
||||
provided with the distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
|
||||
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN
|
||||
NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
|
||||
TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
Contact details
|
||||
---------------
|
||||
You may contact me at david@friendlymoose.com. I would be happy to hear of any
|
||||
experiences you have with the code; please feel free to send me updated
|
||||
versions. The reference site for the code is http://www.friendlymoose.com/.
|
||||
|
||||
Files and use
|
||||
-------------
|
||||
There are three files:
|
||||
sgt.h SGT header file
|
||||
sgttest.cpp A test and example program
|
||||
|
||||
There is no source file, as the SGT class is a template over the observation
|
||||
type, typically either an int or a double.
|
||||
|
||||
Information about using the class is included in the header file. The code has
|
||||
been tested with g++ version 3.2 on cygwin and Microsoft Visual Studio version
|
||||
6 on Windows 2000. You can compile and link the test program using g++ using
|
||||
the command
|
||||
g++ -o sgttest sgttest.cpp
|
||||
|
||||
For Visual Studio, from the command line, you can compile and link with
|
||||
cl -GX sgttest.cpp
|
||||
|
||||
Version history
|
||||
---------------
|
||||
Initial version released January 2004.
|
||||
Updated to a better implementation April 2004.
|
@ -0,0 +1,12 @@
|
||||
1 32
|
||||
2 20
|
||||
3 10
|
||||
4 3
|
||||
5 1
|
||||
6 2
|
||||
7 1
|
||||
8 1
|
||||
9 1
|
||||
10 2
|
||||
12 1
|
||||
26 1
|
@ -0,0 +1,314 @@
|
||||
#ifndef SGT_H
|
||||
#define SGT_H
|
||||
// Simple Good-Turing estimation
|
||||
//
|
||||
// Copyright (c) David Elworthy 2004.
|
||||
|
||||
// A class for implementing simple Good-Turing re-estimation, as described by
|
||||
// Geoff Sampson in the book Empirical Linguistics (2001), and on the web at
|
||||
// http://www.grsampson.net/RGoodTur.html. The code here is a C++ adaptation
|
||||
// of the published code by Sampson and Gale, with the bug fix issued in
|
||||
// 2000. It is encapsulated as a class to allow it to be incorporated into
|
||||
// other programs. An additional coding change is that the data can be
|
||||
// presented in any order, whereas the original code required the data to be
|
||||
// in ascending order.
|
||||
//
|
||||
// Copyright (c) David Elworthy 2004.
|
||||
// All rights reserved.
|
||||
//
|
||||
// Redistribution and use in source and binary forms for any purpose, with or
|
||||
// without modification, are permitted provided that the following conditions
|
||||
// are met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright notice,
|
||||
// this list of conditions, and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions, and the disclaimer that follows
|
||||
// these conditions in the documentation and/or other materials
|
||||
// provided with the distribution.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
|
||||
// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN
|
||||
// NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
|
||||
// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// You may contact me at david@friendlymoose.com.
|
||||
|
||||
#include <iostream>
|
||||
#include <map>
|
||||
#include <vector>
|
||||
#include <cmath>
|
||||
using namespace std;
|
||||
|
||||
// Simple Good-Turing class.
|
||||
// To use the class, create an SGT object and data to it by calling add() with
|
||||
// each data point. A data point consists of the observed value and the
|
||||
// frequency of the the observation (what Sampson and Gale refer to as the
|
||||
// frequency, and the frequency of the the frequency). When you have added all
|
||||
// the data points call analyse(). You can then call estimate() with an
|
||||
// observed value as argument to get the estimated frequency for that value,
|
||||
// or call iterate() to iterate over the data points. There is one special
|
||||
// case to estimate(). If called with an argument of zero, it delivers the
|
||||
// estimated frequency for unseen events. This is not delivered from pair().
|
||||
// To get back from the estimate value to the smoothed value of the
|
||||
// observation, multiply by total();
|
||||
//
|
||||
// In the original Sampson and Gale version, the observation was an integer.
|
||||
// For this version, we make the code be a template over the observation type.
|
||||
// However, it must always be some suitable numeric type, such as int or double.
|
||||
//
|
||||
// The code is implemented using the Standard Template Library (STL).
|
||||
|
||||
template <class ObsType> class SGT
|
||||
{
|
||||
private:
|
||||
// Data block, holding the frequency and estimate. The estimate is set up
|
||||
// by analyse().
|
||||
struct Data
|
||||
{
|
||||
Data(unsigned int f) : freq(f), estimate(0) {}
|
||||
|
||||
unsigned int freq;
|
||||
double estimate;
|
||||
};
|
||||
|
||||
// Internal representation, as a map from observations to frequencies.
|
||||
// After calling analyse(), it provides the estimates as well.
|
||||
typedef map<ObsType, Data, less<ObsType> > DataMap;
|
||||
|
||||
// Minimum number of data points for a valid analysis
|
||||
#ifdef _WIN32
|
||||
#define MinInput (5)
|
||||
#else
|
||||
static const unsigned int MinInput = 5;
|
||||
#endif
|
||||
|
||||
template <class T> double sq(T d) { return ((double) d)*d; }
|
||||
|
||||
double smoothed(ObsType i, double intercept, double slope)
|
||||
{ return (exp(intercept + slope * log((double) i))); }
|
||||
|
||||
public:
|
||||
// Iterator type for iterate();
|
||||
typedef typename DataMap::const_iterator iterator;
|
||||
|
||||
// Construct a SGT object.
|
||||
SGT() : totalObs(0) {}
|
||||
|
||||
// Destroy SGT object.
|
||||
~SGT() {}
|
||||
|
||||
// Add a data point.
|
||||
// If an observation with the same value has already been supplied, this adds
|
||||
// to its frequency.
|
||||
void add(ObsType observation, unsigned int frequency)
|
||||
{
|
||||
typename DataMap::iterator i = data.find(observation);
|
||||
if (i == data.end())
|
||||
data.insert(make_pair(observation, Data(frequency)));
|
||||
else
|
||||
(*i).second.freq += frequency;
|
||||
|
||||
totalObs += observation * frequency;
|
||||
}
|
||||
|
||||
// Get total number of observations (= sum of obs*freq)
|
||||
ObsType total() const { return totalObs; }
|
||||
|
||||
// Analyse the data.
|
||||
// Returns false if there is not enough data for a valid analysis.
|
||||
// In this case, the estimate is set to the original value.
|
||||
bool analyse()
|
||||
{
|
||||
if (data.size() < MinInput)
|
||||
return false;
|
||||
|
||||
// The code which follows is based on S and G's analyseInput()
|
||||
ObsType bigN = 0;
|
||||
unsigned int rows = data.size();
|
||||
|
||||
// j could be declared in each for statement, but has to be here for
|
||||
// Visual C++, which disobeys the ANSI standard on variable scope.
|
||||
typename DataMap::iterator j;
|
||||
for (j = data.begin(); j != data.end(); ++j)
|
||||
bigN += (*j).first * (*j).second.freq;
|
||||
|
||||
// Find the frequency for observation of value 1, if any
|
||||
iterator row1 = row(1, data.begin());
|
||||
PZero = (row1 == data.end()) ? 0 : (*row1).second.freq / (double) bigN;
|
||||
|
||||
// Set up internal arrays
|
||||
vector<double> log_obs(rows);
|
||||
vector<double> log_Z(rows);
|
||||
vector<double> rStar(rows);
|
||||
|
||||
double XYs = 0, Xsquares = 0, meanX = 0, meanY = 0;
|
||||
ObsType prevObs = 0;
|
||||
unsigned int r = 0;
|
||||
|
||||
for (j = data.begin(); j != data.end(); ++r)
|
||||
{
|
||||
ObsType obs = (*j).first;
|
||||
Data &d = (*j).second;
|
||||
|
||||
double k = (++j == data.end())
|
||||
? (double) (2 * obs - prevObs) : (double) (*j).first;
|
||||
|
||||
double Z = 2 * d.freq / (k - prevObs);
|
||||
log_obs[r] = log((double) obs);
|
||||
log_Z[r] = log(Z);
|
||||
|
||||
meanX += log_obs[r];
|
||||
meanY += log_Z[r];
|
||||
|
||||
prevObs = obs;
|
||||
}
|
||||
|
||||
// Find the line with the best fit.
|
||||
meanX /= rows;
|
||||
meanY /= rows;
|
||||
|
||||
for (r = 0; r < rows; ++r)
|
||||
{
|
||||
XYs += (log_obs[r] - meanX) * (log_Z[r] - meanY);
|
||||
Xsquares += sq(log_obs[r] - meanX);
|
||||
}
|
||||
double slope = XYs / Xsquares;
|
||||
double intercept = meanY - slope * meanX;
|
||||
|
||||
// Now construct the estimates smoothing using the fitted line.
|
||||
bool indiffValsSeen = false;
|
||||
|
||||
for (j = data.begin(), r = 0; j != data.end(); ++j, ++r)
|
||||
{
|
||||
ObsType obs = (*j).first;
|
||||
Data &d = (*j).second;
|
||||
|
||||
ObsType obs1 = obs + 1;
|
||||
double y = obs1 * smoothed(obs1, intercept, slope)
|
||||
/ smoothed(obs, intercept, slope);
|
||||
|
||||
iterator nextRow = row(obs1, j);
|
||||
if (nextRow == data.end())
|
||||
{
|
||||
indiffValsSeen = true;
|
||||
}
|
||||
else if (!indiffValsSeen)
|
||||
{
|
||||
unsigned int next_n = (*nextRow).second.freq;
|
||||
unsigned int freq = d.freq;
|
||||
|
||||
double x = obs1 * next_n / (double) freq;
|
||||
printf("%0.2f %0.2f %0.2f\n",
|
||||
(float) obs1, (float) next_n, (float) freq);
|
||||
printf("stdv %0.2f\n",
|
||||
sqrt(sq(obs1) * next_n
|
||||
/ (sq(freq)) * (1 + next_n / (double) freq)));
|
||||
printf("x %0.2f y %0.2f\n", x, y);
|
||||
|
||||
if (fabs(x - y) <= 1.96 * sqrt(sq(obs1) * next_n
|
||||
/ (sq(freq)) * (1 + next_n / (double) freq)))
|
||||
{
|
||||
indiffValsSeen = true;
|
||||
}
|
||||
else
|
||||
{
|
||||
rStar[r] = x;
|
||||
}
|
||||
}
|
||||
|
||||
if (indiffValsSeen)
|
||||
{
|
||||
rStar[r] = y;
|
||||
}
|
||||
}
|
||||
|
||||
double bigNprime = 0.0;
|
||||
for (j = data.begin(), r = 0; j != data.end(); ++j, ++r) {
|
||||
printf("%f\n", (float) (*j).second.freq);
|
||||
bigNprime += (*j).second.freq * rStar[r];
|
||||
}
|
||||
|
||||
printf("%f %f\n", (float) PZero, (float) bigNprime);
|
||||
for (int i = 0; i < (int) rStar.size(); i++)
|
||||
printf("%f\n", rStar[i]);
|
||||
|
||||
for (j = data.begin(), r = 0; j != data.end(); ++j, ++r)
|
||||
(*j).second.estimate = (1 - PZero) * rStar[r] / bigNprime;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// Analyze the data.
|
||||
// This just calls analyse(), and is included as a concession to speakers
|
||||
// of debased dialects of English.
|
||||
void analyze() { analyse(); }
|
||||
|
||||
// Get the estimate for an observation.
|
||||
// If there was no such observation, return false.
|
||||
// Otherwise return true and yield the estimate.
|
||||
bool estimate(ObsType observation, double &estimate) const
|
||||
{
|
||||
if (observation == 0)
|
||||
{
|
||||
estimate = PZero;
|
||||
return true;
|
||||
}
|
||||
|
||||
iterator rownum = row(observation, data.begin());
|
||||
if (rownum == data.end())
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
estimate = (*rownum).second.estimate;
|
||||
return true;
|
||||
}
|
||||
|
||||
// Get start and end iterators over the data map.
|
||||
// You do not derefence these iterators directly, but instead used the
|
||||
// access functions, obs, freq and estimate.
|
||||
pair<iterator, iterator> iterate() const
|
||||
{ return make_pair(data.begin(), data.end()); }
|
||||
|
||||
// Get the observation from an iterator.
|
||||
ObsType obs(iterator i) const { return (*i).first; }
|
||||
|
||||
// Get the frequency from an iterator (as supplied by add).
|
||||
unsigned int freq(iterator i) const { return (*i).second.freq; }
|
||||
|
||||
// Get the estimated relative frequency from an iterator.
|
||||
double estimate(iterator i) const { return (*i).second.estimate; }
|
||||
|
||||
private:
|
||||
// The data points
|
||||
DataMap data;
|
||||
|
||||
// Zero estimate (only valid after a call to analyse()).
|
||||
double PZero;
|
||||
|
||||
// Total number of observations
|
||||
ObsType totalObs;
|
||||
|
||||
// Find the last row of the data which has a value equals to obs.
|
||||
// If there is no such value, return data.end().
|
||||
// start is a hint about where to start searching.
|
||||
iterator row(ObsType obs, iterator start) const
|
||||
{
|
||||
iterator j = start;
|
||||
|
||||
while (j != data.end() && (*j).first < obs)
|
||||
++j;
|
||||
|
||||
return ((j != data.end() && (*j).first == obs) ? j : data.end());
|
||||
}
|
||||
};
|
||||
|
||||
#endif //SGT_H
|
Binary file not shown.
Binary file not shown.
@ -0,0 +1,54 @@
|
||||
// Test program for sgt code
|
||||
|
||||
// Reads a file in which each line contains an observed value and the
|
||||
// frequency of the value. Prints out a table of the estimates, and also the
|
||||
// estimate for value 1 (to test the estimate() function).
|
||||
|
||||
#include "sgt.h"
|
||||
#include <iostream>
|
||||
|
||||
// Set this to the type for the observation
|
||||
//typedef double Obs;
|
||||
typedef unsigned int Obs;
|
||||
|
||||
int main()
|
||||
{
|
||||
SGT<Obs> sgt;
|
||||
Obs observation;
|
||||
unsigned int frequency;
|
||||
while (cin >> observation)
|
||||
{
|
||||
if (!(cin >> frequency))
|
||||
{
|
||||
cerr << "Incomplete input" << endl;
|
||||
return -1;
|
||||
}
|
||||
|
||||
sgt.add(observation, frequency);
|
||||
}
|
||||
|
||||
sgt.analyse();
|
||||
cout << "Results:" << endl;
|
||||
|
||||
// Use iterators to access the results
|
||||
pair<SGT<Obs>::iterator, SGT<Obs>::iterator> i = sgt.iterate();
|
||||
for (; i.first != i.second; ++i.first)
|
||||
{
|
||||
cout << sgt.obs(i.first)
|
||||
<< "\t" << sgt.freq(i.first)
|
||||
<< "\t" << sgt.estimate(i.first)
|
||||
<< "\t" << sgt.estimate(i.first) * sgt.total()
|
||||
<< endl;
|
||||
}
|
||||
|
||||
double estimate;
|
||||
sgt.estimate(0, estimate);
|
||||
cout << "0\t" << estimate << endl;
|
||||
|
||||
if (sgt.estimate(1, estimate))
|
||||
cout << "Estimate on obs=1: " << estimate << endl;
|
||||
else
|
||||
cout << "No estimate for obs=1" << endl;
|
||||
return 0;
|
||||
}
|
||||
|
Loading…
Reference in New Issue