Add sample simple good turing C code
parent
ab79181da5
commit
45e88a495b
@ -0,0 +1,299 @@
|
||||
/*
|
||||
*
|
||||
*
|
||||
* Simple Good-Turing Frequency Estimator
|
||||
*
|
||||
*
|
||||
* Geoffrey Sampson, with help from Miles Dennis
|
||||
*
|
||||
* Department of Informatics
|
||||
* Sussex University
|
||||
*
|
||||
* www.grsampson.net
|
||||
*
|
||||
*
|
||||
* First release: 27 June 1995
|
||||
* Revised release: 24 July 2000
|
||||
* This header information revised: 23 March 2005
|
||||
* Further revised release: 8 April 2008
|
||||
* Further revised release: 11 July 2008
|
||||
*
|
||||
*
|
||||
* Takes a set of (frequency, frequency-of-frequency) pairs, and
|
||||
* applies the "Simple Good-Turing" technique for estimating
|
||||
* the probabilities corresponding to the observed frequencies,
|
||||
* and P.0, the joint probability of all unobserved species.
|
||||
* The Simple Good-Turing technique was devised by the late William
|
||||
* A. Gale of AT&T Bell Labs, and described in Gale & Sampson,
|
||||
* "Good-Turing Frequency Estimation Without Tears" (JOURNAL
|
||||
* OF QUANTITATIVE LINGUISTICS, vol. 2, pp. 217-37 -- reprinted in
|
||||
* Geoffrey Sampson, EMPIRICAL LINGUISTICS, Continuum, 2001).
|
||||
*
|
||||
* Anyone is welcome to take copies of this program and use it
|
||||
* for any purpose, at his or her own risk. If it is used in
|
||||
* connexion with published work, acknowledgment of Sampson and
|
||||
* the University of Sussex would be a welcome courtesy.
|
||||
*
|
||||
* The program is written to take input from "stdin" and send output
|
||||
* to "stdout"; redirection can be used to take input from and
|
||||
* send output to permanent files. The code is in ANSI standard C.
|
||||
*
|
||||
* The input file should be a series of lines separated by newline
|
||||
* characters, where all nonblank lines contain two positive integers
|
||||
* (an observed frequency, followed by the frequency of that frequency)
|
||||
* separated by whitespace. (Blank lines are ignored.)
|
||||
* The lines should be in ascending order of frequency, and must
|
||||
* begin with frequency 1.
|
||||
*
|
||||
* No checks are made for linearity; the program simply assumes that the
|
||||
* requirements for using the SGT estimator are met.
|
||||
*
|
||||
* The output is a series of lines each containing an integer followed
|
||||
* by a probability (a real number between zero and one), separated by a
|
||||
* tab. In the first line, the integer is 0 and the real number is the
|
||||
* estimate for P.0. In subsequent lines, the integers are the
|
||||
* successive observed frequencies, and the reals are the estimated
|
||||
* probabilities corresponding to those frequencies.
|
||||
*
|
||||
* Later releases cure bugs to which my attention has kindly been
|
||||
* drawn at different times by Martin Jansche of Ohio State University
|
||||
* and Steve Arons of New York City. No warranty is given
|
||||
* as to absence of further bugs.
|
||||
*
|
||||
* Fan Yang of Next IT Inc., Spokane, Washington, has suggested to me
|
||||
* that in the light of his experience with the SGT technique, for some
|
||||
* data-sets it could be preferable to use the 0.1 significance criterion
|
||||
* actually used in the experiments reported in the Gale & Sampson
|
||||
* paper, rather than the 0.05 criterion suggested in that paper
|
||||
* for the sake of conformity with standard statistical convention.
|
||||
* (See note 8 of the paper.) Neither Fan Yang nor I have pursued
|
||||
* this far enough to formulate a definite recommendation; but, in
|
||||
* order to make it easier for users of the software to experiment
|
||||
* with alternative confidence levels, the July 2008 release moves
|
||||
* the relevant "magic number" out of the middle of the program into
|
||||
* a #define line near the beginning where it is given the constant
|
||||
* name CONFID_FACTOR. The value 1.96 corresponds to the p < 0.05
|
||||
* criterion; in order to use the p < 0.1 criterion, 1.96 in the
|
||||
* #define line should be changed to 1.65.
|
||||
*
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
#include <stdio.h>
|
||||
#include <math.h>
|
||||
#include <ctype.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#define TRUE 1
|
||||
#define FALSE 0
|
||||
#define MAX_LINE 100
|
||||
#define MAX_ROWS 200
|
||||
#define MIN_INPUT 5
|
||||
#define CONFID_FACTOR 1.96
|
||||
|
||||
int r[MAX_ROWS], n[MAX_ROWS];
|
||||
double Z[MAX_ROWS], log_r[MAX_ROWS], log_Z[MAX_ROWS],
|
||||
rStar[MAX_ROWS], p[MAX_ROWS];
|
||||
int rows, bigN;
|
||||
double PZero, bigNprime, slope, intercept;
|
||||
|
||||
int main(void)
|
||||
{
|
||||
int readValidInput(void);
|
||||
void analyseInput(void);
|
||||
|
||||
if ((rows = readValidInput()) >= 0)
|
||||
{
|
||||
if (rows < MIN_INPUT)
|
||||
printf("\nFewer than %d input value-pairs\n",
|
||||
MIN_INPUT);
|
||||
else
|
||||
analyseInput();
|
||||
}
|
||||
return(TRUE);
|
||||
}
|
||||
|
||||
double sq(double x)
|
||||
{
|
||||
return(x * x);
|
||||
}
|
||||
|
||||
int readValidInput(void)
|
||||
/*
|
||||
* returns number of rows if input file is valid, else -1
|
||||
* NB: number of rows is one more than index of last row
|
||||
*
|
||||
*/
|
||||
|
||||
{
|
||||
char line[MAX_LINE];
|
||||
const char* whiteSpace = " \t\n\v\f\r";
|
||||
int lineNumber = 0;
|
||||
int rowNumber = 0;
|
||||
const int error = -1;
|
||||
|
||||
while (fgets(line, MAX_LINE, stdin) != NULL && rowNumber < MAX_ROWS)
|
||||
{
|
||||
char* ptr = line;
|
||||
char* integer;
|
||||
int i;
|
||||
|
||||
++lineNumber;
|
||||
|
||||
while (isspace(*ptr))
|
||||
++ptr; /* skip white space at the start of a line */
|
||||
if (*ptr == '\0')
|
||||
continue;
|
||||
if ((integer = strtok(ptr, whiteSpace)) == NULL ||
|
||||
(i = atoi(integer)) < 1)
|
||||
{
|
||||
fprintf(stderr, "Invalid field 1, line %d\n",
|
||||
lineNumber);
|
||||
return(error);
|
||||
}
|
||||
if (rowNumber > 0 && i <= r[rowNumber - 1])
|
||||
{
|
||||
fprintf(stderr,
|
||||
"Frequency not in ascending order, line %d\n",
|
||||
lineNumber);
|
||||
return(error);
|
||||
}
|
||||
r[rowNumber] = i;
|
||||
if ((integer = strtok(NULL, whiteSpace)) == NULL ||
|
||||
(i = atoi(integer)) < 1)
|
||||
{
|
||||
fprintf(stderr, "Invalid field 2, line %d\n",
|
||||
lineNumber);
|
||||
return(error);
|
||||
}
|
||||
n[rowNumber] = i;
|
||||
if (strtok(NULL, whiteSpace) != NULL)
|
||||
{
|
||||
fprintf(stderr, "Invalid extra field, line %d\n",
|
||||
lineNumber);
|
||||
return(error);
|
||||
}
|
||||
++rowNumber;
|
||||
}
|
||||
if (rowNumber >= MAX_ROWS)
|
||||
{
|
||||
fprintf(stderr, "\nInsufficient memory reserved for input\
|
||||
values\nYou need to change the definition of\
|
||||
MAX_ROWS\n");
|
||||
return(error);
|
||||
}
|
||||
return(rowNumber);
|
||||
}
|
||||
|
||||
void findBestFit(void)
|
||||
{
|
||||
double XYs, Xsquares, meanX, meanY;
|
||||
double sq(double);
|
||||
int i;
|
||||
|
||||
XYs = Xsquares = meanX = meanY = 0.0;
|
||||
for (i = 0; i < rows; ++i)
|
||||
{
|
||||
meanX += log_r[i];
|
||||
meanY += log_Z[i];
|
||||
}
|
||||
meanX /= rows;
|
||||
meanY /= rows;
|
||||
for (i = 0; i < rows; ++i)
|
||||
{
|
||||
XYs += (log_r[i] - meanX) * (log_Z[i] - meanY);
|
||||
Xsquares += sq(log_r[i] - meanX);
|
||||
}
|
||||
slope = XYs / Xsquares;
|
||||
intercept = meanY - slope * meanX;
|
||||
}
|
||||
|
||||
double smoothed(int i)
|
||||
{
|
||||
return(exp(intercept + slope * log(i)));
|
||||
}
|
||||
|
||||
int row(int i)
|
||||
{
|
||||
int j = 0;
|
||||
|
||||
while (j < rows && r[j] < i)
|
||||
++j;
|
||||
return((j < rows && r[j] == i) ? j : -1);
|
||||
}
|
||||
|
||||
void showEstimates(void)
|
||||
{
|
||||
int i;
|
||||
|
||||
printf("0\t%.4g\n", PZero);
|
||||
for (i = 0; i < rows; ++i)
|
||||
printf("%d\t%.4g\n", r[i], p[i]);
|
||||
}
|
||||
|
||||
void analyseInput(void)
|
||||
{
|
||||
int i, j, next_n;
|
||||
double k, x, y;
|
||||
int indiffValsSeen = FALSE;
|
||||
int row(int);
|
||||
void findBestFit(void);
|
||||
double smoothed(int);
|
||||
double sq(double);
|
||||
void showEstimates(void);
|
||||
|
||||
bigN = 0;
|
||||
for (j = 0; j < rows; ++j)
|
||||
bigN += r[j] * n[j];
|
||||
next_n = row(1);
|
||||
PZero = (next_n < 0) ? 0 : n[next_n] / (double) bigN;
|
||||
for (j = 0; j < rows; ++j)
|
||||
{
|
||||
i = (j == 0 ? 0 : r[j - 1]);
|
||||
if (j == rows - 1)
|
||||
k = (double) (2 * r[j] - i);
|
||||
else
|
||||
k = (double) r[j + 1];
|
||||
Z[j] = 2 * n[j] / (k - i);
|
||||
log_r[j] = log(r[j]);
|
||||
log_Z[j] = log(Z[j]);
|
||||
}
|
||||
findBestFit();
|
||||
for (j = 0; j < rows; ++j)
|
||||
{
|
||||
y = (r[j] + 1) * smoothed(r[j] + 1) / smoothed(r[j]);
|
||||
if (row(r[j] + 1) < 0)
|
||||
indiffValsSeen = TRUE;
|
||||
if (! indiffValsSeen)
|
||||
{
|
||||
x = (r[j] + 1) * (next_n = n[row(r[j] + 1)]) /
|
||||
(double) n[j];
|
||||
if (fabs(x - y) <= CONFID_FACTOR * sqrt(sq(r[j] + 1.0)
|
||||
* next_n / (sq((double) n[j]))
|
||||
* (1 + next_n / (double) n[j])))
|
||||
indiffValsSeen = TRUE;
|
||||
else
|
||||
rStar[j] = x;
|
||||
}
|
||||
if (indiffValsSeen)
|
||||
rStar[j] = y;
|
||||
}
|
||||
bigNprime = 0.0;
|
||||
for (j = 0; j < rows; ++j)
|
||||
bigNprime += n[j] * rStar[j];
|
||||
for (j = 0; j < rows; ++j)
|
||||
p[j] = (1 - PZero) * rStar[j] / bigNprime;
|
||||
showEstimates();
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
Loading…
Reference in New Issue