Add sample simple good turing C code

5 years ago · 45e88a495b
parent ab79181da5
commit 45e88a495b
1 changed files with 299 additions and 0 deletions
--- a/simple_good_turing_estimator.c
+++ b/simple_good_turing_estimator.c
@ -0,0 +1,299 @@
+/*
+ *
+ *
+ *	Simple Good-Turing Frequency Estimator
+ *
+ *
+ *	Geoffrey Sampson, with help from Miles Dennis
+ *
+ *	Department of Informatics
+ *	Sussex University
+ *
+ *	www.grsampson.net
+ *
+ *
+ *	First release:  27 June 1995
+ *	Revised release:  24 July 2000
+ *	This header information revised:  23 March 2005
+ *	Further revised release:  8 April 2008
+ *	Further revised release:  11 July 2008
+ *
+ *
+ *	Takes a set of (frequency, frequency-of-frequency) pairs, and 
+ *	applies the "Simple Good-Turing" technique for estimating 
+ *	the probabilities corresponding to the observed frequencies, 
+ *	and P.0, the joint probability of all unobserved species.
+ *	The Simple Good-Turing technique was devised by the late William
+ *	A. Gale of AT&T Bell Labs, and described in Gale & Sampson, 
+ *	"Good-Turing Frequency Estimation Without Tears" (JOURNAL
+ *	OF QUANTITATIVE LINGUISTICS, vol. 2, pp. 217-37 -- reprinted in
+ *	Geoffrey Sampson, EMPIRICAL LINGUISTICS, Continuum, 2001).
+ *
+ *	Anyone is welcome to take copies of this program and use it
+ *	for any purpose, at his or her own risk.  If it is used in
+ *	connexion with published work, acknowledgment of Sampson and
+ *	the University of Sussex would be a welcome courtesy.
+ *
+ *	The program is written to take input from "stdin" and send output
+ *	to "stdout"; redirection can be used to take input from and
+ *	send output to permanent files.  The code is in ANSI standard C.
+ *
+ *	The input file should be a series of lines separated by newline
+ *	characters, where all nonblank lines contain two positive integers
+ *	(an observed frequency, followed by the frequency of that frequency)
+ *	separated by whitespace.  (Blank lines are ignored.)
+ *	The lines should be in ascending order of frequency, and must
+ *	begin with frequency 1.
+ *
+ *	No checks are made for linearity; the program simply assumes that the 
+ *	requirements for using the SGT estimator are met.
+ *
+ *	The output is a series of lines each containing an integer followed  
+ *	by a probability (a real number between zero and one), separated by a 
+ *	tab.  In the first line, the integer is 0 and the real number is the 
+ *	estimate for P.0.  In subsequent lines, the integers are the  
+ *	successive observed frequencies, and the reals are the estimated  
+ *	probabilities corresponding to those frequencies.
+ *
+ *	Later releases cure bugs to which my attention has kindly been
+ *	drawn at different times by Martin Jansche of Ohio State University
+ *	and Steve Arons of New York City.  No warranty is given 
+ *	as to absence of further bugs.
+ *
+ *	Fan Yang of Next IT Inc., Spokane, Washington, has suggested to me 
+ *  that in the light of his experience with the SGT technique, for some 
+ *	data-sets it could be preferable to use the 0.1 significance criterion
+ *	actually used in the experiments reported in the Gale & Sampson
+ *	paper, rather than the 0.05 criterion suggested in that paper
+ *	for the sake of conformity with standard statistical convention.
+ *	(See note 8 of the paper.)  Neither Fan Yang nor I have pursued
+ *	this far enough to formulate a definite recommendation; but, in
+ *	order to make it easier for users of the software to experiment
+ *	with alternative confidence levels, the July 2008 release moves
+ *	the relevant "magic number" out of the middle of the program into
+ *	a #define line near the beginning where it is given the constant
+ *	name CONFID_FACTOR.  The value 1.96 corresponds to the p < 0.05
+ *	criterion; in order to use the p < 0.1 criterion, 1.96 in the 
+ *	#define line should be changed to 1.65.
+ *
+ *
+ */
+ 
+ 
+ #include <stdio.h>
+ #include <math.h>
+ #include <ctype.h>
+ #include <stdlib.h>
+ #include <string.h>
+ 
+ #define TRUE   1
+ #define FALSE  0
+ #define MAX_LINE       100
+ #define MAX_ROWS       200
+ #define MIN_INPUT      5
+ #define CONFID_FACTOR	1.96
+ 
+ int r[MAX_ROWS], n[MAX_ROWS];
+ double Z[MAX_ROWS], log_r[MAX_ROWS], log_Z[MAX_ROWS], 
+                rStar[MAX_ROWS], p[MAX_ROWS];
+ int rows, bigN;
+ double PZero, bigNprime, slope, intercept;
+ 
+ int main(void)
+        {
+        int readValidInput(void);
+        void analyseInput(void);
+        
+        if ((rows = readValidInput()) >= 0)
+                {
+                if (rows < MIN_INPUT)
+                        printf("\nFewer than %d input value-pairs\n",
+                                        MIN_INPUT);
+                else
+                        analyseInput();
+                }
+        return(TRUE);
+        }
+ 
+ double sq(double x)
+        {
+        return(x * x);
+        }
+ 
+ int readValidInput(void)
+        /*
+         *      returns number of rows if input file is valid, else -1
+         *      NB:  number of rows is one more than index of last row
+         *
+         */
+        
+        {
+        char line[MAX_LINE];
+        const char* whiteSpace = " \t\n\v\f\r";
+        int lineNumber = 0;
+        int rowNumber = 0;
+        const int error = -1;
+
+        while (fgets(line, MAX_LINE, stdin) != NULL && rowNumber < MAX_ROWS)
+                {
+                char* ptr = line;
+                char* integer;
+                int i;
+
+                ++lineNumber;
+
+                while (isspace(*ptr)) 
+                        ++ptr;  /* skip white space at the start of a line */
+                if (*ptr == '\0')
+                        continue;
+                if ((integer = strtok(ptr, whiteSpace)) == NULL || 
+                                (i = atoi(integer)) < 1)
+                        {
+                        fprintf(stderr, "Invalid field 1, line %d\n",
+                                        lineNumber);
+                        return(error);
+                        }
+                if (rowNumber > 0 && i <= r[rowNumber - 1])
+                        {
+                        fprintf(stderr, 
+                      "Frequency not in ascending order, line %d\n", 
+                                        lineNumber);
+                        return(error);
+                        }
+                r[rowNumber] = i;
+                if ((integer = strtok(NULL, whiteSpace)) == NULL || 
+                                (i = atoi(integer)) < 1)
+                        {
+                        fprintf(stderr, "Invalid field 2, line %d\n",
+                                        lineNumber);
+                        return(error);
+                        }
+                n[rowNumber] = i;
+                if (strtok(NULL, whiteSpace) != NULL)
+                        {
+                        fprintf(stderr, "Invalid extra field, line %d\n",
+                                        lineNumber);
+                        return(error);
+                        }
+                ++rowNumber;
+                }
+        if (rowNumber >= MAX_ROWS)
+                {
+                fprintf(stderr, "\nInsufficient memory reserved for input\
+                        values\nYou need to change the definition of\
+                        MAX_ROWS\n");
+                return(error);
+                }
+        return(rowNumber);
+        }
+         
+ void findBestFit(void)
+        {
+        double XYs, Xsquares, meanX, meanY;
+        double sq(double);
+        int i;
+        
+        XYs = Xsquares = meanX = meanY = 0.0;
+        for (i = 0; i < rows; ++i)
+                {
+                meanX += log_r[i];
+                meanY += log_Z[i];
+                }
+        meanX /= rows;
+        meanY /= rows;
+        for (i = 0; i < rows; ++i)
+                {
+                XYs += (log_r[i] - meanX) * (log_Z[i] - meanY);
+                Xsquares += sq(log_r[i] - meanX);
+                }
+        slope = XYs / Xsquares;
+        intercept = meanY - slope * meanX;
+        }
+        
+ double smoothed(int i)
+        {
+        return(exp(intercept + slope * log(i)));
+        }
+        
+ int row(int i)
+        {
+        int j = 0;
+        
+        while (j < rows && r[j] < i)
+                ++j;
+        return((j < rows && r[j] == i) ? j : -1);
+        }
+        
+ void showEstimates(void)
+        {
+        int i;
+        
+        printf("0\t%.4g\n", PZero);
+        for (i = 0; i < rows; ++i)
+                printf("%d\t%.4g\n", r[i], p[i]);
+        }
+        
+ void analyseInput(void)
+        {
+        int i, j, next_n;
+        double k, x, y;
+        int indiffValsSeen = FALSE;
+        int row(int);
+        void findBestFit(void);
+        double smoothed(int);
+        double sq(double);
+        void showEstimates(void);
+        
+        bigN = 0;
+        for (j = 0; j < rows; ++j)
+                bigN += r[j] * n[j];
+        next_n = row(1);
+        PZero = (next_n < 0) ? 0 : n[next_n] / (double) bigN;
+        for (j = 0; j < rows; ++j)
+                {
+                i = (j == 0 ? 0 : r[j - 1]);
+                if (j == rows - 1)
+                        k = (double) (2 * r[j] - i);
+                else
+                        k = (double) r[j + 1];
+                Z[j] = 2 * n[j] / (k - i);
+                log_r[j] = log(r[j]);
+                log_Z[j] = log(Z[j]);
+                }
+        findBestFit();
+        for (j = 0; j < rows; ++j)
+                {
+                y = (r[j] + 1) * smoothed(r[j] + 1) / smoothed(r[j]);
+                if (row(r[j] + 1) < 0)
+                        indiffValsSeen = TRUE;
+                if (! indiffValsSeen)
+                        {
+                        x = (r[j] + 1) * (next_n = n[row(r[j] + 1)]) / 
+                                        (double) n[j];
+                        if (fabs(x - y) <= CONFID_FACTOR * sqrt(sq(r[j] + 1.0)
+                                        * next_n / (sq((double) n[j]))
+                                        * (1 + next_n / (double) n[j])))
+                                indiffValsSeen = TRUE;
+                        else
+                                rStar[j] = x;
+                        }
+                if (indiffValsSeen)
+                        rStar[j] = y;
+                }
+        bigNprime = 0.0;
+        for (j = 0; j < rows; ++j)
+                bigNprime += n[j] * rStar[j];
+        for (j = 0; j < rows; ++j)
+                p[j] = (1 - PZero) * rStar[j] / bigNprime;
+        showEstimates();
+        }
+
+
+
+
+
+
+
+
+
+