prhyme/simple_good_turing_estimator.c

/*
 *
 *
 *	Simple Good-Turing Frequency Estimator
 *
 *
 *	Geoffrey Sampson, with help from Miles Dennis
 *
 *	Department of Informatics
 *	Sussex University
 *
 *	www.grsampson.net
 *
 *
 *	First release:  27 June 1995
 *	Revised release:  24 July 2000
 *	This header information revised:  23 March 2005
 *	Further revised release:  8 April 2008
 *	Further revised release:  11 July 2008
 *
 *
 *	Takes a set of (frequency, frequency-of-frequency) pairs, and 
 *	applies the "Simple Good-Turing" technique for estimating 
 *	the probabilities corresponding to the observed frequencies, 
 *	and P.0, the joint probability of all unobserved species.
 *	The Simple Good-Turing technique was devised by the late William
 *	A. Gale of AT&T Bell Labs, and described in Gale & Sampson, 
 *	"Good-Turing Frequency Estimation Without Tears" (JOURNAL
 *	OF QUANTITATIVE LINGUISTICS, vol. 2, pp. 217-37 -- reprinted in
 *	Geoffrey Sampson, EMPIRICAL LINGUISTICS, Continuum, 2001).
 *
 *	Anyone is welcome to take copies of this program and use it
 *	for any purpose, at his or her own risk.  If it is used in
 *	connexion with published work, acknowledgment of Sampson and
 *	the University of Sussex would be a welcome courtesy.
 *
 *	The program is written to take input from "stdin" and send output
 *	to "stdout"; redirection can be used to take input from and
 *	send output to permanent files.  The code is in ANSI standard C.
 *
 *	The input file should be a series of lines separated by newline
 *	characters, where all nonblank lines contain two positive integers
 *	(an observed frequency, followed by the frequency of that frequency)
 *	separated by whitespace.  (Blank lines are ignored.)
 *	The lines should be in ascending order of frequency, and must
 *	begin with frequency 1.
 *
 *	No checks are made for linearity; the program simply assumes that the 
 *	requirements for using the SGT estimator are met.
 *
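 *	The output is a series of lines each containing an integer followed  
 *	by a probability (a real number between zero and one), separated by a 
 *	tab.  In the first line, the integer is 0 and the real number is the 
 *	estimate for P.0.  In subsequent lines, the integers are the  
 *	successive observed frequencies, and the reals are the estimated  
 *	probabilities corresponding to those frequencies.
 *
 *	NB: this copy of the program also writes diagnostic lines to stdout
 *	that are not part of the format described above: the (frequency, Z)
 *	pairs before the estimates, an "intercept ... slope ..." line
 *	immediately ahead of the P.0 line, and the (frequency, smoothed
 *	value) pairs after the estimates.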
 *
 *	Later releases cure bugs to which my attention has kindly been
 *	drawn at different times by Martin Jansche of Ohio State University
 *	and Steve Arons of New York City.  No warranty is given 
 *	as to absence of further bugs.
 *
 *	Fan Yang of Next IT Inc., Spokane, Washington, has suggested to me 
 *  that in the light of his experience with the SGT technique, for some 
 *	data-sets it could be preferable to use the 0.1 significance criterion
 *	actually used in the experiments reported in the Gale & Sampson
 *	paper, rather than the 0.05 criterion suggested in that paper
 *	for the sake of conformity with standard statistical convention.
 *	(See note 8 of the paper.)  Neither Fan Yang nor I have pursued
 *	this far enough to formulate a definite recommendation; but, in
 *	order to make it easier for users of the software to experiment
 *	with alternative confidence levels, the July 2008 release moves
 *	the relevant "magic number" out of the middle of the program into
 *	a #define line near the beginning where it is given the constant
 *	name CONFID_FACTOR.  The value 1.96 corresponds to the p < 0.05
 *	criterion; in order to use the p < 0.1 criterion, 1.96 in the 
 *	#define line should be changed to 1.65.
 *
 *
 */
 
 
 #include <ctype.h>
 #include <errno.h>
 #include <limits.h>
 #include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 
 #define TRUE   1
 #define FALSE  0
 #define MAX_LINE       100
 #define MAX_ROWS       200
 #define MIN_INPUT      5
 #define CONFID_FACTOR	1.96
 
 int r[MAX_ROWS], n[MAX_ROWS];
 double Z[MAX_ROWS], log_r[MAX_ROWS], log_Z[MAX_ROWS], 
                rStar[MAX_ROWS], p[MAX_ROWS];
 int rows, bigN;
 double PZero, bigNprime, slope, intercept;
 
 /*
  * Program entry point.
  *
  * Reads the (frequency, frequency-of-frequency) table from stdin into the
  * file-level arrays and, when the table is valid and has at least
  * MIN_INPUT rows, runs the estimator.
  *
  * Returns EXIT_SUCCESS when the estimates were produced, and EXIT_FAILURE
  * when the input was rejected or was too short.  A zero exit status is
  * reserved for success so that shells and scripts can detect a failed run.
  */
 int main(void)
 {
     int readValidInput(void);
     void analyseInput(void);

     rows = readValidInput();
     if (rows < 0)
         return EXIT_FAILURE;    /* readValidInput has already reported why */

     if (rows < MIN_INPUT) {
         printf("\nFewer than %d input value-pairs\n", MIN_INPUT);
         return EXIT_FAILURE;
     }

     analyseInput();
     return EXIT_SUCCESS;
 }
 
 /* Returns x multiplied by itself. */
 double sq(double x)
 {
     const double squared = x * x;

     return squared;
 }
 
 /*
  * Converts one whitespace-free token into a strictly positive int.
  *
  * Returns 1 and stores the result in *value when the whole token is a
  * decimal integer in the range 1..INT_MAX.  Returns 0, leaving *value
  * untouched, for anything else: no digits, trailing characters such as
  * "12abc" or "1.5", zero, negative numbers, or values too large for int.
  */
 static int parsePositiveInt(const char *token, int *value)
 {
     char *end;
     long parsed;

     errno = 0;
     parsed = strtol(token, &end, 10);
     if (end == token || *end != '\0')
         return 0;
     if (errno == ERANGE || parsed < 1 || parsed > INT_MAX)
         return 0;
     *value = (int) parsed;
     return 1;
 }

 /*
  * Reads the frequency table from stdin into the arrays r[] (observed
  * frequency) and n[] (frequency of that frequency), one row per nonblank
  * input line.
  *
  * Each nonblank line must hold exactly two positive integers separated by
  * whitespace, and the first field must be strictly greater than the first
  * field of the preceding row.
  *
  * Returns the number of rows stored (one more than the index of the last
  * row), or -1 after writing a diagnostic to stderr when a line is
  * malformed or more than MAX_ROWS rows are supplied.
  */
 int readValidInput(void)
 {
     char line[MAX_LINE];
     const char *whiteSpace = " \t\n\v\f\r";
     int lineNumber = 0;
     int rowNumber = 0;
     const int error = -1;

     while (fgets(line, MAX_LINE, stdin) != NULL) {
         char *ptr = line;
         char *field;
         int freq = 0;
         int count = 0;

         ++lineNumber;

         /* <ctype.h> functions need an unsigned char value, not plain char */
         while (isspace((unsigned char) *ptr))
             ++ptr;  /* skip white space at the start of a line */
         if (*ptr == '\0')
             continue;       /* blank line */

         /*
          * Only complain about capacity when a row really does not fit, so
          * that a table of exactly MAX_ROWS rows is accepted.
          */
         if (rowNumber >= MAX_ROWS) {
             fprintf(stderr,
                     "\nInsufficient memory reserved for input values\n"
                     "You need to change the definition of MAX_ROWS\n");
             return error;
         }

         field = strtok(ptr, whiteSpace);
         if (field == NULL || !parsePositiveInt(field, &freq)) {
             fprintf(stderr, "Invalid field 1, line %d\n", lineNumber);
             return error;
         }
         if (rowNumber > 0 && freq <= r[rowNumber - 1]) {
             fprintf(stderr,
                     "Frequency not in ascending order, line %d\n",
                     lineNumber);
             return error;
         }

         field = strtok(NULL, whiteSpace);
         if (field == NULL || !parsePositiveInt(field, &count)) {
             fprintf(stderr, "Invalid field 2, line %d\n", lineNumber);
             return error;
         }

         if (strtok(NULL, whiteSpace) != NULL) {
             fprintf(stderr, "Invalid extra field, line %d\n", lineNumber);
             return error;
         }

         r[rowNumber] = freq;
         n[rowNumber] = count;
         ++rowNumber;
     }
     return rowNumber;
 }
         
 /*
  * Fits a straight line through the points (log_r[i], log_Z[i]) for
  * i = 0 .. rows-1 by least squares, and stores the result in the
  * file-level variables `slope` and `intercept`.
  */
 void findBestFit(void)
 {
     double sumX = 0.0;
     double sumY = 0.0;
     double sumXY = 0.0;     /* sum of products of the centred values */
     double sumXX = 0.0;     /* sum of squares of the centred x values */
     double avgX, avgY;
     int idx;

     for (idx = 0; idx < rows; ++idx) {
         sumX += log_r[idx];
         sumY += log_Z[idx];
     }
     avgX = sumX / rows;
     avgY = sumY / rows;

     for (idx = 0; idx < rows; ++idx) {
         const double dx = log_r[idx] - avgX;
         const double dy = log_Z[idx] - avgY;

         sumXY += dx * dy;
         sumXX += dx * dx;
     }

     slope = sumXY / sumXX;
     intercept = avgY - slope * avgX;
 }
        
 /*
  * Evaluates the fitted log-log line at frequency i, i.e. returns
  * exp(intercept + slope * log(i)) using the current values of the
  * file-level `intercept` and `slope`.
  */
 double smoothed(int i)
 {
     const double logFreq = log(i);
     const double logValue = intercept + slope * logFreq;

     return exp(logValue);
 }
        
 /*
  * Looks up frequency i in r[0 .. rows-1].
  *
  * The scan stops at the first entry that is not smaller than i, so it
  * relies on r[] being in ascending order.  Returns the index of the entry
  * equal to i, or -1 when there is none.
  */
 int row(int i)
 {
     int idx;

     for (idx = 0; idx < rows; ++idx) {
         if (r[idx] >= i)
             return (r[idx] == i) ? idx : -1;
     }
     return -1;
 }
        
 /*
  * Writes the results to stdout, in this order:
  *   - one line with the fitted intercept and slope;
  *   - the line "0<TAB>PZero";
  *   - one "r<TAB>p" line per row;
  *   - one "r<TAB>smoothed(r)" line per row.
  */
 void showEstimates(void)
 {
     int idx;

     printf("intercept %f slope %f\n", intercept, slope);
     printf("0\t%.4g\n", PZero);

     idx = 0;
     while (idx < rows) {
         printf("%d\t%.4g\n", r[idx], p[idx]);
         ++idx;
     }

     idx = 0;
     while (idx < rows) {
         printf("%d\t%.4g\n", r[idx], smoothed(r[idx]));
         ++idx;
     }
 }

        
 /*
  * Computes the estimates from the table already held in r[], n[] and
  * `rows`, then prints them via showEstimates().
  *
  * Fills in the file-level results bigN, PZero, Z[], log_r[], log_Z[],
  * rStar[], bigNprime and p[] (and, through findBestFit(), slope and
  * intercept).  Also prints the (r, Z) pairs to stdout before fitting.
  */
 void analyseInput(void)
        {
        int i, j, next_n;
        double k, x, y;
        /* once set to TRUE this flag is never cleared again */
        int indiffValsSeen = FALSE;
        int row(int);
        void findBestFit(void);
        double smoothed(int);
        double sq(double);
        void showEstimates(void);
        
        /* bigN: sum over all rows of frequency times frequency-of-frequency */
        bigN = 0;
        for (j = 0; j < rows; ++j)
                bigN += r[j] * n[j];
        /* next_n is used here as the row index of frequency 1 (or -1) */
        next_n = row(1);
        /* PZero is n for frequency 1 divided by bigN, or 0 if r = 1 is absent */
        PZero = (next_n < 0) ? 0 : n[next_n] / (double) bigN;
        /*
         * For each row, i is the preceding observed frequency (0 for the
         * first row) and k the following one; the last row has no
         * successor, so k is placed as far above r[j] as i lies below it.
         * Z[j] is n[j] divided by half the width of the interval (i, k).
         */
        for (j = 0; j < rows; ++j)
                {
                i = (j == 0 ? 0 : r[j - 1]);
                if (j == rows - 1)
                        k = (double) (2 * r[j] - i);
                else
                        k = (double) r[j + 1];
                Z[j] = 2 * n[j] / (k - i);
                log_r[j] = log(r[j]);
                log_Z[j] = log(Z[j]);
                }

        /* diagnostic output: each frequency with its Z value */
        for (i = 0; i < rows; ++i)
                printf("%d\t%.4g\n", r[i], Z[i]);


        /* fit log_Z against log_r; sets slope and intercept for smoothed() */
        findBestFit();
        /*
         * Choose rStar[j] for every row.  y is computed from the fitted
         * line, x from the raw counts n[] of r[j] and r[j] + 1.  x is used
         * only while indiffValsSeen is FALSE; the flag becomes TRUE at the
         * first row where r[j] + 1 is not an observed frequency, or where
         * |x - y| is within CONFID_FACTOR times the square-root term below
         * (presumably the estimated standard deviation of x described in
         * the Gale & Sampson paper -- confirm).  From that row onwards y
         * is used for every remaining row.
         */
        for (j = 0; j < rows; ++j)
                {
                y = (r[j] + 1) * smoothed(r[j] + 1) / smoothed(r[j]);
                if (row(r[j] + 1) < 0)
                        indiffValsSeen = TRUE;
                if (! indiffValsSeen)
                        {
                        /* next_n now holds n for frequency r[j] + 1; the
                           row() lookup cannot fail here because of the
                           check just above */
                        x = (r[j] + 1) * (next_n = n[row(r[j] + 1)]) / 
                                        (double) n[j];
                        if (fabs(x - y) <= CONFID_FACTOR * sqrt(sq(r[j] + 1.0)
                                        * next_n / (sq((double) n[j]))
                                        * (1 + next_n / (double) n[j])))
                                indiffValsSeen = TRUE;
                        else
                                rStar[j] = x;
                        }
                if (indiffValsSeen)
                        rStar[j] = y;
                }
        /* bigNprime: sum of n[j] * rStar[j], used to renormalise below */
        bigNprime = 0.0;
        for (j = 0; j < rows; ++j)
                bigNprime += n[j] * rStar[j];
        /* scale so that the sum of n[j] * p[j] over all rows is 1 - PZero */
        for (j = 0; j < rows; ++j)
                p[j] = (1 - PZero) * rStar[j] / bigNprime;
        showEstimates();
        }
Add sample simple good turing C code 4 years ago			`/*`
			`*`
			`*`
			`* Simple Good-Turing Frequency Estimator`
			`*`
			`*`
			`* Geoffrey Sampson, with help from Miles Dennis`
			`*`
			`* Department of Informatics`
			`* Sussex University`
			`*`
			`* www.grsampson.net`
			`*`
			`*`
			`* First release: 27 June 1995`
			`* Revised release: 24 July 2000`
			`* This header information revised: 23 March 2005`
			`* Further revised release: 8 April 2008`
			`* Further revised release: 11 July 2008`
			`*`
			`*`
			`* Takes a set of (frequency, frequency-of-frequency) pairs, and`
			`* applies the "Simple Good-Turing" technique for estimating`
			`* the probabilities corresponding to the observed frequencies,`
			`* and P.0, the joint probability of all unobserved species.`
			`* The Simple Good-Turing technique was devised by the late William`
			`* A. Gale of AT&T Bell Labs, and described in Gale & Sampson,`
			`* "Good-Turing Frequency Estimation Without Tears" (JOURNAL`
			`* OF QUANTITATIVE LINGUISTICS, vol. 2, pp. 217-37 -- reprinted in`
			`* Geoffrey Sampson, EMPIRICAL LINGUISTICS, Continuum, 2001).`
			`*`
			`* Anyone is welcome to take copies of this program and use it`
			`* for any purpose, at his or her own risk. If it is used in`
			`* connexion with published work, acknowledgment of Sampson and`
			`* the University of Sussex would be a welcome courtesy.`
			`*`
			`* The program is written to take input from "stdin" and send output`
			`* to "stdout"; redirection can be used to take input from and`
			`* send output to permanent files. The code is in ANSI standard C.`
			`*`
			`* The input file should be a series of lines separated by newline`
			`* characters, where all nonblank lines contain two positive integers`
			`* (an observed frequency, followed by the frequency of that frequency)`
			`* separated by whitespace. (Blank lines are ignored.)`
			`* The lines should be in ascending order of frequency, and must`
			`* begin with frequency 1.`
			`*`
			`* No checks are made for linearity; the program simply assumes that the`
			`* requirements for using the SGT estimator are met.`
			`*`
			`* The output is a series of lines each containing an integer followed`
			`* by a probability (a real number between zero and one), separated by a`
			`* tab. In the first line, the integer is 0 and the real number is the`
			`* estimate for P.0. In subsequent lines, the integers are the`
			`* successive observed frequencies, and the reals are the estimated`
			`* probabilities corresponding to those frequencies.`
			`*`
			`* Later releases cure bugs to which my attention has kindly been`
			`* drawn at different times by Martin Jansche of Ohio State University`
			`* and Steve Arons of New York City. No warranty is given`
			`* as to absence of further bugs.`
			`*`
			`* Fan Yang of Next IT Inc., Spokane, Washington, has suggested to me`
			`* that in the light of his experience with the SGT technique, for some`
			`* data-sets it could be preferable to use the 0.1 significance criterion`
			`* actually used in the experiments reported in the Gale & Sampson`
			`* paper, rather than the 0.05 criterion suggested in that paper`
			`* for the sake of conformity with standard statistical convention.`
			`* (See note 8 of the paper.) Neither Fan Yang nor I have pursued`
			`* this far enough to formulate a definite recommendation; but, in`
			`* order to make it easier for users of the software to experiment`
			`* with alternative confidence levels, the July 2008 release moves`
			`* the relevant "magic number" out of the middle of the program into`
			`* a #define line near the beginning where it is given the constant`
			`* name CONFID_FACTOR. The value 1.96 corresponds to the p < 0.05`
			`* criterion; in order to use the p < 0.1 criterion, 1.96 in the`
			`* #define line should be changed to 1.65.`
			`*`
			`*`
			`*/`


			`#include <stdio.h>`
			`#include <math.h>`
			`#include <ctype.h>`
			`#include <stdlib.h>`
			`#include <string.h>`

			`#define TRUE 1`
			`#define FALSE 0`
			`#define MAX_LINE 100`
			`#define MAX_ROWS 200`
			`#define MIN_INPUT 5`
			`#define CONFID_FACTOR 1.96`

			`int r[MAX_ROWS], n[MAX_ROWS];`
			`double Z[MAX_ROWS], log_r[MAX_ROWS], log_Z[MAX_ROWS],`
			`rStar[MAX_ROWS], p[MAX_ROWS];`
			`int rows, bigN;`
			`double PZero, bigNprime, slope, intercept;`

			`int main(void)`
			`{`
			`int readValidInput(void);`
			`void analyseInput(void);`

			`if ((rows = readValidInput()) >= 0)`
			`{`
			`if (rows < MIN_INPUT)`
			`printf("\nFewer than %d input value-pairs\n",`
			`MIN_INPUT);`
			`else`
			`analyseInput();`
			`}`
			`return(TRUE);`
			`}`

			`double sq(double x)`
			`{`
			`return(x * x);`
			`}`

			`int readValidInput(void)`
			`/*`
			`* returns number of rows if input file is valid, else -1`
			`* NB: number of rows is one more than index of last row`
			`*`
			`*/`

			`{`
			`char line[MAX_LINE];`
			`const char* whiteSpace = " \t\n\v\f\r";`
			`int lineNumber = 0;`
			`int rowNumber = 0;`
			`const int error = -1;`

			`while (fgets(line, MAX_LINE, stdin) != NULL && rowNumber < MAX_ROWS)`
			`{`
			`char* ptr = line;`
			`char* integer;`
			`int i;`

			`++lineNumber;`

			`while (isspace(*ptr))`
			`++ptr; /* skip white space at the start of a line */`
			`if (*ptr == '\0')`
			`continue;`
			`if ((integer = strtok(ptr, whiteSpace)) == NULL ||`
			`(i = atoi(integer)) < 1)`
			`{`
			`fprintf(stderr, "Invalid field 1, line %d\n",`
			`lineNumber);`
			`return(error);`
			`}`
			`if (rowNumber > 0 && i <= r[rowNumber - 1])`
			`{`
			`fprintf(stderr,`
			`"Frequency not in ascending order, line %d\n",`
			`lineNumber);`
			`return(error);`
			`}`
			`r[rowNumber] = i;`
			`if ((integer = strtok(NULL, whiteSpace)) == NULL ||`
			`(i = atoi(integer)) < 1)`
			`{`
			`fprintf(stderr, "Invalid field 2, line %d\n",`
			`lineNumber);`
			`return(error);`
			`}`
			`n[rowNumber] = i;`
			`if (strtok(NULL, whiteSpace) != NULL)`
			`{`
			`fprintf(stderr, "Invalid extra field, line %d\n",`
			`lineNumber);`
			`return(error);`
			`}`
			`++rowNumber;`
			`}`
			`if (rowNumber >= MAX_ROWS)`
			`{`
			`fprintf(stderr, "\nInsufficient memory reserved for input\`
			`values\nYou need to change the definition of\`
			`MAX_ROWS\n");`
			`return(error);`
			`}`
			`return(rowNumber);`
			`}`

			`void findBestFit(void)`
			`{`
			`double XYs, Xsquares, meanX, meanY;`
			`double sq(double);`
			`int i;`

			`XYs = Xsquares = meanX = meanY = 0.0;`
			`for (i = 0; i < rows; ++i)`
			`{`
			`meanX += log_r[i];`
			`meanY += log_Z[i];`
			`}`
			`meanX /= rows;`
			`meanY /= rows;`
			`for (i = 0; i < rows; ++i)`
			`{`
			`XYs += (log_r[i] - meanX) * (log_Z[i] - meanY);`
			`Xsquares += sq(log_r[i] - meanX);`
			`}`
			`slope = XYs / Xsquares;`
			`intercept = meanY - slope * meanX;`
			`}`

			`double smoothed(int i)`
			`{`
			`return(exp(intercept + slope * log(i)));`
			`}`

			`int row(int i)`
			`{`
			`int j = 0;`

			`while (j < rows && r[j] < i)`
			`++j;`
			`return((j < rows && r[j] == i) ? j : -1);`
			`}`

			`void showEstimates(void)`
			`{`
			`int i;`
Experiments with simple good turing estimation 4 years ago
			`printf("intercept %f slope %f\n", intercept, slope);`
Add sample simple good turing C code 4 years ago			`printf("0\t%.4g\n", PZero);`
			`for (i = 0; i < rows; ++i)`
			`printf("%d\t%.4g\n", r[i], p[i]);`
Experiments with simple good turing estimation 4 years ago			`for (i = 0; i < rows; ++i)`
			`printf("%d\t%.4g\n", r[i], smoothed(r[i]));`
Add sample simple good turing C code 4 years ago			`}`
Experiments with simple good turing estimation 4 years ago
Add sample simple good turing C code 4 years ago
			`void analyseInput(void)`
			`{`
			`int i, j, next_n;`
			`double k, x, y;`
			`int indiffValsSeen = FALSE;`
			`int row(int);`
			`void findBestFit(void);`
			`double smoothed(int);`
			`double sq(double);`
			`void showEstimates(void);`

			`bigN = 0;`
			`for (j = 0; j < rows; ++j)`
			`bigN += r[j] * n[j];`
			`next_n = row(1);`
			`PZero = (next_n < 0) ? 0 : n[next_n] / (double) bigN;`
			`for (j = 0; j < rows; ++j)`
			`{`
			`i = (j == 0 ? 0 : r[j - 1]);`
			`if (j == rows - 1)`
			`k = (double) (2 * r[j] - i);`
			`else`
			`k = (double) r[j + 1];`
			`Z[j] = 2 * n[j] / (k - i);`
			`log_r[j] = log(r[j]);`
			`log_Z[j] = log(Z[j]);`
			`}`
Experiments with simple good turing estimation 4 years ago
			`for (i = 0; i < rows; ++i)`
			`printf("%d\t%.4g\n", r[i], Z[i]);`


Add sample simple good turing C code 4 years ago			`findBestFit();`
			`for (j = 0; j < rows; ++j)`
			`{`
			`y = (r[j] + 1) * smoothed(r[j] + 1) / smoothed(r[j]);`
			`if (row(r[j] + 1) < 0)`
			`indiffValsSeen = TRUE;`
			`if (! indiffValsSeen)`
			`{`
			`x = (r[j] + 1) * (next_n = n[row(r[j] + 1)]) /`
			`(double) n[j];`
			`if (fabs(x - y) <= CONFID_FACTOR * sqrt(sq(r[j] + 1.0)`
			`* next_n / (sq((double) n[j]))`
			`* (1 + next_n / (double) n[j])))`
			`indiffValsSeen = TRUE;`
			`else`
			`rStar[j] = x;`
			`}`
			`if (indiffValsSeen)`
			`rStar[j] = y;`
			`}`
			`bigNprime = 0.0;`
			`for (j = 0; j < rows; ++j)`
			`bigNprime += n[j] * rStar[j];`
			`for (j = 0; j < rows; ++j)`
			`p[j] = (1 - PZero) * rStar[j] / bigNprime;`
			`showEstimates();`
			`}`