diff --git a/dev/examples/scratch.clj b/dev/examples/scratch.clj index 32987bb..a4bd327 100644 --- a/dev/examples/scratch.clj +++ b/dev/examples/scratch.clj @@ -2,7 +2,8 @@ (:require [clojure.java.io :as io] [clojure.string :as string] [clojure.set] - [com.owoga.prhyme.nlp.core :as nlp])) + [com.owoga.prhyme.nlp.core :as nlp] + [com.owoga.prhyme.util.math :as math])) (def re-word "Regex for tokenizing a string into words @@ -17,12 +18,24 @@ (re-seq re-word) (map second) (map string/lower-case) + (cons :bol) (reverse) - (cons :end))) + (cons :eol))) + +(defn tokenize-line + [line] + (->> line + (string/trim) + (re-seq re-word) + (map second) + (map string/lower-case))) (comment - (-> (slurp "dev/examples/sandman.txt") - tokenize)) + (->> (slurp "dev/examples/sandman.txt") + (#(string/split % #"\n")) + (map tokenize-line)) + + ) (defn zero-to-n-seq ([coll] @@ -85,7 +98,99 @@ (zero-to-n-seq (first windows))) (rest windows)))))) +(defn add-to-trie-1 + [trie n tokens] + (let [pad-n (dec n) + tokens (concat (repeat pad-n :bol) tokens (repeat pad-n :eol)) + partitions (partition n 1 tokens)] + (reduce + (fn [acc tokens] + (update-in acc (concat tokens [:count]) (fnil inc 0))) + trie + partitions))) + +(defn flatmap + ([m] + (flatmap m [])) + ([m prefix] + (mapcat + (fn [[k v]] + (if (map? v) + (flatmap v (conj prefix k)) + [(conj prefix k) v])) + m))) + +(defn filter-trie-to-ngrams [trie n] + (->> trie + (flatmap) + (partition 2) + ;; Inc to account for :count + (filter #(= (inc n) (count (first %)))))) + +(comment + (let [trie {}] + (-> (add-to-trie-1 trie 2 '("of" "lives" "lost" "at" "sea")) + (add-to-trie-1 1 '("of" "lives" "lost" "at" "sea")))) + ) + +(defn wrand + "given a vector of slice sizes, returns the index of a slice given a + random spin of a roulette wheel with compartments proportional to + slices." + [slices] + (let [total (reduce + slices) + r (rand total)] + (loop [i 0 sum 0] + (if (< r (+ (slices i) sum)) + i + (recur (inc i) (+ (slices i) sum)))))) + +(defn completions [trie probs words] + (let [n (inc (count words)) + possibilities (->> (get-in trie words) + (filter #(string? (first %))) + (map (fn [[k v]] + [k (get-in probs [n (:count v)])])) + (into {})) + sum-probs (apply + (vals possibilities)) + possibilities (into {} (map (fn [[k v]] [k (/ v sum-probs)]) possibilities))] + possibilities)) + (comment + ;; Turning corpus into a trie. + (let [documents (->> "dark-corpus" + io/file + file-seq + (remove #(.isDirectory %)) + (drop 500) + (take 5)) + trie (->> documents + (map slurp) + (mapcat #(string/split % #"\n")) + (map tokenize-line) + (filter #(> (count %) 1)) + (take 500) + (reduce + (fn [acc tokens] + (-> (add-to-trie-1 acc 1 tokens) + (add-to-trie-1 2 tokens) + (add-to-trie-1 3 tokens))) + {})) + probs (->> (range 1 4) + (map #(vector % (filter-trie-to-ngrams trie %))) + (map (fn [[n v]] [n (map #(second %) v)])) + (map (fn [[n v]] [n (into (sorted-map) (frequencies v))])) + (map (fn [[n v]] [n (math/sgt (keys v) (vals v))])) + (map (fn [[n [rs probs]]] + [n (into {} (map vector rs probs))])) + (into {}))] + (reverse (sort-by second (completions trie probs [:bol "you"])))) + + (into {} (map vector [1 2 3] [4 5 6])) + ;; + ;; => ([1 (1 2 8 7 3 6 4 23) (85 18 2 2 6 3 1 1)] + ;; [2 (1 2 5 3 4 7) (170 25 2 4 2 2)] + ;; [3 (1 2 3 4 7 5) (213 30 5 1 1 3)]) (let [last-window '("in" "the" "frat")] (concat (zero-to-n-seq last-window) (rest (n-to-zero-seq last-window)))) @@ -96,6 +201,7 @@ (string/split "the cat in the hat is the rat in the frat" #" ")) + ;; => {"the" ;; {:count 3, ;; "cat" {:count 1, "in" {:count 1}}, @@ -184,7 +290,29 @@ :else (flat-at-depth (->> m (mapcat second) (remove #(= :count (first %)))) (dec depth))))) +(defn flatmap + ([m] + (flatmap m [])) + ([m prefix] + (mapcat + (fn [[k v]] + (if (map? v) + (flatmap v (conj prefix k)) + [(conj prefix k) v])) + m))) + +(defn filter-trie-to-ngrams [trie n] + (->> trie + (flatmap) + (partition 2) + ;; Inc to account for :count + (filter #(= (inc n) (count (first %)))))) + +(apply hash-map '([1 2] 3 [4 5] 6)) + (comment + (apply hash-map (flatmap {1 {2 {3 4} 5 {6 7}} 8 {9 10}} [])) + (let [trie {"d" {:count 3 "o" {:count 3 "g" {:count 2} @@ -196,7 +324,8 @@ "g" {:count 1}} "i" {:count 1 "g" {:count 1}}}}] - (->> (flat-at-depth trie 2))) + (filter-trie-to-ngrams trie 3)) + ) @@ -458,7 +587,6 @@ (def n-gram-freq-map (n-gram-frequency-map trie 3)) (def unigram-frequencies (n-gram-freq-map 1)) unigram-frequencies - ) (defn number-of-n-grams-that-occur-with-count [trie n c] diff --git a/sgt/ReadMe.txt b/sgt/ReadMe.txt new file mode 100644 index 0000000..54dd031 --- /dev/null +++ b/sgt/ReadMe.txt @@ -0,0 +1,72 @@ +SGT +=== + +The files here contain a C++ class for implementing simple Good-Turing +re-estimation, as described by Geoff Sampson in the book Empirical Linguistics +(2001), and on the web at http://www.grsampson.net/RGoodTur.html. The code +here is a C++ adaptation of the published code by Sampson and Gale, with the +bug fix issued in 2000. It is encapsulated as a class to allow it to be +incorporated into other programs. An additional coding change is that the data +can be presented in any order, whereas the original code required the data to +be in ascending order. + +Sampson's original code was issued with no restrictions on use. In keeping +with the spirit of this, the code here is issued under an open source licence +which allows essentially unrestricted use. + +LICENCE +------- +Copyright (c) David Elworthy 2004. +All rights reserved. + +Redistribution and use in source and binary forms for any purpose, with or +without modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright notice, + this list of conditions, and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the disclaimer that follows + these conditions in the documentation and/or other materials + provided with the distribution. + +THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED +WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN +NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED +TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Contact details +--------------- +You may contact me at david@friendlymoose.com. I would be happy to hear of any +experiences you have with the code; please feel free to send me updated +versions. The reference site for the code is http://www.friendlymoose.com/. + +Files and use +------------- +There are three files: +sgt.h SGT header file +sgttest.cpp A test and example program + +There is no source file, as the SGT class is a template over the observation +type, typically either an int or a double. + +Information about using the class is included in the header file. The code has +been tested with g++ version 3.2 on cygwin and Microsoft Visual Studio version +6 on Windows 2000. You can compile and link the test program using g++ using +the command + g++ -o sgttest sgttest.cpp + +For Visual Studio, from the command line, you can compile and link with + cl -GX sgttest.cpp + +Version history +--------------- +Initial version released January 2004. +Updated to a better implementation April 2004. diff --git a/sgt/a.out b/sgt/a.out new file mode 100755 index 0000000..5118285 Binary files /dev/null and b/sgt/a.out differ diff --git a/sgt/freq_freqs.txt b/sgt/freq_freqs.txt new file mode 100644 index 0000000..0f4374e --- /dev/null +++ b/sgt/freq_freqs.txt @@ -0,0 +1,12 @@ +1 32 +2 20 +3 10 +4 3 +5 1 +6 2 +7 1 +8 1 +9 1 +10 2 +12 1 +26 1 diff --git a/sgt/sgt.h b/sgt/sgt.h new file mode 100644 index 0000000..350b7f7 --- /dev/null +++ b/sgt/sgt.h @@ -0,0 +1,314 @@ +#ifndef SGT_H +#define SGT_H +// Simple Good-Turing estimation +// +// Copyright (c) David Elworthy 2004. + +// A class for implementing simple Good-Turing re-estimation, as described by +// Geoff Sampson in the book Empirical Linguistics (2001), and on the web at +// http://www.grsampson.net/RGoodTur.html. The code here is a C++ adaptation +// of the published code by Sampson and Gale, with the bug fix issued in +// 2000. It is encapsulated as a class to allow it to be incorporated into +// other programs. An additional coding change is that the data can be +// presented in any order, whereas the original code required the data to be +// in ascending order. +// +// Copyright (c) David Elworthy 2004. +// All rights reserved. +// +// Redistribution and use in source and binary forms for any purpose, with or +// without modification, are permitted provided that the following conditions +// are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions, and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions, and the disclaimer that follows +// these conditions in the documentation and/or other materials +// provided with the distribution. +// +// THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED +// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN +// NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED +// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// You may contact me at david@friendlymoose.com. + +#include +#include +#include +#include +using namespace std; + +// Simple Good-Turing class. +// To use the class, create an SGT object and data to it by calling add() with +// each data point. A data point consists of the observed value and the +// frequency of the the observation (what Sampson and Gale refer to as the +// frequency, and the frequency of the the frequency). When you have added all +// the data points call analyse(). You can then call estimate() with an +// observed value as argument to get the estimated frequency for that value, +// or call iterate() to iterate over the data points. There is one special +// case to estimate(). If called with an argument of zero, it delivers the +// estimated frequency for unseen events. This is not delivered from pair(). +// To get back from the estimate value to the smoothed value of the +// observation, multiply by total(); +// +// In the original Sampson and Gale version, the observation was an integer. +// For this version, we make the code be a template over the observation type. +// However, it must always be some suitable numeric type, such as int or double. +// +// The code is implemented using the Standard Template Library (STL). + +template class SGT +{ + private: + // Data block, holding the frequency and estimate. The estimate is set up + // by analyse(). + struct Data + { + Data(unsigned int f) : freq(f), estimate(0) {} + + unsigned int freq; + double estimate; + }; + + // Internal representation, as a map from observations to frequencies. + // After calling analyse(), it provides the estimates as well. + typedef map > DataMap; + + // Minimum number of data points for a valid analysis +#ifdef _WIN32 +#define MinInput (5) +#else + static const unsigned int MinInput = 5; +#endif + + template double sq(T d) { return ((double) d)*d; } + + double smoothed(ObsType i, double intercept, double slope) + { return (exp(intercept + slope * log((double) i))); } + + public: + // Iterator type for iterate(); + typedef typename DataMap::const_iterator iterator; + + // Construct a SGT object. + SGT() : totalObs(0) {} + + // Destroy SGT object. + ~SGT() {} + + // Add a data point. + // If an observation with the same value has already been supplied, this adds + // to its frequency. + void add(ObsType observation, unsigned int frequency) + { + typename DataMap::iterator i = data.find(observation); + if (i == data.end()) + data.insert(make_pair(observation, Data(frequency))); + else + (*i).second.freq += frequency; + + totalObs += observation * frequency; + } + + // Get total number of observations (= sum of obs*freq) + ObsType total() const { return totalObs; } + + // Analyse the data. + // Returns false if there is not enough data for a valid analysis. + // In this case, the estimate is set to the original value. + bool analyse() + { + if (data.size() < MinInput) + return false; + + // The code which follows is based on S and G's analyseInput() + ObsType bigN = 0; + unsigned int rows = data.size(); + + // j could be declared in each for statement, but has to be here for + // Visual C++, which disobeys the ANSI standard on variable scope. + typename DataMap::iterator j; + for (j = data.begin(); j != data.end(); ++j) + bigN += (*j).first * (*j).second.freq; + + // Find the frequency for observation of value 1, if any + iterator row1 = row(1, data.begin()); + PZero = (row1 == data.end()) ? 0 : (*row1).second.freq / (double) bigN; + + // Set up internal arrays + vector log_obs(rows); + vector log_Z(rows); + vector rStar(rows); + + double XYs = 0, Xsquares = 0, meanX = 0, meanY = 0; + ObsType prevObs = 0; + unsigned int r = 0; + + for (j = data.begin(); j != data.end(); ++r) + { + ObsType obs = (*j).first; + Data &d = (*j).second; + + double k = (++j == data.end()) + ? (double) (2 * obs - prevObs) : (double) (*j).first; + + double Z = 2 * d.freq / (k - prevObs); + log_obs[r] = log((double) obs); + log_Z[r] = log(Z); + + meanX += log_obs[r]; + meanY += log_Z[r]; + + prevObs = obs; + } + + // Find the line with the best fit. + meanX /= rows; + meanY /= rows; + + for (r = 0; r < rows; ++r) + { + XYs += (log_obs[r] - meanX) * (log_Z[r] - meanY); + Xsquares += sq(log_obs[r] - meanX); + } + double slope = XYs / Xsquares; + double intercept = meanY - slope * meanX; + + // Now construct the estimates smoothing using the fitted line. + bool indiffValsSeen = false; + + for (j = data.begin(), r = 0; j != data.end(); ++j, ++r) + { + ObsType obs = (*j).first; + Data &d = (*j).second; + + ObsType obs1 = obs + 1; + double y = obs1 * smoothed(obs1, intercept, slope) + / smoothed(obs, intercept, slope); + + iterator nextRow = row(obs1, j); + if (nextRow == data.end()) + { + indiffValsSeen = true; + } + else if (!indiffValsSeen) + { + unsigned int next_n = (*nextRow).second.freq; + unsigned int freq = d.freq; + + double x = obs1 * next_n / (double) freq; + printf("%0.2f %0.2f %0.2f\n", + (float) obs1, (float) next_n, (float) freq); + printf("stdv %0.2f\n", + sqrt(sq(obs1) * next_n + / (sq(freq)) * (1 + next_n / (double) freq))); + printf("x %0.2f y %0.2f\n", x, y); + + if (fabs(x - y) <= 1.96 * sqrt(sq(obs1) * next_n + / (sq(freq)) * (1 + next_n / (double) freq))) + { + indiffValsSeen = true; + } + else + { + rStar[r] = x; + } + } + + if (indiffValsSeen) + { + rStar[r] = y; + } + } + + double bigNprime = 0.0; + for (j = data.begin(), r = 0; j != data.end(); ++j, ++r) { + printf("%f\n", (float) (*j).second.freq); + bigNprime += (*j).second.freq * rStar[r]; + } + + printf("%f %f\n", (float) PZero, (float) bigNprime); + for (int i = 0; i < (int) rStar.size(); i++) + printf("%f\n", rStar[i]); + + for (j = data.begin(), r = 0; j != data.end(); ++j, ++r) + (*j).second.estimate = (1 - PZero) * rStar[r] / bigNprime; + + return true; + } + + // Analyze the data. + // This just calls analyse(), and is included as a concession to speakers + // of debased dialects of English. + void analyze() { analyse(); } + + // Get the estimate for an observation. + // If there was no such observation, return false. + // Otherwise return true and yield the estimate. + bool estimate(ObsType observation, double &estimate) const + { + if (observation == 0) + { + estimate = PZero; + return true; + } + + iterator rownum = row(observation, data.begin()); + if (rownum == data.end()) + { + return false; + } + + estimate = (*rownum).second.estimate; + return true; + } + + // Get start and end iterators over the data map. + // You do not derefence these iterators directly, but instead used the + // access functions, obs, freq and estimate. + pair iterate() const + { return make_pair(data.begin(), data.end()); } + + // Get the observation from an iterator. + ObsType obs(iterator i) const { return (*i).first; } + + // Get the frequency from an iterator (as supplied by add). + unsigned int freq(iterator i) const { return (*i).second.freq; } + + // Get the estimated relative frequency from an iterator. + double estimate(iterator i) const { return (*i).second.estimate; } + + private: + // The data points + DataMap data; + + // Zero estimate (only valid after a call to analyse()). + double PZero; + + // Total number of observations + ObsType totalObs; + + // Find the last row of the data which has a value equals to obs. + // If there is no such value, return data.end(). + // start is a hint about where to start searching. + iterator row(ObsType obs, iterator start) const + { + iterator j = start; + + while (j != data.end() && (*j).first < obs) + ++j; + + return ((j != data.end() && (*j).first == obs) ? j : data.end()); + } +}; + +#endif //SGT_H diff --git a/sgt/sgt.h.gch b/sgt/sgt.h.gch new file mode 100644 index 0000000..2e98d03 Binary files /dev/null and b/sgt/sgt.h.gch differ diff --git a/sgt/sgt.zip b/sgt/sgt.zip new file mode 100644 index 0000000..ccce498 Binary files /dev/null and b/sgt/sgt.zip differ diff --git a/sgt/sgttest.cpp b/sgt/sgttest.cpp new file mode 100644 index 0000000..62ab3b4 --- /dev/null +++ b/sgt/sgttest.cpp @@ -0,0 +1,54 @@ +// Test program for sgt code + +// Reads a file in which each line contains an observed value and the +// frequency of the value. Prints out a table of the estimates, and also the +// estimate for value 1 (to test the estimate() function). + +#include "sgt.h" +#include + +// Set this to the type for the observation +//typedef double Obs; +typedef unsigned int Obs; + +int main() +{ + SGT sgt; + Obs observation; + unsigned int frequency; + while (cin >> observation) + { + if (!(cin >> frequency)) + { + cerr << "Incomplete input" << endl; + return -1; + } + + sgt.add(observation, frequency); + } + + sgt.analyse(); + cout << "Results:" << endl; + + // Use iterators to access the results + pair::iterator, SGT::iterator> i = sgt.iterate(); + for (; i.first != i.second; ++i.first) + { + cout << sgt.obs(i.first) + << "\t" << sgt.freq(i.first) + << "\t" << sgt.estimate(i.first) + << "\t" << sgt.estimate(i.first) * sgt.total() + << endl; + } + + double estimate; + sgt.estimate(0, estimate); + cout << "0\t" << estimate << endl; + + if (sgt.estimate(1, estimate)) + cout << "Estimate on obs=1: " << estimate << endl; + else + cout << "No estimate for obs=1" << endl; + return 0; +} + diff --git a/src/com/owoga/prhyme/util/math.clj b/src/com/owoga/prhyme/util/math.clj index 5e804b3..1fa3d2d 100644 --- a/src/com/owoga/prhyme/util/math.clj +++ b/src/com/owoga/prhyme/util/math.clj @@ -157,6 +157,18 @@ (fn [x] (Math/pow Math/E (+ b (* m (Math/log x))))))) +(defn averaged-smooth + "Assumes 0 Nrs are included." + [rs Nrs] + (let [rs (concat rs [(inc (last rs))]) + Nrs (concat Nrs [(+ (last Nrs) (- (last Nrs) + (last (butlast Nrs))))])] + [rs Nrs])) +(comment + (averaged-smooth [1 2 3 4] [32 10 0 2]) + + ) + (defn average-consecutives "Average all the non-zero frequency of observations (frequency of frequencies) using the equation Zr = Nr / 0.5 (t - q) @@ -312,14 +324,28 @@ lgt? e (conj estimations estimation))))) - N* (apply + (map #(apply * %) (map vector nrs estimations)))] - [(map #(* (- 1 p0) (/ % N*)) estimations)])) + N* (apply + (map #(apply * %) (map vector nrs estimations))) + probs (cons + (float p0) + (map #(* (- 1 p0) (/ % N*)) estimations)) + sum-probs (apply + probs)] + [(cons 0 rs) (map #(/ % sum-probs) probs)])) (comment - (let [rs [1 2 3 4 5 6 7 8 9 10 12 26] - nrs [32 20 10 3 1 2 1 1 1 2 1 1] - sgts (sgt rs nrs)] - sgts - ) + (let [rs [ 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26] + nrs [32 20 10 3 1 2 1 1 1 2 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1] + rs [1 2 3 4 5 6 7 8 9 10 12 26] + nrs [32 20 10 3 1 2 1 1 1 2 1 1]] + (map #(apply * %) (map vector rs (sgt rs nrs))) + (sgt rs nrs)) + + ) +(comment + (let [rs [1 2 3 4 5 6 7 8 9 10 12] + nrs [120 40 24 13 15 5 11 2 2 1 3] + sgts (sgt rs nrs) + N0 (apply + nrs)] + [(float (/ 120 N0)) + (apply + sgts)]) )