Working completions

main
Eric Ihli 4 years ago
parent cbde1996dc
commit 495ef6c602

@ -2,7 +2,8 @@
(:require [clojure.java.io :as io]
[clojure.string :as string]
[clojure.set]
[com.owoga.prhyme.nlp.core :as nlp]))
[com.owoga.prhyme.nlp.core :as nlp]
[com.owoga.prhyme.util.math :as math]))
(def re-word
"Regex for tokenizing a string into words
@ -17,12 +18,24 @@
(re-seq re-word)
(map second)
(map string/lower-case)
(cons :bol)
(reverse)
(cons :end)))
(cons :eol)))
(defn tokenize-line
[line]
(->> line
(string/trim)
(re-seq re-word)
(map second)
(map string/lower-case)))
(comment
(-> (slurp "dev/examples/sandman.txt")
tokenize))
(->> (slurp "dev/examples/sandman.txt")
(#(string/split % #"\n"))
(map tokenize-line))
)
(defn zero-to-n-seq
([coll]
@ -85,7 +98,99 @@
(zero-to-n-seq (first windows)))
(rest windows))))))
(defn add-to-trie-1
[trie n tokens]
(let [pad-n (dec n)
tokens (concat (repeat pad-n :bol) tokens (repeat pad-n :eol))
partitions (partition n 1 tokens)]
(reduce
(fn [acc tokens]
(update-in acc (concat tokens [:count]) (fnil inc 0)))
trie
partitions)))
(defn flatmap
([m]
(flatmap m []))
([m prefix]
(mapcat
(fn [[k v]]
(if (map? v)
(flatmap v (conj prefix k))
[(conj prefix k) v]))
m)))
(defn filter-trie-to-ngrams [trie n]
(->> trie
(flatmap)
(partition 2)
;; Inc to account for :count
(filter #(= (inc n) (count (first %))))))
(comment
(let [trie {}]
(-> (add-to-trie-1 trie 2 '("of" "lives" "lost" "at" "sea"))
(add-to-trie-1 1 '("of" "lives" "lost" "at" "sea"))))
)
(defn wrand
"given a vector of slice sizes, returns the index of a slice given a
random spin of a roulette wheel with compartments proportional to
slices."
[slices]
(let [total (reduce + slices)
r (rand total)]
(loop [i 0 sum 0]
(if (< r (+ (slices i) sum))
i
(recur (inc i) (+ (slices i) sum))))))
(defn completions [trie probs words]
(let [n (inc (count words))
possibilities (->> (get-in trie words)
(filter #(string? (first %)))
(map (fn [[k v]]
[k (get-in probs [n (:count v)])]))
(into {}))
sum-probs (apply + (vals possibilities))
possibilities (into {} (map (fn [[k v]] [k (/ v sum-probs)]) possibilities))]
possibilities))
(comment
;; Turning corpus into a trie.
(let [documents (->> "dark-corpus"
io/file
file-seq
(remove #(.isDirectory %))
(drop 500)
(take 5))
trie (->> documents
(map slurp)
(mapcat #(string/split % #"\n"))
(map tokenize-line)
(filter #(> (count %) 1))
(take 500)
(reduce
(fn [acc tokens]
(-> (add-to-trie-1 acc 1 tokens)
(add-to-trie-1 2 tokens)
(add-to-trie-1 3 tokens)))
{}))
probs (->> (range 1 4)
(map #(vector % (filter-trie-to-ngrams trie %)))
(map (fn [[n v]] [n (map #(second %) v)]))
(map (fn [[n v]] [n (into (sorted-map) (frequencies v))]))
(map (fn [[n v]] [n (math/sgt (keys v) (vals v))]))
(map (fn [[n [rs probs]]]
[n (into {} (map vector rs probs))]))
(into {}))]
(reverse (sort-by second (completions trie probs [:bol "you"]))))
(into {} (map vector [1 2 3] [4 5 6]))
;;
;; => ([1 (1 2 8 7 3 6 4 23) (85 18 2 2 6 3 1 1)]
;; [2 (1 2 5 3 4 7) (170 25 2 4 2 2)]
;; [3 (1 2 3 4 7 5) (213 30 5 1 1 3)])
(let [last-window '("in" "the" "frat")]
(concat (zero-to-n-seq last-window)
(rest (n-to-zero-seq last-window))))
@ -96,6 +201,7 @@
(string/split
"the cat in the hat is the rat in the frat"
#" "))
;; => {"the"
;; {:count 3,
;; "cat" {:count 1, "in" {:count 1}},
@ -184,7 +290,29 @@
:else (flat-at-depth (->> m (mapcat second) (remove #(= :count (first %))))
(dec depth)))))
(defn flatmap
([m]
(flatmap m []))
([m prefix]
(mapcat
(fn [[k v]]
(if (map? v)
(flatmap v (conj prefix k))
[(conj prefix k) v]))
m)))
(defn filter-trie-to-ngrams [trie n]
(->> trie
(flatmap)
(partition 2)
;; Inc to account for :count
(filter #(= (inc n) (count (first %))))))
(apply hash-map '([1 2] 3 [4 5] 6))
(comment
(apply hash-map (flatmap {1 {2 {3 4} 5 {6 7}} 8 {9 10}} []))
(let [trie {"d" {:count 3
"o" {:count 3
"g" {:count 2}
@ -196,7 +324,8 @@
"g" {:count 1}}
"i" {:count 1
"g" {:count 1}}}}]
(->> (flat-at-depth trie 2)))
(filter-trie-to-ngrams trie 3))
)
@ -458,7 +587,6 @@
(def n-gram-freq-map (n-gram-frequency-map trie 3))
(def unigram-frequencies (n-gram-freq-map 1))
unigram-frequencies
)
(defn number-of-n-grams-that-occur-with-count [trie n c]

@ -0,0 +1,72 @@
SGT
===
The files here contain a C++ class for implementing simple Good-Turing
re-estimation, as described by Geoff Sampson in the book Empirical Linguistics
(2001), and on the web at http://www.grsampson.net/RGoodTur.html. The code
here is a C++ adaptation of the published code by Sampson and Gale, with the
bug fix issued in 2000. It is encapsulated as a class to allow it to be
incorporated into other programs. An additional coding change is that the data
can be presented in any order, whereas the original code required the data to
be in ascending order.
Sampson's original code was issued with no restrictions on use. In keeping
with the spirit of this, the code here is issued under an open source licence
which allows essentially unrestricted use.
LICENCE
-------
Copyright (c) David Elworthy 2004.
All rights reserved.
Redistribution and use in source and binary forms for any purpose, with or
without modification, are permitted provided that the following conditions
are met:
1. Redistributions of source code must retain the above copyright notice,
this list of conditions, and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the disclaimer that follows
these conditions in the documentation and/or other materials
provided with the distribution.
THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN
NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
Contact details
---------------
You may contact me at david@friendlymoose.com. I would be happy to hear of any
experiences you have with the code; please feel free to send me updated
versions. The reference site for the code is http://www.friendlymoose.com/.
Files and use
-------------
There are three files:
sgt.h SGT header file
sgttest.cpp A test and example program
There is no source file, as the SGT class is a template over the observation
type, typically either an int or a double.
Information about using the class is included in the header file. The code has
been tested with g++ version 3.2 on cygwin and Microsoft Visual Studio version
6 on Windows 2000. You can compile and link the test program using g++ using
the command
g++ -o sgttest sgttest.cpp
For Visual Studio, from the command line, you can compile and link with
cl -GX sgttest.cpp
Version history
---------------
Initial version released January 2004.
Updated to a better implementation April 2004.

Binary file not shown.

@ -0,0 +1,12 @@
1 32
2 20
3 10
4 3
5 1
6 2
7 1
8 1
9 1
10 2
12 1
26 1

@ -0,0 +1,314 @@
#ifndef SGT_H
#define SGT_H
// Simple Good-Turing estimation
//
// Copyright (c) David Elworthy 2004.
// A class for implementing simple Good-Turing re-estimation, as described by
// Geoff Sampson in the book Empirical Linguistics (2001), and on the web at
// http://www.grsampson.net/RGoodTur.html. The code here is a C++ adaptation
// of the published code by Sampson and Gale, with the bug fix issued in
// 2000. It is encapsulated as a class to allow it to be incorporated into
// other programs. An additional coding change is that the data can be
// presented in any order, whereas the original code required the data to be
// in ascending order.
//
// Copyright (c) David Elworthy 2004.
// All rights reserved.
//
// Redistribution and use in source and binary forms for any purpose, with or
// without modification, are permitted provided that the following conditions
// are met:
//
// 1. Redistributions of source code must retain the above copyright notice,
// this list of conditions, and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions, and the disclaimer that follows
// these conditions in the documentation and/or other materials
// provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
// WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN
// NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// You may contact me at david@friendlymoose.com.
#include <iostream>
#include <map>
#include <vector>
#include <cmath>
using namespace std;
// Simple Good-Turing class.
// To use the class, create an SGT object and data to it by calling add() with
// each data point. A data point consists of the observed value and the
// frequency of the the observation (what Sampson and Gale refer to as the
// frequency, and the frequency of the the frequency). When you have added all
// the data points call analyse(). You can then call estimate() with an
// observed value as argument to get the estimated frequency for that value,
// or call iterate() to iterate over the data points. There is one special
// case to estimate(). If called with an argument of zero, it delivers the
// estimated frequency for unseen events. This is not delivered from pair().
// To get back from the estimate value to the smoothed value of the
// observation, multiply by total();
//
// In the original Sampson and Gale version, the observation was an integer.
// For this version, we make the code be a template over the observation type.
// However, it must always be some suitable numeric type, such as int or double.
//
// The code is implemented using the Standard Template Library (STL).
template <class ObsType> class SGT
{
private:
// Data block, holding the frequency and estimate. The estimate is set up
// by analyse().
struct Data
{
Data(unsigned int f) : freq(f), estimate(0) {}
unsigned int freq;
double estimate;
};
// Internal representation, as a map from observations to frequencies.
// After calling analyse(), it provides the estimates as well.
typedef map<ObsType, Data, less<ObsType> > DataMap;
// Minimum number of data points for a valid analysis
#ifdef _WIN32
#define MinInput (5)
#else
static const unsigned int MinInput = 5;
#endif
template <class T> double sq(T d) { return ((double) d)*d; }
double smoothed(ObsType i, double intercept, double slope)
{ return (exp(intercept + slope * log((double) i))); }
public:
// Iterator type for iterate();
typedef typename DataMap::const_iterator iterator;
// Construct a SGT object.
SGT() : totalObs(0) {}
// Destroy SGT object.
~SGT() {}
// Add a data point.
// If an observation with the same value has already been supplied, this adds
// to its frequency.
void add(ObsType observation, unsigned int frequency)
{
typename DataMap::iterator i = data.find(observation);
if (i == data.end())
data.insert(make_pair(observation, Data(frequency)));
else
(*i).second.freq += frequency;
totalObs += observation * frequency;
}
// Get total number of observations (= sum of obs*freq)
ObsType total() const { return totalObs; }
// Analyse the data.
// Returns false if there is not enough data for a valid analysis.
// In this case, the estimate is set to the original value.
bool analyse()
{
if (data.size() < MinInput)
return false;
// The code which follows is based on S and G's analyseInput()
ObsType bigN = 0;
unsigned int rows = data.size();
// j could be declared in each for statement, but has to be here for
// Visual C++, which disobeys the ANSI standard on variable scope.
typename DataMap::iterator j;
for (j = data.begin(); j != data.end(); ++j)
bigN += (*j).first * (*j).second.freq;
// Find the frequency for observation of value 1, if any
iterator row1 = row(1, data.begin());
PZero = (row1 == data.end()) ? 0 : (*row1).second.freq / (double) bigN;
// Set up internal arrays
vector<double> log_obs(rows);
vector<double> log_Z(rows);
vector<double> rStar(rows);
double XYs = 0, Xsquares = 0, meanX = 0, meanY = 0;
ObsType prevObs = 0;
unsigned int r = 0;
for (j = data.begin(); j != data.end(); ++r)
{
ObsType obs = (*j).first;
Data &d = (*j).second;
double k = (++j == data.end())
? (double) (2 * obs - prevObs) : (double) (*j).first;
double Z = 2 * d.freq / (k - prevObs);
log_obs[r] = log((double) obs);
log_Z[r] = log(Z);
meanX += log_obs[r];
meanY += log_Z[r];
prevObs = obs;
}
// Find the line with the best fit.
meanX /= rows;
meanY /= rows;
for (r = 0; r < rows; ++r)
{
XYs += (log_obs[r] - meanX) * (log_Z[r] - meanY);
Xsquares += sq(log_obs[r] - meanX);
}
double slope = XYs / Xsquares;
double intercept = meanY - slope * meanX;
// Now construct the estimates smoothing using the fitted line.
bool indiffValsSeen = false;
for (j = data.begin(), r = 0; j != data.end(); ++j, ++r)
{
ObsType obs = (*j).first;
Data &d = (*j).second;
ObsType obs1 = obs + 1;
double y = obs1 * smoothed(obs1, intercept, slope)
/ smoothed(obs, intercept, slope);
iterator nextRow = row(obs1, j);
if (nextRow == data.end())
{
indiffValsSeen = true;
}
else if (!indiffValsSeen)
{
unsigned int next_n = (*nextRow).second.freq;
unsigned int freq = d.freq;
double x = obs1 * next_n / (double) freq;
printf("%0.2f %0.2f %0.2f\n",
(float) obs1, (float) next_n, (float) freq);
printf("stdv %0.2f\n",
sqrt(sq(obs1) * next_n
/ (sq(freq)) * (1 + next_n / (double) freq)));
printf("x %0.2f y %0.2f\n", x, y);
if (fabs(x - y) <= 1.96 * sqrt(sq(obs1) * next_n
/ (sq(freq)) * (1 + next_n / (double) freq)))
{
indiffValsSeen = true;
}
else
{
rStar[r] = x;
}
}
if (indiffValsSeen)
{
rStar[r] = y;
}
}
double bigNprime = 0.0;
for (j = data.begin(), r = 0; j != data.end(); ++j, ++r) {
printf("%f\n", (float) (*j).second.freq);
bigNprime += (*j).second.freq * rStar[r];
}
printf("%f %f\n", (float) PZero, (float) bigNprime);
for (int i = 0; i < (int) rStar.size(); i++)
printf("%f\n", rStar[i]);
for (j = data.begin(), r = 0; j != data.end(); ++j, ++r)
(*j).second.estimate = (1 - PZero) * rStar[r] / bigNprime;
return true;
}
// Analyze the data.
// This just calls analyse(), and is included as a concession to speakers
// of debased dialects of English.
void analyze() { analyse(); }
// Get the estimate for an observation.
// If there was no such observation, return false.
// Otherwise return true and yield the estimate.
bool estimate(ObsType observation, double &estimate) const
{
if (observation == 0)
{
estimate = PZero;
return true;
}
iterator rownum = row(observation, data.begin());
if (rownum == data.end())
{
return false;
}
estimate = (*rownum).second.estimate;
return true;
}
// Get start and end iterators over the data map.
// You do not derefence these iterators directly, but instead used the
// access functions, obs, freq and estimate.
pair<iterator, iterator> iterate() const
{ return make_pair(data.begin(), data.end()); }
// Get the observation from an iterator.
ObsType obs(iterator i) const { return (*i).first; }
// Get the frequency from an iterator (as supplied by add).
unsigned int freq(iterator i) const { return (*i).second.freq; }
// Get the estimated relative frequency from an iterator.
double estimate(iterator i) const { return (*i).second.estimate; }
private:
// The data points
DataMap data;
// Zero estimate (only valid after a call to analyse()).
double PZero;
// Total number of observations
ObsType totalObs;
// Find the last row of the data which has a value equals to obs.
// If there is no such value, return data.end().
// start is a hint about where to start searching.
iterator row(ObsType obs, iterator start) const
{
iterator j = start;
while (j != data.end() && (*j).first < obs)
++j;
return ((j != data.end() && (*j).first == obs) ? j : data.end());
}
};
#endif //SGT_H

Binary file not shown.

Binary file not shown.

@ -0,0 +1,54 @@
// Test program for sgt code
// Reads a file in which each line contains an observed value and the
// frequency of the value. Prints out a table of the estimates, and also the
// estimate for value 1 (to test the estimate() function).
#include "sgt.h"
#include <iostream>
// Set this to the type for the observation
//typedef double Obs;
typedef unsigned int Obs;
int main()
{
SGT<Obs> sgt;
Obs observation;
unsigned int frequency;
while (cin >> observation)
{
if (!(cin >> frequency))
{
cerr << "Incomplete input" << endl;
return -1;
}
sgt.add(observation, frequency);
}
sgt.analyse();
cout << "Results:" << endl;
// Use iterators to access the results
pair<SGT<Obs>::iterator, SGT<Obs>::iterator> i = sgt.iterate();
for (; i.first != i.second; ++i.first)
{
cout << sgt.obs(i.first)
<< "\t" << sgt.freq(i.first)
<< "\t" << sgt.estimate(i.first)
<< "\t" << sgt.estimate(i.first) * sgt.total()
<< endl;
}
double estimate;
sgt.estimate(0, estimate);
cout << "0\t" << estimate << endl;
if (sgt.estimate(1, estimate))
cout << "Estimate on obs=1: " << estimate << endl;
else
cout << "No estimate for obs=1" << endl;
return 0;
}

@ -157,6 +157,18 @@
(fn [x]
(Math/pow Math/E (+ b (* m (Math/log x)))))))
(defn averaged-smooth
"Assumes 0 Nrs are included."
[rs Nrs]
(let [rs (concat rs [(inc (last rs))])
Nrs (concat Nrs [(+ (last Nrs) (- (last Nrs)
(last (butlast Nrs))))])]
[rs Nrs]))
(comment
(averaged-smooth [1 2 3 4] [32 10 0 2])
)
(defn average-consecutives
"Average all the non-zero frequency of observations (frequency of frequencies)
using the equation Zr = Nr / 0.5 (t - q)
@ -312,14 +324,28 @@
lgt?
e
(conj estimations estimation)))))
N* (apply + (map #(apply * %) (map vector nrs estimations)))]
[(map #(* (- 1 p0) (/ % N*)) estimations)]))
N* (apply + (map #(apply * %) (map vector nrs estimations)))
probs (cons
(float p0)
(map #(* (- 1 p0) (/ % N*)) estimations))
sum-probs (apply + probs)]
[(cons 0 rs) (map #(/ % sum-probs) probs)]))
(comment
(let [rs [1 2 3 4 5 6 7 8 9 10 12 26]
nrs [32 20 10 3 1 2 1 1 1 2 1 1]
sgts (sgt rs nrs)]
sgts
)
(let [rs [ 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26]
nrs [32 20 10 3 1 2 1 1 1 2 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1]
rs [1 2 3 4 5 6 7 8 9 10 12 26]
nrs [32 20 10 3 1 2 1 1 1 2 1 1]]
(map #(apply * %) (map vector rs (sgt rs nrs)))
(sgt rs nrs))
)
(comment
(let [rs [1 2 3 4 5 6 7 8 9 10 12]
nrs [120 40 24 13 15 5 11 2 2 1 3]
sgts (sgt rs nrs)
N0 (apply + nrs)]
[(float (/ 120 N0))
(apply + sgts)])
)

Loading…
Cancel
Save