From f0ea2bc513e04704d60629be6f68c05b91ac25a4 Mon Sep 17 00:00:00 2001
From: Eric Ihli <ericihli@gmail.com>
Date: Tue, 27 Oct 2020 20:04:29 -0700
Subject: [PATCH] WIP

---
 src/com/owoga/prhyme/data/dictionary.clj | 44 ++++++++++++++++++++++++
 src/com/owoga/prhyme/data/phonetics.clj  | 22 ++++++++++++
 src/com/owoga/prhyme/data/thesaurus.clj  |  9 +++++
 3 files changed, 75 insertions(+)
 create mode 100644 src/com/owoga/prhyme/data/dictionary.clj
 create mode 100644 src/com/owoga/prhyme/data/phonetics.clj
 create mode 100644 src/com/owoga/prhyme/data/thesaurus.clj

diff --git a/src/com/owoga/prhyme/data/dictionary.clj b/src/com/owoga/prhyme/data/dictionary.clj
new file mode 100644
index 0000000..b4a8c07
--- /dev/null
+++ b/src/com/owoga/prhyme/data/dictionary.clj
@@ -0,0 +1,44 @@
+(ns com.owoga.prhyme.data.dictionary
+  (:require [clojure.string :as string]
+            [clojure.java.io :as io]
+            [clojure.set]
+            [com.owoga.prhyme.core :as prhyme]))
+
+(def cmu-dict
+  "Token vectors, one per line of the CMU dictionary resource, split on tab or space."
+  (with-open [rdr (io/reader (io/resource "cmudict_SPHINX_40"))] ; close reader after load
+    (doall (map #(string/split % #"[\t ]") (line-seq rdr)))))    ; realize before closing
+
+(def prhyme-dict  ; vector of every cmu-dict entry passed through prhyme/cmu->prhyme
+  (mapv prhyme/cmu->prhyme cmu-dict))
+
+(def popular  ; set of the lines of popular.txt; with-open closes the reader once it is built
+  (with-open [rdr (io/reader (io/resource "popular.txt"))] (set (line-seq rdr))))
+
+(def adverbs
+  ;; Lines of adverbs.txt that are also in `popular`; the reader is closed after loading.
+  (with-open [rdr (io/reader (io/resource "adverbs.txt"))]
+    (clojure.set/intersection popular (set (line-seq rdr)))))
+
+(def adjectives
+  ;; Lines of adjectives.txt that are also in `popular`; the reader is closed after loading.
+  (with-open [rdr (io/reader (io/resource "adjectives.txt"))]
+    (clojure.set/intersection popular (set (line-seq rdr)))))
+
+(def verbs
+  ;; Lines of verbs.txt that are also in `popular`; the reader is closed after loading.
+  (with-open [rdr (io/reader (io/resource "verbs.txt"))]
+    (clojure.set/intersection popular (set (line-seq rdr)))))
+
+(def nouns
+  ;; Lines of nouns.txt that are also in `popular`; the reader is closed after loading.
+  (with-open [rdr (io/reader (io/resource "nouns.txt"))]
+    (clojure.set/intersection popular (set (line-seq rdr)))))
+
+(defn english?
+  "True when over 70% of text's whitespace-split words, lower-cased, are :normalized-word values in prhyme-dict."
+  [text]
+  (let [words (string/split text #"\s+")
+        known (into #{} (map :normalized-word prhyme-dict)) ; built once per call, not once per word
+        hits  (count (filter #(known (string/lower-case %)) words))]
+    (< 0.7 (/ hits (max 1 (count words))))))  ; max 1 guards the division when there are no words
diff --git a/src/com/owoga/prhyme/data/phonetics.clj b/src/com/owoga/prhyme/data/phonetics.clj
new file mode 100644
index 0000000..a014bc1
--- /dev/null
+++ b/src/com/owoga/prhyme/data/phonetics.clj
@@ -0,0 +1,22 @@
+(ns com.owoga.prhyme.data.phonetics
+  (:require [clojure.string :as string]
+            [clojure.set]
+            [clojure.java.io :as io]))
+
+(def phonemap
+  "Map built from the tab-split lines of cmudict-0.7b.phones (first field -> second)."
+  (with-open [rdr (io/reader (io/resource "cmudict-0.7b.phones"))] ; closed after loading
+    ;; into {} consumes every line eagerly; each split line must be a two-element pair
+    (into {} (map #(string/split % #"\t") (line-seq rdr)))))
+
+(def long-vowel (hash-set "EY" "IY" "AY" "OW" "UW")) ; phone symbols classed as long vowels
+
+(def short-vowel (set ["AA" "AE" "AH" "AO" "AW" "EH" "ER" "IH" "OY" "UH"])) ; phone symbols classed as short vowels
+
+(def vowel (into short-vowel long-vowel)) ; every long or short vowel symbol
+
+(def consonant (reduce disj (set (keys phonemap)) vowel)) ; phonemap keys that are not vowels
+
+(def syllable-end (into consonant long-vowel)) ; any consonant or long vowel symbol
+
+(def single-sound-bigram (hash-set "TH" "SH" "PH" "WH" "CH")) ; two-letter pairs the name marks as a single sound
diff --git a/src/com/owoga/prhyme/data/thesaurus.clj b/src/com/owoga/prhyme/data/thesaurus.clj
new file mode 100644
index 0000000..6489386
--- /dev/null
+++ b/src/com/owoga/prhyme/data/thesaurus.clj
@@ -0,0 +1,9 @@
+(ns com.owoga.prhyme.data.thesaurus
+  (:require [clojure.string :as string]
+            [clojure.java.io :as io]))
+
+(def thesaurus
+  "Map from the first comma-separated field of each mthesaur.txt line to a seq of the rest."
+  (with-open [rdr (io/reader (io/resource "mthesaur.txt"))] ; into {} reads all lines before the close
+    (into {} (for [line (line-seq rdr) :let [fields (string/split line #",")]]
+               [(first fields) (rest fields)]))))