From f0ea2bc513e04704d60629be6f68c05b91ac25a4 Mon Sep 17 00:00:00 2001
From: Eric Ihli
Date: Tue, 27 Oct 2020 20:04:29 -0700
Subject: [PATCH] WIP

---
Review note: the file contents below were revised during review (readers are
closed, the dictionary word set is built once, docstrings added). The blob ids
on the `index` lines are still those of the original commit.

 src/com/owoga/prhyme/data/dictionary.clj | 65 ++++++++++++++++++++++++++++++++++++
 src/com/owoga/prhyme/data/phonetics.clj  | 37 ++++++++++++++++++++
 src/com/owoga/prhyme/data/thesaurus.clj  | 15 ++++++++
 3 files changed, 117 insertions(+)
 create mode 100644 src/com/owoga/prhyme/data/dictionary.clj
 create mode 100644 src/com/owoga/prhyme/data/phonetics.clj
 create mode 100644 src/com/owoga/prhyme/data/thesaurus.clj

diff --git a/src/com/owoga/prhyme/data/dictionary.clj b/src/com/owoga/prhyme/data/dictionary.clj
new file mode 100644
index 0000000..b4a8c07
--- /dev/null
+++ b/src/com/owoga/prhyme/data/dictionary.clj
@@ -0,0 +1,65 @@
+(ns com.owoga.prhyme.data.dictionary
+  "Word data loaded from classpath resources: the cmudict_SPHINX_40 entries
+  and the part-of-speech word sets restricted to the words in popular.txt."
+  (:require [clojure.string :as string]
+            [clojure.java.io :as io]
+            [clojure.set]
+            [com.owoga.prhyme.core :as prhyme]))
+
+(defn- resource-lines
+  "Reads every line of the classpath resource `resource-name` into a vector.
+  The reader is closed before returning, so no file handle outlives the call."
+  [resource-name]
+  (with-open [rdr (io/reader (io/resource resource-name))]
+    (into [] (line-seq rdr))))
+
+(def cmu-dict
+  "Lines of the cmudict_SPHINX_40 resource, each split on tabs and spaces."
+  (map #(string/split % #"[\t ]") (resource-lines "cmudict_SPHINX_40")))
+
+(def prhyme-dict
+  "Vector of `prhyme/cmu->prhyme` applied to each entry of `cmu-dict`."
+  (mapv prhyme/cmu->prhyme cmu-dict))
+
+(def popular
+  "Set of the lines of the popular.txt resource."
+  (set (resource-lines "popular.txt")))
+
+(defn- popular-subset
+  "Returns the members of `popular` that are also lines of the classpath
+  resource `resource-name`."
+  [resource-name]
+  (clojure.set/intersection popular (set (resource-lines resource-name))))
+
+(def adverbs
+  "Members of `popular` that are listed in adverbs.txt."
+  (popular-subset "adverbs.txt"))
+
+(def adjectives
+  "Members of `popular` that are listed in adjectives.txt."
+  (popular-subset "adjectives.txt"))
+
+(def verbs
+  "Members of `popular` that are listed in verbs.txt."
+  (popular-subset "verbs.txt"))
+
+(def nouns
+  "Members of `popular` that are listed in nouns.txt."
+  (popular-subset "nouns.txt"))
+
+(def ^:private normalized-words
+  "Set of the `:normalized-word` values of `prhyme-dict`, built once at load
+  time so `english?` does not rebuild it for every word it checks."
+  (into #{} (map :normalized-word) prhyme-dict))
+
+(defn english?
+  "Returns true when more than 70% of the whitespace-separated words in `text`
+  are, once lower-cased, `:normalized-word` values of `prhyme-dict`.
+  Text containing no words yields false."
+  [text]
+  (let [;; Leading whitespace makes split emit an empty first token; drop it
+        ;; so it does not count against the ratio.
+        words (remove string/blank? (string/split text #"\s+"))
+        english-words (filter #(contains? normalized-words (string/lower-case %))
+                              words)]
+    (< 0.7 (/ (count english-words) (max 1 (count words))))))
diff --git a/src/com/owoga/prhyme/data/phonetics.clj b/src/com/owoga/prhyme/data/phonetics.clj
new file mode 100644
index 0000000..a014bc1
--- /dev/null
+++ b/src/com/owoga/prhyme/data/phonetics.clj
@@ -0,0 +1,37 @@
+(ns com.owoga.prhyme.data.phonetics
+  "Phoneme symbol sets built from the cmudict-0.7b.phones resource."
+  (:require [clojure.string :as string]
+            [clojure.set]
+            [clojure.java.io :as io]))
+
+(def phonemap
+  "Map built from the tab-separated lines of cmudict-0.7b.phones: the first
+  field of each line is the key and the second is the value."
+  ;; `into` realizes every line inside `with-open`, so the reader is closed
+  ;; as soon as the map has been built.
+  (with-open [rdr (io/reader (io/resource "cmudict-0.7b.phones"))]
+    (into {} (map #(string/split % #"\t")) (line-seq rdr))))
+
+(def long-vowel
+  "Phoneme symbols treated as long vowels."
+  #{"EY" "IY" "AY" "OW" "UW"})
+
+(def short-vowel
+  "Phoneme symbols treated as short vowels."
+  #{"AA" "AE" "AH" "AO" "AW" "EH" "ER" "IH" "OY" "UH"})
+
+(def vowel
+  "Union of `long-vowel` and `short-vowel`."
+  (clojure.set/union long-vowel short-vowel))
+
+(def consonant
+  "Every key of `phonemap` that is not in `vowel`."
+  (clojure.set/difference (into #{} (keys phonemap)) vowel))
+
+(def syllable-end
+  "Union of `consonant` and `long-vowel`."
+  (clojure.set/union consonant long-vowel))
+
+(def single-sound-bigram
+  "Two-letter strings designated as single-sound bigrams."
+  #{"TH" "SH" "PH" "WH" "CH"})
diff --git a/src/com/owoga/prhyme/data/thesaurus.clj b/src/com/owoga/prhyme/data/thesaurus.clj
new file mode 100644
index 0000000..6489386
--- /dev/null
+++ b/src/com/owoga/prhyme/data/thesaurus.clj
@@ -0,0 +1,15 @@
+(ns com.owoga.prhyme.data.thesaurus
+  "Thesaurus lookup table built from the mthesaur.txt resource."
+  (:require [clojure.string :as string]
+            [clojure.java.io :as io]))
+
+(def thesaurus
+  "Map from the first comma-separated field of each mthesaur.txt line to a
+  seq of that line's remaining fields."
+  ;; `into` realizes every line inside `with-open`, so the reader is closed
+  ;; as soon as the map has been built.
+  (with-open [rdr (io/reader (io/resource "mthesaur.txt"))]
+    (into {}
+          (comp (map #(string/split % #","))
+                (map (juxt first rest)))
+          (line-seq rdr))))