Initial commit, syllabification and phonetics.
commit
69d88fb732
@ -0,0 +1,2 @@
|
|||||||
|
**.pdf filter=lfs diff=lfs merge=lfs -text
|
||||||
|
resources/cmudict-0.7b filter=lfs diff=lfs merge=lfs -text
|
@ -0,0 +1,15 @@
|
|||||||
|
/target
|
||||||
|
/classes
|
||||||
|
/checkouts
|
||||||
|
*.jar
|
||||||
|
*.class
|
||||||
|
/.calva/output-window/
|
||||||
|
/.cpcache
|
||||||
|
/.lein-*
|
||||||
|
/.lsp/sqlite*.db
|
||||||
|
/.nrepl-history
|
||||||
|
/.nrepl-port
|
||||||
|
/.rebel_readline_history
|
||||||
|
/.socket-repl-port
|
||||||
|
.hgignore
|
||||||
|
.hg/
|
@ -0,0 +1,24 @@
|
|||||||
|
# Change Log
|
||||||
|
All notable changes to this project will be documented in this file. This change log follows the conventions of [keepachangelog.com](http://keepachangelog.com/).
|
||||||
|
|
||||||
|
## [Unreleased]
|
||||||
|
### Changed
|
||||||
|
- Add a new arity to `make-widget-async` to provide a different widget shape.
|
||||||
|
|
||||||
|
## [0.1.1] - 2021-04-22
|
||||||
|
### Changed
|
||||||
|
- Documentation on how to make the widgets.
|
||||||
|
|
||||||
|
### Removed
|
||||||
|
- `make-widget-sync` - we're all async, all the time.
|
||||||
|
|
||||||
|
### Fixed
|
||||||
|
- Fixed widget maker to keep working when daylight savings switches over.
|
||||||
|
|
||||||
|
## 0.1.0 - 2021-04-22
|
||||||
|
### Added
|
||||||
|
- Files from the new template.
|
||||||
|
- Widget maker public API - `make-widget-sync`.
|
||||||
|
|
||||||
|
[Unreleased]: https://github.com/com.owoga/phonetics/compare/0.1.1...HEAD
|
||||||
|
[0.1.1]: https://github.com/com.owoga/phonetics/compare/0.1.0...0.1.1
|
@ -0,0 +1,21 @@
|
|||||||
|
MIT License
|
||||||
|
|
||||||
|
Copyright (c) 2021 Eric Ihli
|
||||||
|
|
||||||
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
of this software and associated documentation files (the "Software"), to deal
|
||||||
|
in the Software without restriction, including without limitation the rights
|
||||||
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
copies of the Software, and to permit persons to whom the Software is
|
||||||
|
furnished to do so, subject to the following conditions:
|
||||||
|
|
||||||
|
The above copyright notice and this permission notice shall be included in all
|
||||||
|
copies or substantial portions of the Software.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||||
|
SOFTWARE.
|
@ -0,0 +1,21 @@
|
|||||||
|
{:paths ["src" "resources"]
|
||||||
|
:deps {org.clojure/clojure {:mvn/version "1.10.3"}
|
||||||
|
net.sf.sociaal/freetts {:mvn/version "1.2.2"}}
|
||||||
|
:aliases
|
||||||
|
{:test {:extra-paths ["test"]
|
||||||
|
:extra-deps {org.clojure/test.check {:mvn/version "1.1.0"}}}
|
||||||
|
:runner
|
||||||
|
{:extra-deps {com.cognitect/test-runner
|
||||||
|
{:git/url "https://github.com/cognitect-labs/test-runner"
|
||||||
|
:sha "b6b3193fcc42659d7e46ecd1884a228993441182"}}
|
||||||
|
:main-opts ["-m" "cognitect.test-runner"
|
||||||
|
"-d" "test"]}
|
||||||
|
:jar {:replace-deps {com.github.seancorfield/depstar {:mvn/version "2.0.211"}}
|
||||||
|
:exec-fn hf.depstar/jar
|
||||||
|
:exec-args {:jar "phonetics.jar" :sync-pom true}}
|
||||||
|
:install {:replace-deps {slipset/deps-deploy {:mvn/version "0.1.5"}}
|
||||||
|
:exec-fn deps-deploy.deps-deploy/deploy
|
||||||
|
:exec-args {:installer :local :artifact "phonetics.jar"}}
|
||||||
|
:deploy {:replace-deps {slipset/deps-deploy {:mvn/version "0.1.5"}}
|
||||||
|
:exec-fn deps-deploy.deps-deploy/deploy
|
||||||
|
:exec-args {:installer :remote :artifact "phonetics.jar"}}}}
|
@ -0,0 +1,3 @@
|
|||||||
|
# Introduction to phonetics
|
||||||
|
|
||||||
|
TODO: write [great documentation](http://jacobian.org/writing/what-to-write/)
|
@ -0,0 +1,55 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||||
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
<groupId>com.owoga</groupId>
|
||||||
|
<artifactId>phonetics</artifactId>
|
||||||
|
<version>0.1.1</version>
|
||||||
|
<name>com.owoga/phonetics</name>
|
||||||
|
<description>Phonetics and syllabification of English words.</description>
|
||||||
|
<url>https://github.com/com.owoga/phonetics</url>
|
||||||
|
<licenses>
|
||||||
|
<license>
|
||||||
|
<name>MIT License</name>
|
||||||
|
<url>https://mit-license.org/</url>
|
||||||
|
</license>
|
||||||
|
</licenses>
|
||||||
|
<developers>
|
||||||
|
<developer>
|
||||||
|
<name>Eric Ihli</name>
|
||||||
|
</developer>
|
||||||
|
</developers>
|
||||||
|
<scm>
|
||||||
|
<url>https://github.com/com.owoga/phonetics</url>
|
||||||
|
<connection>scm:git:git://github.com/com.owoga/phonetics.git</connection>
|
||||||
|
<developerConnection>scm:git:ssh://git@github.com/com.owoga/phonetics.git</developerConnection>
|
||||||
|
<tag>v0.1.0-SNAPSHOT</tag>
|
||||||
|
</scm>
|
||||||
|
<dependencies>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.clojure</groupId>
|
||||||
|
<artifactId>clojure</artifactId>
|
||||||
|
<version>1.10.3</version>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>net.sf.sociaal</groupId>
|
||||||
|
<artifactId>freetts</artifactId>
|
||||||
|
<version>1.2.2</version>
|
||||||
|
</dependency>
|
||||||
|
</dependencies>
|
||||||
|
<build>
|
||||||
|
<sourceDirectory>src</sourceDirectory>
|
||||||
|
</build>
|
||||||
|
<repositories>
|
||||||
|
<repository>
|
||||||
|
<id>clojars</id>
|
||||||
|
<url>https://repo.clojars.org/</url>
|
||||||
|
</repository>
|
||||||
|
</repositories>
|
||||||
|
<distributionManagement>
|
||||||
|
<repository>
|
||||||
|
<id>clojars</id>
|
||||||
|
<name>Clojars repository</name>
|
||||||
|
<url>https://clojars.org/repo</url>
|
||||||
|
</repository>
|
||||||
|
</distributionManagement>
|
||||||
|
</project>
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -0,0 +1,204 @@
|
|||||||
|
(ns com.owoga.phonetics
|
||||||
|
(:require [clojure.set]
|
||||||
|
[clojure.string :as string]
|
||||||
|
[clojure.java.io :as io]
|
||||||
|
[clojure.set :as set])
|
||||||
|
(:import (com.sun.speech.freetts.en.us CMULexicon)))
|
||||||
|
|
||||||
|
;; From http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b.phones
|
||||||
|
|
||||||
|
(def phonemap
  "Maps each CMU dictionary phone (without stress marker) to the name of
  its articulation class: vowel, stop, affricate, aspirate, liquid, nasal,
  fricative or semivowel."
  {;; vowels
   "AA" "vowel", "AE" "vowel", "AH" "vowel", "AO" "vowel", "AW" "vowel",
   "AY" "vowel", "EH" "vowel", "ER" "vowel", "EY" "vowel", "IH" "vowel",
   "IY" "vowel", "OW" "vowel", "OY" "vowel", "UH" "vowel", "UW" "vowel",
   ;; stops
   "B" "stop", "D" "stop", "G" "stop", "K" "stop", "P" "stop", "T" "stop",
   ;; affricates
   "CH" "affricate", "JH" "affricate",
   ;; aspirate
   "HH" "aspirate",
   ;; liquids
   "L" "liquid", "R" "liquid",
   ;; nasals
   "M" "nasal", "N" "nasal", "NG" "nasal",
   ;; fricatives
   "DH" "fricative", "F" "fricative", "S" "fricative", "SH" "fricative",
   "TH" "fricative", "V" "fricative", "Z" "fricative", "ZH" "fricative",
   ;; semivowels
   "W" "semivowel", "Y" "semivowel"})
|
||||||
|
|
||||||
|
(def long-vowel
  "Vowel phones this library treats as long."
  #{"AY" "EY" "IY" "OW" "UW"})

(def short-vowel
  "Vowel phones this library treats as short."
  #{"AA" "AE" "AH" "AO" "AW" "EH" "ER" "IH" "OY" "UH"})

(def vowel
  "Every vowel phone: the long and short vowels together."
  (into long-vowel short-vowel))
|
||||||
|
|
||||||
|
(def consonant
  "Every phone in `phonemap` that is not a vowel."
  (into #{} (remove vowel) (keys phonemap)))

(def syllable-end
  "Union of the consonants and the long vowels."
  (into consonant long-vowel))
|
||||||
|
|
||||||
|
(def single-sound-bigram
  "Two-letter strings collected here as single-sound bigrams."
  #{"CH" "PH" "SH" "TH" "WH"})
|
||||||
|
|
||||||
|
(def cmu-word-to-stressed-phones-map
  "Map of lowercase English words to their phonetic sounding based on
  the CMU Pronouncing Dictionary at http://www.speech.cs.cmu.edu/cgi-bin/cmudict/

  Includes words with apostrophes, like possessive aaronson's.

  Words with multiple pronunciations have keys with a `(1)` or `(2)` after their
  duplicates, like [aaronsons(1) (AA1 R AH0 N S AH0 N Z)]

  Primary stress is indicated by a `1` after the phoneme. Secondary stress with a `2`.
  Unstressed with a `0`.

  The dictionary is read from the `cmudict-0.7b` classpath resource when this
  namespace is loaded. Each line is split on whitespace; the first token
  (lower-cased) becomes the key and the remaining tokens become the vector
  of phones."
  ;; with-open closes the reader once the map has been built; `into` is
  ;; eager, so every line is consumed before the reader is closed.
  ;; NOTE(review): io/reader decodes as UTF-8 by default -- confirm that
  ;; matches the encoding of the dictionary file.
  (with-open [rdr (io/reader (io/resource "cmudict-0.7b"))]
    (->> (line-seq rdr)
         ;; Only the leading run of lines starting with `;` is skipped.
         ;; Any later line starting with `;` is kept as an entry.
         (drop-while #(= \; (first %)))
         (map #(string/split % #"\s+"))
         (map (partial split-at 1))
         (map #(vector
                (string/lower-case
                 (first (first %)))
                (vec (second %))))
         (into {}))))
|
||||||
|
|
||||||
|
(def cmu-word-alternatives
  "Index from a base word to the sorted sequence of all of its keys in
  `cmu-word-to-stressed-phones-map`, i.e. the word itself plus any
  numbered variants.

  reputed -> reputed, reputed(1), reputed(2).

  A variant key such as reputed(1) says nothing about how it differs from
  reputed; the keys are only useful for looking up each pronunciation in
  the CMU dictionary map."
  (let [base-word (fn [dict-key] (string/replace dict-key #"\(\d\)" ""))]
    (reduce-kv
     (fn [index word variants]
       (assoc index word (sort variants)))
     {}
     (group-by base-word (keys cmu-word-to-stressed-phones-map)))))
|
||||||
|
|
||||||
|
(defn word-alternatives
  "Returns the dictionary keys recorded for `word` in
  `cmu-word-alternatives`, or nil when the word has no entry.

  reputed -> reputed, reputed(1), reputed(2).

  The returned keys can be looked up in `cmu-word-to-stressed-phones-map`
  to obtain each pronunciation."
  [word]
  (cmu-word-alternatives word))
|
||||||
|
|
||||||
|
(def stressed-phones-to-cmu-word-map
  "Reverse index of `cmu-word-to-stressed-phones-map`: maps a vector of
  stressed phones to a vector of every dictionary key that has exactly
  that pronunciation. The same sequence of phones can map to multiple words."
  (reduce
   (fn [index entry]
     (let [word   (key entry)
           phones (val entry)]
       (update index phones (fnil conj []) word)))
   {}
   cmu-word-to-stressed-phones-map))
|
||||||
|
|
||||||
|
(def cmu-word-to-unstressed-phones-map
  "Same keys as `cmu-word-to-stressed-phones-map`, but every digit has
  been removed from each phone, so the phones carry no stress marker."
  (into {}
        (map (fn [[word phones]]
               [word (mapv (fn [phone] (string/replace phone #"\d" "")) phones)]))
        cmu-word-to-stressed-phones-map))
|
||||||
|
|
||||||
|
(def unstressed-phones-to-cmu-word-map
  "There might be unstressed phones that can map
  to two different pronunciations when stress is added,
  so this maps unstressed phones to a vector of words that
  can be looked up in the CMU Pronouncing dictionary to
  see what their stressed phones are.

  Another example, look at how many words map to [N IY S].
  [[N IY S]
  [neice neece niece nice kneece kniess neiss neace niess]]

  Keys are vectors of phones with every digit removed, matching the
  vector keys used by `stressed-phones-to-cmu-word-map`."
  (reduce
   (fn [m [k v]]
     ;; mapv (not map) so the key is a realized vector rather than a lazy
     ;; seq; a vector and a seq of the same phones are equal and hash the
     ;; same, so existing lookups are unaffected.
     (let [v (mapv #(string/replace % #"\d" "") v)]
       (update m v (fnil conj []) k)))
   {}
   cmu-word-to-stressed-phones-map))
|
||||||
|
|
||||||
|
;; NOTE(review): this constructs a CMULexicon and discards the result at
;; namespace load time. It looks like a REPL leftover, but confirm the
;; constructor has no needed side effect before removing it.
(CMULexicon. "cmulex" true)
|
||||||
|
|
||||||
|
(def ^CMULexicon cmu-lexicon
  "FreeTTS CMULexicon instance, obtained with `(CMULexicon/getInstance true)`.

  Unlike the CMU Pronouncing Dictionary, the lexicon can produce phones for
  words that are not in the dictionary. Its output differs slightly:

  - The `AH` sound, as in `allow`, comes back as `ax`.
  - Unstressed vowels come back as the bare vowel, without a `0` suffix.

  Keep both differences in mind when mixing phones from the lexicon with
  phones from the dictionary."
  (CMULexicon/getInstance true))
|
||||||
|
|
||||||
|
(defn remove-stress
  "Returns a vector of `phonemes` with every digit removed from each one,
  e.g. [\"AH0\" \"L\" \"AW1\"] -> [\"AH\" \"L\" \"AW\"]."
  [phonemes]
  (into []
        (map (fn [phoneme] (string/replace phoneme #"\d" "")))
        phonemes))
|
||||||
|
|
||||||
|
(defn cmu-lexicon->cmu-pronouncing-dict
  "Converts phones as returned by the CMULexicon into the notation used by
  the CMU Pronouncing Dictionary. Returns a vector.

  For each phoneme:
  - `ax` (the `AH` sound, as in `allow`) is replaced with `ah`;
  - the phoneme is upper-cased;
  - if the result is a bare vowel (a member of `vowel`, so it has no stress
    digit) a `0` is appended to mark it unstressed."
  [phonemes]
  (letfn [(convert [phoneme]
            (let [upper (string/upper-case (if (= "ax" phoneme) "ah" phoneme))]
              (if (vowel upper)
                (str upper "0")
                upper)))]
    (mapv convert phonemes)))
|
||||||
|
|
||||||
|
(comment
|
||||||
|
(type (.getPhones cmu-lexicon "allow" nil)) ;; => [Ljava.lang.String;
|
||||||
|
(vec (.getPhones cmu-lexicon "allow" nil)) ;; => ["ax" "l" "aw1"]
|
||||||
|
(cmu-lexicon->cmu-pronouncing-dict
|
||||||
|
(.getPhones cmu-lexicon "allowance" nil))
|
||||||
|
;; => ["AH0" "L" "AW1" "AH0" "N" "S"]
|
||||||
|
(cmu-word-to-stressed-phones-map "allowance")
|
||||||
|
;; => ["AH0" "L" "AW1" "AH0" "N" "S"]
|
||||||
|
)
|
||||||
|
|
||||||
|
(defn get-phones
  "Returns a vector of all known pronunciations of `word`, each one a
  vector of phones.

  The CMU Pronouncing Dictionary is consulted first: every alternative key
  of `word` is looked up in `cmu-word-to-stressed-phones-map`. If that
  yields nothing, the CMULexicon is asked for phones and its answer is
  converted with `cmu-lexicon->cmu-pronouncing-dict`, giving a vector with
  a single pronunciation.

  Input must be lower-case."
  [word]
  (let [dictionary-phones (mapv cmu-word-to-stressed-phones-map
                                (word-alternatives word))]
    (if (empty? dictionary-phones)
      [(cmu-lexicon->cmu-pronouncing-dict
        (.getPhones cmu-lexicon word nil))]
      dictionary-phones)))
|
||||||
|
|
||||||
|
(defn get-word
  "Returns the vector of dictionary keys whose pronunciation is `phones`,
  or nil when there is none.

  If any phone ends in a digit the phones are treated as stressed and are
  looked up in `stressed-phones-to-cmu-word-map`; otherwise they are looked
  up in `unstressed-phones-to-cmu-word-map`."
  [phones]
  (let [ends-in-digit? (fn [phone] (re-matches #".*\d" phone))
        index          (if (some ends-in-digit? phones)
                         stressed-phones-to-cmu-word-map
                         unstressed-phones-to-cmu-word-map)]
    (index phones)))
|
@ -0,0 +1,135 @@
|
|||||||
|
(ns com.owoga.phonetics.syllabify
|
||||||
|
(:require [com.owoga.phonetics :as phonetics]
|
||||||
|
[com.owoga.phonetics.util :as util]
|
||||||
|
[clojure.string :as string]))
|
||||||
|
|
||||||
|
(set! *warn-on-reflection* true)
|
||||||
|
|
||||||
|
;; This sonority hierarchy may not be perfect.
|
||||||
|
;; It stems from: http://www.glottopedia.org/index.php/Sonority_hierarchy
|
||||||
|
;; I tried to match the phones provided by the CMU dict to the hierarchies
|
||||||
|
;; listed on that page:
|
||||||
|
;; vowels > liquids > nasals > voiced fricatives
|
||||||
|
;; > voiceless fricatives = voiced plosives
|
||||||
|
;; > voiceless plosives (Anderson & Ewen 1987)
|
||||||
|
(def ^clojure.lang.PersistentVector sonority-hierarchy
  "Articulation classes ordered from most sonorous (index 0) to least
  sonorous (last index). The class names are the values of
  `phonetics/phonemap`."
  ;; more sonorous < < < vowel < < < (maximal onset) vowel > > > less sonorous
  ["vowel"
   "liquid"
   "semivowel"
   "aspirate"
   "affricate"
   "nasal"
   "fricative"
   "stop"])
|
||||||
|
|
||||||
|
(def lax-vowels
  "Vowel phones treated as lax by the syllabifier."
  #{"AE" "AH" "EH" "IH" "UH"})
|
||||||
|
|
||||||
|
(defn sonority
  "Returns the index of `phone`'s articulation class in
  `sonority-hierarchy`; a lower index means more sonorous. The class is
  looked up in `phonetics/phonemap`; the result is -1 when that class is
  not an element of the hierarchy (java.util.List indexOf contract)."
  [phone]
  (let [articulation (get phonetics/phonemap phone)]
    (.indexOf sonority-hierarchy articulation)))
|
||||||
|
|
||||||
|
(defn vowel?
  "Returns `phone` itself (truthy) when it is a member of
  `phonetics/vowel`, otherwise nil."
  [phone]
  (get phonetics/vowel phone))
|
||||||
|
|
||||||
|
(def consonant?
  "Logical negation of `vowel?`: true for anything that is not a vowel phone."
  (comp not vowel?))
|
||||||
|
|
||||||
|
(defn >sonorous
  "True when phone `a` is more sonorous than phone `b`, i.e. `a` has the
  smaller `sonority` index."
  [a b]
  (let [rank-a (sonority a)
        rank-b (sonority b)]
    (< rank-a rank-b)))
|
||||||
|
|
||||||
|
(defn <sonorous
  "True when phone `a` is less sonorous than phone `b`, i.e. `a` has the
  larger `sonority` index."
  [a b]
  (let [rank-a (sonority a)
        rank-b (sonority b)]
    (> rank-a rank-b)))
|
||||||
|
|
||||||
|
(defn slurp-rime
  "Takes phones up to and including the first vowel and returns a pair
  [rime remaining]: `rime` is that leading group in reversed order, and
  `remaining` is every phone after it, in the original order. Both are
  vectors."
  [phones]
  (let [[through-vowel & leftover] (util/take-through vowel? phones)]
    [(vec (reverse through-vowel))
     (vec (flatten leftover))]))
|
||||||
|
|
||||||
|
(comment
|
||||||
|
(slurp-rime ["AH" "K" "S" "AE" "L" "AH"])
|
||||||
|
;; => [["AH"] ["K" "S" "AE" "L" "AH"]]
|
||||||
|
(slurp-rime ["K" "S" "AE" "L" "AH"])
|
||||||
|
;; => [["AE" "S" "K"] ["L" "AH"]]
|
||||||
|
)
|
||||||
|
|
||||||
|
(defn slurp-onset-given-rime
  "Extends `rime` into a full syllable by pulling onset phones off the
  front of `phones`.

  `phones` is a vector of the remaining phones in reversed (end-of-word
  first) order, which is the natural direction for applying the maximal
  onset principle. `rime` is a vector in forward order. `phones` must be a
  vector because it is sliced with `subvec`.

  Returns a vector of the syllable and the remaining phones to process."
  [phones rime]
  (loop [pending phones
         built   rime]
    (if (and
         ;; Nothing left to consume means the syllable is complete.
         (not (empty? pending))
         ;; Two vowels next to each other are treated as two syllables.
         ;; This might not always be right if the vowels are lax.
         ;; Is "royal" one syllable or two? This treats it as two.
         (not (vowel? (nth pending 0)))
         (or
          ;; Maximal onset principle, with an exception for lax vowels
          ;; occurring in closed syllables.
          (and (consonant? (nth built 0))
               (<sonorous (nth pending 0) (nth built 0))
               (not (lax-vowels (nth pending 1 nil))))
          ;; A syllable that still starts with its vowel always takes the
          ;; next consonant as onset.
          (vowel? (nth built 0))))
      (recur (subvec pending 1)
             (into [(nth pending 0)] built))
      [built pending])))
|
||||||
|
|
||||||
|
(comment
  ;; Remaining (reversed) phones of "alaska" after the final rime ["AH"]
  ;; has been taken.
  (slurp-onset-given-rime
   ["K" "S" "AE" "L" "AH"]
   ["AH"])
  ;; => [["K" "AH"] ["S" "AE" "L" "AH"]]

  )
|
||||||
|
(defn apply-stress
  "Regroups `stressed-phones` into syllables that have the same lengths as
  `unstressed-syllables`.

  `unstressed-syllables` is a sequence of syllables (each a sequence of
  phones) and `stressed-phones` is the flat sequence of the same phones
  with their stress markers. Only the length of each unstressed syllable
  is used; the phones themselves are not compared.

  Returns a vector of vectors, e.g.
  (apply-stress '((\"AH\") (\"L\" \"AE\" \"S\") (\"K\" \"AH\"))
                '(\"AH0\" \"L\" \"AE1\" \"S\" \"K\" \"AH0\"))
  ;; => [[\"AH0\"] [\"L\" \"AE1\" \"S\"] [\"K\" \"AH0\"]]

  Returns [[]] when `stressed-phones` is empty.

  Throws ex-info when the syllables are used up while stressed phones
  remain (previously this case looped forever)."
  [unstressed-syllables stressed-phones]
  (loop [unstressed-syllables unstressed-syllables
         stressed-phones stressed-phones
         result-syllables [[]]]
    (cond
      ;; Every stressed phone has been placed.
      (empty? stressed-phones)
      result-syllables

      ;; Phones remain but there is no syllable left to put them in.
      (empty? unstressed-syllables)
      (throw (ex-info "More stressed phones than phones in the unstressed syllables."
                      {:remaining-phones (vec stressed-phones)
                       :syllables-so-far result-syllables}))

      ;; Current syllable is full: start a new, empty result syllable.
      (empty? (first unstressed-syllables))
      (recur (rest unstressed-syllables)
             stressed-phones
             (conj result-syllables []))

      ;; Move one stressed phone into the current result syllable and
      ;; drop one phone from the current unstressed syllable.
      :else
      (recur
       (cons (rest (first unstressed-syllables))
             (rest unstressed-syllables))
       (rest stressed-phones)
       (conj (pop result-syllables)
             (conj (peek result-syllables) (first stressed-phones)))))))
|
||||||
|
|
||||||
|
(comment
|
||||||
|
(apply-stress '(("AH") ("L" "AE" "S") ("K" "AH"))
|
||||||
|
'("AH0" "L" "AE1" "S" "K" "AH0"))
|
||||||
|
;; => [["AH0"] ["L" "AE1" "S"] ["K" "AH0"]]
|
||||||
|
|
||||||
|
)
|
||||||
|
|
||||||
|
(defn syllabify
  "Splits `original-phones` into syllables and returns them as a vector
  of vectors, with the original stress markers preserved.

  The phones are reversed and stripped of stress before being segmented,
  because it is easier to work backwards: the final syllable is always the
  trailing consonants (if any) together with the vowel before them. Each
  pass takes a rime with `slurp-rime`, extends it with an onset using
  `slurp-onset-given-rime`, and puts the finished syllable in front of the
  ones already found. Once no phones remain, `apply-stress` maps the
  original (stressed) phones onto the syllable boundaries."
  [original-phones]
  (loop [remaining (phonetics/remove-stress (reverse original-phones))
         syllables []]
    (if (seq remaining)
      (let [[rime after-rime]      (slurp-rime remaining)
            [syllable after-onset] (slurp-onset-given-rime after-rime rime)]
        (recur after-onset (vec (cons syllable syllables))))
      (apply-stress syllables original-phones))))
|
||||||
|
|
||||||
|
(comment
|
||||||
|
(phonetics/remove-stress ["AH" "L" "AE" "S" "K" "AH"])
|
||||||
|
(slurp-onset-given-rime ["L" "AE" "S" "K" "AH"] ["AH"])
|
||||||
|
(syllabify ["AH0" "L" "AE1" "S" "K" "AH0"])
|
||||||
|
|
||||||
|
)
|
@ -0,0 +1,19 @@
|
|||||||
|
(ns com.owoga.phonetics.util)
|
||||||
|
|
||||||
|
(defn take-through
  "Splits `coll` into consecutive groups, each of which ends with (and
  includes) an element for which `pred` is truthy. Elements after the last
  match form a final group of their own.

  (take-through even? [1 2 3 4 7 7 5 2 8 10])
  returns '((1 2) (3 4) (7 7 5 2) (8) (10))

  (take-through even? [1 2 3])
  returns '((1 2) (3))

  Returns an empty list when `coll` is empty. The first group is found
  eagerly; the groups after it are produced lazily."
  [pred coll]
  (loop [coll coll
         ;; Elements of the current group, most recent first.
         acc '()]
    (cond
      ;; Input exhausted: emit the unfinished group, if there is one.
      (empty? coll)
      (if (empty? acc) acc (list (reverse acc)))

      ;; Matching element closes the current group; the rest of the input
      ;; is split lazily.
      (pred (first coll))
      (let [acc (cons (first coll) acc)]
        (lazy-seq (cons (reverse acc) (take-through pred (rest coll)))))

      :else
      (recur (rest coll)
             (cons (first coll) acc)))))
|
@ -0,0 +1,41 @@
|
|||||||
|
(ns com.owoga.phonetics.syllabify-test
|
||||||
|
(:require [clojure.test :refer :all]
|
||||||
|
[com.owoga.phonetics.syllabify :refer :all]))
|
||||||
|
|
||||||
|
(deftest syllabification-test
  ;; Each row is [description expected-syllables input-phones].
  (doseq [[description expected phones]
          '[["alaska"
             (("AH") ("L" "AE" "S") ("K" "AH"))
             ("AH" "L" "AE" "S" "K" "AH")]
            ["parentheses"
             (("P" "ER") ("IH" "N") ("TH" "UH") ("S" "IY" "S"))
             ("P" "ER" "IH" "N" "TH" "UH" "S" "IY" "S")]
            ["herald"
             (("H" "ER") ("AH" "L" "D"))
             ("H" "ER" "AH" "L" "D")]
            ["royal with cheese"
             (("R" "OY") ("AH" "L") ("W" "IH" "TH") ("CH" "IY" "Z"))
             ["R" "OY" "AH" "L" "W" "IH" "TH" "CH" "IY" "Z"]]
            ["uprising"
             (("UH" "P") ("R" "AY") ("S" "IY" "NG"))
             ["UH" "P" "R" "AY" "S" "IY" "NG"]]
            ["glimpstred"
             (("G" "L" "IH" "M" "P" "S") ("T" "R" "EH" "D"))
             ["G" "L" "IH" "M" "P" "S" "T" "R" "EH" "D"]]
            ["boink"
             (("B" "OY" "N" "K"))
             ["B" "OY" "N" "K"]]
            ;; Lax vowels can only occur in closed syllables.
            ["elipsis"
             (("IY") ("L" "IH" "P") ("S" "IH" "S"))
             ["IY" "L" "IH" "P" "S" "IH" "S"]]]]
    (testing description
      (is (= expected (syllabify phones)))))
  ;; http://www.glottopedia.org/index.php/Maximal_Onset_Principle
  (testing "maximal onset principle"
    (testing "diploma"
      (let [expected '(("D" "IH" "P") ("L" "OW") ("M" "AH"))]
        (is (= expected
               (syllabify ["D" "IH" "P" "L" "OW" "M" "AH"]))))))
  ;; http://www.glottopedia.org/index.php/Ambisyllabic
  ;; Since we are syllabifying phones, we don't need to worry
  ;; about handling ambisyllabic words. There's no such thing.
  (testing "pillow"
    (let [expected '(("P" "IH") ("L" "OW"))]
      (is (= expected
             (syllabify ["P" "IH" "L" "OW"]))))))
|
@ -0,0 +1,16 @@
|
|||||||
|
(ns com.owoga.phonetics-test
|
||||||
|
(:require [clojure.test :refer :all]
|
||||||
|
[com.owoga.phonetics :refer :all]))
|
||||||
|
|
||||||
|
(deftest phonetics-test
  (testing "word to phones"
    (let [expected [["HH" "AH0" "L" "OW1"]
                    ["HH" "EH0" "L" "OW1"]]]
      (is (= expected (get-phones "hello")))))
  (testing "phones to word"
    ;; Stressed lookup, unstressed lookup, and phones shared by two words.
    (are [words phones] (= words (get-word phones))
      ["hello(1)"]           ["HH" "EH0" "L" "OW1"]
      ["hello(1)"]           ["HH" "EH" "L" "OW"]
      ["ensure(1)" "insure"] ["IH" "N" "SH" "UH" "R"])))
|
Loading…
Reference in New Issue