Remove unused file
parent
4433ca6ca5
commit
01280ff29c
@ -1,279 +0,0 @@
|
|||||||
#!/usr/bin/env python
|
|
||||||
# Copyright (c) 2012-2013 Kyle Gorman <gormanky@ohsu.edu>
|
|
||||||
#
|
|
||||||
# Permission is hereby granted, free of charge, to any person obtaining a
|
|
||||||
# copy of this software and associated documentation files (the
|
|
||||||
# "Software"), to deal in the Software without restriction, including
|
|
||||||
# without limitation the rights to use, copy, modify, merge, publish,
|
|
||||||
# distribute, sublicense, and/or sell copies of the Software, and to
|
|
||||||
# permit persons to whom the Software is furnished to do so, subject to
|
|
||||||
# the following conditions:
|
|
||||||
#
|
|
||||||
# The above copyright notice and this permission notice shall be included
|
|
||||||
# in all copies or substantial portions of the Software.
|
|
||||||
#
|
|
||||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
|
|
||||||
# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
|
||||||
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
|
||||||
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
|
||||||
# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
|
||||||
# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
|
||||||
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
||||||
#
|
|
||||||
# syllabify.py: prosodic parsing of ARPABET entries
|
|
||||||
|
|
||||||
from itertools import chain
|
|
||||||
|
|
||||||
# constants
|
|
||||||
SLAX = {
|
|
||||||
"IH1",
|
|
||||||
"IH2",
|
|
||||||
"EH1",
|
|
||||||
"EH2",
|
|
||||||
"AE1",
|
|
||||||
"AE2",
|
|
||||||
"AH1",
|
|
||||||
"AH2",
|
|
||||||
"UH1",
|
|
||||||
"UH2",
|
|
||||||
}
|
|
||||||
VOWELS = {
|
|
||||||
"IY1",
|
|
||||||
"IY2",
|
|
||||||
"IY0",
|
|
||||||
"EY1",
|
|
||||||
"EY2",
|
|
||||||
"EY0",
|
|
||||||
"AA1",
|
|
||||||
"AA2",
|
|
||||||
"AA0",
|
|
||||||
"ER1",
|
|
||||||
"ER2",
|
|
||||||
"ER0",
|
|
||||||
"AW1",
|
|
||||||
"AW2",
|
|
||||||
"AW0",
|
|
||||||
"AO1",
|
|
||||||
"AO2",
|
|
||||||
"AO0",
|
|
||||||
"AY1",
|
|
||||||
"AY2",
|
|
||||||
"AY0",
|
|
||||||
"OW1",
|
|
||||||
"OW2",
|
|
||||||
"OW0",
|
|
||||||
"OY1",
|
|
||||||
"OY2",
|
|
||||||
"OY0",
|
|
||||||
"IH0",
|
|
||||||
"EH0",
|
|
||||||
"AE0",
|
|
||||||
"AH0",
|
|
||||||
"UH0",
|
|
||||||
"UW1",
|
|
||||||
"UW2",
|
|
||||||
"UW0",
|
|
||||||
"UW",
|
|
||||||
"IY",
|
|
||||||
"EY",
|
|
||||||
"AA",
|
|
||||||
"ER",
|
|
||||||
"AW",
|
|
||||||
"AO",
|
|
||||||
"AY",
|
|
||||||
"OW",
|
|
||||||
"OY",
|
|
||||||
"UH",
|
|
||||||
"IH",
|
|
||||||
"EH",
|
|
||||||
"AE",
|
|
||||||
"AH",
|
|
||||||
"UH",
|
|
||||||
} | SLAX
|
|
||||||
|
|
||||||
# licit medial onsets
|
|
||||||
|
|
||||||
O2 = {
|
|
||||||
("P", "R"),
|
|
||||||
("T", "R"),
|
|
||||||
("K", "R"),
|
|
||||||
("B", "R"),
|
|
||||||
("D", "R"),
|
|
||||||
("G", "R"),
|
|
||||||
("F", "R"),
|
|
||||||
("TH", "R"),
|
|
||||||
("P", "L"),
|
|
||||||
("K", "L"),
|
|
||||||
("B", "L"),
|
|
||||||
("G", "L"),
|
|
||||||
("F", "L"),
|
|
||||||
("S", "L"),
|
|
||||||
("K", "W"),
|
|
||||||
("G", "W"),
|
|
||||||
("S", "W"),
|
|
||||||
("S", "P"),
|
|
||||||
("S", "T"),
|
|
||||||
("S", "K"),
|
|
||||||
("HH", "Y"), # "clerihew"
|
|
||||||
("R", "W"),
|
|
||||||
}
|
|
||||||
O3 = {("S", "T", "R"), ("S", "K", "L"), ("T", "R", "W")} # "octroi"
|
|
||||||
|
|
||||||
# This does not represent anything like a complete list of onsets, but
|
|
||||||
# merely those that need to be maximized in medial position.
|
|
||||||
|
|
||||||
|
|
||||||
def syllabify(pron, alaska_rule=True):
|
|
||||||
"""
|
|
||||||
Syllabifies a CMU dictionary (ARPABET) word string
|
|
||||||
|
|
||||||
# Alaska rule:
|
|
||||||
>>> pprint(syllabify('AH0 L AE1 S K AH0'.split())) # Alaska
|
|
||||||
'-AH0-.L-AE1-S.K-AH0-'
|
|
||||||
>>> pprint(syllabify('AH0 L AE1 S K AH0'.split(), 0)) # Alaska
|
|
||||||
'-AH0-.L-AE1-.S K-AH0-'
|
|
||||||
|
|
||||||
# huge medial onsets:
|
|
||||||
>>> pprint(syllabify('M IH1 N S T R AH0 L'.split())) # minstrel
|
|
||||||
'M-IH1-N.S T R-AH0-L'
|
|
||||||
>>> pprint(syllabify('AA1 K T R W AA0 R'.split())) # octroi
|
|
||||||
'-AA1-K.T R W-AA0-R'
|
|
||||||
|
|
||||||
# destressing
|
|
||||||
>>> pprint(destress(syllabify('M IH1 L AH0 T EH2 R IY0'.split())))
|
|
||||||
'M-IH-.L-AH-.T-EH-.R-IY-'
|
|
||||||
|
|
||||||
# normal treatment of 'j':
|
|
||||||
>>> pprint(syllabify('M EH1 N Y UW0'.split())) # menu
|
|
||||||
'M-EH1-N.Y-UW0-'
|
|
||||||
>>> pprint(syllabify('S P AE1 N Y AH0 L'.split())) # spaniel
|
|
||||||
'S P-AE1-N.Y-AH0-L'
|
|
||||||
>>> pprint(syllabify('K AE1 N Y AH0 N'.split())) # canyon
|
|
||||||
'K-AE1-N.Y-AH0-N'
|
|
||||||
>>> pprint(syllabify('M IH0 N Y UW2 EH1 T'.split())) # minuet
|
|
||||||
'M-IH0-N.Y-UW2-.-EH1-T'
|
|
||||||
>>> pprint(syllabify('JH UW1 N Y ER0'.split())) # junior
|
|
||||||
'JH-UW1-N.Y-ER0-'
|
|
||||||
>>> pprint(syllabify('K L EH R IH HH Y UW'.split())) # clerihew
|
|
||||||
'K L-EH-.R-IH-.HH Y-UW-'
|
|
||||||
|
|
||||||
# nuclear treatment of 'j'
|
|
||||||
>>> pprint(syllabify('R EH1 S K Y UW0'.split())) # rescue
|
|
||||||
'R-EH1-S.K-Y UW0-'
|
|
||||||
>>> pprint(syllabify('T R IH1 B Y UW0 T'.split())) # tribute
|
|
||||||
'T R-IH1-B.Y-UW0-T'
|
|
||||||
>>> pprint(syllabify('N EH1 B Y AH0 L AH0'.split())) # nebula
|
|
||||||
'N-EH1-B.Y-AH0-.L-AH0-'
|
|
||||||
>>> pprint(syllabify('S P AE1 CH UH0 L AH0'.split())) # spatula
|
|
||||||
'S P-AE1-.CH-UH0-.L-AH0-'
|
|
||||||
>>> pprint(syllabify('AH0 K Y UW1 M AH0 N'.split())) # acumen
|
|
||||||
'-AH0-K.Y-UW1-.M-AH0-N'
|
|
||||||
>>> pprint(syllabify('S AH1 K Y AH0 L IH0 N T'.split())) # succulent
|
|
||||||
'S-AH1-K.Y-AH0-.L-IH0-N T'
|
|
||||||
>>> pprint(syllabify('F AO1 R M Y AH0 L AH0'.split())) # formula
|
|
||||||
'F-AO1 R-M.Y-AH0-.L-AH0-'
|
|
||||||
>>> pprint(syllabify('V AE1 L Y UW0'.split())) # value
|
|
||||||
'V-AE1-L.Y-UW0-'
|
|
||||||
|
|
||||||
# everything else
|
|
||||||
>>> pprint(syllabify('N AO0 S T AE1 L JH IH0 K'.split())) # nostalgic
|
|
||||||
'N-AO0-.S T-AE1-L.JH-IH0-K'
|
|
||||||
>>> pprint(syllabify('CH ER1 CH M AH0 N'.split())) # churchmen
|
|
||||||
'CH-ER1-CH.M-AH0-N'
|
|
||||||
>>> pprint(syllabify('K AA1 M P AH0 N S EY2 T'.split())) # compensate
|
|
||||||
'K-AA1-M.P-AH0-N.S-EY2-T'
|
|
||||||
>>> pprint(syllabify('IH0 N S EH1 N S'.split())) # inCENSE
|
|
||||||
'-IH0-N.S-EH1-N S'
|
|
||||||
>>> pprint(syllabify('IH1 N S EH2 N S'.split())) # INcense
|
|
||||||
'-IH1-N.S-EH2-N S'
|
|
||||||
>>> pprint(syllabify('AH0 S EH1 N D'.split())) # ascend
|
|
||||||
'-AH0-.S-EH1-N D'
|
|
||||||
>>> pprint(syllabify('R OW1 T EY2 T'.split())) # rotate
|
|
||||||
'R-OW1-.T-EY2-T'
|
|
||||||
>>> pprint(syllabify('AA1 R T AH0 S T'.split())) # artist
|
|
||||||
'-AA1 R-.T-AH0-S T'
|
|
||||||
>>> pprint(syllabify('AE1 K T ER0'.split())) # actor
|
|
||||||
'-AE1-K.T-ER0-'
|
|
||||||
>>> pprint(syllabify('P L AE1 S T ER0'.split())) # plaster
|
|
||||||
'P L-AE1-S.T-ER0-'
|
|
||||||
>>> pprint(syllabify('B AH1 T ER0'.split())) # butter
|
|
||||||
'B-AH1-.T-ER0-'
|
|
||||||
>>> pprint(syllabify('K AE1 M AH0 L'.split())) # camel
|
|
||||||
'K-AE1-.M-AH0-L'
|
|
||||||
>>> pprint(syllabify('AH1 P ER0'.split())) # upper
|
|
||||||
'-AH1-.P-ER0-'
|
|
||||||
>>> pprint(syllabify('B AH0 L UW1 N'.split())) # balloon
|
|
||||||
'B-AH0-.L-UW1-N'
|
|
||||||
>>> pprint(syllabify('P R OW0 K L EY1 M'.split())) # proclaim
|
|
||||||
'P R-OW0-.K L-EY1-M'
|
|
||||||
>>> pprint(syllabify('IH0 N S EY1 N'.split())) # insane
|
|
||||||
'-IH0-N.S-EY1-N'
|
|
||||||
>>> pprint(syllabify('IH0 K S K L UW1 D'.split())) # exclude
|
|
||||||
'-IH0-K.S K L-UW1-D'
|
|
||||||
"""
|
|
||||||
## main pass
|
|
||||||
mypron = list(pron)
|
|
||||||
nuclei = []
|
|
||||||
onsets = []
|
|
||||||
i = -1
|
|
||||||
for (j, seg) in enumerate(mypron):
|
|
||||||
if seg in VOWELS:
|
|
||||||
nuclei.append([seg])
|
|
||||||
onsets.append(mypron[i + 1 : j]) # actually interludes, r.n.
|
|
||||||
i = j
|
|
||||||
codas = [mypron[i + 1 :]]
|
|
||||||
## resolve disputes and compute coda
|
|
||||||
for i in range(1, len(onsets)):
|
|
||||||
coda = []
|
|
||||||
# boundary cases
|
|
||||||
if len(onsets[i]) > 1 and onsets[i][0] == "R":
|
|
||||||
nuclei[i - 1].append(onsets[i].pop(0))
|
|
||||||
if len(onsets[i]) > 2 and onsets[i][-1] == "Y":
|
|
||||||
nuclei[i].insert(0, onsets[i].pop())
|
|
||||||
if (
|
|
||||||
len(onsets[i]) > 1
|
|
||||||
and alaska_rule
|
|
||||||
and nuclei[i - 1][-1] in SLAX
|
|
||||||
and onsets[i][0] == "S"
|
|
||||||
):
|
|
||||||
coda.append(onsets[i].pop(0))
|
|
||||||
# onset maximization
|
|
||||||
depth = 1
|
|
||||||
if len(onsets[i]) > 1:
|
|
||||||
if tuple(onsets[i][-2:]) in O2:
|
|
||||||
depth = 3 if tuple(onsets[i][-3:]) in O3 else 2
|
|
||||||
for j in range(len(onsets[i]) - depth):
|
|
||||||
coda.append(onsets[i].pop(0))
|
|
||||||
# store coda
|
|
||||||
codas.insert(i - 1, coda)
|
|
||||||
|
|
||||||
## verify that all segments are included in the ouput
|
|
||||||
output = list(zip(onsets, nuclei, codas)) # in Python3 zip is a generator
|
|
||||||
flat_output = list(chain.from_iterable(chain.from_iterable(output)))
|
|
||||||
if flat_output != mypron:
|
|
||||||
raise ValueError(f"could not syllabify {mypron}, got {flat_output}")
|
|
||||||
return output
|
|
||||||
|
|
||||||
|
|
||||||
def pprint(syllab):
|
|
||||||
"""
|
|
||||||
Pretty-print a syllabification
|
|
||||||
"""
|
|
||||||
return ".".join("-".join(" ".join(p) for p in syl) for syl in syllab)
|
|
||||||
|
|
||||||
|
|
||||||
def destress(syllab):
|
|
||||||
"""
|
|
||||||
Generate a syllabification with nuclear stress information removed
|
|
||||||
"""
|
|
||||||
syls = []
|
|
||||||
for (onset, nucleus, coda) in syllab:
|
|
||||||
nuke = [p[:-1] if p[-1] in {"0", "1", "2"} else p for p in nucleus]
|
|
||||||
syls.append((onset, nuke, coda))
|
|
||||||
return syls
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
import doctest
|
|
||||||
|
|
||||||
doctest.testmod()
|
|
Loading…
Reference in New Issue