# prhyme/syllabify.py

#!/usr/bin/env python
# Copyright (c) 2012-2013 Kyle Gorman <gormanky@ohsu.edu>
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#
# syllabify.py: prosodic parsing of ARPABET entries

from itertools import chain

# constants
SLAX = {
    "IH1",
    "IH2",
    "EH1",
    "EH2",
    "AE1",
    "AE2",
    "AH1",
    "AH2",
    "UH1",
    "UH2",
}
VOWELS = {
    "IY1",
    "IY2",
    "IY0",
    "EY1",
    "EY2",
    "EY0",
    "AA1",
    "AA2",
    "AA0",
    "ER1",
    "ER2",
    "ER0",
    "AW1",
    "AW2",
    "AW0",
    "AO1",
    "AO2",
    "AO0",
    "AY1",
    "AY2",
    "AY0",
    "OW1",
    "OW2",
    "OW0",
    "OY1",
    "OY2",
    "OY0",
    "IH0",
    "EH0",
    "AE0",
    "AH0",
    "UH0",
    "UW1",
    "UW2",
    "UW0",
    "UW",
    "IY",
    "EY",
    "AA",
    "ER",
    "AW",
    "AO",
    "AY",
    "OW",
    "OY",
    "UH",
    "IH",
    "EH",
    "AE",
    "AH",
    "UH",
} | SLAX

# licit medial onsets

O2 = {
    ("P", "R"),
    ("T", "R"),
    ("K", "R"),
    ("B", "R"),
    ("D", "R"),
    ("G", "R"),
    ("F", "R"),
    ("TH", "R"),
    ("P", "L"),
    ("K", "L"),
    ("B", "L"),
    ("G", "L"),
    ("F", "L"),
    ("S", "L"),
    ("K", "W"),
    ("G", "W"),
    ("S", "W"),
    ("S", "P"),
    ("S", "T"),
    ("S", "K"),
    ("HH", "Y"),  # "clerihew"
    ("R", "W"),
}
O3 = {("S", "T", "R"), ("S", "K", "L"), ("T", "R", "W")}  # "octroi"

# O2 and O3 are not anything like a complete list of onsets; they contain
# only the clusters that need to be maximized in medial position.


def syllabify(pron, alaska_rule=True):
    """
    Syllabifies a CMU dictionary (ARPABET) word string

    # Alaska rule:
    >>> pprint(syllabify('AH0 L AE1 S K AH0'.split())) # Alaska
    '-AH0-.L-AE1-S.K-AH0-'
    >>> pprint(syllabify('AH0 L AE1 S K AH0'.split(), 0)) # Alaska
    '-AH0-.L-AE1-.S K-AH0-'

    # huge medial onsets:
    >>> pprint(syllabify('M IH1 N S T R AH0 L'.split())) # minstrel
    'M-IH1-N.S T R-AH0-L'
    >>> pprint(syllabify('AA1  K T R W AA0 R'.split())) # octroi
    '-AA1-K.T R W-AA0-R'

    # destressing
    >>> pprint(destress(syllabify('M IH1 L AH0 T EH2 R IY0'.split())))
    'M-IH-.L-AH-.T-EH-.R-IY-'

    # normal treatment of 'j':
    >>> pprint(syllabify('M EH1 N Y UW0'.split())) # menu
    'M-EH1-N.Y-UW0-'
    >>> pprint(syllabify('S P AE1 N Y AH0 L'.split())) # spaniel
    'S P-AE1-N.Y-AH0-L'
    >>> pprint(syllabify('K AE1 N Y AH0 N'.split())) # canyon
    'K-AE1-N.Y-AH0-N'
    >>> pprint(syllabify('M IH0 N Y UW2 EH1 T'.split())) # minuet
    'M-IH0-N.Y-UW2-.-EH1-T'
    >>> pprint(syllabify('JH UW1 N Y ER0'.split())) # junior
    'JH-UW1-N.Y-ER0-'
    >>> pprint(syllabify('K L EH R IH HH Y UW'.split())) # clerihew
    'K L-EH-.R-IH-.HH Y-UW-'

    # nuclear treatment of 'j'
    >>> pprint(syllabify('R EH1 S K Y UW0'.split())) # rescue
    'R-EH1-S.K-Y UW0-'
    >>> pprint(syllabify('T R IH1 B Y UW0 T'.split())) # tribute
    'T R-IH1-B.Y-UW0-T'
    >>> pprint(syllabify('N EH1 B Y AH0 L AH0'.split())) # nebula
    'N-EH1-B.Y-AH0-.L-AH0-'
    >>> pprint(syllabify('S P AE1 CH UH0 L AH0'.split())) # spatula
    'S P-AE1-.CH-UH0-.L-AH0-'
    >>> pprint(syllabify('AH0 K Y UW1 M AH0 N'.split())) # acumen
    '-AH0-K.Y-UW1-.M-AH0-N'
    >>> pprint(syllabify('S AH1 K Y AH0 L IH0 N T'.split())) # succulent
    'S-AH1-K.Y-AH0-.L-IH0-N T'
    >>> pprint(syllabify('F AO1 R M Y AH0 L AH0'.split())) # formula
    'F-AO1 R-M.Y-AH0-.L-AH0-'
    >>> pprint(syllabify('V AE1 L Y UW0'.split())) # value
    'V-AE1-L.Y-UW0-'

    # everything else
    >>> pprint(syllabify('N AO0 S T AE1 L JH IH0 K'.split())) # nostalgic
    'N-AO0-.S T-AE1-L.JH-IH0-K'
    >>> pprint(syllabify('CH ER1 CH M AH0 N'.split())) # churchmen
    'CH-ER1-CH.M-AH0-N'
    >>> pprint(syllabify('K AA1 M P AH0 N S EY2 T'.split())) # compensate
    'K-AA1-M.P-AH0-N.S-EY2-T'
    >>> pprint(syllabify('IH0 N S EH1 N S'.split())) # inCENSE
    '-IH0-N.S-EH1-N S'
    >>> pprint(syllabify('IH1 N S EH2 N S'.split())) # INcense
    '-IH1-N.S-EH2-N S'
    >>> pprint(syllabify('AH0 S EH1 N D'.split())) # ascend
    '-AH0-.S-EH1-N D'
    >>> pprint(syllabify('R OW1 T EY2 T'.split())) # rotate
    'R-OW1-.T-EY2-T'
    >>> pprint(syllabify('AA1 R T AH0 S T'.split())) # artist
    '-AA1 R-.T-AH0-S T'
    >>> pprint(syllabify('AE1 K T ER0'.split())) # actor
    '-AE1-K.T-ER0-'
    >>> pprint(syllabify('P L AE1 S T ER0'.split())) # plaster
    'P L-AE1-S.T-ER0-'
    >>> pprint(syllabify('B AH1 T ER0'.split())) # butter
    'B-AH1-.T-ER0-'
    >>> pprint(syllabify('K AE1 M AH0 L'.split())) # camel
    'K-AE1-.M-AH0-L'
    >>> pprint(syllabify('AH1 P ER0'.split())) # upper
    '-AH1-.P-ER0-'
    >>> pprint(syllabify('B AH0 L UW1 N'.split())) # balloon
    'B-AH0-.L-UW1-N'
    >>> pprint(syllabify('P R OW0 K L EY1 M'.split())) # proclaim
    'P R-OW0-.K L-EY1-M'
    >>> pprint(syllabify('IH0 N S EY1 N'.split())) # insane
    '-IH0-N.S-EY1-N'
    >>> pprint(syllabify('IH0 K S K L UW1 D'.split())) # exclude
    '-IH0-K.S K L-UW1-D'
    """
    ## main pass
    mypron = list(pron)
    nuclei = []
    onsets = []
    i = -1
    for (j, seg) in enumerate(mypron):
        if seg in VOWELS:
            nuclei.append([seg])
            onsets.append(mypron[i + 1 : j])  # actually interludes, r.n.
            i = j
    codas = [mypron[i + 1 :]]
    ## resolve disputes and compute coda
    for i in range(1, len(onsets)):
        coda = []
        # boundary cases
        if len(onsets[i]) > 1 and onsets[i][0] == "R":
            nuclei[i - 1].append(onsets[i].pop(0))
        if len(onsets[i]) > 2 and onsets[i][-1] == "Y":
            nuclei[i].insert(0, onsets[i].pop())
        if (
            len(onsets[i]) > 1
            and alaska_rule
            and nuclei[i - 1][-1] in SLAX
            and onsets[i][0] == "S"
        ):
            coda.append(onsets[i].pop(0))
        # onset maximization
        depth = 1
        if len(onsets[i]) > 1:
            if tuple(onsets[i][-2:]) in O2:
                depth = 3 if tuple(onsets[i][-3:]) in O3 else 2
        for j in range(len(onsets[i]) - depth):
            coda.append(onsets[i].pop(0))
        # store coda
        codas.insert(i - 1, coda)

    ## verify that all segments are included in the ouput
    output = list(zip(onsets, nuclei, codas))  # in Python3 zip is a generator
    flat_output = list(chain.from_iterable(chain.from_iterable(output)))
    if flat_output != mypron:
        raise ValueError(f"could not syllabify {mypron}, got {flat_output}")
    return output


def pprint(syllab):
    """
    Pretty-print a syllabification
    """
    return ".".join("-".join(" ".join(p) for p in syl) for syl in syllab)


def destress(syllab):
    """
    Generate a syllabification with nuclear stress information removed
    """
    syls = []
    for (onset, nucleus, coda) in syllab:
        nuke = [p[:-1] if p[-1] in {"0", "1", "2"} else p for p in nucleus]
        syls.append((onset, nuke, coda))
    return syls


if __name__ == "__main__":
    # When run as a script, execute the doctest examples embedded in this
    # module's docstrings (the ones in syllabify()).
    import doctest

    doctest.testmod()
Initial commit 4 years ago			`#!/usr/bin/env python`
			`# Copyright (c) 2012-2013 Kyle Gorman <gormanky@ohsu.edu>`
			`#`
			`# Permission is hereby granted, free of charge, to any person obtaining a`
			`# copy of this software and associated documentation files (the`
			`# "Software"), to deal in the Software without restriction, including`
			`# without limitation the rights to use, copy, modify, merge, publish,`
			`# distribute, sublicense, and/or sell copies of the Software, and to`
			`# permit persons to whom the Software is furnished to do so, subject to`
			`# the following conditions:`
			`#`
			`# The above copyright notice and this permission notice shall be included`
			`# in all copies or substantial portions of the Software.`
			`#`
			`# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS`
			`# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF`
			`# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.`
			`# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY`
			`# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,`
			`# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE`
			`# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.`
			`#`
			`# syllabify.py: prosodic parsing of ARPABET entries`

			`from itertools import chain`

			`# constants`
			`SLAX = {`
			`"IH1",`
			`"IH2",`
			`"EH1",`
			`"EH2",`
			`"AE1",`
			`"AE2",`
			`"AH1",`
			`"AH2",`
			`"UH1",`
			`"UH2",`
			`}`
			`VOWELS = {`
			`"IY1",`
			`"IY2",`
			`"IY0",`
			`"EY1",`
			`"EY2",`
			`"EY0",`
			`"AA1",`
			`"AA2",`
			`"AA0",`
			`"ER1",`
			`"ER2",`
			`"ER0",`
			`"AW1",`
			`"AW2",`
			`"AW0",`
			`"AO1",`
			`"AO2",`
			`"AO0",`
			`"AY1",`
			`"AY2",`
			`"AY0",`
			`"OW1",`
			`"OW2",`
			`"OW0",`
			`"OY1",`
			`"OY2",`
			`"OY0",`
			`"IH0",`
			`"EH0",`
			`"AE0",`
			`"AH0",`
			`"UH0",`
			`"UW1",`
			`"UW2",`
			`"UW0",`
			`"UW",`
			`"IY",`
			`"EY",`
			`"AA",`
			`"ER",`
			`"AW",`
			`"AO",`
			`"AY",`
			`"OW",`
			`"OY",`
			`"UH",`
			`"IH",`
			`"EH",`
			`"AE",`
			`"AH",`
			`"UH",`
			`} \| SLAX`

			`# licit medial onsets`

			`O2 = {`
			`("P", "R"),`
			`("T", "R"),`
			`("K", "R"),`
			`("B", "R"),`
			`("D", "R"),`
			`("G", "R"),`
			`("F", "R"),`
			`("TH", "R"),`
			`("P", "L"),`
			`("K", "L"),`
			`("B", "L"),`
			`("G", "L"),`
			`("F", "L"),`
			`("S", "L"),`
			`("K", "W"),`
			`("G", "W"),`
			`("S", "W"),`
			`("S", "P"),`
			`("S", "T"),`
			`("S", "K"),`
			`("HH", "Y"), # "clerihew"`
			`("R", "W"),`
			`}`
			`O3 = {("S", "T", "R"), ("S", "K", "L"), ("T", "R", "W")} # "octroi"`

			`# This does not represent anything like a complete list of onsets, but`
			`# merely those that need to be maximized in medial position.`


			`def syllabify(pron, alaska_rule=True):`
			`"""`
			`Syllabifies a CMU dictionary (ARPABET) word string`

			`# Alaska rule:`
			`>>> pprint(syllabify('AH0 L AE1 S K AH0'.split())) # Alaska`
			`'-AH0-.L-AE1-S.K-AH0-'`
			`>>> pprint(syllabify('AH0 L AE1 S K AH0'.split(), 0)) # Alaska`
			`'-AH0-.L-AE1-.S K-AH0-'`

			`# huge medial onsets:`
			`>>> pprint(syllabify('M IH1 N S T R AH0 L'.split())) # minstrel`
			`'M-IH1-N.S T R-AH0-L'`
			`>>> pprint(syllabify('AA1 K T R W AA0 R'.split())) # octroi`
			`'-AA1-K.T R W-AA0-R'`

			`# destressing`
			`>>> pprint(destress(syllabify('M IH1 L AH0 T EH2 R IY0'.split())))`
			`'M-IH-.L-AH-.T-EH-.R-IY-'`

			`# normal treatment of 'j':`
			`>>> pprint(syllabify('M EH1 N Y UW0'.split())) # menu`
			`'M-EH1-N.Y-UW0-'`
			`>>> pprint(syllabify('S P AE1 N Y AH0 L'.split())) # spaniel`
			`'S P-AE1-N.Y-AH0-L'`
			`>>> pprint(syllabify('K AE1 N Y AH0 N'.split())) # canyon`
			`'K-AE1-N.Y-AH0-N'`
			`>>> pprint(syllabify('M IH0 N Y UW2 EH1 T'.split())) # minuet`
			`'M-IH0-N.Y-UW2-.-EH1-T'`
			`>>> pprint(syllabify('JH UW1 N Y ER0'.split())) # junior`
			`'JH-UW1-N.Y-ER0-'`
			`>>> pprint(syllabify('K L EH R IH HH Y UW'.split())) # clerihew`
			`'K L-EH-.R-IH-.HH Y-UW-'`

			`# nuclear treatment of 'j'`
			`>>> pprint(syllabify('R EH1 S K Y UW0'.split())) # rescue`
			`'R-EH1-S.K-Y UW0-'`
			`>>> pprint(syllabify('T R IH1 B Y UW0 T'.split())) # tribute`
			`'T R-IH1-B.Y-UW0-T'`
			`>>> pprint(syllabify('N EH1 B Y AH0 L AH0'.split())) # nebula`
			`'N-EH1-B.Y-AH0-.L-AH0-'`
			`>>> pprint(syllabify('S P AE1 CH UH0 L AH0'.split())) # spatula`
			`'S P-AE1-.CH-UH0-.L-AH0-'`
			`>>> pprint(syllabify('AH0 K Y UW1 M AH0 N'.split())) # acumen`
			`'-AH0-K.Y-UW1-.M-AH0-N'`
			`>>> pprint(syllabify('S AH1 K Y AH0 L IH0 N T'.split())) # succulent`
			`'S-AH1-K.Y-AH0-.L-IH0-N T'`
			`>>> pprint(syllabify('F AO1 R M Y AH0 L AH0'.split())) # formula`
			`'F-AO1 R-M.Y-AH0-.L-AH0-'`
			`>>> pprint(syllabify('V AE1 L Y UW0'.split())) # value`
			`'V-AE1-L.Y-UW0-'`

			`# everything else`
			`>>> pprint(syllabify('N AO0 S T AE1 L JH IH0 K'.split())) # nostalgic`
			`'N-AO0-.S T-AE1-L.JH-IH0-K'`
			`>>> pprint(syllabify('CH ER1 CH M AH0 N'.split())) # churchmen`
			`'CH-ER1-CH.M-AH0-N'`
			`>>> pprint(syllabify('K AA1 M P AH0 N S EY2 T'.split())) # compensate`
			`'K-AA1-M.P-AH0-N.S-EY2-T'`
			`>>> pprint(syllabify('IH0 N S EH1 N S'.split())) # inCENSE`
			`'-IH0-N.S-EH1-N S'`
			`>>> pprint(syllabify('IH1 N S EH2 N S'.split())) # INcense`
			`'-IH1-N.S-EH2-N S'`
			`>>> pprint(syllabify('AH0 S EH1 N D'.split())) # ascend`
			`'-AH0-.S-EH1-N D'`
			`>>> pprint(syllabify('R OW1 T EY2 T'.split())) # rotate`
			`'R-OW1-.T-EY2-T'`
			`>>> pprint(syllabify('AA1 R T AH0 S T'.split())) # artist`
			`'-AA1 R-.T-AH0-S T'`
			`>>> pprint(syllabify('AE1 K T ER0'.split())) # actor`
			`'-AE1-K.T-ER0-'`
			`>>> pprint(syllabify('P L AE1 S T ER0'.split())) # plaster`
			`'P L-AE1-S.T-ER0-'`
			`>>> pprint(syllabify('B AH1 T ER0'.split())) # butter`
			`'B-AH1-.T-ER0-'`
			`>>> pprint(syllabify('K AE1 M AH0 L'.split())) # camel`
			`'K-AE1-.M-AH0-L'`
			`>>> pprint(syllabify('AH1 P ER0'.split())) # upper`
			`'-AH1-.P-ER0-'`
			`>>> pprint(syllabify('B AH0 L UW1 N'.split())) # balloon`
			`'B-AH0-.L-UW1-N'`
			`>>> pprint(syllabify('P R OW0 K L EY1 M'.split())) # proclaim`
			`'P R-OW0-.K L-EY1-M'`
			`>>> pprint(syllabify('IH0 N S EY1 N'.split())) # insane`
			`'-IH0-N.S-EY1-N'`
			`>>> pprint(syllabify('IH0 K S K L UW1 D'.split())) # exclude`
			`'-IH0-K.S K L-UW1-D'`
			`"""`
			`## main pass`
			`mypron = list(pron)`
			`nuclei = []`
			`onsets = []`
			`i = -1`
			`for (j, seg) in enumerate(mypron):`
			`if seg in VOWELS:`
			`nuclei.append([seg])`
			`onsets.append(mypron[i + 1 : j]) # actually interludes, r.n.`
			`i = j`
			`codas = [mypron[i + 1 :]]`
			`## resolve disputes and compute coda`
			`for i in range(1, len(onsets)):`
			`coda = []`
			`# boundary cases`
			`if len(onsets[i]) > 1 and onsets[i][0] == "R":`
			`nuclei[i - 1].append(onsets[i].pop(0))`
			`if len(onsets[i]) > 2 and onsets[i][-1] == "Y":`
			`nuclei[i].insert(0, onsets[i].pop())`
			`if (`
			`len(onsets[i]) > 1`
			`and alaska_rule`
			`and nuclei[i - 1][-1] in SLAX`
			`and onsets[i][0] == "S"`
			`):`
			`coda.append(onsets[i].pop(0))`
			`# onset maximization`
			`depth = 1`
			`if len(onsets[i]) > 1:`
			`if tuple(onsets[i][-2:]) in O2:`
			`depth = 3 if tuple(onsets[i][-3:]) in O3 else 2`
			`for j in range(len(onsets[i]) - depth):`
			`coda.append(onsets[i].pop(0))`
			`# store coda`
			`codas.insert(i - 1, coda)`

			`## verify that all segments are included in the ouput`
			`output = list(zip(onsets, nuclei, codas)) # in Python3 zip is a generator`
			`flat_output = list(chain.from_iterable(chain.from_iterable(output)))`
			`if flat_output != mypron:`
			`raise ValueError(f"could not syllabify {mypron}, got {flat_output}")`
			`return output`


			`def pprint(syllab):`
			`"""`
			`Pretty-print a syllabification`
			`"""`
			`return ".".join("-".join(" ".join(p) for p in syl) for syl in syllab)`


			`def destress(syllab):`
			`"""`
			`Generate a syllabification with nuclear stress information removed`
			`"""`
			`syls = []`
			`for (onset, nucleus, coda) in syllab:`
			`nuke = [p[:-1] if p[-1] in {"0", "1", "2"} else p for p in nucleus]`
			`syls.append((onset, nuke, coda))`
			`return syls`


			`if __name__ == "__main__":`
			`import doctest`

			`doctest.testmod()`