From 01280ff29c425c2e43824e58b0ab3c73c0801933 Mon Sep 17 00:00:00 2001 From: Eric Ihli Date: Mon, 21 Jun 2021 13:42:25 -0500 Subject: [PATCH] Remove unused file --- syllabify.py | 279 --------------------------------------------------- 1 file changed, 279 deletions(-) delete mode 100644 syllabify.py diff --git a/syllabify.py b/syllabify.py deleted file mode 100644 index a5b08d8..0000000 --- a/syllabify.py +++ /dev/null @@ -1,279 +0,0 @@ -#!/usr/bin/env python -# Copyright (c) 2012-2013 Kyle Gorman -# -# Permission is hereby granted, free of charge, to any person obtaining a -# copy of this software and associated documentation files (the -# "Software"), to deal in the Software without restriction, including -# without limitation the rights to use, copy, modify, merge, publish, -# distribute, sublicense, and/or sell copies of the Software, and to -# permit persons to whom the Software is furnished to do so, subject to -# the following conditions: -# -# The above copyright notice and this permission notice shall be included -# in all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS -# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. -# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY -# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, -# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE -# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -# -# syllabify.py: prosodic parsing of ARPABET entries - -from itertools import chain - -# constants -SLAX = { - "IH1", - "IH2", - "EH1", - "EH2", - "AE1", - "AE2", - "AH1", - "AH2", - "UH1", - "UH2", -} -VOWELS = { - "IY1", - "IY2", - "IY0", - "EY1", - "EY2", - "EY0", - "AA1", - "AA2", - "AA0", - "ER1", - "ER2", - "ER0", - "AW1", - "AW2", - "AW0", - "AO1", - "AO2", - "AO0", - "AY1", - "AY2", - "AY0", - "OW1", - "OW2", - "OW0", - "OY1", - "OY2", - "OY0", - "IH0", - "EH0", - "AE0", - "AH0", - "UH0", - "UW1", - "UW2", - "UW0", - "UW", - "IY", - "EY", - "AA", - "ER", - "AW", - "AO", - "AY", - "OW", - "OY", - "UH", - "IH", - "EH", - "AE", - "AH", - "UH", -} | SLAX - -# licit medial onsets - -O2 = { - ("P", "R"), - ("T", "R"), - ("K", "R"), - ("B", "R"), - ("D", "R"), - ("G", "R"), - ("F", "R"), - ("TH", "R"), - ("P", "L"), - ("K", "L"), - ("B", "L"), - ("G", "L"), - ("F", "L"), - ("S", "L"), - ("K", "W"), - ("G", "W"), - ("S", "W"), - ("S", "P"), - ("S", "T"), - ("S", "K"), - ("HH", "Y"), # "clerihew" - ("R", "W"), -} -O3 = {("S", "T", "R"), ("S", "K", "L"), ("T", "R", "W")} # "octroi" - -# This does not represent anything like a complete list of onsets, but -# merely those that need to be maximized in medial position. - - -def syllabify(pron, alaska_rule=True): - """ - Syllabifies a CMU dictionary (ARPABET) word string - - # Alaska rule: - >>> pprint(syllabify('AH0 L AE1 S K AH0'.split())) # Alaska - '-AH0-.L-AE1-S.K-AH0-' - >>> pprint(syllabify('AH0 L AE1 S K AH0'.split(), 0)) # Alaska - '-AH0-.L-AE1-.S K-AH0-' - - # huge medial onsets: - >>> pprint(syllabify('M IH1 N S T R AH0 L'.split())) # minstrel - 'M-IH1-N.S T R-AH0-L' - >>> pprint(syllabify('AA1 K T R W AA0 R'.split())) # octroi - '-AA1-K.T R W-AA0-R' - - # destressing - >>> pprint(destress(syllabify('M IH1 L AH0 T EH2 R IY0'.split()))) - 'M-IH-.L-AH-.T-EH-.R-IY-' - - # normal treatment of 'j': - >>> pprint(syllabify('M EH1 N Y UW0'.split())) # menu - 'M-EH1-N.Y-UW0-' - >>> pprint(syllabify('S P AE1 N Y AH0 L'.split())) # spaniel - 'S P-AE1-N.Y-AH0-L' - >>> pprint(syllabify('K AE1 N Y AH0 N'.split())) # canyon - 'K-AE1-N.Y-AH0-N' - >>> pprint(syllabify('M IH0 N Y UW2 EH1 T'.split())) # minuet - 'M-IH0-N.Y-UW2-.-EH1-T' - >>> pprint(syllabify('JH UW1 N Y ER0'.split())) # junior - 'JH-UW1-N.Y-ER0-' - >>> pprint(syllabify('K L EH R IH HH Y UW'.split())) # clerihew - 'K L-EH-.R-IH-.HH Y-UW-' - - # nuclear treatment of 'j' - >>> pprint(syllabify('R EH1 S K Y UW0'.split())) # rescue - 'R-EH1-S.K-Y UW0-' - >>> pprint(syllabify('T R IH1 B Y UW0 T'.split())) # tribute - 'T R-IH1-B.Y-UW0-T' - >>> pprint(syllabify('N EH1 B Y AH0 L AH0'.split())) # nebula - 'N-EH1-B.Y-AH0-.L-AH0-' - >>> pprint(syllabify('S P AE1 CH UH0 L AH0'.split())) # spatula - 'S P-AE1-.CH-UH0-.L-AH0-' - >>> pprint(syllabify('AH0 K Y UW1 M AH0 N'.split())) # acumen - '-AH0-K.Y-UW1-.M-AH0-N' - >>> pprint(syllabify('S AH1 K Y AH0 L IH0 N T'.split())) # succulent - 'S-AH1-K.Y-AH0-.L-IH0-N T' - >>> pprint(syllabify('F AO1 R M Y AH0 L AH0'.split())) # formula - 'F-AO1 R-M.Y-AH0-.L-AH0-' - >>> pprint(syllabify('V AE1 L Y UW0'.split())) # value - 'V-AE1-L.Y-UW0-' - - # everything else - >>> pprint(syllabify('N AO0 S T AE1 L JH IH0 K'.split())) # nostalgic - 'N-AO0-.S T-AE1-L.JH-IH0-K' - >>> pprint(syllabify('CH ER1 CH M AH0 N'.split())) # churchmen - 'CH-ER1-CH.M-AH0-N' - >>> pprint(syllabify('K AA1 M P AH0 N S EY2 T'.split())) # compensate - 'K-AA1-M.P-AH0-N.S-EY2-T' - >>> pprint(syllabify('IH0 N S EH1 N S'.split())) # inCENSE - '-IH0-N.S-EH1-N S' - >>> pprint(syllabify('IH1 N S EH2 N S'.split())) # INcense - '-IH1-N.S-EH2-N S' - >>> pprint(syllabify('AH0 S EH1 N D'.split())) # ascend - '-AH0-.S-EH1-N D' - >>> pprint(syllabify('R OW1 T EY2 T'.split())) # rotate - 'R-OW1-.T-EY2-T' - >>> pprint(syllabify('AA1 R T AH0 S T'.split())) # artist - '-AA1 R-.T-AH0-S T' - >>> pprint(syllabify('AE1 K T ER0'.split())) # actor - '-AE1-K.T-ER0-' - >>> pprint(syllabify('P L AE1 S T ER0'.split())) # plaster - 'P L-AE1-S.T-ER0-' - >>> pprint(syllabify('B AH1 T ER0'.split())) # butter - 'B-AH1-.T-ER0-' - >>> pprint(syllabify('K AE1 M AH0 L'.split())) # camel - 'K-AE1-.M-AH0-L' - >>> pprint(syllabify('AH1 P ER0'.split())) # upper - '-AH1-.P-ER0-' - >>> pprint(syllabify('B AH0 L UW1 N'.split())) # balloon - 'B-AH0-.L-UW1-N' - >>> pprint(syllabify('P R OW0 K L EY1 M'.split())) # proclaim - 'P R-OW0-.K L-EY1-M' - >>> pprint(syllabify('IH0 N S EY1 N'.split())) # insane - '-IH0-N.S-EY1-N' - >>> pprint(syllabify('IH0 K S K L UW1 D'.split())) # exclude - '-IH0-K.S K L-UW1-D' - """ - ## main pass - mypron = list(pron) - nuclei = [] - onsets = [] - i = -1 - for (j, seg) in enumerate(mypron): - if seg in VOWELS: - nuclei.append([seg]) - onsets.append(mypron[i + 1 : j]) # actually interludes, r.n. - i = j - codas = [mypron[i + 1 :]] - ## resolve disputes and compute coda - for i in range(1, len(onsets)): - coda = [] - # boundary cases - if len(onsets[i]) > 1 and onsets[i][0] == "R": - nuclei[i - 1].append(onsets[i].pop(0)) - if len(onsets[i]) > 2 and onsets[i][-1] == "Y": - nuclei[i].insert(0, onsets[i].pop()) - if ( - len(onsets[i]) > 1 - and alaska_rule - and nuclei[i - 1][-1] in SLAX - and onsets[i][0] == "S" - ): - coda.append(onsets[i].pop(0)) - # onset maximization - depth = 1 - if len(onsets[i]) > 1: - if tuple(onsets[i][-2:]) in O2: - depth = 3 if tuple(onsets[i][-3:]) in O3 else 2 - for j in range(len(onsets[i]) - depth): - coda.append(onsets[i].pop(0)) - # store coda - codas.insert(i - 1, coda) - - ## verify that all segments are included in the ouput - output = list(zip(onsets, nuclei, codas)) # in Python3 zip is a generator - flat_output = list(chain.from_iterable(chain.from_iterable(output))) - if flat_output != mypron: - raise ValueError(f"could not syllabify {mypron}, got {flat_output}") - return output - - -def pprint(syllab): - """ - Pretty-print a syllabification - """ - return ".".join("-".join(" ".join(p) for p in syl) for syl in syllab) - - -def destress(syllab): - """ - Generate a syllabification with nuclear stress information removed - """ - syls = [] - for (onset, nucleus, coda) in syllab: - nuke = [p[:-1] if p[-1] in {"0", "1", "2"} else p for p in nucleus] - syls.append((onset, nuke, coda)) - return syls - - -if __name__ == "__main__": - import doctest - - doctest.testmod()