cjklib.reading.operator

1 #!/usr/bin/python 2 # -*- coding: utf-8 -*- 3 # This file is part of cjklib. 4 # 5 # cjklib is free software: you can redistribute it and/or modify 6 # it under the terms of the GNU Lesser General Public License as published by 7 # the Free Software Foundation, either version 3 of the License, or 8 # (at your option) any later version. 9 # 10 # cjklib is distributed in the hope that it will be useful, 11 # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 # GNU Lesser General Public License for more details. 14 # 15 # You should have received a copy of the GNU Lesser General Public License 16 # along with cjklib. If not, see <http://www.gnu.org/licenses/>. 17 18 u""" 19 Provides L{ReadingOperator}s, classes to handle strings written in a character 20 reading. 21 22 Examples 23 ======== 24 Decompose a reading string in I{Gwoyeu Romatzyh} into single entities: 25 26 >>> from cjklib.reading import ReadingFactory 27 >>> f = ReadingFactory() 28 >>> f.decompose('"Hannshyue" .de mingcheng duey Jonggwo [...]', 'GR') 29 ['"', 'Hann', 'shyue', '" ', '.de', ' ', 'ming', 'cheng', ' ', 'duey', 30 ' ', 'Jong', 'gwo', ' [...]'] 31 32 The same can be done by directly using the operator's instance: 33 34 >>> from cjklib.reading import operator 35 >>> cy = operator.CantoneseYaleOperator() 36 >>> cy.decompose('gwóngjàuwá') 37 [u'gw\xf3ng', u'j\xe0u', u'w\xe1'] 38 39 Composing will reverse the process, using a I{Pinyin} string: 40 41 >>> f.compose([u'xī', u'ān'], 'Pinyin') 42 u"x\u012b'\u0101n" 43 44 For more complex operators, see L{PinyinOperator} or L{MandarinIPAOperator}. 
45 """ 46 import re 47 import unicodedata 48 import copy 49 50 from sqlalchemy import Table, Column, Integer, String 51 from sqlalchemy import select, union 52 from sqlalchemy.sql import and_, or_, not_ 53 54 from cjklib.exception import (AmbiguousConversionError, DecompositionError, 55 AmbiguousDecompositonError, InvalidEntityError, UnsupportedError) 56 from cjklib.dbconnector import DatabaseConnector

class ReadingOperator(object):
    """
    Defines an abstract operator on text written in a I{character reading}.

    The two basic methods are L{decompose()} and L{compose()}. L{decompose()}
    breaks down a text into the basic entities of that reading (additional non
    reading substrings are accepted though). L{compose()} joins these entities
    together again and applies formatting rules needed by the reading.
    Additionally the method L{isReadingEntity()} is provided to check which of
    the strings returned by L{decompose()} are supported entities for the given
    reading.

    The methods L{getDefaultOptions()} and L{getOption()} provide means to
    handle the I{reading dialect}'s specific settings.

    The class itself can't be used directly, it has to be subclassed and its
    methods need to be extended.
    """
    READING_NAME = None
    """Unique name of reading"""

    def __init__(self, **options):
        """
        Creates an instance of the ReadingOperator.

        All options returned by L{getDefaultOptions()} are stored in
        C{self.optionValue}. Default values that are containers (tuple, list,
        dict, set or a subclass of one of these) are deep-copied, so changing
        an option of one instance never leaks into the class wide defaults or
        into other instances.

        @param options: extra options
        @keyword dbConnectInst: instance of a L{DatabaseConnector}, if none is
            given, default settings will be assumed.
        """
        if 'dbConnectInst' in options:
            self.db = options['dbConnectInst']
        else:
            self.db = DatabaseConnector.getDBConnector()

        self.optionValue = {}
        for option, value in self.getDefaultOptions().items():
            # isinstance() instead of an exact type comparison: subclasses of
            # the builtin containers are just as mutable and must be copied
            # as well.
            if isinstance(value, (tuple, list, dict, set)):
                value = copy.deepcopy(value)
            self.optionValue[option] = value

    @classmethod
    def getDefaultOptions(cls):
        """
        Returns the reading operator's default options.

        The default implementation returns an empty dictionary. The keyword
        'dbConnectInst' is not regarded a configuration option of the operator
        and is thus not included in the dict returned.

        @rtype: dict
        @return: the reading operator's default options.
        """
        return {}

    def getOption(self, option):
        """
        Returns the value of the reading operator's option.

        @param option: name of the option
        @return: the value of the given reading operator's option.
        @raise KeyError: if no option with the given name is set.
        """
        return self.optionValue[option]

    def decompose(self, string):
        """
        Decomposes the given string into basic entities that can be mapped to
        one Chinese character each (exceptions possible).

        The given input string can contain other non reading characters, e.g.
        punctuation marks.

        The returned list contains a mix of basic reading entities and other
        characters e.g. spaces and punctuation marks.

        The default implementation will raise a NotImplementedError.

        @type string: str
        @param string: reading string
        @rtype: list of str
        @return: a list of basic entities of the input string
        @raise DecompositionError: if the string can not be decomposed.
        """
        raise NotImplementedError

    def compose(self, readingEntities):
        """
        Composes the given list of basic entities to a string.

        The default implementation will raise a NotImplementedError.

        @type readingEntities: list of str
        @param readingEntities: list of basic entities or other content
        @rtype: str
        @return: composed entities
        """
        raise NotImplementedError

    def isReadingEntity(self, entity):
        """
        Returns true if the given entity is recognised by the reading
        operator, i.e. it is a valid entity of the reading returned by
        L{decompose()}.

        The default implementation will raise a NotImplementedError.

        @type entity: str
        @param entity: entity to check
        @rtype: bool
        @return: true if string is an entity of the reading, false otherwise.
        """
        raise NotImplementedError

171

class RomanisationOperator(ReadingOperator):
    """
    Defines an abstract L{ReadingOperator} on text written in a I{romanisation},
    i.e. text written in the Latin alphabet or written in the Cyrillic alphabet.

    Additional to L{decompose()} provided by the class L{ReadingOperator} this
    class offers a method L{getDecompositions()} that returns several possible
    decompositions in an ambiguous case.

    This class itself can't be used directly, it has to be subclassed and
    extended.

    X{Decomposition}
    ================
    Transcriptions into the Latin alphabet generate the problem that syllable
    boundaries or boundaries of entities belonging to single Chinese characters
    aren't clear anymore once entities are grouped together.

    Therefore it is important to have methods at hand to separate these strings
    and to split them into single entities. This though cannot always be done
    in a clear and unambiguous way as several different decompositions might be
    possible thus leading to the general case of X{ambiguous decomposition}s.

    Many romanisations do provide a way to tackle this problem. Pinyin for
    example requires the use of an apostrophe (C{'}) when the reverse process
    of splitting the string into syllables gets ambiguous. The Wade-Giles
    romanisation in its strict implementation asks for a hyphen used between all
    syllables. The LSHK's Jyutping when written with tone marks will always be
    clearly decomposable.

    The method L{isStrictDecomposition()} can be implemented to check if one
    possible decomposition is the X{strict decomposition} offered by the
    romanisation's protocol. This method should guarantee that under all
    circumstances only one decomposed version will be regarded as strict.

    If no strict version is yielded and different decompositions exist an
    X{unambiguous decomposition} can not be made. These decompositions can be
    accessed through method L{getDecompositions()}, even in cases where a
    strict decomposition exists.

    @todo Impl: Optimise decompose() as to incorporate segment() and prune the
        tree while it is created. Does this though yield significant
        improvement? Would at least be O(n).
    """
    readingEntityRegex = re.compile(u"([A-Za-z]+)")
    """Regular Expression for finding romanisation entities in input."""

    def __init__(self, **options):
        """
        Creates an instance of the RomanisationOperator.

        @param options: extra options
        @keyword dbConnectInst: instance of a L{DatabaseConnector}, if none is
            given, default settings will be assumed.
        @keyword strictSegmentation: if C{True} segmentation (using
            L{segment()}) and thus decomposition (using L{decompose()}) will
            raise an exception if an alphabetic string is parsed which can not
            be segmented into single reading entities. If C{False} the aforesaid
            string will be returned unsegmented.
        @keyword case: if set to C{'lower'}/C{'upper'}, only lower/upper
            case will be supported, respectively, if set to C{'both'} both upper
            and lower case will be supported.
        """
        super(RomanisationOperator, self).__init__(**options)

        # explicitly given settings override the defaults
        for keyword in ('strictSegmentation', 'case'):
            if keyword in options:
                self.optionValue[keyword] = options[keyword]

        # caches, filled on first use by isReadingEntity() and
        # _hasSyllableSubstring()
        self.syllableTable = None
        self.substringSet = None

    @classmethod
    def getDefaultOptions(cls):
        """
        Returns the default options: those of the parent class plus
        C{strictSegmentation} (C{False}) and C{case} (C{'both'}).

        @rtype: dict
        @return: the reading operator's default options.
        """
        options = super(RomanisationOperator, cls).getDefaultOptions()
        options['strictSegmentation'] = False
        options['case'] = 'both'
        return options

    def decompose(self, string):
        """
        Decomposes the given string into basic entities on a one-to-one mapping
        level to Chinese characters. Decomposing can be ambiguous and there are
        two assumptions made to solve this problem: If two subsequent entities
        together make up a longer valid entity, then the decomposition with the
        shorter entities can be disregarded. Furthermore it is assumed that the
        reading provides rules to mark entity borders and that these rules can
        be checked, so that the decomposition that abides by these rules will
        be preferred. This check is done by calling L{isStrictDecomposition()}.

        The given input string can contain other characters not supported by the
        reading, e.g. punctuation marks. The returned list then contains a mix
        of basic reading entities and other characters e.g. spaces and
        punctuation marks.

        @type string: str
        @param string: reading string
        @rtype: list of str
        @return: a list of basic entities of the input string
        @raise AmbiguousDecompositonError: if decomposition is ambiguous.
        @raise DecompositionError: if the given string has a wrong format.
        """
        entities = []
        for candidates in self.getDecompositionTree(string):
            if len(candidates) == 1:
                # nothing to choose from, strict or not
                entities.extend(candidates[0])
                continue

            # drop candidates whose neighbouring syllables could be merged
            # into a longer syllable
            unmerged = []
            for candidate in candidates:
                if not self._hasMergeableSyllables(candidate):
                    unmerged.append(candidate)

            if len(unmerged) == 1:
                entities.extend(unmerged[0])
                continue

            # several candidates left, only the strict one may be taken
            for candidate in unmerged:
                if self.isStrictDecomposition(candidate):
                    entities.extend(candidate)
                    break
            else:
                raise AmbiguousDecompositonError("decomposition of '" \
                    + string + "' ambiguous: '" \
                    + ''.join(candidate) + "'")

        return entities

    def getDecompositionTree(self, string):
        """
        Decomposes the given string into basic entities that can be mapped to
        one Chinese character each for all possible decompositions and returns
        the possible decompositions as a lattice.

        @type string: str
        @param string: reading string
        @rtype: list
        @return: a list of all possible decompositions consisting of basic
            entities as a lattice construct.
        @raise DecompositionError: if the given string has a wrong format.
        """
        lattice = []
        # splitting on a capturing group yields alternating non-matching and
        # matching parts, empty strings mark adjacent matches/string borders
        for part in self.readingEntityRegex.split(string):
            if part == '':
                continue
            if self.readingEntityRegex.match(part):
                lattice.append(self.segment(part))
            else:
                # non-reading entity, exactly one trivial "segmentation"
                lattice.append([[part]])

        return lattice

    def getDecompositions(self, string):
        """
        Decomposes the given string into basic entities that can be mapped to
        one Chinese character each for all possible decompositions. This method
        is a more general version of L{decompose()}.

        The returned list construction consists of two entity types: entities of
        the romanisation and other strings.

        @type string: str
        @param string: reading string
        @rtype: list of list of str
        @return: a list of all possible decompositions consisting of basic
            entities.
        @raise DecompositionError: if the given string has a wrong format.
        """
        lattice = self.getDecompositionTree(string)
        # every combination of partial segmentations, flattened to one list
        return [
            [entity for segmentation in combination for entity in segmentation]
            for combination in self._crossProduct(lattice)]

    def segment(self, string):
        """
        Takes a string written in the romanisation and returns the possible
        segmentations as a list of syllables.

        In contrast to L{decompose()} this method merely segments continuous
        entities of the romanisation. Characters not part of the romanisation
        will not be dealt with, this is the task of the more general decompose
        method.

        @type string: str
        @param string: reading string
        @rtype: list of list of str
        @return: a list of possible segmentations (several if ambiguous) into
            single syllables
        @raise DecompositionError: if the given string has an invalid format.
        """
        tree = self._recursiveSegmentation(string)
        if string != '' and not tree:
            if self.getOption('strictSegmentation'):
                raise DecompositionError(u"Segmentation of '" + string \
                    + "' not possible or invalid syllable")
            return [[string]]

        segmentations = []
        for subtree in tree:
            segmentations.extend(self._treeToList(subtree))
        return segmentations

    def _recursiveSegmentation(self, string):
        """
        Takes a string written in the romanisation and returns the possible
        segmentations as a tree of syllables.

        The tree is represented by tuples C{(syllable, subtree)}.

        @type string: str
        @param string: reading string
        @rtype: list of tuple
        @return: a tree of possible segmentations (if ambiguous) into single
            syllables
        """
        branches = []
        for end in range(1, len(string) + 1):
            head = string[:end]
            if not self._hasSyllableSubstring(head.lower()):
                # no syllable starts like this, longer heads can't match
                break
            if not self.isReadingEntity(head):
                continue

            tail = string[end:]
            if tail == '':
                branches.append((head, None))
                continue
            subtree = self._recursiveSegmentation(tail)
            if subtree != []:
                # only keep heads whose remainder can be segmented as well
                branches.append((head, subtree))
        return branches

    def _hasMergeableSyllables(self, decomposition):
        """
        Checks if the given decomposition has two or more following syllables
        which together make up a new syllable.

        Segmentation can give several results with some possible syllables being
        even further subdivided (e.g. I{tian} to I{ti'an} in Pinyin). These
        segmentations are only secondary and the segmentation with the longer
        syllables will be the one to take.

        @type decomposition: list of str
        @param decomposition: decomposed reading string
        @rtype: bool
        @return: True if following syllables make up a syllable
        """
        count = len(decomposition)
        for start in range(count - 1):
            # grow the window of joined syllables, starting with two
            for stop in range(start + 2, count + 1):
                merged = "".join(decomposition[start:stop]).lower()
                if not self._hasSyllableSubstring(merged):
                    break
                if self.isReadingEntity(merged):
                    return True
        return False

    def isStrictDecomposition(self, decomposition):
        """
        Checks if the given decomposition follows the romanisation format
        strictly to allow unambiguous decomposition.

        The romanisation should offer a way/protocol to make an unambiguous
        decomposition into its basic syllables possible as to make the process
        of appending syllables to a string reversible. The testing on compliance
        with this protocol has to be implemented here. Thus this method can only
        return true for one and only one possible decomposition for all strings.

        @type decomposition: list of str
        @param decomposition: decomposed reading string
        @rtype: bool
        @return: False, as this methods needs to be implemented by the sub class
        """
        return False

    def _hasSyllableSubstring(self, string):
        """
        Checks if the given string is a syllable supported by this romanisation
        or the beginning (prefix) of one.

        The set of all prefixes is built from L{getReadingEntities()} on the
        first call and kept in C{self.substringSet}.

        @type string: str
        @param string: romanisation syllable or beginning of one
        @rtype: bool
        @return: true if this string is the beginning of a syllable, false
            otherwise
        """
        if self.substringSet is None:
            # first call, build the index
            self.substringSet = prefixes = set()
            for syllable in self.getReadingEntities():
                for length in range(1, len(syllable) + 1):
                    prefixes.add(syllable[:length])
        return string in self.substringSet

    def isReadingEntity(self, entity):
        """
        Returns true if the given entity is recognised by the romanisation
        operator, i.e. it is a valid entity of the reading returned by the
        segmentation method.

        Reading entities will be handled as being case insensitive, unless
        option C{case} restricts input to lower or upper case.

        @type entity: str
        @param entity: entity to check
        @rtype: bool
        @return: C{True} if string is an entity of the reading, C{False}
            otherwise.
        """
        # reject entities violating a restriction on capitalisation
        if self.getOption('case') == 'lower' and entity.lower() != entity:
            return False
        if self.getOption('case') == 'upper' and entity.upper() != entity:
            return False

        if self.syllableTable is None:
            # first call, fetch the supported syllables
            self.syllableTable = self.getReadingEntities()
        return entity.lower() in self.syllableTable

    def getReadingEntities(self):
        """
        Gets a set of all entities supported by the reading.

        The list is used in the segmentation process to find entity boundaries.
        The default implementation will raise a NotImplementedError.

        @rtype: set of str
        @return: set of supported syllables
        """
        raise NotImplementedError

    @staticmethod
    def _crossProduct(singleLists):
        """
        Calculates the cross product (aka Cartesian product) of sets given as
        lists.

        Example:
            >>> RomanisationOperator._crossProduct([['A', 'B'], [1, 2, 3]])
            [['A', 1], ['A', 2], ['A', 3], ['B', 1], ['B', 2], ['B', 3]]

        @type singleLists: list of list
        @param singleLists: a list of list entries containing various elements
        @rtype: list of list
        @return: the cross product of the given sets
        """
        # extend every partial combination by each element of the next list;
        # the leftmost list varies slowest
        combinations = [[]]
        for choices in singleLists:
            combinations = [prefix + [elem] for prefix in combinations
                for elem in choices]
        return combinations

    @staticmethod
    def _treeToList(tupleTree):
        """
        Converts a tree to a list containing all full paths from root to leaf
        node.

        The tree is given by tuples C{(leaf node element, subtree)}.

        Example:
            >>> RomanisationOperator._treeToList(
            ...     ('A', [('B', None), ('C', [('D', None), ('E', None)])]))
            [['A', 'B'], ['A', 'C', 'D'], ['A', 'C', 'E']]

        @type tupleTree: tuple
        @param tupleTree: a tree realised through a tuple of a node and a
            subtree
        @rtype: list of list
        @return: a list of all paths contained by the given tree
        """
        root, subtrees = tupleTree
        if not subtrees:
            # leaf node
            return [[root]]
        return [[root] + path for subtree in subtrees
            for path in RomanisationOperator._treeToList(subtree)]

595

class TonalFixedEntityOperator(ReadingOperator):
    """
    Provides an abstract L{ReadingOperator} for tonal languages for a reading
    based on a fixed set of reading entities.

    It provides two methods L{getTonalEntity()} and L{splitEntityTone()} to
    cope with tonal information in text.

    The class itself can't be used directly, it has to be subclassed and its
    methods need to be extended.
    """
    def __init__(self, **options):
        """
        Creates an instance of the TonalFixedEntityOperator.

        @param options: extra options
        """
        super(TonalFixedEntityOperator, self).__init__(**options)

        # cache of plain entities, filled by isPlainReadingEntity()
        self.plainEntityTable = None

    def getTones(self):
        """
        Returns a set of tones supported by the reading. These tones don't
        necessarily reflect the tones of the underlying language but may differ
        to reflect notational or other features.

        The default implementation will raise a NotImplementedError.

        @rtype: list
        @return: list of supported tone marks.
        """
        raise NotImplementedError

    def getTonalEntity(self, plainEntity, tone):
        """
        Gets the entity with tone mark for the given plain entity and tone.

        The default implementation will raise a NotImplementedError.

        @type plainEntity: str
        @param plainEntity: entity without tonal information
        @param tone: tone
        @rtype: str
        @return: entity with appropriate tone
        @raise InvalidEntityError: if the entity is invalid.
        @raise UnsupportedError: if the operation is not supported for the given
            form.
        """
        raise NotImplementedError

    def splitEntityTone(self, entity):
        """
        Splits the entity into an entity without tone mark (plain entity) and
        the entity's tone.

        The default implementation will raise a NotImplementedError.

        @type entity: str
        @param entity: entity with tonal information
        @rtype: tuple
        @return: plain entity without tone mark and entity's tone
        @raise InvalidEntityError: if the entity is invalid.
        @raise UnsupportedError: if the operation is not supported for the given
            form.
        """
        raise NotImplementedError

    def getReadingEntities(self):
        """
        Gets a set of all entities supported by the reading, built by
        combining every plain entity with every supported tone.

        The set is used in the segmentation process to find entity boundaries.

        @rtype: set of str
        @return: set of supported syllables
        """
        return set(self.getTonalEntity(plainEntity, tone)
            for plainEntity in self.getPlainReadingEntities()
            for tone in self.getTones())

    def getPlainReadingEntities(self):
        """
        Gets the list of plain entities supported by this reading. Different to
        L{getReadingEntities()} the entities will carry no tone mark.

        The default implementation will raise a NotImplementedError.

        @rtype: set of str
        @return: set of supported syllables
        """
        raise NotImplementedError

    def isPlainReadingEntity(self, entity):
        """
        Returns true if the given plain entity (without any tone mark) is
        recognised by the romanisation operator, i.e. it is a valid entity of
        the reading returned by the segmentation method.

        @type entity: str
        @param entity: entity to check
        @rtype: bool
        @return: C{True} if string is an entity of the reading, C{False}
            otherwise.
        """
        if self.plainEntityTable is None:
            # first call, fetch the supported plain syllables
            self.plainEntityTable = self.getPlainReadingEntities()
        return entity in self.plainEntityTable

    def isReadingEntity(self, entity):
        """
        Returns true if the given entity is recognised by the reading operator.

        Reimplemented to keep the memory footprint small: instead of looking
        the entity up in the full set of tonal entities the tone mark is split
        off and only the plain entity is checked.

        @type entity: str
        @param entity: entity to check
        @rtype: bool
        @return: C{True} if string is an entity of the reading, C{False}
            otherwise (including the case that the entity is reported as
            invalid while splitting off the tone).
        """
        try:
            plainEntity = self.splitEntityTone(entity)[0]
            return self.isPlainReadingEntity(plainEntity)
        except InvalidEntityError:
            return False

717

class TonalRomanisationOperator(RomanisationOperator, TonalFixedEntityOperator):
    """
    Provides an abstract L{RomanisationOperator} for tonal languages
    incorporating methods from L{TonalFixedEntityOperator}.

    It provides two methods L{getTonalEntity()} and L{splitEntityTone()} to
    cope with tonal information in text.

    The class itself can't be used directly, it has to be subclassed and its
    methods need to be extended.
    """
    def __init__(self, **options):
        """
        Creates an instance of the TonalRomanisationOperator.

        @param options: extra options
        @keyword dbConnectInst: instance of a L{DatabaseConnector}, if none is
            given, default settings will be assumed.
        @keyword strictSegmentation: if C{True} segmentation (using
            L{segment()}) and thus decomposition (using L{decompose()}) will
            raise an exception if an alphabetic string is parsed which can not
            be segmented into single reading entities. If C{False} the aforesaid
            string will be returned unsegmented.
        """
        super(TonalRomanisationOperator, self).__init__(**options)

    def getReadingEntities(self):
        """
        Gets a set of all entities supported by the reading, as built by
        L{TonalFixedEntityOperator.getReadingEntities()}.

        The set is used in the segmentation process to find entity boundaries.

        @rtype: set of str
        @return: set of supported syllables
        """
        return TonalFixedEntityOperator.getReadingEntities(self)

    def isPlainReadingEntity(self, entity):
        """
        Returns true if the given plain entity (without any tone mark) is
        recognised by the romanisation operator, i.e. it is a valid entity of
        the reading returned by the segmentation method.

        Reading entities will be handled as being case insensitive, unless
        option C{case} restricts input to lower or upper case.

        @type entity: str
        @param entity: entity to check
        @rtype: bool
        @return: C{True} if string is an entity of the reading, C{False}
            otherwise.
        """
        # reject entities violating a restriction on capitalisation
        lowerOnly = self.getOption('case') == 'lower'
        if lowerOnly and entity.lower() != entity:
            return False
        if not lowerOnly and self.getOption('case') == 'upper' \
            and entity.upper() != entity:
            return False

        # the table of plain entities holds lower case forms only
        return TonalFixedEntityOperator.isPlainReadingEntity(self,
            entity.lower())

    def isReadingEntity(self, entity):
        """
        Returns true if the given entity is recognised by the reading operator.
        Uses the implementation of L{TonalFixedEntityOperator}, i.e. splits off
        the tone and checks the plain entity.

        @type entity: str
        @param entity: entity to check
        @rtype: bool
        @return: C{True} if string is an entity of the reading, C{False}
            otherwise.
        """
        return TonalFixedEntityOperator.isReadingEntity(self, entity)

782

class TonalIPAOperator(TonalFixedEntityOperator):
    u"""
    Defines an operator on strings of a tonal language written in the
    X{International Phonetic Alphabet} (X{IPA}).

    TonalIPAOperator does not supply the same closed set of syllables as
    other L{ReadingOperator}s as IPA provides different ways to represent
    pronunciation. Because of that a user defined IPA syllable will not easily
    map to another transcription system and thus only basic support is provided
    for this direction.

    Tones
    =====
    Tones in IPA can be expressed using different schemes. The following schemes
    are implemented here:
        - Numbers, tone numbers,
        - ChaoDigits, numbers displaying the levels of Chao tone contours,
        - IPAToneBar, IPA modifying tone bar characters, e.g. ɛw˥˧,
        - Diacritics, diacritical marks and finally
        - None, no support for tone marks

    @todo Lang: Shed more light on representations of tones in IPA.
    @todo Fix:  Get all diacritics used in IPA as tones for L{TONE_MARK_REGEX}.
    """
    # The Unicode patterns are written as plain u'' literals; they contain no
    # backslash sequences besides \uXXXX, so the resulting strings are the same
    # as with a raw Unicode literal while staying valid syntax on Python 3.
    TONE_MARK_REGEX = {'Numbers': re.compile(r'(\d)$'),
        # one or more Chao tone level digits (1-5) at the end of the entity
        'ChaoDigits': re.compile(r'([12345]+)$'),
        'IPAToneBar': re.compile(u'([˥˦˧˨˩꜈꜉꜊꜋꜌]+)$'),
        'Diacritics': re.compile(u'([\u0300\u0301\u0302\u0303\u030c]+)')
        }
    """
    Regular expressions matching the tone mark of an entity for each tone mark
    type.
    """

    DEFAULT_TONE_MARK_TYPE = 'IPAToneBar'
    """Tone mark type to select by default."""

    TONES = []
    """List of tone names. Needs to be implemented in child class."""

    TONE_MARK_PREFER = {'Numbers': {}, 'ChaoDigits': {}, 'IPAToneBar': {},
        'Diacritics': {}}
    """
    Mapping of tone marks to tone name which will be preferred on ambiguous
    mappings. Needs to be implemented in child classes.
    """

    TONE_MARK_MAPPING = {'Numbers': {}, 'ChaoDigits': {}, 'IPAToneBar': {},
        'Diacritics': {}}
    """
    Mapping of tone names to tone mark for each tone mark type. Needs to be
    implemented in child classes.
    """

834 - def __init__(self, **options):

835 """ 836 Creates an instance of the TonalIPAOperator. 837 838 By default no tone marks will be shown. 839 840 @param options: extra options 841 @keyword dbConnectInst: instance of a L{DatabaseConnector}, if none is 842 given, default settings will be assumed. 843 @keyword toneMarkType: type of tone marks, one out of C{'Numbers'}, 844 C{'ChaoDigits'}, C{'IPAToneBar'}, C{'Diacritics'}, C{'None'} 845 @keyword missingToneMark: if set to C{'noinfo'} no tone information 846 will be deduced when no tone mark is found (takes on value C{None}), 847 if set to C{'ignore'} this entity will not be valid. 848 """ 849 super(TonalIPAOperator, self).__init__(**options) 850 851 if 'toneMarkType' in options: 852 if options['toneMarkType'] not in ['Numbers', 'ChaoDigits', 853 'IPAToneBar', 'Diacritics', 'None']: 854 raise ValueError("Invalid option '" \ 855 + str(options['toneMarkType']) \ 856 + "' for keyword 'toneMarkType'") 857 self.optionValue['toneMarkType'] = options['toneMarkType'] 858 859 # check if we have to be strict on tones, i.e. report missing tone info 860 if 'missingToneMark' in options: 861 if options['missingToneMark'] not in ['noinfo', 'ignore']: 862 raise ValueError("Invalid option '" \ 863 + str(options['missingToneMark']) \ 864 + "' for keyword 'missingToneMark'") 865 self.optionValue['missingToneMark'] = options['missingToneMark'] 866 867 self.toneMarkLookup = None 868 869 # split regex 870 self.splitRegex = re.compile('([\.\s]+)')

871 872 @classmethod

873 - def getDefaultOptions(cls):

874 options = super(TonalIPAOperator, cls).getDefaultOptions() 875 options.update({'toneMarkType': cls.DEFAULT_TONE_MARK_TYPE, 876 'missingToneMark': 'noinfo', 'preferTone': cls.TONE_MARK_PREFER}) 877 878 return options

879

def getTones(self):
    """
    Returns a copy of L{TONES}, extended by C{None} if entities may come
    without tone information, i.e. option C{'missingToneMark'} is
    C{'noinfo'} or option C{'toneMarkType'} is C{'None'}.

    @rtype: list
    @return: list of supported tones
    """
    toneList = list(self.TONES)
    toneMayBeMissing = self.getOption('missingToneMark') == 'noinfo' \
        or self.getOption('toneMarkType') == 'None'
    if toneMayBeMissing:
        toneList.append(None)
    return toneList

887

def decompose(self, string):
    """
    Splits the given string at periods and whitespace, the delimiters that
    L{compose()} writes, into basic entities and the text between them.

    The returned list contains a mix of basic reading entities and other
    characters, e.g. spaces and punctuation marks. The delimiters
    themselves are kept as list items.

    Single syllables can only be found if distinguished by a period or
    whitespace, such as L{compose()} would return.

    @type string: str
    @param string: reading string
    @rtype: list of str
    @return: a list of basic entities of the input string
    """
    segments = self.splitRegex.split(string)
    return segments

905

def compose(self, readingEntities):
    """
    Joins the given list of basic entities to a string. A period is
    inserted between two directly adjacent reading entities to mark the
    syllable boundary; all other content is joined unchanged.

    @type readingEntities: list of str
    @param readingEntities: list of basic entities or other content
    @rtype: str
    @return: composed entities
    """
    composed = []
    previousIsEntity = False
    for entity in readingEntities:
        currentIsEntity = self.isReadingEntity(entity)
        # two syllables in a row need an explicit boundary marker
        if previousIsEntity and currentIsEntity:
            composed.append(u'.')
        composed.append(entity)
        previousIsEntity = currentIsEntity

    return "".join(composed)

932

def getTonalEntity(self, plainEntity, tone):
    """
    Gets the entity with tone mark for the given plain entity and tone.

    The tone mark as given by L{TONE_MARK_MAPPING} is appended to the plain
    entity. No mark is added if the tone mark type is C{'None'} or the
    tone is C{None}.

    The entity returned will always be in Unicode's
    I{Normalization Form C} (NFC, see
    U{http://www.unicode.org/reports/tr15/}).

    @type plainEntity: str
    @param plainEntity: entity without tonal information
    @type tone: str
    @param tone: tone
    @rtype: str
    @return: entity with appropriate tone
    @raise InvalidEntityError: if the tone is not one of L{getTones()}.
    @todo Impl: Place diacritics on main vowel, derive from IPA
        representation.
    """
    if tone not in self.getTones():
        raise InvalidEntityError("Invalid tone information given for '" \
            + plainEntity + "': '" + str(tone) + "'")

    markType = self.getOption('toneMarkType')
    tonalEntity = plainEntity
    if markType != "None" and tone is not None:
        tonalEntity = tonalEntity + self.TONE_MARK_MAPPING[markType][tone]
    return unicodedata.normalize("NFC", tonalEntity)

960

def splitEntityTone(self, entity):
    """
    Splits the entity into an entity without tone mark and the name of the
    entity's tone.

    The tone mark is searched with the expression of L{TONE_MARK_REGEX}
    belonging to the current tone mark type. Without a tone mark the tone
    is C{None}, provided that option C{'missingToneMark'} is C{'noinfo'}
    or the tone mark type is C{'None'}.

    The plain entity returned will always be in Unicode's
    I{Normalization Form C} (NFC, see
    U{http://www.unicode.org/reports/tr15/}).

    @type entity: str
    @param entity: entity with tonal information
    @rtype: tuple
    @return: plain entity without tone mark and additionally the tone
    @raise InvalidEntityError: if no tone mark is found and missing tone
        marks are not accepted, or if the tone mark is unknown.
    """
    # work on the decomposed form so combining marks are single characters,
    # e.g. C{'â'} becomes C{'a\u0302'}
    decomposed = unicodedata.normalize("NFD", unicode(entity))

    markType = self.getOption('toneMarkType')
    if markType != 'None':
        found = self.TONE_MARK_REGEX[markType].search(decomposed)
        if found:
            toneMark = found.group(1)
            tone = self.getToneForToneMark(toneMark)
            # strip off tone mark
            stripped = decomposed.replace(toneMark, '')
            return unicodedata.normalize("NFC", stripped), tone
        if self.getOption('missingToneMark') != 'noinfo':
            raise InvalidEntityError("Invalid entity given for '" \
                + decomposed + "'")

    return unicodedata.normalize("NFC", decomposed), None

995

def getToneForToneMark(self, toneMark):
    """
    Gets the tone for the given tone mark.

    On first use a reverse lookup table is built from L{TONE_MARK_MAPPING}
    for the current tone mark type and cached in C{self.toneMarkLookup}.
    If several tones share one tone mark, the first tone of L{getTones()}
    wins unless L{TONE_MARK_PREFER} names another one.

    @type toneMark: str
    @param toneMark: tone mark representation of the tone
    @rtype: str
    @return: tone
    @raise InvalidEntityError: if the toneMark does not exist.
    """
    if self.toneMarkLookup is None:
        markType = self.getOption('toneMarkType')
        markForTone = self.TONE_MARK_MAPPING[markType]
        preferredTone = self.TONE_MARK_PREFER.get(markType, {})

        # create lookup dict; the loop must use its own variable name so
        # that the mark handed in by the caller is not overwritten
        lookup = {}
        for tone in self.getTones():
            if tone is None:
                continue
            mark = markForTone[tone]
            if mark not in lookup or preferredTone.get(mark) == tone:
                lookup[mark] = tone
        self.toneMarkLookup = lookup

    if toneMark in self.toneMarkLookup:
        return self.toneMarkLookup[toneMark]

    raise InvalidEntityError("Invalid tone mark given with '" \
        + toneMark + "'")

1025

class SimpleEntityOperator(ReadingOperator):
    """Provides an operator on readings with a single character per entity."""

    def decompose(self, string):
        """
        Decomposes the given string into single character reading entities
        and runs of other characters lying between them.

        @type string: str
        @param string: reading string
        @rtype: list of str
        @return: a list of basic entities of the input string
        """
        readingEntities = []
        # start index of the current run of non-entity characters, if any
        otherStart = None
        for index, char in enumerate(string):
            if self.isReadingEntity(char):
                if otherStart is not None:
                    readingEntities.append(string[otherStart:index])
                    otherStart = None
                readingEntities.append(char)
            elif otherStart is None:
                otherStart = index
        # trailing non-entity characters
        if otherStart is not None:
            readingEntities.append(string[otherStart:])
        return readingEntities

    def compose(self, readingEntities):
        """
        Composes the given list of basic entities to a string by plain
        concatenation.

        @type readingEntities: list of str
        @param readingEntities: list of basic entities or other content
        @rtype: str
        @return: composed entities
        """
        return ''.join(readingEntities)

    def isReadingEntity(self, entity):
        """
        Checks if the given character is an entity of this reading. Needs to
        be implemented by child classes.

        @type entity: str
        @param entity: entity to check
        @rtype: bool
        @return: true if the character is a reading entity, false otherwise
        @raise NotImplementedError: always, unless overridden.
        """
        # NotImplemented is a comparison constant and not callable; the
        # exception to signal an abstract method is NotImplementedError
        raise NotImplementedError()

1050

class HangulOperator(SimpleEntityOperator):
    """Provides an operator on Korean text written in X{Hangul}."""
    READING_NAME = "Hangul"

    def isReadingEntity(self, entity):
        """
        Checks if the given entity lies between C{'가'} and C{'힣'}
        (both inclusive).

        @type entity: str
        @param entity: entity to check
        @rtype: bool
        @return: true if the entity is in that range, false otherwise
        """
        return u'가' <= entity <= u'힣'

1058

class HiraganaOperator(SimpleEntityOperator):
    """Provides an operator on Japanese text written in X{Hiragana}."""
    READING_NAME = "Hiragana"

    def isReadingEntity(self, entity):
        """
        Checks if the given entity lies between C{'ぁ'} and C{'ゟ'}
        (both inclusive).

        @type entity: str
        @param entity: entity to check
        @rtype: bool
        @return: true if the entity is in that range, false otherwise
        """
        return u'ぁ' <= entity <= u'ゟ'

1066

class KatakanaOperator(SimpleEntityOperator):
    """Provides an operator on Japanese text written in X{Katakana}."""
    READING_NAME = "Katakana"

    def isReadingEntity(self, entity):
        """
        Checks if the given entity lies between C{'゠'} and C{'ヿ'}
        (both inclusive).

        @type entity: str
        @param entity: entity to check
        @rtype: bool
        @return: true if the entity is in that range, false otherwise
        """
        return u'゠' <= entity <= u'ヿ'

1074

class KanaOperator(SimpleEntityOperator):
    """
    Provides an operator on Japanese text written in a mix of X{Hiragana} and
    X{Katakana}.
    """
    READING_NAME = "Kana"

    def isReadingEntity(self, entity):
        """
        Checks if the given entity lies between C{'ぁ'} and C{'ヿ'}
        (both inclusive), the span covered by both Kana scripts.

        @type entity: str
        @param entity: entity to check
        @rtype: bool
        @return: true if the entity is in that range, false otherwise
        """
        return u'ぁ' <= entity <= u'ヿ'

1085

class PinyinOperator(TonalRomanisationOperator):
    ur"""
    Provides an operator for the Mandarin romanisation X{Hanyu Pinyin}.
    It can be configured to cope with different representations (I{"dialects"})
    of X{Pinyin}. For conversion between different representations the
    L{PinyinDialectConverter} can be used.

    Features:
        - tones marked by either diacritics or numbers,
        - alternative representation of I{ü}-character,
        - correct placement of apostrophes,
        - guessing of input form (I{reading dialect}),
        - support for Erhua and
        - splitting of syllables into onset and rhyme.

    Apostrophes
    ===========
    Pinyin syllables need to be separated by an X{apostrophe} in case their
    decomposition will get ambiguous. A famous example might be the city
    I{Xi'an}, which if written I{xian} would be read as one syllable, meaning
    e.g. 'fresh'. Another example would be I{Chang'an} which could be read
    I{chan'gan} if no delimiter is used in at least one of both cases.

    Different rules exist where to place apostrophes. A simple yet sufficient
    rule is implemented in L{aeoApostropheRule()} which is used as default in
    this class. Syllables starting with one of the three vowels I{a}, I{e}, I{o}
    will be separated. Remember that vowels [i], [u], [y] are represented as
    I{yi}, I{wu}, I{yu} respectively, thus making syllable boundaries clear.
    L{compose()} will place apostrophes where required when composing the
    reading string.

    An alternative rule can be specified to the constructor passing a function
    as an option C{PinyinApostropheFunction}. A possible function could be a
    rule separating all syllables by an apostrophe thus simplifying the reading
    process for beginners.

    On decomposition of strings it is important to check which of the possibly
    several choices will be the one actually meant. E.g. syllable I{xian} given
    above should always be segmented into one syllable, solution I{xi'an} is not
    an option in this case. Therefore an alternative to L{aeoApostropheRule()}
    should make sure it guarantees proper decomposition, which is tested through
    L{isStrictDecomposition()}.

    Last but not least C{compose(decompose(string))} will only be the identity
    if apostrophes are applied properly according to the rule as wrongly
    placed apostrophes will be kept when composing. Use L{removeApostrophes()}
    to remove separating apostrophes.

    Example
    -------

        >>> def noToneApostropheRule(precedingEntity, followingEntity):
        ...     return precedingEntity and precedingEntity[0].isalpha() \
        ...         and not precedingEntity[-1].isdigit() \
        ...         and followingEntity[0].isalpha()
        ...
        >>> from cjklib.reading import ReadingFactory
        >>> f = ReadingFactory()
        >>> f.convert('an3ma5mi5ba5ni2mou1', 'Pinyin', 'Pinyin',
        ...     sourceOptions={'toneMarkType': 'Numbers'},
        ...     targetOptions={'toneMarkType': 'Numbers',
        ...         'missingToneMark': 'fifth',
        ...         'PinyinApostropheFunction': noToneApostropheRule})
        u"an3ma'mi'ba'ni2mou1"

    R-colouring
    ===========
    The phenomenon X{Erhua} (兒化音/儿化音, Erhua yin), i.e. the X{r-colouring} of
    syllables, is found in the northern Chinese dialects and results from
    merging the formerly independent sound I{er} with the preceding syllable. In
    written form a word is followed by the character 兒/儿, e.g. 頭兒/头儿.

    In Pinyin the Erhua sound is quite often expressed by appending a single
    I{r} to the syllable of the character preceding 兒/儿, e.g. I{tóur} for
    頭兒/头儿, to stress the monosyllabic nature and in contrast to words like
    兒子/儿子 I{ér'zi} where 兒/儿 I{ér} constitutes a single syllable.

    For decomposing syllables in Pinyin it is thus important to decide if the
    I{r} marking r-colouring should be an entity on its own account stressing
    the representation in the character string with an own character or rather
    stressing the monosyllabic nature and being part of a syllable of the
    foregoing character. This can be configured on instantiation through the
    option C{Erhua}.

    Source
    ======
        - Yǐn Bīnyōng (尹斌庸), Mary Felley (傅曼丽): Chinese romanization:
            Pronunciation and Orthography (汉语拼音和正词法). Sinolingua, Beijing,
            1990, ISBN 7-80052-148-6, ISBN 0-8351-1930-0.

    @see:
        - Pinyin: U{http://www.pinyin.info/rules/where.html},
            U{http://www.pinyin.info/romanization/hanyu/apostrophes.html},
            U{http://www.pinyin.info/rules/initials_finals.html}
        - Erhua sound: U{http://en.wikipedia.org/wiki/Erhua}

    @todo Impl: ISO 7098 asks for conversion of C{。、·「」} to C{.,-«»}. What
        about C{，？《》：－}? Implement a method for conversion to be optionally
        used.
    @todo Impl: Strict testing of tone mark placement. Currently it doesn't
        matter where tones are placed. All combinations are recognised.
    @todo Impl: Special marker for neutral tone: 'mȧ' (u'm\u0227', reported by
        Ching-song Gene Hsiao: A Manual of Transcription Systems For Chinese,
        中文拼音手册. Far Eastern Publications, Yale University, New Haven,
        Connecticut, 1985, ISBN 0-88710-141-0.), and '·ma' (u'\xb7ma', check!:
        现代汉语词典（第5版）[Xiàndài Hànyǔ Cídiǎn 5. Edition]. 商务印书馆
        [Shāngwù Yìnshūguǎn], Beijing, 2005, ISBN 7-100-04385-9.)
    """
    READING_NAME = 'Pinyin'

    TONEMARK_VOWELS = [u'a', u'e', u'i', u'o', u'u', u'ü', u'n', u'm', u'r',
        u'ê']
    """
    List of characters of the nucleus possibly carrying the tone mark. I{n} is
    included in standalone syllables I{n} and I{ng}. I{r} is used for supporting
    I{Erhua} in a two syllable form.
    """
    TONEMARK_MAP = {u'\u0304': 1, u'\u0301': 2, u'\u030c': 3, u'\u0300': 4}
    """
    Mapping of I{Combining Diacritical Marks} to their Pinyin tone index.

    @see:
        - The Unicode Consortium: The Unicode Standard, Version 5.0.0,
            Chapter 7, European Alphabetic Scripts, 7.9 Combining Marks,
            defined by: The Unicode Standard, Version 5.0 (Boston, MA,
            Addison-Wesley, 2007. ISBN 0-321-48091-0),
            U{http://www.unicode.org/versions/Unicode5.0.0/}
        - Unicode: X{Combining Diacritical Marks}, Range: 0300-036F:
            U{http://www.unicode.org/charts/PDF/U0300.pdf}
        - Unicode: FAQ - Characters and Combining Marks:
            U{http://unicode.org/faq/char_combmark.html}
    """

    # case insensitive; group 1 is the onset, group 2 the run of vowels
    # (nucleus), group 3 the coda
    PINYIN_SOUND_REGEX \
        = re.compile(u'(?i)^([^aeiuoü]*)([aeiuoü]*)([^aeiuoü]*)$')
    """
    Regular Expression matching onset, nucleus and coda. Syllables 'n', 'ng',
    'r' (for Erhua) and 'ê' have to be handled separately.
    """
    # character class built from the combining marks of TONEMARK_MAP
    toneMarkRegex = re.compile(u'[' + re.escape(''.join(TONEMARK_MAP.keys())) \
        + ']')
    """Regular Expression matching the Pinyin tone marks."""
    tonemarkMapReverse = dict([(TONEMARK_MAP[mark], mark) \
        for mark in TONEMARK_MAP.keys()])
    # in Python 2 the list comprehension above leaks its loop variable into
    # the class namespace; remove it so it does not become a class attribute
    del mark
    """Reverse lookup of tone marks for tones provided by TONEMARK_MAP."""

def __init__(self, **options):
    u"""
    Creates an instance of the PinyinOperator.

    The class instance can be configured by different optional options given
    as keywords.

    @param options: extra options
    @keyword dbConnectInst: instance of a L{DatabaseConnector}, if none is
        given, default settings will be assumed.
    @keyword strictSegmentation: if C{True} segmentation (using
        L{segment()}) and thus decomposition (using L{decompose()}) will
        raise an exception if an alphabetic string is parsed which can not
        be segmented into single reading entities. If C{False} the aforesaid
        string will be returned unsegmented.
    @keyword toneMarkType: if set to C{'Diacritics'} tones will be marked
        using diacritic marks, if set to C{'Numbers'} appended numbers from
        1 to 5 will be used to mark tones, if set to C{'None'} no tone marks
        will be used and no tonal information will be supplied at all.
    @keyword missingToneMark: if set to C{'fifth'} no tone mark is set to
        indicate the fifth tone (I{qingsheng}, e.g. C{'wo3men'} stands for
        C{'wo3men5'}), if set to C{'noinfo'}, no tone information will be
        deduced when no tone mark is found (takes on value C{None}), if set
        to C{'ignore'} this entity will not be valid and for segmentation
        the behaviour defined by C{'strictSegmentation'} will take effect.
        This option is only valid for the tone mark type C{'Numbers'}.
    @keyword yVowel: a character (or string) that is taken as alternative
        for I{ü} which depicts (among others) the close front rounded vowel
        [y] (IPA) in Pinyin and includes an umlaut. Changes forms of
        syllables I{nü, nüe, lü, lüe}. This option is not valid for the
        tone mark type C{'Diacritics'}.
    @keyword PinyinApostrophe: an alternate apostrophe that is taken instead
        of the default one.
    @keyword PinyinApostropheFunction: a function that indicates when a
        syllable combination needs to be split by an I{apostrophe}, see
        L{aeoApostropheRule()} for the default implementation.
    @keyword Erhua: if set to C{'ignore'} no special support will be
        provided for retroflex -r at syllable end (I{Erhua}), i.e. I{zher}
        will raise an exception. If set to C{'twoSyllables'} syllables with
        an append r are given/will be segmented into two syllables, the -r
        suffix making up one syllable itself as C{'r'}. If set to
        C{'oneSyllable'} syllables with an appended r are given/will be
        segmented into one syllable only.
    @raise ValueError: if an option value is unsupported or an option is
        not valid in combination with the tone mark type.
    """
    super(PinyinOperator, self).__init__(**options)

    # tone mark type comes first, the following checks depend on it
    if 'toneMarkType' in options:
        toneMarkType = options['toneMarkType']
        if toneMarkType not in ['Diacritics', 'Numbers', 'None']:
            raise ValueError("Invalid option '" + str(toneMarkType) \
                + "' for keyword 'toneMarkType'")
        self.optionValue['toneMarkType'] = toneMarkType

    # strictness on tones, i.e. how to report missing tone info
    if 'missingToneMark' in options:
        if self.getOption('toneMarkType') != 'Numbers':
            raise ValueError("keyword 'missingToneMark' is only valid if" \
                + " tone mark type is set to 'Numbers'")

        missingToneMark = options['missingToneMark']
        if missingToneMark not in ['fifth', 'noinfo', 'ignore']:
            raise ValueError("Invalid option '" + str(missingToneMark) \
                + "' for keyword 'missingToneMark'")
        self.optionValue['missingToneMark'] = missingToneMark

    # alternative ü vowel, cannot carry a diacritic
    if 'yVowel' in options:
        yVowel = options['yVowel']
        if yVowel != u'ü' \
            and self.getOption('toneMarkType') == 'Diacritics':
            raise ValueError("keyword 'yVowel' is not valid for tone mark" \
                + " type 'Diacritics'")
        self.optionValue['yVowel'] = yVowel

    # alternative apostrophe and apostrophe rule are taken over unchecked
    for keyword in ('PinyinApostrophe', 'PinyinApostropheFunction'):
        if keyword in options:
            self.optionValue[keyword] = options[keyword]

    # type of Erhua support
    if 'Erhua' in options:
        erhua = options['Erhua']
        if erhua not in ['ignore', 'twoSyllables', 'oneSyllable']:
            raise ValueError("Invalid option '" + str(erhua) \
                + "' for keyword 'Erhua'")
        self.optionValue['Erhua'] = erhua

    # set split regular expression, works for all 3 main dialects, get at
    # least the whole alphabet to have a conservative recognition
    alternatives = [re.escape(vowel) \
        for vowel in self._getDiacriticVowels()]
    alternatives.append(re.escape(self.getOption('yVowel')))
    alternatives.append(u'[a-zêü]')
    self.readingEntityRegex = re.compile(u'(?i)((?:' \
        + '|'.join(alternatives) + u')+[12345]?)')

@classmethod
def getDefaultOptions(cls):
    u"""
    Returns the default options of the parent class extended by the Pinyin
    specific defaults: diacritical tone marks, missing tone marks carrying
    no information, I{ü} as vowel, C{'} as apostrophe, two syllable Erhua
    and L{aeoApostropheRule()} as apostrophe rule.

    @rtype: dict
    @return: the reading operator's default options.
    """
    defaults = super(PinyinOperator, cls).getDefaultOptions()
    pinyinDefaults = {
        'toneMarkType': 'Diacritics',
        'missingToneMark': 'noinfo',
        'yVowel': u'ü',
        'PinyinApostrophe': "'",
        'Erhua': 'twoSyllables',
        'PinyinApostropheFunction': cls.aeoApostropheRule,
        }
    defaults.update(pinyinDefaults)
    return defaults

@staticmethod
def _getDiacriticVowels():
    u"""
    Gets a list of Pinyin vowels with diacritical marks for tones, i.e.
    every character of L{TONEMARK_VOWELS} combined with every mark of
    L{TONEMARK_MAP}, each in Unicode's I{Normalization Form C}.

    The alternative for vowel ü does not need diacritical forms as the
    standard form doesn't allow changing the vowel.

    @rtype: list of str
    @return: list of Pinyin vowels with diacritical marks
    """
    return [unicodedata.normalize("NFC", vowel + mark) \
        for vowel in PinyinOperator.TONEMARK_VOWELS \
        for mark in PinyinOperator.TONEMARK_MAP.keys()]

@classmethod
def guessReadingDialect(cls, string, includeToneless=False):
    u"""
    Takes a string written in Pinyin and guesses the reading dialect.

    The basic options C{'toneMarkType'}, C{'yVowel'}, C{'PinyinApostrophe'}
    and C{'Erhua'} are guessed. Unless C{'includeToneless'} is set to
    C{True} only the tone mark types C{'Diacritics'} and C{'Numbers'} are
    considered as the latter one can also represent the state of missing
    tones. Strings tested for C{'yVowel'} are C{ü}, C{v} and C{u:}.
    C{'Erhua'} is set to C{'twoSyllables'} by default and only tested when
    C{'toneMarkType'} is assumed to be set to C{'Numbers'}.

    @type string: str
    @param string: Pinyin string
    @type includeToneless: bool
    @param includeToneless: if set to C{True} the tone mark type C{'None'}
        is chosen if less than one tenth of the entities found carry a
        tone mark (or no entity is found at all)
    @rtype: dict
    @return: dictionary of basic keyword settings
    """
    Y_VOWEL_LIST = [u'ü', 'v', 'u:']
    APOSTROPHE_LIST = ["'", u'’', u'´', u'‘', u'`', u'ʼ', u'ˈ', u'′', u'ʻ']
    readingStr = unicodedata.normalize("NFC", unicode(string))

    diacriticVowels = PinyinOperator._getDiacriticVowels()
    # split regex for all dialect forms; both lists have to be joined as one
    # so that the last diacritic vowel and the first ü form stay separate
    # alternatives
    entities = re.findall(u'(?i)((?:' \
        + '|'.join(diacriticVowels + Y_VOWEL_LIST) \
        + u'|[a-uw-zê])+[12345]?)', readingStr)

    # guess one of main dialects: tone mark type
    diacriticEntityCount = 0
    numberEntityCount = 0
    for entity in entities:
        # take entity (which can be several connected syllables) and check
        if entity[-1] in '12345':
            numberEntityCount = numberEntityCount + 1
        else:
            for vowel in diacriticVowels:
                if vowel in entity:
                    diacriticEntityCount = diacriticEntityCount + 1
                    break

    # compare statistics, integer arithmetic avoids dividing by zero when
    # the string contains no entity at all
    markedEntityCount = max(diacriticEntityCount, numberEntityCount)
    if includeToneless \
        and (not entities or 10 * markedEntityCount < len(entities)):
        # less than 1/10 units carry some possible tone mark, so decide
        # for toneless
        toneMarkType = 'None'
    elif diacriticEntityCount > numberEntityCount:
        toneMarkType = 'Diacritics'
    else:
        toneMarkType = 'Numbers'

    # guess ü vowel, diacritics only work with the standard form
    yVowel = u'ü'
    if toneMarkType != 'Diacritics':
        for vowel in Y_VOWEL_LIST:
            if vowel in readingStr:
                yVowel = vowel
                break

    # guess apostrophe
    PinyinApostrophe = "'"
    for apostrophe in APOSTROPHE_LIST:
        if apostrophe in readingStr:
            PinyinApostrophe = apostrophe
            break

    # guess Erhua
    Erhua = 'twoSyllables'
    if toneMarkType == 'Numbers':
        lastIndex = 0
        while lastIndex != -1:
            lastIndex = readingStr.find('r', lastIndex+1)
            if lastIndex > 1:
                if len(readingStr) > lastIndex + 1 \
                    and readingStr[lastIndex + 1] in '12345':
                    if readingStr[lastIndex - 1] in '12345':
                        # found a preceding number that should be a tone
                        # mark for another syllable, thus r5 is isolated
                        break
                    else:
                        # found trailing r
                        Erhua = 'oneSyllable'

    return {'toneMarkType': toneMarkType, 'yVowel': yVowel,
        'PinyinApostrophe': PinyinApostrophe, 'Erhua': Erhua}

1449

def getTones(self):
    """
    Returns the tone indices 1 to 5, extended by C{None} if no tone marks
    are used at all or if a missing tone number carries no tone
    information (C{'missingToneMark'} set to C{'noinfo'} for tone mark
    type C{'Numbers'}).

    @rtype: list
    @return: list of supported tones
    """
    toneList = [1, 2, 3, 4, 5]
    markType = self.getOption('toneMarkType')
    if markType == 'None':
        toneList.append(None)
    elif self.getOption('missingToneMark') == 'noinfo' \
        and markType == 'Numbers':
        toneList.append(None)
    return toneList

1457

def compose(self, readingEntities):
    """
    Composes the given list of basic entities to a string. The function
    given by option C{'PinyinApostropheFunction'} (by default
    L{aeoApostropheRule()}) is asked for every entity and its predecessor
    whether the apostrophe of option C{'PinyinApostrophe'} has to be
    placed between them.

    @type readingEntities: list of str
    @param readingEntities: list of basic syllables or other content
    @rtype: str
    @return: composed entities
    """
    pieces = []
    previous = None
    for current in readingEntities:
        # the rule is stored as a plain function, so pass the instance
        needsApostrophe = self.getOption('PinyinApostropheFunction')
        if needsApostrophe(self, previous, current):
            pieces.append(self.getOption('PinyinApostrophe'))
        pieces.append(current)
        previous = current
    return ''.join(pieces)

1479

def removeApostrophes(self, readingEntities):
    """
    Removes apostrophes between two syllables for a given decomposition.

    An apostrophe is only removed if it is directly preceded and followed
    by a reading entity. The list is scanned iteratively, so inputs of any
    length can be processed without hitting the recursion limit.

    @type readingEntities: list of str
    @param readingEntities: list of basic syllables or other content
    @rtype: list of str
    @return: the given entity list without separating apostrophes
    """
    apostrophe = self.getOption('PinyinApostrophe')
    entityCount = len(readingEntities)

    newReadingEntities = []
    index = 0
    while index < entityCount:
        newReadingEntities.append(readingEntities[index])
        if index + 2 < entityCount \
            and readingEntities[index + 1] == apostrophe \
            and self.isReadingEntity(readingEntities[index]) \
            and self.isReadingEntity(readingEntities[index + 2]):
            # apostrophe preceded and followed by a syllable, skip it and
            # continue with the following syllable
            index = index + 2
        else:
            index = index + 1
    return newReadingEntities

1505

def aeoApostropheRule(self, precedingEntity, followingEntity):
    """
    Checks if the given entities need to be separated by an apostrophe.

    Returns true for syllables starting with one of the three vowels I{a},
    I{e}, I{o} having a preceding syllable. Additionally forms I{n} and
    I{ng} (also with appended I{r}) are separated from preceding
    syllables. Furthermore the corner case I{e'r} is handled to
    distinguish it from I{er}.

    This function serves as the default apostrophe rule.

    @type precedingEntity: str
    @param precedingEntity: the preceding syllable or any other content
    @type followingEntity: str
    @param followingEntity: the following syllable or any other content
    @rtype: bool
    @return: true if the syllables need to be separated, false otherwise
    """
    # only two directly adjacent syllables can need a separator
    bothSyllables = precedingEntity \
        and self.isReadingEntity(precedingEntity) \
        and self.isReadingEntity(followingEntity)
    if not bothSyllables:
        return False

    followingPlain = self.splitEntityTone(followingEntity)[0]

    if followingPlain == 'r':
        # Erhua form e'r needs to be distinguished from syllable er
        precedingPlain = self.splitEntityTone(precedingEntity)[0]
        return precedingPlain == 'e'

    startsWithAEO = followingPlain[0] in ['a', 'e', 'o']
    return startsWithAEO or followingPlain in ['n', 'ng', 'nr', 'ngr']

1541

def isStrictDecomposition(self, readingEntities):
    """
    Checks if the given decomposition follows the Pinyin format
    strictly for unambiguous decomposition: syllables have to be preceded by
    an apostrophe if the decomposition would be ambiguous otherwise.

    The function given as option C{'PinyinApostropheFunction'} is used to
    check if an apostrophe should have been placed between two directly
    adjacent syllables.

    @type readingEntities: list of str
    @param readingEntities: decomposed reading string
    @rtype: bool
    @return: true if decomposition is strict, false otherwise
    """
    previousSyllable = None
    for current in readingEntities:
        if not self.isReadingEntity(current):
            # other content, treat next entity as first (start)
            previousSyllable = None
            continue

        # Pinyin syllable, a required apostrophe is missing here
        needsApostrophe = self.getOption('PinyinApostropheFunction')
        if needsApostrophe(self, previousSyllable, current):
            return False
        previousSyllable = current

    return True

1570

def getTonalEntity(self, plainEntity, tone):
    u"""
    Gets the entity with tone mark for the given plain entity and tone.

    Depending on option C{'toneMarkType'} the tone is either not marked at
    all, appended as number, or placed as diacritic on the syllable's
    nucleus following L{_placeNucleusToneMark()}.

    @type plainEntity: str
    @param plainEntity: entity without tonal information
    @type tone: int
    @param tone: tone index (starting with 1) or C{None}
    @rtype: str
    @return: entity with appropriate tone
    @raise InvalidEntityError: if the tone is not one of L{getTones()} or
        if, for diacritics, the syllable cannot be split into onset,
        nucleus and coda.
    """
    # get normalised Unicode string, e.g. C{'e\u0302'} to C{'ê'}
    plainEntity = unicodedata.normalize("NFC", unicode(plainEntity))

    if tone is not None:
        tone = int(tone)
    if tone not in self.getTones():
        raise InvalidEntityError("Invalid tone information given for '" \
            + plainEntity + "': '" + str(tone) + "'")

    toneMarkType = self.getOption('toneMarkType')
    if toneMarkType == 'None':
        return plainEntity

    elif toneMarkType == 'Numbers':
        if tone is None or (tone == 5 \
            and self.getOption('missingToneMark') == 'fifth'):
            return plainEntity
        else:
            return plainEntity + str(tone)

    elif toneMarkType == 'Diacritics':
        # split syllable into onset, nucleus and coda, handle nasal and ê
        # syllables independently
        lowerEntity = plainEntity.lower()
        if lowerEntity in ['n', 'ng', 'm', 'r', u'ê', 'nr', 'ngr', 'mr',
            u'êr']:
            onset, nucleus, coda = ('', plainEntity[0], plainEntity[1:])
        elif lowerEntity in ['hm', 'hng', 'hmr', 'hngr']:
            onset, nucleus, coda = (plainEntity[0], plainEntity[1],
                plainEntity[2:])
        else:
            matchObj = self.PINYIN_SOUND_REGEX.match(plainEntity)
            if matchObj is None:
                # e.g. two vowel groups divided by a consonant, report as
                # invalid entity instead of failing on the missing match
                raise InvalidEntityError( \
                    "cannot split into onset, nucleus and coda: '" \
                    + plainEntity + "'")
            onset, nucleus, coda = matchObj.group(1, 2, 3)
        if not nucleus:
            raise InvalidEntityError("no nucleus found for '" \
                + plainEntity + "'")
        # place tone mark
        tonalNucleus = self._placeNucleusToneMark(nucleus, tone)
        return onset + tonalNucleus + coda

1609

1610 - def _placeNucleusToneMark(self, nucleus, tone):

1611 """ 1612 Places a tone mark on the given syllable nucleus according to the rules 1613 of the Pinyin standard. 1614 1615 @see: Pinyin.info - Where do the tone marks go?, 1616 U{http://www.pinyin.info/rules/where.html}. 1617 1618 @type nucleus: str 1619 @param nucleus: syllable nucleus 1620 @type tone: int 1621 @param tone: tone index (starting with 1) 1622 @rtype: str 1623 @return: nucleus with appropriate tone 1624 """ 1625 # only tone mark to place for tones 0 - 3 1626 if tone != 5: 1627 if len(nucleus) == 1: 1628 # only one character in nucleus, place tone mark there 1629 tonalNucleus = nucleus + self.tonemarkMapReverse[tone] 1630 elif nucleus[0].lower() in ('a', 'e', 'o'): 1631 # if several vowels place on a, e, o... 1632 tonalNucleus = nucleus[0] + self.tonemarkMapReverse[tone] \ 1633 + nucleus[1:] 1634 else: 1635 # ...otherwise on second vowel (see Pinyin rules) 1636 tonalNucleus = nucleus[0] + nucleus[1] \ 1637 + self.tonemarkMapReverse[tone] + nucleus[2:] 1638 else: 1639 tonalNucleus = nucleus 1640 # get normalised Unicode string, 1641 return unicodedata.normalize("NFC", tonalNucleus)

1642

def splitEntityTone(self, entity):
    """
    Splits the entity into an entity without tone mark and the
    entity's tone index.

    For tone mark type C{'Numbers'} a missing tone number is treated as
    given by option C{'missingToneMark'}; for C{'Diacritics'} a missing
    mark stands for the fifth tone.

    The plain entity returned will always be in Unicode's
    I{Normalization Form C} (NFC, see
    U{http://www.unicode.org/reports/tr15/}).

    @type entity: str
    @param entity: entity with tonal information
    @rtype: tuple
    @return: plain entity without tone mark and entity's tone index
        (starting with 1)
    @raise InvalidEntityError: if no tone number is given and option
        C{'missingToneMark'} is set to C{'ignore'}.
    """
    # get decomposed Unicode string, e.g. C{'ū'} to C{'u\u0304'}
    decomposed = unicodedata.normalize("NFD", unicode(entity))
    markType = self.getOption('toneMarkType')

    if markType == 'None':
        plainEntity, tone = decomposed, None

    elif markType == 'Numbers':
        numberMatch = re.search(u"[12345]$", decomposed)
        if numberMatch:
            plainEntity, tone = decomposed[:-1], int(numberMatch.group(0))
        else:
            missingToneMark = self.getOption('missingToneMark')
            if missingToneMark == 'fifth':
                plainEntity, tone = decomposed, 5
            elif missingToneMark == 'ignore':
                raise InvalidEntityError("No tone information given for '" \
                    + decomposed + "'")
            else:
                plainEntity, tone = decomposed, None

    elif markType == 'Diacritics':
        # find character with tone marker
        markMatch = self.toneMarkRegex.search(decomposed)
        if markMatch:
            diacriticalMark = markMatch.group(0)
            tone = self.TONEMARK_MAP[diacriticalMark]
            # strip off diacritical mark
            plainEntity = decomposed.replace(diacriticalMark, '')
        else:
            # fifth tone doesn't have any marker
            plainEntity, tone = decomposed, 5

    # compose Unicode string (used for ê) and return with tone
    return unicodedata.normalize("NFC", plainEntity), tone

1694

def getPlainReadingEntities(self):
    u"""
    Gets the list of plain entities supported by this reading. Different to
    L{getReadingEntities()} the entities will carry no tone mark.

    The syllables are read from column C{Pinyin} of database table
    C{PinyinSyllables}. Depending on the type of Erhua support either
    additional syllables with an ending -r are added, or a single I{r} is
    included. Forms using the user specified character for vowel I{ü} are
    added next to the standard forms.

    @rtype: set of str
    @return: set of supported syllables
    @raise ValueError: if replacing I{ü} yields a syllable that already
        exists.
    """
    syllableTable = self.db.tables['PinyinSyllables']
    plainSyllables = set(self.db.selectScalars(
        select([syllableTable.c.Pinyin])))

    # support for Erhua if needed
    erhua = self.getOption('Erhua')
    if erhua == 'twoSyllables':
        # single 'r' for patterns like 'tóur'
        plainSyllables.add('r')
    elif erhua == 'oneSyllable':
        # add a -r form for all syllables except e and er
        plainSyllables.update([syllable + 'r' \
            for syllable in plainSyllables.copy() \
            if syllable not in ['e', 'er']])

    # add alternative forms for replacement of ü
    yVowel = self.getOption('yVowel')
    if yVowel != u'ü':
        for syllable in plainSyllables.copy():
            if u'ü' not in syllable:
                continue
            syllable = syllable.replace(u'ü', self.getOption('yVowel'))
            if syllable in plainSyllables:
                # through conversion we collide with an already existing
                # syllable
                raise ValueError("syllable '" + syllable \
                    + "' included more than once, " \
                    + u"probably bad substitute for 'ü'")
            plainSyllables.add(syllable)
    return plainSyllables

1733

def getReadingEntities(self):
    """
    Returns the set of all tonal entities of this reading.

    Every plain syllable is combined with every tone returned by
    L{getTones()}. The syllable I{r}, which only exists to support two
    syllable Erhua, is an exception: as it is not pronounced separately it
    is only combined with the fifth tone and, if entities without tone
    information are accepted, with tone C{None}.

    @rtype: set of str
    @return: set of supported tonal entities
    """
    tonalEntities = set()
    for plainSyllable in self.getPlainReadingEntities():
        if plainSyllable != 'r':
            syllableTones = self.getTones()
        else:
            syllableTones = [5]
            # also accept 'r' without any tone mark if that is allowed
            if None in self.getTones():
                syllableTones.append(None)

        tonalEntities.update([self.getTonalEntity(plainSyllable, toneIndex) \
            for toneIndex in syllableTones])
    return tonalEntities

1752

def getOnsetRhyme(self, plainSyllable):
    """
    Splits the given plain syllable into onset (initial) and rhyme (final).

    Pinyin can't be separated into onset and rhyme clearly within its own
    system. There are syllables with same finals written differently (e.g.
    I{wei} and I{dui} both ending in a final that can be described by
    I{uei}) and reduction of vowels (same example: I{dui} which is
    pronounced with vowels I{uei}). This method will use three forms not
    found as substrings in Pinyin (I{uei}, I{uen} and I{iou}) and
    substitutes (pseudo) initials I{w} and I{y} with its vowel equivalents.

    Furthermore final I{i} will be distinguished in three forms given by
    the following three examples: I{yi}, I{zhi} and I{zi} to express
    phonological difference.

    The syllable is handled case insensitively, both for the database
    lookup and for detecting Erhua forms.

    @type plainSyllable: str
    @param plainSyllable: syllable without tone marks
    @rtype: tuple of str
    @return: tuple of entity onset and rhyme
    @raise InvalidEntityError: if the entity is invalid.
    @raise UnsupportedError: for entity I{r} when Erhua is handled as
        separate entity.
    """
    erhuaOption = self.getOption('Erhua')
    # The lookup below ignores case, so the Erhua checks have to ignore it
    # as well. Otherwise 'Er' would be mistaken for the Erhua form of 'e'
    # and 'HUAR' would not be recognised as Erhua form of 'hua'.
    lowerSyllable = plainSyllable.lower()

    erhuaForm = False
    if erhuaOption == 'oneSyllable' and lowerSyllable.endswith('r') \
        and lowerSyllable != 'er':
        # strip the r-colouring and look up the underlying syllable
        plainSyllable = plainSyllable[:-1]
        lowerSyllable = plainSyllable.lower()
        erhuaForm = True

    elif lowerSyllable == 'r' and erhuaOption == 'twoSyllables':
        raise UnsupportedError("Not supported for '" + plainSyllable + "'")

    table = self.db.tables['PinyinInitialFinal']
    entry = self.db.selectRow(
        select([table.c.PinyinInitial, table.c.PinyinFinal],
            table.c.Pinyin == lowerSyllable))
    if not entry:
        raise InvalidEntityError("'" + plainSyllable \
            + "' not a valid plain Pinyin syllable'")

    if erhuaForm:
        # re-attach the r-colouring to the final
        return (entry[0], entry[1] + 'r')
    else:
        return (entry[0], entry[1])

1798

class WadeGilesOperator(TonalRomanisationOperator):
    u"""
    Provides an operator for the Mandarin X{Wade-Giles} romanisation.

    Features:
        - tones marked by either standard numbers or superscripts,
        - configurable apostrophe for marking aspiration and
        - placement of hyphens between syllables.

    @todo Lang: Get a good source for the syllables used. See also
        L{PinyinWadeGilesConverter}.
    @todo Lang: Respect mangled Wade-Giles writings. Possible steps: a)
        Warn/Error on syllables which are ambiguous when assuming
        apostrophes are omitted. b) 'hsu' is no valid syllable but can be
        viewed as 'hsü'. Compare to different 'implementations' of the
        Wade-Giles romanisation.
    """
    READING_NAME = 'WadeGiles'

    DB_ASPIRATION_APOSTROPHE = u"‘"
    """Default apostrophe used by Wade-Giles syllable data in database."""

    TO_SUPERSCRIPT = {1: u'¹', 2: u'²', 3: u'³', 4: u'⁴', 5: u'⁵'}
    """Mapping of tone numbers to superscript numbers."""
    # A generator expression keeps its loop variables out of the class
    # namespace, so no clean-up with 'del' is needed afterwards.
    FROM_SUPERSCRIPT = dict((superscript, number) \
        for number, superscript in TO_SUPERSCRIPT.items())
    """Mapping of superscript numbers to tone numbers."""

    def __init__(self, **options):
        """
        Creates an instance of the WadeGilesOperator.

        @param options: extra options
        @keyword dbConnectInst: instance of a L{DatabaseConnector}, if none is
            given, default settings will be assumed.
        @keyword strictSegmentation: if C{True} segmentation (using
            L{segment()}) and thus decomposition (using L{decompose()}) will
            raise an exception if an alphabetic string is parsed which can not
            be segmented into single reading entities. If C{False} the aforesaid
            string will be returned unsegmented.
        @keyword WadeGilesApostrophe: an alternate apostrophe that is taken
            instead of the default one.
        @keyword toneMarkType: if set to C{'Numbers'} appended numbers from 1 to
            5 will be used to mark tones, if set to C{'SuperscriptNumbers'}
            appended superscript numbers from 1 to 5 will be used to mark tones,
            if set to C{'None'} no tone marks will be used and no tonal
            information will be supplied at all.
        @keyword missingToneMark: if set to C{'fifth'} no tone mark is set to
            indicate the fifth tone (I{qingsheng}, e.g. C{'tsan2-men'} stands
            for C{'tsan2-men5'}), if set to C{'noinfo'}, no tone information
            will be deduced when no tone mark is found (takes on value C{None}),
            if set to C{'ignore'} this entity will not be valid and for
            segmentation the behaviour defined by C{'strictSegmentation'} will
            take effect.
        @raise ValueError: if an invalid value is given for C{'toneMarkType'}
            or C{'missingToneMark'}, or if C{'missingToneMark'} is given
            while tones are not marked by (superscript) numbers.
        """
        super(WadeGilesOperator, self).__init__(**options)
        # set alternate apostrophe if given
        if 'WadeGilesApostrophe' in options:
            self.optionValue['WadeGilesApostrophe'] \
                = options['WadeGilesApostrophe']
        # the regex always reflects the apostrophe finally in effect
        self.readingEntityRegex = re.compile(u"((?:" \
            + re.escape(self.getOption('WadeGilesApostrophe')) \
            + u"|[A-ZÜa-zü])+[12345¹²³⁴⁵]?)")

        # check which tone marks to use
        if 'toneMarkType' in options:
            if options['toneMarkType'] not in ['Numbers', 'SuperscriptNumbers',
                'None']:
                raise ValueError("Invalid option '" \
                    + str(options['toneMarkType']) \
                    + "' for keyword 'toneMarkType'")
            self.optionValue['toneMarkType'] = options['toneMarkType']

        # check behaviour on missing tone info
        if 'missingToneMark' in options:
            if self.getOption('toneMarkType') not in ['Numbers',
                'SuperscriptNumbers']:
                raise ValueError("keyword 'missingToneMark' is only valid if" \
                    + " tone mark type is set to 'Numbers' or " \
                    + "'SuperscriptNumbers'")

            if options['missingToneMark'] not in ['fifth', 'noinfo', 'ignore']:
                raise ValueError("Invalid option '" \
                    + str(options['missingToneMark']) \
                    + "' for keyword 'missingToneMark'")
            self.optionValue['missingToneMark'] = options['missingToneMark']

    @classmethod
    def getDefaultOptions(cls):
        """
        Returns the default options, extending those of the parent class by
        C{'WadeGilesApostrophe'}, C{'toneMarkType'} and
        C{'missingToneMark'}.

        @rtype: dict
        @return: the reading operator's default options.
        """
        options = super(WadeGilesOperator, cls).getDefaultOptions()
        options.update({
            'WadeGilesApostrophe': WadeGilesOperator.DB_ASPIRATION_APOSTROPHE,
            'toneMarkType': 'Numbers', 'missingToneMark': u'noinfo'})

        return options

    def getTones(self):
        """
        Returns the tones accepted under the current options.

        Tones 1 to 5 are always valid; this includes the fifth tone when
        option C{'missingToneMark'} is C{'fifth'}, matching the value
        L{splitEntityTone()} reports for an unmarked syllable in that case.
        C{None} (no tone information) is only valid if no tone marks are
        used at all or if C{'missingToneMark'} is C{'noinfo'}.

        @rtype: list
        @return: list of valid tones
        """
        tones = list(range(1, 6))
        if self.getOption('toneMarkType') == 'None' \
            or self.getOption('missingToneMark') == 'noinfo':
            tones.append(None)
        return tones

    def compose(self, readingEntities):
        """
        Composes the given list of basic entities to a string by applying a
        hyphen between syllables.

        @type readingEntities: list of str
        @param readingEntities: list of basic syllables or other content
        @rtype: str
        @return: composed entities
        """
        newReadingEntities = []
        precedingEntity = None
        for entity in readingEntities:
            # check if we have two syllables in a row
            if precedingEntity and self.isReadingEntity(precedingEntity) and \
                self.isReadingEntity(entity):
                # syllables are separated by a hyphen in the strict
                # interpretation of Wade-Giles
                newReadingEntities.append("-")
            newReadingEntities.append(entity)
            precedingEntity = entity
        return ''.join(newReadingEntities)

    def removeHyphens(self, readingEntities):
        """
        Removes hyphens between two syllables for a given decomposition.

        @type readingEntities: list of str
        @param readingEntities: list of basic syllables or other content
        @rtype: list of str
        @return: the given entity list without separating hyphens
        """
        # Iterative scan; works for long input without deep recursion or
        # repeated copying of the list's tail.
        newReadingEntities = []
        entityCount = len(readingEntities)
        index = 0
        while index < entityCount:
            newReadingEntities.append(readingEntities[index])
            if entityCount - index > 2 and readingEntities[index + 1] == "-" \
                and self.isReadingEntity(readingEntities[index]) \
                and self.isReadingEntity(readingEntities[index + 2]):
                # hyphen preceded and followed by a syllable, skip it
                index = index + 2
            else:
                index = index + 1
        return newReadingEntities

    def getTonalEntity(self, plainEntity, tone):
        """
        Gets the entity with tone mark for the given plain entity and tone.

        @type plainEntity: str
        @param plainEntity: entity without tonal information
        @type tone: int
        @param tone: tone index (starting with 1) or C{None}
        @rtype: str
        @return: entity with appropriate tone
        @raise InvalidEntityError: if the tone is not one of L{getTones()}.
        """
        if tone is not None:
            tone = int(tone)
        if tone not in self.getTones():
            raise InvalidEntityError("Invalid tone information given for '" \
                + plainEntity + "': '" + str(tone) + "'")

        if self.getOption('toneMarkType') == 'None':
            return plainEntity

        if tone is None or (tone == 5 \
            and self.getOption('missingToneMark') == 'fifth'):
            # nothing to append, the fifth tone is the unmarked one here
            return plainEntity
        else:
            if self.getOption('toneMarkType') == 'Numbers':
                return plainEntity + str(tone)
            elif self.getOption('toneMarkType') == 'SuperscriptNumbers':
                return plainEntity + self.TO_SUPERSCRIPT[tone]
        assert False

    def splitEntityTone(self, entity):
        """
        Splits the entity into an entity without tone mark and the entity's
        tone index.

        @type entity: str
        @param entity: entity with tonal information
        @rtype: tuple
        @return: plain entity without tone mark and entity's tone index
            (starting with 1), or C{None} if no tone information is given
        @raise InvalidEntityError: if no tone mark is found and option
            C{'missingToneMark'} is set to C{'ignore'}.
        """
        if self.getOption('toneMarkType') == 'None':
            plainEntity = entity
            tone = None

        else:
            tone = None
            if self.getOption('toneMarkType') == 'Numbers':
                matchObj = re.search(u"[12345]$", entity)
                if matchObj:
                    tone = int(matchObj.group(0))
            elif self.getOption('toneMarkType') == 'SuperscriptNumbers':
                matchObj = re.search(u"[¹²³⁴⁵]$", entity)
                if matchObj:
                    tone = self.FROM_SUPERSCRIPT[matchObj.group(0)]

            if tone:
                # strip the single trailing tone character
                plainEntity = entity[:-1]
            else:
                if self.getOption('missingToneMark') == 'fifth':
                    plainEntity = entity
                    tone = 5
                elif self.getOption('missingToneMark') == 'ignore':
                    raise InvalidEntityError("No tone information given for '" \
                        + entity + "'")
                else:
                    plainEntity = entity

        return plainEntity, tone

    def getPlainReadingEntities(self):
        """
        Gets the list of plain entities supported by this reading. Different to
        L{getReadingEntities()} the entities will carry no tone mark.

        Syllables will use the user specified apostrophe to mark aspiration.

        @rtype: set of str
        @return: set of supported syllables
        """
        plainSyllables = set(self.db.selectScalars(
            select([self.db.tables['WadeGilesSyllables'].c.WadeGiles])))
        # use selected apostrophe
        if self.getOption('WadeGilesApostrophe') \
            == self.DB_ASPIRATION_APOSTROPHE:
            return plainSyllables
        else:
            translatedSyllables = set()
            for syllable in plainSyllables:
                syllable = syllable.replace(self.DB_ASPIRATION_APOSTROPHE,
                    self.getOption('WadeGilesApostrophe'))
                translatedSyllables.add(syllable)
            return translatedSyllables
2026

2027 2028 -class GROperator(TonalRomanisationOperator):

2029 u""" 2030 Provides an operator for the Mandarin X{Gwoyeu Romatzyh} romanisation. 2031 2032 Features: 2033 - support of abbreviated forms (zh, j, g), 2034 - conversion of abbreviated forms to full forms, 2035 - placement of apostrophes before 0-initial syllables, 2036 - support for different apostrophe characters, 2037 - support for I{r-coloured} syllables (I{Erlhuah}) and 2038 - guessing of input form (I{reading dialect}). 2039 2040 Limitations: 2041 - abbreviated forms for multiple syllables are not supported, 2042 - syllable repetition markers as reported by some will currently not be 2043 parsed. 2044 2045 R-colouring 2046 =========== 2047 Gwoyeu Romatzyh renders X{rhotacised} syllables (X{Erlhuah}) by trying to 2048 give the actual pronunciation. As the effect of r-colouring loses the 2049 information of the underlying etymological syllable conversion between the 2050 r-coloured form back to the underlying form can not be done in an 2051 unambiguous way. As furthermore finals I{i}, I{iu}, I{in}, I{iun} contrast 2052 in the first and the second tone but not in the third and the fourth tone 2053 conversion between different tones (including the base form) cannot be made 2054 in a general manner: 小鸡儿 I{sheau-jiel} is different to 小街儿 2055 I{sheau-jie’l} but 几儿 I{jieel} equals 姐儿 I{jieel} (see Chao). 2056 2057 Thus this ReadingOperator lacks the general handling of syllable renderings 2058 and many methods narrow the range of syllables allowed. Unlike the original 2059 forms without r-colouring for Erlhuah forms the combination of a plain 2060 syllable with a specific tone is limited to the data given in the source, so 2061 operations involving tones may return with an L{UnsupportedError} if the 2062 given syllable isn't found with that tone. 2063 2064 Sources 2065 ======= 2066 - Yuen Ren Chao: A Grammar of Spoken Chinese. University of California 2067 Press, Berkeley, 1968, ISBN 0-520-00219-9. 
2068 2069 @see: 2070 - GR Junction by Richard Warmington: 2071 U{http://home.iprimus.com.au/richwarm/gr/gr.htm} 2072 - Article about Gwoyeu Romatzyh on the English Wikipedia: 2073 U{http://en.wikipedia.org/wiki/Gwoyeu_Romatzyh} 2074 2075 @todo Impl: Initial, medial, head, ending (ending1, ending2=l?) 2076 @todo Lang: Which character to use for optional neutral tone: C{'ₒ'} ? 2077 @todo Impl: Implement Erhua forms as stated in W. Simon: A Beginner's 2078 Chinese-English Dictionary. 2079 @todo Impl: Implement repetition markers as stated in W. Simon: A Beginner's 2080 Chinese-English Dictionary. 2081 @todo Impl: Implement a GRIPAConverter once IPA values are obtained for 2082 the PinyinIPAConverter. GRIPAConverter can work around missing Erhua 2083 conversion to Pinyin. 2084 @todo Lang: Special rule for non-Chinese names with initial r- to be 2085 transcribed with an r- cited by Ching-song Gene Hsiao: A Manual of 2086 Transcription Systems For Chinese, 中文拼音手册. Far Eastern Publications, 2087 Yale University, New Haven, Connecticut, 1985, ISBN 0-88710-141-0. 
2088 """ 2089 READING_NAME = 'GR' 2090 2091 TONES = ['1stTone', '2ndTone', '3rdTone', '4thTone', 2092 '5thToneEtymological1st', '5thToneEtymological2nd', 2093 '5thToneEtymological3rd', '5thToneEtymological4th', 2094 '1stToneOptional5th', '2ndToneOptional5th', '3rdToneOptional5th', 2095 '4thToneOptional5th'] 2096 2097 SYLLABLE_STRUCTURE = re.compile(r"^((?:tz|ts|ch|sh|[bpmfdtnlsjrgkh])?)" \ 2098 + "([aeiouy]+)((?:ngl|ng|n|l)?)$") 2099 """Regular expression describing the syllable structure in GR (C,V,C).""" 2100 2101 _syllableToneLookup = None 2102 """Holds the tonal syllable to plain syllable & tone lookup table.""" 2103 2104 _abbrConversionLookup = None 2105 """Holds the abbreviated entity lookup table.""" 2106 2107 DB_RHOTACISED_FINAL_MAPPING = {1: 'GRFinal_T1', 2: 'GRFinal_T2', 2108 3: 'GRFinal_T3', 4: 'GRFinal_T4'} 2109 """Database fields for tonal Erlhuah syllables.""" 2110 DB_RHOTACISED_FINAL_MAPPING_ZEROINITIAL = {1: 'GRFinal_T1', 2: 'GRFinal_T2', 2111 3: 'GRFinal_T3_ZEROINITIAL', 4: 'GRFinal_T4_ZEROINITIAL'} 2112 """Database fields for tonal Erlhuah syllables with i, u and iu medials.""" 2113 2114 DB_RHOTACISED_FINAL_APOSTROPHE = "'" 2115 """ 2116 Default apostrophe used by GR syllable data in database for marking the 2117 longer and back vowel in rhotacised finals. 2118 """ 2119

def __init__(self, **options):
    u"""
    Creates an instance of the GROperator.

    @param options: extra options
    @keyword dbConnectInst: instance of a L{DatabaseConnector}, if none is
        given, default settings will be assumed.
    @keyword strictSegmentation: if C{True} segmentation (using
        L{segment()}) and thus decomposition (using L{decompose()}) will
        raise an exception if an alphabetic string is parsed which can not
        be segmented into single reading entities. If C{False} the aforesaid
        string will be returned unsegmented.
    @keyword abbreviations: if set to C{True} abbreviated spellings will be
        supported.
    @keyword GRRhotacisedFinalApostrophe: an alternate apostrophe that is
        taken instead of the default one for marking a longer and back vowel
        in rhotacised finals.
    @keyword GRSyllableSeparatorApostrophe: an alternate apostrophe that is
        taken instead of the default one for separating 0-initial syllables
        from preceding ones.
    """
    super(GROperator, self).__init__(**options)

    # take over those operator specific options the caller supplied
    for optionName in ('abbreviations', 'GRRhotacisedFinalApostrophe',
        'GRSyllableSeparatorApostrophe'):
        if optionName in options:
            self.optionValue[optionName] = options[optionName]

    # an entity is an optional neutral tone marker followed by letters and
    # the apostrophe used inside rhotacised finals
    escapedApostrophe = re.escape(
        self.getOption('GRRhotacisedFinalApostrophe'))
    self.readingEntityRegex = re.compile(u"([\.ₒ]?(?:" \
        + escapedApostrophe \
        + "|[A-Za-z])+)")

@classmethod
def getDefaultOptions(cls):
    u"""
    Returns the default options, extending those of the parent class by
    C{'abbreviations'}, C{'GRRhotacisedFinalApostrophe'} and
    C{'GRSyllableSeparatorApostrophe'}.

    @rtype: dict
    @return: the reading operator's default options.
    """
    defaults = super(GROperator, cls).getDefaultOptions()
    grDefaults = {'abbreviations': True}
    # both apostrophe options share the same default character
    for optionName in ('GRRhotacisedFinalApostrophe',
        'GRSyllableSeparatorApostrophe'):
        grDefaults[optionName] = u"’"
    defaults.update(grDefaults)

    return defaults

@classmethod
def guessReadingDialect(cls, string, includeToneless=False):
    u"""
    Guesses the reading dialect of a string written in GR.

    Only the apostrophe is guessed: the first character from a fixed list
    of apostrophes and look-alikes that occurs in the string is reported
    for both C{'GRRhotacisedFinalApostrophe'} and
    C{'GRSyllableSeparatorApostrophe'}. If none occurs the plain
    apostrophe C{"'"} is reported.

    @type string: str
    @param string: GR string
    @type includeToneless: bool
    @param includeToneless: not evaluated by this implementation
    @rtype: dict
    @return: dictionary of basic keyword settings
    """
    # candidates in order of preference
    candidates = ["'", u'’', u'´', u'‘', u'`', u'ʼ', u'ˈ', u'′', u'ʻ']
    normalised = unicodedata.normalize("NFC", unicode(string))

    found = [candidate for candidate in candidates \
        if candidate in normalised]
    if found:
        guessed = found[0]
    else:
        guessed = "'"

    return {'GRRhotacisedFinalApostrophe': guessed,
        'GRSyllableSeparatorApostrophe': guessed}

2194

2195 - def getTones(self):

2196 return self.TONES[:]

2197

2198 - def compose(self, readingEntities):

2199 """ 2200 Composes the given list of basic entities to a string. Applies an 2201 apostrophe between syllables if the second syllable has a zero-initial. 2202 2203 @type readingEntities: list of str 2204 @param readingEntities: list of basic syllables or other content 2205 @rtype: str 2206 @return: composed entities 2207 """ 2208 newReadingEntities = [] 2209 precedingEntity = None 2210 2211 for entity in readingEntities: 2212 if precedingEntity and self.isReadingEntity(precedingEntity) \ 2213 and self.isReadingEntity(entity): 2214 2215 if entity[0] in ['a', 'e', 'i', 'o', 'u']: 2216 newReadingEntities.append( 2217 self.getOption('GRSyllableSeparatorApostrophe')) 2218 2219 newReadingEntities.append(entity) 2220 precedingEntity = entity 2221 2222 return ''.join(newReadingEntities)

2223

2224 - def isStrictDecomposition(self, readingEntities):

2225 precedingEntity = None 2226 for entity in readingEntities: 2227 if precedingEntity and self.isReadingEntity(precedingEntity) \ 2228 and self.isReadingEntity(entity): 2229 2230 if entity[0] in ['a', 'e', 'i', 'o', 'u']: 2231 return False 2232 2233 precedingEntity = entity 2234 2235 return True

2236

2237 - def _recursiveSegmentation(self, string):

2238 # overwrite method to deal with the apostrophe that can be both a part 2239 # of a syllable and a separator between syllables 2240 segmentationParts = [] 2241 substringIndex = 1 2242 while substringIndex <= len(string) and \ 2243 (self._hasSyllableSubstring(string[0:substringIndex].lower()) \ 2244 or string[0:substringIndex] == "'"): 2245 syllable = string[0:substringIndex] 2246 if self.isReadingEntity(syllable) or syllable == "'": 2247 remaining = string[substringIndex:] 2248 if remaining != '': 2249 remainingParts = self._recursiveSegmentation(remaining) 2250 if remainingParts != []: 2251 segmentationParts.append((syllable, remainingParts)) 2252 else: 2253 segmentationParts.append((syllable, None)) 2254 substringIndex = substringIndex + 1 2255 return segmentationParts

2256

2257 - def removeApostrophes(self, readingEntities):

2258 """ 2259 Removes apostrophes between two syllables for a given decomposition. 2260 2261 @type readingEntities: list of str 2262 @param readingEntities: list of basic syllables or other content 2263 @rtype: list of str 2264 @return: the given entity list without separating apostrophes 2265 """ 2266 if len(readingEntities) == 0: 2267 return [] 2268 elif len(readingEntities) > 2 and readingEntities[1] == "'" \ 2269 and self.isReadingEntity(readingEntities[0]) \ 2270 and self.isReadingEntity(readingEntities[2]): 2271 # apostrophe on pos #1 preceded and followed by a syllable 2272 newReadingEntities = [readingEntities[0]] 2273 newReadingEntities.extend(self.removeApostrophes( 2274 readingEntities[2:])) 2275 return newReadingEntities 2276 else: 2277 newReadingEntities = [readingEntities[0]] 2278 newReadingEntities.extend(self.removeApostrophes( 2279 readingEntities[1:])) 2280 return newReadingEntities

2281

2282 - def getBaseTone(self, tone):

2283 """ 2284 Gets the tone number of the tone or the etymological tone if it is a 2285 neutral or optional neutral tone. 2286 2287 @type tone: str 2288 @param tone: tone 2289 @rtype: int 2290 @return: base tone number 2291 @raise InvalidEntityError: if an invalid tone is passed. 2292 """ 2293 if tone not in self.getTones(): 2294 raise InvalidEntityError("Invalid tone information given for '" \ 2295 + unicode(tone) + "'") 2296 2297 if tone.startswith("5thToneEtymological"): 2298 return int(tone[-3]) 2299 else: 2300 return int(tone[0])

2301

2302 - def splitPlainSyllableCVC(self, plainSyllable):

2303 """ 2304 Splits the given plain syllable into consonants-vowels-consonants. 2305 2306 @type plainSyllable: str 2307 @param plainSyllable: entity without tonal information 2308 @rtype: tuple of str 2309 @return: syllable CVC triple 2310 @raise InvalidEntityError: if the entity is invalid. 2311 """ 2312 # split syllable into CVC parts 2313 matchObj = self.SYLLABLE_STRUCTURE.match(plainSyllable) 2314 if not matchObj: 2315 print plainSyllable 2316 raise InvalidEntityError("Invalid entity given for '" \ 2317 + plainSyllable + "'") 2318 2319 c1, v, c2 = matchObj.groups() 2320 return c1, v, c2

2321

def getTonalEntity(self, plainEntity, tone):
    """
    Gets the entity with tone mark for the given plain entity and tone. This
    method only works for plain syllables that are not r-coloured (Erlhuah
    forms) as due to the depiction of Erlhuah in GR the information about
    the base syllable is lost and pronunciation partly varies between
    different syllables. Use L{getRhotacisedTonalEntity()} to get the tonal
    entity for a given etymological (base) syllable.

    The tone is expressed by respelling the syllable. The spelling is
    derived from the base tone (see L{getBaseTone()}); neutral tones are
    prefixed with C{'.'} and optional neutral tones with C{'ₒ'}.

    @type plainEntity: str
    @param plainEntity: entity without tonal information
    @type tone: str
    @param tone: tone
    @rtype: str
    @return: entity with appropriate tone
    @raise InvalidEntityError: if the entity is invalid.
    @raise UnsupportedError: if the given entity is an Erlhuah form.
    """
    if tone not in self.getTones():
        raise InvalidEntityError("Invalid tone information given for '" \
            + plainEntity + "': '" + unicode(tone) + "'")

    # a plain syllable plus trailing 'l' is an Erlhuah form; 'el' is
    # explicitly exempted from this check
    if plainEntity.endswith('l') and plainEntity != 'el' \
        and self.isPlainReadingEntity(plainEntity[:-1]):
        raise UnsupportedError("Not supported for '" + plainEntity + "'")

    # split syllable into CVC parts
    c1, v, c2 = self.splitPlainSyllableCVC(plainEntity)
    # get tonal of etymological syllable
    baseTone = self.getBaseTone(tone)

    # Follow rules of "A Grammar of Spoken Chinese", pp. 29
    if baseTone == 1:
        if c1 not in ['m', 'n', 'l', 'r']:
            # Rule 1: the plain form is the first tone form
            tonalEntity = plainEntity
        else:
            # Rule 7: initials m, n, l, r get an 'h' inserted
            tonalEntity = c1 + 'h' + v + c2

    elif baseTone == 2:
        if c1 not in ['m', 'n', 'l', 'r']:
            # Rule 3
            if v == 'i' and not c2:
                # lone 'i' keeps its vowel and gets 'y' put in front
                tonalEntity = c1 + 'y' + v
            elif v[0] == 'i':
                # for rows 'i' and 'iu': leading 'i' is replaced by 'y'
                tonalEntity = c1 + 'y' + v[1:] + c2
            elif v == 'u' and not c2:
                # lone 'u' keeps its vowel and gets 'w' put in front
                tonalEntity = c1 + 'w' + v
            elif v[0] == 'u':
                # leading 'u' is replaced by 'w'
                tonalEntity = c1 + 'w' + v[1:] + c2
            else:
                # all other finals: 'r' follows the vowels
                tonalEntity = c1 + v + 'r' + c2
        else:
            # Rule 7: for initials m, n, l, r the plain form is used
            tonalEntity = plainEntity

    elif baseTone == 3:
        # NOTE(review): this chain has no final else; a vowel group that
        # matches none of the cases leaves tonalEntity unassigned.
        # Rule 4
        if len(v) == 1:
            # single vowel is doubled
            tonalEntity = c1 + v + v + c2
        elif v in ['ie', 'ei']:
            # 'e' is doubled
            tonalEntity = c1 + v[0] + 'e' + v[1] + c2
        elif v in ['ou', 'uo']:
            # 'o' is doubled
            tonalEntity = c1 + v[0] + 'o' + v[1] + c2
        # Rule 5
        elif v[0] == 'i':
            # for rows 'i' and 'iu', and final i
            tonalEntity = c1 + 'e' + v[1:] + c2
        elif v[0] == 'u':
            tonalEntity = c1 + 'o' + v[1:] + c2
        elif ('i' in v) or ('u' in v):
            # first non-leading 'i' becomes 'e', first 'u' becomes 'o'
            tonalEntity = c1 + v.replace('i', 'e', 1).replace('u', 'o', 1) \
                + c2

        # Rule 8: special spellings for syllables without initial
        if not c1:
            if tonalEntity == 'iee':
                tonalEntity = 'yee'
            elif tonalEntity == 'uoo':
                tonalEntity = 'woo'
            elif v[0] == 'i':
                # for rows 'i' and 'iu'
                tonalEntity = 'y' + tonalEntity
            elif v[0] == 'u':
                tonalEntity = 'w' + tonalEntity

    elif baseTone == 4:
        # NOTE(review): no branch handles a c2 other than '', 'n', 'ng'
        # or 'l'; tonalEntity stays unassigned in that case.
        # Rule 6
        if not c2:
            if v in ['i', 'iu', 'u']:
                tonalEntity = c1 + v + c2 + 'h'
            elif v.endswith('i'):
                # final 'i' becomes 'y'
                tonalEntity = c1 + v[:-1] + 'y' + c2
            elif v.endswith('u'):
                # final 'u' becomes 'w'
                tonalEntity = c1 + v[:-1] + 'w' + c2
            else:
                tonalEntity = c1 + v + c2 + 'h'
        elif c2 == 'n':
            tonalEntity = c1 + v + 'nn'
        elif c2 == 'ng':
            tonalEntity = c1 + v + 'nq'
        elif c2 == 'l':
            tonalEntity = c1 + v + 'll'

        # Rule 9: special spellings for syllables without initial
        if not c1:
            if tonalEntity == 'ih':
                tonalEntity = 'yih'
            elif tonalEntity == 'uh':
                tonalEntity = 'wuh'
            elif tonalEntity == 'inn':
                tonalEntity = 'yinn'
            elif tonalEntity == 'inq':
                tonalEntity = 'yinq'
            elif v[0] == 'i':
                # for rows 'i' and 'iu': leading letter is replaced by 'y'
                tonalEntity = 'y' + tonalEntity[1:]
            elif v[0] == 'u':
                # leading letter is replaced by 'w'
                tonalEntity = 'w' + tonalEntity[1:]

    # mark neutral tones ('5thToneEtymological...') with a leading dot and
    # optional neutral tones with a leading 'ₒ'
    if tone.startswith('5'):
        tonalEntity = '.' + tonalEntity
    elif tone.endswith('Optional5th'):
        tonalEntity = u'ₒ' + tonalEntity

    return tonalEntity

2450

2451 - def splitEntityTone(self, entity):

2452 if self._syllableToneLookup == None: 2453 self._syllableToneLookup = {} 2454 for plainEntity in self.getPlainReadingEntities(): 2455 for tone in self.getTones(): 2456 tonalEntity = self.getTonalEntity(plainEntity, tone) 2457 self._syllableToneLookup[tonalEntity] = (plainEntity, tone) 2458 2459 if entity not in self._syllableToneLookup: 2460 # don't work for Erlhuah forms 2461 if self.isReadingEntity(entity): 2462 raise UnsupportedError("Not supported for '" + entity + "'") 2463 else: 2464 raise InvalidEntityError("Invalid entity given for '" \ 2465 + entity + "'") 2466 2467 return self._syllableToneLookup[entity]

2468

def getRhotacisedTonalEntity(self, plainEntity, tone):
    """
    Returns the r-coloured form (Erlhuah form) with tone mark for the given
    plain entity and tone. The tonal final is read from the database, so
    only those entity-tone combinations stored there are supported.

    @type plainEntity: str
    @param plainEntity: entity without tonal information
    @type tone: str
    @param tone: tone
    @rtype: str
    @return: entity with appropriate tone
    @raise InvalidEntityError: if the entity is invalid.
    @raise UnsupportedError: if the given entity is an Erlhuah form or the
        syllable is not supported in this given tone.
    @todo Fix: Build lookup for performance reasons.
    """
    if tone not in self.getTones():
        raise InvalidEntityError("Invalid tone information given for '" \
            + plainEntity + "': '" + unicode(tone) + "'")

    isErlhuahForm = plainEntity.endswith('l') \
        and self.isPlainReadingEntity(plainEntity[:-1])
    if isErlhuahForm:
        raise UnsupportedError("Not supported for '" + plainEntity + "'")

    # split syllable into CVC parts
    initial, vowels, coda = self.splitPlainSyllableCVC(plainEntity)
    tableTone = self.getBaseTone(tone)

    # Rule 7 is not part of the table: for initials m, n, l, r the first
    # tone gets an 'h' inserted and the second tone uses the base form
    if initial in ['m', 'n', 'l', 'r']:
        if tableTone == 1:
            initial = initial + 'h'
        elif tableTone == 2:
            tableTone = 1

    # rows i-, u- and iu- without initial have a mapping of their own
    if initial or vowels[0] not in ['i', 'u']:
        columnMapping = self.DB_RHOTACISED_FINAL_MAPPING
    else:
        columnMapping = self.DB_RHOTACISED_FINAL_MAPPING_ZEROINITIAL
    column = columnMapping[tableTone]

    table = self.db.tables['GRRhotacisedFinals']
    tonalFinal = self.db.selectScalar(select([table.c[column]],
        table.c.GRFinal == vowels + coda))
    if not tonalFinal:
        raise UnsupportedError("No Erlhuah form for '" \
            + plainEntity + "' and tone '" + tone + "'")

    # replace the database's apostrophe by the selected one
    selectedApostrophe = self.getOption('GRRhotacisedFinalApostrophe')
    if selectedApostrophe != self.DB_RHOTACISED_FINAL_APOSTROPHE:
        tonalFinal = tonalFinal.replace(
            self.DB_RHOTACISED_FINAL_APOSTROPHE, selectedApostrophe)

    # mark neutral and optional neutral tones by their prefix
    if tone.startswith('5'):
        prefix = '.'
    elif tone.endswith('Optional5th'):
        prefix = u'ₒ'
    else:
        return initial + tonalFinal

    return prefix + (initial + tonalFinal)

2533

def _getAbbreviatedLookup(self):
    """
    Gets the abbreviated form lookup table. The table is read from the
    database on first use and cached afterwards.

    Abbreviated forms that equal a full reading entity are left out. An
    abbreviated form that is given for more than one original entity is
    ambiguous and is mapped to C{None}.

    @rtype: dict
    @return: lookup table of abbreviated forms, mapping the abbreviated
        form to its original entity or to C{None} if ambiguous
    """
    if self._abbrConversionLookup is None:
        self._abbrConversionLookup = {}

        fullEntities = self.getFullReadingEntities()

        table = self.db.tables['GRAbbreviation']
        result = self.db.selectRows(
            select([table.c.GR, table.c.GRAbbreviation], distinct=True))
        for originalEntity, abbreviatedEntity in result:
            # don't convert proper entities
            if abbreviatedEntity in fullEntities:
                continue

            if abbreviatedEntity in self._abbrConversionLookup:
                # ambiguous mapping, keep the marker instead of silently
                # preferring the entity read last
                self._abbrConversionLookup[abbreviatedEntity] = None
            else:
                self._abbrConversionLookup[abbreviatedEntity] \
                    = originalEntity

    return self._abbrConversionLookup

2561

def getAbbreviatedEntities(self):
    """
    Gets the abbreviated GR spellings, i.e. the keys of the abbreviated
    form lookup table.

    @rtype: list
    @return: list of abbreviated GR forms
    """
    abbreviationLookup = self._getAbbreviatedLookup()
    return abbreviationLookup.keys()

2570

def isAbbreviatedEntity(self, entity):
    """
    Returns true if the given entity is an abbreviated spelling.

    Reading entities will be handled as being case insensitive: the entity
    is lower-cased before it is looked up.

    @type entity: str
    @param entity: entity to check
    @rtype: bool
    @return: C{True} if entity is an abbreviated form.
    """
    # NOTE(review): assumes the lookup table holds lower case spellings
    return entity.lower() in self._getAbbreviatedLookup()


def convertAbbreviatedEntity(self, entity):
    """
    Converts the given abbreviated GR spelling to the original form.
    Non-abbreviated forms will be returned unchanged. Takes care of
    capitalisation: an upper case entity yields an upper case result, a
    title case entity a capitalised one.

    @type entity: str
    @param entity: reading entity.
    @rtype: str
    @return: original entity
    @raise AmbiguousConversionError: if conversion is ambiguous.
    @todo Fix: Move this method to the Converter, AmbiguousConversionError
        not needed for import here then
    """
    if not self.isAbbreviatedEntity(entity):
        return entity

    # look up with the same case folding isAbbreviatedEntity() applies,
    # otherwise capitalised input would never be found
    originalEntity = self._getAbbreviatedLookup()[entity.lower()]
    if originalEntity is None:
        raise AmbiguousConversionError("conversion for entity '" \
            + entity + "' is ambiguous")

    # transfer the capitalisation of the input to the result
    if entity.isupper():
        originalEntity = originalEntity.upper()
    elif entity.istitle():
        originalEntity = originalEntity.capitalize()

    return originalEntity

2612

def getPlainReadingEntities(self):
    """
    Gets the plain entities supported by this reading, read from column
    C{GR} of the table C{GRSyllables}. R-coloured forms (Erlhuah forms) are
    not included. Different to L{getReadingEntities()} the entities will
    carry no tone mark.

    @rtype: set of str
    @return: set of supported syllables
    """
    syllableColumn = self.db.tables['GRSyllables'].c.GR
    plainEntities = self.db.selectScalars(select([syllableColumn]))
    return set(plainEntities)

2624

def getFullReadingEntities(self):
    """
    Gets a set of full entities supported by the reading excluding
    abbreviated forms.

    Every plain syllable is combined with every tone, once as regular tonal
    entity and once as r-coloured (Erlhuah) entity.

    @rtype: set of str
    @return: set of supported syllables
    """
    plainSyllables = self.getPlainReadingEntities()

    # regular tonal forms
    fullEntities = set(self.getTonalEntity(plainSyllable, tone)
        for plainSyllable in plainSyllables
        for tone in self.getTones())

    # Erlhuah
    for plainSyllable in plainSyllables:
        for tone in self.getTones():
            try:
                rhotacised = self.getRhotacisedTonalEntity(plainSyllable,
                    tone)
            except UnsupportedError:
                # ignore errors about tone combinations that don't exist
                continue
            fullEntities.add(rhotacised)

    return fullEntities

2652

def getReadingEntities(self):
    """
    Gets the set of all entities supported by the reading: the full
    entities together with the abbreviated forms.

    @rtype: set of str
    @return: set of supported syllables
    """
    entities = self.getFullReadingEntities()
    for abbreviatedEntity in self.getAbbreviatedEntities():
        entities.add(abbreviatedEntity)
    return entities

2658

def isReadingEntity(self, entity):
    """
    Checks whether the given entity is a valid entity of this reading.

    The check is delegated explicitly to the implementation of
    L{RomanisationOperator}, bypassing whatever the classes in between
    define.

    @type entity: str
    @param entity: entity to check
    @return: result of C{RomanisationOperator.isReadingEntity()}
    """
    # overwrite default method, use lookup dictionary
    return RomanisationOperator.isReadingEntity(self, entity)

2662

class MandarinIPAOperator(TonalIPAOperator):
    u"""
    Provides an operator on strings in Mandarin Chinese written in the
    I{International Phonetic Alphabet} (I{IPA}).

    Features:
        - Tones can be marked either with tone numbers (1-4), tone contour
            numbers (e.g. 214), IPA tone bar characters or IPA diacritics,
        - support for low third tone (1/2 third tone) with tone contour 21,
        - four levels of the neutral tone for varying stress depending on the
            preceding syllable and
        - splitting of syllables into onset and rhyme using method
            L{getOnsetRhyme()}.

    Tones
    =====
    Tones in IPA can be expressed using different schemes. The following schemes
    are implemented here:
        - Numbers, regular tone numbers from 1 to 5 for first tone to fifth
            (qingsheng),
        - ChaoDigits, numbers displaying the levels of tone contours, e.g.
            214 for the regular third tone,
        - IPAToneBar, IPA modifying tone bar characters, e.g. ɕi˨˩˦,
        - Diacritics, diacritical marks and finally
        - None, no support for tone marks

    Unlike other operators for Mandarin, distinction is made for six different
    tonal occurrences. The third tone is affected by tone sandhi and basically
    two different tone contours exist. Therefore L{getTonalEntity()} and
    L{splitEntityTone()} work with string representations as tones defined in
    L{TONES}. Same behaviour as found in other operators for Mandarin can be
    achieved by simply using the first character of the given string:

        >>> from cjklib.reading import operator
        >>> ipaOp = operator.MandarinIPAOperator(toneMarkType='IPAToneBar')
        >>> syllable, toneName = ipaOp.splitEntityTone(u'mən˧˥')
        >>> tone = int(toneName[0])

    The implemented schemes render tone information differently. Mapping might
    lose information so a full back-transformation can not be guaranteed.

    Source
    ======
        - Yuen Ren Chao: A Grammar of Spoken Chinese. University of California
            Press, Berkeley, 1968, ISBN 0-520-00219-9.
    """
    READING_NAME = "MandarinIPA"

    TONE_MARK_PREFER = {'Numbers': {'3': '3rdToneRegular', '5': '5thTone'},
        'ChaoDigits': {}, 'IPAToneBar': {}, 'Diacritics': {}}

    TONES = ['1stTone', '2ndTone', '3rdToneRegular', '3rdToneLow',
        '4thTone', '5thTone', '5thToneHalfHigh', '5thToneMiddle',
        '5thToneHalfLow', '5thToneLow']

    TONE_MARK_MAPPING = {'Numbers': {'1stTone': '1', '2ndTone': '2',
            '3rdToneRegular': '3', '3rdToneLow': '3', '4thTone': '4',
            '5thTone':'5', '5thToneHalfHigh': '5', '5thToneMiddle': '5',
            '5thToneHalfLow': '5', '5thToneLow': '5'},
        'ChaoDigits': {'1stTone': '55', '2ndTone': '35',
            '3rdToneRegular': '214', '3rdToneLow': '21', '4thTone': '51',
            '5thTone':'', '5thToneHalfHigh': '', '5thToneMiddle': '',
            '5thToneHalfLow': '', '5thToneLow': ''},
        'IPAToneBar': {'1stTone': u'˥˥', '2ndTone': u'˧˥',
            '3rdToneRegular': u'˨˩˦', '3rdToneLow': u'˨˩', '4thTone': u'˥˩',
            '5thTone':'', '5thToneHalfHigh': u'꜉', '5thToneMiddle': u'꜊',
            '5thToneHalfLow': u'꜋', '5thToneLow': u'꜌'},
        # TODO
        #'Diacritics': {'1stTone': u'\u0301', '2ndTone': u'\u030c',
            #'3rdToneRegular': u'\u0301\u0300\u0301', '3rdToneLow': u'\u0300',
            #'4thTone': u'\u0302', '5thTone': u'', '5thToneHalfHigh': '',
            #'5thToneMiddle': '', '5thToneHalfLow': '', '5thToneLow': ''}
        }

    def getPlainReadingEntities(self):
        """
        Gets the list of plain entities supported by this reading. These
        entities will carry no tone mark.

        @rtype: set of str
        @return: set of supported syllables
        """
        table = self.db.tables['MandarinIPAInitialFinal']
        return set(self.db.selectScalars(select([table.c.IPA])))

    def getOnsetRhyme(self, plainSyllable):
        """
        Splits the given plain syllable into onset (initial) and rhyme (final).

        @type plainSyllable: str
        @param plainSyllable: syllable in IPA without tone marks
        @rtype: tuple of str
        @return: tuple of syllable onset and rhyme
        @raise InvalidEntityError: if the entity is invalid (e.g. syllable
            nucleus or tone invalid).
        """
        table = self.db.tables['MandarinIPAInitialFinal']
        # keep the row as returned by the database: it is accessed by
        # position below and may be missing for an unknown syllable
        entry = self.db.selectRow(
            select([table.c.IPAInitial, table.c.IPAFinal],
                table.c.IPA == plainSyllable))
        if not entry:
            raise InvalidEntityError("'" + plainSyllable \
                + "' not a valid IPA form in this system'")
        return (entry[0], entry[1])

2768

class MandarinBrailleOperator(ReadingOperator):
    u"""
    Provides an operator on strings written in the X{Braille} system.
    """
    READING_NAME = "MandarinBraille"

    TONEMARKS = [u'⠁', u'⠂', u'⠄', u'⠆', '']

    def __init__(self, **options):
        """
        Creates an instance of the MandarinBrailleOperator.

        @param options: extra options
        @keyword dbConnectInst: instance of a L{DatabaseConnector}, if none is
            given, default settings will be assumed.
        """
        super(MandarinBrailleOperator, self).__init__(**options)

        # split regex
        initials = ''.join(self.db.selectScalars(
            select([self.db.tables['PinyinBrailleInitialMapping'].c.Braille],
                distinct=True)))
        finals = ''.join(self.db.selectScalars(
            select([self.db.tables['PinyinBrailleFinalMapping'].c.Braille],
                distinct=True)))
        # initial and final optional (but at least one), tone optional
        self.splitRegex = re.compile(u'((?:(?:[' + re.escape(initials) \
            + '][' + re.escape(finals) + ']?)|['+ re.escape(finals) \
            + u'])[' + re.escape(''.join(self.TONEMARKS)) + ']?)')
        # runs of characters from the Braille patterns block (U+2800 to
        # U+28FF) or runs of anything else; escapes are used as the first
        # character of the block is an invisible blank cell
        self.brailleRegex = re.compile(
            u'([\u2800-\u28ff]+|[^\u2800-\u28ff]+)')

    def getTones(self):
        """
        Returns the tones supported by the reading.

        @rtype: list
        @return: supported tones, the numbers 1 to 5
        """
        return range(1, 6)

    def decompose(self, string):
        """
        Decomposes the given string into basic entities that can be mapped to
        one Chinese character each (exceptions possible).

        The given input string can contain other non reading characters, e.g.
        punctuation marks.

        The returned list contains a mix of basic reading entities and other
        characters e.g. spaces and punctuation marks.

        @type string: str
        @param string: reading string
        @rtype: list of str
        @return: a list of basic entities of the input string
        """
        entities = []
        for part in self.splitRegex.split(string):
            # further splitting of Braille and non-Braille parts; findall()
            # returns nothing for the empty strings left by split(), which
            # removes them from the result
            entities.extend(self.brailleRegex.findall(part))
        return entities

    def compose(self, readingEntities):
        """
        Composes the given list of basic entities to a string.

        No special treatment is given for subsequent Braille entities. Use
        L{getSpaceSeparatedEntities()} to insert spaces between two Braille
        syllables.

        @type readingEntities: list of str
        @param readingEntities: list of basic entities or other content
        @rtype: str
        @return: composed entities
        """
        return "".join(readingEntities)

    def getSpaceSeparatedEntities(self, readingEntities):
        """
        Inserts spaces between two Braille entities for a given list of
        reading entities.

        Spaces in the Braille system are applied between words. This is not
        reflected here and instead a space will be added between single
        syllables.

        @type readingEntities: list of str
        @param readingEntities: list of basic entities or other content
        @rtype: list of str
        @return: entities with spaces inserted between Braille sequences
        """
        def isBrailleChar(char):
            return char >= u'\u2800' and char <= u'\u28ff'

        newReadingEntities = []
        lastIsBraille = False
        for entity in readingEntities:
            # an entity counts as Braille if its first character is
            isBraille = len(entity) > 0 and isBrailleChar(entity[0])
            # separate two following entities with a space
            if lastIsBraille and isBraille:
                newReadingEntities.append(u' ')
            newReadingEntities.append(entity)
            lastIsBraille = isBraille
        return newReadingEntities

    def getTonalEntity(self, plainEntity, tone):
        """
        Gets the entity with tone mark for the given plain entity and tone.

        @type plainEntity: str
        @param plainEntity: entity without tonal information
        @type tone: int
        @param tone: tone, one of the values of L{getTones()}
        @rtype: str
        @return: entity with appropriate tone
        @raise InvalidEntityError: if the tone is invalid.
        """
        if tone not in self.getTones():
            raise InvalidEntityError("Invalid tone information given for '" \
                + plainEntity + "': '" + str(tone) + "'")
        return plainEntity + self.TONEMARKS[tone-1]

    def splitEntityTone(self, entity):
        """
        Splits the entity into an entity without tone mark and the
        entity's tone.

        An entity that does not end in one of the L{TONEMARKS} is returned
        unchanged together with tone 5.

        @type entity: str
        @param entity: entity with tonal information
        @rtype: tuple
        @return: plain entity without tone mark and additionally the tone
        """
        if entity[-1] in self.TONEMARKS:
            return entity[:-1], self.TONEMARKS.index(entity[-1]) + 1
        else:
            return entity, 5

    def isReadingEntity(self, entity):
        """
        Returns true if the given entity is a valid Braille entity: an
        initial and/or final found in the Braille mapping tables, optionally
        followed by a tone mark.

        @type entity: str
        @param entity: entity to check
        @rtype: bool
        @return: C{True} if the entity is valid
        """
        if not entity:
            return False

        try:
            plainEntity, _ = self.splitEntityTone(entity)
            if not plainEntity:
                return False

            initial, final = self.getOnsetRhyme(plainEntity)

            finalTable = self.db.tables['PinyinBrailleFinalMapping']
            if final and self.db.selectScalar(select([finalTable.c['Braille']],
                finalTable.c['Braille'] == final, distinct=True)) is None:
                return False

            initialTable = self.db.tables['PinyinBrailleInitialMapping']
            if initial and self.db.selectScalar(select(
                [initialTable.c['Braille']],
                initialTable.c['Braille'] == initial, distinct=True)) is None:
                return False

            return True
        except InvalidEntityError:
            return False

    def getOnsetRhyme(self, plainSyllable):
        """
        Splits the given plain syllable into onset (initial) and rhyme (final).

        A single character is returned as rhyme if it is found in the table
        of finals, and as onset otherwise.

        @type plainSyllable: str
        @param plainSyllable: syllable without tone marks
        @rtype: tuple of str
        @return: tuple of syllable onset and rhyme
        @raise InvalidEntityError: if the entity is invalid.
        """
        if len(plainSyllable) == 1:
            finalTable = self.db.tables['PinyinBrailleFinalMapping']
            if self.db.selectScalar(
                select([finalTable.c.Braille],
                    finalTable.c.Braille == plainSyllable,
                    distinct=True)) is not None:
                return '', plainSyllable
            else:
                return plainSyllable, ''
        elif len(plainSyllable) == 2:
            return plainSyllable[0], plainSyllable[1]
        else:
            raise InvalidEntityError("Invalid plain entity given with '" \
                + plainSyllable + "'")

2965

class JyutpingOperator(TonalRomanisationOperator):
    """
    Provides an operator for the Cantonese romanisation X{Jyutping} made by the
    X{Linguistic Society of Hong Kong} (X{LSHK}).

    @see:
        - The Linguistic Society of Hong Kong Cantonese Romanization Scheme:
            U{http://lshk.ctl.cityu.edu.hk/cantonese.php}
    """
    READING_NAME = 'Jyutping'
    readingEntityRegex = re.compile(u"([A-Za-z]+[123456]?)")

    def __init__(self, **options):
        """
        Creates an instance of the JyutpingOperator.

        @param options: extra options
        @keyword dbConnectInst: instance of a L{DatabaseConnector}, if none is
            given, default settings will be assumed.
        @keyword strictSegmentation: if C{True} segmentation (using
            L{segment()}) and thus decomposition (using L{decompose()}) will
            raise an exception if an alphabetic string is parsed which can not
            be segmented into single reading entities. If C{False} the aforesaid
            string will be returned unsegmented.
        @keyword toneMarkType: if set to C{'Numbers'} the default form of
            appended numbers from 1 to 6 will be used to mark tones, if set to
            C{'None'} no tone marks will be used and no tonal information will
            be supplied at all.
        @keyword missingToneMark: if set to C{'noinfo'} no tone information
            will be deduced when no tone mark is found (takes on value C{None}),
            if set to C{'ignore'} this entity will not be valid and for
            segmentation the behaviour defined by C{'strictSegmentation'} will
            take effect.
        @raise ValueError: if an invalid value is given for C{toneMarkType}
            or C{missingToneMark}.
        """
        super(JyutpingOperator, self).__init__(**options)

        # check which tone marks to use
        if 'toneMarkType' in options:
            if options['toneMarkType'] not in ['Numbers', 'None']:
                raise ValueError("Invalid option '" \
                    + str(options['toneMarkType']) \
                    + "' for keyword 'toneMarkType'")
            self.optionValue['toneMarkType'] = options['toneMarkType']

        # check if we have to be strict on tones, i.e. report missing tone info
        if 'missingToneMark' in options:
            if options['missingToneMark'] not in ['noinfo', 'ignore']:
                raise ValueError("Invalid option '" \
                    + str(options['missingToneMark']) \
                    + "' for keyword 'missingToneMark'")
            self.optionValue['missingToneMark'] = options['missingToneMark']

    @classmethod
    def getDefaultOptions(cls):
        """
        Gets the default options: those of the parent class extended by
        C{toneMarkType} (C{'Numbers'}) and C{missingToneMark} (C{'noinfo'}).

        @return: the reading operator's default options.
        """
        options = super(JyutpingOperator, cls).getDefaultOptions()
        options.update({'toneMarkType': 'Numbers', 'missingToneMark': 'noinfo'})

        return options

    def getTones(self):
        """
        Returns the tones supported by the reading.

        @rtype: list
        @return: the tones 1 to 6, followed by C{None} unless missing tone
            marks are set to be ignored while tone marks are in use
        """
        # an explicit list, as the value for "no tone" is appended below
        tones = list(range(1, 7))
        if self.getOption('missingToneMark') != 'ignore' \
            or self.getOption('toneMarkType') == 'None':
            tones.append(None)
        return tones

    def compose(self, readingEntities):
        """
        Composes the given list of basic entities to a string by plain
        concatenation.

        @type readingEntities: list of str
        @param readingEntities: list of basic entities or other content
        @rtype: str
        @return: composed entities
        """
        return "".join(readingEntities)

    def getTonalEntity(self, plainEntity, tone):
        """
        Gets the entity with tone mark for the given plain entity and tone.

        @type plainEntity: str
        @param plainEntity: entity without tonal information
        @param tone: tone, or C{None} for no tone information
        @rtype: str
        @return: entity with appropriate tone
        @raise InvalidEntityError: if the tone is invalid.
        """
        if self.getOption('toneMarkType') == 'None':
            return plainEntity

        if tone is not None:
            tone = int(tone)
        if tone not in self.getTones():
            raise InvalidEntityError("Invalid tone information given for '" \
                + plainEntity + "': '" + str(tone) + "'")
        if tone is None:
            return plainEntity
        return plainEntity + str(tone)

    def splitEntityTone(self, entity):
        """
        Splits the entity into an entity without tone mark and the
        entity's tone.

        @type entity: str
        @param entity: entity with tonal information
        @rtype: tuple
        @return: plain entity without tone mark and the tone, which is
            C{None} if no tone information is available
        @raise InvalidEntityError: if no tone mark is found and option
            C{missingToneMark} is set to C{'ignore'}.
        """
        if self.getOption('toneMarkType') == 'None':
            return entity, None

        matchObj = re.search(u"[123456]$", entity)
        if matchObj:
            tone = int(matchObj.group(0))
            return entity[0:len(entity)-1], tone
        else:
            if self.getOption('missingToneMark') == 'ignore':
                raise InvalidEntityError("No tone information given for '" \
                    + entity + "'")
            else:
                return entity, None

    def getPlainReadingEntities(self):
        """
        Gets the plain entities supported by this reading, read from the
        table C{JyutpingSyllables}.

        @rtype: set of str
        @return: set of supported syllables
        """
        return set(self.db.selectScalars(
            select([self.db.tables['JyutpingSyllables'].c.Jyutping])))

    def getOnsetRhyme(self, plainSyllable):
        """
        Splits the given plain syllable into onset (initial) and rhyme (final).

        The syllabic nasals I{m}, I{ng} will be regarded as being finals.

        @type plainSyllable: str
        @param plainSyllable: syllable without tone marks
        @rtype: tuple of str
        @return: tuple of entity onset and rhyme
        @raise InvalidEntityError: if the entity is invalid.
        @todo Impl: Finals I{ing, ik, ung, uk} differ from other finals with
            same vowels. What semantics/view do we want to provide on the
            syllable parts?
        """
        table = self.db.tables['JyutpingInitialFinal']
        entry = self.db.selectRow(
            select([table.c.JyutpingInitial, table.c.JyutpingFinal],
                table.c.Jyutping == plainSyllable.lower()))
        if not entry:
            raise InvalidEntityError("'" + plainSyllable \
                + "' not a valid plain Jyutping syllable'")
        return (entry[0], entry[1])

3091

class CantoneseYaleOperator(TonalRomanisationOperator):
    u"""
    Provides an operator for the X{Cantonese Yale} romanisation.

    Features:
        - tones marked by either diacritics or numbers,
        - choice between high level and high falling tone for number marks,
        - guessing of input form (reading dialect) and
        - splitting of syllables into onset, nucleus and coda.

    High Level vs. High Falling Tone
    ================================
    Yale distinguishes two tones often subsumed under one: the high level tone
    with tone contour 55 as given in the commonly used pitch model by Yuen Ren
    Chao and the high falling tone given as pitch 53 (as by Chao), 52 or 51
    (Bauer and Benedikt, chapter 2.1.1 pp. 115).
    Many sources state that these two tones aren't distinguishable anymore in
    modern Hong Kong Cantonese and thus are subsumed under one tone in some
    romanisation systems for Cantonese.

    In the abbreviated form of the Yale romanisation that uses numbers to
    represent tones this distinction is not made. The mapping of the tone number
    C{1} to either the high level or the high falling tone can be given by the
    user and is important when conversion is done involving this abbreviated
    form of the Yale romanisation. By default the high level tone will be
    used as this primary use is indicated in the given sources.

    Sources
    =======
        - Stephen Matthews, Virginia Yip: Cantonese: A Comprehensive Grammar.
            Routledge, 1994, ISBN 0-415-08945-X.
        - Robert S. Bauer, Paul K. Benedikt: Modern Cantonese Phonology
            (摩登廣州話語音學). Walter de Gruyter, 1997, ISBN 3-11-014893-5.

    @see:
        - Cantonese: A Comprehensive Grammar (Preview):
            U{http://books.google.de/books?id=czbGJLu59S0C}
        - Modern Cantonese Phonology (Preview):
            U{http://books.google.de/books?id=QWNj5Yj6_CgC}
    """
    READING_NAME = 'CantoneseYale'

    TONES = ['1stToneLevel', '1stToneFalling', '2ndTone', '3rdTone', '4thTone',
        '5thTone', '6thTone']
    """Names of tones used in the romanisation."""
    # each value is a pair of tone mark and low tone marker; only the
    # 'Diacritics' mapping uses the marker 'h' and has no entry for None
    TONE_MARK_MAPPING = {'Numbers': {'1stToneLevel': ('1', ''),
            '1stToneFalling': ('1', ''), '2ndTone': ('2', ''),
            '3rdTone': ('3', ''), '4thTone': ('4', ''), '5thTone': ('5', ''),
            '6thTone': ('6', ''), None: ('', '')},
        'Diacritics': {'1stToneLevel': (u'\u0304', ''),
            '1stToneFalling': (u'\u0300', ''),
            '2ndTone': (u'\u0301', ''), '3rdTone': (u'', ''),
            '4thTone': (u'\u0300', 'h'), '5thTone': (u'\u0301', 'h'),
            '6thTone': (u'', 'h')},
        'Internal': {'1stToneLevel': ('0', ''),
            '1stToneFalling': ('1', ''), '2ndTone': ('2', ''),
            '3rdTone': ('3', ''), '4thTone': ('4', ''), '5thTone': ('5', ''),
            '6thTone': ('6', ''), None: ('', '')}}
    """
    Mapping of tone name to representation per tone mark type. Representations
    includes a diacritic mark and optional the letter 'h' marking a low tone.

    The C{'Internal'} dialect is used for conversion between different forms of
    Cantonese Yale. As conversion to the other dialects can lose information
    (Diacritics: missing tone, Numbers: distinction between high level and high
    rising, None: no tones at all) conversion to this dialect can retain all
    information and thus can be used as a standard target reading.
    """

    # the combining marks are macron (U+0304), acute (U+0301) and grave
    # (U+0300), the tone diacritics of TONE_MARK_MAPPING['Diacritics']
    syllableRegex = re.compile(ur'((?:m|ng|h|' \
        + u'(?:[bcdfghjklmnpqrstvwxyz]*' \
        + u'(?:(?:[aeiou]|[\u0304\u0301\u0300])+|yu[\u0304\u0301\u0300]?)))' \
        + u'(?:h(?!(?:[aeiou]|yu)))?' \
        + '(?:[mnptk]|ng)?[0123456]?)')
    """
    Regex to split a string in NFD into several syllables in a crude way.
    The regular expressions works for both, diacritical and number tone marks.
    It consists of:
        - Nasal syllables,
        - Initial consonants,
        - vowels including diacritics,
        - tone mark h,
        - final consonants,
        - tone numbers.
    """

def __init__(self, **options):
    """
    Creates an instance of the CantoneseYaleOperator.

    @param options: extra options
    @keyword dbConnectInst: instance of a L{DatabaseConnector}, if none is
        given, default settings will be assumed.
    @keyword strictSegmentation: if C{True} segmentation (using
        L{segment()}) and thus decomposition (using L{decompose()}) will
        raise an exception if an alphabetic string is parsed which can not
        be segmented into single reading entities. If C{False} the aforesaid
        string will be returned unsegmented.
    @keyword toneMarkType: if set to C{'Diacritics'} tones will be marked
        using diacritic marks and the character I{h} for low tones, if set
        to C{'Numbers'} appended numbers from 1 to 6 will be used to mark
        tones, if set to C{'None'} no tone marks will be used and no tonal
        information will be supplied at all.
    @keyword missingToneMark: if set to C{'noinfo'} no tone information
        will be deduced when no tone mark is found (takes on value C{None}),
        if set to C{'ignore'} this entity will not be valid and for
        segmentation the behaviour defined by C{'strictSegmentation'} will
        take effect. This option is only valid if the tone mark type is
        C{'Numbers'}, C{'Internal'} or C{'None'}.
    @keyword YaleFirstTone: tone in Yale which the first tone for tone marks
        with numbers should be mapped to. Value can be C{'1stToneLevel'} to
        map to the level tone with contour 55 or C{'1stToneFalling'} to map
        to the falling tone with contour 53.
    @raise ValueError: if an invalid value is given for one of the
        keywords, or if C{missingToneMark} is combined with a tone mark
        type it is not valid for.
    """
    super(CantoneseYaleOperator, self).__init__(**options)

    # check which tone marks to use
    if 'toneMarkType' in options:
        if options['toneMarkType'] not in ['Diacritics', 'Numbers', 'None',
            'Internal']:
            raise ValueError("Invalid option '" \
                + str(options['toneMarkType']) \
                + "' for keyword 'toneMarkType'")
        self.optionValue['toneMarkType'] = options['toneMarkType']

    # check if we have to be strict on tones, i.e. report missing tone info
    if 'missingToneMark' in options:
        # the tone mark type need not be passed along, fall back to the
        # value currently in effect then
        if 'toneMarkType' in options:
            toneMarkType = options['toneMarkType']
        else:
            toneMarkType = self.getOption('toneMarkType')
        if toneMarkType not in ['Numbers', 'Internal', 'None']:
            raise ValueError("keyword 'missingToneMark' is only valid if" \
                + " tone mark type is set to 'Numbers', 'Internal' and "\
                + "'None'")

        if options['missingToneMark'] not in ['noinfo', 'ignore']:
            raise ValueError("Invalid option '" \
                + str(options['missingToneMark']) \
                + "' for keyword 'missingToneMark'")
        self.optionValue['missingToneMark'] = options['missingToneMark']

    # set the YaleFirstTone for handling ambiguous conversion of first
    # tone in Cantonese that has two different representations in Yale
    if 'YaleFirstTone' in options:
        if options['YaleFirstTone'] not in ['1stToneLevel',
            '1stToneFalling', 'None']:
            raise ValueError("Invalid option '" \
                + unicode(options['YaleFirstTone']) \
                + "' for keyword 'YaleFirstTone'")
        self.optionValue['YaleFirstTone'] = options['YaleFirstTone']

    # create lookup dict
    if self.getOption('toneMarkType') != 'None':
        # create lookup dicts, mapping the pair of tone mark and low tone
        # marker back to the tone
        self.toneMarkLookup = {}
        for tone in self.getTones():
            toneMarks = self.TONE_MARK_MAPPING[
                self.getOption('toneMarkType')][tone]
            self.toneMarkLookup[toneMarks] = tone
        if self.getOption('toneMarkType') == 'Numbers':
            # first tone ambiguous for tone mark as numbers, set user
            # selected tone
            self.toneMarkLookup[('1', '')] = self.getOption('YaleFirstTone')

    # create tone regex
    if self.getOption('toneMarkType') != 'None':
        self.primaryToneRegex = re.compile(r"(?i)^[a-z]+([" \
            + r"".join(set([re.escape(toneMark) for toneMark, hChar \
                in self.TONE_MARK_MAPPING[self.getOption('toneMarkType')]\
                    .values()])) \
            + r"]?)")
        self.hCharRegex = re.compile(r"^.*(?:[aeiou]|m|ng)(h)")

    # set split regular expression, works for all tone marks
    self.readingEntityRegex = re.compile(u'(?i)((?:' \
        + '|'.join([re.escape(v) for v in self._getDiacriticVowels()]) \
        + u'|[a-z])+[0123456]?)')

@classmethod
def getDefaultOptions(cls):
    """
    Gets the default options: those of the parent class with the Cantonese
    Yale specific keywords C{toneMarkType}, C{missingToneMark} and
    C{YaleFirstTone} set on top.

    @return: the reading operator's default options.
    """
    defaults = super(CantoneseYaleOperator, cls).getDefaultOptions()
    defaults['toneMarkType'] = 'Diacritics'
    defaults['missingToneMark'] = 'noinfo'
    defaults['YaleFirstTone'] = '1stToneLevel'
    return defaults

@staticmethod
def _getDiacriticVowels():
    """
    Gets the Cantonese Yale vowels with diacritical marks for tones.

    Each of the characters I{a, e, i, o, u, m, n, h} is combined with each
    non-empty diacritic of the C{'Diacritics'} tone mark mapping and
    normalised to NFC. The characters I{m}, I{n} and I{h} are included for
    nasal forms.

    @rtype: set of str
    @return: Cantonese Yale vowels with diacritical marks
    """
    diacritics = [toneMark for toneMark, _ in
        CantoneseYaleOperator.TONE_MARK_MAPPING['Diacritics'].values()
        if toneMark]
    return set(unicodedata.normalize("NFC", firstChar + diacritic)
        for firstChar in 'aeioumnh'
        for diacritic in diacritics)

@classmethod
def guessReadingDialect(cls, string, includeToneless=False):
    """
    Takes a string written in Cantonese Yale and guesses the reading
    dialect.

    Currently only the option C{'toneMarkType'} is guessed. Unless
    C{'includeToneless'} is set to C{True} only the tone mark types
    C{'Diacritics'} and C{'Numbers'} are considered as the latter one can
    also represent the state of missing tones.

    @type string: str
    @param string: Cantonese Yale string
    @type includeToneless: bool
    @param includeToneless: if C{True} the tone mark type C{'None'} is
        returned if less than a tenth of the syllables found carry a
        possible tone mark, or if no syllable is found at all
    @rtype: dict
    @return: dictionary of basic keyword settings
    """
    # split into entities using a simple regex for all dialect forms
    entities = cls.syllableRegex.findall(
        unicodedata.normalize("NFD", unicode(string)))

    # guess tone mark type
    diacriticEntityCount = 0
    numberEntityCount = 0

    for entity in entities:
        # take entity (which can be several connected syllables) and check
        if entity[-1] in '123456':
            numberEntityCount = numberEntityCount + 1
        elif 'h' in entity[1:]:
            # tone mark character 'h' for low tone only used with diacritics
            diacriticEntityCount = diacriticEntityCount + 1
        else:
            for diacriticMark in [u'\u0304', u'\u0301', u'\u0300']:
                if diacriticMark in entity:
                    diacriticEntityCount = diacriticEntityCount + 1
                    break

    # compare statistics
    if includeToneless:
        markedEntityCount = max(diacriticEntityCount, numberEntityCount)
        # less than 1/10 units carry some possible tone mark, so decide
        # for toneless; without any unit there is no ratio to compute and
        # no tone mark was seen either
        if not entities \
            or (1.0 * markedEntityCount / len(entities)) < 0.1:
            return {'toneMarkType': 'None'}

    if diacriticEntityCount > numberEntityCount:
        toneMarkType = 'Diacritics'
    else:
        toneMarkType = 'Numbers'

    return {'toneMarkType': toneMarkType}

3345

def getTones(self):
    """
    Returns the tones supported by the reading.

    The value C{None} for missing tone information is included if no tone
    marks are used at all (C{'None'}), or if the tone mark type is
    C{'Numbers'} or C{'Internal'} and option C{missingToneMark} is set to
    C{'noinfo'}.

    @rtype: list
    @return: copy of L{TONES}, with C{None} appended where applicable
    """
    toneMarkType = self.getOption('toneMarkType')
    if toneMarkType == 'None':
        missingToneAllowed = True
    else:
        missingToneAllowed = toneMarkType in ['Numbers', 'Internal'] \
            and self.getOption('missingToneMark') == 'noinfo'

    # work on a copy, the class attribute must not be changed
    supportedTones = list(self.TONES)
    if missingToneAllowed:
        supportedTones.append(None)
    return supportedTones

3353

def compose(self, readingEntities):
    """
    Composes the given list of basic entities to a string by plain
    concatenation; nothing is inserted between the entities.

    @type readingEntities: list of str
    @param readingEntities: list of basic entities or other content
    @rtype: str
    @return: composed entities
    """
    return "".join(readingEntities)

3356

def getTonalEntity(self, plainEntity, tone):
    """
    Gets the entity with tone mark for the given plain entity and tone.

    With numbers the mark is appended to the entity. With diacritics the
    mark is placed on the first vowel and the low tone marker I{h} after
    the vowels; for entities without a vowel the mark goes on the first
    character of the trailing consonants and the marker to their end.

    @type plainEntity: str
    @param plainEntity: entity without tonal information
    @param tone: tone, one of the values of L{getTones()}
    @rtype: str
    @return: entity with appropriate tone
    @raise InvalidEntityError: if the tone or the entity is invalid.
    @todo Lang: Place the tone mark on the first character of the nucleus?
    """
    if tone not in self.getTones():
        raise InvalidEntityError("Invalid tone information given for '" \
            + plainEntity + "': '" + unicode(tone) + "'")

    toneMarkType = self.getOption('toneMarkType')
    if toneMarkType == 'None':
        return plainEntity

    toneMark, hChar = self.TONE_MARK_MAPPING[toneMarkType][tone]

    if toneMarkType in ['Numbers', 'Internal']:
        return plainEntity + toneMark
    elif toneMarkType == 'Diacritics':
        # split entity into vowel (aeiou) and non-vowel part for placing
        # marks
        syllableParts = re.match('(?i)^([^aeiou]*?)([aeiou]*)([^aeiou]*)$',
            plainEntity)
        if syllableParts is None:
            raise InvalidEntityError("Invalid entity given for '" \
                + plainEntity + "'")

        leading, vowels, trailing = syllableParts.groups()
        if vowels:
            # diacritic on first vowel, 'h' after the vowels
            marked = vowels[0] + toneMark + vowels[1:] + hChar
            vowels = unicodedata.normalize("NFC", marked)
        else:
            # syllabic nasal: diacritic on first character, 'h' at the end
            marked = trailing[0] + toneMark + trailing[1:] + hChar
            trailing = unicodedata.normalize("NFC", marked)

        return leading + vowels + trailing

3393

def splitEntityTone(self, entity):
    """
    Splits the entity into an entity without tone mark and the entity's
    tone.

    The plain entity returned will always be in Unicode's
    I{Normalization Form C} (NFC, see
    U{http://www.unicode.org/reports/tr15/}).

    @type entity: str
    @param entity: entity with tonal information
    @rtype: tuple
    @return: plain entity without tone mark and the entity's tone as found
        in the tone mark lookup (C{None} if no tone marks are used)
    @raise InvalidEntityError: if the entity is invalid or the tone marks
        found do not map to a tone.
    """
    # get decomposed Unicode string, e.g. C{'ū'} to C{'u\u0304'}
    decomposed = unicodedata.normalize("NFD", unicode(entity))
    if self.getOption('toneMarkType') == 'None':
        return unicodedata.normalize("NFC", decomposed), None

    # find primary tone mark and cut it out
    toneMatch = self.primaryToneRegex.search(decomposed)
    if not toneMatch:
        raise InvalidEntityError("Invalid entity or no tone information " \
            "given for '" + decomposed + "'")
    toneMark = toneMatch.group(1)
    markStart, markEnd = toneMatch.span(1)
    plainEntity = decomposed[:markStart] + decomposed[markEnd:]

    # find lower tone mark 'h' character and cut it out as well
    hChar = ''
    hMatch = self.hCharRegex.search(plainEntity)
    if hMatch:
        hChar = hMatch.group(1)
        hStart, hEnd = hMatch.span(1)
        plainEntity = plainEntity[:hStart] + plainEntity[hEnd:]

    # the tone looked up can itself be None, so test for the key
    toneKey = (toneMark, hChar)
    if toneKey not in self.toneMarkLookup:
        raise InvalidEntityError("Invalid entity or no tone information " \
            "given for '" + decomposed + "'")

    return unicodedata.normalize("NFC", plainEntity), \
        self.toneMarkLookup[toneKey]

3438

def getPlainReadingEntities(self):
    """
    Gets the plain reading entities (without tonal information) known to
    this operator.

    The entities are read from column C{CantoneseYale} of database table
    C{CantoneseYaleSyllables}.

    @rtype: set
    @return: set of plain entities
    """
    syllableTable = self.db.tables['CantoneseYaleSyllables']
    query = select([syllableTable.c.CantoneseYale])
    return set(self.db.selectScalars(query))

3442

def getOnsetRhyme(self, plainSyllable):
    """
    Splits the given plain syllable into onset (initial) and rhyme (final).

    The split is obtained from L{getOnsetNucleusCoda()}; the rhyme is the
    nucleus followed by the coda. Accordingly the syllabic nasals I{m},
    I{ng} will be returned as final.

    @type plainSyllable: str
    @param plainSyllable: syllable without tone marks
    @rtype: tuple of str
    @return: tuple of entity onset and rhyme
    @raise InvalidEntityError: if the entity is invalid.
    """
    parts = self.getOnsetNucleusCoda(plainSyllable)
    onset, nucleus, coda = parts
    rhyme = nucleus + coda
    return onset, rhyme

3458

def getOnsetNucleusCoda(self, plainSyllable):
    """
    Splits the given plain syllable into onset (initial), nucleus and coda,
    the latter two building the rhyme (final).

    The three parts are looked up in database table
    C{CantoneseYaleInitialNucleusCoda} using the lower case form of the
    syllable.

    The syllabic nasals I{m}, I{ng} will be returned as coda. Syllables yu,
    yun, yut will fall into (y, yu, ), (y, yu, n) and (y, yu, t).

    @type plainSyllable: str
    @param plainSyllable: syllable in the Yale romanisation system without
        tone marks
    @rtype: tuple of str
    @return: tuple of syllable onset, nucleus and coda
    @raise InvalidEntityError: if the entity is invalid (e.g. syllable
        nucleus or tone invalid).
    @todo Impl: Finals I{ing, ik, ung, uk, eun, eut, a} differ from other
        finals with same vowels. What semantics/view do we want to provide
        on the syllable parts?
    """
    table = self.db.tables['CantoneseYaleInitialNucleusCoda']
    columns = [table.c.CantoneseYaleInitial, table.c.CantoneseYaleNucleus,
        table.c.CantoneseYaleCoda]
    # the lookup is done on the lower case form of the syllable
    query = select(columns, table.c.CantoneseYale == plainSyllable.lower())

    row = self.db.selectRow(query)
    if row:
        return (row[0], row[1], row[2])

    raise InvalidEntityError("'" + plainSyllable \
        + "' not a valid plain Cantonese Yale syllable'")

3489

class CantoneseIPAOperator(TonalIPAOperator):
    u"""
    Provides an operator on strings of the Cantonese language written in the
    I{International Phonetic Alphabet} (I{IPA}).

    CantoneseIPAOperator does not supply the same closed set of syllables as
    other L{ReadingOperator}s as IPA provides different ways to represent
    pronunciation. Because of that a user defined IPA syllable will not easily
    map to another transcription system and thus only basic support is provided
    for this direction.

    This operator supplies an additional method L{getOnsetRhyme()} which allows
    breaking down syllables into their onset and rhyme.

    Features:
        - Tones can be marked either with tone numbers (1-6), tone contour
            numbers (e.g. 55), IPA tone bar characters or IPA diacritics,
        - choice between high level and high falling tone for number marks,
        - flexible set of tones,
        - support for stop tones,
        - handling of variable vowel length for tone contours of stop tone
            syllables and
        - splitting of syllables into onset and rhyme.

    Tones
    =====
    Tones in IPA can be expressed using different schemes. The following schemes
    are implemented here:
        - Numbers, tone numbers for the six-tone scheme,
        - ChaoDigits, numbers displaying the levels of tone contours, e.g.
            55 for the high level tone,
        - IPAToneBar, IPA modifying tone bar characters, e.g. ɛw˥˥,
        - None, no support for tone marks

    Sources
    =======
        - Robert S. Bauer, Paul K. Benedict: Modern Cantonese Phonology
            (摩登廣州話語音學). Walter de Gruyter, 1997, ISBN 3-11-014893-5.
        - Robert S. Bauer: Hong Kong Cantonese Tone Contours. In: Studies in
            Cantonese Linguistics. Linguistic Society of Hong Kong, 1998,
            ISBN 962-7578-04-5.

    @see:
        - Modern Cantonese Phonology (Preview):
            U{http://books.google.de/books?id=QWNj5Yj6_CgC}

    @todo Lang: Shed more light on tone sandhi in Cantonese language.
    @todo Impl: Implement diacritics for Cantonese Tones. On which part of the
        syllable should they be placed. Document.
    @todo Lang: Binyām 變音
    @todo Impl: What are the semantics of non-level tones given for unreleased
        stop finals? Take high rising Binyam into account.
    """
    READING_NAME = "CantoneseIPA"

    TONES = ['HighLevel', 'MidLevel', 'MidLowLevel', 'HighRising',
        'MidLowRising', 'MidLowFalling', 'HighFalling']

    STOP_TONES = {'HighStopped': 'HighLevel', 'MidStopped': 'MidLevel',
        'MidLowStopped': 'MidLowLevel'}
    """Cantonese stop tone mapping to general level tones."""

    STOP_TONES_EXPLICIT = {'HighStopped_Short': ('HighLevel', 'S'),
        'MidStopped_Short': ('MidLevel', 'S'),
        'MidLowStopped_Short': ('MidLowLevel', 'S'),
        'HighStopped_Long': ('HighLevel', 'L'),
        'MidStopped_Long': ('MidLevel', 'L'),
        'MidLowStopped_Long': ('MidLowLevel', 'L')}
    """
    Cantonese stop tone mapping to general level tones with stop tones realised
    for explicit marking short/long pronunciation.
    """

    TONE_MARK_PREFER = {'Numbers': {'1': 'HighLevel'},
        'ChaoDigits': {}, 'IPAToneBar': {}, 'Diacritics': {}}

    TONE_MARK_MAPPING = {'Numbers': {'HighLevel': '1', 'MidLevel': '3',
            'MidLowLevel': '6', 'HighRising': '2', 'MidLowRising': '5',
            'MidLowFalling': '4', 'HighFalling': '1', 'HighStopped_Short': '1',
            'MidStopped_Short': '3', 'MidLowStopped_Short': '6',
            'HighStopped_Long': '1', 'MidStopped_Long': '3',
            'MidLowStopped_Long': '6'},
        'ChaoDigits': {'HighLevel': '55', 'MidLevel': '33',
            'MidLowLevel': '22', 'HighRising': '25', 'MidLowRising': '23',
            'MidLowFalling': '21', 'HighFalling': '52',
            'HighStopped_Short': '5', 'MidStopped_Short': '3',
            'MidLowStopped_Short': '2', 'HighStopped_Long': '55',
            'MidStopped_Long': '33', 'MidLowStopped_Long': '22'},
        'IPAToneBar': {'HighLevel': u'˥˥', 'MidLevel': u'˧˧',
            'MidLowLevel': u'˨˨', 'HighRising': u'˨˥', 'MidLowRising': u'˨˧',
            'MidLowFalling': u'˨˩', 'HighFalling': u'˥˨',
            'HighStopped_Short': u'˥', 'MidStopped_Short': u'˧',
            'MidLowStopped_Short': u'˨', 'HighStopped_Long': u'˥˥',
            'MidStopped_Long': u'˧˧', 'MidLowStopped_Long': u'˨˨'},
        #'Diacritics': {}
        }

    def __init__(self, **options):
        """
        Creates an instance of the CantoneseIPAOperator.

        By default no tone marks will be shown.

        @param options: extra options
        @keyword dbConnectInst: instance of a L{DatabaseConnector}, if none is
            given, default settings will be assumed.
        @keyword toneMarkType: type of tone marks, one out of C{'Numbers'},
            C{'ChaoDigits'}, C{'IPAToneBar'}, C{'Diacritics'}, C{'None'}
        @keyword 1stToneName: tone for mark 1 under tone mark type C{'Numbers'},
            either I{'HighLevel'} or I{'HighFalling'}.
        @keyword stopTones: if set to C{'none'} the basic 6 (7) tones will be
            used and stop tones will be reported as one of them, if set to
            C{'general'} the three stop tones will be included, if set to
            C{'explicit'} the short and long forms will be explicitly supported.
        @raise NotImplementedError: if tone mark type C{'Diacritics'} is
            requested.
        @raise ValueError: if an invalid value is given for C{'1stToneName'}
            or C{'stopTones'}, or if C{'1stToneName'} is given for a tone mark
            type other than C{'Numbers'}.
        """
        super(CantoneseIPAOperator, self).__init__(**options)

        if self.getOption('toneMarkType') == 'Diacritics':
            raise NotImplementedError() # TODO

        if '1stToneName' in options:
            if self.getOption('toneMarkType') != 'Numbers':
                raise ValueError("keyword '1stToneName' is only valid if" \
                    + " tone mark type is set to 'Numbers'")
            if options['1stToneName'] not in self.TONES:
                raise ValueError("Invalid option '" \
                    + str(options['1stToneName']) \
                    + "' for keyword '1stToneName'")

            # NOTE(review): getBaseToneForToneMark() reads the class constant
            #   TONE_MARK_PREFER, not this option value; confirm that the
            #   preference stored here actually takes effect
            self.optionValue['toneMarkPrefer']['1'] = options['1stToneName']

        if 'stopTones' in options:
            if options['stopTones'] not in ['none', 'general', 'explicit']:
                raise ValueError("Invalid option '" \
                    + str(options['stopTones']) + "' for keyword 'stopTones'")

            self.optionValue['stopTones'] = options['stopTones']

        # lookup base tone to explicit stop tone, keyed by vowel length
        self.stopToneLookup = {}
        for stopTone in self.STOP_TONES_EXPLICIT:
            baseTone, vowelLength = self.STOP_TONES_EXPLICIT[stopTone]
            if baseTone not in self.stopToneLookup:
                self.stopToneLookup[baseTone] = {}
            self.stopToneLookup[baseTone][vowelLength] = stopTone
        # add general stop tones, sharing the entry of their level tone
        for stopTone in self.STOP_TONES:
            self.stopToneLookup[stopTone] \
                = self.stopToneLookup[self.STOP_TONES[stopTone]]

    @classmethod
    def getDefaultOptions(cls):
        """
        Returns the default options of the parent class extended by option
        C{'stopTones'} with default value C{'none'}.

        @rtype: dict
        @return: the reading operator's default options.
        """
        options = super(CantoneseIPAOperator, cls).getDefaultOptions()
        options.update({'stopTones': 'none'})

        return options

    def getTones(self):
        """
        Returns the tones supported under the current option settings.

        The basic tones are extended by the general stop tones if option
        C{'stopTones'} is C{'general'} or by the explicit (short/long) stop
        tones if it is C{'explicit'}. C{None} is included if option
        C{'missingToneMark'} is C{'noinfo'} or if the tone mark type is
        C{'None'}.

        @rtype: list
        @return: list of supported tones
        """
        tones = self.TONES[:]
        if self.getOption('stopTones') == 'general':
            tones.extend(self.STOP_TONES.keys())
        elif self.getOption('stopTones') == 'explicit':
            tones.extend(self.STOP_TONES_EXPLICIT.keys())
        if self.getOption('missingToneMark') == 'noinfo' \
            or self.getOption('toneMarkType') == 'None':
            tones.append(None)

        return tones

    def getPlainReadingEntities(self):
        """
        Gets the plain reading entities (without tonal information) known to
        this operator.

        The entities are read from column C{IPA} of database table
        C{CantoneseIPAInitialFinal}.

        @rtype: set
        @return: set of plain entities
        """
        return set(self.db.selectScalars(select(
            [self.db.tables['CantoneseIPAInitialFinal'].c.IPA])))

    def getOnsetRhyme(self, plainSyllable):
        """
        Splits the given plain syllable into onset (initial) and rhyme (final).

        @type plainSyllable: str
        @param plainSyllable: syllable in IPA without tone marks
        @rtype: tuple of str
        @return: tuple of syllable onset and rhyme
        @raise InvalidEntityError: if the entity is invalid (e.g. syllable
            nucleus or tone invalid).
        """
        table = self.db.tables['CantoneseIPAInitialFinal']
        entry = self.db.selectRow(
            select([table.c.IPAInitial, table.c.IPAFinal],
                table.c.IPA == plainSyllable))
        if not entry:
            raise InvalidEntityError("'" + plainSyllable \
                + "' not a valid IPA form in this system'")
        return (entry[0], entry[1])

    def getTonalEntity(self, plainEntity, tone):
        """
        Gets the entity with tonal information for the given plain entity and
        tone.

        If the tone mark type is C{'None'} or the tone is C{None} the plain
        entity is returned without a mark. Otherwise the tone is resolved to
        its explicit form using L{getExplicitTone()} and the corresponding
        tone mark is appended. The result is returned in Unicode's
        I{Normalization Form C}.

        @type plainEntity: str
        @param plainEntity: entity without tonal information
        @type tone: str
        @param tone: tone
        @rtype: str
        @return: entity with appropriate tone
        @raise InvalidEntityError: if the tone is not supported.
        """
        if tone not in self.getTones():
            raise InvalidEntityError("Invalid tone information given for '" \
                + plainEntity + "': '" + str(tone) + "'")
        if self.getOption('toneMarkType') == "None" or tone is None:
            entity = plainEntity
        else:
            # find explicit form
            tone = self.getExplicitTone(plainEntity, tone)

            entity = plainEntity \
                + self.TONE_MARK_MAPPING[self.getOption('toneMarkType')][tone]
        return unicodedata.normalize("NFC", entity)

    def splitEntityTone(self, entity):
        """
        Splits the entity into an entity without tone mark and the entity's
        base tone.

        If the tone mark type is C{'None'}, or if no tone mark is found and
        option C{'missingToneMark'} is C{'noinfo'}, the entity is returned
        together with C{None} as tone. The plain entity returned will always
        be in Unicode's I{Normalization Form C}.

        @type entity: str
        @param entity: entity with tonal information
        @rtype: tuple
        @return: plain entity without tone mark and entity's base tone
        @raise InvalidEntityError: if no tone mark is found and missing tone
            marks are not accepted, or if the tone mark is unknown.
        """
        # get decomposed Unicode string, e.g. C{'â'} to C{'a\u0302'}
        entity = unicodedata.normalize("NFD", unicode(entity))

        toneMarkType = self.getOption('toneMarkType')
        if toneMarkType == 'None':
            return unicodedata.normalize("NFC", entity), None
        else:
            matchObj = self.TONE_MARK_REGEX[toneMarkType].search(entity)
            if matchObj:
                toneMark = matchObj.group(1)
                # strip off tone mark
                plainEntity = entity.replace(toneMark, '')

                baseTone = self.getBaseToneForToneMark(toneMark)

                return unicodedata.normalize("NFC", plainEntity), baseTone
            elif self.getOption('missingToneMark') == 'noinfo':
                return unicodedata.normalize("NFC", entity), None

        raise InvalidEntityError("Invalid entity given for '" + entity + "'")

    def getExplicitTone(self, plainSyllable, baseTone):
        """
        Gets the explicit tone for the given plain syllable and base tone.

        In case the 6 (7) base tones are used, the stop tone value can be
        deduced from the given syllable. The stop tone returned will be even
        more precise in denoting the vowel length that influences the tone
        contour.

        @type plainSyllable: str
        @param plainSyllable: syllable without tonal information
        @type baseTone: str
        @param baseTone: tone
        @rtype: str
        @return: explicit tone
        @raise InvalidEntityError: if the entity is invalid.
        """
        # only need explicit tones
        if baseTone in self.stopToneLookup:
            # check if we have an unreleased final consonant
            table = self.db.tables['CantoneseIPAInitialFinal']
            entry = self.db.selectRow(
                select([table.c.UnreleasedFinal, table.c.VowelLength],
                    table.c.IPA == plainSyllable))
            if not entry:
                # unknown syllable, report it instead of failing on unpacking
                raise InvalidEntityError("'" + plainSyllable \
                    + "' not a valid IPA form in this system")
            unreleasedFinal, vowelLength = entry
            if unreleasedFinal:
                return self.stopToneLookup[baseTone][vowelLength]

        if baseTone in self.STOP_TONES:
            # general stop tone that couldn't be dealt with
            raise InvalidEntityError("Invalid tone information given for '" \
                + plainSyllable + "': '" + str(baseTone) + "'")

        return baseTone

    def getBaseToneForToneMark(self, toneMark):
        """
        Gets the base tone (one of the 6/7 general tones) for the given tone
        mark.

        The lookup table from tone mark to tone is built on first use from
        C{TONE_MARK_MAPPING} for the current tone mark type. Where several
        tones share one mark, the tone named in C{TONE_MARK_PREFER} wins,
        otherwise the first one encountered is kept.

        @type toneMark: str
        @param toneMark: tone mark representation of the tone
        @rtype: str
        @return: base tone
        @raise InvalidEntityError: if the toneMark does not exist.
        """
        if self.toneMarkLookup is None:
            # create lookup dict
            self.toneMarkLookup = {}
            toneMarkType = self.getOption('toneMarkType')
            for tone in self.TONE_MARK_MAPPING[toneMarkType]:
                mark = self.TONE_MARK_MAPPING[toneMarkType][tone]

                # get base tone
                reportTone = tone
                if reportTone not in self.TONES:
                    if self.getOption('stopTones') == 'general':
                        # NOTE(review): tone is an explicit stop tone name
                        #   here (e.g. 'HighStopped_Short') while STOP_TONES
                        #   is keyed by general names (e.g. 'HighStopped'),
                        #   so this lookup looks like it raises KeyError;
                        #   confirm which tone should be reported
                        reportTone = self.STOP_TONES[tone]
                    elif self.getOption('stopTones') == 'none':
                        reportTone, _ = self.STOP_TONES_EXPLICIT[tone]

                if mark not in self.toneMarkLookup \
                    or (mark in self.TONE_MARK_PREFER[toneMarkType] \
                    and self.TONE_MARK_PREFER[toneMarkType][mark] == tone):
                    self.toneMarkLookup[mark] = reportTone

        if toneMark in self.toneMarkLookup:
            return self.toneMarkLookup[toneMark]
        else:
            raise InvalidEntityError("Invalid tone mark given with '" \
                + toneMark + "'")

3790

Source Code for Module cjklib.reading.operator