cjklib.reading.converter

1 #!/usr/bin/python 2 # -*- coding: utf-8 -*- 3 # This file is part of cjklib. 4 # 5 # cjklib is free software: you can redistribute it and/or modify 6 # it under the terms of the GNU Lesser General Public License as published by 7 # the Free Software Foundation, either version 3 of the License, or 8 # (at your option) any later version. 9 # 10 # cjklib is distributed in the hope that it will be useful, 11 # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 # GNU Lesser General Public License for more details. 14 # 15 # You should have received a copy of the GNU Lesser General Public License 16 # along with cjklib. If not, see <http://www.gnu.org/licenses/>. 17 18 u""" 19 Provides L{ReadingConverter}s, classes to convert strings written in a character 20 reading to another reading. 21 22 Examples 23 ======== 24 Convert a string from I{Jyutping} to I{Cantonese Yale}: 25 26 >>> from cjklib.reading import ReadingFactory 27 >>> f = ReadingFactory() 28 >>> f.convert('gwong2jau1wa2', 'Jyutping', 'CantoneseYale') 29 u'gw\xf3ngy\u0101uw\xe1' 30 31 This is also possible creating a converter instance explicitly using the 32 factory: 33 34 >>> jyc = f.createReadingConverter('GR', 'Pinyin') 35 >>> jyc.convert('Woo.men tingshuo yeou "Yinnduhshyue", "Aijyishyue"') 36 u'W\u01d2men t\u012bngshu\u014d y\u01d2u "Y\xecnd\xf9xu\xe9", \ 37 "\u0100ij\xedxu\xe9"' 38 39 Convert between different dialects of the same reading I{Wade-Giles}: 40 41 >>> f.convert(u'kuo3-yü2', 'WadeGiles', 'WadeGiles', 42 ... sourceOptions={'toneMarkType': 'Numbers'}, 43 ... targetOptions={'toneMarkType': 'SuperscriptNumbers'}) 44 u'kuo\xb3-y\xfc\xb2' 45 46 See L{PinyinDialectConverter} for more examples. 
47 """ 48 import re 49 import copy 50 51 from sqlalchemy import select 52 from sqlalchemy.sql import and_, or_, not_ 53 54 from cjklib.exception import (ConversionError, AmbiguousConversionError, 55 InvalidEntityError, UnsupportedError) 56 from cjklib.dbconnector import DatabaseConnector 57 import operator 58 import cjklib.reading

class ReadingConverter(object):
    """
    Abstract base class of converters between two or more
    I{character reading}s.

    The central method is L{convert()}, which takes a string written in one
    reading and returns it written in another one.

    Conversion specific settings are accessible through
    L{getDefaultOptions()} and L{getOption()}.

    This class cannot be used on its own; subclass it and extend its methods.
    """
    CONVERSION_DIRECTIONS = []
    """
    List of tuples for specifying supported conversion directions from reading A
    to reading B. If both directions are supported, two tuples (A, B) and (B, A)
    are given.
    """

    def __init__(self, *args, **options):
        """
        Creates an instance of the ReadingConverter.

        @param args: optional list of L{RomanisationOperator}s to use for
            handling source and target readings.
        @param options: extra options
        @keyword dbConnectInst: instance of a L{DatabaseConnector}, if none is
            given, default settings will be assumed.
        @keyword sourceOperators: list of L{ReadingOperator}s used for handling
            source readings.
        @keyword targetOperators: list of L{ReadingOperator}s used for handling
            target readings.
        @raise ValueError: if one of the given operators is not a
            L{ReadingOperator}.
        """
        try:
            self.db = options['dbConnectInst']
        except KeyError:
            self.db = DatabaseConnector.getDBConnector()

        self.readingFact = cjklib.reading.ReadingFactory(dbConnectInst=self.db)

        # container defaults are deep-copied so that the instance gets its
        # own objects to fill in below
        self.optionValue = {}
        for name, default in self.getDefaultOptions().items():
            if type(default) in (tuple, list, dict):
                default = copy.deepcopy(default)
            self.optionValue[name] = default

        # positional operators serve both as source and as target operator
        for readingOp in args:
            if not isinstance(readingOp, operator.ReadingOperator):
                raise ValueError("unknown type '" + str(type(readingOp)) \
                    + "' given as ReadingOperator")
            self.optionValue['sourceOperators'][readingOp.READING_NAME] \
                = readingOp
            self.optionValue['targetOperators'][readingOp.READING_NAME] \
                = readingOp

        # operators given by keyword only serve the respective direction
        for keyword, role in (('sourceOperators', 'source'),
            ('targetOperators', 'target')):
            if keyword not in options:
                continue
            for readingOp in options[keyword]:
                if not isinstance(readingOp, operator.ReadingOperator):
                    raise ValueError("unknown type '" \
                        + str(type(readingOp)) + "' given as " + role \
                        + " reading operator")
                self.optionValue[keyword][readingOp.READING_NAME] = readingOp

    @classmethod
    def getDefaultOptions(cls):
        """
        Returns the reading converter's default options.

        The keyword 'dbConnectInst' is not regarded a configuration option of
        the converter and is thus not included in the dict returned.

        @rtype: dict
        @return: the reading converter's default options.
        """
        defaults = {}
        defaults['sourceOperators'] = {}
        defaults['targetOperators'] = {}
        return defaults

    def getOption(self, option):
        """
        Returns the value of the reading converter's option.

        @param option: name of the option
        @return: the value of the given reading converter's option.
        @raise KeyError: if no option of that name is stored.
        """
        return self.optionValue[option]

    def convert(self, string, fromReading, toReading):
        """
        Converts a string in the source reading to the given target reading.

        The string is decomposed by the source operator, the resulting
        entities are passed to L{convertEntities()} and the result is composed
        by the target operator.

        @type string: str
        @param string: string written in the source reading
        @type fromReading: str
        @param fromReading: name of the source reading
        @type toReading: str
        @param toReading: name of the target reading
        @rtype: str
        @returns: the input string converted to the C{toReading}
        @raise DecompositionError: if the string can not be decomposed into
            basic entities with regards to the source reading or the given
            information is insufficient.
        @raise ConversionError: on operations specific to the conversion between
            the two readings (e.g. error on converting entities).
        @raise UnsupportedError: if source or target reading is not supported
            for conversion.
        """
        sourceEntities = self._getFromOperator(fromReading).decompose(string)
        targetEntities = self.convertEntities(sourceEntities, fromReading,
            toReading)
        return self._getToOperator(toReading).compose(targetEntities)

    def convertEntities(self, readingEntities, fromReading, toReading):
        """
        Converts a list of entities in the source reading to the given target
        reading.

        The default implementation will raise a NotImplementedError.

        @type readingEntities: list of str
        @param readingEntities: list of entities written in source reading
        @type fromReading: str
        @param fromReading: name of the source reading
        @type toReading: str
        @param toReading: name of the target reading
        @rtype: list of str
        @return: list of entities written in target reading
        @raise ConversionError: on operations specific to the conversion between
            the two readings (e.g. error on converting entities).
        @raise UnsupportedError: if source or target reading is not supported
            for conversion.
        @raise InvalidEntityError: if an invalid entity is given.
        """
        raise NotImplementedError

    def _getFromOperator(self, readingN):
        """
        Gets a reading operator instance for conversion from the given reading.

        An operator not yet known is requested from the reading factory and
        stored for later calls.

        @type readingN: str
        @param readingN: name of reading
        @rtype: instance
        @return: a L{ReadingOperator} instance
        @raise UnsupportedError: if the given reading is not supported.
        """
        if readingN not in self.getOption('sourceOperators'):
            newOperator = self.readingFact._getReadingOperatorInstance(
                readingN)
            self.optionValue['sourceOperators'][readingN] = newOperator
        return self.getOption('sourceOperators')[readingN]

    def _getToOperator(self, readingN):
        """
        Gets a reading operator instance for conversion to the given reading.

        An operator not yet known is requested from the reading factory and
        stored for later calls.

        @type readingN: str
        @param readingN: name of reading
        @rtype: instance
        @return: a L{ReadingOperator} instance
        @raise UnsupportedError: if the given reading is not supported.
        """
        if readingN not in self.getOption('targetOperators'):
            newOperator = self.readingFact._getReadingOperatorInstance(
                readingN)
            self.optionValue['targetOperators'][readingN] = newOperator
        return self.getOption('targetOperators')[readingN]

240

class EntityWiseReadingConverter(ReadingConverter):
    """
    Abstract L{ReadingConverter} between two or more I{reading}s that converts
    entity by entity.

    Converters that simply convert one syllable at a time can derive from this
    class and merely need to overwrite L{convertBasicEntity()}.
    """

    def convertEntities(self, readingEntities, fromReading, toReading):
        """
        Converts a list of entities in the source reading to the given target
        reading, one entity at a time.

        Entries recognised as reading entities by the source operator are
        passed to L{convertBasicEntity()}, all other entries are copied to the
        result unchanged.

        @type readingEntities: list of str
        @param readingEntities: list of entities written in source reading
        @type fromReading: str
        @param fromReading: name of the source reading
        @type toReading: str
        @param toReading: name of the target reading
        @rtype: list of str
        @return: list of entities written in target reading
        @raise UnsupportedError: if the conversion direction is not listed in
            C{CONVERSION_DIRECTIONS}.
        """
        direction = (fromReading, toReading)
        if direction not in self.CONVERSION_DIRECTIONS:
            raise UnsupportedError("conversion direction from '" \
                + fromReading + "' to '" + toReading + "' not supported")

        converted = []
        for item in readingEntities:
            if not self._getFromOperator(fromReading).isReadingEntity(item):
                # not a reading entity, hand through as is
                converted.append(item)
                continue
            converted.append(
                self.convertBasicEntity(item, fromReading, toReading))

        return converted

    def convertBasicEntity(self, entity, fromReading, toReading):
        """
        Converts a basic entity (e.g. a syllable) in the source reading to the
        given target reading.

        This method is called by L{convertEntities()} and a single entity is
        given for conversion.

        The default implementation will raise a NotImplementedError.

        @type entity: str
        @param entity: string written in the source reading
        @type fromReading: str
        @param fromReading: name of the source reading
        @type toReading: str
        @param toReading: name of the target reading
        @rtype: str
        @returns: the entity converted to the C{toReading}
        @raise AmbiguousConversionError: if conversion for this entity of the
            source reading is ambiguous.
        @raise ConversionError: on other operations specific to the conversion
            of the entity.
        @raise InvalidEntityError: if the entity is invalid.
        """
        raise NotImplementedError

294

class RomanisationConverter(EntityWiseReadingConverter):
    """
    Defines an abstract L{ReadingConverter} between two or more
    I{romanisation}s.

    Reading dialects can produce different entities which have to be handled by
    the conversion process. This is realised by converting the given reading
    dialect to a default form, then converting to the default target reading and
    finally converting to the specified target reading dialect. One conversion
    step thus involves three single conversion steps using a default form. This
    default form can be defined in L{DEFAULT_READING_OPTIONS}.

    Upper or lower case will be transferred between syllables, no special
    formatting according to anyhow defined standards will be guaranteed.
    Upper/lower case will be identified according to three classes: either the
    whole syllable is upper case, only the initial letter is upper case or
    otherwise the whole syllable is assumed being lower case.

    The class itself can't be used directly, it has to be subclassed and
    L{convertBasicEntity()} has to be implemented, as to make the translation of
    a syllable from one romanisation to another possible.
    """
    DEFAULT_READING_OPTIONS = {}
    """
    Defines default reading options for the reading used to convert from (to
    resp.) before (after resp.) converting to (from resp.) the user specified
    dialect.

    The most general reading dialect should be specified as to allow for a broad
    range of input.
    """

    def convertEntities(self, readingEntities, fromReading, toReading):
        """
        Converts a list of entities in the source reading to the given target
        reading.

        Upper case of the first character or the whole characters of one entity
        (e.g. syllable) is respected. Entities like C{"HaO"} will degenerate to
        C{"Hao"} though.

        @type readingEntities: list of str
        @param readingEntities: list of entities written in source reading
        @type fromReading: str
        @param fromReading: name of the source reading
        @type toReading: str
        @param toReading: name of the target reading
        @rtype: list of str
        @return: list of entities written in target reading
        @raise AmbiguousConversionError: if conversion for a specific entity of
            the source reading is ambiguous.
        @raise ConversionError: on other operations specific to the conversion
            between the two readings (e.g. error on converting entities).
        @raise UnsupportedError: if source or target reading is not supported
            for conversion.
        @raise InvalidEntityError: if an invalid entity is given.
        """
        if (fromReading, toReading) not in self.CONVERSION_DIRECTIONS:
            raise UnsupportedError("conversion direction from '" \
                + fromReading + "' to '" + toReading + "' not supported")

        # step 1: bring the user's source dialect into the default form, if
        # a dialect converter exists for the source reading
        fromDefaultOptions = self.DEFAULT_READING_OPTIONS.get(fromReading, {})
        if self.readingFact.isReadingConversionSupported(fromReading,
            fromReading):
            # use user specified source operator, set target to default form
            readingEntities = self.readingFact.convertEntities(
                readingEntities, fromReading, fromReading,
                sourceOperators=[self._getFromOperator(fromReading)],
                targetOptions=fromDefaultOptions)

        # step 2: entity wise conversion to the target reading
        toReadingEntities = []
        for entity in readingEntities:
            # convert reading entities, don't convert the rest
            if not self.readingFact.isReadingEntity(entity, fromReading,
                **fromDefaultOptions):
                toReadingEntities.append(entity)
                continue

            # convertBasicEntity() works on lower case only, the case of
            # the source entity is re-applied below
            toReadingEntity = self.convertBasicEntity(entity.lower(),
                fromReading, toReading)

            targetCase = self._getToOperator(toReading).getOption('case')
            if targetCase == 'both':
                # transfer the capitalisation of the source entity
                if entity.isupper():
                    toReadingEntity = toReadingEntity.upper()
                elif entity.istitle():
                    toReadingEntity = toReadingEntity.capitalize()
            elif targetCase == 'upper':
                # target reading is written in upper case only
                toReadingEntity = toReadingEntity.upper()

            toReadingEntities.append(toReadingEntity)

        # step 3: bring the default form into the user's target dialect, if
        # a dialect converter exists for the target reading
        toDefaultOptions = self.DEFAULT_READING_OPTIONS.get(toReading, {})
        if self.readingFact.isReadingConversionSupported(toReading, toReading):
            # use user specified target operator, set source to default form
            toReadingEntities = self.readingFact.convertEntities(
                toReadingEntities, toReading, toReading,
                sourceOptions=toDefaultOptions,
                targetOperators=[self._getToOperator(toReading)])

        return toReadingEntities

    def convertBasicEntity(self, entity, fromReading, toReading):
        """
        Converts a basic entity (e.g. a syllable) in the source reading to the
        given target reading.

        This method is called by L{convertEntities()} and a lower case entity
        is given for conversion. The returned value should be in lower case
        characters too, as L{convertEntities()} will take care of
        capitalisation.

        If a single entity needs to be converted it is recommended to use
        L{convertEntities()} instead. In the general case it can not be ensured
        that a mapping from one reading to another can be done by the simple
        conversion of a basic entity. One-to-many mappings are possible and
        there is no guarantee that any entity of a reading recognised by
        L{operator.ReadingOperator.isReadingEntity()} will be mapped here.

        The default implementation will raise a NotImplementedError.

        @type entity: str
        @param entity: string written in the source reading in lower case
            letters
        @type fromReading: str
        @param fromReading: name of the source reading
        @type toReading: str
        @param toReading: name of the target reading
        @rtype: str
        @returns: the entity converted to the C{toReading} in lower case
        @raise AmbiguousConversionError: if conversion for this entity of the
            source reading is ambiguous.
        @raise ConversionError: on other operations specific to the conversion
            of the entity.
        @raise InvalidEntityError: if the entity is invalid.
        """
        raise NotImplementedError

446

class PinyinDialectConverter(ReadingConverter):
    u"""
    Provides a converter for different representations of the Chinese
    romanisation I{Hanyu Pinyin}.

    Examples
    ========
    The following examples show how to convert between different representations
    of Pinyin.
        - Create the Converter and convert from standard Pinyin to Pinyin with
            tones represented by numbers:

            >>> from cjklib.reading import *
            >>> targetOp = operator.PinyinOperator(toneMarkType='Numbers')
            >>> pinyinConv = converter.PinyinDialectConverter(
            ...     targetOperators=[targetOp])
            >>> pinyinConv.convert(u'hànzì', 'Pinyin', 'Pinyin')
            u'han4zi4'

        - Convert Pinyin written with numbers, the ü (u with umlaut) replaced
            by character v and omitted fifth tone to standard Pinyin:

            >>> sourceOp = operator.PinyinOperator(toneMarkType='Numbers',
            ...    yVowel='v', missingToneMark='fifth')
            >>> pinyinConv = converter.PinyinDialectConverter(
            ...     sourceOperators=[sourceOp])
            >>> pinyinConv.convert('nv3hai2zi', 'Pinyin', 'Pinyin')
            u'n\u01dah\xe1izi'

        - Or more elegantly:

            >>> f = ReadingFactory()
            >>> f.convert('nv3hai2zi', 'Pinyin', 'Pinyin',
            ...     sourceOptions={'toneMarkType': 'Numbers', 'yVowel': 'v',
            ...     'missingToneMark': 'fifth'})
            u'n\u01dah\xe1izi'

        - Decompose the reading of a dictionary entry from CEDICT into syllables
            and convert the ü-vowel and forms of I{Erhua sound}:

            >>> pinyinFrom = operator.PinyinOperator(toneMarkType='Numbers',
            ...     yVowel='u:', Erhua='oneSyllable')
            >>> syllables = pinyinFrom.decompose('sun1nu:r3')
            >>> print syllables
            ['sun1', 'nu:r3']
            >>> pinyinTo = operator.PinyinOperator(toneMarkType='Numbers',
            ...     Erhua='twoSyllables')
            >>> pinyinConv = converter.PinyinDialectConverter(
            ...     sourceOperators=[pinyinFrom], targetOperators=[pinyinTo])
            >>> pinyinConv.convertEntities(syllables, 'Pinyin', 'Pinyin')
            [u'sun1', u'n\xfc3', u'r5']

        - Or more elegantly with entities already decomposed:

            >>> f.convertEntities(['sun1', 'nu:r3'], 'Pinyin', 'Pinyin',
            ...     sourceOptions={'toneMarkType': 'Numbers', 'yVowel': 'u:',
            ...        'Erhua': 'oneSyllable'},
            ...     targetOptions={'toneMarkType': 'Numbers',
            ...        'Erhua': 'twoSyllables'})
            [u'sun1', u'n\xfc3', u'r5']
    """
    CONVERSION_DIRECTIONS = [('Pinyin', 'Pinyin')]

    def __init__(self, *args, **options):
        """
        Creates an instance of the PinyinDialectConverter.

        @param args: optional list of L{RomanisationOperator}s to use for
            handling source and target readings.
        @param options: extra options
        @keyword dbConnectInst: instance of a L{DatabaseConnector}, if none is
            given, default settings will be assumed.
        @keyword sourceOperators: list of L{ReadingOperator}s used for handling
            source readings.
        @keyword targetOperators: list of L{ReadingOperator}s used for handling
            target readings.
        @keyword keepPinyinApostrophes: if set to C{True} apostrophes separating
            two syllables in Pinyin will be kept even if not necessary.
            Apostrophes missing according to the given rule will be added
            though.
        @keyword breakUpErhua: if set to C{'on'} I{Erhua} forms will be
            converted to single syllables with a full I{er} syllable regardless
            of the Erhua form setting of the target reading, e.g. I{zher} will
            be converted to I{zhe}, I{er}, if set to C{'auto'} Erhua forms are
            converted if the given target reading operator doesn't support
            Erhua forms, if set to C{'off'} Erhua forms will always be
            conserved.
        @raise ValueError: if an invalid value is given for C{breakUpErhua}.
        """
        super(PinyinDialectConverter, self).__init__(*args, **options)
        # set options
        if 'keepPinyinApostrophes' in options:
            self.optionValue['keepPinyinApostrophes'] \
                = options['keepPinyinApostrophes']

        if 'breakUpErhua' in options:
            if options['breakUpErhua'] not in ['on', 'auto', 'off']:
                raise ValueError("Invalid option '" \
                    + str(options['breakUpErhua']) \
                    + "' for keyword 'breakUpErhua'")
            self.optionValue['breakUpErhua'] = options['breakUpErhua']

        fromOperator = self._getFromOperator('Pinyin')
        toOperator = self._getToOperator('Pinyin')

        # representation of the ü vowel on both sides
        self.fromYVowel = fromOperator.getOption('yVowel')
        self.toYVowel = toOperator.getOption('yVowel')

        # choose the Erhua handling once, convertEntities() just calls it
        fromErhua = fromOperator.getOption('Erhua')
        toErhua = toOperator.getOption('Erhua')
        breakUpErhua = self.getOption('breakUpErhua')
        if breakUpErhua == 'on' \
            or (breakUpErhua == 'auto' and toErhua == 'ignore') \
            or (toErhua == 'twoSyllables' and fromErhua == 'oneSyllable'):
            # need to convert from one-syllable-form to two-syllables-form
            self.convertErhuaFunc = self.convertToTwoSyllablesErhua
        elif toErhua == 'oneSyllable' and fromErhua != 'oneSyllable':
            # need to convert from two-syllables-form to one-syllable-form
            self.convertErhuaFunc = self.convertToSingleSyllableErhua
        elif fromErhua != 'ignore' and toErhua == 'ignore':
            # no real conversion but make sure to raise an error for Erhua forms
            self.convertErhuaFunc = self._checkForErhua
        else:
            # do nothing
            self.convertErhuaFunc = lambda x: x

    @classmethod
    def getDefaultOptions(cls):
        """
        Returns the reading converter's default options.

        Extends the inherited defaults by C{keepPinyinApostrophes} (C{False})
        and C{breakUpErhua} (C{'auto'}).

        @rtype: dict
        @return: the reading converter's default options.
        """
        options = super(PinyinDialectConverter, cls).getDefaultOptions()
        options.update({'keepPinyinApostrophes': False, 'breakUpErhua': 'auto'})

        return options

    def convertEntities(self, readingEntities, fromReading='Pinyin',
        toReading='Pinyin'):
        """
        Converts a list of entities in the source reading to the given target
        reading.

        @type readingEntities: list of str
        @param readingEntities: list of entities written in source reading
        @type fromReading: str
        @param fromReading: name of the source reading
        @type toReading: str
        @param toReading: name of the target reading
        @rtype: list of str
        @return: list of entities written in target reading
        @raise AmbiguousConversionError: if conversion for a specific entity of
            the source reading is ambiguous.
        @raise ConversionError: on other operations specific to the conversion
            between the two readings (e.g. error on converting entities).
        @raise UnsupportedError: if source or target reading is not supported
            for conversion.
        @raise InvalidEntityError: if an invalid entity is given.
        """
        if (fromReading, toReading) not in self.CONVERSION_DIRECTIONS:
            raise UnsupportedError("conversion direction from '" \
                + fromReading + "' to '" + toReading + "' not supported")

        fromOperator = self._getFromOperator(fromReading)
        toOperator = self._getToOperator(toReading)

        # remove apostrophes
        if not self.getOption('keepPinyinApostrophes'):
            readingEntities = fromOperator.removeApostrophes(readingEntities)

        # split syllables into (plain syllable, tone) tuples, everything that
        # is not a reading entity stays a plain string
        entityTuples = []
        for entity in readingEntities:
            if fromOperator.isReadingEntity(entity):
                plainSyllable, tone = fromOperator.splitEntityTone(entity)
                entityTuples.append((plainSyllable, tone))
            else:
                entityTuples.append(entity)

        # fix Erhua forms if needed
        entityTuples = self.convertErhuaFunc(entityTuples)

        targetTones = toOperator.getTones()
        breakUpErhua = self.getOption('breakUpErhua')
        # a lone 'r' is written as full syllable 'er' when Erhua is broken up
        expandSingleR = breakUpErhua == 'on' \
            or (breakUpErhua == 'auto' \
                and toOperator.getOption('Erhua') == 'ignore')

        # convert
        toReadingEntities = []
        for entry in entityTuples:
            if isinstance(entry, tuple):
                plainSyllable, tone = entry

                # check if target operator supports missing tones
                if tone not in targetTones:
                    # missing tone not supported, raise a conversion error
                    raise AmbiguousConversionError("Target reading does not " \
                        "support missing tone information")

                # fix Erhua form if needed
                if expandSingleR and plainSyllable.lower() == 'r':
                    if plainSyllable.isupper():
                        plainSyllable = 'ER'
                    else:
                        plainSyllable = 'er'

                # check for special vowel for ü on input
                if self.fromYVowel != self.toYVowel:
                    plainSyllable = plainSyllable.replace(self.fromYVowel,
                        self.toYVowel)

                # capitalisation
                targetCase = toOperator.getOption('case')
                if targetCase == 'lower':
                    plainSyllable = plainSyllable.lower()
                elif targetCase == 'upper':
                    plainSyllable = plainSyllable.upper()

                try:
                    toReadingEntities.append(
                        toOperator.getTonalEntity(plainSyllable, tone))
                except InvalidEntityError as e:
                    # handle this as a conversion error as the converted
                    # syllable is not accepted by the operator
                    raise ConversionError(e)
            elif entry == fromOperator.getOption('PinyinApostrophe'):
                # the entry stems from the source, so it has to be compared
                # with the source operator's apostrophe before being
                # replaced by the target operator's one
                toReadingEntities.append(
                    toOperator.getOption('PinyinApostrophe'))
            else:
                toReadingEntities.append(entry)

        return toReadingEntities

    def convertToSingleSyllableErhua(self, entityTuples):
        """
        Converts the various I{Erhua} forms in a list of reading entities to
        a representation with one syllable, e.g. C{['tou2', 'r5']} to
        C{['tour2']}.

        @type entityTuples: list of tuple/str
        @param entityTuples: list of tuples with plain syllable and tone
        @rtype: list of tuple/str
        @return: list of tuples with plain syllable and tone
        """
        # syllables that never take an appended 'r'
        unmergeable = ['e', 'er', 'r', 'n', 'ng', 'hng', 'hm', 'm', u'ê']

        convertedTuples = []
        # entry held back until it is known whether an 'r' follows it
        lastEntry = None
        for entry in entityTuples:
            if isinstance(lastEntry, tuple) and isinstance(entry, tuple):
                lastPlainSyllable, lastTone = lastEntry
                plainSyllable, _ = entry
                if plainSyllable.lower() == 'r' \
                    and lastPlainSyllable.lower() not in unmergeable:
                    # merge two syllables and use tone of main syllable
                    convertedTuples.append((lastPlainSyllable + plainSyllable,
                        lastTone))
                    lastEntry = None
                    continue

            if lastEntry is not None:
                convertedTuples.append(lastEntry)
            lastEntry = entry

        if lastEntry is not None:
            convertedTuples.append(lastEntry)

        return convertedTuples

    def convertToTwoSyllablesErhua(self, entityTuples):
        """
        Converts the various I{Erhua} forms in a list of reading entities to
        a representation with two syllable, e.g. C{['tour2']} to
        C{['tou2', 'r5']}.

        @type entityTuples: list of tuple/str
        @param entityTuples: list of tuples with plain syllable and tone
        @rtype: list of tuple/str
        @return: list of tuples with plain syllable and tone
        """
        convertedTuples = []
        for entry in entityTuples:
            if not isinstance(entry, tuple):
                convertedTuples.append(entry)
                continue

            plainSyllable, tone = entry
            if plainSyllable[-1:].lower() == 'r' \
                and plainSyllable.lower() not in ['er', 'r']:
                # split syllable into plain syllable...
                convertedTuples.append((plainSyllable[:-1], tone))
                # ...and single 'r', which gets tone 5
                convertedTuples.append((plainSyllable[-1:], 5))
            else:
                convertedTuples.append(entry)

        return convertedTuples

    def _checkForErhua(self, entityTuples):
        """
        Checks the given entities for Erhua forms and raises a ConversionError.

        The check ignores case, so that C{'Er'} is accepted as the plain
        syllable I{er} and upper case forms like C{'ZHER'} are detected.

        @type entityTuples: list of tuple/str
        @param entityTuples: list of tuples with plain syllable and tone
        @rtype: list of tuple/str
        @return: list of tuples with plain syllable and tone
        @raise ConversionError: when an Erhua form is found
        """
        for entry in entityTuples:
            if isinstance(entry, tuple):
                plainSyllable, _ = entry

                lowerSyllable = plainSyllable.lower()
                if lowerSyllable.endswith('r') and lowerSyllable != 'er':
                    raise ConversionError(
                        "Cannot convert Erhua form in syllable '" \
                            + plainSyllable + "'")

        return entityTuples

776

class WadeGilesDialectConverter(EntityWiseReadingConverter):
    u"""
    Provides a converter for different representations of the Mandarin Chinese
    romanisation I{Wade-Giles}.

    The converter has very limited possibilities for conversion at this time,
    much more different forms of Wade-Giles are possible and should be
    implemented.
    """
    CONVERSION_DIRECTIONS = [('WadeGiles', 'WadeGiles')]

    def convertBasicEntity(self, entity, fromReading, toReading):
        """
        Converts a single Wade-Giles syllable from the source dialect to the
        target dialect.

        The apostrophe character and the letter case are adapted to the
        options of the target operator, which then builds the syllable with
        its own tone mark.

        @type entity: str
        @param entity: syllable written in the source dialect
        @type fromReading: str
        @param fromReading: name of the source reading
        @type toReading: str
        @param toReading: name of the target reading
        @rtype: str
        @returns: the syllable written in the target dialect
        @raise ConversionError: if the converted syllable is not accepted by
            the target operator.
        """
        fromOperator = self._getFromOperator(fromReading)
        toOperator = self._getToOperator(toReading)

        # split syllable into plain part and tonal information
        plainSyllable, tone = fromOperator.splitEntityTone(entity)

        # convert apostrophe
        fromApostrophe = fromOperator.getOption('WadeGilesApostrophe')
        toApostrophe = toOperator.getOption('WadeGilesApostrophe')
        if fromApostrophe != toApostrophe:
            plainSyllable = plainSyllable.replace(fromApostrophe, toApostrophe)

        # capitalisation
        targetCase = toOperator.getOption('case')
        if targetCase == 'lower':
            plainSyllable = plainSyllable.lower()
        elif targetCase == 'upper':
            plainSyllable = plainSyllable.upper()

        # get syllable with tone mark
        try:
            return toOperator.getTonalEntity(plainSyllable, tone)
        except InvalidEntityError as e:
            # handle this as a conversion error as the converted syllable is not
            # accepted by the operator
            raise ConversionError(e)

817

class PinyinWadeGilesConverter(RomanisationConverter):
    """
    Provides a converter between the Chinese romanisation I{Hanyu Pinyin} and
    I{Wade-Giles}.

    Currently only a non standard subset of Wade-Giles is implemented. As many
    different interpretations exist providing a complete coverage seems hardly
    achievable. An important step is support for the revised system by Giles as
    found in his I{Chinese-English Dictionary} (as of 1912). A further target is
    to at least implement means to support concrete shapes found in the usage of
    big bodies e.g. libraries.

    Upper or lower case will be transferred between syllables, no special
    formatting according to the standards (i.e. Pinyin) will be made. Upper/
    lower case will be identified according to three classes: either the whole
    syllable is upper case, only the initial letter is upper case or otherwise
    the whole syllable is assumed being lower case.

    Conversion cannot in general be done in a one-to-one manner. Standard Pinyin
    has no notion to explicitly specify missing tonal information while this is
    in general given in Wade-Giles by just omitting the tone digits. This
    implementation furthermore doesn't support explicit depiction of I{Erhua} in
    the Wade-Giles romanisation system thus failing when r-colourised syllables
    are found.

    @todo Lang: Increase support for different I{reading dialects} of the
        Wade-Giles romanisation system. Includes support in
        L{WadeGilesOperator}. Get proper sources on the syllables and
        mappings. Use well-known instances.
    @warning: This module isn't backed-up by any sources yet and doesn't
        guarantee a syllable mapping free of errors.
    """
    CONVERSION_DIRECTIONS = [('Pinyin', 'WadeGiles'), ('WadeGiles', 'Pinyin')]
    # use the tone mark type 'Numbers' from Pinyin to support missing tonal
    # information. Erhua furthermore is not supported.
    DEFAULT_READING_OPTIONS = {'Pinyin': {'Erhua': 'ignore',
        'toneMarkType': 'Numbers'}, 'WadeGiles': {}}

    def convertEntities(self, readingEntities, fromReading, toReading):
        """
        Converts a list of entities in the source reading to the given target
        reading.

        For conversion from Wade-Giles the hyphens are removed first, as they
        will not be transferred to Pinyin. The conversion itself is done by
        the inherited implementation.

        @type readingEntities: list of str
        @param readingEntities: list of entities written in source reading
        @type fromReading: str
        @param fromReading: name of the source reading
        @type toReading: str
        @param toReading: name of the target reading
        @rtype: list of str
        @return: list of entities written in target reading
        """
        if fromReading == 'WadeGiles':
            readingEntities = self._getFromOperator(fromReading).removeHyphens(
                readingEntities)

        return super(PinyinWadeGilesConverter, self).convertEntities(
            readingEntities, fromReading, toReading)

    def convertBasicEntity(self, entity, fromReading, toReading):
        """
        Converts a single syllable between Pinyin and Wade-Giles using the
        database table C{WadeGilesPinyinMapping}.

        The syllable is read and written in the default form given in
        L{DEFAULT_READING_OPTIONS}.

        @type entity: str
        @param entity: syllable written in the source reading
        @type fromReading: str
        @param fromReading: name of the source reading
        @type toReading: str
        @param toReading: name of the target reading
        @rtype: str
        @returns: the syllable converted to the C{toReading}
        @raise ConversionError: if no mapping exists for the syllable or the
            mapped syllable is not accepted by the target reading.
        @raise UnsupportedError: if the source reading is neither Pinyin nor
            Wade-Giles.
        """
        # split syllable into plain part and tonal information
        plainSyllable, tone = self.readingFact.splitEntityTone(entity,
            fromReading, **self.DEFAULT_READING_OPTIONS[fromReading])

        # lookup in database
        if fromReading == "WadeGiles":
            table = self.db.tables['WadeGilesPinyinMapping']
            query = select([table.c.Pinyin],
                table.c.WadeGiles == plainSyllable)
        elif fromReading == "Pinyin":
            # the mapping is ambiguous in this direction, restricting the
            # lookup to PinyinIdx 0 makes the result distinct
            table = self.db.tables['WadeGilesPinyinMapping']
            query = select([table.c.WadeGiles],
                and_(table.c.Pinyin == plainSyllable,
                    table.c.PinyinIdx == 0))
        else:
            # fail explicitly instead of running into an unbound variable
            raise UnsupportedError("conversion direction from '" \
                + fromReading + "' to '" + toReading + "' not supported")

        transSyllable = self.db.selectScalar(query)
        if not transSyllable:
            raise ConversionError("conversion for entity '" + plainSyllable \
                + "' not supported")

        try:
            return self.readingFact.getTonalEntity(transSyllable, tone,
                toReading, **self.DEFAULT_READING_OPTIONS[toReading])
        except InvalidEntityError as e:
            # handle this as a conversion error as the converted syllable is not
            # accepted by the operator
            raise ConversionError(e)

895

class GRDialectConverter(ReadingConverter):
    u"""
    Provides a converter for different representations of the Chinese
    romanisation I{Gwoyeu Romatzyh}.
    """
    CONVERSION_DIRECTIONS = [('GR', 'GR')]

    def __init__(self, *args, **options):
        u"""
        Creates an instance of the GRDialectConverter.

        @param args: optional list of L{RomanisationOperator}s to use for
            handling source and target readings.
        @param options: extra options
        @keyword dbConnectInst: instance of a L{DatabaseConnector}, if none is
            given, default settings will be assumed.
        @keyword sourceOperators: list of L{ReadingOperator}s used for handling
            source readings.
        @keyword targetOperators: list of L{ReadingOperator}s used for handling
            target readings.
        @keyword keepGRApostrophes: if set to C{True} apostrophes separating
            two syllables in Gwoyeu Romatzyh will be kept even if not necessary.
            Apostrophes missing before 0-initials will be added though.
        """
        super(GRDialectConverter, self).__init__(*args, **options)
        # set options
        if 'keepGRApostrophes' in options:
            self.optionValue['keepGRApostrophes'] \
                = options['keepGRApostrophes']

    @classmethod
    def getDefaultOptions(cls):
        """
        Returns the parent's default options extended by
        C{keepGRApostrophes} (default C{False}).

        @rtype: dict
        @return: the converter's default options
        """
        options = super(GRDialectConverter, cls).getDefaultOptions()
        options.update({'keepGRApostrophes': False})

        return options

    def convertEntities(self, readingEntities, fromReading='GR',
        toReading='GR'):
        """
        Converts a list of GR entities from the source operator's dialect to
        the target operator's dialect.

        The steps are: handling of the syllable separator apostrophe,
        capitalisation, conversion of the rhotacised final apostrophe and
        expansion of abbreviated forms if the target does not accept them.

        @type readingEntities: list of str
        @param readingEntities: entities written in the source dialect
        @type fromReading: str
        @param fromReading: name of the source reading
        @type toReading: str
        @param toReading: name of the target reading
        @rtype: list of str
        @return: entities written in the target dialect
        @raise UnsupportedError: if the conversion direction is not listed in
            L{CONVERSION_DIRECTIONS}.
        """
        if (fromReading, toReading) not in self.CONVERSION_DIRECTIONS:
            raise UnsupportedError("conversion direction from '" \
                + fromReading + "' to '" + toReading + "' not supported")

        fromOperator = self._getFromOperator(fromReading)
        toOperator = self._getToOperator(toReading)

        if self.getOption('keepGRApostrophes'):
            # convert separator apostrophe
            fromApostrophe = fromOperator.getOption(
                'GRSyllableSeparatorApostrophe')
            toApostrophe = toOperator.getOption(
                'GRSyllableSeparatorApostrophe')
            if fromApostrophe != toApostrophe:
                convertedEntities = []
                for entity in readingEntities:
                    if entity == fromApostrophe:
                        convertedEntities.append(toApostrophe)
                    else:
                        convertedEntities.append(entity)
                # hand the converted list on to the following steps, without
                # this assignment the apostrophe conversion would be lost
                readingEntities = convertedEntities
        else:
            # remove syllable separator
            readingEntities = fromOperator.removeApostrophes(readingEntities)

        # capitalisation
        targetCase = toOperator.getOption('case')
        if targetCase == 'lower':
            readingEntities = [entity.lower() for entity in readingEntities]
        elif targetCase == 'upper':
            readingEntities = [entity.upper() for entity in readingEntities]

        # convert rhotacised final apostrophe
        fromApostrophe = fromOperator.getOption('GRRhotacisedFinalApostrophe')
        toApostrophe = toOperator.getOption('GRRhotacisedFinalApostrophe')
        if fromApostrophe != toApostrophe:
            readingEntities = [entity.replace(fromApostrophe, toApostrophe) \
                for entity in readingEntities]

        # abbreviated forms
        if not toOperator.getOption('abbreviations'):
            readingEntities = [toOperator.convertAbbreviatedEntity(entity) \
                for entity in readingEntities]

        return readingEntities

982

class GRPinyinConverter(RomanisationConverter):
    """
    Provides a converter between the Chinese romanisation I{Gwoyeu Romatzyh}
    and I{Hanyu Pinyin}.

    Features:
        - configurable mapping of the optional neutral tone when converting
            from GR,
        - conversion of abbreviated forms of GR.

    Upper or lower case will be transferred between syllables, no special
    formatting according to the standards (i.e. Pinyin) will be made. Upper/
    lower case will be identified according to three classes: either the whole
    syllable is upper case, only the initial letter is upper case or otherwise
    the whole syllable is assumed being lower case.

    Limitations
    ===========
    Conversion cannot in general be done in a one-to-one manner.
    I{Gwoyeu Romatzyh} (GR) gives the etymological tone for a syllable in
    neutral tone while Pinyin doesn't. In contrast to tones in GR carrying more
    information I{r-coloured} syllables (I{Erlhuah}) are rendered the way they
    are pronounced, thus losing the original syllable. Converting those forms
    to Pinyin in a general manner is not possible while yielding the original
    string in Chinese characters might help to disambiguate. Another issue
    tone-wise is that Pinyin allows to specify the changed tone when dealing
    with tone sandhis instead of the etymological one while GR doesn't. Only
    working with the Chinese character string might help to restore the
    original tone.

    Conversion from Pinyin is crippled as the neutral tone in this form cannot
    be transferred to GR as described above. More information is needed to
    resolve this. For the other direction the neutral tone can be mapped but
    the etymological tone information is lost. For the optional neutral tone
    either a mapping is done to the neutral tone in Pinyin or to the original
    (etymological).
    """
    CONVERSION_DIRECTIONS = [('GR', 'Pinyin'), ('Pinyin', 'GR')]
    # GR deals with Erlhuah in one syllable, force on Pinyin. Convert GR
    # abbreviations to full forms
    DEFAULT_READING_OPTIONS = {'Pinyin': {'Erhua': 'oneSyllable'},
        'GR': {'abbreviations': False}}

    def __init__(self, *args, **options):
        """
        Creates an instance of the GRPinyinConverter.

        @param args: optional list of L{RomanisationOperator}s to use for
            handling source and target readings.
        @param options: extra options
        @keyword dbConnectInst: instance of a L{DatabaseConnector}, if none is
            given, default settings will be assumed.
        @keyword sourceOperators: list of L{ReadingOperator}s used for handling
            source readings.
        @keyword targetOperators: list of L{ReadingOperator}s used for handling
            target readings.
        @keyword GROptionalNeutralToneMapping: if set to 'original' GR syllables
            marked with an optional neutral tone will be mapped to the
            etymological tone, if set to 'neutral' they will be mapped to the
            neutral tone in Pinyin.
        @raise ValueError: if C{GROptionalNeutralToneMapping} is given with a
            value other than 'original' or 'neutral'.
        """
        super(GRPinyinConverter, self).__init__(*args, **options)

        if 'GROptionalNeutralToneMapping' in options:
            neutralMapping = options['GROptionalNeutralToneMapping']
            if neutralMapping not in ['original', 'neutral']:
                raise ValueError("Invalid option '" \
                    + str(neutralMapping) \
                    + "' for keyword 'GROptionalNeutralToneMapping'")
            self.optionValue['GROptionalNeutralToneMapping'] = neutralMapping

        # mapping from GR tones to Pinyin, the leading character of the GR
        # tone name is taken as the Pinyin tone number
        self.grToneMapping = dict([(tone, int(tone[0])) \
            for tone in operator.GROperator.TONES])
        # set optional neutral mapping
        if self.getOption('GROptionalNeutralToneMapping') == 'neutral':
            for tone in ['1stToneOptional5th', '2ndToneOptional5th',
                '3rdToneOptional5th', '4thToneOptional5th']:
                self.grToneMapping[tone] = 5

        # mapping from Pinyin tones to GR, the neutral tone has no counterpart
        self.pyToneMapping = {1: '1stTone', 2: '2ndTone', 3: '3rdTone',
            4: '4thTone', 5: None}

        # GROperator instance, created on demand by _getGROperator()
        self.grOperator = None

    @classmethod
    def getDefaultOptions(cls):
        """
        Returns the parent's default options extended by
        C{GROptionalNeutralToneMapping} (default C{'original'}).

        @rtype: dict
        @return: the converter's default options
        """
        options = super(GRPinyinConverter, cls).getDefaultOptions()
        options.update({'GROptionalNeutralToneMapping': 'original'})

        return options

    def convertBasicEntity(self, entity, fromReading, toReading):
        """
        Converts a single syllable using the table C{PinyinGRMapping}.

        @type entity: str
        @param entity: syllable written in the source reading
        @type fromReading: str
        @param fromReading: name of the source reading
        @type toReading: str
        @param toReading: name of the target reading
        @return: the syllable in the target reading including its tone
        @raise AmbiguousConversionError: for Erlhuah forms given in GR and for
            Pinyin syllables whose tone has no GR counterpart.
        @raise ConversionError: if no mapping is found, if no rhotacised form
            exists for the tone or if the mapped syllable is rejected as
            invalid for the target reading.
        """
        # we can't convert Erlhuah in GR
        if fromReading == "GR" and entity.endswith('l') \
            and entity not in ['el', 'erl', 'eel', 'ell']:
            raise AmbiguousConversionError("conversion for entity '" + entity \
                + "' is ambiguous")

        # split syllable into plain part and tonal information
        plainSyllable, tone = self.readingFact.splitEntityTone(entity,
            fromReading, **self.DEFAULT_READING_OPTIONS[fromReading])

        # only set for Pinyin input; bound up front so the check further down
        # does not depend on short-circuit evaluation
        erlhuahForm = False

        # lookup in database
        if fromReading == "GR":
            table = self.db.tables['PinyinGRMapping']
            transSyllable = self.db.selectScalar(select([table.c.Pinyin],
                table.c.GR == plainSyllable))
            transTone = self.grToneMapping[tone]

        elif fromReading == "Pinyin":
            # reduce Erlhuah form
            if plainSyllable != 'er' and plainSyllable.endswith('r'):
                erlhuahForm = True
                plainSyllable = plainSyllable[:-1]

            table = self.db.tables['PinyinGRMapping']
            transSyllable = self.db.selectScalar(select([table.c.GR],
                table.c.Pinyin == plainSyllable))
            if self.pyToneMapping[tone]:
                transTone = self.pyToneMapping[tone]
            else:
                raise AmbiguousConversionError("conversion for entity '" \
                    + plainSyllable + "' with tone '" + str(tone) \
                    + "' is ambiguous")

        if not transSyllable:
            raise ConversionError("conversion for entity '" + plainSyllable \
                + "' not supported")

        try:
            if toReading == 'GR' and erlhuahForm:
                try:
                    # lookup Erlhuah form for GR
                    return self._getGROperator().getRhotacisedTonalEntity(
                        transSyllable, transTone)
                except UnsupportedError as e:
                    # handle this as a conversion error as there is no
                    # Erlhuah form given for the given tone
                    raise ConversionError(e)
            else:
                return self.readingFact.getTonalEntity(transSyllable,
                    transTone, toReading,
                    **self.DEFAULT_READING_OPTIONS[toReading])
        except InvalidEntityError as e:
            # handle this as a conversion error as the converted syllable is
            # not accepted by the operator
            raise ConversionError(e)

    def _getGROperator(self):
        """Creates an instance of a GROperator if needed and returns it."""
        if self.grOperator is None:
            self.grOperator = operator.GROperator(
                **self.DEFAULT_READING_OPTIONS['GR'])
        return self.grOperator

1142

class PinyinIPAConverter(ReadingConverter):
    u"""
    Provides a converter between the Mandarin Chinese romanisation
    I{Hanyu Pinyin} and the I{International Phonetic Alphabet} (I{IPA}) for
    Standard Mandarin. This converter provides only basic support for tones and
    the user needs to specify additional means when handling tone sandhi
    occurrences.

    The standard conversion table is based on the source mentioned below.
    Though depiction in IPA depends on many factors and therefore might highly
    vary it seems this source is not error-free: final I{-üan} written [yan]
    should be similar to I{-ian} [iɛn] and I{-iong} written [yŋ] should be
    similar to I{-ong} [uŋ].

    As IPA allows for a big range of different representations for the sounds
    in a varying degree no conversion to Pinyin is offered.

    Currently conversion of I{Erhua sound} is not supported.

    Features:
        - Default tone sandhi handling for lower third tone and neutral tone,
        - extensibility of tone sandhi handling,
        - extensibility for general coarticulation effects.

    Limitations:
        - Tone sandhi needs special treatment depending on the user's needs,
        - transcription of onomatopoeic words will be limited to the general
            syllable scheme,
        - limited linking between syllables (e.g. for 啊、呕) will not be
            considered and
        - stress, intonation and accented speech are not covered.

    Tone sandhi
    ===========
    Speech in tonal languages is generally subject to X{tone sandhi}. For
    example in Mandarin I{bu4 cuo4} for 不错 will render to I{bu2 cuo4}, or
    I{lao3shi1} (老师) with a tone contour of 214 for I{lao3} and 55 for I{shi1}
    will render to a contour 21 for I{lao3}.

    When translating to IPA the system has to deal with these tone sandhis and
    therefore provides an option C{'sandhiFunction'} that can be set to the
    user specified handler. PinyinIPAConverter will only provide a very basic
    handler L{lowThirdAndNeutralToneRule()} which will apply the contour 21 for
    the third tone when several syllables occur and needs the user to supply
    proper tone information, e.g. I{ke2yi3} (可以) instead of the normal
    rendering as I{ke3yi3} to indicate the tone sandhi for the first syllable.

    Further support will be provided for varying stress on syllables in the
    neutral tone. Following a first tone the weak syllable will have a half-low
    pitch, following a second tone a middle, following a third tone a half-high
    and following a fourth tone a low pitch.

    There are further occurrences of tone sandhis:
        - pronunciations of 一 and 不 vary in different tones depending on their
            context,
        - directional complements like 拿出来 I{ná chu lai} under some
            circumstances lose their tone,
        - in a three syllable group ABC the second syllable B changes from
            second tone to first tone when A is in the first or second tone and
            C is not in the neutral tone.

    Coarticulation
    ==============
    In most cases conversion from Pinyin to IPA is straightforward if one does
    not take tone sandhi into account. There are cases though (when leaving
    aside tones), where phonetic realisation of a syllable depends on its
    context. The converter allows for handling coarticulation effects by
    adding a hook C{coarticulationFunction} to which a user-implemented
    function can be given. An example implementation is given with
    L{finalECoarticulation()}.

    Source
    ======
        - Hànyǔ Pǔtōnghuà Yǔyīn Biànzhèng (汉语普通话语音辨正). Page 15, Běijīng
            Yǔyán Dàxué Chūbǎnshè (北京语言大学出版社), 2003,
            ISBN 7-5619-0622-6.
        - San Duanmu: The Phonology of Standard Chinese. Second edition, Oxford
            University Press, 2007, ISBN 978-0-19-921578-2,
            ISBN 978-0-19-921579-9.
        - Yuen Ren Chao: A Grammar of Spoken Chinese. University of California
            Press, Berkeley, 1968, ISBN 0-520-00219-9.

    @see:
        - Mandarin tone sandhi:
            U{http://web.mit.edu/jinzhang/www/pinyin/tones/index.html}
        - IPA: U{http://en.wikipedia.org/wiki/International_Phonetic_Alphabet}
        - The Phonology of Standard Chinese. First edition, 2000:
            U{http://books.google.de/books?id=tG0-Ad9CrBcC}

    @todo Impl: Two different methods for tone sandhi and coarticulation
        effects?
    @todo Lang: Support for I{Erhua} in mapping.
    """
    CONVERSION_DIRECTIONS = [('Pinyin', 'MandarinIPA')]

    PINYIN_OPTIONS = {'Erhua': 'ignore', 'toneMarkType': 'Numbers',
        'missingToneMark': 'noinfo', 'case': 'lower'}
    """Options for the PinyinOperator."""

    # maps the Pinyin tone number to the tone name used for the IPA reading
    TONEMARK_MAPPING = {1: '1stTone', 2: '2ndTone', 3: '3rdToneRegular',
        4: '4thTone', 5: '5thTone'}

    NEUTRAL_TONE_MAPPING = {'1stTone': '5thToneHalfLow',
        '2ndTone': '5thToneMiddle', '3rdToneRegular': '5thToneHalfHigh',
        '3rdToneLow': '5thToneHalfHigh', '4thTone': '5thToneLow',
        '5thTone': '5thTone', '5thToneHalfHigh': '5thToneHalfHigh',
        '5thToneMiddle': '5thToneMiddle', '5thToneHalfLow':'5thToneHalfLow',
        '5thToneLow': '5thToneLow'}
    """Mapping of neutral tone following another tone."""

    def __init__(self, *args, **options):
        """
        Creates an instance of the PinyinIPAConverter.

        @param args: optional list of L{RomanisationOperator}s to use for
            handling source and target readings.
        @param options: extra options
        @keyword dbConnectInst: instance of a L{DatabaseConnector}, if none is
            given, default settings will be assumed.
        @keyword sourceOperators: list of L{ReadingOperator}s used for handling
            source readings.
        @keyword targetOperators: list of L{ReadingOperator}s used for handling
            target readings.
        @keyword sandhiFunction: a function that handles tonal changes
            and converts a given list of entities to accommodate sandhi
            occurrences, see L{lowThirdAndNeutralToneRule()} for the default
            implementation.
        @keyword coarticulationFunction: a function that handles coarticulation
            effects, see L{finalECoarticulation()} for an example
            implementation.
        """
        super(PinyinIPAConverter, self).__init__(*args, **options)

        # set the sandhiFunction for handling tonal changes
        if 'sandhiFunction' in options:
            self.optionValue['sandhiFunction'] = options['sandhiFunction']
        # set the coarticulationFunction for handling general phonological
        # changes
        if 'coarticulationFunction' in options:
            self.optionValue['coarticulationFunction'] \
                = options['coarticulationFunction']

    @classmethod
    def getDefaultOptions(cls):
        """
        Returns the parent's default options extended by
        C{coarticulationFunction} (default C{None}) and C{sandhiFunction}
        (default L{lowThirdAndNeutralToneRule()}).

        @rtype: dict
        @return: the converter's default options
        """
        options = super(PinyinIPAConverter, cls).getDefaultOptions()
        options.update({'coarticulationFunction': None,
            'sandhiFunction': PinyinIPAConverter.lowThirdAndNeutralToneRule})

        return options

    def convertEntities(self, readingEntities, fromReading='Pinyin',
        toReading='MandarinIPA'):
        """
        Converts a list of Pinyin entities to IPA.

        Syllables are split into plain syllable and tone, converted one by one
        (giving the optional coarticulation hook the first chance), passed
        through the sandhi function and finally joined with their tone by the
        target operator. Entities that are not recognised as Pinyin syllables
        are passed through unchanged.

        @type readingEntities: list of str
        @param readingEntities: entities written in the source reading
        @type fromReading: str
        @param fromReading: name of the source reading
        @type toReading: str
        @param toReading: name of the target reading
        @rtype: list of str
        @return: entities written in the target reading
        @raise UnsupportedError: if the conversion direction is not listed in
            L{CONVERSION_DIRECTIONS}.
        @raise ConversionError: if a syllable cannot be mapped.
        """
        if (fromReading, toReading) not in self.CONVERSION_DIRECTIONS:
            raise UnsupportedError("conversion direction from '" \
                + fromReading + "' to '" + toReading + "' not supported")

        if self.readingFact.isReadingConversionSupported(fromReading,
            fromReading):
            # use user specified source operator, set target to not accept
            # Erhua sound (for Pinyin)
            readingEntities = self.readingFact.convertEntities(readingEntities,
                fromReading, fromReading,
                sourceOperators=[self._getFromOperator(fromReading)],
                targetOptions=self.PINYIN_OPTIONS)
            # TODO once we support Erhua, use oneSyllable form to lookup

        # split syllables into plain syllable and tone part
        entityTuples = []
        for entity in readingEntities:
            # convert reading entities, don't convert the rest
            if self.readingFact.isReadingEntity(entity, fromReading,
                **self.PINYIN_OPTIONS):
                # split syllable into plain part and tonal information
                plainSyllable, tone = self.readingFact.splitEntityTone(entity,
                    fromReading, **self.PINYIN_OPTIONS)

                entityTuples.append((plainSyllable, tone))
            else:
                entityTuples.append(entity)

        # convert to IPA
        coarticulationFunction = self.getOption('coarticulationFunction')
        ipaTupelList = []
        for idx, entry in enumerate(entityTuples):
            # convert reading entities, don't convert the rest
            if isinstance(entry, tuple):
                plainSyllable, tone = entry

                transEntry = None
                if coarticulationFunction:
                    # left and right context are the entries around position
                    # idx
                    transEntry = coarticulationFunction(self,
                        entityTuples[:idx], plainSyllable, tone,
                        entityTuples[idx + 1:])

                if not transEntry:
                    # standard conversion
                    transEntry = self._convertSyllable(plainSyllable, tone)

                ipaTupelList.append(transEntry)
            else:
                ipaTupelList.append(entry)

        # handle sandhi
        if self._getToOperator(toReading).getOption('toneMarkType') != 'None':
            ipaTupelList = self.getOption('sandhiFunction')(self, ipaTupelList)

        # get tonal forms
        toReadingEntities = []
        for entry in ipaTupelList:
            if isinstance(entry, tuple):
                plainSyllable, tone = entry
                entity = self._getToOperator(toReading).getTonalEntity(
                    plainSyllable, tone)
            else:
                entity = entry
            toReadingEntities.append(entity)
        return toReadingEntities

    def _convertSyllable(self, plainSyllable, tone):
        """
        Converts a single syllable from Pinyin to IPA.

        @type plainSyllable: str
        @param plainSyllable: plain syllable in the source reading
        @type tone: int
        @param tone: the syllable's tone
        @rtype: tuple
        @return: IPA representation of the syllable and the name of its tone,
            the latter being C{None} if no tone is given
        @raise ConversionError: if the table C{PinyinIPAMapping} holds no or
            more than one default entry for the syllable.
        """
        # lookup in database
        table = self.db.tables['PinyinIPAMapping']
        transSyllables = self.db.selectScalars(select([table.c.IPA],
            and_(table.c.Pinyin == plainSyllable,
                table.c.Feature.in_(['', 'Default']))))

        if not transSyllables:
            raise ConversionError("conversion for entity '" + plainSyllable \
                + "' not supported")
        elif len(transSyllables) != 1:
            raise ConversionError("conversion for entity '" + plainSyllable \
                + "' ambiguous")
        if tone:
            transTone = self.TONEMARK_MAPPING[tone]
        else:
            transTone = None

        return transSyllables[0], transTone

    def lowThirdAndNeutralToneRule(self, entityTuples):
        """
        Converts C{'3rdToneRegular'} to C{'3rdToneLow'} for syllables followed
        by others and C{'5thTone'} to the respective forms when following
        another syllable.

        This function serves as the default rule and can be overwritten by
        giving a function as option C{sandhiFunction} on instantiation.

        @type entityTuples: list of tuple/str
        @param entityTuples: a list of tuples and strings. An IPA entity is
            given as a tuple with the plain syllable and its tone, other content
            is given as plain string.
        @rtype: list
        @return: converted entity list
        @todo Lang: What to do on several following neutral tones?
        """
        # only convert 3rd tone to lower form when multiple syllables occur
        if len(entityTuples) <= 1:
            return entityTuples

        # convert
        convertedEntities = []
        precedingTone = None
        for idx, entry in enumerate(entityTuples):
            if isinstance(entry, tuple):
                plainSyllable, tone = entry

                if tone == '5thTone' and precedingTone:
                    tone = self.NEUTRAL_TONE_MAPPING[precedingTone]
                elif tone == '3rdToneRegular' and idx + 1 != len(entityTuples):
                    tone = '3rdToneLow'
                entry = (plainSyllable, tone)

                precedingTone = tone
            else:
                # non-syllable content interrupts the tonal context
                precedingTone = None

            convertedEntities.append(entry)

        return convertedEntities

    def finalECoarticulation(self, leftContext, plainSyllable, tone,
        rightContext):
        u"""
        Example function for handling coarticulation of final I{e} for the
        neutral tone.

        Only syllables with final I{e} are considered, for other syllables
        C{None} is returned. This will trigger the regular conversion method.

        Pronunciation of final I{e}
        ===========================
        The final I{e} found in syllables I{de}, I{me} and others is
        pronounced /ɤ/ in the general case (see source below) but if tonal
        stress is missing it will be pronounced /ə/. This implementation will
        take care of this for the fifth tone. If no tone is specified
        (C{'None'}) an L{ConversionError} will be raised for the syllables
        affected.

        Source: Hànyǔ Pǔtōnghuà Yǔyīn Biànzhèng (汉语普通话语音辨正). Page 15,
        Běijīng Yǔyán Dàxué Chūbǎnshè (北京语言大学出版社), 2003,
        ISBN 7-5619-0622-6.

        @type leftContext: list of tuple/str
        @param leftContext: syllables preceding the syllable in question in the
            source reading
        @type plainSyllable: str
        @param plainSyllable: plain syllable in the source reading
        @type tone: int
        @param tone: the syllable's tone
        @type rightContext: list of tuple/str
        @param rightContext: syllables following the syllable in question in the
            source reading
        @rtype: tuple
        @return: IPA representation of the syllable and the name of its tone,
            or C{None} if the syllable is not handled here
        @raise ConversionError: if the table C{PinyinIPAMapping} holds no or
            more than one C{'5thTone'} entry for the syllable.
        """
        # NOTE(review): the code below only acts on tone 5; for a missing tone
        # it returns None instead of raising the ConversionError announced
        # above -- confirm which behaviour is intended.
        if tone != 5:
            return None

        # NOTE(review): 'Pinyin' is the source reading of this converter, yet
        # the operator is requested through _getToOperator() -- confirm.
        _, final = self._getToOperator('Pinyin').getOnsetRhyme(plainSyllable)
        if final != 'e':
            return None

        # lookup in database
        table = self.db.tables['PinyinIPAMapping']
        transSyllables = self.db.selectScalars(select([table.c.IPA],
            and_(table.c.Pinyin == plainSyllable,
                table.c.Feature == '5thTone')))
        if not transSyllables:
            raise ConversionError("conversion for entity '" \
                + plainSyllable + "' not supported")
        elif len(transSyllables) != 1:
            raise ConversionError("conversion for entity '" \
                + plainSyllable + "' and tone '" + str(tone) \
                + "' ambiguous")

        return transSyllables[0], self.TONEMARK_MAPPING[tone]

1484

1485 1486 -class PinyinBrailleConverter(ReadingConverter):

1487 """ 1488 PinyinBrailleConverter defines a converter between the Chinese romanisation 1489 I{Hanyu Pinyin} (with tone marks as numbers) and the I{Braille} system for 1490 Mandarin. 1491 1492 Conversion from Braille to Pinyin is ambiguous. The syllable pairs mo/me, 1493 e/o and le/lo will yield an L{AmbiguousConversionError}. 1494 1495 @see: 1496 - How is Chinese written in Braille?: 1497 U{http://www.braille.ch/pschin-e.htm} 1498 - Chinese Braille: U{http://en.wikipedia.org/wiki/Chinese_braille} 1499 @todo Impl: Move the toneMarks option to the L{MandarinBrailleOperator}. 1500 """ 1501 CONVERSION_DIRECTIONS = [('Pinyin', 'MandarinBraille'), 1502 ('MandarinBraille', 'Pinyin')] 1503 1504 PUNCTUATION_SIGNS_MAPPING = {u'。': u'⠐⠆', u',': u'⠐', u'?': u'⠐⠄', 1505 u'!': u'⠰⠂', u':': u'⠒', u';': u'⠰', u'-': u'⠠⠤', u'…': u'⠐⠐⠐', 1506 u'·': u'⠠⠄', u'(': u'⠰⠄', u')': u'⠠⠆', u'[': u'⠰⠆', u']': u'⠰⠆'} 1507

1508 - def __init__(self, *args, **options):

1509 """ 1510 Creates an instance of the PinyinBrailleConverter. 1511 1512 @param args: optional list of L{RomanisationOperator}s to use for 1513 handling source and target readings. 1514 @param options: extra options 1515 @keyword dbConnectInst: instance of a L{DatabaseConnector}, if none is 1516 given, default settings will be assumed. 1517 @keyword sourceOperators: list of L{ReadingOperator}s used for handling 1518 source readings. 1519 @keyword targetOperators: list of L{ReadingOperator}s used for handling 1520 target readings. 1521 @keyword toneMarks: if set to C{True} tone marks will be used when 1522 converted to Braille representation. 1523 """ 1524 super(PinyinBrailleConverter, self).__init__(*args, **options) 1525 # use tone marks when converting to Braille? 1526 if 'toneMarks' in options: 1527 self.optionValue['toneMarks'] = options['toneMarks'] 1528 1529 # get mappings 1530 self._createMappings() 1531 1532 # punctuation mapping 1533 self.reversePunctuationMapping = {} 1534 for key in self.PUNCTUATION_SIGNS_MAPPING: 1535 if key in self.reversePunctuationMapping: 1536 # ambiguous mapping, so remove 1537 self.reversePunctuationMapping[key] = None 1538 else: 1539 value = self.PUNCTUATION_SIGNS_MAPPING[key] 1540 self.reversePunctuationMapping[value] = key 1541 1542 # regex to split out punctuation 1543 self.pinyinPunctuationRegex = re.compile(ur'(' \ 1544 + '|'.join([re.escape(p) for p \ 1545 in self.PUNCTUATION_SIGNS_MAPPING.keys()]) \ 1546 + '|.+?)') 1547 1548 braillePunctuation = list(set(self.PUNCTUATION_SIGNS_MAPPING.values())) 1549 # longer marks first in regex 1550 braillePunctuation.sort(lambda x,y: len(y) - len(x)) 1551 self.braillePunctuationRegex = re.compile(ur'(' \ 1552 + '|'.join([re.escape(p) for p in braillePunctuation]) + '|.+?)')

1553 1554 @classmethod

1555 - def getDefaultOptions(cls):

1556 options = super(PinyinBrailleConverter, cls).getDefaultOptions() 1557 options.update({'toneMarks': True}) 1558 1559 return options

1560

1561 - def _createMappings(self):

1562 """ 1563 Creates the mappings of syllable initials and finals from the database. 1564 """ 1565 # initials 1566 self.pinyinInitial2Braille = {} 1567 self.braille2PinyinInitial = {} 1568 1569 table = self.db.tables['PinyinBrailleInitialMapping'] 1570 entries = self.db.selectRows( 1571 select([table.c.PinyinInitial, table.c.Braille])) 1572 1573 for pinyinInitial, brailleChar in entries: 1574 # Pinyin 2 Braille 1575 if pinyinInitial in self.pinyinInitial2Braille: 1576 raise ValueError( 1577 "Ambiguous mapping from Pinyin syllable initial to Braille") 1578 self.pinyinInitial2Braille[pinyinInitial] = brailleChar 1579 # Braille 2 Pinyin 1580 if brailleChar not in self.braille2PinyinInitial: 1581 self.braille2PinyinInitial[brailleChar] = set() 1582 self.braille2PinyinInitial[brailleChar].add(pinyinInitial) 1583 1584 self.pinyinInitial2Braille[''] = '' 1585 self.braille2PinyinInitial[''] = set(['']) 1586 1587 # finals 1588 self.pinyinFinal2Braille = {} 1589 self.braille2PinyinFinal = {} 1590 1591 table = self.db.tables['PinyinBrailleFinalMapping'] 1592 entries = self.db.selectRows( 1593 select([table.c.PinyinFinal, table.c.Braille])) 1594 1595 for pinyinFinal, brailleChar in entries: 1596 # Pinyin 2 Braille 1597 if pinyinFinal in self.pinyinFinal2Braille: 1598 raise ValueError( 1599 "Ambiguous mapping from Pinyin syllable final to Braille") 1600 self.pinyinFinal2Braille[pinyinFinal] = brailleChar 1601 # Braille 2 Pinyin 1602 if brailleChar not in self.braille2PinyinFinal: 1603 self.braille2PinyinFinal[brailleChar] = set() 1604 self.braille2PinyinFinal[brailleChar].add(pinyinFinal) 1605 1606 # map ê to same Braille character as e 1607 self.pinyinFinal2Braille[u'ê'] = self.pinyinFinal2Braille[u'e']

1608

def convertEntities(self, readingEntities, fromReading, toReading):
    """
    Converts a list of entities between I{Pinyin} and I{MandarinBraille}.

    If the reading factory supports converting the source reading to itself,
    the input is first brought into a standard form (numbered tones, Erhua
    ignored, missing tone marks treated as no information). Entities
    accepted by the source operator are then converted with
    L{convertBasicEntity()}; every other string is cut into pieces by the
    punctuation regex of the source reading and mapped piece by piece, where
    pieces without a mapping are copied unchanged. Finally the result is
    brought into the form of the target operator if the factory supports
    converting the target reading to itself.

    @type readingEntities: list of str
    @param readingEntities: entities written in the source reading
    @type fromReading: str
    @param fromReading: name of the source reading
    @type toReading: str
    @param toReading: name of the target reading
    @rtype: list of str
    @return: entities written in the target reading
    @raise UnsupportedError: if the direction is not included in
        C{CONVERSION_DIRECTIONS}.
    @raise AmbiguousConversionError: if a Braille punctuation mark has an
        empty entry in C{reversePunctuationMapping}.
    """
    direction = (fromReading, toReading)
    if direction not in self.CONVERSION_DIRECTIONS:
        raise UnsupportedError("conversion direction from '" + fromReading
            + "' to '" + toReading + "' not supported")

    factory = self.readingFact
    if factory.isReadingConversionSupported(fromReading, fromReading):
        # keep the user specified source operator, but let the target use
        #   numbered tones and not accept Erhua sound (for Pinyin)
        standardForm = {'Erhua': 'ignore', 'toneMarkType': 'Numbers',
            'missingToneMark': 'noinfo'}
        readingEntities = factory.convertEntities(readingEntities,
            fromReading, fromReading,
            sourceOperators=[self._getFromOperator(fromReading)],
            targetOptions=standardForm)

    converted = []
    if fromReading == "Pinyin":
        for entity in readingEntities:
            if self._getFromOperator(fromReading).isReadingEntity(entity):
                converted.append(
                    self.convertBasicEntity(entity, fromReading, toReading))
                continue
            # not a reading entity: only punctuation marks are translated
            signs = self.PUNCTUATION_SIGNS_MAPPING
            converted.extend([signs[piece] if piece in signs else piece
                for piece in self.pinyinPunctuationRegex.findall(entity)])
    elif fromReading == "MandarinBraille":
        for entity in readingEntities:
            if self._getFromOperator(fromReading).isReadingEntity(entity):
                # Braille syllables are handed on in lower case
                converted.append(self.convertBasicEntity(entity.lower(),
                    fromReading, toReading))
                continue
            for piece in self.braillePunctuationRegex.findall(entity):
                reverseSigns = self.reversePunctuationMapping
                if piece not in reverseSigns:
                    converted.append(piece)
                    continue
                target = reverseSigns[piece]
                if not target:
                    # an empty entry marks a sign without unique counterpart
                    raise AmbiguousConversionError(
                        "conversion for entity '" + piece
                            + "' is ambiguous")
                converted.append(target)

    # convert to requested form if supported
    if factory.isReadingConversionSupported(toReading, toReading):
        converted = factory.convertEntities(converted, toReading, toReading,
            targetOperators=[self._getToOperator(toReading)])
    return converted

1667

def convertBasicEntity(self, entity, fromReading, toReading):
    """
    Converts a basic entity (a syllable) in the source reading to the given
    target reading.

    This method is called by L{convertEntities()} and a single entity
    is given for conversion.

    If a single entity needs to be converted it is recommended to use
    L{convertEntities()} instead. In the general case it can not be ensured
    that a mapping from one reading to another can be done by the simple
    conversion of a basic entity. One-to-many mappings are possible and
    there is no guarantee that any entity of a reading recognised by
    L{operator.ReadingOperator.isReadingEntity()} will be mapped here.

    The tone is dropped from the result if the option C{toneMarks} is not
    set.

    @type entity: str
    @param entity: string written in the source reading in lower case
        letters
    @type fromReading: str
    @param fromReading: name of the source reading
    @type toReading: str
    @param toReading: name of the target reading, different from the source
        reading
    @rtype: str
    @returns: the entity converted to the C{toReading} in lower case
    @raise AmbiguousConversionError: if conversion for this entity of the
        source reading is ambiguous.
    @raise ConversionError: on other operations specific to the conversion
        of the entity, including an onset or rhyme without a mapping.
    @raise InvalidEntityError: if the entity is invalid.
    @raise UnsupportedError: if C{fromReading} is neither C{Pinyin} nor
        C{MandarinBraille}.
    """
    fromOperator = self._getFromOperator(fromReading)
    # split entity into plain part and tonal information
    plainEntity, tone = fromOperator.splitEntityTone(entity)

    if fromReading == "Pinyin":
        initial, final = fromOperator.getOnsetRhyme(plainEntity)
        try:
            transSyllable = self.pinyinInitial2Braille[initial] \
                + self.pinyinFinal2Braille[final]
        except KeyError:
            raise ConversionError("conversion for entity '"
                + plainEntity + "' not supported")
    elif fromReading == "MandarinBraille":
        # mapping from Braille to Pinyin is ambiguous
        initial, final = fromOperator.getOnsetRhyme(plainEntity)
        try:
            pinyinInitials = self.braille2PinyinInitial[initial]
            pinyinFinals = self.braille2PinyinFinal[final]
        except KeyError:
            # report a missing mapping the same way as the Pinyin branch
            #   instead of leaking a bare KeyError to the caller
            raise ConversionError("conversion for entity '"
                + plainEntity + "' not supported")

        # get all possible forms; the table is the same for every pair
        table = self.db.tables['PinyinInitialFinal']
        forms = []
        for pinyinInitial in pinyinInitials:
            for pinyinFinal in pinyinFinals:
                # get Pinyin syllable
                entry = self.db.selectScalar(
                    select([table.c.Pinyin],
                        and_(table.c.PinyinInitial == pinyinInitial,
                            table.c.PinyinFinal == pinyinFinal)))
                if entry:
                    forms.append(entry)

        # narrow down to the forms accepted by the target operator, a
        #   single candidate is left for getTonalEntity() to validate
        if len(forms) > 1:
            toOperator = self._getToOperator(toReading)
            forms = [form for form in forms
                if toOperator.isPlainReadingEntity(form)]
        if not forms:
            raise ConversionError("conversion for entity '"
                + plainEntity + "' not supported")
        if len(forms) > 1:
            raise AmbiguousConversionError("conversion for entity '"
                + plainEntity + "' is ambiguous")
        transSyllable = forms[0]
    else:
        # without this branch transSyllable would be unbound below
        raise UnsupportedError("conversion direction from '" + fromReading
            + "' to '" + toReading + "' not supported")

    # remove tone information
    if not self.getOption('toneMarks'):
        tone = None
    try:
        return self._getToOperator(toReading).getTonalEntity(transSyllable,
            tone)
    except InvalidEntityError as e:
        # handle this as a conversion error as the converted syllable is not
        #   accepted by the operator
        raise ConversionError(e)

1755

class JyutpingDialectConverter(EntityWiseReadingConverter):
    u"""
    Provides a converter between different representations of the Cantonese
    romanisation I{Jyutping}.
    """
    CONVERSION_DIRECTIONS = [('Jyutping', 'Jyutping')]

    def convertBasicEntity(self, entity, fromReading, toReading):
        """
        Converts a single I{Jyutping} syllable into the form of the target
        operator.

        @type entity: str
        @param entity: syllable in the form of the source operator
        @type fromReading: str
        @param fromReading: name of the source reading
        @type toReading: str
        @param toReading: name of the target reading
        @rtype: str
        @return: result of the target operator's C{getTonalEntity()} for the
            plain syllable and its tone
        @raise ConversionError: if the target operator rejects the syllable
            with an L{InvalidEntityError}.
        """
        # separate the plain syllable from its tone
        sourceOperator = self._getFromOperator(fromReading)
        plain, tone = sourceOperator.splitEntityTone(entity)

        # only 'lower' and 'upper' change the syllable, any other value of
        #   the 'case' option leaves the capitalisation untouched
        targetOperator = self._getToOperator(toReading)
        case = targetOperator.getOption('case')
        if case == 'lower':
            plain = plain.lower()
        elif case == 'upper':
            plain = plain.upper()

        # attach the tone in the target operator's notation
        try:
            return targetOperator.getTonalEntity(plain, tone)
        except InvalidEntityError as e:
            # the converted syllable is not accepted by the operator, report
            #   this as a failed conversion
            raise ConversionError(e)

1783

class CantoneseYaleDialectConverter(EntityWiseReadingConverter):
    u"""
    Provides a converter between different representations of the
    I{Cantonese Yale} romanisation system.

    High Level vs. High Falling Tone
    ================================
    As described in L{CantoneseYaleOperator} the abbreviated form of the
    Cantonese Yale romanisation system, which uses numbers as tone marks,
    makes no distinction between the high level tone and the high falling
    tone. When converting to the form with diacritical marks it is thus
    important to choose the correct mapping. This can be configured by
    applying a special instance of a L{CantoneseYaleOperator}.
    """
    CONVERSION_DIRECTIONS = [('CantoneseYale', 'CantoneseYale')]

    def convertBasicEntity(self, entity, fromReading, toReading):
        """
        Converts a single I{Cantonese Yale} syllable into the form of the
        target operator.

        @type entity: str
        @param entity: syllable in the form of the source operator
        @type fromReading: str
        @param fromReading: name of the source reading
        @type toReading: str
        @param toReading: name of the target reading
        @rtype: str
        @return: result of the target operator's C{getTonalEntity()} for the
            plain syllable and its tone
        @raise ConversionError: if the target operator rejects the syllable
            with an L{InvalidEntityError}.
        """
        source = self._getFromOperator(fromReading)
        target = self._getToOperator(toReading)

        # split syllable into plain part and tonal information
        plain, tone = source.splitEntityTone(entity)

        # capitalisation: the option values 'lower' and 'upper' match the
        #   names of the string methods to apply, other values change nothing
        letterCase = target.getOption('case')
        if letterCase in ('lower', 'upper'):
            plain = getattr(plain, letterCase)()

        # get syllable with tone mark
        try:
            return target.getTonalEntity(plain, tone)
        except InvalidEntityError as e:
            # handle this as a conversion error as the converted syllable is
            #   not accepted by the operator
            raise ConversionError(e)

1820

class JyutpingYaleConverter(RomanisationConverter):
    """
    Provides a converter between the Cantonese romanisation systems
    I{Jyutping} and I{Cantonese Yale}.

    Upper or lower case will be transferred between syllables, no special
    formatting according to the standards will be made. Upper/lower case
    will be identified according to three classes: either the whole syllable
    is upper case, only the initial letter is upper case or otherwise the
    whole syllable is assumed being lower case.

    High Level vs. High Falling Tone
    ================================
    As described in L{CantoneseYaleOperator} the Cantonese Yale romanisation
    system makes a distinction between the high level tone and the high
    falling tone in general while Jyutping does not. On conversion it is
    thus important to choose the correct mapping. This can be configured by
    applying a special instance of a L{CantoneseYaleOperator}.
    """
    CONVERSION_DIRECTIONS = [
        ('Jyutping', 'CantoneseYale'),
        ('CantoneseYale', 'Jyutping'),
        ]
    # retain all information when converting Yale, use special dialect
    DEFAULT_READING_OPTIONS = {
        'CantoneseYale': {'toneMarkType': 'Internal'},
        'Jyutping': {},
        }

    DEFAULT_TONE_MAPPING = {
        2: '2ndTone',
        3: '3rdTone',
        4: '4thTone',
        5: '5thTone',
        6: '6thTone',
        }
    """
    Mapping of Jyutping tones to Yale tones. Tone 1 needs to be handled
    independently.
    """

    def __init__(self, *args, **options):
        """
        Creates an instance of the JyutpingYaleConverter.

        @param args: optional list of L{RomanisationOperator}s to use for
            handling source and target readings.
        @param options: extra options
        @keyword dbConnectInst: instance of a L{DatabaseConnector}, if none
            is given, default settings will be assumed.
        @keyword sourceOperators: list of L{ReadingOperator}s used for
            handling source readings.
        @keyword targetOperators: list of L{ReadingOperator}s used for
            handling target readings.
        """
        # nothing to set up here, all arguments go to the parent class
        super(JyutpingYaleConverter, self).__init__(*args, **options)

    def convertBasicEntity(self, entity, fromReading, toReading):
        """
        Converts a single syllable between I{Jyutping} and
        I{Cantonese Yale}.

        The plain syllable is looked up in the table
        C{JyutpingYaleMapping}, the tone is mapped separately.

        @type entity: str
        @param entity: syllable written in the source reading
        @type fromReading: str
        @param fromReading: name of the source reading
        @type toReading: str
        @param toReading: name of the target reading
        @rtype: str
        @return: syllable written in the target reading
        @raise ConversionError: if the table has no entry for the plain
            syllable or the target reading rejects the converted syllable.
        """
        # split syllable into plain part and tonal information
        sourceOptions = self.DEFAULT_READING_OPTIONS[fromReading]
        plain, tone = self.readingFact.splitEntityTone(entity, fromReading,
            **sourceOptions)

        # both directions read from the same mapping table
        mapping = self.db.tables['JyutpingYaleMapping']
        if fromReading == "CantoneseYale":
            query = select([mapping.c.Jyutping],
                mapping.c.CantoneseYale == plain)
            transSyllable = self.db.selectScalar(query)
            # the tone number is the first character of the internal Yale
            #   tone's string representation
            transTone = int(tone[0]) if tone else None
        elif fromReading == "Jyutping":
            query = select([mapping.c.CantoneseYale],
                mapping.c.Jyutping == plain)
            transSyllable = self.db.selectScalar(query)
            if not tone:
                transTone = None
            elif tone == 1:
                # Yale has two candidates for tone 1, the target operator
                #   decides which one is used
                targetOperator = self._getToOperator(toReading)
                transTone = targetOperator.getOption('YaleFirstTone')
            else:
                transTone = self.DEFAULT_TONE_MAPPING[tone]

        if not transSyllable:
            raise ConversionError("conversion for entity '" + plain
                + "' not supported")

        targetOptions = self.DEFAULT_READING_OPTIONS[toReading]
        try:
            return self.readingFact.getTonalEntity(transSyllable, transTone,
                toReading, **targetOptions)
        except InvalidEntityError as e:
            # handle this as a conversion error as the converted syllable is
            #   not accepted by the operator
            raise ConversionError(e)

1912

class BridgeConverter(ReadingConverter):
    """
    Provides a L{ReadingConverter} that converts between two readings by
    going over a third reading, the so called bridge reading.
    """
    def _getConversionDirections(bridge):
        """
        Extracts all conversion directions implicitly stored in the bridge
        definition.

        @type bridge: list of tuple
        @param bridge: 3-tuples indicating conversion direction over a third
            reading (bridge)
        @rtype: list of tuple
        @return: conversion directions, each given once
        """
        # the bridge reading in the middle of each tuple is of no interest
        return list(set((source, target) for source, _via, target in bridge))

    CONVERSION_BRIDGE = [
        ('WadeGiles', 'Pinyin', 'MandarinIPA'),
        ('MandarinBraille', 'Pinyin', 'MandarinIPA'),
        ('WadeGiles', 'Pinyin', 'MandarinBraille'),
        ('MandarinBraille', 'Pinyin', 'WadeGiles'),
        ('GR', 'Pinyin', 'WadeGiles'),
        ('MandarinBraille', 'Pinyin', 'GR'),
        ('WadeGiles', 'Pinyin', 'GR'),
        ('GR', 'Pinyin', 'MandarinBraille'),
        # TODO remove once there is a proper converter for GR to IPA
        ('GR', 'Pinyin', 'MandarinIPA'),
        ]
    """
    List containing all conversion directions together with the bridge
    reading over which the conversion is made.
    Form: (fromReading, bridgeReading, toReading)
    As conversion may be lossy it is important which conversion path is
    chosen.
    """

    CONVERSION_DIRECTIONS = _getConversionDirections(CONVERSION_BRIDGE)

    def __init__(self, *args, **options):
        """
        Creates an instance of the BridgeConverter.

        @param args: optional list of L{RomanisationOperator}s to use for
            handling source and target readings.
        @param options: extra options passed to the L{ReadingConverter}s
        @keyword dbConnectInst: instance of a L{DatabaseConnector}, if none
            is given, default settings will be assumed.
        @keyword sourceOperators: list of L{ReadingOperator}s used for
            handling source readings.
        @keyword targetOperators: list of L{ReadingOperator}s used for
            handling target readings.
        """
        super(BridgeConverter, self).__init__(*args, **options)

        # index the bridge reading by (source, target) direction
        self.bridgeLookup = dict(((source, target), via)
            for source, via, target in self.CONVERSION_BRIDGE)

    def convertEntities(self, readingEntities, fromReading, toReading):
        """
        Converts the given entities in two steps, from the source reading
        to the bridge reading and from there to the target reading.

        @type readingEntities: list of str
        @param readingEntities: entities written in the source reading
        @type fromReading: str
        @param fromReading: name of the source reading
        @type toReading: str
        @param toReading: name of the target reading
        @rtype: list of str
        @return: entities written in the target reading
        @raise UnsupportedError: if the direction is not included in
            C{CONVERSION_DIRECTIONS}.
        """
        direction = (fromReading, toReading)
        if direction not in self.CONVERSION_DIRECTIONS:
            raise UnsupportedError("conversion direction from '"
                + fromReading + "' to '" + toReading + "' not supported")
        via = self.bridgeLookup[direction]

        # first leg: the source operator is used to read the input
        intermediate = self.readingFact.convertEntities(readingEntities,
            fromReading, via,
            sourceOperators=[self._getFromOperator(fromReading)])

        # second leg: the target operator is used to write the output
        return self.readingFact.convertEntities(intermediate, via, toReading,
            targetOperators=[self._getToOperator(toReading)])

1989

Source Code for Module cjklib.reading.converter