1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 u"""
19 Provides L{ReadingConverter}s, classes to convert strings written in a character
20 reading to another reading.
21
22 Examples
23 ========
24 Convert a string from I{Jyutping} to I{Cantonese Yale}:
25
26 >>> from cjklib.reading import ReadingFactory
27 >>> f = ReadingFactory()
28 >>> f.convert('gwong2jau1wa2', 'Jyutping', 'CantoneseYale')
29 u'gw\xf3ngy\u0101uw\xe1'
30
31 This is also possible creating a converter instance explicitly using the
32 factory:
33
34 >>> jyc = f.createReadingConverter('GR', 'Pinyin')
35 >>> jyc.convert('Woo.men tingshuo yeou "Yinnduhshyue", "Aijyishyue"')
36 u'W\u01d2men t\u012bngshu\u014d y\u01d2u "Y\xecnd\xf9xu\xe9", \
37 "\u0100ij\xedxu\xe9"'
38
39 Convert between different dialects of the same reading I{Wade-Giles}:
40
41 >>> f.convert(u'kuo3-yü2', 'WadeGiles', 'WadeGiles',
42 ... sourceOptions={'toneMarkType': 'Numbers'},
43 ... targetOptions={'toneMarkType': 'SuperscriptNumbers'})
44 u'kuo\xb3-y\xfc\xb2'
45
46 See L{PinyinDialectConverter} for more examples.
47 """
48 import re
49 import copy
50
51 from sqlalchemy import select
52 from sqlalchemy.sql import and_, or_, not_
53
54 from cjklib.exception import (ConversionError, AmbiguousConversionError,
55 InvalidEntityError, UnsupportedError)
56 from cjklib.dbconnector import DatabaseConnector
57 import operator
58 import cjklib.reading
61 """
62 Defines an abstract converter between two or more I{character reading}s.
63
64 The basic method is L{convert()} which converts one input string from one
65 reading to another.
66
67 The methods L{getDefaultOptions()} and L{getOption()} provide means to
68 handle conversion specific settings.
69
70 The class itself can't be used directly, it has to be subclassed and its
71 methods need to be extended.
72 """
73 CONVERSION_DIRECTIONS = []
74 """
75 List of tuples for specifying supported conversion directions from reading A
76 to reading B. If both directions are supported, two tuples (A, B) and (B, A)
77 are given.
78 """
79
def __init__(self, *args, **options):
    """
    Creates an instance of the ReadingConverter.

    Reading operators passed positionally are registered for the source and
    the target side alike. Operators passed through C{sourceOperators} or
    C{targetOperators} are registered for one side only and are applied
    afterwards, replacing a positional operator of the same reading.

    @param args: optional list of L{ReadingOperator}s to use for
        handling source and target readings.
    @param options: extra options
    @keyword dbConnectInst: instance of a L{DatabaseConnector}, if none is
        given, default settings will be assumed.
    @keyword sourceOperators: list of L{ReadingOperator}s used for handling
        source readings.
    @keyword targetOperators: list of L{ReadingOperator}s used for handling
        target readings.
    @raise ValueError: if a given object is not a L{ReadingOperator}.
    """
    try:
        self.db = options['dbConnectInst']
    except KeyError:
        self.db = DatabaseConnector.getDBConnector()

    self.readingFact = cjklib.reading.ReadingFactory(dbConnectInst=self.db)

    # Container defaults are deep-copied so that instances never share (and
    # mutate) the objects handed out by getDefaultOptions().
    containerTypes = [type(()), type([]), type({})]
    self.optionValue = {}
    for name, default in self.getDefaultOptions().items():
        if type(default) in containerTypes:
            default = copy.deepcopy(default)
        self.optionValue[name] = default

    def register(candidates, slots, errorSuffix):
        # Stores each operator under its reading name in all given slots.
        for candidate in candidates:
            if not isinstance(candidate, operator.ReadingOperator):
                raise ValueError("unknown type '" + str(type(candidate))
                    + errorSuffix)
            for slot in slots:
                self.optionValue[slot][candidate.READING_NAME] = candidate

    register(args, ['sourceOperators', 'targetOperators'],
        "' given as ReadingOperator")
    if 'sourceOperators' in options:
        register(options['sourceOperators'], ['sourceOperators'],
            "' given as source reading operator")
    if 'targetOperators' in options:
        register(options['targetOperators'], ['targetOperators'],
            "' given as target reading operator")
137
@classmethod
def getDefaultOptions(cls):
    """
    Returns the reading converter's default options.

    A new dict is built on every call. The keyword 'dbConnectInst' is not
    regarded a configuration option of the converter and is thus not
    included.

    @rtype: dict
    @return: the reading converter's default options.
    """
    return dict(sourceOperators={}, targetOperators={})
150
def getOption(self, option):
    """
    Returns the value of the reading converter's option.

    @type option: str
    @param option: name of the option
    @return: the value of the given reading converter's option.
    @raise KeyError: if no option of that name is set.
    """
    value = self.optionValue[option]
    return value
158
def convert(self, string, fromReading, toReading):
    """
    Converts a string in the source reading to the given target reading.

    The string is decomposed by the source reading's operator, the entities
    are passed through L{convertEntities()} and the result is composed
    again by the target reading's operator.

    @type string: str
    @param string: string written in the source reading
    @type fromReading: str
    @param fromReading: name of the source reading
    @type toReading: str
    @param toReading: name of the target reading
    @rtype: str
    @returns: the input string converted to the C{toReading}
    @raise DecompositionError: if the string can not be decomposed into
        basic entities with regards to the source reading or the given
        information is insufficient.
    @raise ConversionError: on operations specific to the conversion between
        the two readings (e.g. error on converting entities).
    @raise UnsupportedError: if source or target reading is not supported
        for conversion.
    """
    sourceOperator = self._getFromOperator(fromReading)
    sourceEntities = sourceOperator.decompose(string)

    targetEntities = self.convertEntities(sourceEntities, fromReading,
        toReading)

    targetOperator = self._getToOperator(toReading)
    return targetOperator.compose(targetEntities)
187
def convertEntities(self, readingEntities, fromReading, toReading):
    """
    Converts a list of entities in the source reading to the given target
    reading.

    This is the abstract hook that L{convert()} relies on; the default
    implementation only raises a NotImplementedError.

    @type readingEntities: list of str
    @param readingEntities: list of entities written in source reading
    @type fromReading: str
    @param fromReading: name of the source reading
    @type toReading: str
    @param toReading: name of the target reading
    @rtype: list of str
    @return: list of entities written in target reading
    @raise ConversionError: on operations specific to the conversion between
        the two readings (e.g. error on converting entities).
    @raise UnsupportedError: if source or target reading is not supported
        for conversion.
    @raise InvalidEntityError: if an invalid entity is given.
    """
    raise NotImplementedError()
210
212 """
213 Gets a reading operator instance for conversion from the given reading.
214
215 @type readingN: str
216 @param readingN: name of reading
217 @rtype: instance
218 @return: a L{ReadingOperator} instance
219 @raise UnsupportedError: if the given reading is not supported.
220 """
221 if readingN not in self.getOption('sourceOperators'):
222 self.optionValue['sourceOperators'][readingN] \
223 = self.readingFact._getReadingOperatorInstance(readingN)
224 return self.getOption('sourceOperators')[readingN]
225
227 """
228 Gets a reading operator instance for conversion to the given reading.
229
230 @type readingN: str
231 @param readingN: name of reading
232 @rtype: instance
233 @return: a L{ReadingOperator} instance
234 @raise UnsupportedError: if the given reading is not supported.
235 """
236 if readingN not in self.getOption('targetOperators'):
237 self.optionValue['targetOperators'][readingN] \
238 = self.readingFact._getReadingOperatorInstance(readingN)
239 return self.getOption('targetOperators')[readingN]
240
243 """
244 Defines an abstract L{ReadingConverter} between two or more I{readings}s for
245 doing entity wise conversion.
246
247 Converters that simply convert one syllable at once can implement this class
248 and merely need to overwrite L{convertBasicEntity()}
249 """
def convertEntities(self, readingEntities, fromReading, toReading):
    """
    Converts a list of entities one by one.

    Every entity the source reading's operator recognises as a reading
    entity is passed to L{convertBasicEntity()}; all other entities are
    copied to the result unchanged.

    @type readingEntities: list of str
    @param readingEntities: list of entities written in source reading
    @type fromReading: str
    @param fromReading: name of the source reading
    @type toReading: str
    @param toReading: name of the target reading
    @rtype: list of str
    @return: list of entities written in target reading
    @raise UnsupportedError: if the conversion direction is not listed in
        C{CONVERSION_DIRECTIONS}.
    """
    if (fromReading, toReading) not in self.CONVERSION_DIRECTIONS:
        raise UnsupportedError("conversion direction from '" \
            + fromReading + "' to '" + toReading + "' not supported")

    def translate(entity):
        # The operator is looked up per entity, as the caller-visible
        # side effects of _getFromOperator() should only occur on demand.
        if self._getFromOperator(fromReading).isReadingEntity(entity):
            return self.convertBasicEntity(entity, fromReading, toReading)
        return entity

    return [translate(entity) for entity in readingEntities]
268
def convertBasicEntity(self, entity, fromReading, toReading):
    """
    Converts a basic entity (e.g. a syllable) in the source reading to the
    given target reading.

    L{convertEntities()} calls this method once per reading entity.
    Subclasses have to override it; the default implementation only raises
    a NotImplementedError.

    @type entity: str
    @param entity: string written in the source reading
    @type fromReading: str
    @param fromReading: name of the source reading
    @type toReading: str
    @param toReading: name of the target reading
    @rtype: str
    @returns: the entity converted to the C{toReading}
    @raise AmbiguousConversionError: if conversion for this entity of the
        source reading is ambiguous.
    @raise ConversionError: on other operations specific to the conversion
        of the entity.
    @raise InvalidEntityError: if the entity is invalid.
    """
    raise NotImplementedError()
294
297 """
298 Defines an abstract L{ReadingConverter} between two or more
299 I{romanisation}s.
300
301 Reading dialects can produce different entities which have to be handled by
302 the conversion process. This is realised by converting the given reading
303 dialect to a default form, then converting to the default target reading and
304 finally converting to the specified target reading dialect. On conversion
305 step thus involves three single conversion steps using a default form. This
306 default form can be defined in L{DEFAULT_READING_OPTIONS}.
307
308 Upper or lower case will be transfered between syllables, no special
309 formatting according to anyhow defined standards will be guaranteed.
310 Upper/lower case will be identified according to three classes: either the
311 whole syllable is upper case, only the initial letter is upper case or
312 otherwise the whole syllable is assumed being lower case.
313
314 The class itself can't be used directly, it has to be subclassed and
315 L{convertBasicEntity()} has to be implemented, as to make the translation of
316 a syllable from one romanisation to another possible.
317 """
318 DEFAULT_READING_OPTIONS = {}
319 """
320 Defines default reading options for the reading used to convert from (to
321 resp.) before (after resp.) converting to (from resp.) the user specified
322 dialect.
323
324 The most general reading dialect should be specified as to allow for a broad
325 range of input.
326 """
327
def convertEntities(self, readingEntities, fromReading, toReading):
    """
    Converts a list of entities in the source reading to the given target
    reading.

    The conversion runs in up to three steps: the input is brought into the
    default dialect of the source reading (if a dialect converter for it is
    supported), each reading entity is converted in lower case through
    L{convertBasicEntity()}, and the result is finally brought from the
    default dialect of the target reading into the dialect of the target
    operator (again only if supported).

    Upper case of the first character or the whole characters of one entity
    (e.g. syllable) is respected. Entities like C{"HaO"} will degenerate to
    C{"Hao"} though.

    @type readingEntities: list of str
    @param readingEntities: list of entities written in source reading
    @type fromReading: str
    @param fromReading: name of the source reading
    @type toReading: str
    @param toReading: name of the target reading
    @rtype: list of str
    @return: list of entities written in target reading
    @raise AmbiguousConversionError: if conversion for a specific entity of
        the source reading is ambiguous.
    @raise ConversionError: on other operations specific to the conversion
        between the two readings (e.g. error on converting entities).
    @raise UnsupportedError: if source or target reading is not supported
        for conversion.
    @raise InvalidEntityError: if an invalid entity is given.
    """
    if (fromReading, toReading) not in self.CONVERSION_DIRECTIONS:
        raise UnsupportedError("conversion direction from '" \
            + fromReading + "' to '" + toReading + "' not supported")

    # Step 1: normalise the source dialect to the default form.
    fromDefaultOptions = self.DEFAULT_READING_OPTIONS.get(fromReading, {})
    if self.readingFact.isReadingConversionSupported(fromReading,
        fromReading):
        readingEntities = self.readingFact.convertEntities(
            readingEntities, fromReading, fromReading,
            sourceOperators=[self._getFromOperator(fromReading)],
            targetOptions=fromDefaultOptions)

    # Step 2: convert entity by entity, carrying the letter case over.
    toReadingEntities = []
    for entity in readingEntities:
        if not self.readingFact.isReadingEntity(entity, fromReading,
            **fromDefaultOptions):
            toReadingEntities.append(entity)
            continue

        toReadingEntity = self.convertBasicEntity(entity.lower(),
            fromReading, toReading)

        # The case setting has to be read from the operator's options;
        # comparing the operator object itself to 'upper' never matches.
        targetCase = self._getToOperator(toReading).getOption('case')
        if targetCase == 'both':
            if entity.isupper():
                toReadingEntity = toReadingEntity.upper()
            elif entity.istitle():
                toReadingEntity = toReadingEntity.capitalize()
        elif targetCase == 'upper':
            toReadingEntity = toReadingEntity.upper()

        toReadingEntities.append(toReadingEntity)

    # Step 3: bring the default form into the requested target dialect.
    toDefaultOptions = self.DEFAULT_READING_OPTIONS.get(toReading, {})
    if self.readingFact.isReadingConversionSupported(toReading, toReading):
        toReadingEntities = self.readingFact.convertEntities(
            toReadingEntities, toReading, toReading,
            sourceOptions=toDefaultOptions,
            targetOperators=[self._getToOperator(toReading)])

    return toReadingEntities
410
def convertBasicEntity(self, entity, fromReading, toReading):
    """
    Converts a basic entity (e.g. a syllable) in the source reading to the
    given target reading.

    L{convertEntities()} calls this method with a lower case entity and
    takes care of capitalisation itself, so the returned value should be in
    lower case characters too.

    If a single entity needs to be converted it is recommended to use
    L{convertEntities()} instead. In the general case it can not be ensured
    that a mapping from one reading to another can be done by the simple
    conversion of a basic entity. One-to-many mappings are possible and
    there is no guarantee that any entity of a reading recognised by
    L{operator.ReadingOperator.isReadingEntity()} will be mapped here.

    Subclasses have to override this method; the default implementation
    only raises a NotImplementedError.

    @type entity: str
    @param entity: string written in the source reading in lower case
        letters
    @type fromReading: str
    @param fromReading: name of the source reading
    @type toReading: str
    @param toReading: name of the target reading
    @rtype: str
    @returns: the entity converted to the C{toReading} in lower case
    @raise AmbiguousConversionError: if conversion for this entity of the
        source reading is ambiguous.
    @raise ConversionError: on other operations specific to the conversion
        of the entity.
    @raise InvalidEntityError: if the entity is invalid.
    """
    raise NotImplementedError()
446
449 u"""
450 Provides a converter for different representations of the Chinese
451 romanisation I{Hanyu Pinyin}.
452
453 Examples
454 ========
455 The following examples show how to convert between different representations
456 of Pinyin.
457 - Create the Converter and convert from standard Pinyin to Pinyin with
458 tones represented by numbers:
459
460 >>> from cjklib.reading import *
461 >>> targetOp = operator.PinyinOperator(toneMarkType='Numbers')
462 >>> pinyinConv = converter.PinyinDialectConverter(
463 ... targetOperators=[targetOp])
464 >>> pinyinConv.convert(u'hànzì', 'Pinyin', 'Pinyin')
465 u'han4zi4'
466
467 - Convert Pinyin written with numbers, the ü (u with umlaut) replaced
468 by character v and omitted fifth tone to standard Pinyin:
469
470 >>> sourceOp = operator.PinyinOperator(toneMarkType='Numbers',
471 ... yVowel='v', missingToneMark='fifth')
472 >>> pinyinConv = converter.PinyinDialectConverter(
473 ... sourceOperators=[sourceOp])
474 >>> pinyinConv.convert('nv3hai2zi', 'Pinyin', 'Pinyin')
475 u'n\u01dah\xe1izi'
476
477 - Or more elegantly:
478
479 >>> f = ReadingFactory()
480 >>> f.convert('nv3hai2zi', 'Pinyin', 'Pinyin',
481 ... sourceOptions={'toneMarkType': 'Numbers', 'yVowel': 'v',
482 ... 'missingToneMark': 'fifth'})
483 u'n\u01dah\xe1izi'
484
485 - Decompose the reading of a dictionary entry from CEDICT into syllables
486 and convert the ü-vowel and forms of I{Erhua sound}:
487
488 >>> pinyinFrom = operator.PinyinOperator(toneMarkType='Numbers',
489 ... yVowel='u:', Erhua='oneSyllable')
490 >>> syllables = pinyinFrom.decompose('sun1nu:r3')
491 >>> print syllables
492 ['sun1', 'nu:r3']
493 >>> pinyinTo = operator.PinyinOperator(toneMarkType='Numbers',
494 ... Erhua='twoSyllables')
495 >>> pinyinConv = converter.PinyinDialectConverter(
496 ... sourceOperators=[pinyinFrom], targetOperators=[pinyinTo])
497 >>> pinyinConv.convertEntities(syllables, 'Pinyin', 'Pinyin')
498 [u'sun1', u'n\xfc3', u'r5']
499
500 - Or more elegantly with entities already decomposed:
501
502 >>> f.convertEntities(['sun1', 'nu:r3'], 'Pinyin', 'Pinyin',
503 ... sourceOptions={'toneMarkType': 'Numbers', 'yVowel': 'u:',
504 ... 'Erhua': 'oneSyllable'},
505 ... targetOptions={'toneMarkType': 'Numbers',
506 ... 'Erhua': 'twoSyllables'})
507 [u'sun1', u'n\xfc3', u'r5']
508 """
509 CONVERSION_DIRECTIONS = [('Pinyin', 'Pinyin')]
510
def __init__(self, *args, **options):
    u"""
    Creates an instance of the PinyinDialectConverter.

    Besides storing the options, the constructor caches the ü-vowel
    representation of the source and target operator and selects the
    function used to convert I{Erhua} forms.

    @param args: optional list of L{RomanisationOperator}s to use for
        handling source and target readings.
    @param options: extra options
    @keyword dbConnectInst: instance of a L{DatabaseConnector}, if none is
        given, default settings will be assumed.
    @keyword sourceOperators: list of L{ReadingOperator}s used for handling
        source readings.
    @keyword targetOperators: list of L{ReadingOperator}s used for handling
        target readings.
    @keyword keepPinyinApostrophes: if set to C{True} apostrophes separating
        two syllables in Pinyin will be kept even if not necessary.
        Apostrophes missing according to the given rule will be added
        though.
    @keyword breakUpErhua: if set to C{'on'} I{Erhua} forms will be
        converted to single syllables with a full I{er} syllable regardless
        of the Erhua form setting of the target reading, e.g. I{zher} will
        be converted to I{zhe}, I{er}, if set to C{'auto'} Erhua forms are
        converted if the given target reading operator doesn't support
        Erhua forms, if set to C{'off'} Erhua forms will always be
        conserved.
    @raise ValueError: if an invalid value is given for C{breakUpErhua}.
    """
    super(PinyinDialectConverter, self).__init__(*args, **options)

    if 'keepPinyinApostrophes' in options:
        keepApostrophes = options['keepPinyinApostrophes']
        self.optionValue['keepPinyinApostrophes'] = keepApostrophes

    if 'breakUpErhua' in options:
        requested = options['breakUpErhua']
        if requested not in ['on', 'auto', 'off']:
            raise ValueError("Invalid option '" \
                + str(requested) \
                + "' for keyword 'breakUpErhua'")
        self.optionValue['breakUpErhua'] = requested

    # Cache the ü-vowel representation of both sides.
    sourceOp = self._getFromOperator('Pinyin')
    self.fromYVowel = u'ü'
    sourceYVowel = sourceOp.getOption('yVowel')
    if sourceYVowel != u'ü':
        self.fromYVowel = sourceYVowel

    targetOp = self._getToOperator('Pinyin')
    self.toYVowel = u'ü'
    targetYVowel = targetOp.getOption('yVowel')
    if targetYVowel != u'ü':
        self.toYVowel = targetYVowel

    # Select the Erhua handling; the conditions are evaluated lazily in the
    # order given.
    breakUp = self.getOption('breakUpErhua')
    if breakUp == 'on' \
        or (breakUp == 'auto'
            and targetOp.getOption('Erhua') == 'ignore') \
        or (targetOp.getOption('Erhua') == 'twoSyllables'
            and sourceOp.getOption('Erhua') == 'oneSyllable'):
        # split r-coloured syllables into syllable plus separate 'r'
        self.convertErhuaFunc = self.convertToTwoSyllablesErhua
    elif targetOp.getOption('Erhua') == 'oneSyllable' \
        and sourceOp.getOption('Erhua') != 'oneSyllable':
        # merge a separate 'r' into the preceding syllable
        self.convertErhuaFunc = self.convertToSingleSyllableErhua
    elif sourceOp.getOption('Erhua') != 'ignore' \
        and targetOp.getOption('Erhua') == 'ignore':
        # target can't express Erhua forms, so only check for them
        self.convertErhuaFunc = self._checkForErhua
    else:
        # nothing to convert
        self.convertErhuaFunc = lambda x: x
583
584 @classmethod
590
def convertEntities(self, readingEntities, fromReading='Pinyin',
    toReading='Pinyin'):
    """
    Converts a list of entities in the source reading to the given target
    reading.

    Syllables are split into plain syllable and tone with the source
    operator, passed through the Erhua conversion selected on construction,
    adapted to the target's ü-vowel and case settings and rebuilt with the
    target operator. Syllable separating apostrophes are translated from
    the source operator's form to the target operator's form.

    @type readingEntities: list of str
    @param readingEntities: list of entities written in source reading
    @type fromReading: str
    @param fromReading: name of the source reading
    @type toReading: str
    @param toReading: name of the target reading
    @rtype: list of str
    @return: list of entities written in target reading
    @raise AmbiguousConversionError: if conversion for a specific entity of
        the source reading is ambiguous.
    @raise ConversionError: on other operations specific to the conversion
        between the two readings (e.g. error on converting entities).
    @raise UnsupportedError: if source or target reading is not supported
        for conversion.
    @raise InvalidEntityError: if an invalid entity is given.
    """
    if (fromReading, toReading) not in self.CONVERSION_DIRECTIONS:
        raise UnsupportedError("conversion direction from '" \
            + fromReading + "' to '" + toReading + "' not supported")

    sourceOp = self._getFromOperator(fromReading)
    targetOp = self._getToOperator(toReading)

    # Unless asked to keep them, drop the apostrophes of the input.
    if not self.getOption('keepPinyinApostrophes'):
        readingEntities = sourceOp.removeApostrophes(readingEntities)

    # Syllables become (plain syllable, tone) tuples, everything else stays
    # a plain string; later steps tell the two apart by type.
    entityTuples = []
    for entity in readingEntities:
        if sourceOp.isReadingEntity(entity):
            plainSyllable, tone = sourceOp.splitEntityTone(entity)
            entityTuples.append((plainSyllable, tone))
        else:
            entityTuples.append(entity)

    entityTuples = self.convertErhuaFunc(entityTuples)

    targetTones = targetOp.getTones()
    targetCase = targetOp.getOption('case')
    # The apostrophe to look for is the one of the SOURCE reading.
    sourceApostrophe = sourceOp.getOption('PinyinApostrophe')
    breakUpErhua = self.getOption('breakUpErhua')
    expandErhuaR = breakUpErhua == 'on' \
        or (breakUpErhua == 'auto'
            and targetOp.getOption('Erhua') == 'ignore')

    toReadingEntities = []
    for entry in entityTuples:
        if isinstance(entry, tuple):
            plainSyllable, tone = entry

            if tone not in targetTones:
                # e.g. a missing tone the target can't express
                raise AmbiguousConversionError("Target reading does not " \
                    "support missing tone information")

            # a separate 'r' syllable is written out as full 'er'
            if expandErhuaR and plainSyllable.lower() == 'r':
                if plainSyllable.isupper():
                    plainSyllable = 'ER'
                else:
                    plainSyllable = 'er'

            if self.fromYVowel != self.toYVowel:
                plainSyllable = plainSyllable.replace(self.fromYVowel,
                    self.toYVowel)

            if targetCase == 'lower':
                plainSyllable = plainSyllable.lower()
            elif targetCase == 'upper':
                plainSyllable = plainSyllable.upper()

            try:
                toReadingEntities.append(
                    targetOp.getTonalEntity(plainSyllable, tone))
            except InvalidEntityError as e:
                # the target operator rejected the syllable
                raise ConversionError(e)
        elif entry == sourceApostrophe:
            toReadingEntities.append(
                targetOp.getOption('PinyinApostrophe'))
        else:
            toReadingEntities.append(entry)

    return toReadingEntities
691
def convertToSingleSyllableErhua(self, entityTuples):
    """
    Converts the various I{Erhua} forms in a list of reading entities to
    a representation with one syllable, e.g. C{['tou2', 'r5']} to
    C{['tour2']}.

    A syllable I{r} is merged into the directly preceding syllable, which
    keeps its own tone. Nothing is merged across non-syllable entries, and
    syllables such as I{e}, I{er} or I{n} never take the I{r}.

    @type entityTuples: list of tuple/str
    @param entityTuples: list of tuples with plain syllable and tone
    @rtype: list of tuple/str
    @return: list of tuples with plain syllable and tone
    """
    # syllables an 'r' is never appended to
    nonMergeable = ['e', 'er', 'r', 'n', 'ng', 'hng', 'hm', 'm', u'ê']

    convertedTuples = []
    # entry held back until it is known whether an 'r' follows
    pending = None
    for entry in entityTuples:
        if isinstance(pending, tuple) and isinstance(entry, tuple):
            lastPlainSyllable, lastTone = pending
            plainSyllable, _ = entry
            if plainSyllable.lower() == 'r' \
                and lastPlainSyllable.lower() not in nonMergeable:
                convertedTuples.append((lastPlainSyllable + plainSyllable,
                    lastTone))
                # the merged syllable can't take a further 'r'
                pending = None
                continue

        if pending is not None:
            convertedTuples.append(pending)
        pending = entry

    if pending is not None:
        convertedTuples.append(pending)

    return convertedTuples
727
def convertToTwoSyllablesErhua(self, entityTuples):
    """
    Converts the various I{Erhua} forms in a list of reading entities to
    a representation with two syllable, e.g. C{['tour2']} to
    C{['tou2', 'r5']}.

    The split-off I{r} keeps the letter case of the original and is always
    given tone C{5}. The syllables I{er} and I{r} are left untouched.

    @type entityTuples: list of tuple/str
    @param entityTuples: list of tuples with plain syllable and tone
    @rtype: list of tuple/str
    @return: list of tuples with plain syllable and tone
    """
    convertedTuples = []
    for entry in entityTuples:
        if isinstance(entry, tuple):
            plainSyllable, tone = entry
            if plainSyllable[-1:].lower() == 'r' \
                and plainSyllable.lower() not in ['er', 'r']:
                convertedTuples.extend([(plainSyllable[:-1], tone),
                    (plainSyllable[-1:], 5)])
                continue

        convertedTuples.append(entry)

    return convertedTuples
755
757 """
758 Checks the given entities for Erhua forms and raises a ConversionError.
759
760 @type entityTuples: list of tuple/str
761 @param entityTuples: list of tuples with plain syllable and tone
762 @rtype: list of tuple/str
763 @return: list of tuples with plain syllable and tone
764 @raise ConversionError: when an Erhua form is found
765 """
766 for entry in entityTuples:
767 if type(entry) == type(()):
768 plainSyllable, _ = entry
769
770 if plainSyllable.endswith('r') and plainSyllable != 'er':
771 raise ConversionError(
772 "Cannot convert Erhua form in syllable '" \
773 + plainSyllable + "'")
774
775 return entityTuples
776
779 u"""
780 Provides a converter for different representations of the Mandarin Chinese
781 romanisation I{Wade-Giles}.
782
783 The converter has very limited possibilities for conversion at this time,
784 much more different forms of Wade-Giles are possible and should be
785 implemented.
786 """
787 CONVERSION_DIRECTIONS = [('WadeGiles', 'WadeGiles')]
788
817
820 """
821 Provides a converter between the Chinese romanisation I{Hanyu Pinyin} and
822 I{Wade-Giles}.
823
824 Currently only a non standard subset of Wade-Giles is implemented. As many
825 different interpretations exist providing a complete coverage seems hardly
826 achievable. An important step is support for the revised system by Giles as
827 found in his I{Chinese-English Dictionary} (as of 1912). A further target is
828 to at least implement means to support concrete shapes found in the usage of
829 big bodies e.g. libraries.
830
831 Upper or lower case will be transfered between syllables, no special
832 formatting according to the standards (i.e. Pinyin) will be made. Upper/
833 lower case will be identified according to three classes: either the whole
834 syllable is upper case, only the initial letter is upper case or otherwise
835 the whole syllable is assumed being lower case.
836
837 Conversion cannot in general be done in a one-to-one manner. Standard Pinyin
838 has no notion to explicitly specify missing tonal information while this is
839 in general given in Wade-Giles by just omitting the tone digits. This
840 implementation furthermore doesn't support explicit depiction of I{Erhua} in
841 the Wade-Giles romanisation system thus failing when r-colourised syllables
842 are found.
843
844 @todo Lang: Increase support for different I{reading dialects} of the
845 Wade-Giles romanisation system. Includes support in
846 L{WadeGilesOperator}. Get proper sources on the syllables and
847 mappings. Use well-known instances.
848 @warning: This module isn't backed-up by any sources yet and doesn't
849 guarantee a syllable mapping free of errors.
850 """
851 CONVERSION_DIRECTIONS = [('Pinyin', 'WadeGiles'), ('WadeGiles', 'Pinyin')]
852
853
854 DEFAULT_READING_OPTIONS = {'Pinyin': {'Erhua': 'ignore',
855 'toneMarkType': 'Numbers'}, 'WadeGiles': {}}
856
866
868
def convertBasicEntity(self, entity, fromReading, toReading):
    """
    Converts a single syllable between I{Pinyin} and I{Wade-Giles}.

    The tone is split off using the default reading options of the source
    reading, the plain syllable is looked up in the database table
    C{WadeGilesPinyinMapping} and the tone is applied again using the
    default reading options of the target reading.

    @type entity: str
    @param entity: string written in the source reading in lower case
        letters
    @type fromReading: str
    @param fromReading: name of the source reading
    @type toReading: str
    @param toReading: name of the target reading
    @rtype: str
    @returns: the entity converted to the C{toReading} in lower case
    @raise ConversionError: if no mapping exists for the syllable or the
        target reading rejects the converted syllable.
    @raise UnsupportedError: if the source reading is neither C{'Pinyin'}
        nor C{'WadeGiles'}.
    """
    plainSyllable, tone = self.readingFact.splitEntityTone(entity,
        fromReading, **self.DEFAULT_READING_OPTIONS[fromReading])

    table = self.db.tables['WadeGilesPinyinMapping']
    if fromReading == "WadeGiles":
        query = select([table.c.Pinyin],
            table.c.WadeGiles == plainSyllable)
    elif fromReading == "Pinyin":
        # NOTE(review): PinyinIdx == 0 presumably picks the preferred of
        # several Wade-Giles forms for one Pinyin syllable -- confirm
        # against the table definition.
        query = select([table.c.WadeGiles],
            and_(table.c.Pinyin == plainSyllable,
                table.c.PinyinIdx == 0))
    else:
        # Without this branch an unhandled reading would surface as an
        # unbound local variable further down.
        raise UnsupportedError("conversion from reading '" + fromReading \
            + "' not supported")

    transSyllable = self.db.selectScalar(query)
    if not transSyllable:
        raise ConversionError("conversion for entity '" + plainSyllable \
            + "' not supported")

    try:
        return self.readingFact.getTonalEntity(transSyllable, tone,
            toReading, **self.DEFAULT_READING_OPTIONS[toReading])
    except InvalidEntityError as e:
        # the target reading rejected the syllable/tone combination
        raise ConversionError(e)
895
898 u"""
899 Provides a converter for different representations of the Chinese
900 romanisation I{Gwoyeu Romatzyh}.
901 """
902 CONVERSION_DIRECTIONS = [('GR', 'GR')]
903
def __init__(self, *args, **options):
    u"""
    Creates an instance of the GRDialectConverter.

    @param args: optional list of L{RomanisationOperator}s to use for
        handling source and target readings.
    @param options: extra options
    @keyword dbConnectInst: instance of a L{DatabaseConnector}, if none is
        given, default settings will be assumed.
    @keyword sourceOperators: list of L{ReadingOperator}s used for handling
        source readings.
    @keyword targetOperators: list of L{ReadingOperator}s used for handling
        target readings.
    @keyword keepGRApostrophes: if set to C{True} apostrophes separating
        two syllables in Gwoyeu Romatzyh will be kept even if not necessary.
        Apostrophes missing before 0-initials will be added though.
    """
    super(GRDialectConverter, self).__init__(*args, **options)

    # Only an explicitly given value replaces the default.
    try:
        keepApostrophes = options['keepGRApostrophes']
    except KeyError:
        return
    self.optionValue['keepGRApostrophes'] = keepApostrophes
926
927 @classmethod
933
def convertEntities(self, readingEntities, fromReading='GR',
    toReading='GR'):
    """
    Converts a list of entities between two dialects of I{Gwoyeu Romatzyh}.

    The following steps are applied in order: syllable separating
    apostrophes are either translated to the target operator's form (option
    C{keepGRApostrophes}) or removed, the letter case is adapted if the
    target operator only supports one case, the apostrophe marking
    rhotacised finals is translated, and abbreviated entities are converted
    if the target operator doesn't support abbreviations.

    The given list is not modified.

    @type readingEntities: list of str
    @param readingEntities: list of entities written in source reading
    @type fromReading: str
    @param fromReading: name of the source reading
    @type toReading: str
    @param toReading: name of the target reading
    @rtype: list of str
    @return: list of entities written in target reading
    @raise UnsupportedError: if source or target reading is not supported
        for conversion.
    """
    if (fromReading, toReading) not in self.CONVERSION_DIRECTIONS:
        raise UnsupportedError("conversion direction from '" \
            + fromReading + "' to '" + toReading + "' not supported")

    sourceOp = self._getFromOperator(fromReading)
    targetOp = self._getToOperator(toReading)

    if self.getOption('keepGRApostrophes'):
        # translate the syllable separator to the target's form
        fromApostrophe = sourceOp.getOption('GRSyllableSeparatorApostrophe')
        toApostrophe = targetOp.getOption('GRSyllableSeparatorApostrophe')
        if fromApostrophe != toApostrophe:
            convertedEntities = []
            for entity in readingEntities:
                if entity == fromApostrophe:
                    convertedEntities.append(toApostrophe)
                else:
                    convertedEntities.append(entity)
            # without this assignment the translation would be discarded
            readingEntities = convertedEntities
    else:
        readingEntities = sourceOp.removeApostrophes(readingEntities)

    # adapt the letter case if the target supports only one
    targetCase = targetOp.getOption('case')
    if targetCase == 'lower':
        readingEntities = [entity.lower() for entity in readingEntities]
    elif targetCase == 'upper':
        readingEntities = [entity.upper() for entity in readingEntities]

    # translate the apostrophe marking rhotacised finals
    fromApostrophe = sourceOp.getOption('GRRhotacisedFinalApostrophe')
    toApostrophe = targetOp.getOption('GRRhotacisedFinalApostrophe')
    if fromApostrophe != toApostrophe:
        readingEntities = [entity.replace(fromApostrophe, toApostrophe) \
            for entity in readingEntities]

    # write out abbreviated forms the target doesn't support
    if not targetOp.getOption('abbreviations'):
        readingEntities = [targetOp.convertAbbreviatedEntity(entity) \
            for entity in readingEntities]

    return readingEntities
982
985 """
986 Provides a converter between the Chinese romanisation I{Gwoyeu Romatzyh} and
987 I{Hanyu Pinyin}.
988
989 Features:
990 - configurable mapping of the optional neutral tone when converting from GR,
991 - conversion of abbreviated forms of GR.
992
993 Upper or lower case will be transferred between syllables, no special
994 formatting according to the standards (i.e. Pinyin) will be made. Upper/
995 lower case will be identified according to three classes: either the whole
996 syllable is upper case, only the initial letter is upper case or otherwise
997 the whole syllable is assumed being lower case.
998
999 Limitations
1000 ===========
1001 Conversion cannot in general be done in a one-to-one manner.
1002 I{Gwoyeu Romatzyh} (GR) gives the etymological tone for a syllable in
1003 neutral tone while Pinyin doesn't. In contrast to tones in GR carrying more
1004 information I{r-coloured} syllables (I{Erlhuah}) are rendered the way they
1005 are pronounced, thus losing the original syllable. Converting those forms to
1006 Pinyin in a general manner is not possible while yielding the original
1007 string in Chinese characters might help to disambiguate. Another issue
1008 tone-wise is that Pinyin allows to specify the changed tone when dealing
1009 with tone sandhis instead of the etymological one while GR doesn't. Only
1010 working with the Chinese character string might help to restore the original
1011 tone.
1012
1013 Conversion from Pinyin is crippled as the neutral tone in this form cannot
1014 be transferred to GR as described above. More information is needed to
1015 resolve this. For the other direction the neutral tone can be mapped but the
1016 etymological tone information is lost. For the optional neutral tone either
1017 a mapping is done to the neutral tone in Pinyin or to the original
1018 (etymological).
1019 """
1020 CONVERSION_DIRECTIONS = [('GR', 'Pinyin'), ('Pinyin', 'GR')]
1021
1022
1023 DEFAULT_READING_OPTIONS = {'Pinyin': {'Erhua': 'oneSyllable'},
1024 'GR': {'abbreviations': False}}
1025
1027 """
1028 Creates an instance of the GRPinyinConverter.
1029
1030 @param args: optional list of L{RomanisationOperator}s to use for
1031 handling source and target readings.
1032 @param options: extra options
1033 @keyword dbConnectInst: instance of a L{DatabaseConnector}, if none is
1034 given, default settings will be assumed.
1035 @keyword sourceOperators: list of L{ReadingOperator}s used for handling
1036 source readings.
1037 @keyword targetOperators: list of L{ReadingOperator}s used for handling
1038 target readings.
1039 @keyword GROptionalNeutralToneMapping: if set to 'original' GR syllables
1040 marked with an optional neutral tone will be mapped to the
1041 etymological tone, if set to 'neutral' they will be mapped to the
1042 neutral tone in Pinyin.
1043 """
1044 super(GRPinyinConverter, self).__init__(*args, **options)
1045
1046 if 'GROptionalNeutralToneMapping' in options:
1047 if options['GROptionalNeutralToneMapping'] not in ['original',
1048 'neutral']:
1049 raise ValueError("Invalid option '" \
1050 + str(options['GROptionalNeutralToneMapping']) \
1051 + "' for keyword 'GROptionalNeutralToneMapping'")
1052 self.optionValue['GROptionalNeutralToneMapping'] \
1053 = options['GROptionalNeutralToneMapping']
1054
1055
1056 self.grToneMapping = dict([(tone, int(tone[0])) \
1057 for tone in operator.GROperator.TONES])
1058
1059 if self.getOption('GROptionalNeutralToneMapping') == 'neutral':
1060 for tone in ['1stToneOptional5th', '2ndToneOptional5th',
1061 '3rdToneOptional5th', '4thToneOptional5th']:
1062 self.grToneMapping[tone] = 5
1063
1064
1065 self.pyToneMapping = {1: '1stTone', 2: '2ndTone', 3: '3rdTone',
1066 4: '4thTone', 5: None}
1067
1068
1069 self.grOperator = None
1070
1071 @classmethod
1077
1079
1080 if fromReading == "GR" and entity.endswith('l') \
1081 and entity not in ['el', 'erl', 'eel', 'ell']:
1082 raise AmbiguousConversionError("conversion for entity '" + entity \
1083 + "' is ambiguous")
1084
1085
1086 plainSyllable, tone = self.readingFact.splitEntityTone(entity,
1087 fromReading, **self.DEFAULT_READING_OPTIONS[fromReading])
1088
1089
1090 if fromReading == "GR":
1091 table = self.db.tables['PinyinGRMapping']
1092 transSyllable = self.db.selectScalar(select([table.c.Pinyin],
1093 table.c.GR == plainSyllable))
1094 transTone = self.grToneMapping[tone]
1095
1096 elif fromReading == "Pinyin":
1097
1098 if plainSyllable != 'er' and plainSyllable.endswith('r'):
1099 erlhuahForm = True
1100 plainSyllable = plainSyllable[:-1]
1101 else:
1102 erlhuahForm = False
1103
1104 table = self.db.tables['PinyinGRMapping']
1105 transSyllable = self.db.selectScalar(select([table.c.GR],
1106 table.c.Pinyin == plainSyllable))
1107 if self.pyToneMapping[tone]:
1108 transTone = self.pyToneMapping[tone]
1109 else:
1110 raise AmbiguousConversionError("conversion for entity '" \
1111 + plainSyllable + "' with tone '" + str(tone) \
1112 + "' is ambiguous")
1113
1114 if not transSyllable:
1115 raise ConversionError("conversion for entity '" + plainSyllable \
1116 + "' not supported")
1117
1118 try:
1119 if toReading == 'GR' and erlhuahForm:
1120 try:
1121
1122 return self._getGROperator().getRhotacisedTonalEntity(
1123 transSyllable, transTone)
1124 except UnsupportedError, e:
1125
1126
1127 raise ConversionError(e)
1128 else:
1129 return self.readingFact.getTonalEntity(transSyllable, transTone,
1130 toReading, **self.DEFAULT_READING_OPTIONS[toReading])
1131 except InvalidEntityError, e:
1132
1133
1134 raise ConversionError(e)
1135
1137 """Creates an instance of a GROperator if needed and returns it."""
1138 if self.grOperator == None:
1139 self.grOperator = operator.GROperator(
1140 **self.DEFAULT_READING_OPTIONS['GR'])
1141 return self.grOperator
1142
1145 u"""
1146 Provides a converter between the Mandarin Chinese romanisation
1147 I{Hanyu Pinyin} and the I{International Phonetic Alphabet} (I{IPA}) for
1148 Standard Mandarin. This converter provides only basic support for tones and
1149 the user needs to specify additional means when handling tone sandhi
1150 occurrences.
1151
1152 The standard conversion table is based on the source mentioned below.
1153 Though depiction in IPA depends on many factors and therefore might highly
1154 vary it seems this source is not error-free: final I{-üan} written [yan]
1155 should be similar to I{-ian} [iɛn] and I{-iong} written [yŋ] should be
1156 similar to I{-ong} [uŋ].
1157
1158 As IPA allows for a big range of different representations for the sounds
1159 in a varying degree no conversion to Pinyin is offered.
1160
1161 Currently conversion of I{Erhua sound} is not supported.
1162
1163 Features:
1164 - Default tone sandhi handling for lower third tone and neutral tone,
1165 - extensibility of tone sandhi handling,
1166 - extensibility for general coarticulation effects.
1167
1168 Limitations:
1169 - Tone sandhi needs special treatment depending on the user's needs,
1170 - transcription of onomatopoeic words will be limited to the general
1171 syllable scheme,
1172 - limited linking between syllables (e.g. for 啊、呕) will not be
1173 considered and
1174 - stress, intonation and accented speech are not covered.
1175
1176 Tone sandhi
1177 ===========
1178 Speech in tonal languages is generally subject to X{tone sandhi}. For
1179 example in Mandarin I{bu4 cuo4} for 不错 will render to I{bu2 cuo4}, or
1180 I{lao3shi1} (老师) with a tone contour of 214 for I{lao3} and 55 for I{shi1}
1181 will render to a contour 21 for I{lao3}.
1182
1183 When translating to IPA the system has to deal with these tone sandhis and
1184 therefore provides an option C{'sandhiFunction'} that can be set to the user
1185 specified handler. PinyinIPAConverter will only provide a very basic handler
1186 L{lowThirdAndNeutralToneRule()} which will apply the contour 21 for the
1187 third tone when several syllables occur and needs the user to supply proper
1188 tone information, e.g. I{ke2yi3} (可以) instead of the normal rendering as
1189 I{ke3yi3} to indicate the tone sandhi for the first syllable.
1190
1191 Further support will be provided for varying stress on syllables in the
1192 neutral tone. Following a first tone the weak syllable will have a half-low
1193 pitch, following a second tone a middle, following a third tone a half-high
1194 and following a fourth tone a low pitch.
1195
1196 There are further occurrences of tone sandhis:
1197 - pronunciations of 一 and 不 vary in different tones depending on their
1198 context,
1199 - directional complements like 拿出来 I{ná chu lai} under some
1200 circumstances lose their tone,
1201 - in a three syllable group ABC the second syllable B changes from
1202 second tone to first tone when A is in the first or second tone and
1203 C is not in the neutral tone.
1204
1205 Coarticulation
1206 ==============
1207 In most cases conversion from Pinyin to IPA is straightforward if one does
1208 not take tone sandhi into account. There are cases though (when leaving
1209 aside tones), where phonetic realisation of a syllable depends on its
1210 context. The converter allows for handling coarticulation effects by
1211 adding a hook C{coarticulationFunction} to which a user-implemented
1212 function can be given. An example implementation is given with
1213 L{finalECoarticulation()}.
1214
1215 Source
1216 ======
1217 - Hànyǔ Pǔtōnghuà Yǔyīn Biànzhèng (汉语普通话语音辨正). Page 15, Běijīng Yǔyán
1218 Dàxué Chūbǎnshè (北京语言大学出版社), 2003, ISBN 7-5619-0622-6.
1219 - San Duanmu: The Phonology of Standard Chinese. Second edition, Oxford
1220 University Press, 2007, ISBN 978-0-19-921578-2, ISBN 978-0-19-921579-9.
1221 - Yuen Ren Chao: A Grammar of Spoken Chinese. University of California
1222 Press, Berkeley, 1968, ISBN 0-520-00219-9.
1223
1224 @see:
1225 - Mandarin tone sandhi:
1226 U{http://web.mit.edu/jinzhang/www/pinyin/tones/index.html}
1227 - IPA: U{http://en.wikipedia.org/wiki/International_Phonetic_Alphabet}
1228 - The Phonology of Standard Chinese. First edition, 2000:
1229 U{http://books.google.de/books?id=tG0-Ad9CrBcC}
1230
1231 @todo Impl: Two different methods for tone sandhi and coarticulation
1232 effects?
1233 @todo Lang: Support for I{Erhua} in mapping.
1234 """
1235 CONVERSION_DIRECTIONS = [('Pinyin', 'MandarinIPA')]
1236
1237 PINYIN_OPTIONS = {'Erhua': 'ignore', 'toneMarkType': 'Numbers',
1238 'missingToneMark': 'noinfo', 'case': 'lower'}
1239 """Options for the PinyinOperator."""
1240
1241 TONEMARK_MAPPING = {1: '1stTone', 2: '2ndTone', 3: '3rdToneRegular',
1242 4: '4thTone', 5: '5thTone'}
1243
1244 NEUTRAL_TONE_MAPPING = {'1stTone': '5thToneHalfLow',
1245 '2ndTone': '5thToneMiddle', '3rdToneRegular': '5thToneHalfHigh',
1246 '3rdToneLow': '5thToneHalfHigh', '4thTone': '5thToneLow',
1247 '5thTone': '5thTone', '5thToneHalfHigh': '5thToneHalfHigh',
1248 '5thToneMiddle': '5thToneMiddle', '5thToneHalfLow':'5thToneHalfLow',
1249 '5thToneLow': '5thToneLow'}
1250 """Mapping of neutral tone following another tone."""
1251
1253 """
1254 Creates an instance of the PinyinIPAConverter.
1255
1256 @param args: optional list of L{RomanisationOperator}s to use for
1257 handling source and target readings.
1258 @param options: extra options
1259 @keyword dbConnectInst: instance of a L{DatabaseConnector}, if none is
1260 given, default settings will be assumed.
1261 @keyword sourceOperators: list of L{ReadingOperator}s used for handling
1262 source readings.
1263 @keyword targetOperators: list of L{ReadingOperator}s used for handling
1264 target readings.
1265 @keyword sandhiFunction: a function that handles tonal changes
1266 and converts a given list of entities to accommodate sandhi
1267 occurrences, see L{lowThirdAndNeutralToneRule()} for the default
1268 implementation.
1269 @keyword coarticulationFunction: a function that handles coarticulation
1270 effects, see L{finalECoarticulation()} for an example
1271 implementation.
1272 """
1273 super(PinyinIPAConverter, self).__init__(*args, **options)
1274
1275
1276 if 'sandhiFunction' in options:
1277 self.optionValue['sandhiFunction'] = options['sandhiFunction']
1278
1279 if 'coarticulationFunction' in options:
1280 self.optionValue['coarticulationFunction'] \
1281 = options['coarticulationFunction']
1282
1283 @classmethod
1290
def convertEntities(self, readingEntities, fromReading='Pinyin',
    toReading='MandarinIPA'):
    """
    Converts a list of Pinyin entities to their IPA representation.

    Syllables are first normalised using C{PINYIN_OPTIONS}, split into
    plain syllable and tone, converted one by one (giving the function set
    as option C{coarticulationFunction} the first chance), passed to the
    function set as option C{sandhiFunction} and finally joined with their
    tone by the target operator. Entities not recognised as reading
    entities are passed through unchanged.

    @type readingEntities: list of str
    @param readingEntities: list of entities written in source reading
    @type fromReading: str
    @param fromReading: name of the source reading
    @type toReading: str
    @param toReading: name of the target reading
    @rtype: list of str
    @return: list of entities written in target reading
    @raise UnsupportedError: if the pair of readings is not found in
        C{CONVERSION_DIRECTIONS}.
    """
    if (fromReading, toReading) not in self.CONVERSION_DIRECTIONS:
        raise UnsupportedError("conversion direction from '" \
            + fromReading + "' to '" + toReading + "' not supported")

    if self.readingFact.isReadingConversionSupported(fromReading,
        fromReading):
        # use the dialect converter of the source reading to bring the
        # input into the form described by PINYIN_OPTIONS
        readingEntities = self.readingFact.convertEntities(readingEntities,
            fromReading, fromReading,
            sourceOperators=[self._getFromOperator(fromReading)],
            targetOptions=self.PINYIN_OPTIONS)

    # split syllables into (plain syllable, tone), keep other entities
    entityTuples = []
    for entity in readingEntities:
        if self.readingFact.isReadingEntity(entity, fromReading,
            **self.PINYIN_OPTIONS):
            entityTuples.append(self.readingFact.splitEntityTone(entity,
                fromReading, **self.PINYIN_OPTIONS))
        else:
            entityTuples.append(entity)

    # convert syllable by syllable
    coarticulationFunction = self.getOption('coarticulationFunction')
    ipaTupelList = []
    for idx, entry in enumerate(entityTuples):
        if not isinstance(entry, tuple):
            ipaTupelList.append(entry)
            continue

        plainSyllable, tone = entry
        transEntry = None
        if coarticulationFunction:
            # left and right context are sliced with the loop index; the
            # handler returns a false value to request regular conversion
            transEntry = coarticulationFunction(self, entityTuples[:idx],
                plainSyllable, tone, entityTuples[idx+1:])
        if not transEntry:
            transEntry = self._convertSyllable(plainSyllable, tone)
        ipaTupelList.append(transEntry)

    # tone sandhi is only of interest if tones are written at all
    toOperator = self._getToOperator(toReading)
    if toOperator.getOption('toneMarkType') != 'None':
        ipaTupelList = self.getOption('sandhiFunction')(self, ipaTupelList)

    # join plain syllable and tone
    toReadingEntities = []
    for entry in ipaTupelList:
        if isinstance(entry, tuple):
            plainSyllable, tone = entry
            toReadingEntities.append(
                toOperator.getTonalEntity(plainSyllable, tone))
        else:
            toReadingEntities.append(entry)
    return toReadingEntities
1358
1360 """
1361 Converts a single syllable from Pinyin to IPA.
1362
1363 @type plainSyllable: str
1364 @param plainSyllable: plain syllable in the source reading
1365 @type tone: int
1366 @param tone: the syllable's tone
1367 @rtype: str
1368 @return: IPA representation
1369 """
1370
1371 table = self.db.tables['PinyinIPAMapping']
1372 transSyllables = self.db.selectScalars(select([table.c.IPA],
1373 and_(table.c.Pinyin == plainSyllable,
1374 table.c.Feature.in_(['', 'Default']))))
1375
1376 if not transSyllables:
1377 raise ConversionError("conversion for entity '" + plainSyllable \
1378 + "' not supported")
1379 elif len(transSyllables) != 1:
1380 raise ConversionError("conversion for entity '" + plainSyllable \
1381 + "' ambiguous")
1382 if tone:
1383 transTone = self.TONEMARK_MAPPING[tone]
1384 else:
1385 transTone = None
1386
1387 return transSyllables[0], transTone
1388
1390 """
1391 Converts C{'3rdToneRegular'} to C{'3rdToneLow'} for syllables followed
1392 by others and C{'5thTone'} to the respective forms when following
1393 another syllable.
1394
1395 This function serves as the default rule and can be overwritten by
1396 giving a function as option C{sandhiFunction} on instantiation.
1397
1398 @type entityTuples: list of tuple/str
1399 @param entityTuples: a list of tuples and strings. An IPA entity is
1400 given as a tuple with the plain syllable and its tone, other content
1401 is given as plain string.
1402 @rtype: list
1403 @return: converted entity list
1404 @todo Lang: What to do on several following neutral tones?
1405 """
1406
1407 if len(entityTuples) <= 1:
1408 return entityTuples
1409
1410
1411 convertedEntities = []
1412 precedingTone = None
1413 for idx, entry in enumerate(entityTuples):
1414 if type(entry) == type(()):
1415 plainSyllable, tone = entry
1416
1417 if tone == '5thTone' and precedingTone:
1418 tone = self.NEUTRAL_TONE_MAPPING[precedingTone]
1419 elif tone == '3rdToneRegular' and idx + 1 != len(entityTuples):
1420 tone = '3rdToneLow'
1421 entry = (plainSyllable, tone)
1422
1423 precedingTone = tone
1424 else:
1425 precedingTone = None
1426
1427 convertedEntities.append(entry)
1428
1429 return convertedEntities
1430
def finalECoarticulation(self, leftContext, plainSyllable, tone,
    rightContext):
    u"""
    Example function for handling coarticulation of final I{e} for the
    neutral tone.

    Only syllables with final I{e} in the fifth tone are considered, for
    all other input C{None} is returned. This will trigger the regular
    conversion method.

    Pronunciation of final I{e}
    ===========================
    The final I{e} found in syllables I{de}, I{me} and others is
    pronounced /ɤ/ in the general case (see source below) but if tonal
    stress is missing it will be pronounced /ə/. This implementation will
    take care of this for the fifth tone. Syllables given without tone
    information are not handled here.

    Source: Hànyǔ Pǔtōnghuà Yǔyīn Biànzhèng (汉语普通话语音辨正). Page 15,
    Běijīng Yǔyán Dàxué Chūbǎnshè (北京语言大学出版社), 2003,
    ISBN 7-5619-0622-6.

    @type leftContext: list of tuple/str
    @param leftContext: syllables preceding the syllable in question in the
        source reading
    @type plainSyllable: str
    @param plainSyllable: plain syllable in the source reading
    @type tone: int
    @param tone: the syllable's tone
    @type rightContext: list of tuple/str
    @param rightContext: syllables following the syllable in question in the
        source reading
    @rtype: tuple
    @return: tuple of the IPA syllable and the entry of C{TONEMARK_MAPPING}
        for the given tone, or C{None} if the syllable is not handled
    @raise ConversionError: if the database holds no entry or more than one
        entry with feature C{'5thTone'} for the syllable.
    """
    if tone != 5:
        return None

    # NOTE(review): the Pinyin operator is requested via _getToOperator()
    # although Pinyin is the source reading of this converter - confirm.
    _, final = self._getToOperator('Pinyin').getOnsetRhyme(plainSyllable)
    if final != 'e':
        return None

    # lookup in database, restricted to forms marked for the fifth tone
    table = self.db.tables['PinyinIPAMapping']
    transSyllables = self.db.selectScalars(select([table.c.IPA],
        and_(table.c.Pinyin == plainSyllable,
            table.c.Feature == '5thTone')))
    if not transSyllables:
        raise ConversionError("conversion for entity '" \
            + plainSyllable + "' not supported")
    elif len(transSyllables) != 1:
        raise ConversionError("conversion for entity '" \
            + plainSyllable + "' and tone '" + str(tone) \
            + "' ambiguous")

    return transSyllables[0], self.TONEMARK_MAPPING[tone]
1484
1487 """
1488 PinyinBrailleConverter defines a converter between the Chinese romanisation
1489 I{Hanyu Pinyin} (with tone marks as numbers) and the I{Braille} system for
1490 Mandarin.
1491
1492 Conversion from Braille to Pinyin is ambiguous. The syllable pairs mo/me,
1493 e/o and le/lo will yield an L{AmbiguousConversionError}.
1494
1495 @see:
1496 - How is Chinese written in Braille?:
1497 U{http://www.braille.ch/pschin-e.htm}
1498 - Chinese Braille: U{http://en.wikipedia.org/wiki/Chinese_braille}
1499 @todo Impl: Move the toneMarks option to the L{MandarinBrailleOperator}.
1500 """
1501 CONVERSION_DIRECTIONS = [('Pinyin', 'MandarinBraille'),
1502 ('MandarinBraille', 'Pinyin')]
1503
1504 PUNCTUATION_SIGNS_MAPPING = {u'。': u'⠐⠆', u',': u'⠐', u'?': u'⠐⠄',
1505 u'!': u'⠰⠂', u':': u'⠒', u';': u'⠰', u'-': u'⠠⠤', u'…': u'⠐⠐⠐',
1506 u'·': u'⠠⠄', u'(': u'⠰⠄', u')': u'⠠⠆', u'[': u'⠰⠆', u']': u'⠰⠆'}
1507
def __init__(self, *args, **options):
    """
    Creates an instance of the PinyinBrailleConverter.

    Besides storing the options this loads the initial/final mappings via
    C{_createMappings()}, builds the reverse punctuation mapping (Braille
    to Pinyin) and compiles the regular expressions used to split
    punctuation out of non-reading entities.

    @param args: optional list of L{RomanisationOperator}s to use for
        handling source and target readings.
    @param options: extra options
    @keyword dbConnectInst: instance of a L{DatabaseConnector}, if none is
        given, default settings will be assumed.
    @keyword sourceOperators: list of L{ReadingOperator}s used for handling
        source readings.
    @keyword targetOperators: list of L{ReadingOperator}s used for handling
        target readings.
    @keyword toneMarks: if set to C{True} tone marks will be used when
        converted to Braille representation.
    """
    super(PinyinBrailleConverter, self).__init__(*args, **options)
    # set options
    if 'toneMarks' in options:
        self.optionValue['toneMarks'] = options['toneMarks']

    # get mappings
    self._createMappings()

    # Reverse punctuation mapping. A Braille sign that stands for more than
    # one punctuation mark is mapped to None, which convertEntities()
    # reports as an ambiguous conversion. The check has to be made on the
    # Braille sign (the key of the reverse mapping), not on the Pinyin key.
    self.reversePunctuationMapping = {}
    for key, value in self.PUNCTUATION_SIGNS_MAPPING.items():
        if value in self.reversePunctuationMapping:
            self.reversePunctuationMapping[value] = None
        else:
            self.reversePunctuationMapping[value] = key

    # regex to split out punctuation
    pinyinPunctuation = [re.escape(p)
        for p in self.PUNCTUATION_SIGNS_MAPPING]
    self.pinyinPunctuationRegex = re.compile(
        u'(' + '|'.join(pinyinPunctuation) + '|.+?)')

    # longer marks first in regex, so that a sign that is a prefix of
    # another one doesn't shadow it
    braillePunctuation = list(set(self.PUNCTUATION_SIGNS_MAPPING.values()))
    braillePunctuation.sort(key=len, reverse=True)
    self.braillePunctuationRegex = re.compile(u'(' \
        + '|'.join([re.escape(p) for p in braillePunctuation]) + '|.+?)')
1553
1554 @classmethod
1560
1562 """
1563 Creates the mappings of syllable initials and finals from the database.
1564 """
1565
1566 self.pinyinInitial2Braille = {}
1567 self.braille2PinyinInitial = {}
1568
1569 table = self.db.tables['PinyinBrailleInitialMapping']
1570 entries = self.db.selectRows(
1571 select([table.c.PinyinInitial, table.c.Braille]))
1572
1573 for pinyinInitial, brailleChar in entries:
1574
1575 if pinyinInitial in self.pinyinInitial2Braille:
1576 raise ValueError(
1577 "Ambiguous mapping from Pinyin syllable initial to Braille")
1578 self.pinyinInitial2Braille[pinyinInitial] = brailleChar
1579
1580 if brailleChar not in self.braille2PinyinInitial:
1581 self.braille2PinyinInitial[brailleChar] = set()
1582 self.braille2PinyinInitial[brailleChar].add(pinyinInitial)
1583
1584 self.pinyinInitial2Braille[''] = ''
1585 self.braille2PinyinInitial[''] = set([''])
1586
1587
1588 self.pinyinFinal2Braille = {}
1589 self.braille2PinyinFinal = {}
1590
1591 table = self.db.tables['PinyinBrailleFinalMapping']
1592 entries = self.db.selectRows(
1593 select([table.c.PinyinFinal, table.c.Braille]))
1594
1595 for pinyinFinal, brailleChar in entries:
1596
1597 if pinyinFinal in self.pinyinFinal2Braille:
1598 raise ValueError(
1599 "Ambiguous mapping from Pinyin syllable final to Braille")
1600 self.pinyinFinal2Braille[pinyinFinal] = brailleChar
1601
1602 if brailleChar not in self.braille2PinyinFinal:
1603 self.braille2PinyinFinal[brailleChar] = set()
1604 self.braille2PinyinFinal[brailleChar].add(pinyinFinal)
1605
1606
1607 self.pinyinFinal2Braille[u'ê'] = self.pinyinFinal2Braille[u'e']
1608
1610 if (fromReading, toReading) not in self.CONVERSION_DIRECTIONS:
1611 raise UnsupportedError("conversion direction from '" \
1612 + fromReading + "' to '" + toReading + "' not supported")
1613
1614 if self.readingFact.isReadingConversionSupported(fromReading,
1615 fromReading):
1616
1617
1618 readingEntities = self.readingFact.convertEntities(readingEntities,
1619 fromReading, fromReading,
1620 sourceOperators=[self._getFromOperator(fromReading)],
1621 targetOptions={'Erhua': 'ignore', 'toneMarkType': 'Numbers',
1622 'missingToneMark': 'noinfo'})
1623
1624 toReadingEntities = []
1625 if fromReading == "Pinyin":
1626 for entity in readingEntities:
1627
1628 if self._getFromOperator(fromReading).isReadingEntity(entity):
1629 toReadingEntity = self.convertBasicEntity(entity,
1630 fromReading, toReading)
1631 toReadingEntities.append(toReadingEntity)
1632 else:
1633
1634 for subEntity in self.pinyinPunctuationRegex.findall(
1635 entity):
1636 if subEntity in self.PUNCTUATION_SIGNS_MAPPING:
1637 toReadingEntities.append(
1638 self.PUNCTUATION_SIGNS_MAPPING[subEntity])
1639 else:
1640 toReadingEntities.append(subEntity)
1641 elif fromReading == "MandarinBraille":
1642 for entity in readingEntities:
1643 if self._getFromOperator(fromReading).isReadingEntity(entity):
1644 toReadingEntity = self.convertBasicEntity(entity.lower(),
1645 fromReading, toReading)
1646 toReadingEntities.append(toReadingEntity)
1647 else:
1648
1649 for subEntity in self.braillePunctuationRegex.findall(
1650 entity):
1651 if subEntity in self.reversePunctuationMapping:
1652 if not self.reversePunctuationMapping[subEntity]:
1653 raise AmbiguousConversionError(
1654 "conversion for entity '" + subEntity \
1655 + "' is ambiguous")
1656 toReadingEntities.append(
1657 self.reversePunctuationMapping[subEntity])
1658 else:
1659 toReadingEntities.append(subEntity)
1660
1661
1662 if self.readingFact.isReadingConversionSupported(toReading, toReading):
1663 toReadingEntities = self.readingFact.convertEntities(
1664 toReadingEntities, toReading, toReading,
1665 targetOperators=[self._getToOperator(toReading)])
1666 return toReadingEntities
1667
1669 """
1670 Converts a basic entity (a syllable) in the source reading to the given
1671 target reading.
1672
1673 This method is called by L{convertEntities()} and a single entity
1674 is given for conversion.
1675
1676 If a single entity needs to be converted it is recommended to use
1677 L{convertEntities()} instead. In the general case it can not be ensured
1678 that a mapping from one reading to another can be done by the simple
1679 conversion of a basic entity. One-to-many mappings are possible and
1680 there is no guarantee that any entity of a reading recognised by
1681 L{operator.ReadingOperator.isReadingEntity()} will be mapped here.
1682
1683 @type entity: str
1684 @param entity: string written in the source reading in lower case
1685 letters
1686 @type fromReading: str
1687 @param fromReading: name of the source reading
1688 @type toReading: str
1689 @param toReading: name of the target reading, different from the source
1690 reading
1691 @rtype: str
1692 @returns: the entity converted to the C{toReading} in lower case
1693 @raise AmbiguousConversionError: if conversion for this entity of the
1694 source reading is ambiguous.
1695 @raise ConversionError: on other operations specific to the conversion
1696 of the entity.
1697 @raise InvalidEntityError: if the entity is invalid.
1698 """
1699
1700 plainEntity, tone \
1701 = self._getFromOperator(fromReading).splitEntityTone(entity)
1702
1703 if fromReading == "Pinyin":
1704 initial, final \
1705 = self._getFromOperator(fromReading).getOnsetRhyme(plainEntity)
1706 try:
1707 transSyllable = self.pinyinInitial2Braille[initial] \
1708 + self.pinyinFinal2Braille[final]
1709 except KeyError:
1710 raise ConversionError("conversion for entity '" \
1711 + plainEntity + "' not supported")
1712 elif fromReading == "MandarinBraille":
1713
1714 initial, final \
1715 = self._getFromOperator(fromReading).getOnsetRhyme(plainEntity)
1716
1717
1718 forms = []
1719 for i in self.braille2PinyinInitial[initial]:
1720 for f in self.braille2PinyinFinal[final]:
1721
1722 table = self.db.tables['PinyinInitialFinal']
1723 entry = self.db.selectScalar(
1724 select([table.c.Pinyin],
1725 and_(table.c.PinyinInitial == i,
1726 table.c.PinyinFinal == f)))
1727 if entry:
1728 forms.append(entry)
1729
1730
1731 if len(forms) > 1:
1732 for form in forms[:]:
1733 if not self._getToOperator(toReading).isPlainReadingEntity(
1734 form):
1735 forms.remove(form)
1736 if not forms:
1737 raise ConversionError("conversion for entity '" \
1738 + plainEntity + "' not supported")
1739 if len(forms) > 1:
1740 raise AmbiguousConversionError("conversion for entity '" \
1741 + plainEntity + "' is ambiguous")
1742 else:
1743 transSyllable = forms[0]
1744
1745
1746 if not self.getOption('toneMarks'):
1747 tone = None
1748 try:
1749 return self._getToOperator(toReading).getTonalEntity(transSyllable,
1750 tone)
1751 except InvalidEntityError, e:
1752
1753
1754 raise ConversionError(e)
1755
1758 u"""
1759 Provides a converter for different representations of the Cantonese
1760 romanisation I{Jyutping}.
1761 """
1762 CONVERSION_DIRECTIONS = [('Jyutping', 'Jyutping')]
1763
1783
1786 u"""
1787 Provides a converter for different representations of the I{Cantonese Yale}
1788 romanisation system.
1789
1790 High Level vs. High Falling Tone
1791 ================================
1792 As described in L{CantoneseYaleOperator} the abbreviated form of the
1793 Cantonese Yale romanisation system which uses numbers as tone marks makes no
1794 distinction between the high level tone and the high falling tone. On
1795 conversion to the form with diacritical marks it is thus important to choose
1796 the correct mapping. This can be configured by applying a special instance
1797 of a L{CantoneseYaleOperator}.
1798 """
1799 CONVERSION_DIRECTIONS = [('CantoneseYale', 'CantoneseYale')]
1800
1820
1823 """
1824 Provides a converter between the Cantonese romanisation systems I{Jyutping}
1825 and I{Cantonese Yale}.
1826
1827 Upper or lower case will be transferred between syllables, no special
1828 formatting according to the standards will be made. Upper/lower case will be
1829 identified according to three classes: either the whole syllable is upper
1830 case, only the initial letter is upper case or otherwise the whole syllable
1831 is assumed being lower case.
1832
1833 High Level vs. High Falling Tone
1834 ================================
1835 As described in L{CantoneseYaleOperator} the Cantonese Yale romanisation
1836 system makes a distinction between the high level tone and the high falling
1837 tone in general while Jyutping does not. On conversion it is thus important
1838 to choose the correct mapping. This can be configured by applying a special
1839 instance of a L{CantoneseYaleOperator}.
1840 """
1841 CONVERSION_DIRECTIONS = [('Jyutping', 'CantoneseYale'),
1842 ('CantoneseYale', 'Jyutping')]
1843
1844 DEFAULT_READING_OPTIONS = {'CantoneseYale': {'toneMarkType': 'Internal'},
1845 'Jyutping': {}}
1846
1847 DEFAULT_TONE_MAPPING = {2: '2ndTone', 3: '3rdTone', 4: '4thTone',
1848 5: '5thTone', 6: '6thTone'}
1849 """
1850 Mapping of Jyutping tones to Yale tones. Tone 1 needs to be handled
1851 independently.
1852 """
1853
1855 """
1856 Creates an instance of the JyutpingYaleConverter.
1857
1858 @param args: optional list of L{RomanisationOperator}s to use for
1859 handling source and target readings.
1860 @param options: extra options
1861 @keyword dbConnectInst: instance of a L{DatabaseConnector}, if none is
1862 given, default settings will be assumed.
1863 @keyword sourceOperators: list of L{ReadingOperator}s used for handling
1864 source readings.
1865 @keyword targetOperators: list of L{ReadingOperator}s used for handling
1866 target readings.
1867 """
1868 super(JyutpingYaleConverter, self).__init__(*args, **options)
1869
1871
1872 plainSyllable, tone = self.readingFact.splitEntityTone(entity,
1873 fromReading, **self.DEFAULT_READING_OPTIONS[fromReading])
1874
1875
1876 if fromReading == "CantoneseYale":
1877 table = self.db.tables['JyutpingYaleMapping']
1878 transSyllable = self.db.selectScalar(
1879 select([table.c.Jyutping],
1880 table.c.CantoneseYale == plainSyllable))
1881
1882 if tone:
1883
1884 transTone = int(tone[0])
1885 else:
1886 transTone = None
1887 elif fromReading == "Jyutping":
1888 table = self.db.tables['JyutpingYaleMapping']
1889 transSyllable = self.db.selectScalar(
1890 select([table.c.CantoneseYale],
1891 table.c.Jyutping == plainSyllable))
1892
1893 if not tone:
1894 transTone = None
1895 elif tone != 1:
1896 transTone = self.DEFAULT_TONE_MAPPING[tone]
1897 else:
1898
1899 transTone \
1900 = self._getToOperator(toReading).getOption('YaleFirstTone')
1901
1902 if not transSyllable:
1903 raise ConversionError("conversion for entity '" + plainSyllable \
1904 + "' not supported")
1905 try:
1906 return self.readingFact.getTonalEntity(transSyllable, transTone,
1907 toReading, **self.DEFAULT_READING_OPTIONS[toReading])
1908 except InvalidEntityError, e:
1909
1910
1911 raise ConversionError(e)
1912
class BridgeConverter(ReadingConverter):
    """
    Provides a L{ReadingConverter} that converts between readings over a third
    reading called bridge reading.
    """
    # NOTE(review): the "class" line, the three "def" lines, the comments and
    # all indentation were missing from this listing. The class name is taken
    # from the super() call in __init__; the base class and the name
    # convertEntities() are reconstructed and must be confirmed against the
    # pristine source.

    def _getConversionDirections(bridge):
        """
        Extracts all conversion directions implicitly stored in the bridge
        definition.

        This is a plain function without C{self}: it is called further down
        while the class body is still being executed. Each direction is
        reported once, in order of its first appearance in C{bridge}.

        @type bridge: list of tuple
        @param bridge: 3-tuples indicating conversion direction over a third
            reading (bridge)
        @rtype: list of tuple
        @return: conversion directions
        """
        directions = []
        seen = set()
        for fromReading, _bridgeReading, toReading in bridge:
            direction = (fromReading, toReading)
            if direction not in seen:
                seen.add(direction)
                directions.append(direction)
        return directions

    CONVERSION_BRIDGE = [
        ('WadeGiles', 'Pinyin', 'MandarinIPA'),
        ('MandarinBraille', 'Pinyin', 'MandarinIPA'),
        ('WadeGiles', 'Pinyin', 'MandarinBraille'),
        ('MandarinBraille', 'Pinyin', 'WadeGiles'),
        ('GR', 'Pinyin', 'WadeGiles'),
        ('MandarinBraille', 'Pinyin', 'GR'),
        ('WadeGiles', 'Pinyin', 'GR'),
        ('GR', 'Pinyin', 'MandarinBraille'),
        ('GR', 'Pinyin', 'MandarinIPA'),
        ]
    """
    List containing all conversion directions together with the bridge reading
    over which the conversion is made.
    Form: (fromReading, bridgeReading, toReading)
    As conversion may be lossy it is important which conversion path is chosen.
    """

    CONVERSION_DIRECTIONS = _getConversionDirections(CONVERSION_BRIDGE)

    def __init__(self, *args, **options):
        """
        Creates an instance of the BridgeConverter.

        Besides delegating to the parent constructor this builds
        C{bridgeLookup}, a mapping from C{(fromReading, toReading)} to the
        bridge reading taken from L{CONVERSION_BRIDGE}.

        @param args: optional list of L{RomanisationOperator}s to use for
            handling source and target readings.
        @param options: extra options passed to the L{ReadingConverter}s
        @keyword dbConnectInst: instance of a L{DatabaseConnector}, if none is
            given, default settings will be assumed.
        @keyword sourceOperators: list of L{ReadingOperator}s used for handling
            source readings.
        @keyword targetOperators: list of L{ReadingOperator}s used for handling
            target readings.
        """
        super(BridgeConverter, self).__init__(*args, **options)

        # If a direction were listed twice, the later bridge entry wins.
        self.bridgeLookup = dict(
            ((fromReading, toReading), bridgeReading)
            for fromReading, bridgeReading, toReading
            in self.CONVERSION_BRIDGE)

    def convertEntities(self, readingEntities, fromReading, toReading):
        """
        Converts reading entities in two hops over the bridge reading
        registered for the given direction.

        @param readingEntities: entities written in the source reading
        @param fromReading: name of the source reading
        @param toReading: name of the target reading
        @return: the entities converted to the target reading
        @raise UnsupportedError: if the direction is not listed in
            L{CONVERSION_DIRECTIONS}.
        """
        if (fromReading, toReading) not in self.CONVERSION_DIRECTIONS:
            raise UnsupportedError("conversion direction from '" \
                + fromReading + "' to '" + toReading + "' not supported")
        bridgeReading = self.bridgeLookup[(fromReading, toReading)]

        # First hop, source -> bridge: pass this converter's own source
        # operator so its settings apply when reading the input.
        bridgeReadingEntities = self.readingFact.convertEntities(
            readingEntities, fromReading, bridgeReading,
            sourceOperators=[self._getFromOperator(fromReading)])

        # Second hop, bridge -> target: pass this converter's own target
        # operator so its settings apply when writing the output.
        toReadingEntities = self.readingFact.convertEntities(
            bridgeReadingEntities, bridgeReading, toReading,
            targetOperators=[self._getToOperator(toReading)])
        return toReadingEntities
1989