1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 u"""
19 Provides L{ReadingOperator}s, classes to handle strings written in a character
20 reading.
21
22 Examples
23 ========
24 Decompose a reading string in I{Gwoyeu Romatzyh} into single entities:
25
26 >>> from cjklib.reading import ReadingFactory
27 >>> f = ReadingFactory()
28 >>> f.decompose('"Hannshyue" .de mingcheng duey Jonggwo [...]', 'GR')
29 ['"', 'Hann', 'shyue', '" ', '.de', ' ', 'ming', 'cheng', ' ', 'duey',
30 ' ', 'Jong', 'gwo', ' [...]']
31
32 The same can be done by directly using the operator's instance:
33
34 >>> from cjklib.reading import operator
35 >>> cy = operator.CantoneseYaleOperator()
36 >>> cy.decompose('gwóngjàuwá')
37 [u'gw\xf3ng', u'j\xe0u', u'w\xe1']
38
39 Composing will reverse the process, using a I{Pinyin} string:
40
41 >>> f.compose([u'xī', u'ān'], 'Pinyin')
42 u"x\u012b'\u0101n"
43
44 For more complex operators, see L{PinyinOperator} or L{MandarinIPAOperator}.
45 """
46 import re
47 import unicodedata
48 import copy
49
50 from sqlalchemy import Table, Column, Integer, String
51 from sqlalchemy import select, union
52 from sqlalchemy.sql import and_, or_, not_
53
54 from cjklib.exception import (AmbiguousConversionError, DecompositionError,
55 AmbiguousDecompositonError, InvalidEntityError, UnsupportedError)
56 from cjklib.dbconnector import DatabaseConnector
59 """
60 Defines an abstract operator on text written in a I{character reading}.
61
62 The two basic methods are L{decompose()} and L{compose()}. L{decompose()}
63 breaks down a text into the basic entities of that reading (additional non
64 reading substrings are accepted though). L{compose()} joins these entities
65 together again and applies formatting rules needed by the reading.
66 Additionally the method L{isReadingEntity()} is provided to check which of
67 the strings returned by L{decompose()} are supported entities for the given
68 reading.
69
70 The methods L{getDefaultOptions()} and L{getOption()} provide means to
71 handle the I{reading dialect}'s specific settings.
72
73 The class itself can't be used directly, it has to be subclassed and its
74 methods need to be extended.
75 """
76 READING_NAME = None
77 """Unique name of reading"""
78
def __init__(self, **options):
    """
    Creates an instance of the ReadingOperator.

    The database connector is stored on C{self.db} and C{self.optionValue}
    is filled with the values returned by L{getDefaultOptions()}.

    @param options: extra options
    @keyword dbConnectInst: instance of a L{DatabaseConnector}, if none is
        given, default settings will be assumed.
    """
    if 'dbConnectInst' in options:
        self.db = options['dbConnectInst']
    else:
        self.db = DatabaseConnector.getDBConnector()

    self.optionValue = {}
    for option, value in self.getDefaultOptions().items():
        # Container defaults are deep-copied so that no two operator
        # instances share (and accidentally mutate) the same default object.
        # isinstance() also catches subclasses of the builtin containers,
        # which an exact type() comparison would silently leave shared.
        if isinstance(value, (tuple, list, dict, set)):
            value = copy.deepcopy(value)
        self.optionValue[option] = value
100
101 @classmethod
103 """
104 Returns the reading operator's default options.
105
106 The default implementation returns an empty dictionary. The keyword
107 'dbConnectInst' is not regarded a configuration option of the operator
108 and is thus not included in the dict returned.
109
110 @rtype: dict
111 @return: the reading operator's default options.
112 """
113 return {}
114
116 """
117 Returns the value of the reading operator's option.
118
119 @return: the value of the given reading operator's option.
120 """
121 return self.optionValue[option]
122
124 """
125 Decomposes the given string into basic entities that can be mapped to
126 one Chinese character each (exceptions possible).
127
128 The given input string can contain other non reading characters, e.g.
129 punctuation marks.
130
131 The returned list contains a mix of basic reading entities and other
132 characters e.g. spaces and punctuation marks.
133
134 The default implementation will raise a NotImplementedError.
135
136 @type string: str
137 @param string: reading string
138 @rtype: list of str
139 @return: a list of basic entities of the input string
140 @raise DecompositionError: if the string can not be decomposed.
141 """
142 raise NotImplementedError
143
def compose(self, readingEntities):
    """
    Joins the given list of basic entities into a single reading string.

    This base class offers no implementation: calling the method always
    raises a NotImplementedError.

    @type readingEntities: list of str
    @param readingEntities: list of basic entities or other content
    @rtype: str
    @return: composed entities
    """
    raise NotImplementedError()
156
158 """
159 Returns true if the given entity is recognised by the reading
160 operator, i.e. it is a valid entity of the reading returned by
161 L{decompose()}.
162
163 The default implementation will raise a NotImplementedError.
164
165 @type entity: str
166 @param entity: entity to check
167 @rtype: bool
168 @return: true if string is an entity of the reading, false otherwise.
169 """
170 raise NotImplementedError
171
174 """
175 Defines an abstract L{ReadingOperator} on text written in a I{romanisation},
176 i.e. text written in the Latin alphabet or written in the Cyrillic alphabet.
177
178 Additional to L{decompose()} provided by the class L{ReadingOperator} this
179 class offers a method L{getDecompositions()} that returns several possible
180 decompositions in an ambiguous case.
181
182 This class itself can't be used directly, it has to be subclassed and
183 extended.
184
185 X{Decomposition}
186 ================
187 Transcriptions into the Latin alphabet generate the problem that syllable
188 boundaries or boundaries of entities belonging to single Chinese characters
189 aren't clear anymore once entities are grouped together.
190
191 Therefore it is important to have methods at hand to separate these strings
192 and to split them into single entities. This though cannot always be done
193 in a clear and unambiguous way as several different decompositions might be
194 possible thus leading to the general case of X{ambiguous decomposition}s.
195
196 Many romanisations do provide a way to tackle this problem. Pinyin for
197 example requires the use of an apostrophe (C{'}) when the reverse process
198 of splitting the string into syllables gets ambiguous. The Wade-Giles
199 romanisation in its strict implementation asks for a hyphen used between all
200 syllables. The LSHK's Jyutping when written with tone marks will always be
201 clearly decomposable.
202
203 The method L{isStrictDecomposition()} can be implemented to check if one
204 possible decomposition is the X{strict decomposition} offered by the
205 romanisation's protocol. This method should guarantee that under all
206 circumstances only one decomposed version will be regarded as strict.
207
208 If no strict version is yielded and different decompositions exist an
209 X{unambiguous decomposition} can not be made. These decompositions can be
210 accessed through method L{getDecompositions()}, even in cases where a
211 strict decomposition exists.
212 @todo Impl: Optimise decompose() as to incorporate segment() and prune the
213 tree while it is created. Does this though yield significant
214 improvement? Would at least be O(n).
215 """
216 readingEntityRegex = re.compile(u"([A-Za-z]+)")
217 """Regular Expression for finding romanisation entities in input."""
218
220 """
221 Creates an instance of the RomanisationOperator.
222
223 @param options: extra options
224 @keyword dbConnectInst: instance of a L{DatabaseConnector}, if none is
225 given, default settings will be assumed.
226 @keyword strictSegmentation: if C{True} segmentation (using
227 L{segment()}) and thus decomposition (using L{decompose()}) will
228 raise an exception if an alphabetic string is parsed which can not
229 be segmented into single reading entities. If C{False} the aforesaid
230 string will be returned unsegmented.
231 @keyword case: if set to C{'lower'}/C{'upper'}, only lower/upper
232 case will be supported, respectively, if set to C{'both'} both upper
233 and lower case will be supported.
234 """
235 super(RomanisationOperator, self).__init__(**options)
236
237 if 'strictSegmentation' in options:
238 self.optionValue['strictSegmentation'] \
239 = options['strictSegmentation']
240
241 if 'case' in options:
242 self.optionValue['case'] = options['case']
243
244 self.syllableTable = None
245 self.substringSet = None
246
247 @classmethod
253
def decompose(self, string):
    """
    Decomposes the given string into basic entities on a one-to-one mapping
    level to Chinese characters.

    The string is first turned into a lattice by L{getDecompositionTree()}.
    For every lattice part that offers more than one decomposition the
    choice is narrowed down in two steps:
        1. decompositions for which L{_hasMergeableSyllables()} is true are
           discarded, i.e. the variant with the longer entities wins;
        2. if that leaves anything but exactly one candidate, the first
           candidate accepted by L{isStrictDecomposition()} is taken.

    The given input string can contain other characters not supported by the
    reading, e.g. punctuation marks. The returned list then contains a mix
    of basic reading entities and other characters e.g. spaces and
    punctuation marks.

    @type string: str
    @param string: reading string
    @rtype: list of str
    @return: a list of basic entities of the input string
    @raise AmbiguousDecompositonError: if decomposition is ambiguous.
    @raise DecompositionError: if the given string has a wrong format.
    """
    result = []
    for alternatives in self.getDecompositionTree(string):
        if len(alternatives) == 1:
            # only one reading of this part, nothing to choose from
            result.extend(alternatives[0])
            continue

        # drop every variant that splits a longer valid entity in two
        candidates = []
        for decomposition in alternatives:
            if not self._hasMergeableSyllables(decomposition):
                candidates.append(decomposition)

        if len(candidates) == 1:
            result.extend(candidates[0])
            continue

        # still undecided: fall back to the reading's own strictness rules
        for decomposition in candidates:
            if self.isStrictDecomposition(decomposition):
                result.extend(decomposition)
                break
        else:
            # no candidate was strict; the message shows the decomposition
            # that was looked at last
            raise AmbiguousDecompositonError("decomposition of '" \
                + string + "' ambiguous: '" \
                + ''.join(decomposition) + "'")

    return result
308
def getDecompositionTree(self, string):
    """
    Builds the lattice of all possible decompositions of the given string.

    The string is split with C{readingEntityRegex}. Parts matching the
    regular expression are handed to L{segment()} and contribute all their
    segmentations; every other non-empty part is kept as it is and
    contributes a single one-element decomposition.

    @type string: str
    @param string: reading string
    @rtype: list
    @return: a list of all possible decompositions consisting of basic
        entities as a lattice construct.
    @raise DecompositionError: if the given string has a wrong format.
    """
    lattice = []
    for part in self.readingEntityRegex.split(string):
        if part == '':
            # splitting yields empty strings around neighbouring matches
            continue
        if self.readingEntityRegex.match(part):
            lattice.append(self.segment(part))
        else:
            lattice.append([[part]])
    return lattice
336
def getDecompositions(self, string):
    """
    Returns every possible decomposition of the given string as a flat list
    of entities. This method is a more general version of L{decompose()}.

    The lattice from L{getDecompositionTree()} is expanded with
    L{_crossProduct()}; each resulting combination is then flattened. The
    returned lists mix entities of the romanisation with other strings.

    @type string: str
    @param string: reading string
    @rtype: list of list of str
    @return: a list of all possible decompositions consisting of basic
        entities.
    @raise DecompositionError: if the given string has a wrong format.
    """
    combinations = self._crossProduct(self.getDecompositionTree(string))
    return [[entity for part in combination for entity in part]
        for combination in combinations]
365
def segment(self, string):
    """
    Returns the possible segmentations of a string written in the
    romanisation, each one as a list of syllables.

    In contrast to L{decompose()} this method merely segments continuous
    entities of the romanisation. Characters not part of the romanisation
    will not be dealt with, this is the task of the more general decompose
    method.

    If a non-empty string cannot be segmented at all, the outcome depends on
    the option C{strictSegmentation}: an exception is raised if it is set,
    otherwise the unsegmented string is returned as the only segmentation.

    @type string: str
    @param string: reading string
    @rtype: list of list of str
    @return: a list of possible segmentations (several if ambiguous) into
        single syllables
    @raise DecompositionError: if the given string has an invalid format.
    """
    tree = self._recursiveSegmentation(string)
    if string == '' or len(tree) != 0:
        paths = []
        for subtree in tree:
            paths.extend(self._treeToList(subtree))
        return paths

    if self.getOption('strictSegmentation'):
        raise DecompositionError(u"Segmentation of '" + string \
            + "' not possible or invalid syllable")
    return [[string]]
394
def _recursiveSegmentation(self, string):
    """
    Returns the possible segmentations of a string written in the
    romanisation as a tree of syllables.

    The tree is represented by tuples C{(syllable, subtree)}; the subtree of
    a syllable that ends the string is C{None}. A leading syllable whose
    remainder cannot be segmented is left out of the tree.

    @type string: str
    @param string: reading string
    @rtype: list of tuple
    @return: a tree of possible segmentations (if ambiguous) into single
        syllables
    """
    branches = []
    for end in range(1, len(string) + 1):
        head = string[:end]
        if not self._hasSyllableSubstring(head.lower()):
            # no syllable starts like this, so longer prefixes cannot either
            break
        if not self.isReadingEntity(head):
            continue

        tail = string[end:]
        if tail == '':
            branches.append((head, None))
            continue

        subtree = self._recursiveSegmentation(tail)
        if subtree != []:
            branches.append((head, subtree))
    return branches
423
def _hasMergeableSyllables(self, decomposition):
    """
    Checks if the given decomposition contains two or more consecutive
    syllables which together make up a new syllable.

    Segmentation can give several results with some possible syllables being
    even further subdivided (e.g. I{tian} to I{ti'an} in Pinyin). These
    segmentations are only secondary and the segmentation with the longer
    syllables will be the one to take.

    The joined syllables are lower-cased before they are tested.

    @type decomposition: list of str
    @param decomposition: decomposed reading string
    @rtype: bool
    @return: True if following syllables make up a syllable
    """
    count = len(decomposition)
    for startIndex in range(count - 1):
        # grow the run of joined syllables as long as it can still be the
        # beginning of a valid syllable
        for endIndex in range(startIndex + 2, count + 1):
            merged = "".join(decomposition[startIndex:endIndex]).lower()
            if not self._hasSyllableSubstring(merged):
                break
            if self.isReadingEntity(merged):
                return True
    return False
449
451 """
452 Checks if the given decomposition follows the romanisation format
453 strictly to allow unambiguous decomposition.
454
455 The romanisation should offer a way/protocol to make an unambiguous
456 decomposition into it's basic syllables possible as to make the process
457 of appending syllables to a string reversible. The testing on compliance
458 with this protocol has to be implemented here. Thus this method can only
459 return true for one and only one possible decomposition for all strings.
460
461 @type decomposition: list of str
462 @param decomposition: decomposed reading string
463 @rtype: bool
464 @return: False, as this methods needs to be implemented by the sub class
465 """
466 return False
467
def _hasSyllableSubstring(self, string):
    """
    Checks if the given string is a syllable supported by this romanisation
    or the beginning of one.

    The set of all syllable prefixes is computed from
    L{getReadingEntities()} on first use and cached in C{self.substringSet}.

    @type string: str
    @param string: romanisation syllable or substring
    @rtype: bool
    @return: true if this string is the beginning of a syllable, false
        otherwise
    """
    if self.substringSet is None:
        # Collect the prefixes in a local set and publish it only once it is
        # complete: should getReadingEntities() raise, the cache stays unset
        # instead of being left behind empty.
        prefixes = set()
        for syllable in self.getReadingEntities():
            for end in range(1, len(syllable) + 1):
                prefixes.add(syllable[:end])
        self.substringSet = prefixes
    return string in self.substringSet
486
def isReadingEntity(self, entity):
    """
    Returns true if the given entity is recognised by the romanisation
    operator, i.e. it is a valid entity of the reading returned by the
    segmentation method.

    The lookup itself is case insensitive, but the option C{case} can
    restrict the accepted input to purely lower case (C{'lower'}) or purely
    upper case (C{'upper'}) entities.

    The entity table is fetched from L{getReadingEntities()} on first use
    and cached in C{self.syllableTable}.

    @type entity: str
    @param entity: entity to check
    @rtype: bool
    @return: C{True} if string is an entity of the reading, C{False}
        otherwise.
    """
    case = self.getOption('case')
    if case == 'lower' and entity.lower() != entity:
        return False
    if case == 'upper' and entity.upper() != entity:
        return False

    if self.syllableTable is None:
        self.syllableTable = self.getReadingEntities()
    return entity.lower() in self.syllableTable
511
513 """
514 Gets a set of all entities supported by the reading.
515
516 The list is used in the segmentation process to find entity boundaries.
517 The default implementation will raise a NotImplementedError.
518
519 @rtype: set of str
520 @return: set of supported syllables
521 """
522 raise NotImplementedError
523
@staticmethod
def _crossProduct(singleLists):
    """
    Calculates the cross product (aka Cartesian product) of sets given as
    lists.

    The first list varies slowest, the last list fastest. An empty input
    yields one empty combination; an empty list among the inputs yields no
    combination at all.

    Example:
    >>> RomanisationOperator._crossProduct([['A', 'B'], [1, 2, 3]])
    [['A', 1], ['A', 2], ['A', 3], ['B', 1], ['B', 2], ['B', 3]]

    @type singleLists: list of list
    @param singleLists: a list of list entries containing various elements
    @rtype: list of list
    @return: the cross product of the given sets
    """
    # imported locally so that the block does not depend on a module-level
    # import being present
    import itertools

    return [list(combination)
        for combination in itertools.product(*singleLists)]
564
@staticmethod
def _treeToList(tupleTree):
    """
    Converts a tree to a list containing all full paths from root to leaf
    node.

    The tree is given by tuples C{(leaf node element, subtree)}; a node with
    an empty or missing subtree is a leaf. Paths are returned in the order
    in which the leaves appear in the tree.

    Example:
    >>> RomanisationOperator._treeToList(
    ...     ('A', [('B', None), ('C', [('D', None), ('E', None)])]))
    [['A', 'B'], ['A', 'C', 'D'], ['A', 'C', 'E']]

    @type tupleTree: tuple
    @param tupleTree: a tree realised through a tuple of a node and a
        subtree
    @rtype: list of list
    @return: a list of all paths contained by the given tree
    """
    paths = []
    # depth-first walk with an explicit stack of (path so far, subtree)
    pending = [([], tupleTree)]
    while pending:
        prefix, (node, children) = pending.pop()
        path = prefix + [node]
        if not children:
            paths.append(path)
            continue
        # pushed in reverse so that the leftmost child is handled first
        for child in reversed(children):
            pending.append((path, child))
    return paths
595
598 """
599 Provides an abstract L{ReadingOperator} for tonal languages for a reading
600 based on a fixed set of reading entities.
601
602 It provides two methods L{getTonalEntity()} and L{splitEntityTone()} to
603 cope with tonal information in text.
604
605 The class itself can't be used directly, it has to be subclassed and its
606 methods need to be extended.
607 """
609 """
610 Creates an instance of the TonalFixedEntityOperator.
611
612 @param options: extra options
613 """
614 super(TonalFixedEntityOperator, self).__init__(**options)
615
616 self.plainEntityTable = None
617
619 """
620 Returns a set of tones supported by the reading. These tones don't
621 necessarily reflect the tones of the underlying language but may defer
622 to reflect notational or other features.
623
624 The default implementation will raise a NotImplementedError.
625
626 @rtype: list
627 @return: list of supported tone marks.
628 """
629 raise NotImplementedError
630
632 """
633 Gets the entity with tone mark for the given plain entity and tone.
634
635 The default implementation will raise a NotImplementedError.
636
637 @type plainEntity: str
638 @param plainEntity: entity without tonal information
639 @param tone: tone
640 @rtype: str
641 @return: entity with appropriate tone
642 @raise InvalidEntityError: if the entity is invalid.
643 @raise UnsupportedError: if the operation is not supported for the given
644 form.
645 """
646 raise NotImplementedError
647
649 """
650 Splits the entity into an entity without tone mark (plain entity) and
651 the entity's tone.
652
653 The default implementation will raise a NotImplementedError.
654
655 @type entity: str
656 @param entity: entity with tonal information
657 @rtype: tuple
658 @return: plain entity without tone mark and entity's tone
659 @raise InvalidEntityError: if the entity is invalid.
660 @raise UnsupportedError: if the operation is not supported for the given
661 form.
662 """
663 raise NotImplementedError
664
def getReadingEntities(self):
    """
    Gets a set of all entities supported by the reading.

    The set is built by combining every plain entity from
    L{getPlainReadingEntities()} with every tone from L{getTones()} through
    L{getTonalEntity()}. It is used in the segmentation process to find
    entity boundaries.

    @rtype: set of str
    @return: set of supported syllables
    """
    return set(self.getTonalEntity(plainEntity, tone)
        for plainEntity in self.getPlainReadingEntities()
        for tone in self.getTones())
679
681 """
682 Gets the list of plain entities supported by this reading. Different to
683 L{getReadingEntities()} the entities will carry no tone mark.
684
685 The default implementation will raise a NotImplementedError.
686
687 @rtype: set of str
688 @return: set of supported syllables
689 """
690 raise NotImplementedError
691
def isPlainReadingEntity(self, entity):
    """
    Returns true if the given plain entity (without any tone mark) is
    recognised by the romanisation operator, i.e. it is a valid entity of
    the reading returned by the segmentation method.

    The table of plain entities is fetched from
    L{getPlainReadingEntities()} on first use and cached in
    C{self.plainEntityTable}.

    @type entity: str
    @param entity: entity to check
    @rtype: bool
    @return: C{True} if string is an entity of the reading, C{False}
        otherwise.
    """
    if self.plainEntityTable is None:
        self.plainEntityTable = self.getPlainReadingEntities()
    return entity in self.plainEntityTable
708
717
721 """
722 Provides an abstract L{RomanisationOperator} for tonal languages
723 incorporating methods from L{TonalFixedEntityOperator}.
724
725 It provides two methods L{getTonalEntity()} and L{splitEntityTone()} to
726 cope with tonal information in text.
727
728 The class itself can't be used directly, it has to be subclassed and its
729 methods need to be extended.
730 """
732 """
733 Creates an instance of the TonalRomanisationOperator.
734
735 @param options: extra options
736 @keyword dbConnectInst: instance of a L{DatabaseConnector}, if none is
737 given, default settings will be assumed.
738 @keyword strictSegmentation: if C{True} segmentation (using
739 L{segment()}) and thus decomposition (using L{decompose()}) will
740 raise an exception if an alphabetic string is parsed which can not
741 be segmented into single reading entities. If C{False} the aforesaid
742 string will be returned unsegmented.
743 """
744 super(TonalRomanisationOperator, self).__init__(**options)
745
747 """
748 Gets a set of all entities supported by the reading.
749
750 The list is used in the segmentation process to find entity boundaries.
751
752 @rtype: list of str
753 @return: list of supported syllables
754 """
755 return TonalFixedEntityOperator.getReadingEntities(self)
756
758 """
759 Returns true if the given plain entity (without any tone mark) is
760 recognised by the romanisation operator, i.e. it is a valid entity of
761 the reading returned by the segmentation method.
762
763 Reading entities will be handled as being case insensitive.
764
765 @type entity: str
766 @param entity: entity to check
767 @rtype: bool
768 @return: C{True} if string is an entity of the reading, C{False}
769 otherwise.
770 """
771
772 if self.getOption('case') == 'lower' and entity.lower() != entity:
773 return False
774 elif self.getOption('case') == 'upper' and entity.upper() != entity:
775 return False
776
777 return TonalFixedEntityOperator.isPlainReadingEntity(self,
778 entity.lower())
779
782
785 u"""
786 Defines an operator on strings of a tonal language written in the
787 X{International Phonetic Alphabet} (X{IPA}).
788
789 TonalIPAOperator does not supply the same closed set of syllables as
790 other L{ReadingOperator}s as IPA provides different ways to represent
791 pronunciation. Because of that a user defined IPA syllable will not easily
792 map to another transcription system and thus only basic support is provided
793 for this direction.
794
795 Tones
796 =====
797 Tones in IPA can be expressed using different schemes. The following schemes
798 are implemented here:
799 - Numbers, tone numbers ,
800 - ChaoDigits, numbers displaying the levels of Chao tone contours,
801 - IPAToneBar, IPA modifying tone bar characters, e.g. ɛw˥˧,
802 - Diacritics, diacritical marks and finally
803 - None, no support for tone marks
804
805 @todo Lang: Shed more light on representations of tones in IPA.
806 @todo Fix: Get all diacritics used in IPA as tones for L{TONE_MARK_REGEX}.
807 """
# Maps each tone mark type to the pattern whose group 1 captures the tone
# mark of an entity.
TONE_MARK_REGEX = {
    'Numbers': re.compile(r'(\d)$'),
    # a trailing run of Chao tone level digits (1 to 5), e.g. "55" or "214";
    # this has to be a character class, not the literal digit sequence
    'ChaoDigits': re.compile(r'([12345]+)$'),
    'IPAToneBar': re.compile(u'([˥˦˧˨˩꜈꜉꜊꜋꜌]+)$'),
    'Diacritics': re.compile(u'([\u0300\u0301\u0302\u0303\u030c]+)'),
    }
813
814 DEFAULT_TONE_MARK_TYPE = 'IPAToneBar'
815 """Tone mark type to select by default."""
816
817 TONES = []
818 """List of tone names. Needs to be implemented in child class."""
819
820 TONE_MARK_PREFER = {'Numbers': {}, 'ChaoDigits': {}, 'IPAToneBar': {},
821 'Diacritics': {}}
822 """
823 Mapping of tone marks to tone name which will be preferred on ambiguous
824 mappings. Needs to be implemented in child classes.
825 """
826
827 TONE_MARK_MAPPING = {'Numbers': {}, 'ChaoDigits': {}, 'IPAToneBar': {},
828 'Diacritics': {}}
829 """
830 Mapping of tone names to tone mark for each tone mark type. Needs to be
831 implemented in child classes.
832 """
833
def __init__(self, **options):
    """
    Creates an instance of the TonalIPAOperator.

    NOTE(review): the default for C{toneMarkType} comes from
    C{getDefaultOptions()}, which is not visible here - confirm it against
    C{DEFAULT_TONE_MARK_TYPE} before relying on a particular default.

    @param options: extra options
    @keyword dbConnectInst: instance of a L{DatabaseConnector}, if none is
        given, default settings will be assumed.
    @keyword toneMarkType: type of tone marks, one out of C{'Numbers'},
        C{'ChaoDigits'}, C{'IPAToneBar'}, C{'Diacritics'}, C{'None'}
    @keyword missingToneMark: if set to C{'noinfo'} no tone information
        will be deduced when no tone mark is found (takes on value C{None}),
        if set to C{'ignore'} this entity will not be valid.
    @raise ValueError: if C{toneMarkType} or C{missingToneMark} is given
        with a value not listed above.
    """
    super(TonalIPAOperator, self).__init__(**options)

    if 'toneMarkType' in options:
        if options['toneMarkType'] not in ['Numbers', 'ChaoDigits',
            'IPAToneBar', 'Diacritics', 'None']:
            raise ValueError("Invalid option '" \
                + str(options['toneMarkType']) \
                + "' for keyword 'toneMarkType'")
        self.optionValue['toneMarkType'] = options['toneMarkType']

    if 'missingToneMark' in options:
        if options['missingToneMark'] not in ['noinfo', 'ignore']:
            raise ValueError("Invalid option '" \
                + str(options['missingToneMark']) \
                + "' for keyword 'missingToneMark'")
        self.optionValue['missingToneMark'] = options['missingToneMark']

    # lookup from tone mark to tone, built lazily on first use
    self.toneMarkLookup = None

    # Entities are separated by periods and whitespace. A raw string keeps
    # the backslashes for the regex engine instead of relying on Python
    # passing unknown string escapes through unchanged.
    self.splitRegex = re.compile(r'([\.\s]+)')
871
872 @classmethod
879
def getTones(self):
    """
    Returns the tones supported by the reading as a new list.

    The list holds the entries of C{TONES}. C{None} is added as a further
    "tone" if the option C{missingToneMark} is C{'noinfo'} or the option
    C{toneMarkType} is C{'None'}.

    @rtype: list
    @return: list of supported tones
    """
    acceptsMissingTone = self.getOption('missingToneMark') == 'noinfo' \
        or self.getOption('toneMarkType') == 'None'
    if acceptsMissingTone:
        return self.TONES + [None]
    return self.TONES[:]
887
889 """
890 Decomposes the given string into basic entities that can be mapped to
891 one Chinese character each (exceptions possible).
892
893 The returned list contains a mix of basic reading entities and other
894 characters e.g. spaces and punctuation marks.
895
896 Single syllables can only be found if distinguished by a period or
897 whitespace, such as L{compose()} would return.
898
899 @type string: str
900 @param string: reading string
901 @rtype: list of str
902 @return: a list of basic entities of the input string
903 """
904 return self.splitRegex.split(string)
905
def compose(self, readingEntities):
    """
    Composes the given list of basic entities to a string. Two directly
    neighbouring IPA syllables (as judged by L{isReadingEntity()}) are
    separated by a period; everything else is joined unchanged.

    An empty list yields an empty string.

    @type readingEntities: list of str
    @param readingEntities: list of basic entities or other content
    @rtype: str
    @return: composed entities
    """
    if not readingEntities:
        return ""

    composed = []
    previousIsEntity = False
    for entity in readingEntities:
        currentIsEntity = self.isReadingEntity(entity)
        # a period is only needed between two reading entities; other
        # content (spaces, punctuation) already marks the border
        if previousIsEntity and currentIsEntity:
            composed.append(u'.')
        composed.append(entity)
        previousIsEntity = currentIsEntity

    return "".join(composed)
932
def getTonalEntity(self, plainEntity, tone):
    """
    Gets the entity with tone mark for the given plain entity and tone.

    The tone mark is taken from C{TONE_MARK_MAPPING} for the configured
    C{toneMarkType} and appended to the plain entity. No mark is added if
    the tone is C{None} or the tone mark type is C{'None'}.

    The entity returned will always be in Unicode's
    I{Normalization Form C} (NFC, see
    U{http://www.unicode.org/reports/tr15/}).

    @type plainEntity: str
    @param plainEntity: entity without tonal information
    @type tone: str
    @param tone: tone
    @rtype: str
    @return: entity with appropriate tone
    @raise InvalidEntityError: if the tone is not one of L{getTones()}.
    @todo Impl: Place diacritics on main vowel, derive from IPA
        representation.
    """
    if tone not in self.getTones():
        raise InvalidEntityError("Invalid tone information given for '" \
            + plainEntity + "': '" + str(tone) + "'")

    toneMarkType = self.getOption('toneMarkType')
    if toneMarkType == "None" or tone is None:
        entity = plainEntity
    else:
        entity = plainEntity + self.TONE_MARK_MAPPING[toneMarkType][tone]
    return unicodedata.normalize("NFC", entity)
960
962 """
963 Splits the entity into an entity without tone mark and the name of the
964 entity's tone.
965
966 The plain entity returned will always be in Unicode's
967 I{Normalization Form C} (NFC, see
968 U{http://www.unicode.org/reports/tr15/}).
969
970 @type entity: str
971 @param entity: entity with tonal information
972 @rtype: tuple
973 @return: plain entity without tone mark and additionally the tone
974 @raise InvalidEntityError: if the entity is invalid.
975 """
976
977 entity = unicodedata.normalize("NFD", unicode(entity))
978
979 toneMarkType = self.getOption('toneMarkType')
980 if toneMarkType == 'None':
981 return unicodedata.normalize("NFC", entity), None
982 else:
983 matchObj = self.TONE_MARK_REGEX[toneMarkType].search(entity)
984 if matchObj:
985 toneMark = matchObj.group(1)
986 tone = self.getToneForToneMark(toneMark)
987
988
989 plainEntity = entity.replace(toneMark, '')
990 return unicodedata.normalize("NFC", plainEntity), tone
991 elif self.getOption('missingToneMark') == 'noinfo':
992 return unicodedata.normalize("NFC", entity), None
993
994 raise InvalidEntityError("Invalid entity given for '" + entity + "'")
995
def getToneForToneMark(self, toneMark):
    """
    Gets the tone for the given tone mark.

    The reverse lookup from tone mark to tone is built from
    C{TONE_MARK_MAPPING} on first use and cached in C{self.toneMarkLookup}.
    If several tones share a tone mark, the first one found is used unless
    C{TONE_MARK_PREFER} names a preferred tone for that mark.

    @type toneMark: str
    @param toneMark: tone mark representation of the tone
    @rtype: str
    @return: tone
    @raise InvalidEntityError: if the toneMark does not exist.
    """
    if self.toneMarkLookup is None:
        toneMarkType = self.getOption('toneMarkType')

        # Built in a local dict and under its own variable names: the loop
        # must not rebind the 'toneMark' argument, or the lookup below would
        # be made for the last mapped mark instead of the requested one.
        lookup = {}
        for tone in self.getTones():
            if tone is None:
                continue
            mark = self.TONE_MARK_MAPPING[toneMarkType][tone]
            if mark not in lookup \
                or (mark in self.TONE_MARK_PREFER[toneMarkType] \
                    and self.TONE_MARK_PREFER[toneMarkType][mark] == tone):
                lookup[mark] = tone
        self.toneMarkLookup = lookup

    if toneMark in self.toneMarkLookup:
        return self.toneMarkLookup[toneMark]
    raise InvalidEntityError("Invalid tone mark given with '" \
        + toneMark + "'")
1025
1028 """Provides an operator on readings with a single character per entity."""
def decompose(self, string):
    """
    Decomposes the given string into single characters that are reading
    entities and the runs of other characters found between them.

    Every character for which L{isReadingEntity()} holds becomes an entry
    of its own; consecutive characters that are no reading entities are
    kept together as one entry.

    @type string: str
    @param string: reading string
    @rtype: list of str
    @return: list of single character entities and other content
    """
    pieces = []
    gapStart = None
    for position, character in enumerate(string):
        if self.isReadingEntity(character):
            # close a pending run of non-entity characters first
            if gapStart is not None:
                pieces.append(string[gapStart:position])
                gapStart = None
            pieces.append(character)
        elif gapStart is None:
            gapStart = position

    if gapStart is not None:
        pieces.append(string[gapStart:])
    return pieces
1044
def compose(self, readingEntities):
    """
    Composes the given list of entities to a string by plain concatenation,
    no separator is placed between the entries.

    @type readingEntities: list of str
    @param readingEntities: list of entities or other content
    @rtype: str
    @return: composed entities
    """
    separator = ''
    return separator.join(readingEntities)
1047
def isReadingEntity(self, entity):
    """
    Checks if the given character is an entity of this reading.

    This implementation provides no check of its own and always raises.

    @type entity: str
    @param entity: entity to check
    @raise NotImplementedError: always
    """
    # ``NotImplemented`` is the constant meant for rich comparisons and is
    # not callable; the exception type is ``NotImplementedError``
    raise NotImplementedError()
1050
1053 """Provides an operator on Korean text written in X{Hangul}."""
1054 READING_NAME = "Hangul"
1055
1057 return (entity >= u'가') and (entity <= u'힣')
1058
1061 """Provides an operator on Japanese text written in X{Hiragana}."""
1062 READING_NAME = "Hiragana"
1063
1065 return (entity >= u'ぁ') and (entity <= u'ゟ')
1066
1069 """Provides an operator on Japanese text written in X{Katakana}."""
1070 READING_NAME = "Katakana"
1071
1073 return (entity >= u'゠') and (entity <= u'ヿ')
1074
1077 """
1078 Provides an operator on Japanese text written in a mix of X{Hiragana} and
1079 X{Katakana}.
1080 """
1081 READING_NAME = "Kana"
1082
1084 return ((entity >= u'ぁ') and (entity <= u'ヿ'))
1085
1088 ur"""
1089 Provides an operator for the Mandarin romanisation X{Hanyu Pinyin}.
1090 It can be configured to cope with different representations (I{"dialects"})
1091 of X{Pinyin}. For conversion between different representations the
1092 L{PinyinDialectConverter} can be used.
1093
1094 Features:
1095 - tones marked by either diacritics or numbers,
1096 - alternative representation of I{ü}-character,
1097 - correct placement of apostrophes,
1098 - guessing of input form (I{reading dialect}),
1099 - support for Erhua and
1100 - splitting of syllables into onset and rhyme.
1101
1102 Apostrophes
1103 ===========
1104 Pinyin syllables need to be separated by an X{apostrophe} in case their
1105 decomposition will get ambiguous. A famous example might be the city
1106 I{Xi'an}, which if written I{xian} would be read as one syllable, meaning
1107 e.g. 'fresh'. Another example would be I{Chang'an} which could be read
1108 I{chan'gan} if no delimiter is used in at least one of both cases.
1109
1110 Different rules exist where to place apostrophes. A simple yet sufficient
1111 rule is implemented in L{aeoApostropheRule()} which is used as default in
1112 this class. Syllables starting with one of the three vowels I{a}, I{e}, I{o}
1113 will be separated. Remember that vowels [i], [u], [y] are represented as
1114 I{yi}, I{wu}, I{yu} respectively, thus making syllable boundaries clear.
1115 L{compose()} will place apostrophes where required when composing the
1116 reading string.
1117
1118 An alternative rule can be specified to the constructor passing a function
1119 as an option C{PinyinApostropheFunction}. A possible function could be a
1120 rule separating all syllables by an apostrophe thus simplifying the reading
1121 process for beginners.
1122
1123 On decomposition of strings it is important to check which of the possibly
1124 several choices will be the one actually meant. E.g. syllable I{xian} given
1125 above should always be segmented into one syllable, solution I{xi'an} is not
1126 an option in this case. Therefore an alternative to L{aeoApostropheRule()}
1127 should make sure it guarantees proper decomposition, which is tested through
1128 L{isStrictDecomposition()}.
1129
1130 Last but not least C{compose(decompose(string))} will only be the identity
1131 if apostrophes are applied properly according to the rule as wrongly
1132 placed apostrophes will be kept when composing. Use L{removeApostrophes()}
1133 to remove separating apostrophes.
1134
1135 Example
1136 -------
1137
1138 >>> def noToneApostropheRule(opInst, precedingEntity, followingEntity):
1139 ... return precedingEntity and precedingEntity[0].isalpha() \
1140 ... and not precedingEntity[-1].isdigit() \
1141 ... and followingEntity[0].isalpha()
1142 ...
1143 >>> from cjklib.reading import ReadingFactory
1144 >>> f = ReadingFactory()
1145 >>> f.convert('an3ma5mi5ba5ni2mou1', 'Pinyin', 'Pinyin',
1146 ... sourceOptions={'toneMarkType': 'Numbers'},
1147 ... targetOptions={'toneMarkType': 'Numbers',
1148 ... 'missingToneMark': 'fifth',
1149 ... 'PinyinApostropheFunction': noToneApostropheRule})
1150 u"an3ma'mi'ba'ni2mou1"
1151
1152 R-colouring
1153 ===========
1154 The phenomenon X{Erhua} (兒化音/儿化音, Erhua yin), i.e. the X{r-colouring} of
1155 syllables, is found in the northern Chinese dialects and results from
1156 merging the formerly independent sound I{er} with the preceding syllable. In
1157 written form a word is followed by the character 兒/儿, e.g. 頭兒/头儿.
1158
1159 In Pinyin the Erhua sound is quite often expressed by appending a single
1160 I{r} to the syllable of the character preceding 兒/儿, e.g. I{tóur} for
1161 頭兒/头儿, to stress the monosyllabic nature and in contrast to words like
1162 兒子/儿子 I{ér'zi} where 兒/儿 I{ér} constitutes a single syllable.
1163
1164 For decomposing syllables in Pinyin it is thus important to decide if the
1165 I{r} marking r-colouring should be an entity on its own account stressing
1166 the representation in the character string with an own character or rather
1167 stressing the monosyllabic nature and being part of a syllable of the
1168 foregoing character. This can be configured at instantiation.
1169
1170 Source
1171 ======
1172 - Yǐn Bīnyōng (尹斌庸), Mary Felley (傅曼丽): Chinese romanization:
1173 Pronunciation and Orthography (汉语拼音和正词法). Sinolingua, Beijing,
1174 1990, ISBN 7-80052-148-6, ISBN 0-8351-1930-0.
1175
1176 @see:
1177 - Pinyin: U{http://www.pinyin.info/rules/where.html},
1178 U{http://www.pinyin.info/romanization/hanyu/apostrophes.html},
1179 U{http://www.pinyin.info/rules/initials_finals.html}
1180 - Erhua sound: U{http://en.wikipedia.org/wiki/Erhua}
1181
1182 @todo Impl: ISO 7098 asks for conversion of C{。、·「」} to C{.,-«»}. What
1183 about C{,?《》:-}? Implement a method for conversion to be optionally
1184 used.
1185 @todo Impl: Strict testing of tone mark placement. Currently it doesn't
1186 matter where tones are placed. All combinations are recognised.
1187 @todo Impl: Special marker for neutral tone: 'mȧ' (u'm\u0227', reported by
1188 Ching-song Gene Hsiao: A Manual of Transcription Systems For Chinese,
1189 中文拼音手册. Far Eastern Publications, Yale University, New Haven,
1190 Connecticut, 1985, ISBN 0-88710-141-0.), and '·ma' (u'\xb7ma', check!:
1191 现代汉语词典(第5版)[Xiàndài Hànyǔ Cídiǎn 5. Edition]. 商务印书馆
1192 [Shāngwù Yìnshūguǎn], Beijing, 2005, ISBN 7-100-04385-9.)
1193 """
1194 READING_NAME = 'Pinyin'
1195
1196 TONEMARK_VOWELS = [u'a', u'e', u'i', u'o', u'u', u'ü', u'n', u'm', u'r',
1197 u'ê']
1198 """
1199 List of characters of the nucleus possibly carrying the tone mark. I{n} is
1200 included in standalone syllables I{n} and I{ng}. I{r} is used for supporting
1201 I{Erhua} in a two syllable form.
1202 """
1203 TONEMARK_MAP = {u'\u0304': 1, u'\u0301': 2, u'\u030c': 3, u'\u0300': 4}
1204 """
1205 Mapping of I{Combining Diacritical Marks} to their Pinyin tone index.
1206
1207 @see:
1208 - The Unicode Consortium: The Unicode Standard, Version 5.0.0,
1209 Chapter 7, European Alphabetic Scripts, 7.9 Combining Marks,
1210 defined by: The Unicode Standard, Version 5.0 (Boston, MA,
1211 Addison-Wesley, 2007. ISBN 0-321-48091-0),
1212 U{http://www.unicode.org/versions/Unicode5.0.0/}
1213 - Unicode: X{Combining Diacritical Marks}, Range: 0300-036F:
1214 U{http://www.unicode.org/charts/PDF/U0300.pdf}
1215 - Unicode: FAQ - Characters and Combining Marks:
1216 U{http://unicode.org/faq/char_combmark.html}
1217 """
1218
1219 PINYIN_SOUND_REGEX \
1220 = re.compile(u'(?i)^([^aeiuoü]*)([aeiuoü]*)([^aeiuoü]*)$')
1221 """
1222 Regular Expression matching onset, nucleus and coda. Syllables 'n', 'ng',
1223 'r' (for Erhua) and 'ê' have to be handled separately.
1224 """
1225 toneMarkRegex = re.compile(u'[' + re.escape(''.join(TONEMARK_MAP.keys())) \
1226 + ']')
1227 """Regular Expression matching the Pinyin tone marks."""
1228 tonemarkMapReverse = dict([(TONEMARK_MAP[mark], mark) \
1229 for mark in TONEMARK_MAP.keys()])
1230 del mark
1231 """Reverse lookup of tone marks for tones provided by TONEMARK_MAP."""
1232
1234 u"""
1235 Creates an instance of the PinyinOperator.
1236
1237 The class instance can be configured by different optional options given
1238 as keywords.
1239
1240 @param options: extra options
1241 @keyword dbConnectInst: instance of a L{DatabaseConnector}, if none is
1242 given, default settings will be assumed.
1243 @keyword strictSegmentation: if C{True} segmentation (using
1244 L{segment()}) and thus decomposition (using L{decompose()}) will
1245 raise an exception if an alphabetic string is parsed which can not
1246 be segmented into single reading entities. If C{False} the aforesaid
1247 string will be returned unsegmented.
1248 @keyword toneMarkType: if set to C{'Diacritics'} tones will be marked
1249 using diacritic marks, if set to C{'Numbers'} appended numbers from
1250 1 to 5 will be used to mark tones, if set to C{'None'} no tone marks
1251 will be used and no tonal information will be supplied at all.
1252 @keyword missingToneMark: if set to C{'fifth'} no tone mark is set to
1253 indicate the fifth tone (I{qingsheng}, e.g. C{'wo3men'} stands for
1254 C{'wo3men5'}), if set to C{'noinfo'}, no tone information will be
1255 deduced when no tone mark is found (takes on value C{None}), if set
1256 to C{'ignore'} this entity will not be valid and for segmentation
1257 the behaviour defined by C{'strictSegmentation'} will take effect.
1258 This option is only valid for the tone mark type C{'Numbers'}.
1259 @keyword yVowel: a character (or string) that is taken as alternative
1260 for I{ü} which depicts (among others) the close front rounded vowel
1261 [y] (IPA) in Pinyin and includes an umlaut. Changes forms of
1262 syllables I{nü, nüe, lü, lüe}. This option is not valid for the
1263 tone mark type C{'Diacritics'}.
1264 @keyword PinyinApostrophe: an alternate apostrophe that is taken instead
1265 of the default one.
1266 @keyword PinyinApostropheFunction: a function that indicates when a
1267 syllable combination needs to be split by an I{apostrophe}, see
1268 L{aeoApostropheRule()} for the default implementation.
1269 @keyword Erhua: if set to C{'ignore'} no special support will be
1270 provided for retroflex -r at syllable end (I{Erhua}), i.e. I{zher}
1271 will raise an exception. If set to C{'twoSyllables'} syllables with
1272 an append r are given/will be segmented into two syllables, the -r
1273 suffix making up one syllable itself as C{'r'}. If set to
1274 C{'oneSyllable'} syllables with an appended r are given/will be
1275 segmented into one syllable only.
1276 """
1277 super(PinyinOperator, self).__init__(**options)
1278
1279
1280 if 'toneMarkType' in options:
1281 if options['toneMarkType'] not in ['Diacritics', 'Numbers', 'None']:
1282 raise ValueError("Invalid option '" \
1283 + str(options['toneMarkType']) \
1284 + "' for keyword 'toneMarkType'")
1285 self.optionValue['toneMarkType'] = options['toneMarkType']
1286
1287
1288 if 'missingToneMark' in options:
1289 if self.getOption('toneMarkType') != 'Numbers':
1290 raise ValueError("keyword 'missingToneMark' is only valid if" \
1291 + " tone mark type is set to 'Numbers'")
1292
1293 if options['missingToneMark'] not in ['fifth', 'noinfo', 'ignore']:
1294 raise ValueError("Invalid option '" \
1295 + str(options['missingToneMark']) \
1296 + "' for keyword 'missingToneMark'")
1297 self.optionValue['missingToneMark'] = options['missingToneMark']
1298
1299
1300 if 'yVowel' in options:
1301 if self.getOption('toneMarkType') == 'Diacritics' \
1302 and options['yVowel'] != u'ü':
1303 raise ValueError("keyword 'yVowel' is not valid for tone mark" \
1304 + " type 'Diacritics'")
1305
1306 self.optionValue['yVowel'] = options['yVowel']
1307
1308
1309 if 'PinyinApostrophe' in options:
1310 self.optionValue['PinyinApostrophe'] = options['PinyinApostrophe']
1311
1312
1313 if 'PinyinApostropheFunction' in options:
1314 self.optionValue['PinyinApostropheFunction'] \
1315 = options['PinyinApostropheFunction']
1316
1317
1318 if 'Erhua' in options:
1319 if options['Erhua'] not in ['ignore', 'twoSyllables',
1320 'oneSyllable']:
1321 raise ValueError("Invalid option '" + str(options['Erhua']) \
1322 + "' for keyword 'Erhua'")
1323 self.optionValue['Erhua'] = options['Erhua']
1324
1325
1326
1327 self.readingEntityRegex = re.compile(u'(?i)((?:' \
1328 + '|'.join([re.escape(v) for v in self._getDiacriticVowels()]) \
1329 + '|' + re.escape(self.getOption('yVowel')) \
1330 + u'|[a-zêü])+[12345]?)')
1331
1332 @classmethod
1334 options = super(PinyinOperator, cls).getDefaultOptions()
1335 options.update({'toneMarkType': 'Diacritics',
1336 'missingToneMark': 'noinfo', 'yVowel': u'ü',
1337 'PinyinApostrophe': "'", 'Erhua': 'twoSyllables',
1338 'PinyinApostropheFunction': cls.aeoApostropheRule})
1339
1340 return options
1341
@staticmethod
def _getDiacriticVowels():
    u"""
    Gets a list of Pinyin vowels with diacritical marks for tones.

    Each character of C{TONEMARK_VOWELS} is combined with each combining
    mark of C{TONEMARK_MAP}, the result is given in Unicode's
    I{Normalization Form C} (NFC).

    The alternative for vowel ü does not need diacritical forms as the
    standard form doesn't allow changing the vowel.

    @rtype: list of str
    @return: list of Pinyin vowels with diacritical marks
    """
    return [unicodedata.normalize("NFC", baseVowel + toneMark)
        for baseVowel in PinyinOperator.TONEMARK_VOWELS
        for toneMark in PinyinOperator.TONEMARK_MAP.keys()]
1358
@classmethod
def guessReadingDialect(cls, string, includeToneless=False):
    u"""
    Takes a string written in Pinyin and guesses the reading dialect.

    The basic options C{'toneMarkType'}, C{'yVowel'}, C{'PinyinApostrophe'}
    and C{'Erhua'} are guessed. Unless C{'includeToneless'} is set to
    C{True} only the tone mark types C{'Diacritics'} and C{'Numbers'} are
    considered as the latter one can also represent the state of missing
    tones. Strings tested for C{'yVowel'} are C{ü}, C{v} and C{u:}.
    C{'Erhua'} is set to C{'twoSyllables'} by default and only tested when
    C{'toneMarkType'} is assumed to be set to C{'Numbers'}.

    @type string: str
    @param string: Pinyin string
    @type includeToneless: bool
    @param includeToneless: if set to C{True} the tone mark type C{'None'}
        is chosen when less than 10% of the entities found carry a tone
        mark, or when no entity is found at all
    @rtype: dict
    @return: dictionary of basic keyword settings
    """
    Y_VOWEL_LIST = [u'ü', 'v', 'u:']
    APOSTROPHE_LIST = ["'", u'’', u'´', u'‘', u'`', u'ʼ', u'ˈ', u'′', u'ʻ']
    readingStr = unicodedata.normalize("NFC", unicode(string))

    diacriticVowels = PinyinOperator._getDiacriticVowels()

    # one alternation over both lists: joining them separately glued the
    # last diacritic vowel and 'ü' together into a single alternative, so
    # neither of the two was matched on its own
    alternatives = [re.escape(form) \
        for form in diacriticVowels + Y_VOWEL_LIST]
    entities = re.findall(u'(?i)((?:' + '|'.join(alternatives) \
        + u'|[a-uw-zê])+[12345]?)', readingStr)

    # count entities with a tone digit and entities with a diacritic
    diacriticEntityCount = 0
    numberEntityCount = 0
    for entity in entities:
        if entity[-1] in '12345':
            numberEntityCount = numberEntityCount + 1
        else:
            for vowel in diacriticVowels:
                if vowel in entity:
                    diacriticEntityCount = diacriticEntityCount + 1
                    break

    # a string without any entity carries no tone marks either; checking
    # for it first avoids dividing by zero
    if includeToneless \
        and (not entities \
            or (1.0 * max(diacriticEntityCount, numberEntityCount) \
                / len(entities)) < 0.1):
        toneMarkType = 'None'
    else:
        if diacriticEntityCount > numberEntityCount:
            toneMarkType = 'Diacritics'
        else:
            toneMarkType = 'Numbers'

    # guess the ü vowel, first candidate found in the string wins
    if toneMarkType == 'Diacritics':
        yVowel = u'ü'
    else:
        for vowel in Y_VOWEL_LIST:
            if vowel in readingStr:
                yVowel = vowel
                break
        else:
            yVowel = u'ü'

    # guess the apostrophe, first candidate found in the string wins
    for apostrophe in APOSTROPHE_LIST:
        if apostrophe in readingStr:
            PinyinApostrophe = apostrophe
            break
    else:
        PinyinApostrophe = "'"

    # guess Erhua from each 'r' that is directly followed by a tone digit
    # NOTE(review): nesting of the inner if/else is reconstructed from the
    # statement order - confirm against the upstream source
    Erhua = 'twoSyllables'
    if toneMarkType == 'Numbers':
        lastIndex = 0
        while lastIndex != -1:
            lastIndex = readingStr.find('r', lastIndex+1)
            if lastIndex > 1:
                if len(readingStr) > lastIndex + 1 \
                    and readingStr[lastIndex + 1] in '12345':
                    if readingStr[lastIndex - 1] in '12345':
                        # a tone digit right before the 'r': the 'r' is an
                        # entity of its own, stop searching
                        break
                    else:
                        # the 'r' ends a syllable that carries the tone
                        Erhua = 'oneSyllable'

    return {'toneMarkType': toneMarkType, 'yVowel': yVowel,
        'PinyinApostrophe': PinyinApostrophe, 'Erhua': Erhua}
1449
def getTones(self):
    """
    Returns the tones supported under the current options: the tone
    indices 1 to 5, followed by C{None} if the tone mark type is C{'None'}
    or if it is C{'Numbers'} with option C{'missingToneMark'} set to
    C{'noinfo'}.

    @rtype: list
    @return: list of supported tones
    """
    toneMarkType = self.getOption('toneMarkType')
    lacksToneInfo = toneMarkType == 'None' \
        or (toneMarkType == 'Numbers' \
            and self.getOption('missingToneMark') == 'noinfo')

    toneList = list(range(1, 6))
    if lacksToneInfo:
        toneList.append(None)
    return toneList
1457
def compose(self, readingEntities):
    """
    Composes the given list of basic entities to a string. An apostrophe
    (option C{'PinyinApostrophe'}) is placed in front of every entity for
    which the function given as option C{'PinyinApostropheFunction'}
    (L{aeoApostropheRule()} by default) asks for a separation from its
    predecessor.

    @type readingEntities: list of str
    @param readingEntities: list of basic syllables or other content
    @rtype: str
    @return: composed entities
    """
    needsApostrophe = self.getOption('PinyinApostropheFunction')
    apostrophe = self.getOption('PinyinApostrophe')

    parts = []
    previous = None
    for current in readingEntities:
        # the rule is called with the operator instance as first argument
        if needsApostrophe(self, previous, current):
            parts.append(apostrophe)
        parts.append(current)
        previous = current
    return ''.join(parts)
1479
def removeApostrophes(self, readingEntities):
    """
    Removes apostrophes between two syllables for a given decomposition.

    An entry is dropped if it equals the apostrophe given as option
    C{'PinyinApostrophe'} and both of its neighbours are reading entities.
    The list is processed iteratively from left to right, so lists of any
    length can be handled.

    @type readingEntities: list of str
    @param readingEntities: list of basic syllables or other content
    @rtype: list of str
    @return: the given entity list without separating apostrophes
    """
    apostrophe = self.getOption('PinyinApostrophe')
    entityCount = len(readingEntities)

    cleaned = []
    index = 0
    while index < entityCount:
        cleaned.append(readingEntities[index])
        if entityCount - index > 2 \
            and readingEntities[index + 1] == apostrophe \
            and self.isReadingEntity(readingEntities[index]) \
            and self.isReadingEntity(readingEntities[index + 2]):
            # skip the separating apostrophe, go on with the syllable
            # following it
            index = index + 2
        else:
            index = index + 1
    return cleaned
1505
def aeoApostropheRule(self, precedingEntity, followingEntity):
    """
    Checks if the given entities need to be separated by an apostrophe.

    Returns true for syllables starting with one of the three vowels I{a},
    I{e}, I{o} having a preceding syllable. Additionally forms I{n} and
    I{ng} (also with appended I{r}) are separated from preceding syllables.
    Furthermore the corner case I{e'r} is handled to distinguish it from
    I{er}: a single I{r} is only separated from a preceding I{e}.

    This function serves as the default apostrophe rule.

    @type precedingEntity: str
    @param precedingEntity: the preceding syllable or any other content
    @type followingEntity: str
    @param followingEntity: the following syllable or any other content
    @rtype: bool
    @return: true if the syllables need to be separated, false otherwise
    """
    # only two adjacent reading entities can need a separator
    if not precedingEntity:
        return False
    if not self.isReadingEntity(precedingEntity):
        return False
    if not self.isReadingEntity(followingEntity):
        return False

    followingPlain, _ = self.splitEntityTone(followingEntity)

    if followingPlain == 'r':
        # 'e' followed by 'r' would otherwise read as syllable 'er'
        precedingPlain, _ = self.splitEntityTone(precedingEntity)
        return precedingPlain == 'e'

    startsWithAEO = followingPlain[0] in ['a', 'e', 'o']
    return startsWithAEO or followingPlain in ['n', 'ng', 'nr', 'ngr']
1541
def isStrictDecomposition(self, readingEntities):
    """
    Checks if the given decomposition follows the Pinyin format strictly
    for unambiguous decomposition: syllables have to be preceded by an
    apostrophe if the decomposition would be ambiguous otherwise.

    The function given as option C{'PinyinApostropheFunction'} is used to
    check if an apostrophe should have been placed between two directly
    adjacent reading entities.

    @type readingEntities: list of str
    @param readingEntities: decomposed reading string
    @rtype: bool
    @return: true if decomposition is strict, false otherwise
    """
    apostropheRule = self.getOption('PinyinApostropheFunction')

    previous = None
    for current in readingEntities:
        if not self.isReadingEntity(current):
            # any other content separates syllables sufficiently
            previous = None
            continue

        # two syllables follow each other directly although the rule asks
        # for an apostrophe between them
        if apostropheRule(self, previous, current):
            return False
        previous = current

    return True
1570
1572
1573 plainEntity = unicodedata.normalize("NFC", unicode(plainEntity))
1574
1575 if tone != None:
1576 tone = int(tone)
1577 if tone not in self.getTones():
1578 raise InvalidEntityError("Invalid tone information given for '" \
1579 + plainEntity + "': '" + str(tone) + "'")
1580
1581 if self.getOption('toneMarkType') == 'None':
1582 return plainEntity
1583
1584 elif self.getOption('toneMarkType') == 'Numbers':
1585 if tone == None or (tone == 5 \
1586 and self.getOption('missingToneMark') == 'fifth'):
1587 return plainEntity
1588 else:
1589 return plainEntity + str(tone)
1590
1591 elif self.getOption('toneMarkType') == 'Diacritics':
1592
1593
1594 if plainEntity.lower() in ['n', 'ng', 'm', 'r', u'ê', 'nr', 'ngr',
1595 'mr', u'êr']:
1596 onset, nucleus, coda = ('', plainEntity[0], plainEntity[1:])
1597 elif plainEntity.lower() in ['hm', 'hng', 'hmr', 'hngr']:
1598 onset, nucleus, coda = (plainEntity[0], plainEntity[1],
1599 plainEntity[2:])
1600 else:
1601 matchObj = self.PINYIN_SOUND_REGEX.match(plainEntity)
1602 onset, nucleus, coda = matchObj.group(1, 2, 3)
1603 if not nucleus:
1604 raise InvalidEntityError("no nucleus found for '" \
1605 + plainEntity + "'")
1606
1607 tonalNucleus = self._placeNucleusToneMark(nucleus, tone)
1608 return onset + tonalNucleus + coda
1609
1611 """
1612 Places a tone mark on the given syllable nucleus according to the rules
1613 of the Pinyin standard.
1614
1615 @see: Pinyin.info - Where do the tone marks go?,
1616 U{http://www.pinyin.info/rules/where.html}.
1617
1618 @type nucleus: str
1619 @param nucleus: syllable nucleus
1620 @type tone: int
1621 @param tone: tone index (starting with 1)
1622 @rtype: str
1623 @return: nucleus with appropriate tone
1624 """
1625
1626 if tone != 5:
1627 if len(nucleus) == 1:
1628
1629 tonalNucleus = nucleus + self.tonemarkMapReverse[tone]
1630 elif nucleus[0].lower() in ('a', 'e', 'o'):
1631
1632 tonalNucleus = nucleus[0] + self.tonemarkMapReverse[tone] \
1633 + nucleus[1:]
1634 else:
1635
1636 tonalNucleus = nucleus[0] + nucleus[1] \
1637 + self.tonemarkMapReverse[tone] + nucleus[2:]
1638 else:
1639 tonalNucleus = nucleus
1640
1641 return unicodedata.normalize("NFC", tonalNucleus)
1642
1644 """
1645 Splits the entity into an entity without tone mark and the
1646 entity's tone index.
1647
1648 The plain entity returned will always be in Unicode's
1649 I{Normalization Form C} (NFC, see
1650 U{http://www.unicode.org/reports/tr15/}).
1651
1652 @type entity: str
1653 @param entity: entity with tonal information
1654 @rtype: tuple
1655 @return: plain entity without tone mark and entity's tone index
1656 (starting with 1)
1657 """
1658
1659 entity = unicodedata.normalize("NFD", unicode(entity))
1660 if self.getOption('toneMarkType') == 'None':
1661 plainEntity = entity
1662 tone = None
1663
1664 elif self.getOption('toneMarkType') == 'Numbers':
1665 matchObj = re.search(u"[12345]$", entity)
1666 if matchObj:
1667 plainEntity = entity[0:len(entity)-1]
1668 tone = int(matchObj.group(0))
1669 else:
1670 if self.getOption('missingToneMark') == 'fifth':
1671 plainEntity = entity
1672 tone = 5
1673 elif self.getOption('missingToneMark') == 'ignore':
1674 raise InvalidEntityError("No tone information given for '" \
1675 + entity + "'")
1676 else:
1677 plainEntity = entity
1678 tone = None
1679
1680 elif self.getOption('toneMarkType') == 'Diacritics':
1681
1682 matchObj = self.toneMarkRegex.search(entity)
1683 if matchObj:
1684 diacriticalMark = matchObj.group(0)
1685 tone = self.TONEMARK_MAP[diacriticalMark]
1686
1687 plainEntity = entity.replace(diacriticalMark, '')
1688 else:
1689
1690 plainEntity = entity
1691 tone = 5
1692
1693 return unicodedata.normalize("NFC", plainEntity), tone
1694
1696 u"""
1697 Gets the list of plain entities supported by this reading. Different to
1698 L{getReadingEntities()} the entities will carry no tone mark.
1699
1700 Depending on the type of Erhua support either additional syllables with
1701 an ending -r are added, or a single I{r} is included. The user specified
1702 character for vowel I{ü} will be used.
1703
1704 @rtype: set of str
1705 @return: set of supported syllables
1706 """
1707
1708 plainSyllables = set(self.db.selectScalars(
1709 select([self.db.tables['PinyinSyllables'].c.Pinyin])))
1710
1711 if self.getOption('Erhua') == 'twoSyllables':
1712
1713 plainSyllables.add('r')
1714 elif self.getOption('Erhua') == 'oneSyllable':
1715
1716 for syllable in plainSyllables.copy():
1717 if syllable not in ['e', 'er']:
1718 plainSyllables.add(syllable + 'r')
1719
1720
1721 if self.getOption('yVowel') != u'ü':
1722 for syllable in plainSyllables.copy():
1723 if syllable.find(u'ü') != -1:
1724 syllable = syllable.replace(u'ü', self.getOption('yVowel'))
1725 if syllable in plainSyllables:
1726
1727
1728 raise ValueError("syllable '" + syllable \
1729 + "' included more than once, " \
1730 + u"probably bad substitute for 'ü'")
1731 plainSyllables.add(syllable)
1732 return plainSyllables
1733
1735
1736
def getReadingEntities(self):
    """
    Gets the set of reading entities with tonal information, built from
    the plain entities returned by L{getPlainReadingEntities()} and the
    tones returned by L{getTones()}.

    The single entity I{r} is only combined with the fifth tone and, if
    supported by L{getTones()}, with the tone value C{None}.

    @rtype: set of str
    @return: set of supported syllables
    """
    entitySet = set()
    for plainSyllable in self.getPlainReadingEntities():
        if plainSyllable != 'r':
            syllableTones = self.getTones()
        elif None in self.getTones():
            syllableTones = [5, None]
        else:
            syllableTones = [5]

        entitySet.update(self.getTonalEntity(plainSyllable, tone) \
            for tone in syllableTones)
    return entitySet
1752
1754 """
1755 Splits the given plain syllable into onset (initial) and rhyme (final).
1756
1757 Pinyin can't be separated into onset and rhyme clearly within its own
1758 system. There are syllables with same finals written differently (e.g.
1759 I{wei} and I{dui} both ending in a final that can be described by
1760 I{uei}) and reduction of vowels (same example: I{dui} which is
1761 pronounced with vowels I{uei}). This method will use three forms not
1762 found as substrings in Pinyin (I{uei}, {uen} and I{iou}) and substitutes
1763 (pseudo) initials I{w} and I{y} with its vowel equivalents.
1764
1765 Furthermore final I{i} will be distinguished in three forms given by
1766 the following three examples: I{yi}, I{zhi} and I{zi} to express
1767 phonological difference.
1768
1769 @type plainSyllable: str
1770 @param plainSyllable: syllable without tone marks
1771 @rtype: tuple of str
1772 @return: tuple of entity onset and rhyme
1773 @raise InvalidEntityError: if the entity is invalid.
1774 @raise UnsupportedError: for entity I{r} when Erhua is handled as
1775 separate entity.
1776 """
1777 erhuaForm = False
1778 if self.getOption('Erhua') == 'oneSyllable' \
1779 and plainSyllable.endswith('r') and plainSyllable != 'er':
1780 plainSyllable = plainSyllable[:-1]
1781 erhuaForm = True
1782
1783 elif plainSyllable == 'r' and self.getOption('Erhua') == 'twoSyllables':
1784 raise UnsupportedError("Not supported for '" + plainSyllable + "'")
1785
1786 table = self.db.tables['PinyinInitialFinal']
1787 entry = self.db.selectRow(
1788 select([table.c.PinyinInitial, table.c.PinyinFinal],
1789 table.c.Pinyin == plainSyllable.lower()))
1790 if not entry:
1791 raise InvalidEntityError("'" + plainSyllable \
1792 + "' not a valid plain Pinyin syllable'")
1793
1794 if erhuaForm:
1795 return (entry[0], entry[1] + 'r')
1796 else:
1797 return (entry[0], entry[1])
1798
1801 u"""
1802 Provides an operator for the Mandarin X{Wade-Giles} romanisation.
1803
1804 Features:
1805 - tones marked by either standard numbers or superscripts,
1806 - configurable apostrophe for marking aspiration and
1807 - placement of hyphens between syllables.
1808
1809 @todo Lang: Get a good source for the syllables used. See also
1810 L{PinyinWadeGilesConverter}.
1811 @todo Lang: Respect mangled Wade-Giles writings. Possible steps: a)
1812 Warn/Error on syllables which are ambiguous when assuming apostrophes are
1813 omitted. b) 'hsu' is no valid syllable but can be viewed as 'hsü'.
1814 Compare to different 'implementations' of the Wade-Giles romanisation.
1815 """
1816 READING_NAME = 'WadeGiles'
1817
1818 DB_ASPIRATION_APOSTROPHE = u"‘"
1819 """Default apostrophe used by Wade-Giles syllable data in database."""
1820
1821 TO_SUPERSCRIPT = {1: u'¹', 2: u'²', 3: u'³', 4: u'⁴', 5: u'⁵'}
1822 """Mapping of tone numbers to superscript numbers."""
1823 FROM_SUPERSCRIPT = dict([(value, key) \
1824 for key, value in TO_SUPERSCRIPT.iteritems()])
1825 """Mapping of superscript numbers to tone numbers."""
1826 del value
1827 del key
1828
1830 """
1831 Creates an instance of the WadeGilesOperator.
1832
1833 @param options: extra options
1834 @keyword dbConnectInst: instance of a L{DatabaseConnector}, if none is
1835 given, default settings will be assumed.
1836 @keyword strictSegmentation: if C{True} segmentation (using
1837 L{segment()}) and thus decomposition (using L{decompose()}) will
1838 raise an exception if an alphabetic string is parsed which can not
1839 be segmented into single reading entities. If C{False} the aforesaid
1840 string will be returned unsegmented.
1841 @keyword WadeGilesApostrophe: an alternate apostrophe that is taken
1842 instead of the default one.
1843 @keyword toneMarkType: if set to C{'Numbers'} appended numbers from 1 to
1844 5 will be used to mark tones, if set to C{'SuperscriptNumbers'}
1845 appended superscript numbers from 1 to 5 will be used to mark tones,
1846 if set to C{'None'} no tone marks will be used and no tonal
1847 information will be supplied at all.
1848 @keyword missingToneMark: if set to C{'fifth'} no tone mark is set to
1849 indicate the fifth tone (I{qingsheng}, e.g. C{'tsan2-men'} stands
1850 for C{'tsan2-men5'}), if set to C{'noinfo'}, no tone information
1851 will be deduced when no tone mark is found (takes on value C{None}),
1852 if set to C{'ignore'} this entity will not be valid and for
1853 segmentation the behaviour defined by C{'strictSegmentation'} will
1854 take effect.
1855 """
1856 super(WadeGilesOperator, self).__init__(**options)
1857
1858 if 'WadeGilesApostrophe' in options:
1859 self.optionValue['WadeGilesApostrophe'] \
1860 = options['WadeGilesApostrophe']
1861 self.readingEntityRegex = re.compile(u"((?:" \
1862 + re.escape(self.getOption('WadeGilesApostrophe')) \
1863 + u"|[A-ZÜa-zü])+[12345¹²³⁴⁵]?)")
1864
1865
1866 if 'toneMarkType' in options:
1867 if options['toneMarkType'] not in ['Numbers', 'SuperscriptNumbers',
1868 'None']:
1869 raise ValueError("Invalid option '" \
1870 + str(options['toneMarkType']) \
1871 + "' for keyword 'toneMarkType'")
1872 self.optionValue['toneMarkType'] = options['toneMarkType']
1873
1874
1875 if 'missingToneMark' in options:
1876 if self.getOption('toneMarkType') not in ['Numbers',
1877 'SuperscriptNumbers']:
1878 raise ValueError("keyword 'missingToneMark' is only valid if" \
1879 + " tone mark type is set to 'Numbers' or " \
1880 + "'SuperscriptNumbers'")
1881
1882 if options['missingToneMark'] not in ['fifth', 'noinfo', 'ignore']:
1883 raise ValueError("Invalid option '" \
1884 + str(options['missingToneMark']) \
1885 + "' for keyword 'missingToneMark'")
1886 self.optionValue['missingToneMark'] = options['missingToneMark']
1887
1888 @classmethod
1896
1898 if self.getOption('missingToneMark') == 'fifth':
1899 tones = [1, 2, 3, 4, None]
1900 else:
1901 tones = range(1, 6)
1902 if self.getOption('toneMarkType') == 'None' \
1903 or self.getOption('missingToneMark') == 'noinfo':
1904 tones.append(None)
1905 return tones
1906
1907 - def compose(self, readingEntities):
1908 """
1909 Composes the given list of basic entities to a string by applying a
1910 hyphen between syllables.
1911
1912 @type readingEntities: list of str
1913 @param readingEntities: list of basic syllables or other content
1914 @rtype: str
1915 @return: composed entities
1916 """
1917 newReadingEntities = []
1918 precedingEntity = None
1919 for entity in readingEntities:
1920
1921 if precedingEntity and self.isReadingEntity(precedingEntity) and \
1922 self.isReadingEntity(entity):
1923
1924
1925 newReadingEntities.append("-")
1926 newReadingEntities.append(entity)
1927 precedingEntity = entity
1928 return ''.join(newReadingEntities)
1929
1931 """
1932 Removes hyphens between two syllables for a given decomposition.
1933
1934 @type readingEntities: list of str
1935 @param readingEntities: list of basic syllables or other content
1936 @rtype: list of str
1937 @return: the given entity list without separating hyphens
1938 """
1939 if len(readingEntities) == 0:
1940 return []
1941 elif len(readingEntities) > 2 and readingEntities[1] == "-" \
1942 and self.isReadingEntity(readingEntities[0]) \
1943 and self.isReadingEntity(readingEntities[2]):
1944
1945 newReadingEntities = [readingEntities[0]]
1946 newReadingEntities.extend(self.removeHyphens(readingEntities[2:]))
1947 return newReadingEntities
1948 else:
1949 newReadingEntities = [readingEntities[0]]
1950 newReadingEntities.extend(self.removeHyphens(readingEntities[1:]))
1951 return newReadingEntities
1952
1954 if tone != None:
1955 tone = int(tone)
1956 if tone not in self.getTones():
1957 raise InvalidEntityError("Invalid tone information given for '" \
1958 + plainEntity + "': '" + str(tone) + "'")
1959
1960 if self.getOption('toneMarkType') == 'None':
1961 return plainEntity
1962
1963 if tone == None or (tone == 5 \
1964 and self.getOption('missingToneMark') == 'fifth'):
1965 return plainEntity
1966 else:
1967 if self.getOption('toneMarkType') == 'Numbers':
1968 return plainEntity + str(tone)
1969 elif self.getOption('toneMarkType') == 'SuperscriptNumbers':
1970 return plainEntity + self.TO_SUPERSCRIPT[tone]
1971 assert False
1972
1974 if self.getOption('toneMarkType') == 'None':
1975 plainEntity = entity
1976 tone = None
1977
1978 else:
1979 tone = None
1980 if self.getOption('toneMarkType') == 'Numbers':
1981 matchObj = re.search(u"[12345]$", entity)
1982 if matchObj:
1983 tone = int(matchObj.group(0))
1984 elif self.getOption('toneMarkType') == 'SuperscriptNumbers':
1985 matchObj = re.search(u"[¹²³⁴⁵]$", entity)
1986 if matchObj:
1987 tone = self.FROM_SUPERSCRIPT[matchObj.group(0)]
1988
1989 if tone:
1990 plainEntity = entity[0:len(entity)-1]
1991 else:
1992 if self.getOption('missingToneMark') == 'fifth':
1993 plainEntity = entity
1994 tone = 5
1995 elif self.getOption('missingToneMark') == 'ignore':
1996 raise InvalidEntityError("No tone information given for '" \
1997 + entity + "'")
1998 else:
1999 plainEntity = entity
2000
2001 return plainEntity, tone
2002
2004 """
2005 Gets the list of plain entities supported by this reading. Different to
2006 L{getReadingEntities()} the entities will carry no tone mark.
2007
2008 Syllables will use the user specified apostrophe to mark aspiration.
2009
2010 @rtype: set of str
2011 @return: set of supported syllables
2012 """
2013 plainSyllables = set(self.db.selectScalars(
2014 select([self.db.tables['WadeGilesSyllables'].c.WadeGiles])))
2015
2016 if self.getOption('WadeGilesApostrophe') \
2017 == self.DB_ASPIRATION_APOSTROPHE:
2018 return plainSyllables
2019 else:
2020 translatedSyllables = set()
2021 for syllable in plainSyllables:
2022 syllable = syllable.replace(self.DB_ASPIRATION_APOSTROPHE,
2023 self.getOption('WadeGilesApostrophe'))
2024 translatedSyllables.add(syllable)
2025 return translatedSyllables
2026
2027
class GROperator(TonalRomanisationOperator):
    u"""
    Provides an operator for the Mandarin X{Gwoyeu Romatzyh} romanisation.

    Features:
        - support of abbreviated forms (zh, j, g),
        - conversion of abbreviated forms to full forms,
        - placement of apostrophes before 0-initial syllables,
        - support for different apostrophe characters,
        - support for I{r-coloured} syllables (I{Erlhuah}) and
        - guessing of input form (I{reading dialect}).

    Limitations:
        - abbreviated forms for multiple syllables are not supported,
        - syllable repetition markers as reported by some will currently not be
            parsed.

    R-colouring
    ===========
    Gwoyeu Romatzyh renders X{rhotacised} syllables (X{Erlhuah}) by trying to
    give the actual pronunciation. As the effect of r-colouring loses the
    information of the underlying etymological syllable, conversion from the
    r-coloured form back to the underlying form cannot be done in an
    unambiguous way. As furthermore finals I{i}, I{iu}, I{in}, I{iun} contrast
    in the first and the second tone but not in the third and the fourth tone,
    conversion between different tones (including the base form) cannot be made
    in a general manner: 小鸡儿 I{sheau-jiel} is different to 小街儿
    I{sheau-jie’l} but 几儿 I{jieel} equals 姐儿 I{jieel} (see Chao).

    Thus this ReadingOperator lacks the general handling of syllable renderings
    and many methods narrow the range of syllables allowed. Unlike the original
    forms without r-colouring, for Erlhuah forms the combination of a plain
    syllable with a specific tone is limited to the data given in the source, so
    operations involving tones may return with an L{UnsupportedError} if the
    given syllable isn't found with that tone.

    Sources
    =======
        - Yuen Ren Chao: A Grammar of Spoken Chinese. University of California
            Press, Berkeley, 1968, ISBN 0-520-00219-9.

    @see:
        - GR Junction by Richard Warmington:
            U{http://home.iprimus.com.au/richwarm/gr/gr.htm}
        - Article about Gwoyeu Romatzyh on the English Wikipedia:
            U{http://en.wikipedia.org/wiki/Gwoyeu_Romatzyh}

    @todo Impl: Initial, medial, head, ending (ending1, ending2=l?)
    @todo Lang: Which character to use for optional neutral tone: C{'ₒ'} ?
    @todo Impl: Implement Erhua forms as stated in W. Simon: A Beginner's
        Chinese-English Dictionary.
    @todo Impl: Implement repetition markers as stated in W. Simon: A Beginner's
        Chinese-English Dictionary.
    @todo Impl: Implement a GRIPAConverter once IPA values are obtained for
        the PinyinIPAConverter. GRIPAConverter can work around missing Erhua
        conversion to Pinyin.
    @todo Lang: Special rule for non-Chinese names with initial r- to be
        transcribed with an r- cited by Ching-song Gene Hsiao: A Manual of
        Transcription Systems For Chinese, 中文拼音手册. Far Eastern Publications,
        Yale University, New Haven, Connecticut, 1985, ISBN 0-88710-141-0.
    """
2089 READING_NAME = 'GR'
2090
2091 TONES = ['1stTone', '2ndTone', '3rdTone', '4thTone',
2092 '5thToneEtymological1st', '5thToneEtymological2nd',
2093 '5thToneEtymological3rd', '5thToneEtymological4th',
2094 '1stToneOptional5th', '2ndToneOptional5th', '3rdToneOptional5th',
2095 '4thToneOptional5th']
2096
2097 SYLLABLE_STRUCTURE = re.compile(r"^((?:tz|ts|ch|sh|[bpmfdtnlsjrgkh])?)" \
2098 + "([aeiouy]+)((?:ngl|ng|n|l)?)$")
2099 """Regular expression describing the syllable structure in GR (C,V,C)."""
2100
2101 _syllableToneLookup = None
2102 """Holds the tonal syllable to plain syllable & tone lookup table."""
2103
2104 _abbrConversionLookup = None
2105 """Holds the abbreviated entity lookup table."""
2106
2107 DB_RHOTACISED_FINAL_MAPPING = {1: 'GRFinal_T1', 2: 'GRFinal_T2',
2108 3: 'GRFinal_T3', 4: 'GRFinal_T4'}
2109 """Database fields for tonal Erlhuah syllables."""
2110 DB_RHOTACISED_FINAL_MAPPING_ZEROINITIAL = {1: 'GRFinal_T1', 2: 'GRFinal_T2',
2111 3: 'GRFinal_T3_ZEROINITIAL', 4: 'GRFinal_T4_ZEROINITIAL'}
2112 """Database fields for tonal Erlhuah syllables with i, u and iu medials."""
2113
2114 DB_RHOTACISED_FINAL_APOSTROPHE = "'"
2115 """
2116 Default apostrophe used by GR syllable data in database for marking the
2117 longer and back vowel in rhotacised finals.
2118 """
2119
2121 u"""
2122 Creates an instance of the GROperator.
2123
2124 @param options: extra options
2125 @keyword dbConnectInst: instance of a L{DatabaseConnector}, if none is
2126 given, default settings will be assumed.
2127 @keyword strictSegmentation: if C{True} segmentation (using
2128 L{segment()}) and thus decomposition (using L{decompose()}) will
2129 raise an exception if an alphabetic string is parsed which can not
2130 be segmented into single reading entities. If C{False} the aforesaid
2131 string will be returned unsegmented.
2132 @keyword abbreviations: if set to C{True} abbreviated spellings will be
2133 supported.
2134 @keyword GRRhotacisedFinalApostrophe: an alternate apostrophe that is
2135 taken instead of the default one for marking a longer and back vowel
2136 in rhotacised finals.
2137 @keyword GRSyllableSeparatorApostrophe: an alternate apostrophe that is
2138 taken instead of the default one for separating 0-initial syllables
2139 from preceding ones.
2140 """
2141 super(GROperator, self).__init__(**options)
2142
2143 if 'abbreviations' in options:
2144 self.optionValue['abbreviations'] = options['abbreviations']
2145
2146 if 'GRRhotacisedFinalApostrophe' in options:
2147 self.optionValue['GRRhotacisedFinalApostrophe'] \
2148 = options['GRRhotacisedFinalApostrophe']
2149
2150 if 'GRSyllableSeparatorApostrophe' in options:
2151 self.optionValue['GRSyllableSeparatorApostrophe'] \
2152 = options['GRSyllableSeparatorApostrophe']
2153
2154 self.readingEntityRegex = re.compile(u"([\.ₒ]?(?:" \
2155 + re.escape(self.getOption('GRRhotacisedFinalApostrophe')) \
2156 + "|[A-Za-z])+)")
2157
2158 @classmethod
2160 options = super(GROperator, cls).getDefaultOptions()
2161 options.update({'abbreviations': True,
2162 'GRRhotacisedFinalApostrophe': u"’",
2163 'GRSyllableSeparatorApostrophe': u"’"})
2164
2165 return options
2166
2167 @classmethod
        u"""
        Takes a string written in GR and guesses the reading dialect.

        The options C{'GRRhotacisedFinalApostrophe'} and
        C{'GRSyllableSeparatorApostrophe'} are guessed. Both will be set to the
        same value, which is the first entry of a fixed list of apostrophes and
        similar characters that occurs in the given string. If none of them
        occurs, the plain ASCII apostrophe C{"'"} is chosen.

        @type string: str
        @param string: GR string
        @rtype: dict
        @return: dictionary of basic keyword settings
        """
        # candidates in order of priority, the first one found wins
        APOSTROPHE_LIST = ["'", u'’', u'´', u'‘', u'`', u'ʼ', u'ˈ', u'′', u'ʻ']
        # normalise to NFC before searching for the candidates
        readingStr = unicodedata.normalize("NFC", unicode(string))

        # for/else: the else branch only runs if the loop ended without break,
        #   i.e. if no candidate was found in the string
        for apostrophe in APOSTROPHE_LIST:
            if apostrophe in readingStr:
                break
        else:
            apostrophe = "'"

        return {'GRRhotacisedFinalApostrophe': apostrophe,
            'GRSyllableSeparatorApostrophe': apostrophe}
2194
2196 return self.TONES[:]
2197
2198 - def compose(self, readingEntities):
2199 """
2200 Composes the given list of basic entities to a string. Applies an
2201 apostrophe between syllables if the second syllable has a zero-initial.
2202
2203 @type readingEntities: list of str
2204 @param readingEntities: list of basic syllables or other content
2205 @rtype: str
2206 @return: composed entities
2207 """
2208 newReadingEntities = []
2209 precedingEntity = None
2210
2211 for entity in readingEntities:
2212 if precedingEntity and self.isReadingEntity(precedingEntity) \
2213 and self.isReadingEntity(entity):
2214
2215 if entity[0] in ['a', 'e', 'i', 'o', 'u']:
2216 newReadingEntities.append(
2217 self.getOption('GRSyllableSeparatorApostrophe'))
2218
2219 newReadingEntities.append(entity)
2220 precedingEntity = entity
2221
2222 return ''.join(newReadingEntities)
2223
        # Checks every pair of directly adjacent reading entities: if the
        #   second one starts with one of the vowels a, e, i, o, u the check
        #   fails and False is returned. This is the same condition under
        #   which compose() inserts the syllable separator apostrophe.
        precedingEntity = None
        for entity in readingEntities:
            if precedingEntity and self.isReadingEntity(precedingEntity) \
                and self.isReadingEntity(entity):

                if entity[0] in ['a', 'e', 'i', 'o', 'u']:
                    return False

            precedingEntity = entity

        # no adjacent pair of syllables found that lacks a separator
        return True
2236
2238
2239
2240 segmentationParts = []
2241 substringIndex = 1
2242 while substringIndex <= len(string) and \
2243 (self._hasSyllableSubstring(string[0:substringIndex].lower()) \
2244 or string[0:substringIndex] == "'"):
2245 syllable = string[0:substringIndex]
2246 if self.isReadingEntity(syllable) or syllable == "'":
2247 remaining = string[substringIndex:]
2248 if remaining != '':
2249 remainingParts = self._recursiveSegmentation(remaining)
2250 if remainingParts != []:
2251 segmentationParts.append((syllable, remainingParts))
2252 else:
2253 segmentationParts.append((syllable, None))
2254 substringIndex = substringIndex + 1
2255 return segmentationParts
2256
2257 - def removeApostrophes(self, readingEntities):
2258 """
2259 Removes apostrophes between two syllables for a given decomposition.
2260
2261 @type readingEntities: list of str
2262 @param readingEntities: list of basic syllables or other content
2263 @rtype: list of str
2264 @return: the given entity list without separating apostrophes
2265 """
2266 if len(readingEntities) == 0:
2267 return []
2268 elif len(readingEntities) > 2 and readingEntities[1] == "'" \
2269 and self.isReadingEntity(readingEntities[0]) \
2270 and self.isReadingEntity(readingEntities[2]):
2271
2272 newReadingEntities = [readingEntities[0]]
2273 newReadingEntities.extend(self.removeApostrophes(
2274 readingEntities[2:]))
2275 return newReadingEntities
2276 else:
2277 newReadingEntities = [readingEntities[0]]
2278 newReadingEntities.extend(self.removeApostrophes(
2279 readingEntities[1:]))
2280 return newReadingEntities
2281
2283 """
2284 Gets the tone number of the tone or the etymological tone if it is a
2285 neutral or optional neutral tone.
2286
2287 @type tone: str
2288 @param tone: tone
2289 @rtype: int
2290 @return: base tone number
2291 @raise InvalidEntityError: if an invalid tone is passed.
2292 """
2293 if tone not in self.getTones():
2294 raise InvalidEntityError("Invalid tone information given for '" \
2295 + unicode(tone) + "'")
2296
2297 if tone.startswith("5thToneEtymological"):
2298 return int(tone[-3])
2299 else:
2300 return int(tone[0])
2301
2303 """
2304 Splits the given plain syllable into consonants-vowels-consonants.
2305
2306 @type plainSyllable: str
2307 @param plainSyllable: entity without tonal information
2308 @rtype: tuple of str
2309 @return: syllable CVC triple
2310 @raise InvalidEntityError: if the entity is invalid.
2311 """
2312
2313 matchObj = self.SYLLABLE_STRUCTURE.match(plainSyllable)
2314 if not matchObj:
2315 print plainSyllable
2316 raise InvalidEntityError("Invalid entity given for '" \
2317 + plainSyllable + "'")
2318
2319 c1, v, c2 = matchObj.groups()
2320 return c1, v, c2
2321
        """
        Gets the entity with tone mark for the given plain entity and tone. This
        method only works for plain syllables that are not r-coloured (Erlhuah
        forms) as due to the depiction of Erlhuah in GR the information about
        the base syllable is lost and pronunciation partly varies between
        different syllables. Use L{getRhotacisedTonalEntity()} to get the tonal
        entity for a given etymological (base) syllable.

        The tone is expressed by respelling the syllable. The respelling is
        chosen from the base tone (see L{getBaseTone()}) and the syllable's
        consonant-vowel-consonant parts (see L{splitPlainSyllableCVC()}).

        @type plainEntity: str
        @param plainEntity: entity without tonal information
        @type tone: str
        @param tone: tone
        @rtype: str
        @return: entity with appropriate tone
        @raise InvalidEntityError: if the entity is invalid.
        @raise UnsupportedError: if the given entity is an Erlhuah form.
        """
        if tone not in self.getTones():
            raise InvalidEntityError("Invalid tone information given for '" \
                + plainEntity + "': '" + unicode(tone) + "'")

        # a trailing 'l' on an otherwise valid plain syllable marks an Erlhuah
        #   form, the syllable 'el' itself is excluded from this check
        if plainEntity.endswith('l') and plainEntity != 'el' \
            and self.isPlainReadingEntity(plainEntity[:-1]):
            raise UnsupportedError("Not supported for '" + plainEntity + "'")

        # split into leading consonants, vowels and trailing consonants
        c1, v, c2 = self.splitPlainSyllableCVC(plainEntity)

        # neutral and optional neutral tones are spelt by their base tone
        baseTone = self.getBaseTone(tone)

        # first tone
        if baseTone == 1:
            if c1 not in ['m', 'n', 'l', 'r']:
                # plain form is the first tone form
                tonalEntity = plainEntity
            else:
                # initials m, n, l, r get an 'h' inserted
                tonalEntity = c1 + 'h' + v + c2

        # second tone
        elif baseTone == 2:
            if c1 not in ['m', 'n', 'l', 'r']:
                # sole vowel i or u without ending gets y/w prefixed, a
                #   leading i/u of longer forms is replaced by y/w
                if v == 'i' and not c2:
                    tonalEntity = c1 + 'y' + v
                elif v[0] == 'i':
                    # also covers a sole 'i' followed by a consonant ending
                    tonalEntity = c1 + 'y' + v[1:] + c2
                elif v == 'u' and not c2:
                    tonalEntity = c1 + 'w' + v
                elif v[0] == 'u':
                    tonalEntity = c1 + 'w' + v[1:] + c2
                else:
                    # all other forms get an 'r' after the vowels
                    tonalEntity = c1 + v + 'r' + c2
            else:
                # for initials m, n, l, r the plain form is the second tone
                tonalEntity = plainEntity

        # third tone
        elif baseTone == 3:
            # a single vowel is doubled
            if len(v) == 1:
                tonalEntity = c1 + v + v + c2
            elif v in ['ie', 'ei']:
                tonalEntity = c1 + v[0] + 'e' + v[1] + c2
            elif v in ['ou', 'uo']:
                tonalEntity = c1 + v[0] + 'o' + v[1] + c2
            # otherwise the first i is changed to e and the first u to o
            elif v[0] == 'i':
                # leading i
                tonalEntity = c1 + 'e' + v[1:] + c2
            elif v[0] == 'u':
                tonalEntity = c1 + 'o' + v[1:] + c2
            elif ('i' in v) or ('u' in v):
                tonalEntity = c1 + v.replace('i', 'e', 1).replace('u', 'o', 1) \
                    + c2

            # forms without an initial consonant that begin with i or u are
            #   written with a leading y or w
            if not c1:
                if tonalEntity == 'iee':
                    tonalEntity = 'yee'
                elif tonalEntity == 'uoo':
                    tonalEntity = 'woo'
                elif v[0] == 'i':
                    # keeps the changed vowel and prefixes y
                    tonalEntity = 'y' + tonalEntity
                elif v[0] == 'u':
                    tonalEntity = 'w' + tonalEntity

        # fourth tone
        elif baseTone == 4:
            # the ending is changed or an 'h' is appended
            if not c2:
                if v in ['i', 'iu', 'u']:
                    tonalEntity = c1 + v + c2 + 'h'
                elif v.endswith('i'):
                    tonalEntity = c1 + v[:-1] + 'y' + c2
                elif v.endswith('u'):
                    tonalEntity = c1 + v[:-1] + 'w' + c2
                else:
                    tonalEntity = c1 + v + c2 + 'h'
            elif c2 == 'n':
                tonalEntity = c1 + v + 'nn'
            elif c2 == 'ng':
                tonalEntity = c1 + v + 'nq'
            elif c2 == 'l':
                tonalEntity = c1 + v + 'll'

            # forms without an initial consonant that begin with i or u are
            #   written with a leading y or w
            if not c1:
                if tonalEntity == 'ih':
                    tonalEntity = 'yih'
                elif tonalEntity == 'uh':
                    tonalEntity = 'wuh'
                elif tonalEntity == 'inn':
                    tonalEntity = 'yinn'
                elif tonalEntity == 'inq':
                    tonalEntity = 'yinq'
                elif v[0] == 'i':
                    # the leading vowel is replaced by y
                    tonalEntity = 'y' + tonalEntity[1:]
                elif v[0] == 'u':
                    tonalEntity = 'w' + tonalEntity[1:]

        # neutral tone forms are marked by a leading dot, optional neutral
        #   tone forms by a leading 'ₒ'
        if tone.startswith('5'):
            tonalEntity = '.' + tonalEntity
        elif tone.endswith('Optional5th'):
            tonalEntity = u'ₒ' + tonalEntity

        return tonalEntity
2450
2468
        """
        Gets the r-coloured entity (Erlhuah form) with tone mark for the given
        plain entity and tone. Not all entity-tone combinations are supported.

        The tonal final is looked up in table C{GRRhotacisedFinals} and
        appended to the syllable's leading consonants.

        @type plainEntity: str
        @param plainEntity: entity without tonal information
        @type tone: str
        @param tone: tone
        @rtype: str
        @return: entity with appropriate tone
        @raise InvalidEntityError: if the entity is invalid.
        @raise UnsupportedError: if the given entity is an Erlhuah form or the
            syllable is not supported in this given tone.
        @todo Fix: Build lookup for performance reasons.
        """
        if tone not in self.getTones():
            raise InvalidEntityError("Invalid tone information given for '" \
                + plainEntity + "': '" + unicode(tone) + "'")

        # the given syllable has to be the base form, a trailing 'l' on a
        #   valid plain syllable marks a form that is already r-coloured
        if plainEntity.endswith('l') \
            and self.isPlainReadingEntity(plainEntity[:-1]):
            raise UnsupportedError("Not supported for '" + plainEntity + "'")

        # split into leading consonants, vowels and trailing consonants
        c1, v, c2 = self.splitPlainSyllableCVC(plainEntity)
        baseTone = self.getBaseTone(tone)

        # initials m, n, l, r: first tone gets an 'h' appended to the initial
        if c1 in ['m', 'n', 'l', 'r']:
            if baseTone == 1:
                c1 = c1 + 'h'
            elif baseTone == 2:
                # second tone uses the first tone column of the table
                baseTone = 1

        # syllables without initial whose vowels start with i or u use their
        #   own columns for the third and fourth tone
        if not c1 and v[0] in ['i', 'u']:
            column = self.DB_RHOTACISED_FINAL_MAPPING_ZEROINITIAL[baseTone]
        else:
            column = self.DB_RHOTACISED_FINAL_MAPPING[baseTone]

        table = self.db.tables['GRRhotacisedFinals']
        tonalFinal = self.db.selectScalar(select([table.c[column]],
            table.c.GRFinal == v + c2))
        if not tonalFinal:
            raise UnsupportedError("No Erlhuah form for '" \
                + plainEntity + "' and tone '" + tone + "'")

        # replace the apostrophe used in the database by the one requested
        #   through option 'GRRhotacisedFinalApostrophe'
        if self.getOption('GRRhotacisedFinalApostrophe') \
            != self.DB_RHOTACISED_FINAL_APOSTROPHE:
            tonalFinal = tonalFinal.replace(self.DB_RHOTACISED_FINAL_APOSTROPHE,
                self.getOption('GRRhotacisedFinalApostrophe'))

        tonalEntity = c1 + tonalFinal

        # neutral tone forms are marked by a leading dot, optional neutral
        #   tone forms by a leading 'ₒ'
        if tone.startswith('5'):
            tonalEntity = '.' + tonalEntity
        elif tone.endswith('Optional5th'):
            tonalEntity = u'ₒ' + tonalEntity

        return tonalEntity
2533
2561
        """
        Gets a list of abbreviated GR spellings, i.e. the keys of the
        abbreviation lookup table.

        @rtype: list
        @return: list of abbreviated GR forms
        """
        return self._getAbbreviatedLookup().keys()
2570
        """
        Returns true if the given entity is an abbreviated spelling, i.e. if
        it is a key of the abbreviation lookup table.

        @type entity: str
        @param entity: entity to check
        @rtype: bool
        @return: C{True} if entity is an abbreviated form.
        """
        # NOTE(review): the entity is looked up exactly as given. Earlier
        #   documentation promised case insensitive handling; confirm whether
        #   the lookup table holds lower case keys only and the entity should
        #   be lowered before the test.
        return entity in self._getAbbreviatedLookup()
2583
        """
        Converts the given abbreviated GR spelling to the original form.
        Non-abbreviated forms will be returned unchanged. Takes care of
        capitalisation: an upper case entity yields an upper case result, a
        title case entity a capitalised one.

        @type entity: str
        @param entity: reading entity.
        @rtype: str
        @return: original entity
        @raise AmbiguousConversionError: if conversion is ambiguous.
        @todo Fix: Move this method to the Converter, AmbiguousConversionError
            not needed for import here then
        """
        if self.isAbbreviatedEntity(entity):
            # an entry of None marks an abbreviation without a unique target
            if self._getAbbreviatedLookup()[entity] == None:
                raise AmbiguousConversionError("conversion for entity '" \
                    + entity + "' is ambiguous")

            originalEntity = self._getAbbreviatedLookup()[entity]
            # transfer the capitalisation of the abbreviation to the result
            if entity.isupper():
                originalEntity = originalEntity.upper()
            elif entity.istitle():
                originalEntity = originalEntity.capitalize()

            return originalEntity
        else:
            return entity
2612
2614 """
2615 Gets the list of plain entities supported by this reading without
2616 r-coloured forms (Erlhuah forms). Different to L{getReadingEntities()}
2617 the entities will carry no tone mark.
2618
2619 @rtype: set of str
2620 @return: set of supported syllables
2621 """
2622 table = self.db.tables['GRSyllables']
2623 return set(self.db.selectScalars(select([table.c.GR])))
2624
        """
        Gets a set of full entities supported by the reading excluding
        abbreviated forms.

        The set is built by combining every plain syllable with every tone,
        once as regular form and once as r-coloured (Erlhuah) form.

        @rtype: set of str
        @return: set of supported syllables
        """
        plainSyllables = self.getPlainReadingEntities()

        # regular forms: every plain syllable in every tone
        syllableSet = set()
        for syllable in plainSyllables:
            for tone in self.getTones():
                syllableSet.add(self.getTonalEntity(syllable, tone))

        # Erlhuah forms: only those syllable-tone combinations that exist
        for syllable in plainSyllables:
            for tone in self.getTones():
                try:
                    erlhuahSyllable = self.getRhotacisedTonalEntity(syllable,
                        tone)
                    syllableSet.add(erlhuahSyllable)
                except UnsupportedError:
                    # no r-coloured form for this combination, leave it out
                    pass

        return syllableSet
2652
2658
2662
2665 u"""
2666 Provides an operator on strings in Mandarin Chinese written in the
2667 I{International Phonetic Alphabet} (I{IPA}).
2668
2669 Features:
2670 - Tones can be marked either with tone numbers (1-4), tone contour
2671 numbers (e.g. 214), IPA tone bar characters or IPA diacritics,
2672 - support for low third tone (1/2 third tone) with tone contour 21,
2673 - four levels of the neutral tone for varying stress depending on the
2674 preceding syllable and
2675 - splitting of syllables into onset and rhyme using method
2676 L{getOnsetRhyme()}.
2677
2678 Tones
2679 =====
2680 Tones in IPA can be expressed using different schemes. The following schemes
2681 are implemented here:
2682 - Numbers, regular tone numbers from 1 to 5 for first tone to fifth
2683 (qingsheng),
2684 - ChaoDigits, numbers displaying the levels of tone contours, e.g.
2685 214 for the regular third tone,
2686 - IPAToneBar, IPA modifying tone bar characters, e.g. ɕi˨˩˦,
2687 - Diacritics, diacritical marks and finally
2688 - None, no support for tone marks
2689
2690 Unlike other operators for Mandarin, distinction is made for six different
2691 tonal occurrences. The third tone is affected by tone sandhi and basically
2692 two different tone contours exist. Therefore L{getTonalEntity()} and
2693 L{splitEntityTone()} work with string representations as tones defined in
2694 L{TONES}. Same behaviour as found in other operators for Mandarin can be
2695 achieved by simply using the first character of the given string:
2696
2697 >>> from cjklib.reading import operator
2698 >>> ipaOp = operator.MandarinIPAOperator(toneMarkType='IPAToneBar')
2699 >>> syllable, toneName = ipaOp.splitEntityTone(u'mən˧˥')
2700 >>> tone = int(toneName[0])
2701
2702 The implemented schemes render tone information differently. Mapping might
2703 lose information so a full back-transformation can not be guaranteed.
2704
2705 Source
2706 ======
2707 - Yuen Ren Chao: A Grammar of Spoken Chinese. University of California
2708 Press, Berkeley, 1968, ISBN 0-520-00219-9.
2709 """
2710 READING_NAME = "MandarinIPA"
2711
2712 TONE_MARK_PREFER = {'Numbers': {'3': '3rdToneRegular', '5': '5thTone'},
2713 'ChaoDigits': {}, 'IPAToneBar': {}, 'Diacritics': {}}
2714
2715 TONES = ['1stTone', '2ndTone', '3rdToneRegular', '3rdToneLow',
2716 '4thTone', '5thTone', '5thToneHalfHigh', '5thToneMiddle',
2717 '5thToneHalfLow', '5thToneLow']
2718
2719 TONE_MARK_MAPPING = {'Numbers': {'1stTone': '1', '2ndTone': '2',
2720 '3rdToneRegular': '3', '3rdToneLow': '3', '4thTone': '4',
2721 '5thTone':'5', '5thToneHalfHigh': '5', '5thToneMiddle': '5',
2722 '5thToneHalfLow': '5', '5thToneLow': '5'},
2723 'ChaoDigits': {'1stTone': '55', '2ndTone': '35',
2724 '3rdToneRegular': '214', '3rdToneLow': '21', '4thTone': '51',
2725 '5thTone':'', '5thToneHalfHigh': '', '5thToneMiddle': '',
2726 '5thToneHalfLow': '', '5thToneLow': ''},
2727 'IPAToneBar': {'1stTone': u'˥˥', '2ndTone': u'˧˥',
2728 '3rdToneRegular': u'˨˩˦', '3rdToneLow': u'˨˩', '4thTone': u'˥˩',
2729 '5thTone':'', '5thToneHalfHigh': u'꜉', '5thToneMiddle': u'꜊',
2730 '5thToneHalfLow': u'꜋', '5thToneLow': u'꜌'},
2731
2732
2733
2734
2735
2736 }
2737
2739 """
2740 Gets the list of plain entities supported by this reading. These
2741 entities will carry no tone mark.
2742
2743 @rtype: set of str
2744 @return: set of supported syllables
2745 """
2746 table = self.db.tables['MandarinIPAInitialFinal']
2747 return set(self.db.selectScalars(select([table.c.IPA])))
2748
2750 """
2751 Splits the given plain syllable into onset (initial) and rhyme (final).
2752
2753 @type plainSyllable: str
2754 @param plainSyllable: syllable in IPA without tone marks
2755 @rtype: tuple of str
2756 @return: tuple of syllable onset and rhyme
2757 @raise InvalidEntityError: if the entity is invalid (e.g. syllable
2758 nucleus or tone invalid).
2759 """
2760 table = self.db.tables['MandarinIPAInitialFinal']
2761 entry = set(self.db.selectRow(
2762 select([table.c.IPAInitial, table.c.IPAFinal],
2763 table.c.IPA == plainSyllable)))
2764 if not entry:
2765 raise InvalidEntityError("'" + plainSyllable \
2766 + "' not a valid IPA form in this system'")
2767 return (entry[0], entry[1])
2768
2771 u"""
2772 Provides an operator on strings written in the X{Braille} system.
2773 """
2774 READING_NAME = "MandarinBraille"
2775
2776 TONEMARKS = [u'⠁', u'⠂', u'⠄', u'⠆', '']
2777
        """
        Creates an instance of the MandarinBrailleOperator.

        Builds the regular expressions used for decomposition from the Braille
        initials and finals stored in the database.

        @param options: extra options
        @keyword dbConnectInst: instance of a L{DatabaseConnector}, if none is
            given, default settings will be assumed.
        """
        super(MandarinBrailleOperator, self).__init__(**options)

        # all distinct Braille initials and finals, each joined to one string
        #   that is used as a character class below
        initials = ''.join(self.db.selectScalars(
            select([self.db.tables['PinyinBrailleInitialMapping'].c.Braille],
                distinct=True)))
        finals = ''.join(self.db.selectScalars(
            select([self.db.tables['PinyinBrailleFinalMapping'].c.Braille],
                distinct=True)))

        # a syllable is an initial with an optional final or a sole final,
        #   optionally followed by a tone mark
        self.splitRegex = re.compile(ur'((?:(?:[' + re.escape(initials) \
            + '][' + re.escape(finals) + ']?)|['+ re.escape(finals) \
            + u'])[' + re.escape(''.join(self.TONEMARKS)) + ']?)')
        # separates runs of characters inside the given Braille range from
        #   runs of all other characters
        self.brailleRegex = re.compile(ur'([⠀-⣿]+|[^⠀-⣿]+)')
2800
        """
        Returns the tones supported by the reading, the numbers 1 to 5.

        @return: supported tones as given by C{range(1, 6)}
        """
        return range(1, 6)
2809
2811 """
2812 Decomposes the given string into basic entities that can be mapped to
2813 one Chinese character each (exceptions possible).
2814
2815 The given input string can contain other non reading characters, e.g.
2816 punctuation marks.
2817
2818 The returned list contains a mix of basic reading entities and other
2819 characters e.g. spaces and punctuation marks.
2820
2821 @type string: str
2822 @param string: reading string
2823 @rtype: list of str
2824 @return: a list of basic entities of the input string
2825 """
2826 def buildList(entityList):
2827
2828
2829 newList = self.brailleRegex.findall(entityList[0])
2830
2831 if len(entityList) > 1:
2832 newList.extend(buildList(entityList[1:]))
2833
2834 return newList
2835
2836 return buildList(self.splitRegex.split(string))
2837
2838 - def compose(self, readingEntities):
2839 """
2840 Composes the given list of basic entities to a string.
2841
2842 No special treatment is given for subsequent Braille entities. Use
2843 L{getSpaceSeparatedEntities()} to insert spaces between two Braille
2844 syllables.
2845
2846 @type readingEntities: list of str
2847 @param readingEntities: list of basic entities or other content
2848 @rtype: str
2849 @return: composed entities
2850 """
2851 return "".join(readingEntities)
2852
2854 """
2855 Inserts spaces between to Braille entities for a given list of reading
2856 entities.
2857
2858 Spaces in the Braille system are applied between words. This is not
2859 reflected here and instead a space will be added between single
2860 syllables.
2861
2862 @type readingEntities: list of str
2863 @param readingEntities: list of basic entities or other content
2864 @rtype: list of str
2865 @return: entities with spaces inserted between Braille sequences
2866 """
2867 def isBrailleChar(char):
2868 return char >= u'⠀' and char <= u'⣿'
2869
2870 newReadingEntities = []
2871 if len(readingEntities) > 0:
2872 lastIsBraille = False
2873 for entity in readingEntities:
2874 isBraille = len(entity) > 0 and isBrailleChar(entity[0])
2875
2876 if lastIsBraille and isBraille:
2877 newReadingEntities.append(u' ')
2878 newReadingEntities.append(entity)
2879 lastIsBraille = isBraille
2880 return newReadingEntities
2881
2883 """
2884 Gets the entity with tone mark for the given plain entity and tone.
2885
2886 @type plainEntity: str
2887 @param plainEntity: entity without tonal information
2888 @type tone: str
2889 @param tone: tone
2890 @rtype: str
2891 @return: entity with appropriate tone
2892 @raise InvalidEntityError: if the entity is invalid.
2893 """
2894 if tone not in self.getTones():
2895 raise InvalidEntityError("Invalid tone information given for '" \
2896 + plainEntity + "': '" + str(tone) + "'")
2897 return plainEntity + self.TONEMARKS[tone-1]
2898
2900 """
2901 Splits the entity into an entity without tone mark and the name of the
2902 entity's tone.
2903
2904 @type entity: str
2905 @param entity: entity with tonal information
2906 @rtype: tuple
2907 @return: plain entity without tone mark and additionally the tone
2908 @raise InvalidEntityError: if the entity is invalid.
2909 """
2910 if entity[-1] in self.TONEMARKS:
2911 return entity[:-1], self.TONEMARKS.index(entity[-1]) + 1
2912 else:
2913 return entity, 5
2914
def isReadingEntity(self, entity):
    """
    Checks whether the given string is a supported reading entity.

    The tone mark is split off, the remaining plain entity is divided into
    initial and final and every non-empty part has to be found in the
    C{Braille} column of the respective mapping table.

    @type entity: str
    @param entity: entity to check
    @rtype: bool
    @return: C{True} if the entity is supported, C{False} otherwise
    """
    if not entity:
        return False

    try:
        plainEntity, _ = self.splitEntityTone(entity)
        if not plainEntity:
            # nothing left besides the tone mark
            return False

        initial, final = self.getOnsetRhyme(plainEntity)

        finalTable = self.db.tables['PinyinBrailleFinalMapping']
        if final and self.db.selectScalar(select([finalTable.c['Braille']],
            finalTable.c['Braille'] == final, distinct=True)) is None:
            return False

        initialTable = self.db.tables['PinyinBrailleInitialMapping']
        if initial and self.db.selectScalar(select(
            [initialTable.c['Braille']],
            initialTable.c['Braille'] == initial, distinct=True)) is None:
            return False

        return True
    except InvalidEntityError:
        # raised by the split methods for malformed entities
        return False
2940
def getOnsetRhyme(self, plainSyllable):
    """
    Splits the given plain syllable into onset (initial) and rhyme (final).

    A syllable of two characters is split into its first and second
    character. A single character is returned as rhyme if it is found in
    the C{Braille} column of table C{PinyinBrailleFinalMapping}, otherwise
    it is returned as onset.

    @type plainSyllable: str
    @param plainSyllable: syllable without tone marks
    @rtype: tuple of str
    @return: tuple of syllable onset and rhyme
    @raise InvalidEntityError: if the syllable does not consist of one or
        two characters.
    """
    syllableLength = len(plainSyllable)
    if syllableLength == 2:
        return plainSyllable[0], plainSyllable[1]
    if syllableLength != 1:
        raise InvalidEntityError("Invalid plain entity given with '" \
            + plainSyllable + "'")

    finalTable = self.db.tables['PinyinBrailleFinalMapping']
    knownFinal = self.db.selectScalar(
        select([finalTable.c.Braille],
            finalTable.c.Braille == plainSyllable,
            distinct=True))
    if knownFinal is not None:
        return '', plainSyllable
    return plainSyllable, ''
2965
"""
Provides an operator for the Cantonese romanisation X{Jyutping} made by the
X{Linguistic Society of Hong Kong} (X{LSHK}).

@see:
    - The Linguistic Society of Hong Kong Cantonese Romanization Scheme:
        U{http://lshk.ctl.cityu.edu.hk/cantonese.php}
"""
# Name of the reading this operator belongs to.
READING_NAME = 'Jyutping'
# Matches a run of ASCII letters optionally followed by one digit from 1 to 6.
readingEntityRegex = re.compile(u"([A-Za-z]+[123456]?)")
2978
def __init__(self, **options):
    """
    Creates an instance of the JyutpingOperator.

    @param options: extra options
    @keyword dbConnectInst: instance of a L{DatabaseConnector}, if none is
        given, default settings will be assumed.
    @keyword strictSegmentation: if C{True} segmentation (using
        L{segment()}) and thus decomposition (using L{decompose()}) will
        raise an exception if an alphabetic string is parsed which can not
        be segmented into single reading entities. If C{False} the aforesaid
        string will be returned unsegmented.
    @keyword toneMarkType: if set to C{'Numbers'} the default form of
        appended numbers from 1 to 6 will be used to mark tones, if set to
        C{'None'} no tone marks will be used and no tonal information will
        be supplied at all.
    @keyword missingToneMark: if set to C{'noinfo'} no tone information
        will be deduced when no tone mark is found (takes on value C{None}),
        if set to C{'ignore'} this entity will not be valid and for
        segmentation the behaviour defined by C{'strictSegmentation'} will
        take effect.
    @raise ValueError: if an unsupported value is given for
        C{toneMarkType} or C{missingToneMark}.
    """
    super(JyutpingOperator, self).__init__(**options)

    # keyword -> values accepted for it, checked in this order
    allowedValues = (
        ('toneMarkType', ['Numbers', 'None']),
        ('missingToneMark', ['noinfo', 'ignore']),
        )
    for keyword, allowed in allowedValues:
        if keyword not in options:
            continue
        value = options[keyword]
        if value not in allowed:
            raise ValueError("Invalid option '" + str(value) \
                + "' for keyword '" + keyword + "'")
        self.optionValue[keyword] = value
3018
3019 @classmethod
3025
def getTones(self):
    """
    Gets the tones supported under the current options.

    @rtype: list
    @return: tones C{1} to C{6}, followed by C{None} unless option
        C{'missingToneMark'} is C{'ignore'} while option C{'toneMarkType'}
        differs from C{'None'}
    """
    # list() keeps append() working where range() does not return a list
    tones = list(range(1, 7))
    if self.getOption('missingToneMark') != 'ignore' \
        or self.getOption('toneMarkType') == 'None':
        tones.append(None)
    return tones
3032
def compose(self, readingEntities):
    """
    Joins the given entities into one string without any separator.

    @type readingEntities: list of str
    @param readingEntities: list of basic entities or other content
    @rtype: str
    @return: concatenation of all entities
    """
    return "".join(readingEntities)
3035
def getTonalEntity(self, plainEntity, tone):
    """
    Gets the entity with tone mark for the given plain entity and tone.

    With option C{'toneMarkType'} set to C{'None'} the plain entity is
    returned as is and the tone is not looked at.

    @type plainEntity: str
    @param plainEntity: entity without tonal information
    @param tone: tone, anything accepted by C{int()} or C{None}
    @rtype: str
    @return: plain entity with the tone number appended, or the plain
        entity itself for tone C{None}
    @raise InvalidEntityError: if the tone is not one of L{getTones()}.
    """
    if self.getOption('toneMarkType') == 'None':
        return plainEntity

    if tone is not None:
        # accept tones given as string, e.g. '2'
        tone = int(tone)
    if tone not in self.getTones():
        raise InvalidEntityError("Invalid tone information given for '" \
            + plainEntity + "': '" + str(tone) + "'")
    if tone is None:
        return plainEntity
    return plainEntity + str(tone)
3048
def splitEntityTone(self, entity):
    """
    Separates a trailing tone number from the given entity.

    With option C{'toneMarkType'} set to C{'None'} the entity is returned
    untouched together with tone C{None}.

    @type entity: str
    @param entity: entity with tonal information
    @rtype: tuple
    @return: plain entity and its tone as number, or C{None} as tone if no
        tone information is available
    @raise InvalidEntityError: if no tone number is found while option
        C{'missingToneMark'} is set to C{'ignore'}.
    """
    if self.getOption('toneMarkType') == 'None':
        return entity, None

    toneMatch = re.search(u"[123456]$", entity)
    if toneMatch:
        toneNumber = int(toneMatch.group(0))
        return entity[:-1], toneNumber

    if self.getOption('missingToneMark') != 'ignore':
        return entity, None
    raise InvalidEntityError("No tone information given for '" \
        + entity + "'")
3063
def getPlainReadingEntities(self):
    """
    Gets the values of column C{Jyutping} of table C{JyutpingSyllables}.

    @rtype: set
    @return: set of the values selected from the database
    """
    return set(self.db.selectScalars(
        select([self.db.tables['JyutpingSyllables'].c.Jyutping])))
3067
def getOnsetRhyme(self, plainSyllable):
    """
    Looks up onset (initial) and rhyme (final) of a plain syllable.

    The lower-cased syllable is searched in table
    C{JyutpingInitialFinal}. The syllabic nasals I{m}, I{ng} will be
    regarded as being finals.

    @type plainSyllable: str
    @param plainSyllable: syllable without tone marks
    @rtype: tuple of str
    @return: tuple of entity onset and rhyme
    @raise InvalidEntityError: if no entry exists for the syllable.
    @todo Impl: Finals I{ing, ik, ung, uk} differ from other finals with
        same vowels. What semantics/view do we want to provide on the
        syllable parts?
    """
    mapping = self.db.tables['JyutpingInitialFinal']
    query = select([mapping.c.JyutpingInitial, mapping.c.JyutpingFinal],
        mapping.c.Jyutping == plainSyllable.lower())
    row = self.db.selectRow(query)
    if row:
        return (row[0], row[1])

    raise InvalidEntityError("'" + plainSyllable \
        + "' not a valid plain Jyutping syllable'")
3091
3094 u"""
3095 Provides an operator for the X{Cantonese Yale} romanisation.
3096
3097 Features:
3098 - tones marked by either diacritics or numbers,
3099 - choice between high level and high falling tone for number marks,
3100 - guessing of input form (reading dialect) and
3101 - splitting of syllables into onset, nucleus and coda.
3102
3103 High Level vs. High Falling Tone
3104 ================================
3105 Yale distinguishes two tones often subsumed under one: the high level tone
3106 with tone contour 55 as given in the commonly used pitch model by Yuen Ren
3107 Chao and the high falling tone given as pitch 53 (as by Chao), 52 or 51
3108 (Bauer and Benedikt, chapter 2.1.1 pp. 115).
3109 Many sources state that these two tones aren't distinguishable anymore in
3110 modern Hong Kong Cantonese and thus are subsumed under one tone in some
3111 romanisation systems for Cantonese.
3112
3113 In the abbreviated form of the Yale romanisation that uses numbers to
3114 represent tones this distinction is not made. The mapping of the tone number
3115 C{1} to either the high level or the high falling tone can be given by the
3116 user and is important when conversion is done involving this abbreviated
3117 form of the Yale romanisation. By default the high level tone will be
3118 used as this primary use is indicated in the given sources.
3119
3120 Sources
3121 =======
3122 - Stephen Matthews, Virginia Yip: Cantonese: A Comprehensive Grammar.
3123 Routledge, 1994, ISBN 0-415-08945-X.
3124 - Robert S. Bauer, Paul K. Benedikt: Modern Cantonese Phonology
3125 (摩登廣州話語音學). Walter de Gruyter, 1997, ISBN 3-11-014893-5.
3126
3127 @see:
3128 - Cantonese: A Comprehensive Grammar (Preview):
3129 U{http://books.google.de/books?id=czbGJLu59S0C}
3130 - Modern Cantonese Phonology (Preview):
3131 U{http://books.google.de/books?id=QWNj5Yj6_CgC}
3132 """
# Name of the reading this operator belongs to.
READING_NAME = 'CantoneseYale'

TONES = ['1stToneLevel', '1stToneFalling', '2ndTone', '3rdTone', '4thTone',
    '5thTone', '6thTone']
"""Names of tones used in the romanisation."""
# Tone mark type -> tone name -> tuple of (tone mark, 'h' character).
# 'Numbers' maps both first tones to '1', 'Internal' keeps them apart as
# '0' and '1'; only these two types carry an entry for the tone None.
TONE_MARK_MAPPING = {'Numbers': {'1stToneLevel': ('1', ''),
        '1stToneFalling': ('1', ''), '2ndTone': ('2', ''),
        '3rdTone': ('3', ''), '4thTone': ('4', ''), '5thTone': ('5', ''),
        '6thTone': ('6', ''), None: ('', '')},
    'Diacritics': {'1stToneLevel': (u'\u0304', ''),
        '1stToneFalling': (u'\u0300', ''),
        '2ndTone': (u'\u0301', ''), '3rdTone': (u'', ''),
        '4thTone': (u'\u0300', 'h'), '5thTone': (u'\u0301', 'h'),
        '6thTone': (u'', 'h')},
    'Internal': {'1stToneLevel': ('0', ''),
        '1stToneFalling': ('1', ''), '2ndTone': ('2', ''),
        '3rdTone': ('3', ''), '4thTone': ('4', ''), '5thTone': ('5', ''),
        '6thTone': ('6', ''), None: ('', '')}}
3151 """
3152 Mapping of tone name to representation per tone mark type. Representations
3153 include a diacritic mark and optionally the letter 'h' marking a low tone.
3154
3155 The C{'Internal'} dialect is used for conversion between different forms of
3156 Cantonese Yale. As conversion to the other dialects can lose information
3157 (Diacritics: missing tone, Numbers: distinction between high level and high
3158 falling, None: no tones at all) conversion to this dialect can retain all
3159 information and thus can be used as a standard target reading.
3160 """
3161
# The pattern is assembled from adjacent literals; it is applied to strings in
# NFD so that the combining marks U+0304, U+0301 and U+0300 show up as
# separate characters. NOTE: the ur'' prefix is Python 2 only syntax.
syllableRegex = re.compile(ur'((?:m|ng|h|' \
    + u'(?:[bcdfghjklmnpqrstvwxyz]*' \
    + u'(?:(?:[aeiou]|[\u0304\u0301\u0300])+|yu[\u0304\u0301\u0300]?)))' \
    + u'(?:h(?!(?:[aeiou]|yu)))?' \
    + '(?:[mnptk]|ng)?[0123456]?)')
"""
Regex to split a string in NFD into several syllables in a crude way.
The regular expressions works for both, diacritical and number tone marks.
It consists of:
    - Nasal syllables,
    - Initial consonants,
    - vowels including diacritics,
    - tone mark h,
    - final consonants,
    - tone numbers.
"""
3178
def __init__(self, **options):
    """
    Creates an instance of the CantoneseYaleOperator.

    @param options: extra options
    @keyword dbConnectInst: instance of a L{DatabaseConnector}, if none is
        given, default settings will be assumed.
    @keyword strictSegmentation: if C{True} segmentation (using
        L{segment()}) and thus decomposition (using L{decompose()}) will
        raise an exception if an alphabetic string is parsed which can not
        be segmented into single reading entities. If C{False} the aforesaid
        string will be returned unsegmented.
    @keyword toneMarkType: if set to C{'Diacritics'} tones will be marked
        using diacritic marks and the character I{h} for low tones, if set
        to C{'Numbers'} appended numbers from 1 to 6 will be used to mark
        tones, if set to C{'None'} no tone marks will be used and no tonal
        information will be supplied at all. The value C{'Internal'} is
        accepted as well.
    @keyword missingToneMark: if set to C{'noinfo'} no tone information
        will be deduced when no tone mark is found (takes on value C{None}),
        if set to C{'ignore'} this entity will not be valid and for
        segmentation the behaviour defined by C{'strictSegmentation'} will
        take effect. This option is only valid if the tone mark type is
        C{'Numbers'}, C{'Internal'} or C{'None'}.
    @keyword YaleFirstTone: tone in Yale which the first tone for tone marks
        with numbers should be mapped to. Value can be C{'1stToneLevel'} to
        map to the level tone with contour 55 or C{'1stToneFalling'} to map
        to the falling tone with contour 53.
    @raise ValueError: if an unsupported value is given for an option or
        C{missingToneMark} is combined with an unsuitable tone mark type.
    """
    super(CantoneseYaleOperator, self).__init__(**options)

    # check which tone marks to use
    if 'toneMarkType' in options:
        if options['toneMarkType'] not in ['Diacritics', 'Numbers', 'None',
            'Internal']:
            raise ValueError("Invalid option '" \
                + str(options['toneMarkType']) \
                + "' for keyword 'toneMarkType'")
        self.optionValue['toneMarkType'] = options['toneMarkType']

    # check behaviour on missing tone marks
    if 'missingToneMark' in options:
        # consult the effective tone mark type, so the check also works if
        # 'toneMarkType' was not passed and the default applies
        if self.getOption('toneMarkType') not in ['Numbers', 'Internal',
            'None']:
            raise ValueError("keyword 'missingToneMark' is only valid if" \
                + " tone mark type is set to 'Numbers', 'Internal' and "\
                + "'None'")

        if options['missingToneMark'] not in ['noinfo', 'ignore']:
            raise ValueError("Invalid option '" \
                + str(options['missingToneMark']) \
                + "' for keyword 'missingToneMark'")
        self.optionValue['missingToneMark'] = options['missingToneMark']

    # check which tone the number 1 stands for
    if 'YaleFirstTone' in options:
        if options['YaleFirstTone'] not in ['1stToneLevel',
            '1stToneFalling', 'None']:
            raise ValueError("Invalid option '" \
                + unicode(options['YaleFirstTone']) \
                + "' for keyword 'YaleFirstTone'")
        self.optionValue['YaleFirstTone'] = options['YaleFirstTone']

    # reverse lookup from (tone mark, h character) to the tone's name
    if self.getOption('toneMarkType') != 'None':
        self.toneMarkLookup = {}
        for tone in self.getTones():
            toneMarks = self.TONE_MARK_MAPPING[
                self.getOption('toneMarkType')][tone]
            self.toneMarkLookup[toneMarks] = tone
        if self.getOption('toneMarkType') == 'Numbers':
            # both first tones share the mark '1', let the option decide
            self.toneMarkLookup[('1', '')] = self.getOption('YaleFirstTone')

    # regexes locating the primary tone mark and the 'h' character
    if self.getOption('toneMarkType') != 'None':
        self.primaryToneRegex = re.compile(r"(?i)^[a-z]+([" \
            + r"".join(set([re.escape(toneMark) for toneMark, hChar \
                in self.TONE_MARK_MAPPING[self.getOption('toneMarkType')]\
                    .values()])) \
            + r"]?)")
        self.hCharRegex = re.compile(r"^.*(?:[aeiou]|m|ng)(h)")

    # matches letters, including precomposed ones carrying a tone diacritic,
    # optionally followed by one digit
    self.readingEntityRegex = re.compile(u'(?i)((?:' \
        + '|'.join([re.escape(v) for v in self._getDiacriticVowels()]) \
        + u'|[a-z])+[0123456]?)')
3267
3268 @classmethod
3275
@staticmethod
def _getDiacriticVowels():
    """
    Collects the characters that can carry a Cantonese Yale tone diacritic.

    Each of the characters I{a, e, i, o, u, m, n, h} is combined with every
    non-empty tone mark of the C{'Diacritics'} mapping and the result is
    normalized to NFC.

    @rtype: set of str
    @return: set of characters with diacritical marks
    """
    toneMarks = [toneMark for toneMark, _ \
        in CantoneseYaleOperator.TONE_MARK_MAPPING['Diacritics'].values()
        if toneMark]
    return set(unicodedata.normalize("NFC", baseChar + toneMark)
        for baseChar in 'aeioumnh' for toneMark in toneMarks)
3294
@classmethod
def guessReadingDialect(cls, string, includeToneless=False):
    """
    Takes a string written in Cantonese Yale and guesses the reading
    dialect.

    Currently only the option C{'toneMarkType'} is guessed. Unless
    C{'includeToneless'} is set to C{True} only the tone mark types
    C{'Diacritics'} and C{'Numbers'} are considered as the latter one can
    also represent the state of missing tones.

    @type string: str
    @param string: Cantonese Yale string
    @type includeToneless: bool
    @param includeToneless: if C{True} the tone mark type C{'None'} is
        reported when less than 10% of the entities carry a tone mark, or
        when no entity is found at all
    @rtype: dict
    @return: dictionary of basic keyword settings
    """
    # split the NFD form into syllable-like entities
    entities = cls.syllableRegex.findall(
        unicodedata.normalize("NFD", unicode(string)))

    # count entities per kind of tone mark
    diacriticEntityCount = 0
    numberEntityCount = 0
    for entity in entities:
        if entity[-1] in '123456':
            numberEntityCount += 1
        elif 'h' in entity[1:]:
            # an 'h' behind the first character is counted as tone mark
            diacriticEntityCount += 1
        else:
            for diacriticMark in [u'\u0304', u'\u0301', u'\u0300']:
                if diacriticMark in entity:
                    diacriticEntityCount += 1
                    break

    if includeToneless:
        markedCount = max(diacriticEntityCount, numberEntityCount)
        # without any entity there is no tone mark either; checking this
        # first also avoids dividing by zero
        if not entities or (1.0 * markedCount / len(entities)) < 0.1:
            return {'toneMarkType': 'None'}

    if diacriticEntityCount > numberEntityCount:
        toneMarkType = 'Diacritics'
    else:
        toneMarkType = 'Numbers'

    return {'toneMarkType': toneMarkType}
3345
def getTones(self):
    """
    Gets the tones supported under the current options.

    @rtype: list
    @return: copy of C{TONES}, followed by C{None} if the tone mark type is
        C{'None'}, or if it is C{'Numbers'} or C{'Internal'} while option
        C{'missingToneMark'} is C{'noinfo'}
    """
    missingToneMark = self.getOption('missingToneMark')
    toneMarkType = self.getOption('toneMarkType')

    supportedTones = list(self.TONES)
    if toneMarkType == 'None':
        supportedTones.append(None)
    elif missingToneMark == 'noinfo' \
        and toneMarkType in ['Numbers', 'Internal']:
        supportedTones.append(None)
    return supportedTones
3353
def compose(self, readingEntities):
    """
    Joins the given entities into one string without any separator.

    @type readingEntities: list of str
    @param readingEntities: list of basic entities or other content
    @rtype: str
    @return: concatenation of all entities
    """
    return "".join(readingEntities)
3356
def getTonalEntity(self, plainEntity, tone):
    """
    Gets the entity with tone mark for the given plain entity and tone.

    For tone mark type C{'Diacritics'} the diacritic is placed on the first
    vowel and the character I{h}, if the tone asks for it, directly behind
    the vowels. Entities without vowel carry the diacritic on their first
    character and the I{h} at their end. For C{'Numbers'} and
    C{'Internal'} the tone mark is appended.

    @type plainEntity: str
    @param plainEntity: entity without tonal information
    @type tone: str
    @param tone: tone, one of the values of L{getTones()}
    @rtype: str
    @return: entity with appropriate tone
    @raise InvalidEntityError: if the tone is not supported, or if for
        diacritics the entity is empty or vowels do not form a single run.
    @todo Lang: Place the tone mark on the first character of the nucleus?
    """
    if tone not in self.getTones():
        raise InvalidEntityError("Invalid tone information given for '" \
            + plainEntity + "': '" + unicode(tone) + "'")

    if self.getOption('toneMarkType') == 'None':
        return plainEntity

    toneMark, hChar = self.TONE_MARK_MAPPING[
        self.getOption('toneMarkType')][tone]

    if self.getOption('toneMarkType') == 'Diacritics':
        # split into leading non-vowels, vowels and trailing non-vowels
        matchObj = re.match('(?i)^([^aeiou]*?)([aeiou]*)([^aeiou]*)$',
            plainEntity)
        # the pattern also matches the empty string which offers no
        # character to carry the diacritic
        if not matchObj or not plainEntity:
            raise InvalidEntityError("Invalid entity given for '" \
                + plainEntity + "'")

        nonVowelH, vowels, nonVowelT = matchObj.groups()

        if vowels:
            vowels = unicodedata.normalize("NFC", vowels[0] + toneMark \
                + vowels[1:] + hChar)
        else:
            # no vowel, e.g. a syllabic nasal
            nonVowelT = unicodedata.normalize("NFC", nonVowelT[0] \
                + toneMark + nonVowelT[1:] + hChar)

        return nonVowelH + vowels + nonVowelT
    elif self.getOption('toneMarkType') in ['Numbers', 'Internal']:
        return plainEntity + toneMark
3393
def splitEntityTone(self, entity):
    """
    Separates the tonal information from the given entity.

    The entity is processed in its decomposed form (NFD). First the
    primary tone mark is cut out, then the character I{h} following a
    vowel, I{m} or I{ng}; the pair of both is looked up to find the tone.

    The plain entity returned will always be in Unicode's
    I{Normalization Form C} (NFC, see
    U{http://www.unicode.org/reports/tr15/}).

    @type entity: str
    @param entity: entity with tonal information
    @rtype: tuple
    @return: plain entity without tone mark and the entity's tone, C{None}
        as tone if the tone mark type is C{'None'}
    @raise InvalidEntityError: if the entity does not match or the tone
        marks found do not belong to a known tone.
    """
    decomposed = unicodedata.normalize("NFD", unicode(entity))
    if self.getOption('toneMarkType') == 'None':
        return unicodedata.normalize("NFC", decomposed), None

    # primary tone mark: diacritic or number
    toneMatch = self.primaryToneRegex.search(decomposed)
    if not toneMatch:
        raise InvalidEntityError("Invalid entity or no tone information " \
            "given for '" + decomposed + "'")
    toneMark = toneMatch.group(1)
    markStart, markEnd = toneMatch.span(1)
    stripped = decomposed[:markStart] + decomposed[markEnd:]

    # optional 'h' character
    hChar = ''
    hMatch = self.hCharRegex.search(stripped)
    if hMatch:
        hChar = hMatch.group(1)
        hStart, hEnd = hMatch.span(1)
        stripped = stripped[:hStart] + stripped[hEnd:]

    if (toneMark, hChar) not in self.toneMarkLookup:
        raise InvalidEntityError("Invalid entity or no tone information " \
            "given for '" + decomposed + "'")

    return unicodedata.normalize("NFC", stripped), \
        self.toneMarkLookup[(toneMark, hChar)]
3438
def getPlainReadingEntities(self):
    """
    Gets the values of column C{CantoneseYale} of table
    C{CantoneseYaleSyllables}.

    @rtype: set
    @return: set of the values selected from the database
    """
    return set(self.db.selectScalars(select(
        [self.db.tables['CantoneseYaleSyllables'].c.CantoneseYale])))
3442
def getOnsetRhyme(self, plainSyllable):
    """
    Splits the given plain syllable into onset (initial) and rhyme (final).

    The split is taken from L{getOnsetNucleusCoda()}, nucleus and coda are
    joined to form the rhyme.

    @type plainSyllable: str
    @param plainSyllable: syllable without tone marks
    @rtype: tuple of str
    @return: tuple of entity onset and rhyme
    @raise InvalidEntityError: if the entity is invalid.
    """
    initial, nucleus, coda = self.getOnsetNucleusCoda(plainSyllable)
    rhyme = nucleus + coda
    return initial, rhyme
3458
def getOnsetNucleusCoda(self, plainSyllable):
    """
    Looks up onset (initial), nucleus and coda of a plain syllable, the
    latter two building the rhyme (final).

    The lower-cased syllable is searched in table
    C{CantoneseYaleInitialNucleusCoda}.

    @type plainSyllable: str
    @param plainSyllable: syllable in the Yale romanisation system without
        tone marks
    @rtype: tuple of str
    @return: tuple of syllable onset, nucleus and coda
    @raise InvalidEntityError: if no entry exists for the syllable.
    @todo Impl: Finals I{ing, ik, ung, uk, eun, eut, a} differ from other
        finals with same vowels. What semantics/view do we want to provide
        on the syllable parts?
    """
    mapping = self.db.tables['CantoneseYaleInitialNucleusCoda']
    columns = [mapping.c.CantoneseYaleInitial,
        mapping.c.CantoneseYaleNucleus, mapping.c.CantoneseYaleCoda]
    row = self.db.selectRow(
        select(columns, mapping.c.CantoneseYale == plainSyllable.lower()))
    if row:
        return (row[0], row[1], row[2])

    raise InvalidEntityError("'" + plainSyllable \
        + "' not a valid plain Cantonese Yale syllable'")
3489
3492 u"""
3493 Provides an operator on strings of the Cantonese language written in the
3494 I{International Phonetic Alphabet} (I{IPA}).
3495
3496 CantoneseIPAOperator does not supply the same closed set of syllables as
3497 other L{ReadingOperator}s as IPA provides different ways to represent
3498 pronunciation. Because of that a user defined IPA syllable will not easily
3499 map to another transcription system and thus only basic support is provided
3500 for this direction.
3501
3502 This operator supplies an additional method L{getOnsetRhyme()} which allows
3503 breaking down syllables into their onset and rhyme.
3504
3505 Features:
3506 - Tones can be marked either with tone numbers (1-6), tone contour
3507 numbers (e.g. 55), IPA tone bar characters or IPA diacritics,
3508 - choice between high level and high falling tone for number marks,
3509 - flexible set of tones,
3510 - support for stop tones,
3511 - handling of variable vowel length for tone contours of stop tone
3512 syllables and
3513 - splitting of syllables into onset and rhyme.
3514
3515 Tones
3516 =====
3517 Tones in IPA can be expressed using different schemes. The following schemes
3518 are implemented here:
3519 - Numbers, tone numbers for the six-tone scheme,
3520 - ChaoDigits, numbers displaying the levels of tone contours, e.g.
3521 55 for the high level tone,
3522 - IPAToneBar, IPA modifying tone bar characters, e.g. ɛw˥˥,
3523 - None, no support for tone marks
3524
3525 Sources
3526 =======
3527 - Robert S. Bauer, Paul K. Benedikt: Modern Cantonese Phonology
3528 (摩登廣州話語音學). Walter de Gruyter, 1997, ISBN 3-11-014893-5.
3529 - Robert S. Bauer: Hong Kong Cantonese Tone Contours. In: Studies in
3530 Cantonese Linguistics. Linguistic Society of Hong Kong, 1998,
3531 ISBN 962-7578-04-5.
3532
3533 @see:
3534 - Modern Cantonese Phonology (Preview):
3535 U{http://books.google.de/books?id=QWNj5Yj6_CgC}
3536
3537 @todo Lang: Shed more light on tone sandhi in Cantonese language.
3538 @todo Impl: Implement diacritics for Cantonese Tones. On which part of the
3539 syllable should they be placed. Document.
3540 @todo Lang: Binyām 變音
3541 @todo Impl: What are the semantics of non-level tones given for unreleased
3542 stop finals? Take high rising Binyam into account.
3543 """
# Name of the reading this operator belongs to.
READING_NAME = "CantoneseIPA"

# Names of the seven basic tones.
TONES = ['HighLevel', 'MidLevel', 'MidLowLevel', 'HighRising',
    'MidLowRising', 'MidLowFalling', 'HighFalling']

STOP_TONES = {'HighStopped': 'HighLevel', 'MidStopped': 'MidLevel',
    'MidLowStopped': 'MidLowLevel'}
"""Cantonese stop tone mapping to general level tones."""

STOP_TONES_EXPLICIT = {'HighStopped_Short': ('HighLevel', 'S'),
    'MidStopped_Short': ('MidLevel', 'S'),
    'MidLowStopped_Short': ('MidLowLevel', 'S'),
    'HighStopped_Long': ('HighLevel', 'L'),
    'MidStopped_Long': ('MidLevel', 'L'),
    'MidLowStopped_Long': ('MidLowLevel', 'L')}
"""
Cantonese stop tone mapping to general level tones with stop tones realised
for explicit marking short/long pronunciation.
"""

# Tone mark type -> tone mark -> tone to prefer if several tones share the
# mark.
TONE_MARK_PREFER = {'Numbers': {'1': 'HighLevel'},
    'ChaoDigits': {}, 'IPAToneBar': {}, 'Diacritics': {}}

# Tone mark type -> tone name -> tone mark. Stop tones are listed under their
# explicit names only; there is no entry for type 'Diacritics'.
TONE_MARK_MAPPING = {'Numbers': {'HighLevel': '1', 'MidLevel': '3',
        'MidLowLevel': '6', 'HighRising': '2', 'MidLowRising': '5',
        'MidLowFalling': '4', 'HighFalling': '1', 'HighStopped_Short': '1',
        'MidStopped_Short': '3', 'MidLowStopped_Short': '6',
        'HighStopped_Long': '1', 'MidStopped_Long': '3',
        'MidLowStopped_Long': '6'},
    'ChaoDigits': {'HighLevel': '55', 'MidLevel': '33',
        'MidLowLevel': '22', 'HighRising': '25', 'MidLowRising': '23',
        'MidLowFalling': '21', 'HighFalling': '52',
        'HighStopped_Short': '5', 'MidStopped_Short': '3',
        'MidLowStopped_Short': '2', 'HighStopped_Long': '55',
        'MidStopped_Long': '33', 'MidLowStopped_Long': '22'},
    'IPAToneBar': {'HighLevel': u'˥˥', 'MidLevel': u'˧˧',
        'MidLowLevel': u'˨˨', 'HighRising': u'˨˥', 'MidLowRising': u'˨˧',
        'MidLowFalling': u'˨˩', 'HighFalling': u'˥˨',
        'HighStopped_Short': u'˥', 'MidStopped_Short': u'˧',
        'MidLowStopped_Short': u'˨', 'HighStopped_Long': u'˥˥',
        'MidStopped_Long': u'˧˧', 'MidLowStopped_Long': u'˨˨'},

    }
3587
def __init__(self, **options):
    """
    Creates an instance of the CantoneseIPAOperator.

    @param options: extra options
    @keyword dbConnectInst: instance of a L{DatabaseConnector}, if none is
        given, default settings will be assumed.
    @keyword toneMarkType: type of tone marks, one out of C{'Numbers'},
        C{'ChaoDigits'}, C{'IPAToneBar'}, C{'Diacritics'}, C{'None'}
    @keyword 1stToneName: tone for mark 1 under tone mark type C{'Numbers'},
        either I{'HighLevel'} or I{'HighFalling'}.
    @keyword stopTones: if set to C{'none'} the basic 6 (7) tones will be
        used and stop tones will be reported as one of them, if set to
        C{'general'} the three stop tones will be included, if set to
        C{'explicit'} the short and long forms will be explicitly supported.
    @raise NotImplementedError: if the tone mark type is C{'Diacritics'}.
    @raise ValueError: if an unsupported value is given for an option or
        C{1stToneName} is given for a tone mark type other than
        C{'Numbers'}.
    """
    super(CantoneseIPAOperator, self).__init__(**options)

    if self.getOption('toneMarkType') == 'Diacritics':
        raise NotImplementedError()

    if '1stToneName' in options:
        firstToneName = options['1stToneName']
        if self.getOption('toneMarkType') != 'Numbers':
            raise ValueError("keyword '1stToneName' is only valid if" \
                + " tone mark type is set to 'Numbers'")
        if firstToneName not in self.TONES:
            raise ValueError("Invalid option '" \
                + str(firstToneName) \
                + "' for keyword '1stToneName'")

        self.optionValue['toneMarkPrefer']['1'] = firstToneName

    if 'stopTones' in options:
        stopTones = options['stopTones']
        if stopTones not in ['none', 'general', 'explicit']:
            raise ValueError("Invalid option '" \
                + str(stopTones) + "' for keyword 'stopTones'")

        self.optionValue['stopTones'] = stopTones

    # level tone -> vowel length -> explicit stop tone
    self.stopToneLookup = {}
    for explicitTone, toneData in self.STOP_TONES_EXPLICIT.items():
        levelTone, vowelLength = toneData
        self.stopToneLookup.setdefault(levelTone, {})[vowelLength] \
            = explicitTone
    # general stop tones share the entry of their level tone
    for generalTone, levelTone in self.STOP_TONES.items():
        self.stopToneLookup[generalTone] = self.stopToneLookup[levelTone]
3640
3641 @classmethod
3647
def getTones(self):
    """
    Gets the tones supported under the current options.

    @rtype: list
    @return: copy of C{TONES}, extended by the general stop tones for
        option C{'stopTones'} set to C{'general'} or by the explicit stop
        tones for C{'explicit'}, followed by C{None} if option
        C{'missingToneMark'} is C{'noinfo'} or the tone mark type is
        C{'None'}
    """
    tones = self.TONES[:]
    stopTones = self.getOption('stopTones')
    if stopTones == 'general':
        tones.extend(self.STOP_TONES.keys())
    elif stopTones == 'explicit':
        tones.extend(self.STOP_TONES_EXPLICIT.keys())
    if self.getOption('missingToneMark') == 'noinfo' \
        or self.getOption('toneMarkType') == 'None':
        tones.append(None)

    return tones
3659
def getPlainReadingEntities(self):
    """
    Gets the values of column C{IPA} of table C{CantoneseIPAInitialFinal}.

    @rtype: set
    @return: set of the values selected from the database
    """
    return set(self.db.selectScalars(select(
        [self.db.tables['CantoneseIPAInitialFinal'].c.IPA])))
3663
def getOnsetRhyme(self, plainSyllable):
    """
    Looks up onset (initial) and rhyme (final) of a plain syllable in table
    C{CantoneseIPAInitialFinal}.

    @type plainSyllable: str
    @param plainSyllable: syllable in IPA without tone marks
    @rtype: tuple of str
    @return: tuple of syllable onset and rhyme
    @raise InvalidEntityError: if no entry exists for the syllable.
    """
    mapping = self.db.tables['CantoneseIPAInitialFinal']
    query = select([mapping.c.IPAInitial, mapping.c.IPAFinal],
        mapping.c.IPA == plainSyllable)
    row = self.db.selectRow(query)
    if row:
        return (row[0], row[1])

    raise InvalidEntityError("'" + plainSyllable \
        + "' not a valid IPA form in this system'")
3683
def getTonalEntity(self, plainEntity, tone):
    """
    Gets the entity with tone mark for the given plain entity and tone.

    The tone is first turned into its explicit form using
    L{getExplicitTone()}, the mark of that tone is then appended. No mark
    is added for tone C{None} or tone mark type C{'None'}.

    @type plainEntity: str
    @param plainEntity: entity without tonal information
    @type tone: str
    @param tone: tone, one of the values of L{getTones()}
    @rtype: str
    @return: entity with appropriate tone in NFC
    @raise InvalidEntityError: if the tone is not supported.
    """
    if tone not in self.getTones():
        raise InvalidEntityError("Invalid tone information given for '" \
            + plainEntity + "': '" + str(tone) + "'")
    if self.getOption('toneMarkType') == "None" or tone is None:
        entity = plainEntity
    else:
        # marks are stored for explicit tones
        tone = self.getExplicitTone(plainEntity, tone)

        entity = plainEntity \
            + self.TONE_MARK_MAPPING[self.getOption('toneMarkType')][tone]
    return unicodedata.normalize("NFC", entity)
3697
3699
def splitEntityTone(self, entity):
    """
    Splits the entity into an entity without tone mark and the entity's
    tone.

    The entity is processed in its decomposed form (NFD), the plain entity
    is returned in NFC.

    @type entity: str
    @param entity: entity with tonal information
    @rtype: tuple
    @return: plain entity without tone mark and the tone as returned by
        L{getBaseToneForToneMark()}, C{None} as tone if no tone
        information is available
    @raise InvalidEntityError: if no tone mark is found and option
        C{'missingToneMark'} is not C{'noinfo'}.
    """
    entity = unicodedata.normalize("NFD", unicode(entity))

    toneMarkType = self.getOption('toneMarkType')
    if toneMarkType == 'None':
        return unicodedata.normalize("NFC", entity), None

    matchObj = self.TONE_MARK_REGEX[toneMarkType].search(entity)
    if matchObj:
        toneMark = matchObj.group(1)
        # cut out the matched span only, the same characters found
        # elsewhere in the entity have to stay
        plainEntity = entity[:matchObj.start(1)] \
            + entity[matchObj.end(1):]

        baseTone = self.getBaseToneForToneMark(toneMark)

        return unicodedata.normalize("NFC", plainEntity), baseTone
    elif self.getOption('missingToneMark') == 'noinfo':
        return unicodedata.normalize("NFC", entity), None

    raise InvalidEntityError("Invalid entity given for '" + entity + "'")
3719
def getExplicitTone(self, plainSyllable, baseTone):
    """
    Gets the explicit tone for the given plain syllable and base tone.

    In case the 6 (7) base tones are used, the stop tone value can be
    deduced from the given syllable. The stop tone returned will be even
    more precise in denoting the vowel length that influences the tone
    contour.

    @type plainSyllable: str
    @param plainSyllable: syllable without tonal information
    @type baseTone: str
    @param baseTone: tone
    @rtype: str
    @return: explicit tone
    @raise InvalidEntityError: if the syllable is unknown, or if a general
        stop tone is given for a syllable without unreleased final.
    """
    # only level tones and general stop tones have a stop tone variant
    if baseTone in self.stopToneLookup:
        table = self.db.tables['CantoneseIPAInitialFinal']
        entry = self.db.selectRow(
            select([table.c.UnreleasedFinal, table.c.VowelLength],
                table.c.IPA == plainSyllable))
        if not entry:
            raise InvalidEntityError("'" + plainSyllable \
                + "' not a valid IPA form in this system")
        unreleasedFinal, vowelLength = entry
        if unreleasedFinal:
            return self.stopToneLookup[baseTone][vowelLength]

    if baseTone in self.STOP_TONES:
        # a stop tone was given but the syllable has no unreleased final
        raise InvalidEntityError("Invalid tone information given for '" \
            + plainSyllable + "': '" + str(baseTone) + "'")

    return baseTone
3753
def getBaseToneForToneMark(self, toneMark):
    """
    Gets the base tone (one of the 6/7 general tones) for the given tone
    mark.

    The reverse lookup from tone mark to tone is built on first use. For
    option C{'stopTones'} set to C{'none'} explicit stop tones are reported
    as their level tone, for C{'general'} as their general stop tone and
    otherwise under their explicit name. If several tones share a mark the
    tone named in C{TONE_MARK_PREFER} wins, else the first one found.

    @type toneMark: str
    @param toneMark: tone mark representation of the tone
    @rtype: str
    @return: base tone
    @raise InvalidEntityError: if the toneMark does not exist.
    """
    if self.toneMarkLookup is None:
        toneMarkType = self.getOption('toneMarkType')
        preferredTones = self.TONE_MARK_PREFER[toneMarkType]
        # level tone -> general stop tone, e.g. 'HighLevel': 'HighStopped'
        generalStopTones = dict((levelTone, stopTone) \
            for stopTone, levelTone in self.STOP_TONES.items())

        self.toneMarkLookup = {}
        for tone in self.TONE_MARK_MAPPING[toneMarkType]:
            mark = self.TONE_MARK_MAPPING[toneMarkType][tone]

            # tones outside of TONES are explicit stop tones, report them
            # in the form asked for by option 'stopTones'
            reportTone = tone
            if reportTone not in self.TONES:
                if self.getOption('stopTones') == 'general':
                    levelTone, _ = self.STOP_TONES_EXPLICIT[tone]
                    reportTone = generalStopTones[levelTone]
                elif self.getOption('stopTones') == 'none':
                    reportTone, _ = self.STOP_TONES_EXPLICIT[tone]

            if mark not in self.toneMarkLookup \
                or (mark in preferredTones \
                    and preferredTones[mark] == tone):
                self.toneMarkLookup[mark] = reportTone

    if toneMark in self.toneMarkLookup:
        return self.toneMarkLookup[toneMark]
    else:
        raise InvalidEntityError("Invalid tone mark given with '" \
            + toneMark + "'")
3790