1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 """
19 Provides the building methods for the cjklib package.
20
21 Each table that needs to be created has to be implemented by a L{TableBuilder}.
22 The L{DatabaseBuilder} is the central instance for managing the build process.
23 As the creation of a table can depend on other tables the DatabaseBuilder keeps
24 track of dependencies to process a build in the correct order.
25
26 Building is tested on the following storage methods:
27 - SQLite
28 - MySQL
29
30 Some L{TableBuilder} implementations aren't used by the CJK library but are
31 provided here for additional usage.
32
33 For MS Windows the default versions provided seem to be a "X{narrow build}"
34 and do not support characters outside the BMP (see e.g.
35 U{http://wordaligned.org/articles/narrow-python}). Currently no Unicode
36 characters outside the BMP will thus be supported on Windows platforms.
37
38 Examples
39 ========
40 The following examples should give a quick view into how to use this
41 package.
42 - Create the DatabaseBuilder object with default settings (read from
43 cjklib.conf or using 'cjklib.db' in same directory as default):
44
45 >>> from cjklib import build
46 >>> dbBuilder = build.DatabaseBuilder(dataPath=['./cjklib/data/'])
47 Removing conflicting builder(s) 'CharacterVariantBMPBuilder' in favour
48 of 'CharacterVariantBuilder'
49 Removing conflicting builder(s) 'SlimUnihanBuilder', 'UnihanBuilder',
50 'UnihanBMPBuilder' in favour of 'SlimUnihanBMPBuilder'
51 Removing conflicting builder(s) 'StrokeCountBuilder' in favour of
52 'CombinedStrokeCountBuilder'
53 Removing conflicting builder(s) 'CharacterResidualStrokeCountBuilder' in
54 favour of 'CombinedCharacterResidualStrokeCountBuilder'
55
56 - Build the table of Jyutping syllables from a csv file:
57
58 >>> dbBuilder.build(['JyutpingSyllables'])
59 building table 'JyutpingSyllables' with builder
60 'JyutpingSyllablesBuilder'...
61 Reading table definition from file './cjklib/data/jyutpingsyllables.sql'
62 Reading table 'JyutpingSyllables' from file
63 './cjklib/data/jyutpingsyllables.csv'
64
65 @todo Impl: Further character domains: BIG5 (Taiwan), kIRG_GSource (Unicode,
66 Simplified Chinese), kIRG_JSource (Unicode, Japanese), kIRG_KPSource and
67 kIRG_KSource (Unicode, Korean), kIRG_TSource (Unicode, Traditional Chinese),
68 kIRG_VSource (Unicode, Vietnamese)
69 @todo Fix: On interruption (Ctrl+C) remove tables that were only created
70 because of dependencies.
71 """
72
73 import types
74 import locale
75 import sys
76 import re
77 import os.path
78 import xml.sax
79 import csv
80
81 from sqlalchemy import Table, Column, Integer, String, Text, Index
82 from sqlalchemy import select, union
83 from sqlalchemy.sql import text, func
84 from sqlalchemy.sql import and_, or_, not_
85 import sqlalchemy
86
87 from cjklib import dbconnector
88 from cjklib import characterlookup
89 from cjklib import exception
94 """
95 TableBuilder provides the abstract layout for classes that build a distinct
96 table.
97 """
98 PROVIDES = ''
99 """Contains the name of the table provided by this module."""
100 DEPENDS = []
101 """Contains the names of the tables needed for the build process."""
102
def __init__(self, dataPath=None, dbConnectInst=None, quiet=False):
    """
    Constructs the TableBuilder.

    @type dataPath: list of str
    @param dataPath: optional list of paths to the data file(s)
    @type dbConnectInst: instance
    @param dbConnectInst: instance of a L{DatabaseConnector}. If not given
        all sql code will be printed to stdout.
    @type quiet: bool
    @param quiet: if true no status information will be printed to stderr
    """
    # keep a reference to the database connection and the build options
    self.db = dbConnectInst
    self.quiet = quiet
    self.dataPath = dataPath
118
120 """
121 Build the table provided by the TableBuilder.
122
123 Methods should raise an IOError if reading a data source fails. The
124 L{DatabaseBuilder} knows how to handle this case and is able to proceed.
125 """
126 pass
127
129 """
130 Removes the table provided by the TableBuilder from the database.
131 """
132 pass
133
def findFile(self, fileNames, fileType=None):
    """
    Tries to locate a file with a given list of possible file names under
    the classes default data paths.

    For each file name every given path is checked and the first match is
    returned.

    @type fileNames: str/list of str
    @param fileNames: possible file names
    @type fileType: str
    @param fileType: textual type of file used in error msg
    @rtype: str
    @return: path to file of first match in search for existing file
    @raise IOError: if no file found
    """
    # accept a single file name as shorthand for a one-element list
    if not isinstance(fileNames, list):
        fileNames = [fileNames]
    for fileName in fileNames:
        for path in self.dataPath:
            filePath = os.path.join(os.path.expanduser(path), fileName)
            if os.path.exists(filePath):
                return filePath
    if fileType is None:
        fileType = "file"
    raise IOError("No " + fileType + " found for '" + self.PROVIDES \
        + "' under path(s) '" + "', '".join(self.dataPath) \
        + "' for file names '" + "', '".join(fileNames) + "'")
162
def buildTableObject(self, tableName, columns, columnTypeMap=None,
    primaryKeys=None):
    """
    Returns a SQLAlchemy Table object.

    @type tableName: str
    @param tableName: name of table
    @type columns: list of str
    @param columns: column names
    @type columnTypeMap: dict of str and object
    @param columnTypeMap: mapping of column name to a SQLAlchemy column
        type; columns not listed default to C{Text()}
    @type primaryKeys: list of str
    @param primaryKeys: list of primary key columns
    @rtype: object
    @return: SQLAlchemy Table
    """
    # avoid mutable default arguments; both parameters are only read, so
    # behaviour is unchanged
    if columnTypeMap is None:
        columnTypeMap = {}
    if primaryKeys is None:
        primaryKeys = []

    table = Table(tableName, self.db.metadata)
    for column in columns:
        if column in columnTypeMap:
            type_ = columnTypeMap[column]
        else:
            # fall back to a generic text column and tell the user
            type_ = Text()
            warn("column %s has no type, assuming default 'Text()'" \
                % column)
        table.append_column(Column(column, type_,
            primary_key=(column in primaryKeys)))

    return table
189
191 """
192 Returns a list of SQLAlchemy Index objects for the given table.
193
194 @type tableName: str
195 @param tableName: name of table
196 @type indexKeyList: list of list of str
197 @param indexKeyList: a list of key combinations
198 @rtype: object
199 @return: list of SQLAlchemy Index objects
200 """
201 indexList = []
202 table = Table(tableName, self.db.metadata, autoload=True)
203 for indexKeyColumns in indexKeyList:
204 indexName = tableName + '__' + '_'.join(indexKeyColumns)
205 indexList.append(Index(indexName,
206 *[table.c[column] for column in indexKeyColumns]))
207
208 return indexList
209
210
211 -class EntryGeneratorBuilder(TableBuilder):
212 """
213 Implements an abstract class for building a table from a generator
214 providing entries.
215 """
216 COLUMNS = []
217 """Columns that will be built"""
218 PRIMARY_KEYS = []
219 """Primary keys of the created table"""
220 INDEX_KEYS = []
221 """Index keys (not unique) of the created table"""
222 COLUMN_TYPES = {}
223 """Column types for created table"""
224
def getGenerator(self):
    """
    Returns the entry generator.

    Abstract method; child classes need to supply an iterator over the
    entries that should be inserted into the table.
    """
    pass
231
def getEntryDict(self, generator):
    """
    Materialises the given generator into a list of entry dictionaries.

    Entries may either be yielded as dicts, which are taken as is, or as
    sequences, which are mapped to dicts using L{COLUMNS} as keys.

    @type generator: generator
    @param generator: generator of table entries
    @rtype: list of dict
    @return: table entries as column-to-value dictionaries
    """
    entryList = []

    # inspect the first entry to decide how to treat the whole stream;
    # next() works on Python 2.6+ and 3, unlike generator.next()
    firstEntry = next(generator)
    if isinstance(firstEntry, dict):
        entryList.append(firstEntry)

        for newEntry in generator:
            entryList.append(newEntry)
    else:
        firstEntryDict = dict([(column, firstEntry[i]) \
            for i, column in enumerate(self.COLUMNS)])
        entryList.append(firstEntryDict)

        for newEntry in generator:
            entryDict = dict([(column, newEntry[i]) \
                for i, column in enumerate(self.COLUMNS)])
            entryList.append(entryDict)

    return entryList
252
254
255 generator = self.getGenerator()
256
257
258 table = self.buildTableObject(self.PROVIDES, self.COLUMNS,
259 self.COLUMN_TYPES, self.PRIMARY_KEYS)
260 table.create()
261
262
263
264
265
266
267
268
269
270
271 for newEntry in generator:
272 try:
273 table.insert(newEntry).execute()
274 except sqlalchemy.exceptions.IntegrityError, e:
275 warn(unicode(e))
276 raise
277
278 for index in self.buildIndexObjects(self.PROVIDES, self.INDEX_KEYS):
279 index.create()
280
282
283 table = Table(self.PROVIDES, self.db.metadata)
284 table.drop()
285
288 """A simple generator for a given list of elements."""
290 """
291 Initialises the ListGenerator.
292
293 @type entryList: list of str
294 @param entryList: user defined entry
295 """
296 self.entryList = entryList
297
299 for entry in self.entryList:
300 yield entry
301
306 """
307 Regular expression matching one entry in the Unihan database
308 (e.g. C{U+8682 kMandarin MA3 MA1 MA4}).
309 """
310 keySet = None
311 """Set of keys of the Unihan table."""
312
def __init__(self, fileName, useKeys=None, quiet=False):
    """
    Constructs the UnihanGenerator.

    @type fileName: str
    @param fileName: path to the Unihan database file
    @type useKeys: list
    @param useKeys: if given only these keys will be read from the table,
        otherwise all keys will be returned
    @type quiet: bool
    @param quiet: if true no status information will be printed to stderr
    """
    # pattern is pure ASCII, so a plain raw string matches unicode lines
    # just as well as the former ur"" literal
    self.ENTRY_REGEX = re.compile(r"U\+([0-9A-F]+)\s+(\w+)\s+(.+)\s*$")
    self.fileName = fileName
    self.quiet = quiet
    if useKeys is not None:
        self.limitKeys = True
        self.keySet = set(useKeys)
    else:
        self.limitKeys = False
333
335 """
336 Iterates over the Unihan entries.
337
338 The character definition is converted to the character's representation,
339 all other data is given as is. These are merged into one entry for each
340 character.
341 """
342
343
344 handle = self.getHandle()
345 entryIndex = -1
346 entry = {}
347 for line in handle:
348
349 if line.startswith('#'):
350 continue
351 resultObj = self.ENTRY_REGEX.match(line)
352 if not resultObj:
353 if not self.quiet:
354 warn("can't read line from Unihan.txt: '" + line + "'")
355 continue
356 unicodeHexIndex, key, value = resultObj.group(1, 2, 3)
357
358
359
360 if self.limitKeys and not key in self.keySet:
361 continue
362
363 if entryIndex != unicodeHexIndex and entryIndex != -1:
364 try:
365
366 char = unichr(int(entryIndex, 16))
367 yield(char, entry)
368 except ValueError:
369
370 pass
371
372 entry = {}
373 entryIndex = unicodeHexIndex
374 entry[key] = value
375
376 if entry:
377 try:
378
379 char = unichr(int(entryIndex, 16))
380 yield(char, entry)
381 except ValueError:
382
383 pass
384 handle.close()
385
387 """
388 Returns a handle of the Unihan database file.
389
390 @rtype: file
391 @return: file handle of the Unihan file
392 """
393 import zipfile
394 if zipfile.is_zipfile(self.fileName):
395 import StringIO
396 z = zipfile.ZipFile(self.fileName, "r")
397 handle = StringIO.StringIO(z.read("Unihan.txt").decode('utf-8'))
398 else:
399 import codecs
400 handle = codecs.open(self.fileName, 'r', 'utf-8')
401 return handle
402
404 """
405 Returns all keys read for the Unihan table.
406
407 If the whole table is read a seek through the file is needed first to
408 find all keys, otherwise the predefined set is returned.
409 @rtype: list
410 @return: list of column names
411 """
412 if not self.keySet:
413 if not self.quiet:
414 warn("looking for all keys in Unihan database...")
415 self.keySet = set()
416 handle = self.getHandle()
417 for line in handle:
418
419 if line.startswith('#'):
420 continue
421 resultObj = self.ENTRY_REGEX.match(line)
422 if not resultObj:
423 continue
424
425 unicodeHexIndex, key, value = resultObj.group(1, 2, 3)
426 self.keySet.add(key)
427 handle.close()
428 return list(self.keySet)
429
432 """Builds the Unihan database from the Unihan file provided by Unicode."""
434 """Generates the entries of the Unihan table."""
435
436 - def __init__(self, unihanGenerator):
437 """
438 Initialises the EntryGenerator.
439
440 @type unihanGenerator: instance
441 @param unihanGenerator: a L{UnihanGenerator} instance
442 """
443 self.unihanGenerator = unihanGenerator
444
def generator(self):
    """
    Provides all data of one character per entry.

    Missing columns are filled with C{None} so that every yielded dict has
    the same set of keys.
    """
    columns = self.unihanGenerator.keys()
    for char, entryDict in self.unihanGenerator.generator():
        newEntryDict = {UnihanBuilder.CHARACTER_COLUMN: char}
        for column in columns:
            # dict.get replaces the deprecated has_key idiom
            newEntryDict[column] = entryDict.get(column, None)
        yield newEntryDict
456
457 PROVIDES = 'Unihan'
458 CHARACTER_COLUMN = 'ChineseCharacter'
459 """Name of column for Chinese character key."""
460 COLUMN_TYPES = {CHARACTER_COLUMN: String(1), 'kCantonese': Text(),
461 'kFrequency': Integer(), 'kHangul': Text(), 'kHanyuPinlu': Text(),
462 'kJapaneseKun': Text(), 'kJapaneseOn': Text(), 'kKorean': Text(),
463 'kMandarin': Text(), 'kRSJapanese': Text(), 'kRSKanWa': Text(),
464 'kRSKangXi': Text(), 'kRSKorean': Text(),
465 'kSimplifiedVariant': Text(), 'kTotalStrokes': Integer(),
466 'kTraditionalVariant': Text(), 'kVietnamese': Text(),
467 'kZVariant': Text()}
468 unihanGenerator = None
469
470 - def __init__(self, dataPath, dbConnectInst, quiet=False):
473
475 """
476 Returns the L{UnihanGenerator}. Constructs it if needed.
477
478 @rtype: instance
479 @return: instance of a L{UnihanGenerator}
480 """
481 if not self.unihanGenerator:
482 path = self.findFile(['Unihan.txt', 'Unihan.zip'],
483 "Unihan database file")
484 self.unihanGenerator = UnihanGenerator(path)
485 if not self.quiet:
486 warn("reading file '" + path + "'")
487 return self.unihanGenerator
488
492
499
502 """
503 Builds the Unihan database from the Unihan file provided by Unicode for
504 characters from the Basic Multilingual Plane (BMP) with code values between
505 U+0000 and U+FFFF.
506
507 MySQL < 6 doesn't support true UTF-8, and uses a Version with max 3 bytes:
508 U{http://dev.mysql.com/doc/refman/6.0/en/charset-unicode.html}
509 """
511
512 - def __init__(self, unihanGenerator):
513 """
514 Initialises the EntryGenerator.
515
516 @type unihanGenerator: instance
517 @param unihanGenerator: a L{UnihanGenerator} instance
518 """
519 gen = unihanGenerator.generator()
520 self.entryGen = UnihanBuilder.EntryGenerator(unihanGenerator)\
521 .generator()
522
def generator(self):
    """
    Yields only those entries whose character code point lies below
    U+20000.

    NOTE(review): the class documentation speaks of the BMP
    (U+0000-U+FFFF) while the filter actually admits code points up to
    U+1FFFF - confirm which limit is intended.
    """
    upperLimit = int('20000', 16)
    for entryDict in self.entryGen:
        character = entryDict[UnihanBuilder.CHARACTER_COLUMN]
        if ord(character) < upperLimit:
            yield entryDict
530
531 - def __init__(self, dataPath, dbConnectInst, quiet=False):
534
538
541 """
542 Builds a slim version of the Unihan database.
543
544 Keys imported into the database are specified in L{INCLUDE_KEYS}.
545 """
546 INCLUDE_KEYS = ['kCompatibilityVariant', 'kCantonese', 'kFrequency',
547 'kHangul', 'kHanyuPinlu', 'kJapaneseKun', 'kJapaneseOn', 'kMandarin',
548 'kRSJapanese', 'kRSKanWa', 'kRSKangXi', 'kRSKorean', 'kSemanticVariant',
549 'kSimplifiedVariant', 'kSpecializedSemanticVariant', 'kTotalStrokes',
550 'kTraditionalVariant', 'kVietnamese', 'kXHC1983', 'kZVariant',
551 'kIICore', 'kGB0']
552 """Keys for that data is read into the Unihan table in database."""
553
562
565 """
566 Builds a slim version of the Unihan database from the Unihan file provided
567 by Unicode for characters from the Basic Multilingual Plane (BMP) with code
568 values between U+0000 and U+FFFF.
569
570 MySQL < 6 doesn't support true UTF-8, and uses a Version with max 3 bytes:
571 U{http://dev.mysql.com/doc/refman/6.0/en/charset-unicode.html}
572
573 Keys imported into the database are specified in L{INCLUDE_KEYS}.
574 """
575
576 pass
577
580 """
581 Builds the Kanjidic database from the Kanjidic2 XML file
582 U{http://www.csse.monash.edu.au/~jwb/kanjidic2/}.
583 """
585 """Extracts a list of given tags."""
def __init__(self, entryList, tagDict):
    """
    Initialises the XMLHandler.

    @type entryList: list
    @param entryList: list the extracted entry dicts are appended to
    @type tagDict: dict
    @param tagDict: mapping of tag keys to target tag name and conversion
        function
    """
    self.entryList = entryList
    self.tagDict = tagDict
    # parser state: path of currently open elements and active target tag
    self.currentElement = []
    self.targetTag = None
    self.targetTagTopElement = None
593
595 assert(len(self.currentElement) > 0)
596 assert(self.currentElement[-1] == name)
597 self.currentElement.pop()
598
599 if name == self.targetTagTopElement:
600 self.targetTag = None
601 self.targetTagTopElement = None
602
603 if name == 'character':
604 entryDict = {}
605 for tag, func in self.tagDict.values():
606 if tag in self.currentEntry:
607 entryDict[tag] = func(self.currentEntry[tag])
608 self.entryList.append(entryDict)
609
611 if self.targetTag:
612 if self.targetTag not in self.currentEntry:
613 self.currentEntry[self.targetTag] = []
614 self.currentEntry[self.targetTag].append(content)
615
617 self.currentElement.append(name)
618 if name == 'character':
619 self.currentEntry = {}
620 else:
621 if 'character' in self.currentElement:
622 idx = self.currentElement.index('character') + 1
623 tagHierachy = tuple(self.currentElement[idx:])
624
625 key = (tagHierachy, frozenset(attrs.items()))
626 if key in self.tagDict:
627 self.targetTagTopElement = name
628 self.targetTag, _ = self.tagDict[key]
629
631 """Generates the KANJIDIC table."""
633 """
634 Initialises the KanjidicGenerator.
635
636 @type dataPath: list of str
637 @param dataPath: optional list of paths to the data file(s)
638 """
639 self.dataPath = dataPath
640 self.tagDict = tagDict
641
643 """
644 Returns a handle of the KANJIDIC database file.
645
646 @rtype: file
647 @return: file handle of the KANJIDIC file
648 """
649 import gzip
650 if self.dataPath.endswith('.gz'):
651 import StringIO
652 z = gzip.GzipFile(self.dataPath, 'r')
653 handle = StringIO.StringIO(z.read())
654 else:
655 import codecs
656 handle = codecs.open(self.dataPath, 'r')
657 return handle
658
660 """Provides a pronunciation and a path to the audio file."""
661 entryList = []
662 xmlHandler = Kanjidic2Builder.XMLHandler(entryList, self.tagDict)
663
664 saxparser = xml.sax.make_parser()
665 saxparser.setContentHandler(xmlHandler)
666
667
668 saxparser.parse(self.getHandle())
669
670 for entry in entryList:
671 yield(entry)
672
673 PROVIDES = 'Kanjidic'
674 CHARACTER_COLUMN = 'ChineseCharacter'
675 """Name of column for Chinese character key."""
676 COLUMN_TYPES = {CHARACTER_COLUMN: String(1), 'NelsonRadical': Integer(),
677 'CharacterJapaneseOn': Text(), 'CharacterJapaneseKun': Text()}
678 KANJIDIC_TAG_MAPPING = {
679 (('literal', ), frozenset()): ('ChineseCharacter', lambda x: x[0]),
680 (('radical', 'rad_value'),
681 frozenset([('rad_type', 'nelson_c')])): ('NelsonCRadical',
682 lambda x: int(x[0])),
683 (('radical', 'rad_value'),
684 frozenset([('rad_type', 'nelson_n')])): ('NelsonNRadical',
685 lambda x: int(x[0])),
686
687
688
689
690 (('reading_meaning', 'rmgroup', 'reading'),
691 frozenset([('r_type', 'ja_on')])): ('CharacterJapaneseOn',
692 lambda x: ','.join(x)),
693 (('reading_meaning', 'rmgroup', 'reading'),
694 frozenset([('r_type', 'ja_kun')])): ('CharacterJapaneseKun',
695 lambda x: ','.join(x)),
696
697
698
699 (('misc', 'rad_name'), frozenset()): ('RadicalName',
700 lambda x: ','.join(x)),
701 (('reading_meaning', 'rmgroup', 'meaning'), frozenset()): ('Meaning_en',
702 lambda x: '/'.join(x)),
703 (('reading_meaning', 'rmgroup', 'meaning'),
704 frozenset([('m_lang', 'fr')])): ('Meaning_fr',
705 lambda x: '/'.join(x)),
706 (('reading_meaning', 'rmgroup', 'meaning'),
707 frozenset([('m_lang', 'es')])): ('Meaning_es',
708 lambda x: '/'.join(x)),
709 (('reading_meaning', 'rmgroup', 'meaning'),
710 frozenset([('m_lang', 'pt')])): ('Meaning_pt',
711 lambda x: '/'.join(x)),
712 }
713 """
714 Dictionary of tag keys mapping to a table column including a function
715 generating a string out of a list of entries given from the KANJIDIC entry.
716 The tag keys constist of a tuple giving the xml element hierarchy below the
717 'character' element and a set of attribute value pairs.
718 """
719
720 - def __init__(self, dataPath, dbConnectInst, quiet=False):
725
727 """
728 Returns the L{KanjidicGenerator}.
729
730 @rtype: instance
731 @return: instance of a L{KanjidicGenerator}
732 """
733 path = self.findFile(['kanjidic2.xml.gz', 'kanjidic2.xml'],
734 "KANJIDIC2 XML file")
735 if not self.quiet:
736 warn("reading file '" + path + "'")
737 return Kanjidic2Builder.KanjidicGenerator(path,
738 self.KANJIDIC_TAG_MAPPING).generator()
739
742 """
743 Provides an abstract class for building a table with a relation between a
744 Chinese character and another column using the Unihan database.
745 """
746 DEPENDS=['Unihan']
747 COLUMN_SOURCE = None
748 """
749 Unihan table column providing content for the table. Needs to be overwritten
750 in subclass.
751 """
752 COLUMN_TARGET = None
753 """
754 Column name for new data in created table. Needs to be overwritten in
755 subclass.
756 """
757 COLUMN_TARGET_TYPE = Text()
758 """
759 Type of column for new data in created table.
760 """
761 GENERATOR_CLASS = None
762 """
763 Class defining the iterator for creating the table's data. The constructor
764 needs to take two parameters for the list of entries from the Unihan
765 database and the 'quiet' flag. Needs to be overwritten in subclass.
766 """
767
768 - def __init__(self, dataPath, dbConnectInst, quiet=False):
777
785
791
794 """
795 Builds a mapping between characters and their stroke count using the Unihan
796 data.
797 """
799 """Extracts the character stroke count mapping."""
801 """
802 Initialises the StrokeCountExtractor.
803
804 @type entries: list of tuple
805 @param entries: character entries from the Unihan database
806 @type quiet: bool
807 @param quiet: if true no status information will be printed
808 """
809 self.entries = entries
810 self.quiet = quiet
811
813 """Provides one entry per radical and character."""
814 for character, strokeCount in self.entries:
815 yield(character, strokeCount)
816
817 PROVIDES = 'UnihanStrokeCount'
818 COLUMN_SOURCE = 'kTotalStrokes'
819 COLUMN_TARGET = 'StrokeCount'
820 COLUMN_TARGET_TYPE = Integer()
821 GENERATOR_CLASS = StrokeCountExtractor
822
825 """
826 Provides an abstract class for building a character radical mapping table
827 using the Unihan database.
828 """
830 """Generates the radical to character mapping from the Unihan table."""
832 """
833 Initialises the RadicalExtractor.
834
835 @type rsEntries: list of tuple
836 @param rsEntries: character radical entries from the Unihan database
837 @type quiet: bool
838 @param quiet: if true no status information will be printed
839 """
840 self.RADICAL_REGEX = re.compile(ur"(\d+)\.(\d+)")
841 self.rsEntries = rsEntries
842 self.quiet = quiet
843
845 """Provides one entry per radical and character."""
846 for character, radicalStroke in self.rsEntries:
847 matchObj = self.RADICAL_REGEX.match(radicalStroke)
848 if matchObj:
849 radical = matchObj.group(1)
850 yield(character, radical)
851 elif not self.quiet:
852 warn("unable to read radical information of character '" \
853 + character + "': '" + radicalStroke + "'")
854
855 COLUMN_TARGET = 'RadicalIndex'
856 COLUMN_TARGET_TYPE = Integer()
857 GENERATOR_CLASS = RadicalExtractor
858
861 """
862 Builds the character Kangxi radical mapping table from the Unihan database.
863 """
864 PROVIDES = 'CharacterKangxiRadical'
865 COLUMN_SOURCE = 'kRSKangXi'
866
869 """
870 Builds the character Dai Kan-Wa jiten radical mapping table from the Unihan
871 database.
872 """
873 PROVIDES = 'CharacterKanWaRadical'
874 COLUMN_SOURCE = 'kRSKanWa'
875
878 """
879 Builds the character Japanese radical mapping table from the Unihan
880 database.
881 """
882 PROVIDES = 'CharacterJapaneseRadical'
883 COLUMN_SOURCE = 'kRSJapanese'
884
887 """
888 Builds the character Korean radical mapping table from the Unihan
889 database.
890 """
891 PROVIDES = 'CharacterKoreanRadical'
892 COLUMN_SOURCE = 'kRSKorean'
893
896 """
897 Builds a character variant mapping table from the Unihan database.
898 """
900 """Generates the character to variant mapping from the Unihan table."""
901
902
903 HEX_INDEX_REGEX = re.compile(ur"\s*U\+([0-9A-F]+)\s*$")
904 MULT_HEX_INDEX_REGEX = re.compile(ur"\s*(U\+([0-9A-F]+)( |(?=$)))+\s*$")
905 MULT_HEX_INDEX_FIND_REGEX = re.compile(ur"U\+([0-9A-F]+)(?: |(?=$))")
906 SEMANTIC_REGEX = re.compile(ur"(U\+[0-9A-F]+(<\S+)?( |(?=$)))+$")
907 SEMANTIC_FIND_REGEX = re.compile(ur"U\+([0-9A-F]+)(?:<\S+)?(?: |(?=$))")
908 ZVARIANT_REGEX = re.compile(ur"\s*U\+([0-9A-F]+)(?:\:\S+)?\s*$")
909
910 VARIANT_REGEX_MAPPING = {'C': (HEX_INDEX_REGEX, HEX_INDEX_REGEX),
911 'M': (SEMANTIC_REGEX, SEMANTIC_FIND_REGEX),
912 'S': (MULT_HEX_INDEX_REGEX, MULT_HEX_INDEX_FIND_REGEX),
913 'P': (SEMANTIC_REGEX, SEMANTIC_FIND_REGEX),
914 'T': (MULT_HEX_INDEX_REGEX, MULT_HEX_INDEX_FIND_REGEX),
915 'Z': (ZVARIANT_REGEX, ZVARIANT_REGEX)}
916 """
917 Mapping of entry types to regular expression describing the entry's
918 pattern.
919 """
920
def __init__(self, variantEntries, typeList, quiet=False):
    """
    Initialises the VariantGenerator.

    @type variantEntries: list of tuple
    @param variantEntries: character variant entries from the Unihan
        database
    @type typeList: list of str
    @param typeList: variant types in the order given in tableEntries
    @type quiet: bool
    @param quiet: if true no status information will be printed
    """
    self.quiet = quiet
    self.typeList = typeList
    self.variantEntries = variantEntries
936
938 """Provides one entry per variant and character."""
939 for entries in self.variantEntries:
940 character = entries[0]
941 for i, variantType in enumerate(self.typeList):
942 variantInfo = entries[i+1]
943 if variantInfo:
944
945 matchR, findR = self.VARIANT_REGEX_MAPPING[variantType]
946 if matchR.match(variantInfo):
947
948 variantIndices = findR.findall(variantInfo)
949 for unicodeHexIndex in variantIndices:
950 try:
951 variant = unichr(int(unicodeHexIndex, 16))
952 yield(character, variant, variantType)
953 except ValueError:
954
955
956 pass
957 elif not self.quiet:
958
959 warn('unable to read variant information of ' \
960 + "character '" + character + "' for type '" \
961 + variantType + "': '" + variantInfo + "'")
962
963 PROVIDES = 'CharacterVariant'
964 DEPENDS=['Unihan']
965
966 COLUMN_SOURCE_ABBREV = {'kCompatibilityVariant': 'C',
967 'kSemanticVariant': 'M', 'kSimplifiedVariant': 'S',
968 'kSpecializedSemanticVariant': 'P', 'kTraditionalVariant': 'T',
969 'kZVariant': 'Z'}
970 """
971 Unihan table columns providing content for the table together with their
972 abbreviation used in the target table.
973 """
974 COLUMN_TYPES = {'ChineseCharacter': String(1), 'Variant': String(1),
975 'Type': String(1)}
976
977 - def __init__(self, dataPath, dbConnectInst, quiet=False):
983
996
1002
1005 """
1006 Builds a character variant mapping table from the Unihan database for
1007 characters from the Basic Multilingual Plane (BMP) with code values between
1008 U+0000 and U+FFFF.
1009
1010 MySQL < 6 doesn't support true UTF-8, and uses a Version with max 3 bytes:
1011 U{http://dev.mysql.com/doc/refman/6.0/en/charset-unicode.html}
1012 """
1014
def __init__(self, variantEntries, typeList, quiet=False):
    """
    Initialises the BMPVariantGenerator.

    @type variantEntries: list of tuple
    @param variantEntries: character variant entries from the Unihan
        database
    @type typeList: list of str
    @param typeList: variant types in the order given in tableEntries
    @type quiet: bool
    @param quiet: if true no status information will be printed
    """
    # wrap the full variant generator; filtering happens in generator()
    fullGenerator = CharacterVariantBuilder.VariantGenerator(
        variantEntries, typeList, quiet)
    self.variantGen = fullGenerator.generator()
1029
1031 for character, variant, variantType in self.variantGen:
1032
1033
1034 if ord(variant) < int('20000', 16):
1035 yield(character, variant, variantType)
1036
1037 - def __init__(self, dataPath, dbConnectInst, quiet=False):
1040
1053
1056 """
1057 Builds a simple list of characters that belong to a specific class using the
1058 Unihan data.
1059 """
1060 DEPENDS=['Unihan']
1061
1062 - def __init__(self, dataPath, dbConnectInst, quiet=False):
1070
1079
1085
1088 u"""
1089 Builds a simple list of all characters in X{IICore}
1090 (Unicode I{International Ideograph Core)}.
1091 @see: Chinese Wikipedia on IICore:
1092 U{http://zh.wikipedia.org/wiki/國際表意文字核心}
1093 """
1094 PROVIDES = 'IICoreSet'
1095 COLUMN_SOURCE = 'kIICore'
1096
1099 """
1100 Builds a simple list of all characters in the Chinese standard X{GB2312-80}.
1101 """
1102 PROVIDES = 'GB2312Set'
1103 COLUMN_SOURCE = 'kGB0'
1104
1109 """
1110 Provides an abstract class for building a character reading mapping table
1111 using the Unihan database.
1112 """
1114 """Generates the reading entities from the Unihan table."""
1115 SPLIT_REGEX = re.compile(r"(\S+)")
1116
def __init__(self, readingEntries, quiet=False):
    """
    Initialises the ReadingSplitter.

    @type readingEntries: list of tuple
    @param readingEntries: character reading entries from the Unihan
        database
    @type quiet: bool
    @param quiet: if true no status information will be printed
    """
    self.quiet = quiet
    self.readingEntries = readingEntries
1129
1131 """Provides one entry per reading entity and character."""
1132 for character, readings in self.readingEntries:
1133 readingList = self.SPLIT_REGEX.findall(readings)
1134 if not self.quiet and len(set(readingList)) < len(readingList):
1135 warn('reading information of character ' + character \
1136 + ' is inconsistent: ' + ', '.join(readingList))
1137 for reading in set(readingList):
1138 yield(character, reading.lower())
1139
1140 COLUMN_TARGET = 'Reading'
1141 COLUMN_TARGET_TYPE = Text()
1142 GENERATOR_CLASS = SimpleReadingSplitter
1143 DEPENDS=['Unihan']
1144
1147 """
1148 Builds the character Pinyin mapping table from the Unihan database.
1149 """
1150 PROVIDES = 'CharacterUnihanPinyin'
1151 COLUMN_SOURCE = 'kMandarin'
1152
1155 """Builds the character Jyutping mapping table from the Unihan database."""
1156 PROVIDES = 'CharacterJyutping'
1157 COLUMN_SOURCE = 'kCantonese'
1158
1161 """Builds the character Kun'yomi mapping table from the Unihan database."""
1162 PROVIDES = 'CharacterJapaneseKun'
1163 COLUMN_SOURCE = 'kJapaneseKun'
1164
1167 """Builds the character On'yomi mapping table from the Unihan database."""
1168 PROVIDES = 'CharacterJapaneseOn'
1169 COLUMN_SOURCE = 'kJapaneseOn'
1170
1173 """Builds the character Hangul mapping table from the Unihan database."""
1174 PROVIDES = 'CharacterHangul'
1175 COLUMN_SOURCE = 'kHangul'
1176
1179 """
1180 Builds the character Vietnamese mapping table from the Unihan database.
1181 """
1182 PROVIDES = 'CharacterVietnamese'
1183 COLUMN_SOURCE = 'kVietnamese'
1184
1187 """
1188 Builds the Xiandai Hanyu Pinlu Cidian Pinyin mapping table using the Unihan
1189 database.
1190 """
1192 """
1193 Generates the Xiandai Hanyu Pinlu Cidian Pinyin syllables from the
1194 Unihan table.
1195 """
1196 SPLIT_REGEX = re.compile(ur"([a-zü]+[1-5])\([0-9]+\)")
1197
1198 GENERATOR_CLASS = XHPCReadingSplitter
1199
1200 PROVIDES = 'CharacterXHPCPinyin'
1201 COLUMN_SOURCE = 'kHanyuPinlu'
1202
1205 """
1206 Builds the Xiandai Hanyu Cidian Pinyin mapping table using the Unihan
1207 database.
1208 """
1210 """
1211 Generates the Xiandai Hanyu Cidian Pinyin syllables from the Unihan
1212 table.
1213 """
1214 SPLIT_REGEX = re.compile(r"[0-9,.*]+:(\S+)")
1215
1216 TONEMARK_VOWELS = [u'a', u'e', u'i', u'o', u'u', u'ü', u'n', u'm', u'r',
1217 u'ê']
1218
1219 TONEMARK_MAP = {u'\u0304': 1, u'\u0301': 2, u'\u030c': 3, u'\u0300': 4}
1220
def __init__(self, readingEntries, quiet=False):
    """
    Initialises the XHCReadingSplitter.

    @type readingEntries: list of tuple
    @param readingEntries: character reading entries from the Unihan
        database
    @type quiet: bool
    @param quiet: if true no status information will be printed
    """
    CharacterReadingBuilder.SimpleReadingSplitter.__init__(self,
        readingEntries, quiet)
    # character class matching any of the known combining tone marks
    toneMarks = ''.join(self.TONEMARK_MAP.keys())
    self._toneMarkRegex = re.compile(u'[' + toneMarks + ']')
1235
            """
            Converts the entity with diacritics into an entity with tone mark
            as appended number.

            @type entity: str
            @param entity: entity with tonal information
            @rtype: str
            @return: plain entity without tone mark but with the tone's index
                appended as a digit (starting with 1, 5 for the neutral tone)
            """
            import unicodedata
            # Decompose (NFD) so combining diacritical marks become separate
            # characters that _toneMarkRegex can match.
            entity = unicodedata.normalize("NFD", unicode(entity))

            matchObj = self._toneMarkRegex.search(entity)
            if matchObj:
                diacriticalMark = matchObj.group(0)
                tone = self.TONEMARK_MAP[diacriticalMark]
                # Strip the combining mark from the entity.
                plainEntity = entity.replace(diacriticalMark, '')
                # Recompose (NFC) and append the tone index.
                return unicodedata.normalize("NFC", plainEntity) + str(tone)
            else:
                # No diacritic found: treat as neutral (fifth) tone.
                return unicodedata.normalize("NFC", entity) + '5'
1262
            """Provides one entry per reading entity and character."""
            for character, readings in self.readingEntries:
                readingList = self.SPLIT_REGEX.findall(readings)
                # Duplicate readings for one character hint at inconsistent
                # source data; report them unless running quietly.
                if not self.quiet and len(set(readingList)) < len(readingList):
                    warn('reading information of character ' + character \
                        + ' is inconsistent: ' + ', '.join(readingList))
                for reading in set(readingList):
                    yield(character, self.convertTonemark(reading.lower()))
1272
1273 GENERATOR_CLASS = XHCReadingSplitter
1274
1275 PROVIDES = 'CharacterXHCPinyin'
1276 COLUMN_SOURCE = 'kXHC1983'
1277
1280 """
1281 Builds the character Pinyin mapping table from the several sources.
1282 """
1283 PROVIDES = 'CharacterPinyin'
1284 DEPENDS=['CharacterUnihanPinyin', 'CharacterXHPCPinyin',
1285 'CharacterXHCPinyin']
1286
1287 - def __init__(self, dataPath, dbConnectInst, quiet=False):
1296
1298
        selectQueries = []
        # One SELECT per source table from DEPENDS; all share the same
        # column layout (self.COLUMNS).
        for tableName in self.DEPENDS:
            table = self.db.tables[tableName]
            selectQueries.append(
                select([table.c[column] for column in self.COLUMNS]))

        # UNION the per-table results into one entry stream (removes
        # duplicates across sources).
        tableEntries = self.db.selectRows(union(*selectQueries))
        return ListGenerator(tableEntries).generator()
1307
1312 """
1313 Builds a table by loading its data from a list of comma separated values
1314 (CSV).
1315 """
1316 TABLE_CSV_FILE_MAPPING = ''
1317 """csv file path"""
1318 TABLE_DECLARATION_FILE_MAPPING = ''
1319 """file path containing SQL create table code."""
1320 INDEX_KEYS = []
1321 """Index keys (not unique) of the created table"""
1322
1329
1330 - def __init__(self, dataPath, dbConnectInst, quiet=False):
1332
1333
1334
1335 @staticmethod
1343
1344 @staticmethod
1346 for line in unicode_csv_data:
1347 yield line.encode('utf-8')
1348
1349 @staticmethod
1359
1360 return ByteStringDialect(dialect)
1361
        """
        Returns a csv reader object for a given file name.

        The file can start with the character '#' to mark comments. These will
        be ignored. The first line after the leading comments will be used to
        guess the csv file's format.

        @type fileHandle: file
        @param fileHandle: file handle of the CSV file
        @rtype: instance
        @return: CSV reader object returning one entry per line
        """
        def prependLineGenerator(line, data):
            """
            The first line read for guessing format has to be reinserted.
            """
            yield line
            for nextLine in data:
                yield nextLine

        # Skip leading comment lines; the first non-comment line is used to
        # sniff the CSV dialect below.
        line = '#'
        try:
            while line.strip().startswith('#'):
                line = fileHandle.next()
        except StopIteration:
            # File holds only comments (or nothing): fall back to a plain
            # reader on the exhausted handle.
            return csv.reader(fileHandle)
        try:
            self.fileDialect = csv.Sniffer().sniff(line, ['\t', ','])
        except csv.Error:
            # Sniffing failed; use the loader's default dialect.
            self.fileDialect = CSVFileLoader.DefaultDialect()

        # Reinsert the line consumed for sniffing before handing the stream
        # over to the csv reader.
        content = prependLineGenerator(line, fileHandle)

        return CSVFileLoader.unicode_csv_reader(content, self.fileDialect)
1397
1399 import locale
1400 import codecs
1401
1402 definitionFile = self.findFile([self.TABLE_DECLARATION_FILE_MAPPING],
1403 "SQL table definition file")
1404 contentFile = self.findFile([self.TABLE_CSV_FILE_MAPPING], "table")
1405
1406
1407 if not self.quiet:
1408 warn("Reading table definition from file '" + definitionFile + "'")
1409
1410 fileHandle = codecs.open(definitionFile, 'r', 'utf-8')
1411 createStatement = ''.join(fileHandle.readlines()).strip("\n")
1412
1413 self.db.execute(text(createStatement))
1414 table = Table(self.PROVIDES, self.db.metadata, autoload=True)
1415
1416
1417 if not self.quiet:
1418 warn("Reading table '" + self.PROVIDES + "' from file '" \
1419 + contentFile + "'")
1420 fileHandle = codecs.open(contentFile, 'r', 'utf-8')
1421
1422 entries = []
1423 for line in self.getCSVReader(fileHandle):
1424 if len(line) == 1 and not line[0].strip():
1425 continue
1426 entryDict = dict([(column.name, line[i]) \
1427 for i, column in enumerate(table.columns)])
1428 entries.append(entryDict)
1429
1430 try:
1431 self.db.execute(table.insert(), entries)
1432 except sqlalchemy.exceptions.IntegrityError, e:
1433 warn(unicode(e))
1434
1435 raise
1436
1437
1438 for index in self.buildIndexObjects(self.PROVIDES, self.INDEX_KEYS):
1439 index.create()
1440
1442
        # A bare (non-reflected) Table object is sufficient to issue DROP.
        table = Table(self.PROVIDES, self.db.metadata)
        table.drop()
1445
1455
1465
1475
1485
1495
1505
1515
1525
1535
1538 """
1539 Builds a mapping of Cantonese syllable in the Yale romanisation
1540 system to the syllables' initial, nucleus and coda.
1541 """
1542 PROVIDES = 'CantoneseYaleInitialNucleusCoda'
1543
1544 TABLE_CSV_FILE_MAPPING = 'cantoneseyaleinitialnucleuscoda.csv'
1545 TABLE_DECLARATION_FILE_MAPPING = 'cantoneseyaleinitialnucleuscoda.sql'
1546
1557
1567
1577
1588
1599
1610
1621
1631
1634 """
1635 Builds a mapping between Kangxi radical index and radical equivalent
1636 characters without radical form.
1637 """
1638 PROVIDES = 'KangxiRadicalIsolatedCharacter'
1639
1640 TABLE_CSV_FILE_MAPPING = 'kangxiradicalisolatedcharacter.csv'
1641 TABLE_DECLARATION_FILE_MAPPING = 'kangxiradicalisolatedcharacter.sql'
1642
1645 """
1646 Builds a mapping between I{Unicode radical forms} and
1647 I{Unicode radical variants} on one side and I{equivalent characters} on the
1648 other side.
1649 """
1650 PROVIDES = 'RadicalEquivalentCharacter'
1651
1652 TABLE_CSV_FILE_MAPPING = 'radicalequivalentcharacter.csv'
1653 TABLE_DECLARATION_FILE_MAPPING = 'radicalequivalentcharacter.sql'
1654
1664
1674
1685
1695
1698 """
1699 Builds a mapping of Mandarin Chinese syllable initials in Pinyin to Braille
1700 characters.
1701 """
1702 PROVIDES = 'PinyinBrailleInitialMapping'
1703
1704 TABLE_CSV_FILE_MAPPING = 'pinyinbrailleinitialmapping.csv'
1705 TABLE_DECLARATION_FILE_MAPPING = 'pinyinbrailleinitialmapping.sql'
1706
1717
1723 """
1724 Builds a list of glyph indices for characters.
1725 @todo Impl: Check if all Z-variants in LocaleCharacterVariant are included.
1726 @todo Bug: Forms with two variants in CharacterDecomposition are missing,
1727 e.g. ⾓.
1728 """
1729 PROVIDES = 'ZVariants'
1730 DEPENDS = ['CharacterDecomposition', 'StrokeOrder', 'Unihan']
1731
1732
1733 COLUMNS = ['ChineseCharacter', 'ZVariant']
1734 PRIMARY_KEYS = ['ChineseCharacter', 'ZVariant']
1735 INDEX_KEYS = [['ChineseCharacter']]
1736 COLUMN_TYPES = {'ChineseCharacter': String(1), 'ZVariant': Integer()}
1737
1738 - def __init__(self, dataPath, dbConnectInst, quiet=False):
1740
1742 decompositionTable = self.db.tables['CharacterDecomposition']
1743 strokeOrderTable = self.db.tables['CharacterDecomposition']
1744 unihanTable = self.db.tables['Unihan']
1745
1746 characterSet = set(self.db.selectRows(
1747 select([decompositionTable.c.ChineseCharacter,
1748 decompositionTable.c.ZVariant], distinct=True)))
1749 characterSet.update(self.db.selectRows(
1750 select([strokeOrderTable.c.ChineseCharacter,
1751 strokeOrderTable.c.ZVariant])))
1752
1753
1754
1755
1756 unihanCharacters = self.db.selectScalars(
1757 select([unihanTable.c.ChineseCharacter],
1758 or_(unihanTable.c.kTotalStrokes != None,
1759 unihanTable.c.kRSKangXi != None)))
1760 characterSet.update([(char, 0) for char in unihanCharacters])
1761
1762 return ListGenerator(characterSet).generator()
1763
1766 """
1767 Builds a mapping between characters and their stroke count.
1768 """
1770 """Generates the character stroke count mapping."""
1771 - def __init__(self, dbConnectInst, characterSet, quiet=False):
1772 """
1773 Initialises the StrokeCountGenerator.
1774
1775 @type dbConnectInst: instance
1776 @param dbConnectInst: instance of a L{DatabaseConnector}.
1777 @type characterSet: set
1778 @param characterSet: set of characters to generate the table for
1779 @type quiet: bool
1780 @param quiet: if true no status information will be printed to
1781 stderr
1782 """
1783 self.characterSet = characterSet
1784 self.quiet = quiet
1785 self.cjk = characterlookup.CharacterLookup(
1786 dbConnectInst=dbConnectInst)
1787
1788 self.cjk.hasStrokeCount = False
1789
            """Provides one entry per character, z-Variant and locale subset."""
            for char, zVariant in self.characterSet:
                try:
                    # Compute the stroke count from the character's
                    # decomposition / stroke order data.
                    strokeCount = self.cjk.getStrokeCount(char,
                        zVariant=zVariant)
                    yield {'ChineseCharacter': char, 'StrokeCount': strokeCount,
                        'ZVariant': zVariant}
                except exception.NoInformationError:
                    # No stroke count derivable for this glyph; skip it.
                    pass
                except IndexError:
                    if not self.quiet:
                        warn("malformed IDS for character '" + char \
                            + "'")
1806
1807 PROVIDES = 'StrokeCount'
1808 DEPENDS = ['CharacterDecomposition', 'StrokeOrder']
1809
1810 COLUMNS = ['ChineseCharacter', 'StrokeCount', 'ZVariant']
1811 PRIMARY_KEYS = ['ChineseCharacter', 'ZVariant']
1812 COLUMN_TYPES = {'ChineseCharacter': String(1), 'StrokeCount': Integer(),
1813 'ZVariant': Integer()}
1814
1815 - def __init__(self, dataPath, dbConnectInst, quiet=False):
1817
        # Generate entries for the union of characters found in the
        # decomposition and stroke order tables (both carry Z-variants).
        decompositionTable = self.db.tables['CharacterDecomposition']
        strokeOrderTable = self.db.tables['StrokeOrder']

        characterSet = set(self.db.selectRows(
            select([decompositionTable.c.ChineseCharacter,
                decompositionTable.c.ZVariant], distinct=True)))
        characterSet.update(self.db.selectRows(
            select([strokeOrderTable.c.ChineseCharacter,
                strokeOrderTable.c.ZVariant])))
        return StrokeCountBuilder.StrokeCountGenerator(self.db, characterSet,
            self.quiet).generator()
1830
1833 """
1834 Builds a mapping between characters and their stroke count. Includes stroke
1835 count data from the Unihan database to make up for missing data in own data
1836 files.
1837 """
1839 """Generates the character stroke count mapping."""
1840 - def __init__(self, dbConnectInst, characterSet, tableEntries,
1841 preferredBuilder, quiet=False):
1842 """
1843 Initialises the CombinedStrokeCountGenerator.
1844
1845 @type dbConnectInst: instance
1846 @param dbConnectInst: instance of a L{DatabaseConnector}.
1847 @type characterSet: set
1848 @param characterSet: set of characters to generate the table for
1849 @type tableEntries: list of list
1850 @param tableEntries: list of characters with Z-variant
1851 @type preferredBuilder: instance
1852 @param preferredBuilder: TableBuilder which forms are preferred over
1853 entries from the Unihan table
1854 @type quiet: bool
1855 @param quiet: if true no status information will be printed to
1856 stderr
1857 """
1858 self.characterSet = characterSet
1859 self.tableEntries = tableEntries
1860 self.preferredBuilder = preferredBuilder
1861 self.quiet = quiet
1862 self.cjk = characterlookup.CharacterLookup(
1863 dbConnectInst=dbConnectInst)
1864 self.db = dbConnectInst
1865
1866 - def getStrokeCount(self, char, zVariant, strokeCountDict,
1867 unihanStrokeCountDict, decompositionDict):
1868 """
1869 Gets the stroke count of the given character by summing up the
1870 stroke count of its components and using the Unihan table as
1871 fallback.
1872
1873 For the sake of consistency this method doesn't take the stroke
1874 count given by Unihan directly but sums up the stroke counts of the
1875 components to make sure the sum of component's stroke count will
1876 always give the characters stroke count. The result yielded will be
1877 in many cases even more precise than the value given in Unihan (not
1878 depending on the actual glyph form).
1879
1880 Once calculated the stroke count will be cached in the given
1881 strokeCountDict object.
1882
1883 @type char: str
1884 @param char: Chinese character
1885 @type zVariant: int
1886 @param zVariant: Z-variant of character
1887 @rtype: int
1888 @return: stroke count
1889 @raise ValueError: if stroke count is ambiguous due to inconsistent
1890 values wrt Unihan vs. own data.
1891 @raise NoInformationError: if decomposition is incomplete
1892 """
1893 if char == u'?':
1894
1895 raise exception.NoInformationError("incomplete decomposition")
1896
1897 if (char, zVariant) not in strokeCountDict:
1898 lastStrokeCount = None
1899 if (char, zVariant) in decompositionDict:
1900
1901
1902 for decomposition in decompositionDict[(char, zVariant)]:
1903 try:
1904 accumulatedStrokeCount = 0
1905
1906 for entry in decomposition:
1907 if type(entry) == types.TupleType:
1908 component, componentZVariant = entry
1909
1910 accumulatedStrokeCount += \
1911 self.getStrokeCount(component,
1912 componentZVariant, strokeCountDict,
1913 unihanStrokeCountDict,
1914 decompositionDict)
1915
1916 if lastStrokeCount != None \
1917 and lastStrokeCount != accumulatedStrokeCount:
1918
1919
1920 raise ValueError("ambiguous stroke count " \
1921 + "information, due to various stroke " \
1922 + "count sources for " \
1923 + repr((char, ZVariant)))
1924 else:
1925
1926 lastStrokeCount = accumulatedStrokeCount
1927
1928 except exception.NoInformationError:
1929 continue
1930
1931 if lastStrokeCount != None:
1932 strokeCountDict[(char, zVariant)] = lastStrokeCount
1933 else:
1934
1935
1936 if (char, 0) in strokeCountDict:
1937
1938 strokeCountDict[(char, zVariant)] \
1939 = strokeCountDict[(char, 0)]
1940
1941 elif char in unihanStrokeCountDict:
1942
1943 strokeCountDict[(char, zVariant)] \
1944 = unihanStrokeCountDict[char]
1945
1946 else:
1947 strokeCountDict[(char, zVariant)] = None
1948
1949 if strokeCountDict[(char, zVariant)] == None:
1950 raise exception.NoInformationError(
1951 "missing stroke count information")
1952 else:
1953 return strokeCountDict[(char, zVariant)]
1954
            """Provides one entry per character, z-Variant and locale subset."""
            # Yield entries from the preferred builder first and remember
            # their stroke counts for the fallback computation below.
            strokeCountDict = {}
            for entry in self.preferredBuilder:
                yield entry

                key = (entry['ChineseCharacter'], entry['ZVariant'])
                strokeCountDict[key] = entry['StrokeCount']

            # Unihan stroke counts serve as fallback for characters not
            # already covered by the preferred builder.
            unihanStrokeCountDict = {}
            for char, strokeCount in self.tableEntries:
                if (char, 0) not in strokeCountDict:
                    unihanStrokeCountDict[char] = strokeCount

            # Remove glyphs already handled above.
            self.characterSet.difference_update(strokeCountDict.keys())

            decompositionDict = self.cjk.getDecompositionEntriesDict()

            for char, zVariant in self.characterSet:
                warningZVariants = []
                try:
                    # Try deriving the stroke count from the components,
                    # consulting the Unihan fallback where needed.
                    strokeCount = self.getStrokeCount(char, zVariant,
                        strokeCountDict, unihanStrokeCountDict,
                        decompositionDict)

                    yield {'ChineseCharacter': char, 'ZVariant': zVariant,
                        'StrokeCount': strokeCount}
                except ValueError, e:
                    # Conflicting counts from different sources; collect for
                    # a summarising warning below.
                    warningZVariants.append(zVariant)
                except exception.NoInformationError:
                    pass

                if not self.quiet and warningZVariants:
                    warn("ambiguous stroke count information (mixed sources) " \
                        "for character '" + char + "' for Z-variant(s) '" \
                        + ''.join([str(z) for z in warningZVariants]) + "'")
2002
2003 DEPENDS = ['CharacterDecomposition', 'StrokeOrder', 'Unihan']
2004 COLUMN_SOURCE = 'kTotalStrokes'
2005
        decompositionTable = self.db.tables['CharacterDecomposition']
        strokeOrderTable = self.db.tables['StrokeOrder']
        unihanTable = self.db.tables['Unihan']

        characterSet = set(self.db.selectRows(
            select([decompositionTable.c.ChineseCharacter,
                decompositionTable.c.ZVariant], distinct=True)))
        characterSet.update(self.db.selectRows(
            select([strokeOrderTable.c.ChineseCharacter,
                strokeOrderTable.c.ZVariant])))
        # Entries built from own data files take precedence over Unihan.
        preferredBuilder = \
            CombinedStrokeCountBuilder.StrokeCountGenerator(self.db,
                characterSet, self.quiet).generator()

        # Unihan's total stroke counts serve as fallback data.
        tableEntries = self.db.selectRows(
            select([unihanTable.c.ChineseCharacter,
                unihanTable.c[self.COLUMN_SOURCE]],
                unihanTable.c[self.COLUMN_SOURCE] != None))

        # Characters sourced solely from Unihan only carry the default
        # Z-variant 0.
        characterSet.update([(char, 0) for char, totalCount in tableEntries])

        return CombinedStrokeCountBuilder.CombinedStrokeCountGenerator(self.db,
            characterSet, tableEntries, preferredBuilder, self.quiet)\
            .generator()
2034
2037 """
2038 Builds a mapping between characters and their components.
2039 """
2041 """Generates the component to character mapping."""
2042
2043 - def __init__(self, dbConnectInst, characterSet):
2044 """
2045 Initialises the CharacterComponentGenerator.
2046
2047 @type dbConnectInst: instance
2048 @param dbConnectInst: instance of a L{DatabaseConnector}
2049 @type characterSet: set
2050 @param characterSet: set of characters to generate the table for
2051 """
2052 self.characterSet = characterSet
2053 self.cjk = characterlookup.CharacterLookup(
2054 dbConnectInst=dbConnectInst)
2055
2056 - def getComponents(self, char, zVariant, decompositionDict,
2057 componentDict):
2058 """
2059 Gets all character components for the given glyph.
2060
2061 @type char: str
2062 @param char: Chinese character
2063 @type zVariant: int
2064 @param zVariant: Z-variant of character
2065 @rtype: set
2066 @return: all components of the character
2067 """
2068 if (char, zVariant) not in componentDict:
2069 componentDict[(char, zVariant)] = set()
2070
2071 if (char, zVariant) in decompositionDict:
2072 for decomposition in decompositionDict[(char, zVariant)]:
2073 componentDict[(char, zVariant)].update(
2074 [entry for entry in decomposition \
2075 if type(entry) == types.TupleType])
2076
2077 componentSet = set()
2078 for component, componentZVariant in componentDict[(char, zVariant)]:
2079 componentSet.add((component, componentZVariant))
2080
2081 componentSet.update(self.getComponents(component,
2082 componentZVariant, decompositionDict, componentDict))
2083
2084 return componentSet
2085
            """Provides the component entries."""
            decompositionDict = self.cjk.getDecompositionEntriesDict()
            # Shared memoisation cache for getComponents().
            componentDict = {}
            for char, zVariant in self.characterSet:
                for component, componentZVariant \
                    in self.getComponents(char, zVariant, decompositionDict,
                        componentDict):
                    yield {'ChineseCharacter': char, 'ZVariant': zVariant,
                        'Component': component,
                        'ComponentZVariant': componentZVariant}
2097
2098 PROVIDES = 'ComponentLookup'
2099 DEPENDS = ['CharacterDecomposition']
2100
2101 COLUMNS = ['ChineseCharacter', 'ZVariant', 'Component', 'ComponentZVariant']
2102 PRIMARY_KEYS = COLUMNS
2103 INDEX_KEYS = [['Component']]
2104 COLUMN_TYPES = {'ChineseCharacter': String(1), 'ZVariant': Integer(),
2105 'Component': String(1), 'ComponentZVariant': Integer()}
2106
2107 - def __init__(self, dataPath, dbConnectInst, quiet=False):
2110
2118
2121 """
2122 Builds a mapping between characters and their radical with stroke count of
2123 residual components.
2124
2125 This class can be extended by inheriting
2126 L{CharacterRadicalStrokeCountGenerator} and overwriting
2127 L{CharacterRadicalStrokeCountGenerator.getFormRadicalIndex()} to implement
2128 which forms should be regarded as radicals as well as
2129 L{CharacterRadicalStrokeCountGenerator.filterForms()} to filter entries
2130 before creation.
2131 """
2133 """Generates the character to radical/residual stroke count mapping."""
2134
2135 - def __init__(self, dbConnectInst, characterSet, quiet=False):
2136 """
2137 Initialises the CharacterRadicalStrokeCountGenerator.
2138
2139 @type dbConnectInst: instance
2140 @param dbConnectInst: instance of a L{DatabaseConnector}
2141 @type characterSet: set
2142 @param characterSet: set of characters to generate the table for
2143 @type quiet: bool
2144 @param quiet: if true no status information will be printed to
2145 stderr
2146 """
2147 self.characterSet = characterSet
2148 self.quiet = quiet
2149 self.cjk = characterlookup.CharacterLookup(
2150 dbConnectInst=dbConnectInst)
2151 self.radicalForms = None
2152
2174
2190
2191 - def getEntries(self, char, zVariant, strokeCountDict, decompositionDict,
2192 entriesDict):
2193 u"""
2194 Gets all radical/residual stroke count combinations from the given
2195 decomposition.
2196
2197 @rtype: list
2198 @return: all radical/residual stroke count combinations for the
2199 character
2200 @raise ValueError: if IDS is malformed or ambiguous residual stroke
2201 count is calculated
2202 @todo Fix: Remove validity check, only needed as long
2203 decomposition entries aren't checked against stroke order
2204 entries.
2205 """
2206 def getCharLayout(mainCharacterLayout, mainLayoutPosition,
2207 subCharLayout, subLayoutPosition):
2208 u"""
2209 Returns the character layout for the radical form within the
2210 component with layout subCharLayout itself belonging to a parent
2211 char with layout mainCharacterLayout.
2212 E.g. 鸺 can be decomposed into ⿰休鸟 and 休 can be furthermore
2213 decomposed into ⿰亻木. 亻 is found in a lower layer of
2214 decomposition, but as the structure of 休 and 鸺 are the same,
2215 and 亻 is on the left side of 休 which is on the left side of 鸺
2216 one can deduce 亻 as being on the utmost left side of 鸺. Thus
2217 (⿰, 0) would be returned.
2218 """
2219 specialReturn = {
2220 (u'⿰', 0, u'⿰', 0): (u'⿰', 0),
2221 (u'⿰', 1, u'⿰', 1): (u'⿰', 1),
2222 (u'⿱', 0, u'⿱', 0): (u'⿱', 0),
2223 (u'⿱', 1, u'⿱', 1): (u'⿱', 1),
2224 (u'⿲', 0, u'⿲', 0): (u'⿰', 0),
2225 (u'⿲', 2, u'⿲', 2): (u'⿰', 1),
2226 (u'⿳', 0, u'⿳', 0): (u'⿱', 0),
2227 (u'⿳', 2, u'⿳', 2): (u'⿱', 0),
2228 (u'⿲', 0, u'⿰', 0): (u'⿰', 0),
2229 (u'⿲', 2, u'⿰', 1): (u'⿰', 1),
2230 (u'⿰', 0, u'⿲', 0): (u'⿰', 0),
2231 (u'⿰', 1, u'⿲', 1): (u'⿰', 1),
2232 (u'⿳', 0, u'⿱', 0): (u'⿱', 0),
2233 (u'⿳', 2, u'⿱', 1): (u'⿱', 1),
2234 (u'⿱', 0, u'⿳', 0): (u'⿱', 0),
2235 (u'⿱', 1, u'⿳', 2): (u'⿱', 1),
2236 }
2237 entry = (mainCharacterLayout, mainLayoutPosition, subCharLayout,
2238 subLayoutPosition)
2239 if entry in specialReturn:
2240 return specialReturn[entry]
2241 elif subCharLayout == u'⿻':
2242
2243 return (u'⿻', 0)
2244 elif mainCharacterLayout == None:
2245
2246 return subCharLayout, subLayoutPosition
2247 else:
2248
2249 return (u'⿻', 0)
2250
2251
2252 if (char, zVariant) not in decompositionDict:
2253 return []
2254
2255 if (char, zVariant) not in entriesDict:
2256 entriesDict[(char, zVariant)] = set()
2257
2258 for decomposition in decompositionDict[(char, zVariant)]:
2259 componentRadicalForms = []
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274 layoutStack = [(None, None)]
2275
2276 for entry in decomposition:
2277 try:
2278 layout, position = layoutStack.pop()
2279 except IndexError:
2280 raise ValueError("malformed IDS for character '" \
2281 + mainChar + "'")
2282
2283 if type(entry) != types.TupleType:
2284
2285
2286
2287
2288 if self.cjk.isTrinaryIDSOperator(entry):
2289 posRange = [2, 1, 0]
2290 else:
2291 posRange = [1, 0]
2292
2293 for componentPos in posRange:
2294
2295
2296 layoutStack.append(getCharLayout(layout,
2297 position, entry, componentPos))
2298 else:
2299
2300 componentChar, componentZVariant = entry
2301
2302
2303 radicalIndex \
2304 = self.getFormRadicalIndex(componentChar)
2305 if radicalIndex != None:
2306
2307
2308
2309 componentRadicalForms.append(
2310 {'Component': entry,
2311 'Form': componentChar,
2312 'Z-variant': componentZVariant,
2313 'ResidualStrokeCount': 0,
2314 'CharacterLayout': layout,
2315 'RadicalIndex': radicalIndex,
2316 'RadicalPosition': position})
2317
2318
2319
2320 for radicalEntry in self.getEntries(componentChar,
2321 componentZVariant, strokeCountDict,
2322 decompositionDict, entriesDict):
2323
2324
2325 charLayout, charPosition = getCharLayout(layout,
2326 position, radicalEntry['CharacterLayout'],
2327 radicalEntry['RadicalPosition'])
2328 componentEntry = radicalEntry.copy()
2329 componentEntry['Component'] = entry
2330 componentEntry['CharacterLayout'] = charLayout
2331 componentEntry['RadicalPosition'] = charPosition
2332 componentRadicalForms.append(componentEntry)
2333
2334
2335 residualCharacters = {}
2336 charactersSeen = []
2337 for entry in decomposition:
2338
2339 if type(entry) == types.TupleType:
2340
2341 for seenEntry in residualCharacters:
2342 residualCharacters[seenEntry].append(entry)
2343
2344
2345 residualCharacters[entry] = charactersSeen[:]
2346
2347 charactersSeen.append(entry)
2348
2349
2350 for componentEntry in componentRadicalForms:
2351
2352
2353
2354 for entry in \
2355 residualCharacters[componentEntry['Component']]:
2356
2357 if entry not in strokeCountDict:
2358 break
2359
2360 componentEntry['ResidualStrokeCount'] \
2361 += strokeCountDict[entry]
2362 else:
2363
2364 del componentEntry['Component']
2365 entriesDict[(char, zVariant)].add(
2366 frozenset(componentEntry.items()))
2367
2368
2369
2370 seenEntriesDict = {}
2371 for entry in [dict(d) for d in entriesDict[(char, zVariant)]]:
2372 keyEntry = (entry['Form'], entry['Z-variant'],
2373 entry['CharacterLayout'], entry['RadicalIndex'],
2374 entry['RadicalPosition'])
2375 if keyEntry in seenEntriesDict \
2376 and seenEntriesDict[keyEntry] \
2377 != entry['ResidualStrokeCount']:
2378 raise ValueError("ambiguous residual stroke count for " \
2379 + "character '" + mainChar + "' with entry '" \
2380 + "', '".join(list([unicode(column) \
2381 for column in keyEntry])) \
2382 + "': '" + str(seenEntriesDict[keyEntry]) + "'/'" \
2383 + str(entry['ResidualStrokeCount']) + "'")
2384 seenEntriesDict[keyEntry] = entry['ResidualStrokeCount']
2385
2386
2387 return self.filterForms(
2388 [dict(d) for d in entriesDict[(char, zVariant)]])
2389
            """Provides the radical/stroke count entries."""
            strokeCountDict = self.cjk.getStrokeCountDict()
            decompositionDict = self.cjk.getDecompositionEntriesDict()
            # Shared memoisation cache for getEntries().
            entryDict = {}

            for char, zVariant in self.characterSet:
                if self.cjk.isRadicalChar(char):
                    # Radical characters themselves are not processed.
                    continue

                for entry in self.getEntries(char, zVariant, strokeCountDict,
                    decompositionDict, entryDict):

                    yield [char, zVariant, entry['RadicalIndex'], entry['Form'],
                        entry['Z-variant'], entry['CharacterLayout'],
                        entry['RadicalPosition'], entry['ResidualStrokeCount']]
2407
2408 PROVIDES = 'CharacterRadicalResidualStrokeCount'
2409 DEPENDS = ['CharacterDecomposition', 'StrokeCount', 'KangxiRadical',
2410 'KangxiRadicalIsolatedCharacter', 'RadicalEquivalentCharacter',
2411 'CharacterKangxiRadical']
2412
2413 COLUMNS = ['ChineseCharacter', 'ZVariant', 'RadicalIndex', 'RadicalForm',
2414 'RadicalZVariant', 'MainCharacterLayout', 'RadicalRelativePosition',
2415 'ResidualStrokeCount']
2416 PRIMARY_KEYS = ['ChineseCharacter', 'ZVariant', 'RadicalForm',
2417 'RadicalZVariant', 'MainCharacterLayout', 'RadicalRelativePosition']
2418 COLUMN_TYPES = {'ChineseCharacter': String(1), 'RadicalIndex': Integer(),
2419 'RadicalForm': String(1), 'ZVariant': Integer(),
2420 'RadicalZVariant': Integer(), 'MainCharacterLayout': String(1),
2421 'RadicalRelativePosition': Integer(), 'ResidualStrokeCount': Integer()}
2422
2423 - def __init__(self, dataPath, dbConnectInst, quiet=False):
2426
2436
2439 """
2440 Builds a mapping between characters and their residual stroke count when
2441 splitting of the radical form. This is stripped off information gathered
2442 from table C{CharacterRadicalStrokeCount}.
2443 """
2445 """
2446 Generates the character to residual stroke count mapping from the
2447 C{CharacterRadicalResidualStrokeCount} table.
2448 """
2450 """
2451 Initialises the ResidualStrokeCountExtractor.
2452
2453 @type dbConnectInst: instance
2454 @param dbConnectInst: instance of a L{DatabaseConnector}
2455 @type characterSet: set
2456 @param characterSet: set of characters to generate the table for
2457 """
2458 self.characterSet = characterSet
2459 self.cjk = characterlookup.CharacterLookup(
2460 dbConnectInst=dbConnectInst)
2461
2463 u"""
2464 Gets a list of radical residual entries. For multiple radical
2465 occurrences (e.g. 伦) only returns the residual stroke count for the
2466 "main" radical form.
2467
2468 @type char: str
2469 @param char: Chinese character
2470 @type zVariant: int
2471 @param zVariant: I{Z-variant} of given character
2472 @rtype: list of tuple
2473 @return: list of residual stroke count entries
2474 @todo Lang: Implement, find a good algorithm to turn down unwanted
2475 forms, don't just choose random one. See the following list::
2476
2477 >>> from cjklib import characterlookup
2478 >>> cjk = characterlookup.CharacterLookup()
2479 >>> for char in cjk.db.selectSoleValue('CharacterRadicalResidualStrokeCount',
2480 ... 'ChineseCharacter', distinctValues=True):
2481 ... try:
2482 ... entries = cjk.getCharacterKangxiRadicalResidualStrokeCount(char, 'C')
2483 ... lastEntry = entries[0]
2484 ... for entry in entries[1:]:
2485 ... # print if diff. radical forms and diff. residual stroke count
2486 ... if lastEntry[0] != entry[0] and lastEntry[2] != entry[2]:
2487 ... print char
2488 ... break
2489 ... lastEntry = entry
2490 ... except:
2491 ... pass
2492 ...
2493 渌
2494 犾
2495 玺
2496 珏
2497 缧
2498 >>> cjk.getCharacterKangxiRadicalResidualStrokeCount(u'缧')
2499 [(u'\u7cf8', 0, u'\u2ffb', 0, 8), (u'\u7e9f', 0, u'\u2ff0', 0, 11)]
2500 """
2501
2502
2503 filteredEntries = []
2504 for radicalIdx in radicalDict[(char, zVariant)]:
2505 _, _, _, _, residualStrokeCount \
2506 = radicalDict[(char, zVariant)][radicalIdx][0]
2507 filteredEntries.append((radicalIdx, residualStrokeCount))
2508
2509 return filteredEntries
2510
2512 """Provides one entry per character, z-Variant and locale subset."""
2513 radicalDict = self.cjk.getCharacterRadicalResidualStrokeCountDict()
2514 for char, zVariant in self.characterSet:
2515 for radicalIndex, residualStrokeCount in self.getEntries(char,
2516 zVariant, radicalDict):
2517 yield [char, zVariant, radicalIndex, residualStrokeCount]
2518
2519 PROVIDES = 'CharacterResidualStrokeCount'
2520 DEPENDS = ['CharacterRadicalResidualStrokeCount']
2521
2522 COLUMNS = ['ChineseCharacter', 'ZVariant', 'RadicalIndex',
2523 'ResidualStrokeCount']
2524 PRIMARY_KEYS = ['ChineseCharacter', 'ZVariant', 'RadicalIndex']
2525 INDEX_KEYS = [['RadicalIndex']]
2526 COLUMN_TYPES = {'ChineseCharacter': String(1), 'RadicalIndex': Integer(),
2527 'ZVariant': Integer(), 'ResidualStrokeCount': Integer()}
2528
2529 - def __init__(self, dataPath, dbConnectInst, quiet=False):
2532
2540
2544 """
2545 Builds a mapping between characters and their residual stroke count when
2546 splitting of the radical form. Includes stroke count data from the Unihan
2547 database to make up for missing data in own data files.
2548 """
2550 """
2551 Generates the character to residual stroke count mapping.
2552 """
2554 """
2555 Initialises the CombinedResidualStrokeCountExtractor.
2556
2557 @type tableEntries: list of list
2558 @param tableEntries: list of characters with Z-variant
2559 @type preferredBuilder: instance
2560 @param preferredBuilder: TableBuilder which forms are preferred over
2561 entries from the Unihan table
2562 @type quiet: bool
2563 @param quiet: if true no status information will be printed
2564 """
2565 self.RADICAL_REGEX = re.compile(ur"(\d+)\.(\d+)")
2566 self.tableEntries = tableEntries
2567 self.preferredBuilder = preferredBuilder
2568 self.quiet = quiet
2569
2571 """Provides one entry per character and z-Variant."""
2572
2573 seenCharactersSet = set()
2574 for entry in self.preferredBuilder:
2575 yield entry
2576 char = entry[0]
2577 radicalIdx = entry[2]
2578 seenCharactersSet.add((char, radicalIdx))
2579
2580
2581 for char, radicalStroke in self.tableEntries:
2582 matchObj = self.RADICAL_REGEX.match(radicalStroke)
2583 if matchObj:
2584 try:
2585 radicalIndex = int(matchObj.group(1))
2586 residualStrokeCount = int(matchObj.group(2))
2587 if (char, radicalIndex) not in seenCharactersSet:
2588 yield [char, 0, radicalIndex, residualStrokeCount]
2589 except ValueError:
2590 if not self.quiet:
2591 warn("unable to read radical information of " \
2592 + "character '" + character + "': '" \
2593 + radicalStroke + "'")
2594 elif not self.quiet:
2595 warn("unable to read radical information of character '" \
2596 + character + "': '" + radicalStroke + "'")
2597
# built after the hand-edited table and the raw Unihan data it supplements
DEPENDS = ['CharacterRadicalResidualStrokeCount', 'Unihan']
# Unihan column holding "<radical>.<residual strokes>" (KangXi radicals)
COLUMN_SOURCE = 'kRSKangXi'
2600
2618
2979
2982 """
2983 Builds a translation word index for a given dictionary.
2984
Searching for a word will return a headword and reading. This makes it
possible to find several dictionary entries with the same headword and
reading, with only one of them including the translation word.
2988
2989 @todo Fix: Word regex is specialised for HanDeDict.
2990 @todo Fix: Using a row_id for joining instead of Headword(Traditional) and
2991 Reading would maybe speed up table joins. Needs a workaround to include
2992 multiple rows for one actual headword entry though.
2993 """
2995 """Generates words for a list of dictionary entries."""
2996
def __init__(self, entries):
    """
    Initialises the WordEntryGenerator.

    @type entries: list of tuple
    @param entries: a list of headword and its translation
    """
    self.entries = entries
    # Matches one translation word: parenthesised remarks and example
    # phrases ("; Bsp.: ...--...") are consumed without capturing, while
    # everything else up to a delimiter is captured as a word.
    # @todo Fix: this regex is specialised for HanDeDict.
    self.wordRegex = re.compile(
        r'\([^\)]+\)|'
        r'(?:; Bsp.: [^/]+?--[^/]+)|'
        r'([^/,\(\)\[\]\!\?]+)')
3009
def generator(self):
    """
    Provides all data of one word per entry.

    Yields one dict {'Headword': ..., 'Reading': ..., 'Word': ...} per
    translation word; every (headword, reading, word) triple is yielded
    at most once.
    """
    seenWordEntries = set()

    for headword, reading, translation in self.entries:
        for word in self.wordRegex.findall(translation):
            # get rid of surrounding whitespace, normalise case
            word = word.strip().lower()
            if not word:
                continue
            if (headword, reading, word) not in seenWordEntries:
                seenWordEntries.add((headword, reading, word))
                # fixed: yield a fresh dict per row; the previous
                # implementation reused and mutated a single dict, so
                # consumers holding earlier rows saw them overwritten
                yield {'Headword': headword, 'Reading': reading,
                    'Word': word}
3028
# index table layout: one row per translation word of a dictionary entry
COLUMNS = ['Headword', 'Reading', 'Word']
COLUMN_TYPES = {'Headword': String(255), 'Reading': String(255),
    'Word': String(255)}
# look-ups run against the translation word
INDEX_KEYS = [['Word']]

# name of the dictionary table to index; set by concrete subclasses
TABLE_SOURCE = None
"""Dictionary source"""
# column of TABLE_SOURCE holding the headword
HEADWORD_SOURCE = 'Headword'
"""Source of headword"""
3038
3039 - def __init__(self, dataPath, dbConnectInst, quiet=False):
3041
3048
3058
3061 """
3062 Builds the word index of the EDICT dictionary.
3063 """
3064 PROVIDES = 'EDICT_Words'
3065 DEPENDS = ['EDICT']
3066 TABLE_SOURCE = 'EDICT'
3067
3090
3093 """
3094 Builds the CEDICT dictionary.
3095 """
3097 """
3098 Converts the C{'u:'} to C{'ü'}.
3099
3100 @type entry: tuple
3101 @param entry: a dictionary entry
3102 @rtype: tuple
3103 @return: the given entry with corrected ü-voul
3104 """
3105 if type(entry) == type({}):
3106 entry['Reading'] = entry['Reading'].replace('u:', u'ü')
3107 return entry
3108 else:
3109 trad, simp, reading, translation = entry
3110 reading = reading.replace('u:', u'ü')
3111 return [trad, simp, reading, translation]
3112
PROVIDES = 'CEDICT'
# accepted download names, newest distribution format first
FILE_NAMES = ['cedict_1_0_ts_utf-8_mdbg.zip',
    'cedict_1_0_ts_utf-8_mdbg.txt.gz', 'cedictu8.zip', 'cedict_ts.u8',
    'cedict_1_0_ts_utf-8_mdbg.txt']
ENCODING = 'utf-8'
# normalise 'u:' to 'ü' in readings while importing
FILTER = filterUmlaut
3119
def getArchiveContentName(self, filePath):
    """Returns the name of the dictionary file inside the archive."""
    # the MDBG archives always ship the dictionary under this fixed name
    return 'cedict_ts.u8'
3122
3132
3135 """
3136 Builds the CEDICT-GR dictionary.
3137 """
3138 PROVIDES = 'CEDICTGR'
3139 FILE_NAMES = ['cedictgr.zip', 'cedictgr.b5']
3140 ENCODING = 'big5hkscs'
3141
def getArchiveContentName(self, filePath):
    """Returns the name of the dictionary file inside the archive."""
    # fixed file name within the cedictgr.zip distribution
    return 'cedictgr.b5'
3144
3154
3157 """
3158 Builds the HanDeDict dictionary.
3159 """
3161 """
3162 Converts wrong spacing in readings of entries in HanDeDict.
3163
3164 @type entry: tuple
3165 @param entry: a dictionary entry
3166 @rtype: tuple
3167 @return: the given entry with corrected spacing
3168 """
3169 if type(entry) == type({}):
3170 headword = entry['HeadwordTraditional']
3171 reading = entry['Reading']
3172 else:
3173 headword, headwordSimplified, reading, translation = entry
3174
3175 readingEntities = []
3176 precedingIsNonReading = False
3177 for idx, entity in enumerate(reading.split(' ')):
3178 if idx < len(headword) and entity == headword[idx]:
3179
3180
3181 if not precedingIsNonReading:
3182 readingEntities.append(' ')
3183
3184 precedingIsNonReading = True
3185 elif idx != 0:
3186 readingEntities.append(' ')
3187 precedingIsNonReading = False
3188
3189 readingEntities.append(entity)
3190
3191 reading = ''.join(readingEntities)
3192
3193 if type(entry) == type({}):
3194 entry['Reading'] = reading
3195 return entry
3196 else:
3197 return [headword, headwordSimplified, reading, translation]
3198
PROVIDES = 'HanDeDict'
# glob patterns: dated archives preferred, plain text file as fallback
FILE_NAMES = ['handedict-*.zip', 'handedict-*.tar.bz2', 'handedict.u8']
ENCODING = 'utf-8'
# re-space readings while importing
FILTER = filterSpacing
3203
def extractTimeStamp(self, filePath):
    """
    Extracts the eight-digit date stamp from a HanDeDict file name.

    @type filePath: str
    @param filePath: path of the dictionary file
    @rtype: str
    @return: date string (YYYYMMDD), or None if the name carries no stamp
    """
    fileName = os.path.basename(filePath)
    matchObj = re.match(r'handedict-(\d{8})\.', fileName)
    if matchObj:
        return matchObj.group(1)

def getPreferredFile(self, filePaths):
    """
    Picks the file with the newest date stamp from the given paths.

    @type filePaths: list of str
    @param filePaths: candidate dictionary file paths
    @rtype: str
    @return: path of the newest file, or the first path if no file name
        carries a date stamp
    """
    timeStamps = []
    for filePath in filePaths:
        ts = self.extractTimeStamp(filePath)
        if ts:
            timeStamps.append((ts, filePath))
    if timeStamps:
        # YYYYMMDD strings compare lexicographically == chronologically
        _, filePath = max(timeStamps)
        return filePath
    else:
        # fixed: the fallback lacked the 'return', so None was returned
        return filePaths[0]
3221
def getArchiveContentName(self, filePath):
    """Returns the path of the dictionary file inside the archive."""
    # the archive's top-level directory carries the release date stamp
    stamp = self.extractTimeStamp(filePath)
    return 'handedict-' + stamp + '/handedict.u8'
3225
def findFile(self, fileGlobs, fileType=None):
    """
    Tries to locate a file with a given list of possible file names under
    the classes default data paths.

    Uses the newest version of all files found.

    @type fileGlobs: str/list of str
    @param fileGlobs: possible file names
    @type fileType: str
    @param fileType: textual type of file used in error msg
    @rtype: str
    @return: path to file of first match in search for existing file
    @raise IOError: if no file found
    """
    import glob

    # accept a single glob as well as a list of globs
    if type(fileGlobs) != type([]):
        fileGlobs = [fileGlobs]

    # collect every (file name, path) match under all data paths
    candidates = []
    for pattern in fileGlobs:
        for searchPath in self.dataPath:
            fullPattern = os.path.join(os.path.expanduser(searchPath),
                pattern)
            candidates.extend([(os.path.basename(match), match)
                for match in glob.glob(fullPattern)
                if os.path.exists(match)])

    if not candidates:
        if fileType == None:
            fileType = "file"
        raise IOError("No " + fileType + " found for '" + self.PROVIDES \
            + "' under path(s)'" + "', '".join(self.dataPath) \
            + "' for file names '" + "', '".join(fileGlobs) + "'")

    # let a subclass pick its preferred file, otherwise the name that
    # sorts highest is considered the newest version
    if hasattr(self, 'getPreferredFile'):
        return self.getPreferredFile([path for _, path in candidates])
    _, newestPath = max(candidates)
    return newestPath
3266
3269 """
3270 Builds the word index of the HanDeDict dictionary.
3271 """
3272 PROVIDES = 'HanDeDict_Words'
3273 DEPENDS = ['HanDeDict']
3274 TABLE_SOURCE = 'HanDeDict'
3275 HEADWORD_SOURCE = 'HeadwordTraditional'
3276
3281 """
3282 DatabaseBuilder provides the main class for building up a database for the
3283 cjklib package.
3284
3285 It contains all L{TableBuilder} classes and a dependency graph to handle
3286 build requests.
3287 """
def __init__(self, databaseSettings={}, dbConnectInst=None, dataPath=[],
    quiet=False, rebuildDepending=True, rebuildExisting=True, noFail=False,
    prefer=[], additionalBuilders=[]):
    """
    Constructs the DatabaseBuilder.

    @type databaseSettings: dict
    @param databaseSettings: dictionary holding the database options for the
        dbconnector module.
    @type dbConnectInst: instance
    @param dbConnectInst: instance of a L{DatabaseConnector}
    @type dataPath: list of str
    @param dataPath: optional list of paths to the data file(s)
    @type quiet: bool
    @param quiet: if true no status information will be printed to stderr
    @type rebuildDepending: bool
    @param rebuildDepending: if true existing tables that depend on updated
        tables will be dropped and built from scratch
    @type rebuildExisting: bool
    @param rebuildExisting: if true existing tables will be dropped and
        built from scratch
    @type noFail: bool
    @param noFail: if true build process won't terminate even if one table
        fails to build
    @type prefer: list
    @param prefer: list of L{TableBuilder} names to prefer in conflicting
        cases
    @type additionalBuilders: list of classobj
    @param additionalBuilders: list of externally provided TableBuilders
    """
    # NOTE: the mutable default arguments ({}/[]) are only read, never
    # mutated below, so the shared-default pitfall does not apply here.
    if not dataPath:
        # fall back to the data directory shipped with the cjklib package
        buildModule = __import__("cjklib.build")
        self.dataPath = [os.path.join(buildModule.__path__[0], 'data')]
    elif isinstance(dataPath, list):
        self.dataPath = dataPath
    else:
        # a single path string was given
        self.dataPath = [dataPath]
    self.quiet = quiet
    self.rebuildDepending = rebuildDepending
    self.rebuildExisting = rebuildExisting
    self.noFail = noFail

    # get connector to database
    if dbConnectInst:
        self.db = dbConnectInst
    else:
        self.db = dbconnector.DatabaseConnector.getDBConnector(
            databaseSettings)

    # get all TableBuilder classes, resolving conflicting providers
    tableBuilderClasses = DatabaseBuilder.getTableBuilderClasses(
        set(prefer), quiet=self.quiet,
        additionalBuilders=additionalBuilders)

    # build lookup from provided table name to its builder class
    self.tableBuilderLookup = {}
    for tableBuilder in tableBuilderClasses.values():
        # was dict.has_key(), which is removed in Python 3
        if tableBuilder.PROVIDES in self.tableBuilderLookup:
            raise Exception("Table '" + tableBuilder.PROVIDES \
                + "' provided by several builders")
        self.tableBuilderLookup[tableBuilder.PROVIDES] = tableBuilder
3350
3352 """
3353 Changes the data path.
3354
3355 @type dataPath: list of str
3356 @param dataPath: list of paths to the data file(s)
3357 """
3358 if type(dataPath) == type([]):
3359 self.dataPath = dataPath
3360 else:
3361
3362 self.dataPath = [dataPath]
3363
def build(self, tables):
    """
    Builds the given tables.

    Resolves dependencies, instantiates the responsible L{TableBuilder}s
    in dependency order and runs each build inside its own transaction.

    @type tables: list
    @param tables: list of tables to build
    """
    # accept a single table name as well as a list
    if type(tables) != type([]):
        tables = [tables]

    # NOTE(review): printed even when self.quiet is set — confirm intended
    warn("Building database '%s'" % self.db.databaseUrl)

    # check that all tables are provided and filter out those that
    # already exist and need no rebuild
    filteredTables = []
    for table in tables:
        if table not in self.tableBuilderLookup:
            raise exception.UnsupportedError("Table '%s' not provided" \
                % table)

        if self.needsRebuild(table):
            filteredTables.append(table)
        else:
            if not self.quiet:
                warn("Skipping table '%s' because it already exists" \
                    % table)
    tables = filteredTables

    # add existing tables that depend on the rebuilt ones, as their
    # content would otherwise go stale
    dependingTables = []
    if self.rebuildDepending:
        dependingTables = self.getRebuiltDependingTables(tables)
        if dependingTables:
            warn("Tables rebuilt because of dependencies updated: '" \
                +"', '".join(dependingTables) + "'")
        tables.extend(dependingTables)

    # tables only built to satisfy dependencies; removed again at the end
    buildDependentTables = self.getBuildDependentTables(tables)
    buildTables = set(tables) | buildDependentTables
    # get build order
    builderClasses = self.getClassesInBuildOrder(buildTables)

    # build tables
    if not self.quiet and self.rebuildExisting:
        warn("Rebuilding tables and overwriting old ones...")
    builderClasses.reverse()
    instancesUnrequestedTable = set()
    while builderClasses:
        builder = builderClasses.pop()

        # each table is built in its own transaction so a failure only
        # rolls back the table currently being built
        transaction = self.db.connection.begin()
        try:
            instance = builder(self.dataPath, self.db, self.quiet)

            # remember tables that were only built as dependencies and
            # didn't exist before, so they can be removed afterwards
            if builder.PROVIDES in buildDependentTables \
                and not self.db.engine.has_table(builder.PROVIDES):
                instancesUnrequestedTable.add(instance)

            if self.db:
                if self.db.engine.has_table(builder.PROVIDES):
                    if not self.quiet:
                        warn("Removing previously built table '" \
                            + builder.PROVIDES + "'")
                    instance.remove()
            else:
                instance.remove()

            if not self.quiet:
                warn("Building table '" + builder.PROVIDES \
                    + "' with builder '" + builder.__name__ + "'...")

            instance.build()
            transaction.commit()
        except IOError, e:
            transaction.rollback()
            # data not available, either fail or skip this table and
            # everything that (transitively) depends on it
            if self.noFail:
                if not self.quiet:
                    warn("Building table '" + builder.PROVIDES \
                        + "' failed: '" + str(e) + "', skipping")
                dependingTables = [builder.PROVIDES]
                remainingBuilderClasses = []
                for clss in builderClasses:
                    if set(clss.DEPENDS) & set(dependingTables):
                        # this class depends on one being removed
                        dependingTables.append(clss.PROVIDES)
                    else:
                        remainingBuilderClasses.append(clss)
                if not self.quiet and len(dependingTables) > 1:
                    warn("Ignoring depending table(s) '" \
                        + "', '".join(dependingTables[1:]) + "'")
                builderClasses = remainingBuilderClasses
            else:
                raise
        except Exception, e:
            # any other error aborts the whole build
            transaction.rollback()
            raise

    # remove tables that were only built to satisfy dependencies
    if instancesUnrequestedTable:
        for instance in instancesUnrequestedTable:
            if not self.quiet:
                warn("Removing table '" + instance.PROVIDES \
                    + "' as it was only created to solve build " \
                    + "dependencies")
            instance.remove()
3472
3474 """
3475 Removes the given tables.
3476
3477 @type tables: list
3478 @param tables: list of tables to remove
3479 """
3480 if type(tables) != type([]):
3481 tables = [tables]
3482
3483 tableBuilderClasses = []
3484 for table in set(tables):
3485 if not self.tableBuilderLookup.has_key(table):
3486 raise exception.UnsupportedError("table '" + table \
3487 + "' not provided")
3488 tableBuilderClasses.append(self.tableBuilderLookup[table])
3489
3490 for builder in tableBuilderClasses:
3491 instance = builder(self.dataPath, self.db, self.quiet)
3492 if self.db:
3493 if self.db.engine.has_table(builder.PROVIDES):
3494 if not self.quiet:
3495 warn("Removing previously built table '" \
3496 + builder.PROVIDES + "'")
3497 instance.remove()
3498 else:
3499 instance.remove()
3500
3502 """
3503 Returns true if either rebuild is turned on by default or we build into
3504 database and the table doesn't exist yet.
3505
3506 @type tableName: classobj
3507 @param tableName: L{TableBuilder} class
3508 @rtype: bool
3509 @return: True, if table needs to be rebuilt
3510 """
3511 if self.rebuildExisting:
3512 return True
3513 else:
3514 return not self.db.engine.has_table(tableName)
3515
3517 """
3518 Gets the name of the tables that needs to be built to resolve
3519 dependencies.
3520
3521 @type tableNames: list of str
3522 @param tableNames: list of tables to build
3523 @rtype: list of str
3524 @return: names of tables needed to resolve dependencies
3525 """
3526 def solveDependencyRecursive(table):
3527 """
3528 Gets all tables on which the given table depends and that need to be
3529 rebuilt. Also will mark tables skipped which won't be rebuilt.
3530
3531 Uses parent's variables to store data.
3532
3533 @type table: str
3534 @param table: table name for which to solve dependencies
3535 """
3536 if table in tableNames:
3537
3538 return
3539 if self.db and self.db.engine.has_table(table):
3540 skippedTables.add(table)
3541 return
3542
3543 dependedTablesNames.add(table)
3544
3545
3546 if not self.tableBuilderLookup.has_key(table):
3547
3548
3549
3550 raise exception.UnsupportedError("table '" + table \
3551 + "' not provided, might be related to conflicting " \
3552 + "builders")
3553 builderClass = self.tableBuilderLookup[table]
3554 for dependantTable in builderClass.DEPENDS:
3555 solveDependencyRecursive(dependantTable)
3556
3557 tableNames = set(tableNames)
3558 dependedTablesNames = set()
3559 skippedTables = set()
3560
3561 for table in tableNames:
3562 builderClass = self.tableBuilderLookup[table]
3563 for depededTable in builderClass.DEPENDS:
3564 solveDependencyRecursive(depededTable)
3565
3566 if not self.quiet and skippedTables:
3567 warn("Newly built tables depend on table(s) '" \
3568 + "', '".join(skippedTables) \
3569 + "' but skipping because they already exist")
3570 return dependedTablesNames
3571
3573 """
3574 Gets the name of the tables that depend on the given tables to be built
3575 and are not included in the given set.
3576
3577 Dependencies depend on the choice of table builders and thus may vary.
3578
3579 @type tableNames: list of str
3580 @param tableNames: list of tables
3581 @rtype: list of str
3582 @return: names of tables that depend on given tables
3583 """
3584 dependencyTables = set(tableNames)
3585 dependingTablesNames = set()
3586 residualTables = self.getCurrentSupportedTables() - dependencyTables
3587
3588 while dependencyTables:
3589 dependencyTable = dependencyTables.pop()
3590 for table in residualTables:
3591 builderClass = self.tableBuilderLookup[table]
3592 if dependencyTable in builderClass.DEPENDS:
3593
3594 dependingTablesNames.add(table)
3595
3596 dependencyTables.add(table)
3597
3598 residualTables = residualTables - dependencyTables
3599
3600 return dependingTablesNames
3601
3603 """
3604 Gets the name of the tables that depend on the given tables to be built
3605 and already exist, thus need to be rebuilt.
3606
3607 @type tableNames: list of str
3608 @param tableNames: list of tables
3609 @rtype: list of str
3610 @return: names of tables that need to be rebuilt because of dependencies
3611 """
3612 dependingTables = self.getDependingTables(tableNames)
3613
3614 needRebuild = set()
3615 for tableName in dependingTables:
3616 if self.db.engine.has_table(tableName):
3617 needRebuild.add(tableName)
3618 return needRebuild
3619
3621 """
3622 Gets the build order for the given table names.
3623
3624 @type tableNames: list of str
3625 @param tableNames: list of names of tables to build
3626 @rtype: list of classobj
3627 @return: L{TableBuilder}s in build order
3628 """
3629
3630 tableBuilderClasses = []
3631 for table in set(tableNames):
3632 if not self.tableBuilderLookup.has_key(table):
3633
3634
3635
3636 raise exception.UnsupportedError("table '" + table \
3637 + "' not provided, might be related to conflicting " \
3638 + "builders")
3639 tableBuilderClasses.append(self.tableBuilderLookup[table])
3640 return self.getBuildDependencyOrder(tableBuilderClasses)
3641
@staticmethod
# NOTE(review): the "def getBuildDependencyOrder(tableBuilderClasses)"
# line is elided in this extract.
"""
Create order in which the tables have to be created.

@type tableBuilderClasses: list of classobj
@param tableBuilderClasses: list of L{TableBuilder} classes
@rtype: list of classobj
@return: the given classes ordered in build dependency order
"""
dependencyOrder = []
providedTables = [bc.PROVIDES for bc in tableBuilderClasses]
includedTableNames = set()
while tableBuilderClasses:
    for builderClass in tableBuilderClasses:
        # schedule a builder once all of its dependencies WITHIN the
        # given set are satisfied; dependencies on tables not provided
        # here are assumed to exist already
        if set(builderClass.DEPENDS).intersection(providedTables) \
            <= includedTableNames:
            break
    else:
        # no builder could be scheduled in this pass: dependencies are
        # unfulfillable or cyclic
        raise Exception("Unfulfillable depend request, " \
            + "might be related to conflicting builders or cycle. " \
            + "Builders included: '" \
            + "', '".join([clss.__name__ for clss in dependencyOrder]) \
            + "'. Builders with open depends: '" \
            + "', '".join([builder.PROVIDES \
                for builder in tableBuilderClasses]) + "'")
    dependencyOrder.append(builderClass)
    includedTableNames.add(builderClass.PROVIDES)
    tableBuilderClasses.remove(builderClass)
return dependencyOrder
3679
@staticmethod
def getTableBuilderClasses(preferClassSet=set(), resolveConflicts=True,
    quiet=True, additionalBuilders=[]):
    """
    Gets all classes in module that implement L{TableBuilder}.

    @type preferClassSet: set of str
    @param preferClassSet: set of L{TableBuilder} names to prefer in
        conflicting cases, resolveConflicting must be True to take effect
        (default)
    @type resolveConflicts: bool
    @param resolveConflicts: if true conflicting builders will be removed
        so that only one builder is left per Table.
    @type quiet: bool
    @param quiet: if true no status information will be printed to stderr
    @type additionalBuilders: list of classobj
    @param additionalBuilders: list of externally provided TableBuilders
    @rtype: dict
    @return: dictionary of all classes inheriting form L{TableBuilder} that
        provide a table (i.d. non abstract implementations), with its name
        as key
    """
    # NOTE: the mutable defaults (set()/[]) are only read, never mutated.
    tableBuilderClasses = {}
    buildModule = __import__("cjklib.build")
    # collect all concrete TableBuilder subclasses defined in this module
    tableBuilderClasses = dict([(clss.__name__, clss) \
        for clss in buildModule.build.__dict__.values() \
        if type(clss) == types.TypeType \
        and issubclass(clss, buildModule.build.TableBuilder) \
        and clss.PROVIDES])
    # add the externally supplied builders
    tableBuilderClasses.update(dict([(clss.__name__, clss) \
        for clss in additionalBuilders]))

    # group builder names by the table they provide to detect conflicts
    tableToBuilderMapping = {}
    for clssName, clss in tableBuilderClasses.iteritems():
        if clss.PROVIDES not in tableToBuilderMapping:
            tableToBuilderMapping[clss.PROVIDES] = set()

        tableToBuilderMapping[clss.PROVIDES].add(clssName)

    if resolveConflicts:
        # remove all but one builder per table, keeping the preferred one
        for tableName, builderClssSet in tableToBuilderMapping.items():
            preferredBuilders = builderClssSet & preferClassSet
            if preferredBuilders:
                if len(preferredBuilders) > 1:
                    # the caller supplied more than one preferred builder
                    # for this very table
                    raise Exception("More than one TableBuilder " \
                        + "preferred for conflicting table.")
                preferred = preferredBuilders.pop()
                builderClssSet.remove(preferred)
            else:
                # no preference given: keep an arbitrary one
                preferred = builderClssSet.pop()
            if not quiet and builderClssSet:
                warn("Removing conflicting builder(s) '" \
                    + "', '".join(builderClssSet) + "' in favour of '" \
                    + preferred + "'")
            # remove the losing builders from the result
            for clssName in builderClssSet:
                del tableBuilderClasses[clssName]
    return tableBuilderClasses
3745
@staticmethod
# NOTE(review): the "def getSupportedTables()" line is elided in this
# extract.
"""
Gets names of supported tables.

@rtype: list of str
@return: names of tables
"""
# no conflict resolution so EVERY buildable table is reported
classDict = DatabaseBuilder.getTableBuilderClasses(
    resolveConflicts=False)
return set([clss.PROVIDES for clss in classDict.values()])
3757
3759 """
3760 Gets names of tables supported by this instance of the database builder.
3761
3762 This list can have more entries then L{getSupportedTables()} as
3763 additional external builders can be supplied on instantiation.
3764
3765 @rtype: list of str
3766 @return: names of tables
3767 """
3768 return set(self.tableBuilderLookup.keys())
3769
3771 """
3772 Checks if the current database supports optimization.
3773
3774 @rtype: boolean
3775 @return: True if optimizable, False otherwise
3776 """
3777 return self.db.engine.name in ['sqlite']
3778
3780 """
3781 Optimizes the current database.
3782
3783 @raise Exception: if database does not support optimization
3784 @raise OperationalError: if optimization failed
3785 """
3786 if self.db.engine.name == 'sqlite':
3787 self.db.execute('VACUUM')
3788 else:
3789 raise Exception('Database does not seem to support optimization')
3790
3791
3792
3793
def warn(message):
    """
    Prints the given message to stderr with the system's default encoding.

    @type message: str
    @param message: message to print
    """
    # encode with replacement so unprintable characters never raise
    encoded = message.encode(locale.getpreferredencoding(), 'replace')
    sys.stderr.write(encoded + '\n')
3803