Package cjklib :: Module build
[hide private]
[frames] | [no frames]

Source Code for Module cjklib.build

   1  #!/usr/bin/python 
   2  # -*- coding: utf-8  -*- 
   3  # This file is part of cjklib. 
   4  # 
   5  # cjklib is free software: you can redistribute it and/or modify 
   6  # it under the terms of the GNU Lesser General Public License as published by 
   7  # the Free Software Foundation, either version 3 of the License, or 
   8  # (at your option) any later version. 
   9  # 
  10  # cjklib is distributed in the hope that it will be useful, 
  11  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
  12  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
  13  # GNU Lesser General Public License for more details. 
  14  # 
  15  # You should have received a copy of the GNU Lesser General Public License 
  16  # along with cjklib.  If not, see <http://www.gnu.org/licenses/>. 
  17   
  18  """ 
  19  Provides the building methods for the cjklib package. 
  20   
  21  Each table that needs to be created has to be implemented by a L{TableBuilder}. 
  22  The L{DatabaseBuilder} is the central instance for managing the build process. 
  23  As the creation of a table can depend on other tables the DatabaseBuilder keeps 
  24  track of dependencies to process a build in the correct order. 
  25   
  26  Building is tested on the following storage methods: 
  27      - SQLite 
  28      - MySQL 
  29   
  30  Some L{TableBuilder} implementations aren't used by the CJK library but are 
  31  provided here for additional usage. 
  32   
   33  On MS Windows the Python versions provided by default seem to be a 
   34  "X{narrow build}" and do not support characters outside the BMP (see e.g. 
   35  U{http://wordaligned.org/articles/narrow-python}). Currently no Unicode 
   36  characters outside the BMP will thus be supported on Windows platforms. 
  37   
  38  Examples 
  39  ======== 
  40  The following examples should give a quick view into how to use this 
  41  package. 
  42      - Create the DatabaseBuilder object with default settings (read from 
  43          cjklib.conf or using 'cjklib.db' in same directory as default): 
  44   
  45          >>> from cjklib import build 
  46          >>> dbBuilder = build.DatabaseBuilder(dataPath=['./cjklib/data/']) 
  47          Removing conflicting builder(s) 'CharacterVariantBMPBuilder' in favour 
  48          of 'CharacterVariantBuilder' 
  49          Removing conflicting builder(s) 'SlimUnihanBuilder', 'UnihanBuilder', 
  50          'UnihanBMPBuilder' in favour of 'SlimUnihanBMPBuilder' 
  51          Removing conflicting builder(s) 'StrokeCountBuilder' in favour of 
  52          'CombinedStrokeCountBuilder' 
  53          Removing conflicting builder(s) 'CharacterResidualStrokeCountBuilder' in 
  54          favour of 'CombinedCharacterResidualStrokeCountBuilder' 
  55   
  56      - Build the table of Jyutping syllables from a csv file: 
  57   
  58          >>> dbBuilder.build(['JyutpingSyllables']) 
  59          building table 'JyutpingSyllables' with builder 
  60          'JyutpingSyllablesBuilder'... 
  61          Reading table definition from file './cjklib/data/jyutpingsyllables.sql' 
  62          Reading table 'JyutpingSyllables' from file 
  63          './cjklib/data/jyutpingsyllables.csv' 
  64   
  65  @todo Impl: Further character domains: BIG5 (Taiwan), kIRG_GSource (Unicode, 
  66      Simplified Chinese), kIRG_JSource (Unicode, Japanese), kIRG_KPSource and 
  67      kIRG_KSource (Unicode, Korean), kIRG_TSource (Unicode, Traditional Chinese), 
  68      kIRG_VSource (Unicode, Vietnamese) 
  69  @todo Fix:  On interruption (Ctrl+C) remove tables that were only created 
  70      because of dependencies. 
  71  """ 
  72   
  73  import types 
  74  import locale 
  75  import sys 
  76  import re 
  77  import os.path 
  78  import xml.sax 
  79  import csv 
  80   
  81  from sqlalchemy import Table, Column, Integer, String, Text, Index 
  82  from sqlalchemy import select, union 
  83  from sqlalchemy.sql import text, func 
  84  from sqlalchemy.sql import and_, or_, not_ 
  85  import sqlalchemy 
  86   
  87  from cjklib import dbconnector 
  88  from cjklib import characterlookup 
  89  from cjklib import exception 
#{ TableBuilder and generic classes

class TableBuilder(object):
    """
    TableBuilder provides the abstract layout for classes that build a distinct
    table.
    """
    PROVIDES = ''
    """Contains the name of the table provided by this module."""
    DEPENDS = []
    """Contains the names of the tables needed for the build process."""

    def __init__(self, dataPath=None, dbConnectInst=None, quiet=False):
        """
        Constructs the TableBuilder.

        @type dataPath: list of str
        @param dataPath: optional list of paths to the data file(s)
        @type dbConnectInst: instance
        @param dbConnectInst: instance of a L{DatabaseConnector}. If not given
            all sql code will be printed to stdout.
        @type quiet: bool
        @param quiet: if true no status information will be printed to stderr
        """
        self.dataPath = dataPath
        self.quiet = quiet
        self.db = dbConnectInst

    def build(self):
        """
        Build the table provided by the TableBuilder.

        Methods should raise an IOError if reading a data source fails. The
        L{DatabaseBuilder} knows how to handle this case and is able to
        proceed.
        """
        pass

    def remove(self):
        """
        Removes the table provided by the TableBuilder from the database.
        """
        pass

    def findFile(self, fileNames, fileType=None):
        """
        Tries to locate a file with a given list of possible file names under
        the classes default data paths.

        For each file name every given path is checked and the first match is
        returned.

        @type fileNames: str/list of str
        @param fileNames: possible file names
        @type fileType: str
        @param fileType: textual type of file used in error msg
        @rtype: str
        @return: path to file of first match in search for existing file
        @raise IOError: if no file found
        """
        # accept a single file name for convenience
        if not isinstance(fileNames, list):
            fileNames = [fileNames]
        for fileName in fileNames:
            for path in self.dataPath:
                filePath = os.path.join(os.path.expanduser(path), fileName)
                if os.path.exists(filePath):
                    return filePath
        if fileType is None:
            fileType = "file"
        # fixed missing space between "path(s)" and the first quoted path
        raise IOError("No %s found for '%s' under path(s) '%s' "
            "for file names '%s'" % (fileType, self.PROVIDES,
                "', '".join(self.dataPath), "', '".join(fileNames)))

    def buildTableObject(self, tableName, columns, columnTypeMap=None,
        primaryKeys=None):
        """
        Returns a SQLAlchemy Table object.

        @type tableName: str
        @param tableName: name of table
        @type columns: list of str
        @param columns: column names
        @type columnTypeMap: dict of str and object
        @param columnTypeMap: mapping of column name to SQLAlchemy Column
        @type primaryKeys: list of str
        @param primaryKeys: list of primary key columns
        """
        # avoid mutable default arguments shared between calls
        if columnTypeMap is None:
            columnTypeMap = {}
        if primaryKeys is None:
            primaryKeys = []
        table = Table(tableName, self.db.metadata)
        for column in columns:
            if column in columnTypeMap:
                type_ = columnTypeMap[column]
            else:
                # fall back to a generic text column and report it
                type_ = Text()
                warn("column %s has no type, assuming default 'Text()'"
                    % column)
            table.append_column(Column(column, type_,
                primary_key=(column in primaryKeys)))

        return table

    def buildIndexObjects(self, tableName, indexKeyList):
        """
        Returns a list of SQLAlchemy Index objects for the given table.

        @type tableName: str
        @param tableName: name of table
        @type indexKeyList: list of list of str
        @param indexKeyList: a list of key combinations
        @rtype: list of object
        @return: list of SQLAlchemy Index objects
        """
        indexList = []
        table = Table(tableName, self.db.metadata, autoload=True)
        for indexKeyColumns in indexKeyList:
            # index name encodes table and column combination
            indexName = tableName + '__' + '_'.join(indexKeyColumns)
            indexList.append(Index(indexName,
                *[table.c[column] for column in indexKeyColumns]))

        return indexList
class EntryGeneratorBuilder(TableBuilder):
    """
    Implements an abstract class for building a table from a generator
    providing entries.
    """
    COLUMNS = []
    """Columns that will be built"""
    PRIMARY_KEYS = []
    """Primary keys of the created table"""
    INDEX_KEYS = []
    """Index keys (not unique) of the created table"""
    COLUMN_TYPES = {}
    """Column types for created table"""

    def getGenerator(self):
        """
        Returns the entry generator.
        Needs to be implemented by child classes.
        """
        pass

    def getEntryDict(self, generator):
        # Collects all entries of the generator into a list of dicts keyed by
        # column name. The first entry decides the format: if the generator
        # yields dicts they are taken as is, otherwise entries are assumed to
        # be sequences ordered like self.COLUMNS.
        entryList = []

        firstEntry = generator.next()
        if type(firstEntry) == type(dict()):
            entryList.append(firstEntry)

            for newEntry in generator:
                entryList.append(newEntry)
        else:
            firstEntryDict = dict([(column, firstEntry[i]) \
                for i, column in enumerate(self.COLUMNS)])
            entryList.append(firstEntryDict)

            for newEntry in generator:
                entryDict = dict([(column, newEntry[i]) \
                    for i, column in enumerate(self.COLUMNS)])
                entryList.append(entryDict)

        return entryList

    def build(self):
        # get generator, might raise an Exception if source not found
        generator = self.getGenerator()

        # get create statement
        table = self.buildTableObject(self.PROVIDES, self.COLUMNS,
            self.COLUMN_TYPES, self.PRIMARY_KEYS)
        table.create()

        # write table content
        # NOTE(review): the bulk-insert path via getEntryDict() is kept
        # disabled; entries are inserted one by one below instead.
        #try:
            #entries = self.getEntryDict(self.getGenerator())
            #self.db.execute(table.insert(), entries)
        #except sqlalchemy.exceptions.IntegrityError, e:
            #warn(unicode(e))
            ##warn(unicode(insertStatement))
            #raise

        for newEntry in generator:
            try:
                table.insert(newEntry).execute()
            except sqlalchemy.exceptions.IntegrityError, e:
                warn(unicode(e))
                raise

        # create (non unique) indices after the data is in place
        for index in self.buildIndexObjects(self.PROVIDES, self.INDEX_KEYS):
            index.create()

    def remove(self):
        # get drop table statement
        table = Table(self.PROVIDES, self.db.metadata)
        table.drop()
285
class ListGenerator:
    """Wraps a fixed list of entries behind the generator interface."""

    def __init__(self, entryList):
        """
        Initialises the ListGenerator.

        @type entryList: list of str
        @param entryList: user defined entry
        """
        self.entryList = entryList

    def generator(self):
        """Yields the wrapped entries one by one, in order."""
        for item in self.entryList:
            yield item
301
#}
#{ Unihan character information

class UnihanGenerator:
    """
    Iterates over the entries of a Unihan database file, merging all key/value
    pairs belonging to one character into a single entry.
    """
    keySet = None
    """Set of keys of the Unihan table."""

    def __init__(self, fileName, useKeys=None, quiet=False):
        """
        Constructs the UnihanGenerator.

        @type fileName: str
        @param fileName: path to the Unihan database file
        @type useKeys: list
        @param useKeys: if given only these keys will be read from the table,
            otherwise all keys will be returned
        @type quiet: bool
        @param quiet: if true no status information will be printed to stderr
        """
        # matches one entry in the Unihan database,
        # e.g. C{U+8682 kMandarin MA3 MA1 MA4}
        self.ENTRY_REGEX = re.compile(ur"U\+([0-9A-F]+)\s+(\w+)\s+(.+)\s*$")
        self.fileName = fileName
        self.quiet = quiet
        if useKeys != None:
            self.limitKeys = True
            self.keySet = set(useKeys)
        else:
            self.limitKeys = False

    def generator(self):
        """
        Iterates over the Unihan entries.

        The character definition is converted to the character's
        representation, all other data is given as is. These are merged into
        one entry for each character.
        """
        # attributes are separated over several lines. Read over lines until a
        # new character is found, then yield the old entry.
        handle = self.getHandle()
        entryIndex = -1
        entry = {}
        for line in handle:
            # ignore comments
            if line.startswith('#'):
                continue
            resultObj = self.ENTRY_REGEX.match(line)
            if not resultObj:
                if not self.quiet:
                    warn("can't read line from Unihan.txt: '" + line + "'")
                continue
            unicodeHexIndex, key, value = resultObj.group(1, 2, 3)

            # if we have a limited target key set, check if the current one is
            # to be included
            if self.limitKeys and not key in self.keySet:
                continue
            # check if new character entry found
            if entryIndex != unicodeHexIndex and entryIndex != -1:
                try:
                    # yield old one
                    char = unichr(int(entryIndex, 16))
                    yield(char, entry)
                except ValueError:
                    # catch for Unicode characters outside BMP for narrow
                    # builds
                    pass
                # empty old entry
                entry = {}
            entryIndex = unicodeHexIndex
            entry[key] = value
        # generate last entry
        if entry:
            try:
                # yield old one
                char = unichr(int(entryIndex, 16))
                yield(char, entry)
            except ValueError:
                # catch for Unicode characters outside BMP for narrow builds
                pass
        handle.close()

    def getHandle(self):
        """
        Returns a handle of the Unihan database file.

        @rtype: file
        @return: file handle of the Unihan file
        """
        import zipfile
        # transparently support both a zipped and a plain text database file
        if zipfile.is_zipfile(self.fileName):
            import StringIO
            z = zipfile.ZipFile(self.fileName, "r")
            handle = StringIO.StringIO(z.read("Unihan.txt").decode('utf-8'))
        else:
            import codecs
            handle = codecs.open(self.fileName, 'r', 'utf-8')
        return handle

    def keys(self):
        """
        Returns all keys read for the Unihan table.

        If the whole table is read a seek through the file is needed first to
        find all keys, otherwise the predefined set is returned.
        @rtype: list
        @return: list of column names
        """
        if not self.keySet:
            if not self.quiet:
                warn("looking for all keys in Unihan database...")
            self.keySet = set()
            handle = self.getHandle()
            for line in handle:
                # ignore comments
                if line.startswith('#'):
                    continue
                resultObj = self.ENTRY_REGEX.match(line)
                if not resultObj:
                    continue

                unicodeHexIndex, key, value = resultObj.group(1, 2, 3)
                self.keySet.add(key)
            handle.close()
        return list(self.keySet)
429
430 431 -class UnihanBuilder(EntryGeneratorBuilder):
432 """Builds the Unihan database from the Unihan file provided by Unicode."""
    class EntryGenerator:
        """Generates the entries of the Unihan table."""

        def __init__(self, unihanGenerator):
            """
            Initialises the EntryGenerator.

            @type unihanGenerator: instance
            @param unihanGenerator: a L{UnihanGenerator} instance
            """
            self.unihanGenerator = unihanGenerator

        def generator(self):
            """Provides all data of one character per entry."""
            columns = self.unihanGenerator.keys()
            for char, entryDict in self.unihanGenerator.generator():
                # start with the character key; fill columns absent for this
                # character with None
                newEntryDict = {UnihanBuilder.CHARACTER_COLUMN: char}
                for column in columns:
                    if entryDict.has_key(column):
                        newEntryDict[column] = entryDict[column]
                    else:
                        newEntryDict[column] = None
                yield newEntryDict
456 457 PROVIDES = 'Unihan' 458 CHARACTER_COLUMN = 'ChineseCharacter' 459 """Name of column for Chinese character key.""" 460 COLUMN_TYPES = {CHARACTER_COLUMN: String(1), 'kCantonese': Text(), 461 'kFrequency': Integer(), 'kHangul': Text(), 'kHanyuPinlu': Text(), 462 'kJapaneseKun': Text(), 'kJapaneseOn': Text(), 'kKorean': Text(), 463 'kMandarin': Text(), 'kRSJapanese': Text(), 'kRSKanWa': Text(), 464 'kRSKangXi': Text(), 'kRSKorean': Text(), 465 'kSimplifiedVariant': Text(), 'kTotalStrokes': Integer(), 466 'kTraditionalVariant': Text(), 'kVietnamese': Text(), 467 'kZVariant': Text()} 468 unihanGenerator = None 469
    def __init__(self, dataPath, dbConnectInst, quiet=False):
        """
        Constructs the UnihanBuilder.

        @type dataPath: list of str
        @param dataPath: list of paths to the data file(s)
        @type dbConnectInst: instance
        @param dbConnectInst: instance of a L{DatabaseConnector}
        @type quiet: bool
        @param quiet: if true no status information will be printed to stderr
        """
        super(UnihanBuilder, self).__init__(dataPath, dbConnectInst, quiet)
        # the character itself is the table's only primary key
        self.PRIMARY_KEYS = [self.CHARACTER_COLUMN]
473
    def getUnihanGenerator(self):
        """
        Returns the L{UnihanGenerator}. Constructs it if needed.

        @rtype: instance
        @return: instance of a L{UnihanGenerator}
        """
        if not self.unihanGenerator:
            # lazily construct the generator from the first data file found
            path = self.findFile(['Unihan.txt', 'Unihan.zip'],
                "Unihan database file")
            self.unihanGenerator = UnihanGenerator(path)
            if not self.quiet:
                warn("reading file '" + path + "'")
        return self.unihanGenerator
488
489 - def getGenerator(self):
492
    def build(self):
        """
        Builds the Unihan table.

        The column list is derived at runtime from the keys found in the
        Unihan source file, preceded by the character column.
        """
        generator = self.getUnihanGenerator()
        self.COLUMNS = [self.CHARACTER_COLUMN]
        self.COLUMNS.extend(generator.keys())

        EntryGeneratorBuilder.build(self)
499
500 501 -class UnihanBMPBuilder(UnihanBuilder):
502 """ 503 Builds the Unihan database from the Unihan file provided by Unicode for 504 characters from the Basic Multilingual Plane (BMP) with code values between 505 U+0000 and U+FFFF. 506 507 MySQL < 6 doesn't support true UTF-8, and uses a Version with max 3 bytes: 508 U{http://dev.mysql.com/doc/refman/6.0/en/charset-unicode.html} 509 """
510 - class BMPEntryGenerator:
511
512 - def __init__(self, unihanGenerator):
513 """ 514 Initialises the EntryGenerator. 515 516 @type unihanGenerator: instance 517 @param unihanGenerator: a L{UnihanGenerator} instance 518 """ 519 gen = unihanGenerator.generator() 520 self.entryGen = UnihanBuilder.EntryGenerator(unihanGenerator)\ 521 .generator()
522
523 - def generator(self):
524 for entryDict in self.entryGen: 525 # skip characters outside the BMP, i.e. for Chinese characters 526 # >= 0x20000 527 char = entryDict[UnihanBuilder.CHARACTER_COLUMN] 528 if ord(char) < int('20000', 16): 529 yield entryDict
530
    def __init__(self, dataPath, dbConnectInst, quiet=False):
        """
        Constructs the UnihanBMPBuilder.

        @type dataPath: list of str
        @param dataPath: list of paths to the data file(s)
        @type dbConnectInst: instance
        @param dbConnectInst: instance of a L{DatabaseConnector}
        @type quiet: bool
        @param quiet: if true no status information will be printed to stderr
        """
        super(UnihanBMPBuilder, self).__init__(dataPath, dbConnectInst, quiet)
        # the character itself is the table's only primary key
        self.PRIMARY_KEYS = [self.CHARACTER_COLUMN]
534
535 - def getGenerator(self):
538
class SlimUnihanBuilder(UnihanBuilder):
    """
    Builds a slim version of the Unihan database.

    Keys imported into the database are specified in L{INCLUDE_KEYS}.
    """
    INCLUDE_KEYS = ['kCompatibilityVariant', 'kCantonese', 'kFrequency',
        'kHangul', 'kHanyuPinlu', 'kJapaneseKun', 'kJapaneseOn', 'kMandarin',
        'kRSJapanese', 'kRSKanWa', 'kRSKangXi', 'kRSKorean', 'kSemanticVariant',
        'kSimplifiedVariant', 'kSpecializedSemanticVariant', 'kTotalStrokes',
        'kTraditionalVariant', 'kVietnamese', 'kXHC1983', 'kZVariant',
        'kIICore', 'kGB0']
    """Keys for that data is read into the Unihan table in database."""

    def getUnihanGenerator(self):
        """
        Returns the L{UnihanGenerator} restricted to L{INCLUDE_KEYS}.
        Constructs it on first use.

        @rtype: instance
        @return: instance of a L{UnihanGenerator}
        """
        if not self.unihanGenerator:
            dataFile = self.findFile(['Unihan.txt', 'Unihan.zip'],
                "Unihan database file")
            if not self.quiet:
                warn("reading file '" + dataFile + "'")
            self.unihanGenerator = UnihanGenerator(dataFile,
                self.INCLUDE_KEYS)
        return self.unihanGenerator
562
class SlimUnihanBMPBuilder(SlimUnihanBuilder, UnihanBMPBuilder):
    """
    Combines L{SlimUnihanBuilder} and L{UnihanBMPBuilder}: builds the slim key
    set of the Unihan database restricted to characters from the Basic
    Multilingual Plane (BMP) with code values between U+0000 and U+FFFF.

    MySQL < 6 doesn't support true UTF-8, and uses a Version with max 3 bytes:
    U{http://dev.mysql.com/doc/refman/6.0/en/charset-unicode.html}

    Keys imported into the database are specified in L{INCLUDE_KEYS}.
    """
    # all work is done in SlimUnihanBuilder and UnihanBMPBuilder
    pass
577
class Kanjidic2Builder(EntryGeneratorBuilder):
    """
    Builds the Kanjidic database from the Kanjidic2 XML file
    U{http://www.csse.monash.edu.au/~jwb/kanjidic2/}.
    """
    class XMLHandler(xml.sax.ContentHandler):
        """Extracts a list of given tags."""
        def __init__(self, entryList, tagDict):
            # entryList: output list, filled with one dict per <character>
            # tagDict: mapping of (tag hierarchy below <character>,
            #   attribute set) to (column name, conversion function)
            self.entryList = entryList
            self.tagDict = tagDict

            self.currentElement = []
            self.targetTag = None
            self.targetTagTopElement = None

        def endElement(self, name):
            assert(len(self.currentElement) > 0)
            assert(self.currentElement[-1] == name)
            self.currentElement.pop()

            # leaving the element that opened the current target tag
            if name == self.targetTagTopElement:
                self.targetTag = None
                self.targetTagTopElement = None

            if name == 'character':
                # flush the collected entry, applying each tag's conversion
                # function to the gathered content list
                entryDict = {}
                for tag, func in self.tagDict.values():
                    if tag in self.currentEntry:
                        entryDict[tag] = func(self.currentEntry[tag])
                self.entryList.append(entryDict)

        def characters(self, content):
            # collect text content for the currently targeted tag
            if self.targetTag:
                if self.targetTag not in self.currentEntry:
                    self.currentEntry[self.targetTag] = []
                self.currentEntry[self.targetTag].append(content)

        def startElement(self, name, attrs):
            self.currentElement.append(name)
            if name == 'character':
                self.currentEntry = {}
            else:
                if 'character' in self.currentElement:
                    # build the key from the element hierarchy below
                    # 'character' plus the element's attributes
                    idx = self.currentElement.index('character') + 1
                    tagHierachy = tuple(self.currentElement[idx:])

                    key = (tagHierachy, frozenset(attrs.items()))
                    if key in self.tagDict:
                        self.targetTagTopElement = name
                        self.targetTag, _ = self.tagDict[key]

    class KanjidicGenerator:
        """Generates the KANJIDIC table."""
        def __init__(self, dataPath, tagDict):
            """
            Initialises the KanjidicGenerator.

            @type dataPath: list of str
            @param dataPath: optional list of paths to the data file(s)
            """
            self.dataPath = dataPath
            self.tagDict = tagDict

        def getHandle(self):
            """
            Returns a handle of the KANJIDIC database file.

            @rtype: file
            @return: file handle of the KANJIDIC file
            """
            import gzip
            # transparently support the gzipped and the plain XML file
            if self.dataPath.endswith('.gz'):
                import StringIO
                z = gzip.GzipFile(self.dataPath, 'r')
                handle = StringIO.StringIO(z.read())
            else:
                import codecs
                handle = codecs.open(self.dataPath, 'r')
            return handle

        def generator(self):
            """Provides the KANJIDIC entries, one dict per character."""
            entryList = []
            xmlHandler = Kanjidic2Builder.XMLHandler(entryList, self.tagDict)

            saxparser = xml.sax.make_parser()
            saxparser.setContentHandler(xmlHandler)
            ## don't check DTD as this raises an exception
            #saxparser.setFeature(xml.sax.handler.feature_external_ges, False)
            saxparser.parse(self.getHandle())

            for entry in entryList:
                yield(entry)

    PROVIDES = 'Kanjidic'
    CHARACTER_COLUMN = 'ChineseCharacter'
    """Name of column for Chinese character key."""
    COLUMN_TYPES = {CHARACTER_COLUMN: String(1), 'NelsonRadical': Integer(),
        'CharacterJapaneseOn': Text(), 'CharacterJapaneseKun': Text()}
    KANJIDIC_TAG_MAPPING = {
        (('literal', ), frozenset()): ('ChineseCharacter', lambda x: x[0]),
        (('radical', 'rad_value'),
            frozenset([('rad_type', 'nelson_c')])): ('NelsonCRadical',
                lambda x: int(x[0])),
        (('radical', 'rad_value'),
            frozenset([('rad_type', 'nelson_n')])): ('NelsonNRadical',
                lambda x: int(x[0])),
        # TODO On and Kun reading in KANJIDICT include further optional
        #   attributes that makes the method miss the entry:
        #   on_type and r_status, these are currently not implemented in the
        #   file though
        (('reading_meaning', 'rmgroup', 'reading'),
            frozenset([('r_type', 'ja_on')])): ('CharacterJapaneseOn',
                lambda x: ','.join(x)),
        (('reading_meaning', 'rmgroup', 'reading'),
            frozenset([('r_type', 'ja_kun')])): ('CharacterJapaneseKun',
                lambda x: ','.join(x)),
        #(('reading_meaning', 'rmgroup', 'reading'),
            #frozenset([('r_type', 'pinyin')])): ('Pinyin',
                #lambda x: ','.join(x)),
        (('misc', 'rad_name'), frozenset()): ('RadicalName',
            lambda x: ','.join(x)),
        (('reading_meaning', 'rmgroup', 'meaning'), frozenset()): ('Meaning_en',
            lambda x: '/'.join(x)),
        (('reading_meaning', 'rmgroup', 'meaning'),
            frozenset([('m_lang', 'fr')])): ('Meaning_fr',
                lambda x: '/'.join(x)),
        (('reading_meaning', 'rmgroup', 'meaning'),
            frozenset([('m_lang', 'es')])): ('Meaning_es',
                lambda x: '/'.join(x)),
        (('reading_meaning', 'rmgroup', 'meaning'),
            frozenset([('m_lang', 'pt')])): ('Meaning_pt',
                lambda x: '/'.join(x)),
        }
    """
    Dictionary of tag keys mapping to a table column including a function
    generating a string out of a list of entries given from the KANJIDIC entry.
    The tag keys consist of a tuple giving the xml element hierarchy below the
    'character' element and a set of attribute value pairs.
    """

    def __init__(self, dataPath, dbConnectInst, quiet=False):
        """
        Constructs the Kanjidic2Builder.

        @type dataPath: list of str
        @param dataPath: list of paths to the data file(s)
        @type dbConnectInst: instance
        @param dbConnectInst: instance of a L{DatabaseConnector}
        @type quiet: bool
        @param quiet: if true no status information will be printed to stderr
        """
        super(Kanjidic2Builder, self).__init__(dataPath, dbConnectInst, quiet)
        # column list is derived from the tag mapping's target columns
        tags = [tag for tag, _ in self.KANJIDIC_TAG_MAPPING.values()]
        self.COLUMNS = tags
        self.PRIMARY_KEYS = [self.CHARACTER_COLUMN]

    def getGenerator(self):
        """
        Returns the L{KanjidicGenerator}.

        @rtype: instance
        @return: instance of a L{KanjidicGenerator}
        """
        path = self.findFile(['kanjidic2.xml.gz', 'kanjidic2.xml'],
            "KANJIDIC2 XML file")
        if not self.quiet:
            warn("reading file '" + path + "'")
        return Kanjidic2Builder.KanjidicGenerator(path,
            self.KANJIDIC_TAG_MAPPING).generator()
739
class UnihanDerivedBuilder(EntryGeneratorBuilder):
    """
    Provides an abstract class for building a table with a relation between a
    Chinese character and another column using the Unihan database.
    """
    DEPENDS=['Unihan']
    COLUMN_SOURCE = None
    """
    Unihan table column providing content for the table. Needs to be overwritten
    in subclass.
    """
    COLUMN_TARGET = None
    """
    Column name for new data in created table. Needs to be overwritten in
    subclass.
    """
    COLUMN_TARGET_TYPE = Text()
    """
    Type of column for new data in created table.
    """
    GENERATOR_CLASS = None
    """
    Class defining the iterator for creating the table's data. The constructor
    needs to take two parameters for the list of entries from the Unihan
    database and the 'quiet' flag. Needs to be overwritten in subclass.
    """

    def __init__(self, dataPath, dbConnectInst, quiet=False):
        """
        Constructs the builder, deriving column names and types from the
        subclass' settings.

        @type dataPath: list of str
        @param dataPath: list of paths to the data file(s)
        @type dbConnectInst: instance
        @param dbConnectInst: instance of a L{DatabaseConnector}
        @type quiet: bool
        @param quiet: if true no status information will be printed to stderr
        """
        super(UnihanDerivedBuilder, self).__init__(dataPath, dbConnectInst,
            quiet)
        # the target table holds the character plus the derived column,
        # which together form the primary key
        self.COLUMNS = ['ChineseCharacter', self.COLUMN_TARGET]
        self.PRIMARY_KEYS = self.COLUMNS
        self.COLUMN_TYPES = {'ChineseCharacter': String(1),
            self.COLUMN_TARGET: self.COLUMN_TARGET_TYPE}

    def getGenerator(self):
        """Creates the generator from all non-empty source column entries."""
        unihanTable = self.db.tables['Unihan']
        sourceColumn = unihanTable.c[self.COLUMN_SOURCE]
        entries = self.db.selectRows(
            select([unihanTable.c.ChineseCharacter, sourceColumn],
                sourceColumn != None))
        return self.GENERATOR_CLASS(entries, self.quiet).generator()

    def build(self):
        """Builds the table, reporting the source column when not quiet."""
        if not self.quiet:
            warn("Reading table content from Unihan column '%s'"
                % self.COLUMN_SOURCE)
        super(UnihanDerivedBuilder, self).build()
791
class UnihanStrokeCountBuilder(UnihanDerivedBuilder):
    """
    Builds a mapping between characters and their stroke count using the Unihan
    data.
    """
    class StrokeCountExtractor:
        """Extracts the character stroke count mapping."""
        def __init__(self, entries, quiet=False):
            """
            Initialises the StrokeCountExtractor.

            @type entries: list of tuple
            @param entries: character entries from the Unihan database
            @type quiet: bool
            @param quiet: if true no status information will be printed
            """
            self.entries = entries
            self.quiet = quiet

        def generator(self):
            """Yields one (character, stroke count) pair per entry."""
            for char, count in self.entries:
                yield(char, count)

    PROVIDES = 'UnihanStrokeCount'
    COLUMN_SOURCE = 'kTotalStrokes'
    COLUMN_TARGET = 'StrokeCount'
    COLUMN_TARGET_TYPE = Integer()
    GENERATOR_CLASS = StrokeCountExtractor
822
823 824 -class CharacterRadicalBuilder(UnihanDerivedBuilder):
825 """ 826 Provides an abstract class for building a character radical mapping table 827 using the Unihan database. 828 """
829 - class RadicalExtractor:
830 """Generates the radical to character mapping from the Unihan table."""
831 - def __init__(self, rsEntries, quiet=False):
832 """ 833 Initialises the RadicalExtractor. 834 835 @type rsEntries: list of tuple 836 @param rsEntries: character radical entries from the Unihan database 837 @type quiet: bool 838 @param quiet: if true no status information will be printed 839 """ 840 self.RADICAL_REGEX = re.compile(ur"(\d+)\.(\d+)") 841 self.rsEntries = rsEntries 842 self.quiet = quiet
843
844 - def generator(self):
845 """Provides one entry per radical and character.""" 846 for character, radicalStroke in self.rsEntries: 847 matchObj = self.RADICAL_REGEX.match(radicalStroke) 848 if matchObj: 849 radical = matchObj.group(1) 850 yield(character, radical) 851 elif not self.quiet: 852 warn("unable to read radical information of character '" \ 853 + character + "': '" + radicalStroke + "'")
854 855 COLUMN_TARGET = 'RadicalIndex' 856 COLUMN_TARGET_TYPE = Integer() 857 GENERATOR_CLASS = RadicalExtractor
858
class CharacterKangxiRadicalBuilder(CharacterRadicalBuilder):
    """
    Builds the character Kangxi radical mapping table from the Unihan database.
    """
    # radical/stroke data taken from Unihan's kRSKangXi field
    PROVIDES = 'CharacterKangxiRadical'
    COLUMN_SOURCE = 'kRSKangXi'
866
class CharacterKanWaRadicalBuilder(CharacterRadicalBuilder):
    """
    Builds the character Dai Kan-Wa jiten radical mapping table from the Unihan
    database.
    """
    # radical/stroke data taken from Unihan's kRSKanWa field
    PROVIDES = 'CharacterKanWaRadical'
    COLUMN_SOURCE = 'kRSKanWa'
875
class CharacterJapaneseRadicalBuilder(CharacterRadicalBuilder):
    """
    Builds the character Japanese radical mapping table from the Unihan
    database.
    """
    # radical/stroke data taken from Unihan's kRSJapanese field
    PROVIDES = 'CharacterJapaneseRadical'
    COLUMN_SOURCE = 'kRSJapanese'
884
class CharacterKoreanRadicalBuilder(CharacterRadicalBuilder):
    """
    Builds the character Korean radical mapping table from the Unihan
    database.
    """
    # radical/stroke data taken from Unihan's kRSKorean field
    PROVIDES = 'CharacterKoreanRadical'
    COLUMN_SOURCE = 'kRSKorean'
893
894 895 -class CharacterVariantBuilder(EntryGeneratorBuilder):
896 """ 897 Builds a character variant mapping table from the Unihan database. 898 """
899 - class VariantGenerator:
900 """Generates the character to variant mapping from the Unihan table.""" 901 902 # Regular expressions for different entry types 903 HEX_INDEX_REGEX = re.compile(ur"\s*U\+([0-9A-F]+)\s*$") 904 MULT_HEX_INDEX_REGEX = re.compile(ur"\s*(U\+([0-9A-F]+)( |(?=$)))+\s*$") 905 MULT_HEX_INDEX_FIND_REGEX = re.compile(ur"U\+([0-9A-F]+)(?: |(?=$))") 906 SEMANTIC_REGEX = re.compile(ur"(U\+[0-9A-F]+(<\S+)?( |(?=$)))+$") 907 SEMANTIC_FIND_REGEX = re.compile(ur"U\+([0-9A-F]+)(?:<\S+)?(?: |(?=$))") 908 ZVARIANT_REGEX = re.compile(ur"\s*U\+([0-9A-F]+)(?:\:\S+)?\s*$") 909 910 VARIANT_REGEX_MAPPING = {'C': (HEX_INDEX_REGEX, HEX_INDEX_REGEX), 911 'M': (SEMANTIC_REGEX, SEMANTIC_FIND_REGEX), 912 'S': (MULT_HEX_INDEX_REGEX, MULT_HEX_INDEX_FIND_REGEX), 913 'P': (SEMANTIC_REGEX, SEMANTIC_FIND_REGEX), 914 'T': (MULT_HEX_INDEX_REGEX, MULT_HEX_INDEX_FIND_REGEX), 915 'Z': (ZVARIANT_REGEX, ZVARIANT_REGEX)} 916 """ 917 Mapping of entry types to regular expression describing the entry's 918 pattern. 919 """ 920
921 - def __init__(self, variantEntries, typeList, quiet=False):
922 """ 923 Initialises the VariantGenerator. 924 925 @type variantEntries: list of tuple 926 @param variantEntries: character variant entries from the Unihan 927 database 928 @type typeList: list of str 929 @param typeList: variant types in the order given in tableEntries 930 @type quiet: bool 931 @param quiet: if true no status information will be printed 932 """ 933 self.variantEntries = variantEntries 934 self.typeList = typeList 935 self.quiet = quiet
936
937 - def generator(self):
938 """Provides one entry per variant and character.""" 939 for entries in self.variantEntries: 940 character = entries[0] 941 for i, variantType in enumerate(self.typeList): 942 variantInfo = entries[i+1] 943 if variantInfo: 944 # get proper regular expression for given variant info 945 matchR, findR = self.VARIANT_REGEX_MAPPING[variantType] 946 if matchR.match(variantInfo): 947 # get all hex indices 948 variantIndices = findR.findall(variantInfo) 949 for unicodeHexIndex in variantIndices: 950 try: 951 variant = unichr(int(unicodeHexIndex, 16)) 952 yield(character, variant, variantType) 953 except ValueError: 954 # catch for Unicode characters outside BMP 955 # for narrow builds 956 pass 957 elif not self.quiet: 958 # didn't match the regex 959 warn('unable to read variant information of ' \ 960 + "character '" + character + "' for type '" \ 961 + variantType + "': '" + variantInfo + "'")
    PROVIDES = 'CharacterVariant'
    DEPENDS=['Unihan']

    # Unihan source columns mapped to the one-letter type code stored in the
    # target table's 'Type' column (keys of VariantGenerator's regex mapping)
    COLUMN_SOURCE_ABBREV = {'kCompatibilityVariant': 'C',
        'kSemanticVariant': 'M', 'kSimplifiedVariant': 'S',
        'kSpecializedSemanticVariant': 'P', 'kTraditionalVariant': 'T',
        'kZVariant': 'Z'}
    """
    Unihan table columns providing content for the table together with their
    abbreviation used in the target table.
    """
    COLUMN_TYPES = {'ChineseCharacter': String(1), 'Variant': String(1),
        'Type': String(1)}
    def __init__(self, dataPath, dbConnectInst, quiet=False):
        """Initialises the builder and declares the target table layout."""
        super(CharacterVariantBuilder, self).__init__(dataPath, dbConnectInst,
            quiet)
        # create name mappings; every column is part of the primary key
        self.COLUMNS = ['ChineseCharacter', 'Variant', 'Type']
        self.PRIMARY_KEYS = self.COLUMNS
983
984 - def getGenerator(self):
985 # create generator 986 keys = self.COLUMN_SOURCE_ABBREV.keys() 987 variantTypes = [self.COLUMN_SOURCE_ABBREV[key] for key in keys] 988 selectKeys = ['ChineseCharacter'] 989 selectKeys.extend(keys) 990 991 table = self.db.tables['Unihan'] 992 tableEntries = self.db.selectRows( 993 select([table.c[column] for column in selectKeys])) 994 return CharacterVariantBuilder.VariantGenerator(tableEntries, 995 variantTypes, self.quiet).generator()
996
997 - def build(self):
998 if not self.quiet: 999 warn("Reading table content from Unihan columns '" \ 1000 + ', '.join(self.COLUMN_SOURCE_ABBREV.keys()) + "'") 1001 super(CharacterVariantBuilder, self).build()
1002
class CharacterVariantBMPBuilder(CharacterVariantBuilder):
    """
    Builds a character variant mapping table from the Unihan database for
    characters from the Basic Multilingual Plane (BMP) with code values between
    U+0000 and U+FFFF.

    MySQL < 6 doesn't support true UTF-8, and uses a Version with max 3 bytes:
    U{http://dev.mysql.com/doc/refman/6.0/en/charset-unicode.html}
    """
    class BMPVariantGenerator:
        """Filters the plain variant generator for the given cutoff."""

        # First code point of the cutoff plane.  Was int('20000', 16),
        # re-parsed for every yielded variant; hoisted to a constant.
        # NOTE(review): this admits plane 1 (U+10000-U+1FFFF) although the
        # class docstring says BMP only — Han characters only occur below
        # U+10000 or at U+20000 and above, so the result is the same; verify.
        VARIANT_CUTOFF = 0x20000

        def __init__(self, variantEntries, typeList, quiet=False):
            """
            Initialises the BMPVariantGenerator.

            @type variantEntries: list of tuple
            @param variantEntries: character variant entries from the Unihan
                database
            @type typeList: list of str
            @param typeList: variant types in the order given in tableEntries
            @type quiet: bool
            @param quiet: if true no status information will be printed
            """
            # delegate entry generation to the plain variant generator and
            # only filter its output
            self.variantGen = CharacterVariantBuilder.VariantGenerator( \
                variantEntries, typeList, quiet).generator()

        def generator(self):
            """Yield variant entries, skipping variants beyond the cutoff."""
            for character, variant, variantType in self.variantGen:
                # skip characters outside the BMP, i.e. for Chinese characters
                # >= 0x20000
                if ord(variant) < self.VARIANT_CUTOFF:
                    yield(character, variant, variantType)

    def __init__(self, dataPath, dbConnectInst, quiet=False):
        super(CharacterVariantBMPBuilder, self).__init__(dataPath,
            dbConnectInst, quiet)

    def getGenerator(self):
        """Return a generator yielding only variants inside the cutoff."""
        keys = self.COLUMN_SOURCE_ABBREV.keys()
        variantTypes = [self.COLUMN_SOURCE_ABBREV[key] for key in keys]
        selectKeys = ['ChineseCharacter']
        selectKeys.extend(keys)

        table = self.db.tables['Unihan']
        tableEntries = self.db.selectRows(
            select([table.c[column] for column in selectKeys]))
        return CharacterVariantBMPBuilder.BMPVariantGenerator(tableEntries,
            variantTypes, self.quiet).generator()
1053
class UnihanCharacterSetBuilder(EntryGeneratorBuilder):
    """
    Builds a simple list of characters that belong to a specific class using the
    Unihan data.

    Subclasses set C{COLUMN_SOURCE} to the Unihan column whose non-NULL
    entries define the character set.
    """
    DEPENDS=['Unihan']

    def __init__(self, dataPath, dbConnectInst, quiet=False):
        """Initialises the builder; the table has a single key column."""
        super(UnihanCharacterSetBuilder, self).__init__(dataPath, dbConnectInst,
            quiet)
        self.COLUMNS = ['ChineseCharacter']
        self.PRIMARY_KEYS = self.COLUMNS
        self.COLUMN_TYPES = {'ChineseCharacter': String(1)}

    def getGenerator(self):
        """Return a generator over characters with a non-NULL source column."""
        unihanTable = self.db.tables['Unihan']
        # read rows instead of scalars so the generator receives tuples
        rows = self.db.selectRows(
            select([unihanTable.c.ChineseCharacter],
                unihanTable.c[self.COLUMN_SOURCE] != None))
        return ListGenerator(rows).generator()

    def build(self):
        """Build the character set table, reporting the source column."""
        if not self.quiet:
            warn("Reading table content from Unihan column '"
                + self.COLUMN_SOURCE + "'")
        super(UnihanCharacterSetBuilder, self).build()
1085
class IICoreSetBuilder(UnihanCharacterSetBuilder):
    u"""
    Builds a simple list of all characters in X{IICore}
    (Unicode I{International Ideograph Core}).
    @see: Chinese Wikipedia on IICore:
        U{http://zh.wikipedia.org/wiki/國際表意文字核心}
    """
    PROVIDES = 'IICoreSet'
    # set defined by non-NULL entries in this Unihan column
    COLUMN_SOURCE = 'kIICore'
1096
class GB2312SetBuilder(UnihanCharacterSetBuilder):
    """
    Builds a simple list of all characters in the Chinese standard X{GB2312-80}.
    """
    PROVIDES = 'GB2312Set'
    # set defined by non-NULL entries in this Unihan column
    COLUMN_SOURCE = 'kGB0'
1104
#}
#{ Unihan reading information

class CharacterReadingBuilder(UnihanDerivedBuilder):
    """
    Provides an abstract class for building a character reading mapping table
    using the Unihan database.

    Subclasses set C{COLUMN_SOURCE} to the Unihan column holding the reading
    data; the entries are split by the configured C{GENERATOR_CLASS}.
    """
        """Generates the reading entities from the Unihan table."""
        # every whitespace-separated token counts as one reading
        SPLIT_REGEX = re.compile(r"(\S+)")
        def __init__(self, readingEntries, quiet=False):
            """
            Initialises the ReadingSplitter.

            @type readingEntries: list of tuple
            @param readingEntries: character reading entries from the Unihan
                database
            @type quiet: bool
            @param quiet: if true no status information will be printed
            """
            # (character, readings) pairs as selected from Unihan
            self.readingEntries = readingEntries
            self.quiet = quiet
1129
1130 - def generator(self):
1131 """Provides one entry per reading entity and character.""" 1132 for character, readings in self.readingEntries: 1133 readingList = self.SPLIT_REGEX.findall(readings) 1134 if not self.quiet and len(set(readingList)) < len(readingList): 1135 warn('reading information of character ' + character \ 1136 + ' is inconsistent: ' + ', '.join(readingList)) 1137 for reading in set(readingList): 1138 yield(character, reading.lower())
    # name and type of the reading column in the generated table
    COLUMN_TARGET = 'Reading'
    COLUMN_TARGET_TYPE = Text()
    # splitter turning one Unihan field into individual reading entries
    GENERATOR_CLASS = SimpleReadingSplitter
    DEPENDS=['Unihan']
1144
class CharacterUnihanPinyinBuilder(CharacterReadingBuilder):
    """
    Builds the character Pinyin mapping table from the Unihan database.
    """
    PROVIDES = 'CharacterUnihanPinyin'
    # Unihan column holding the readings
    COLUMN_SOURCE = 'kMandarin'
1152
class CharacterJyutpingBuilder(CharacterReadingBuilder):
    """Builds the character Jyutping mapping table from the Unihan database."""
    PROVIDES = 'CharacterJyutping'
    # Unihan column holding the readings
    COLUMN_SOURCE = 'kCantonese'
1158
class CharacterJapaneseKunBuilder(CharacterReadingBuilder):
    """Builds the character Kun'yomi mapping table from the Unihan database."""
    PROVIDES = 'CharacterJapaneseKun'
    # Unihan column holding the readings
    COLUMN_SOURCE = 'kJapaneseKun'
1164
class CharacterJapaneseOnBuilder(CharacterReadingBuilder):
    """Builds the character On'yomi mapping table from the Unihan database."""
    PROVIDES = 'CharacterJapaneseOn'
    # Unihan column holding the readings
    COLUMN_SOURCE = 'kJapaneseOn'
1170
class CharacterHangulBuilder(CharacterReadingBuilder):
    """Builds the character Hangul mapping table from the Unihan database."""
    PROVIDES = 'CharacterHangul'
    # Unihan column holding the readings
    COLUMN_SOURCE = 'kHangul'
1176
class CharacterVietnameseBuilder(CharacterReadingBuilder):
    """
    Builds the character Vietnamese mapping table from the Unihan database.
    """
    PROVIDES = 'CharacterVietnamese'
    # Unihan column holding the readings
    COLUMN_SOURCE = 'kVietnamese'
1184
class CharacterXHPCReadingBuilder(CharacterReadingBuilder):
    """
    Builds the Xiandai Hanyu Pinlu Cidian Pinyin mapping table using the Unihan
    database.
    """
    class XHPCReadingSplitter(CharacterReadingBuilder.SimpleReadingSplitter):
        """
        Generates the Xiandai Hanyu Pinlu Cidian Pinyin syllables from the
        Unihan table.
        """
        # keep only the toned syllable, dropping the parenthesised number
        # (presumably a frequency count — verify against kHanyuPinlu data)
        SPLIT_REGEX = re.compile(ur"([a-zü]+[1-5])\([0-9]+\)")

    GENERATOR_CLASS = XHPCReadingSplitter

    PROVIDES = 'CharacterXHPCPinyin'
    COLUMN_SOURCE = 'kHanyuPinlu'
1202
class CharacterXHCReadingBuilder(CharacterReadingBuilder):
    """
    Builds the Xiandai Hanyu Cidian Pinyin mapping table using the Unihan
    database.
    """
    class XHCReadingSplitter(CharacterReadingBuilder.SimpleReadingSplitter):
        """
        Generates the Xiandai Hanyu Cidian Pinyin syllables from the Unihan
        table.
        """
        # capture the reading after the "position:" prefix of kXHC1983 entries
        SPLIT_REGEX = re.compile(r"[0-9,.*]+:(\S+)")

        # NOTE(review): TONEMARK_VOWELS is not referenced in the visible
        # code — presumably kept for reference or external use; verify
        TONEMARK_VOWELS = [u'a', u'e', u'i', u'o', u'u', u'ü', u'n', u'm', u'r',
            u'ê']

        # combining diacritics (NFD form) mapped to Pinyin tone numbers
        TONEMARK_MAP = {u'\u0304': 1, u'\u0301': 2, u'\u030c': 3, u'\u0300': 4}
        def __init__(self, readingEntries, quiet=False):
            """
            Initialises the XHCReadingSplitter.

            @type readingEntries: list of tuple
            @param readingEntries: character reading entries from the Unihan
                database
            @type quiet: bool
            @param quiet: if true no status information will be printed
            """
            CharacterReadingBuilder.SimpleReadingSplitter.__init__(self,
                readingEntries, quiet)
            # character class matching any of the known combining tone marks
            self._toneMarkRegex = re.compile(u'[' \
                + ''.join(self.TONEMARK_MAP.keys()) + ']')
1235
1236 - def convertTonemark(self, entity):
1237 """ 1238 Converts the entity with diacritics into an entity with tone mark 1239 as appended number. 1240 1241 @type entity: str 1242 @param entity: entity with tonal information 1243 @rtype: tuple 1244 @return: plain entity without tone mark and entity's tone index 1245 (starting with 1) 1246 """ 1247 import unicodedata 1248 # get decomposed Unicode string, e.g. C{'ū'} to C{'u\u0304'} 1249 entity = unicodedata.normalize("NFD", unicode(entity)) 1250 # find character with tone marker 1251 matchObj = self._toneMarkRegex.search(entity) 1252 if matchObj: 1253 diacriticalMark = matchObj.group(0) 1254 tone = self.TONEMARK_MAP[diacriticalMark] 1255 # strip off diacritical mark 1256 plainEntity = entity.replace(diacriticalMark, '') 1257 # compose Unicode string (used for ê) and return with tone 1258 return unicodedata.normalize("NFC", plainEntity) + str(tone) 1259 else: 1260 # fifth tone doesn't have any marker 1261 return unicodedata.normalize("NFC", entity) + '5'
1262
1263 - def generator(self):
1264 """Provides one entry per reading entity and character.""" 1265 for character, readings in self.readingEntries: 1266 readingList = self.SPLIT_REGEX.findall(readings) 1267 if not self.quiet and len(set(readingList)) < len(readingList): 1268 warn('reading information of character ' + character \ 1269 + ' is inconsistent: ' + ', '.join(readingList)) 1270 for reading in set(readingList): 1271 yield(character, self.convertTonemark(reading.lower()))
    # use the tone-mark aware splitter instead of the default one
    GENERATOR_CLASS = XHCReadingSplitter

    PROVIDES = 'CharacterXHCPinyin'
    COLUMN_SOURCE = 'kXHC1983'
1277
class CharacterPinyinBuilder(EntryGeneratorBuilder):
    """
    Builds the character Pinyin mapping table from the several sources.

    The result is the union of the tables listed in C{DEPENDS}.
    """
    PROVIDES = 'CharacterPinyin'
    DEPENDS=['CharacterUnihanPinyin', 'CharacterXHPCPinyin',
        'CharacterXHCPinyin']

    def __init__(self, dataPath, dbConnectInst, quiet=False):
        """Initialises the builder and declares the target table layout."""
        super(CharacterPinyinBuilder, self).__init__(dataPath, dbConnectInst,
            quiet)
        # character and reading together form the key
        self.COLUMNS = ['ChineseCharacter', 'Reading']
        self.PRIMARY_KEYS = self.COLUMNS
        self.COLUMN_TYPES = {'ChineseCharacter': String(1),
            'Reading': String(255)}

    def getGenerator(self):
        """Return a generator over the union of all source tables."""
        queries = []
        for tableName in self.DEPENDS:
            sourceTable = self.db.tables[tableName]
            queries.append(
                select([sourceTable.c[column] for column in self.COLUMNS]))

        # UNION removes duplicates shared between the sources
        rows = self.db.selectRows(union(*queries))
        return ListGenerator(rows).generator()
1307
#}
#{ CSV file based

class CSVFileLoader(TableBuilder):
    """
    Builds a table by loading its data from a list of comma separated values
    (CSV).

    Subclasses set the two file mapping constants; the table is created from
    the SQL definition file and filled from the CSV file.
    """
    TABLE_CSV_FILE_MAPPING = ''
    """csv file path"""
    TABLE_DECLARATION_FILE_MAPPING = ''
    """file path containing SQL create table code."""
    INDEX_KEYS = []
    """Index keys (not unique) of the created table"""
    class DefaultDialect(csv.Dialect):
        """Defines a default dialect for the case sniffing fails."""
        # NOTE(review): doublequote, skipinitialspace and escapechar are left
        # at the csv.Dialect defaults — confirm instantiation validates on
        # all supported Python versions
        quoting = csv.QUOTE_NONE
        delimiter = ','
        lineterminator = '\n'
        quotechar = "'"
1329
    def __init__(self, dataPath, dbConnectInst, quiet=False):
        """Initialises the loader via the TableBuilder constructor."""
        super(CSVFileLoader, self).__init__(dataPath, dbConnectInst, quiet)
    # TODO unicode_csv_reader(), utf_8_encoder(), byte_string_dialect() used
    #   to work around missing Unicode support in csv module
    @staticmethod
    def unicode_csv_reader(unicode_csv_data, dialect, **kwargs):
        """Wrap csv.reader so it consumes and produces unicode rows."""
        # csv.py doesn't do Unicode; encode temporarily as UTF-8:
        csv_reader = csv.reader(CSVFileLoader.utf_8_encoder(unicode_csv_data),
            dialect=CSVFileLoader.byte_string_dialect(dialect), **kwargs)
        for row in csv_reader:
            # decode UTF-8 back to Unicode, cell by cell:
            yield [unicode(cell, 'utf-8') for cell in row]
1343 1344 @staticmethod
1345 - def utf_8_encoder(unicode_csv_data):
1346 for line in unicode_csv_data: 1347 yield line.encode('utf-8')
    @staticmethod
    def byte_string_dialect(dialect):
        """Return a copy of the dialect with all markers as byte strings."""
        class ByteStringDialect(csv.Dialect):
            def __init__(self, dialect):
                # the csv module expects byte strings, not unicode, for
                # delimiter/quote/terminator characters
                self.delimiter = str(dialect.delimiter)
                if dialect.escapechar:
                    self.escapechar = str(dialect.escapechar)
                self.lineterminator = str(dialect.lineterminator)
                self.quotechar = str(dialect.quotechar)
                self.quoting = dialect.quoting

        return ByteStringDialect(dialect)
    def getCSVReader(self, fileHandle):
        """
        Returns a csv reader object for a given file name.

        The file can start with the character '#' to mark comments. These will
        be ignored. The first line after the leading comments will be used to
        guess the csv file's format.

        @type fileHandle: file
        @param fileHandle: file handle of the CSV file
        @rtype: instance
        @return: CSV reader object returning one entry per line
        """
        def prependLineGenerator(line, data):
            """
            The first line read for guessing the format has to be reinserted.
            """
            yield line
            for nextLine in data:
                yield nextLine

        line = '#'
        try:
            while line.strip().startswith('#'):
                line = fileHandle.next()
        except StopIteration:
            # file consists only of comments; fall back to a plain reader
            return csv.reader(fileHandle)
        try:
            self.fileDialect = csv.Sniffer().sniff(line, ['\t', ','])
        except csv.Error:
            # sniffing failed; assume the default dialect
            self.fileDialect = CSVFileLoader.DefaultDialect()

        content = prependLineGenerator(line, fileHandle)
        #return csv.reader(content, dialect=self.fileDialect) # TODO
        return CSVFileLoader.unicode_csv_reader(content, self.fileDialect)
    def build(self):
        """Create the table from its SQL definition and load the CSV data."""
        import locale
        # NOTE(review): the locale import appears unused here — verify
        import codecs

        definitionFile = self.findFile([self.TABLE_DECLARATION_FILE_MAPPING],
            "SQL table definition file")
        contentFile = self.findFile([self.TABLE_CSV_FILE_MAPPING], "table")

        # get create statement
        if not self.quiet:
            warn("Reading table definition from file '" + definitionFile + "'")

        fileHandle = codecs.open(definitionFile, 'r', 'utf-8')
        createStatement = ''.join(fileHandle.readlines()).strip("\n")
        # get create statement
        self.db.execute(text(createStatement))
        table = Table(self.PROVIDES, self.db.metadata, autoload=True)

        # write table content
        if not self.quiet:
            warn("Reading table '" + self.PROVIDES + "' from file '" \
                + contentFile + "'")
        fileHandle = codecs.open(contentFile, 'r', 'utf-8')

        entries = []
        for line in self.getCSVReader(fileHandle):
            # skip lines that are effectively empty
            if len(line) == 1 and not line[0].strip():
                continue
            # map cells positionally onto the table's columns
            entryDict = dict([(column.name, line[i]) \
                for i, column in enumerate(table.columns)])
            entries.append(entryDict)

        try:
            self.db.execute(table.insert(), entries)
        except sqlalchemy.exceptions.IntegrityError, e:
            warn(unicode(e))
            #warn(unicode(insertStatement))
            raise

        # get create index statement
        for index in self.buildIndexObjects(self.PROVIDES, self.INDEX_KEYS):
            index.create()
1440
1441 - def remove(self):
1442 # get drop table statement 1443 table = Table(self.PROVIDES, self.db.metadata) 1444 table.drop()
1445
class PinyinSyllablesBuilder(CSVFileLoader):
    """
    Builds a list of Pinyin syllables.
    """
    PROVIDES = 'PinyinSyllables'

    # data and schema files resolved via findFile() in CSVFileLoader.build()
    TABLE_CSV_FILE_MAPPING = 'pinyinsyllables.csv'
    TABLE_DECLARATION_FILE_MAPPING = 'pinyinsyllables.sql'
1455
class PinyinInitialFinalBuilder(CSVFileLoader):
    """
    Builds a mapping from Pinyin syllables to their initial/final parts.
    """
    PROVIDES = 'PinyinInitialFinal'

    # data and schema files resolved via findFile() in CSVFileLoader.build()
    TABLE_CSV_FILE_MAPPING = 'pinyininitialfinal.csv'
    TABLE_DECLARATION_FILE_MAPPING = 'pinyininitialfinal.sql'
1465
class WadeGilesSyllablesBuilder(CSVFileLoader):
    """
    Builds a list of Wade-Giles syllables.
    """
    PROVIDES = 'WadeGilesSyllables'

    # data and schema files resolved via findFile() in CSVFileLoader.build()
    TABLE_CSV_FILE_MAPPING = 'wadegilessyllables.csv'
    TABLE_DECLARATION_FILE_MAPPING = 'wadegilessyllables.sql'
1475
class GRSyllablesBuilder(CSVFileLoader):
    """
    Builds a list of Gwoyeu Romatzyh syllables.
    """
    PROVIDES = 'GRSyllables'

    # data and schema files resolved via findFile() in CSVFileLoader.build()
    TABLE_CSV_FILE_MAPPING = 'grsyllables.csv'
    TABLE_DECLARATION_FILE_MAPPING = 'grsyllables.sql'
1485
class GRRhotacisedFinalsBuilder(CSVFileLoader):
    """
    Builds a list of Gwoyeu Romatzyh rhotacised finals.
    """
    PROVIDES = 'GRRhotacisedFinals'

    # data and schema files resolved via findFile() in CSVFileLoader.build()
    TABLE_CSV_FILE_MAPPING = 'grrhotacisedfinals.csv'
    TABLE_DECLARATION_FILE_MAPPING = 'grrhotacisedfinals.sql'
1495
class GRAbbreviationBuilder(CSVFileLoader):
    """
    Builds a list of Gwoyeu Romatzyh abbreviated spellings.
    """
    PROVIDES = 'GRAbbreviation'

    # data and schema files resolved via findFile() in CSVFileLoader.build()
    TABLE_CSV_FILE_MAPPING = 'grabbreviation.csv'
    TABLE_DECLARATION_FILE_MAPPING = 'grabbreviation.sql'
1505
class JyutpingSyllablesBuilder(CSVFileLoader):
    """
    Builds a list of Jyutping syllables.
    """
    PROVIDES = 'JyutpingSyllables'

    # data and schema files resolved via findFile() in CSVFileLoader.build()
    TABLE_CSV_FILE_MAPPING = 'jyutpingsyllables.csv'
    TABLE_DECLARATION_FILE_MAPPING = 'jyutpingsyllables.sql'
1515
class JyutpingInitialFinalBuilder(CSVFileLoader):
    """
    Builds a mapping from Jyutping syllables to their initial/final parts.
    """
    PROVIDES = 'JyutpingInitialFinal'

    # data and schema files resolved via findFile() in CSVFileLoader.build()
    TABLE_CSV_FILE_MAPPING = 'jyutpinginitialfinal.csv'
    TABLE_DECLARATION_FILE_MAPPING = 'jyutpinginitialfinal.sql'
1525
class CantoneseYaleSyllablesBuilder(CSVFileLoader):
    """
    Builds a list of Cantonese Yale syllables.
    """
    PROVIDES = 'CantoneseYaleSyllables'

    # data and schema files resolved via findFile() in CSVFileLoader.build()
    TABLE_CSV_FILE_MAPPING = 'cantoneseyalesyllables.csv'
    TABLE_DECLARATION_FILE_MAPPING = 'cantoneseyalesyllables.sql'
1535
class CantoneseYaleInitialNucleusCodaBuilder(CSVFileLoader):
    """
    Builds a mapping of Cantonese syllable in the Yale romanisation
    system to the syllables' initial, nucleus and coda.
    """
    PROVIDES = 'CantoneseYaleInitialNucleusCoda'

    # data and schema files resolved via findFile() in CSVFileLoader.build()
    TABLE_CSV_FILE_MAPPING = 'cantoneseyaleinitialnucleuscoda.csv'
    TABLE_DECLARATION_FILE_MAPPING = 'cantoneseyaleinitialnucleuscoda.sql'
1546
class JyutpingYaleMappingBuilder(CSVFileLoader):
    """
    Builds a mapping between syllables in Jyutping and the Yale romanization
    system.
    """
    PROVIDES = 'JyutpingYaleMapping'

    # data and schema files resolved via findFile() in CSVFileLoader.build()
    TABLE_CSV_FILE_MAPPING = 'jyutpingyalemapping.csv'
    TABLE_DECLARATION_FILE_MAPPING = 'jyutpingyalemapping.sql'
1557
class WadeGilesPinyinMappingBuilder(CSVFileLoader):
    """
    Builds a mapping between syllables in Wade-Giles and Pinyin.
    """
    PROVIDES = 'WadeGilesPinyinMapping'

    # data and schema files resolved via findFile() in CSVFileLoader.build()
    TABLE_CSV_FILE_MAPPING = 'wadegilespinyinmapping.csv'
    TABLE_DECLARATION_FILE_MAPPING = 'wadegilespinyinmapping.sql'
1567
class PinyinGRMappingBuilder(CSVFileLoader):
    """
    Builds a mapping between syllables in Pinyin and Gwoyeu Romatzyh.
    """
    PROVIDES = 'PinyinGRMapping'

    # data and schema files resolved via findFile() in CSVFileLoader.build()
    TABLE_CSV_FILE_MAPPING = 'pinyingrmapping.csv'
    TABLE_DECLARATION_FILE_MAPPING = 'pinyingrmapping.sql'
1577
class PinyinIPAMappingBuilder(CSVFileLoader):
    """
    Builds a mapping between syllables in Pinyin and their representation in
    IPA.
    """
    PROVIDES = 'PinyinIPAMapping'

    # data and schema files resolved via findFile() in CSVFileLoader.build()
    TABLE_CSV_FILE_MAPPING = 'pinyinipamapping.csv'
    TABLE_DECLARATION_FILE_MAPPING = 'pinyinipamapping.sql'
1588
class MandarinIPAInitialFinalBuilder(CSVFileLoader):
    """
    Builds a mapping from Mandarin syllables in IPA to their initial/final
    parts.
    """
    PROVIDES = 'MandarinIPAInitialFinal'

    # data and schema files resolved via findFile() in CSVFileLoader.build()
    TABLE_CSV_FILE_MAPPING = 'mandarinipainitialfinal.csv'
    TABLE_DECLARATION_FILE_MAPPING = 'mandarinipainitialfinal.sql'
1599
class JyutpingIPAMappingBuilder(CSVFileLoader):
    """
    Builds a mapping between syllables in Jyutping and their representation in
    IPA.
    """
    PROVIDES = 'JyutpingIPAMapping'

    # data and schema files resolved via findFile() in CSVFileLoader.build()
    TABLE_CSV_FILE_MAPPING = 'jyutpingipamapping.csv'
    TABLE_DECLARATION_FILE_MAPPING = 'jyutpingipamapping.sql'
1610
class CantoneseIPAInitialFinalBuilder(CSVFileLoader):
    """
    Builds a mapping from Cantonese syllables in IPA to their initial/final
    parts.
    """
    PROVIDES = 'CantoneseIPAInitialFinal'

    # data and schema files resolved via findFile() in CSVFileLoader.build()
    TABLE_CSV_FILE_MAPPING = 'cantoneseipainitialfinal.csv'
    TABLE_DECLARATION_FILE_MAPPING = 'cantoneseipainitialfinal.sql'
1621
class KangxiRadicalBuilder(CSVFileLoader):
    """
    Builds a mapping between Kangxi radical index and radical characters.
    """
    PROVIDES = 'KangxiRadical'

    # data and schema files resolved via findFile() in CSVFileLoader.build()
    TABLE_CSV_FILE_MAPPING = 'kangxiradical.csv'
    TABLE_DECLARATION_FILE_MAPPING = 'kangxiradical.sql'
1631
class KangxiRadicalIsolatedCharacterBuilder(CSVFileLoader):
    """
    Builds a mapping between Kangxi radical index and radical equivalent
    characters without radical form.
    """
    PROVIDES = 'KangxiRadicalIsolatedCharacter'

    # data and schema files resolved via findFile() in CSVFileLoader.build()
    TABLE_CSV_FILE_MAPPING = 'kangxiradicalisolatedcharacter.csv'
    TABLE_DECLARATION_FILE_MAPPING = 'kangxiradicalisolatedcharacter.sql'
1642
class RadicalEquivalentCharacterBuilder(CSVFileLoader):
    """
    Builds a mapping between I{Unicode radical forms} and
    I{Unicode radical variants} on one side and I{equivalent characters} on the
    other side.
    """
    PROVIDES = 'RadicalEquivalentCharacter'

    # data and schema files resolved via findFile() in CSVFileLoader.build()
    TABLE_CSV_FILE_MAPPING = 'radicalequivalentcharacter.csv'
    TABLE_DECLARATION_FILE_MAPPING = 'radicalequivalentcharacter.sql'
1654
class StrokesBuilder(CSVFileLoader):
    """
    Builds a list of strokes and their names.
    """
    PROVIDES = 'Strokes'

    # data and schema files resolved via findFile() in CSVFileLoader.build()
    TABLE_CSV_FILE_MAPPING = 'strokes.csv'
    TABLE_DECLARATION_FILE_MAPPING = 'strokes.sql'
1664
class StrokeOrderBuilder(CSVFileLoader):
    """
    Builds a mapping between characters and their stroke order.
    """
    PROVIDES = 'StrokeOrder'

    # data and schema files resolved via findFile() in CSVFileLoader.build()
    TABLE_CSV_FILE_MAPPING = 'strokeorder.csv'
    TABLE_DECLARATION_FILE_MAPPING = 'strokeorder.sql'
1674
class CharacterDecompositionBuilder(CSVFileLoader):
    """
    Builds a mapping between characters and their decomposition.
    """
    PROVIDES = 'CharacterDecomposition'

    # data and schema files resolved via findFile() in CSVFileLoader.build()
    TABLE_CSV_FILE_MAPPING = 'characterdecomposition.csv'
    TABLE_DECLARATION_FILE_MAPPING = 'characterdecomposition.sql'
    # non-unique lookup index on character and Z-variant
    INDEX_KEYS = [['ChineseCharacter', 'ZVariant']]
1685
class LocaleCharacterVariantBuilder(CSVFileLoader):
    """
    Builds a mapping between a character under a locale and its default variant.
    """
    PROVIDES = 'LocaleCharacterVariant'

    # data and schema files resolved via findFile() in CSVFileLoader.build()
    TABLE_CSV_FILE_MAPPING = 'localecharactervariant.csv'
    TABLE_DECLARATION_FILE_MAPPING = 'localecharactervariant.sql'
1695
# NOTE(review): "Braile" misspelling ("Braille") kept — renaming the class
# would break existing callers
class MandarinBraileInitialBuilder(CSVFileLoader):
    """
    Builds a mapping of Mandarin Chinese syllable initials in Pinyin to Braille
    characters.
    """
    PROVIDES = 'PinyinBrailleInitialMapping'

    # data and schema files resolved via findFile() in CSVFileLoader.build()
    TABLE_CSV_FILE_MAPPING = 'pinyinbrailleinitialmapping.csv'
    TABLE_DECLARATION_FILE_MAPPING = 'pinyinbrailleinitialmapping.sql'
1706
# NOTE(review): "Braile" misspelling ("Braille") kept — renaming the class
# would break existing callers
class MandarinBraileFinalBuilder(CSVFileLoader):
    """
    Builds a mapping of Mandarin Chinese syllable finals in Pinyin to Braille
    characters.
    """
    PROVIDES = 'PinyinBrailleFinalMapping'

    # data and schema files resolved via findFile() in CSVFileLoader.build()
    TABLE_CSV_FILE_MAPPING = 'pinyinbraillefinalmapping.csv'
    TABLE_DECLARATION_FILE_MAPPING = 'pinyinbraillefinalmapping.sql'
1717
#}
#{ Library dependant

class ZVariantBuilder(EntryGeneratorBuilder):
    """
    Builds a list of glyph indices for characters.
    @todo Impl: Check if all Z-variants in LocaleCharacterVariant are included.
    @todo Bug: Forms with two variants in CharacterDecomposition are missing,
        e.g. ⾓.
    """
    PROVIDES = 'ZVariants'
    DEPENDS = ['CharacterDecomposition', 'StrokeOrder', 'Unihan']
    # TODO 'LocaleCharacterVariant'

    COLUMNS = ['ChineseCharacter', 'ZVariant']
    PRIMARY_KEYS = ['ChineseCharacter', 'ZVariant']
    INDEX_KEYS = [['ChineseCharacter']]
    COLUMN_TYPES = {'ChineseCharacter': String(1), 'ZVariant': Integer()}

    def __init__(self, dataPath, dbConnectInst, quiet=False):
        super(ZVariantBuilder, self).__init__(dataPath, dbConnectInst, quiet)

    def getGenerator(self):
        """Return a generator over all known (character, Z-variant) pairs."""
        decompositionTable = self.db.tables['CharacterDecomposition']
        # fix: was wrongly bound to 'CharacterDecomposition', silently
        # dropping glyphs that only appear in the StrokeOrder table (cf.
        # StrokeCountBuilder.getGenerator)
        strokeOrderTable = self.db.tables['StrokeOrder']
        unihanTable = self.db.tables['Unihan']

        characterSet = set(self.db.selectRows(
            select([decompositionTable.c.ChineseCharacter,
                decompositionTable.c.ZVariant], distinct=True)))
        characterSet.update(self.db.selectRows(
            select([strokeOrderTable.c.ChineseCharacter,
                strokeOrderTable.c.ZVariant])))
        # TODO
        #characterSet.update(self.db.select('LocaleCharacterVariant',
            #['ChineseCharacter', 'ZVariant']))
        # Add characters from Unihan as Z-variant 0
        unihanCharacters = self.db.selectScalars(
            select([unihanTable.c.ChineseCharacter],
                or_(unihanTable.c.kTotalStrokes != None,
                    unihanTable.c.kRSKangXi != None)))
        characterSet.update([(char, 0) for char in unihanCharacters])

        return ListGenerator(characterSet).generator()
1763
class StrokeCountBuilder(EntryGeneratorBuilder):
    """
    Builds a mapping between characters and their stroke count.
    """
    class StrokeCountGenerator:
        """Generates the character stroke count mapping."""
        def __init__(self, dbConnectInst, characterSet, quiet=False):
            """
            Initialises the StrokeCountGenerator.

            @type dbConnectInst: instance
            @param dbConnectInst: instance of a L{DatabaseConnector}.
            @type characterSet: set
            @param characterSet: set of characters to generate the table for
            @type quiet: bool
            @param quiet: if true no status information will be printed to
                stderr
            """
            self.characterSet = characterSet
            self.quiet = quiet
            self.cjk = characterlookup.CharacterLookup(
                dbConnectInst=dbConnectInst)
            # make sure a currently existing table is not used
            self.cjk.hasStrokeCount = False

        def generator(self):
            """Provides one entry per character, z-Variant and locale subset."""
            for char, zVariant in self.characterSet:
                try:
                    # cjklib's stroke count method uses the stroke order
                    #   information as long as this table doesn't exist
                    strokeCount = self.cjk.getStrokeCount(char,
                        zVariant=zVariant)
                    yield {'ChineseCharacter': char, 'StrokeCount': strokeCount,
                        'ZVariant': zVariant}
                except exception.NoInformationError:
                    # no stroke data available for this glyph; skip it
                    pass
                except IndexError:
                    if not self.quiet:
                        warn("malformed IDS for character '" + char \
                            + "'")

    PROVIDES = 'StrokeCount'
    DEPENDS = ['CharacterDecomposition', 'StrokeOrder']

    COLUMNS = ['ChineseCharacter', 'StrokeCount', 'ZVariant']
    PRIMARY_KEYS = ['ChineseCharacter', 'ZVariant']
    COLUMN_TYPES = {'ChineseCharacter': String(1), 'StrokeCount': Integer(),
        'ZVariant': Integer()}

    def __init__(self, dataPath, dbConnectInst, quiet=False):
        super(StrokeCountBuilder, self).__init__(dataPath, dbConnectInst, quiet)

    def getGenerator(self):
        """Return a generator over stroke counts for all known glyphs."""
        decompositionTable = self.db.tables['CharacterDecomposition']
        strokeOrderTable = self.db.tables['StrokeOrder']

        # glyphs from both decomposition and stroke order data
        characterSet = set(self.db.selectRows(
            select([decompositionTable.c.ChineseCharacter,
                decompositionTable.c.ZVariant], distinct=True)))
        characterSet.update(self.db.selectRows(
            select([strokeOrderTable.c.ChineseCharacter,
                strokeOrderTable.c.ZVariant])))
        return StrokeCountBuilder.StrokeCountGenerator(self.db, characterSet,
            self.quiet).generator()
1830
class CombinedStrokeCountBuilder(StrokeCountBuilder):
    """
    Builds a mapping between characters and their stroke count. Includes stroke
    count data from the Unihan database to make up for missing data in own data
    files.

    Entries from the preferred builder take precedence; Unihan-derived counts
    only fill the gaps.
    """
        """Generates the character stroke count mapping from mixed sources."""
        def __init__(self, dbConnectInst, characterSet, tableEntries,
            preferredBuilder, quiet=False):
            """
            Initialises the CombinedStrokeCountGenerator.

            @type dbConnectInst: instance
            @param dbConnectInst: instance of a L{DatabaseConnector}.
            @type characterSet: set
            @param characterSet: set of characters to generate the table for
            @type tableEntries: list of list
            @param tableEntries: list of characters with Z-variant
            @type preferredBuilder: instance
            @param preferredBuilder: TableBuilder which forms are preferred over
                entries from the Unihan table
            @type quiet: bool
            @param quiet: if true no status information will be printed to
                stderr
            """
            self.characterSet = characterSet
            self.tableEntries = tableEntries
            self.preferredBuilder = preferredBuilder
            self.quiet = quiet
            # character lookup used to resolve component decompositions
            self.cjk = characterlookup.CharacterLookup(
                dbConnectInst=dbConnectInst)
            self.db = dbConnectInst
1865
1866 - def getStrokeCount(self, char, zVariant, strokeCountDict, 1867 unihanStrokeCountDict, decompositionDict):
1868 """ 1869 Gets the stroke count of the given character by summing up the 1870 stroke count of its components and using the Unihan table as 1871 fallback. 1872 1873 For the sake of consistency this method doesn't take the stroke 1874 count given by Unihan directly but sums up the stroke counts of the 1875 components to make sure the sum of component's stroke count will 1876 always give the characters stroke count. The result yielded will be 1877 in many cases even more precise than the value given in Unihan (not 1878 depending on the actual glyph form). 1879 1880 Once calculated the stroke count will be cached in the given 1881 strokeCountDict object. 1882 1883 @type char: str 1884 @param char: Chinese character 1885 @type zVariant: int 1886 @param zVariant: Z-variant of character 1887 @rtype: int 1888 @return: stroke count 1889 @raise ValueError: if stroke count is ambiguous due to inconsistent 1890 values wrt Unihan vs. own data. 1891 @raise NoInformationError: if decomposition is incomplete 1892 """ 1893 if char == u'?': 1894 # we have an incomplete decomposition, can't build 1895 raise exception.NoInformationError("incomplete decomposition") 1896 1897 if (char, zVariant) not in strokeCountDict: 1898 lastStrokeCount = None 1899 if (char, zVariant) in decompositionDict: 1900 # try all decompositions of this character, all need to 1901 # return the same count for sake of consistency 1902 for decomposition in decompositionDict[(char, zVariant)]: 1903 try: 1904 accumulatedStrokeCount = 0 1905 1906 for entry in decomposition: 1907 if type(entry) == types.TupleType: 1908 component, componentZVariant = entry 1909 1910 accumulatedStrokeCount += \ 1911 self.getStrokeCount(component, 1912 componentZVariant, strokeCountDict, 1913 unihanStrokeCountDict, 1914 decompositionDict) 1915 1916 if lastStrokeCount != None \ 1917 and lastStrokeCount != accumulatedStrokeCount: 1918 # different stroke counts taken from different 1919 # decompositions, can't build at all 1920 raise 
ValueError("ambiguous stroke count " \ 1921 + "information, due to various stroke " \ 1922 + "count sources for " \ 1923 + repr((char, ZVariant))) 1924 else: 1925 # first run or equal to previous calculation 1926 lastStrokeCount = accumulatedStrokeCount 1927 1928 except exception.NoInformationError: 1929 continue 1930 1931 if lastStrokeCount != None: 1932 strokeCountDict[(char, zVariant)] = lastStrokeCount 1933 else: 1934 # couldn't get stroke counts from components, check fallback 1935 # resources 1936 if (char, 0) in strokeCountDict: 1937 # own sources have info for fallback zVariant 1938 strokeCountDict[(char, zVariant)] \ 1939 = strokeCountDict[(char, 0)] 1940 1941 elif char in unihanStrokeCountDict: 1942 # take Unihan info 1943 strokeCountDict[(char, zVariant)] \ 1944 = unihanStrokeCountDict[char] 1945 1946 else: 1947 strokeCountDict[(char, zVariant)] = None 1948 1949 if strokeCountDict[(char, zVariant)] == None: 1950 raise exception.NoInformationError( 1951 "missing stroke count information") 1952 else: 1953 return strokeCountDict[(char, zVariant)]
1954
1955 - def generator(self):
1956 """Provides one entry per character, z-Variant and locale subset.""" 1957 # handle chars from own data first 1958 strokeCountDict = {} 1959 for entry in self.preferredBuilder: 1960 yield entry 1961 1962 # save stroke count for later processing, prefer Z-variant 0 1963 key = (entry['ChineseCharacter'], entry['ZVariant']) 1964 strokeCountDict[key] = entry['StrokeCount'] 1965 1966 # now get stroke counts from Unihan table 1967 1968 # get Unihan table stroke count data 1969 unihanStrokeCountDict = {} 1970 for char, strokeCount in self.tableEntries: 1971 if (char, 0) not in strokeCountDict: 1972 unihanStrokeCountDict[char] = strokeCount 1973 1974 # finally fill up with characters from Unihan; proper glyph 1975 # information missing though in some cases. 1976 1977 # remove glyphs we already have an entry for 1978 self.characterSet.difference_update(strokeCountDict.keys()) 1979 1980 # get character decompositions 1981 decompositionDict = self.cjk.getDecompositionEntriesDict() 1982 1983 for char, zVariant in self.characterSet: 1984 warningZVariants = [] 1985 try: 1986 # build stroke count from mixed source 1987 strokeCount = self.getStrokeCount(char, zVariant, 1988 strokeCountDict, unihanStrokeCountDict, 1989 decompositionDict) 1990 1991 yield {'ChineseCharacter': char, 'ZVariant': zVariant, 1992 'StrokeCount': strokeCount} 1993 except ValueError, e: 1994 warningZVariants.append(zVariant) 1995 except exception.NoInformationError: 1996 pass 1997 1998 if not self.quiet and warningZVariants: 1999 warn("ambiguous stroke count information (mixed sources) " \ 2000 "for character '" + char + "' for Z-variant(s) '" \ 2001 + ''.join([str(z) for z in warningZVariants]) + "'")
2002 2003 DEPENDS = ['CharacterDecomposition', 'StrokeOrder', 'Unihan'] 2004 COLUMN_SOURCE = 'kTotalStrokes' 2005
2006 - def getGenerator(self):
2007 decompositionTable = self.db.tables['CharacterDecomposition'] 2008 strokeOrderTable = self.db.tables['StrokeOrder'] 2009 unihanTable = self.db.tables['Unihan'] 2010 2011 characterSet = set(self.db.selectRows( 2012 select([decompositionTable.c.ChineseCharacter, 2013 decompositionTable.c.ZVariant], distinct=True))) 2014 characterSet.update(self.db.selectRows( 2015 select([strokeOrderTable.c.ChineseCharacter, 2016 strokeOrderTable.c.ZVariant]))) 2017 preferredBuilder = \ 2018 CombinedStrokeCountBuilder.StrokeCountGenerator(self.db, 2019 characterSet, self.quiet).generator() 2020 # get main builder 2021 tableEntries = self.db.selectRows( 2022 select([unihanTable.c.ChineseCharacter, 2023 unihanTable.c[self.COLUMN_SOURCE]], 2024 unihanTable.c[self.COLUMN_SOURCE] != None)) 2025 2026 # get characters to build combined stroke count for. Some characters 2027 # from the CharacterDecomposition table might not have a stroke count 2028 # entry in Unihan though their components do have. 2029 characterSet.update([(char, 0) for char, totalCount in tableEntries]) 2030 2031 return CombinedStrokeCountBuilder.CombinedStrokeCountGenerator(self.db, 2032 characterSet, tableEntries, preferredBuilder, self.quiet)\ 2033 .generator()
2034
2035 2036 -class CharacterComponentLookupBuilder(EntryGeneratorBuilder):
2037 """ 2038 Builds a mapping between characters and their components. 2039 """
2041 """Generates the component to character mapping.""" 2042
2043 - def __init__(self, dbConnectInst, characterSet):
2044 """ 2045 Initialises the CharacterComponentGenerator. 2046 2047 @type dbConnectInst: instance 2048 @param dbConnectInst: instance of a L{DatabaseConnector} 2049 @type characterSet: set 2050 @param characterSet: set of characters to generate the table for 2051 """ 2052 self.characterSet = characterSet 2053 self.cjk = characterlookup.CharacterLookup( 2054 dbConnectInst=dbConnectInst)
2055
2056 - def getComponents(self, char, zVariant, decompositionDict, 2057 componentDict):
2058 """ 2059 Gets all character components for the given glyph. 2060 2061 @type char: str 2062 @param char: Chinese character 2063 @type zVariant: int 2064 @param zVariant: Z-variant of character 2065 @rtype: set 2066 @return: all components of the character 2067 """ 2068 if (char, zVariant) not in componentDict: 2069 componentDict[(char, zVariant)] = set() 2070 2071 if (char, zVariant) in decompositionDict: 2072 for decomposition in decompositionDict[(char, zVariant)]: 2073 componentDict[(char, zVariant)].update( 2074 [entry for entry in decomposition \ 2075 if type(entry) == types.TupleType]) 2076 2077 componentSet = set() 2078 for component, componentZVariant in componentDict[(char, zVariant)]: 2079 componentSet.add((component, componentZVariant)) 2080 # get sub-components 2081 componentSet.update(self.getComponents(component, 2082 componentZVariant, decompositionDict, componentDict)) 2083 2084 return componentSet
2085
2086 - def generator(self):
2087 """Provides the component entries.""" 2088 decompositionDict = self.cjk.getDecompositionEntriesDict() 2089 componentDict = {} 2090 for char, zVariant in self.characterSet: 2091 for component, componentZVariant \ 2092 in self.getComponents(char, zVariant, decompositionDict, 2093 componentDict): 2094 yield {'ChineseCharacter': char, 'ZVariant': zVariant, 2095 'Component': component, 2096 'ComponentZVariant': componentZVariant}
2097 2098 PROVIDES = 'ComponentLookup' 2099 DEPENDS = ['CharacterDecomposition'] 2100 2101 COLUMNS = ['ChineseCharacter', 'ZVariant', 'Component', 'ComponentZVariant'] 2102 PRIMARY_KEYS = COLUMNS 2103 INDEX_KEYS = [['Component']] 2104 COLUMN_TYPES = {'ChineseCharacter': String(1), 'ZVariant': Integer(), 2105 'Component': String(1), 'ComponentZVariant': Integer()} 2106
    def __init__(self, dataPath, dbConnectInst, quiet=False):
        """
        Constructs the CharacterComponentLookupBuilder.

        @param dataPath: data path, passed unchanged to the base class
        @type dbConnectInst: instance
        @param dbConnectInst: instance of a L{DatabaseConnector}
        @type quiet: bool
        @param quiet: if true no status information will be printed to stderr
        """
        super(CharacterComponentLookupBuilder, self).__init__(dataPath,
            dbConnectInst, quiet)
2110
2111 - def getGenerator(self):
2112 decompositionTable = self.db.tables['CharacterDecomposition'] 2113 characterSet = set(self.db.selectRows( 2114 select([decompositionTable.c.ChineseCharacter, 2115 decompositionTable.c.ZVariant], distinct=True))) 2116 return CharacterComponentLookupBuilder.CharacterComponentGenerator( 2117 self.db, characterSet).generator()
2118
2119 2120 -class CharacterRadicalStrokeCountBuilder(EntryGeneratorBuilder):
2121 """ 2122 Builds a mapping between characters and their radical with stroke count of 2123 residual components. 2124 2125 This class can be extended by inheriting 2126 L{CharacterRadicalStrokeCountGenerator} and overwriting 2127 L{CharacterRadicalStrokeCountGenerator.getFormRadicalIndex()} to implement 2128 which forms should be regarded as radicals as well as 2129 L{CharacterRadicalStrokeCountGenerator.filterForms()} to filter entries 2130 before creation. 2131 """
2133 """Generates the character to radical/residual stroke count mapping.""" 2134
2135 - def __init__(self, dbConnectInst, characterSet, quiet=False):
2136 """ 2137 Initialises the CharacterRadicalStrokeCountGenerator. 2138 2139 @type dbConnectInst: instance 2140 @param dbConnectInst: instance of a L{DatabaseConnector} 2141 @type characterSet: set 2142 @param characterSet: set of characters to generate the table for 2143 @type quiet: bool 2144 @param quiet: if true no status information will be printed to 2145 stderr 2146 """ 2147 self.characterSet = characterSet 2148 self.quiet = quiet 2149 self.cjk = characterlookup.CharacterLookup( 2150 dbConnectInst=dbConnectInst) 2151 self.radicalForms = None
2152
2153 - def getFormRadicalIndex(self, form):
2154 """ 2155 Returns the Kangxi radical index for the given component. 2156 2157 @type form: str 2158 @param form: component 2159 @rtype: int 2160 @return: radical index of the given radical form. 2161 """ 2162 if self.radicalForms == None: 2163 self.radicalForms = {} 2164 for loc in ['T', 'C', 'J', 'K', 'V']: 2165 for radicalIdx in range(1, 215): 2166 for f in \ 2167 self.cjk.getKangxiRadicalRepresentativeCharacters( 2168 radicalIdx, loc): 2169 self.radicalForms[f] = radicalIdx 2170 2171 if form not in self.radicalForms: 2172 return None 2173 return self.radicalForms[form]
2174
2175 - def filterForms(self, formSet):
2176 u""" 2177 Filters the set of given radical form entries to return only one 2178 single occurrence of a radical. 2179 2180 @type formSet: set of dict 2181 @param formSet: radical/residual stroke count entries as generated 2182 by L{getEntries()}. 2183 @rtype: set of dict 2184 @return: subset of input 2185 @todo Lang: On multiple occurrences of same radical (may be in 2186 different forms): Which one to choose? Implement to turn down 2187 unwanted forms. 2188 """ 2189 return formSet
2190
2191 - def getEntries(self, char, zVariant, strokeCountDict, decompositionDict, 2192 entriesDict):
2193 u""" 2194 Gets all radical/residual stroke count combinations from the given 2195 decomposition. 2196 2197 @rtype: list 2198 @return: all radical/residual stroke count combinations for the 2199 character 2200 @raise ValueError: if IDS is malformed or ambiguous residual stroke 2201 count is calculated 2202 @todo Fix: Remove validity check, only needed as long 2203 decomposition entries aren't checked against stroke order 2204 entries. 2205 """ 2206 def getCharLayout(mainCharacterLayout, mainLayoutPosition, 2207 subCharLayout, subLayoutPosition): 2208 u""" 2209 Returns the character layout for the radical form within the 2210 component with layout subCharLayout itself belonging to a parent 2211 char with layout mainCharacterLayout. 2212 E.g. 鸺 can be decomposed into ⿰休鸟 and 休 can be furthermore 2213 decomposed into ⿰亻木. 亻 is found in a lower layer of 2214 decomposition, but as the structure of 休 and 鸺 are the same, 2215 and 亻 is on the left side of 休 which is on the left side of 鸺 2216 one can deduce 亻 as being on the utmost left side of 鸺. Thus 2217 (⿰, 0) would be returned. 
2218 """ 2219 specialReturn = { 2220 (u'⿰', 0, u'⿰', 0): (u'⿰', 0), 2221 (u'⿰', 1, u'⿰', 1): (u'⿰', 1), 2222 (u'⿱', 0, u'⿱', 0): (u'⿱', 0), 2223 (u'⿱', 1, u'⿱', 1): (u'⿱', 1), 2224 (u'⿲', 0, u'⿲', 0): (u'⿰', 0), 2225 (u'⿲', 2, u'⿲', 2): (u'⿰', 1), 2226 (u'⿳', 0, u'⿳', 0): (u'⿱', 0), 2227 (u'⿳', 2, u'⿳', 2): (u'⿱', 0), 2228 (u'⿲', 0, u'⿰', 0): (u'⿰', 0), 2229 (u'⿲', 2, u'⿰', 1): (u'⿰', 1), 2230 (u'⿰', 0, u'⿲', 0): (u'⿰', 0), 2231 (u'⿰', 1, u'⿲', 1): (u'⿰', 1), 2232 (u'⿳', 0, u'⿱', 0): (u'⿱', 0), 2233 (u'⿳', 2, u'⿱', 1): (u'⿱', 1), 2234 (u'⿱', 0, u'⿳', 0): (u'⿱', 0), 2235 (u'⿱', 1, u'⿳', 2): (u'⿱', 1), 2236 } 2237 entry = (mainCharacterLayout, mainLayoutPosition, subCharLayout, 2238 subLayoutPosition) 2239 if entry in specialReturn: 2240 return specialReturn[entry] 2241 elif subCharLayout == u'⿻': 2242 # default value for complex position 2243 return (u'⿻', 0) 2244 elif mainCharacterLayout == None: 2245 # main layout 2246 return subCharLayout, subLayoutPosition 2247 else: 2248 # radical component has complex position 2249 return (u'⿻', 0)
2250 2251 # if no decomposition available then there is nothing to do 2252 if (char, zVariant) not in decompositionDict: 2253 return [] 2254 2255 if (char, zVariant) not in entriesDict: 2256 entriesDict[(char, zVariant)] = set() 2257 2258 for decomposition in decompositionDict[(char, zVariant)]: 2259 componentRadicalForms = [] 2260 # if a radical is found in a subcharacter an entry is added 2261 # containing the radical form, its variant, the stroke 2262 # count of residual characters in this main character and 2263 # it's position in the main char (e.g. for 鸺 contains 2264 # Form 鸟, Z-variant 0, residual stroke count 6, main 2265 # layout ⿰ and position 1 (right side), as 亻 and 木 2266 # together form the residual components, and the 2267 # simplified structure of 鸺 applies to a left/right 2268 # model, with 鸟 being at the 2nd position. 2269 2270 # get all radical entries 2271 2272 # layout stack which holds the IDS operators and a position 2273 # in the IDS operator itself for each Chinese character 2274 layoutStack = [(None, None)] 2275 2276 for entry in decomposition: 2277 try: 2278 layout, position = layoutStack.pop() 2279 except IndexError: 2280 raise ValueError("malformed IDS for character '" \ 2281 + mainChar + "'") 2282 2283 if type(entry) != types.TupleType: 2284 # ideographic description character found, derive 2285 # layout from IDS and parent character and store 2286 # in layout stack to be consumed by following 2287 # Chinese characters 2288 if self.cjk.isTrinaryIDSOperator(entry): 2289 posRange = [2, 1, 0] 2290 else: 2291 posRange = [1, 0] 2292 2293 for componentPos in posRange: 2294 # append to stack one per following element, 2295 # adapt layout to parent one 2296 layoutStack.append(getCharLayout(layout, 2297 position, entry, componentPos)) 2298 else: 2299 # Chinese character found 2300 componentChar, componentZVariant = entry 2301 2302 # create entries for this component 2303 radicalIndex \ 2304 = self.getFormRadicalIndex(componentChar) 2305 if 
radicalIndex != None: 2306 # main component is radical, no residual stroke 2307 # count, save relative position in main 2308 # character 2309 componentRadicalForms.append( 2310 {'Component': entry, 2311 'Form': componentChar, 2312 'Z-variant': componentZVariant, 2313 'ResidualStrokeCount': 0, 2314 'CharacterLayout': layout, 2315 'RadicalIndex': radicalIndex, 2316 'RadicalPosition': position}) 2317 2318 # get all radical forms for this entry from 2319 # sub-components 2320 for radicalEntry in self.getEntries(componentChar, 2321 componentZVariant, strokeCountDict, 2322 decompositionDict, entriesDict): 2323 2324 # get layout for this character wrt parent char 2325 charLayout, charPosition = getCharLayout(layout, 2326 position, radicalEntry['CharacterLayout'], 2327 radicalEntry['RadicalPosition']) 2328 componentEntry = radicalEntry.copy() 2329 componentEntry['Component'] = entry 2330 componentEntry['CharacterLayout'] = charLayout 2331 componentEntry['RadicalPosition'] = charPosition 2332 componentRadicalForms.append(componentEntry) 2333 2334 # for each character get the residual characters first 2335 residualCharacters = {} 2336 charactersSeen = [] 2337 for entry in decomposition: 2338 # get Chinese characters 2339 if type(entry) == types.TupleType: 2340 # fill up already seen characters with next found 2341 for seenEntry in residualCharacters: 2342 residualCharacters[seenEntry].append(entry) 2343 2344 # set current character to already seen ones 2345 residualCharacters[entry] = charactersSeen[:] 2346 2347 charactersSeen.append(entry) 2348 2349 # calculate residual stroke count and create entries 2350 for componentEntry in componentRadicalForms: 2351 # residual stroke count is the sum of the component's 2352 # residual stroke count (with out radical) and count 2353 # of the other components 2354 for entry in \ 2355 residualCharacters[componentEntry['Component']]: 2356 2357 if entry not in strokeCountDict: 2358 break 2359 2360 componentEntry['ResidualStrokeCount'] \ 
2361 += strokeCountDict[entry] 2362 else: 2363 # all stroke counts available 2364 del componentEntry['Component'] 2365 entriesDict[(char, zVariant)].add( 2366 frozenset(componentEntry.items())) 2367 2368 # validity check # TODO only needed as long decomposition and 2369 # stroke order entries aren't checked for validity 2370 seenEntriesDict = {} 2371 for entry in [dict(d) for d in entriesDict[(char, zVariant)]]: 2372 keyEntry = (entry['Form'], entry['Z-variant'], 2373 entry['CharacterLayout'], entry['RadicalIndex'], 2374 entry['RadicalPosition']) 2375 if keyEntry in seenEntriesDict \ 2376 and seenEntriesDict[keyEntry] \ 2377 != entry['ResidualStrokeCount']: 2378 raise ValueError("ambiguous residual stroke count for " \ 2379 + "character '" + mainChar + "' with entry '" \ 2380 + "', '".join(list([unicode(column) \ 2381 for column in keyEntry])) \ 2382 + "': '" + str(seenEntriesDict[keyEntry]) + "'/'" \ 2383 + str(entry['ResidualStrokeCount']) + "'") 2384 seenEntriesDict[keyEntry] = entry['ResidualStrokeCount'] 2385 2386 # filter forms, i.e. for multiple radical occurrences prefer one 2387 return self.filterForms( 2388 [dict(d) for d in entriesDict[(char, zVariant)]])
2389
2390 - def generator(self):
2391 """Provides the radical/stroke count entries.""" 2392 strokeCountDict = self.cjk.getStrokeCountDict() 2393 decompositionDict = self.cjk.getDecompositionEntriesDict() 2394 entryDict = {} 2395 2396 for char, zVariant in self.characterSet: 2397 if self.cjk.isRadicalChar(char): 2398 # ignore Unicode radical forms 2399 continue 2400 2401 for entry in self.getEntries(char, zVariant, strokeCountDict, 2402 decompositionDict, entryDict): 2403 2404 yield [char, zVariant, entry['RadicalIndex'], entry['Form'], 2405 entry['Z-variant'], entry['CharacterLayout'], 2406 entry['RadicalPosition'], entry['ResidualStrokeCount']]
2407 2408 PROVIDES = 'CharacterRadicalResidualStrokeCount' 2409 DEPENDS = ['CharacterDecomposition', 'StrokeCount', 'KangxiRadical', 2410 'KangxiRadicalIsolatedCharacter', 'RadicalEquivalentCharacter', 2411 'CharacterKangxiRadical'] 2412 2413 COLUMNS = ['ChineseCharacter', 'ZVariant', 'RadicalIndex', 'RadicalForm', 2414 'RadicalZVariant', 'MainCharacterLayout', 'RadicalRelativePosition', 2415 'ResidualStrokeCount'] 2416 PRIMARY_KEYS = ['ChineseCharacter', 'ZVariant', 'RadicalForm', 2417 'RadicalZVariant', 'MainCharacterLayout', 'RadicalRelativePosition'] 2418 COLUMN_TYPES = {'ChineseCharacter': String(1), 'RadicalIndex': Integer(), 2419 'RadicalForm': String(1), 'ZVariant': Integer(), 2420 'RadicalZVariant': Integer(), 'MainCharacterLayout': String(1), 2421 'RadicalRelativePosition': Integer(), 'ResidualStrokeCount': Integer()} 2422
    def __init__(self, dataPath, dbConnectInst, quiet=False):
        """
        Constructs the CharacterRadicalStrokeCountBuilder.

        @param dataPath: data path, passed unchanged to the base class
        @type dbConnectInst: instance
        @param dbConnectInst: instance of a L{DatabaseConnector}
        @type quiet: bool
        @param quiet: if true no status information will be printed to stderr
        """
        super(CharacterRadicalStrokeCountBuilder, self).__init__(dataPath,
            dbConnectInst, quiet)
2426
2427 - def getGenerator(self):
2428 # get all characters we have component information for 2429 decompositionTable = self.db.tables['CharacterDecomposition'] 2430 characterSet = set(self.db.selectRows( 2431 select([decompositionTable.c.ChineseCharacter, 2432 decompositionTable.c.ZVariant], distinct=True))) 2433 return CharacterRadicalStrokeCountBuilder\ 2434 .CharacterRadicalStrokeCountGenerator(self.db, characterSet, 2435 self.quiet).generator()
2436
2437 2438 -class CharacterResidualStrokeCountBuilder(EntryGeneratorBuilder):
2439 """ 2440 Builds a mapping between characters and their residual stroke count when 2441 splitting of the radical form. This is stripped off information gathered 2442 from table C{CharacterRadicalStrokeCount}. 2443 """
2445 """ 2446 Generates the character to residual stroke count mapping from the 2447 C{CharacterRadicalResidualStrokeCount} table. 2448 """
2449 - def __init__(self, dbConnectInst, characterSet):
2450 """ 2451 Initialises the ResidualStrokeCountExtractor. 2452 2453 @type dbConnectInst: instance 2454 @param dbConnectInst: instance of a L{DatabaseConnector} 2455 @type characterSet: set 2456 @param characterSet: set of characters to generate the table for 2457 """ 2458 self.characterSet = characterSet 2459 self.cjk = characterlookup.CharacterLookup( 2460 dbConnectInst=dbConnectInst)
2461
2462 - def getEntries(self, char, zVariant, radicalDict):
2463 u""" 2464 Gets a list of radical residual entries. For multiple radical 2465 occurrences (e.g. 伦) only returns the residual stroke count for the 2466 "main" radical form. 2467 2468 @type char: str 2469 @param char: Chinese character 2470 @type zVariant: int 2471 @param zVariant: I{Z-variant} of given character 2472 @rtype: list of tuple 2473 @return: list of residual stroke count entries 2474 @todo Lang: Implement, find a good algorithm to turn down unwanted 2475 forms, don't just choose random one. See the following list:: 2476 2477 >>> from cjklib import characterlookup 2478 >>> cjk = characterlookup.CharacterLookup() 2479 >>> for char in cjk.db.selectSoleValue('CharacterRadicalResidualStrokeCount', 2480 ... 'ChineseCharacter', distinctValues=True): 2481 ... try: 2482 ... entries = cjk.getCharacterKangxiRadicalResidualStrokeCount(char, 'C') 2483 ... lastEntry = entries[0] 2484 ... for entry in entries[1:]: 2485 ... # print if diff. radical forms and diff. residual stroke count 2486 ... if lastEntry[0] != entry[0] and lastEntry[2] != entry[2]: 2487 ... print char 2488 ... break 2489 ... lastEntry = entry 2490 ... except: 2491 ... pass 2492 ... 2493 2494 2495 2496 2497 2498 >>> cjk.getCharacterKangxiRadicalResidualStrokeCount(u'缧') 2499 [(u'\u7cf8', 0, u'\u2ffb', 0, 8), (u'\u7e9f', 0, u'\u2ff0', 0, 11)] 2500 """ 2501 # filter entries to return only the main radical form 2502 # TODO provisional solution, take first entry per radical index 2503 filteredEntries = [] 2504 for radicalIdx in radicalDict[(char, zVariant)]: 2505 _, _, _, _, residualStrokeCount \ 2506 = radicalDict[(char, zVariant)][radicalIdx][0] 2507 filteredEntries.append((radicalIdx, residualStrokeCount)) 2508 2509 return filteredEntries
2510
2511 - def generator(self):
2512 """Provides one entry per character, z-Variant and locale subset.""" 2513 radicalDict = self.cjk.getCharacterRadicalResidualStrokeCountDict() 2514 for char, zVariant in self.characterSet: 2515 for radicalIndex, residualStrokeCount in self.getEntries(char, 2516 zVariant, radicalDict): 2517 yield [char, zVariant, radicalIndex, residualStrokeCount]
2518 2519 PROVIDES = 'CharacterResidualStrokeCount' 2520 DEPENDS = ['CharacterRadicalResidualStrokeCount'] 2521 2522 COLUMNS = ['ChineseCharacter', 'ZVariant', 'RadicalIndex', 2523 'ResidualStrokeCount'] 2524 PRIMARY_KEYS = ['ChineseCharacter', 'ZVariant', 'RadicalIndex'] 2525 INDEX_KEYS = [['RadicalIndex']] 2526 COLUMN_TYPES = {'ChineseCharacter': String(1), 'RadicalIndex': Integer(), 2527 'ZVariant': Integer(), 'ResidualStrokeCount': Integer()} 2528
    def __init__(self, dataPath, dbConnectInst, quiet=False):
        """
        Constructs the CharacterResidualStrokeCountBuilder.

        @param dataPath: data path, passed unchanged to the base class
        @type dbConnectInst: instance
        @param dbConnectInst: instance of a L{DatabaseConnector}
        @type quiet: bool
        @param quiet: if true no status information will be printed to stderr
        """
        super(CharacterResidualStrokeCountBuilder, self).__init__(dataPath,
            dbConnectInst, quiet)
2532
2533 - def getGenerator(self):
2534 residualSCTable = self.db.tables['CharacterRadicalResidualStrokeCount'] 2535 characterSet = set(self.db.selectRows( 2536 select([residualSCTable.c.ChineseCharacter, 2537 residualSCTable.c.ZVariant], distinct=True))) 2538 return CharacterResidualStrokeCountBuilder.ResidualStrokeCountExtractor( 2539 self.db, characterSet).generator()
2540
2541 2542 -class CombinedCharacterResidualStrokeCountBuilder( 2543 CharacterResidualStrokeCountBuilder):
2544 """ 2545 Builds a mapping between characters and their residual stroke count when 2546 splitting of the radical form. Includes stroke count data from the Unihan 2547 database to make up for missing data in own data files. 2548 """
2550 """ 2551 Generates the character to residual stroke count mapping. 2552 """
2553 - def __init__(self, tableEntries, preferredBuilder, quiet=False):
2554 """ 2555 Initialises the CombinedResidualStrokeCountExtractor. 2556 2557 @type tableEntries: list of list 2558 @param tableEntries: list of characters with Z-variant 2559 @type preferredBuilder: instance 2560 @param preferredBuilder: TableBuilder which forms are preferred over 2561 entries from the Unihan table 2562 @type quiet: bool 2563 @param quiet: if true no status information will be printed 2564 """ 2565 self.RADICAL_REGEX = re.compile(ur"(\d+)\.(\d+)") 2566 self.tableEntries = tableEntries 2567 self.preferredBuilder = preferredBuilder 2568 self.quiet = quiet
2569
2570 - def generator(self):
2571 """Provides one entry per character and z-Variant.""" 2572 # handle chars from own data first 2573 seenCharactersSet = set() 2574 for entry in self.preferredBuilder: 2575 yield entry 2576 char = entry[0] 2577 radicalIdx = entry[2] 2578 seenCharactersSet.add((char, radicalIdx)) 2579 2580 # now fill up with characters from Unihan, Z-variant missing though 2581 for char, radicalStroke in self.tableEntries: 2582 matchObj = self.RADICAL_REGEX.match(radicalStroke) 2583 if matchObj: 2584 try: 2585 radicalIndex = int(matchObj.group(1)) 2586 residualStrokeCount = int(matchObj.group(2)) 2587 if (char, radicalIndex) not in seenCharactersSet: 2588 yield [char, 0, radicalIndex, residualStrokeCount] 2589 except ValueError: 2590 if not self.quiet: 2591 warn("unable to read radical information of " \ 2592 + "character '" + character + "': '" \ 2593 + radicalStroke + "'") 2594 elif not self.quiet: 2595 warn("unable to read radical information of character '" \ 2596 + character + "': '" + radicalStroke + "'")
2597 2598 DEPENDS = ['CharacterRadicalResidualStrokeCount', 'Unihan'] 2599 COLUMN_SOURCE = 'kRSKangXi' 2600
2601 - def getGenerator(self):
2602 residualSCTable = self.db.tables['CharacterRadicalResidualStrokeCount'] 2603 characterSet = set(self.db.selectRows( 2604 select([residualSCTable.c.ChineseCharacter, 2605 residualSCTable.c.ZVariant], distinct=True))) 2606 preferredBuilder = CombinedCharacterResidualStrokeCountBuilder\ 2607 .ResidualStrokeCountExtractor(self.db, characterSet).generator() 2608 2609 # get main builder 2610 unihanTable = self.db.tables['Unihan'] 2611 tableEntries = set(self.db.selectRows( 2612 select([unihanTable.c.ChineseCharacter, 2613 unihanTable.c[self.COLUMN_SOURCE]], 2614 unihanTable.c[self.COLUMN_SOURCE] != None))) 2615 return CombinedCharacterResidualStrokeCountBuilder\ 2616 .CombinedResidualStrokeCountExtractor(tableEntries, 2617 preferredBuilder, self.quiet).generator()
2618
2619 #} 2620 #{ Dictionary builder 2621 2622 -class EDICTFormatBuilder(EntryGeneratorBuilder):
2623 """ 2624 Provides an abstract class for loading EDICT formatted dictionaries. 2625 2626 One column will be provided for the headword, one for the reading (in EDICT 2627 that is the Kana) and one for the translation. 2628 @todo Fix: Optimize insert, use transaction which disables autocommit and 2629 cosider passing data all at once, requiring proper handling of row 2630 indices. 2631 """
2632 - class TableGenerator:
2633 """Generates the dictionary entries.""" 2634
2635 - def __init__(self, fileHandle, quiet=False, entryRegex=None, 2636 columns=None, filterFunc=None):
2637 """ 2638 Initialises the TableGenerator. 2639 2640 @type fileHandle: file 2641 @param fileHandle: handle of file to read from 2642 @type quiet: bool 2643 @param quiet: if true no status information will be printed 2644 @type entryRegex: instance 2645 @param entryRegex: regular expression object for entry pattern 2646 @type columns: list of str 2647 @param columns: column names of generated data 2648 @type filterFunc: function 2649 @param filterFunc: function used to filter entry content 2650 """ 2651 self.fileHandle = fileHandle 2652 self.quiet = quiet 2653 self.columns = columns 2654 self.filterFunc = filterFunc 2655 if entryRegex: 2656 self.entryRegex = entryRegex 2657 else: 2658 # the EDICT dictionary itself omits the KANA in brackets if 2659 # the headword is already a KANA word 2660 # KANJI [KANA] /english_1/english_2/.../ 2661 # KANA /english_1/.../ 2662 self.entryRegex = \ 2663 re.compile(r'\s*(\S+)\s*(?:\[([^\]]*)\]\s*)?(/.*/)\s*$')
2664
2665 - def generator(self):
2666 """Provides the dictionary entries.""" 2667 a = 0 2668 for line in self.fileHandle: 2669 # ignore comments 2670 if line.lstrip().startswith('#'): 2671 continue 2672 # parse line 2673 matchObj = self.entryRegex.match(line) 2674 if not matchObj: 2675 if line.strip() != '': 2676 warn("error reading line '" + line + "'") 2677 continue 2678 # get entries 2679 entry = matchObj.groups() 2680 if self.columns: 2681 entry = dict([(self.columns[idx], cell) for idx, cell \ 2682 in enumerate(entry)]) 2683 if self.filterFunc: 2684 entry = self.filterFunc(entry) 2685 yield entry
2686 2687 COLUMNS = ['Headword', 'Reading', 'Translation'] 2688 PRIMARY_KEYS = [] 2689 INDEX_KEYS = [['Headword'], ['Reading']] 2690 COLUMN_TYPES = {'Headword': String(255), 'Reading': String(255), 2691 'Translation': Text()} 2692 2693 FULLTEXT_COLUMNS = ['Translation'] 2694 """Column names which shall be fulltext searchable.""" 2695 FILE_NAMES = None 2696 """Names of file containing the edict formated dictionary.""" 2697 ENCODING = 'utf-8' 2698 """Encoding of the dictionary file.""" 2699 ENTRY_REGEX = None 2700 """ 2701 Regular Expression matching a dictionary entry. Needs to be overwritten if 2702 not strictly follows the EDICT format. 2703 """ 2704 IGNORE_LINES = 0 2705 """Number of starting lines to ignore.""" 2706 FILTER = None 2707 """Filter to apply to the read entry before writing to table.""" 2708
    def __init__(self, dataPath, dbConnectInst, quiet=False):
        """
        Constructs the EDICTFormatBuilder.

        @param dataPath: data path, passed unchanged to the base class
        @type dbConnectInst: instance
        @param dbConnectInst: instance of a L{DatabaseConnector}
        @type quiet: bool
        @param quiet: if true no status information will be printed to stderr
        """
        super(EDICTFormatBuilder, self).__init__(dataPath, dbConnectInst, quiet)
2711
2712 - def getGenerator(self):
2713 # get file handle 2714 import os.path as path 2715 filePath = self.findFile(self.FILE_NAMES) 2716 handle = self.getFileHandle(filePath) 2717 if not self.quiet: 2718 warn("Reading table from file '" + filePath + "'") 2719 # ignore starting lines 2720 for i in range(0, self.IGNORE_LINES): 2721 handle.readline() 2722 # create generator 2723 return EDICTFormatBuilder.TableGenerator(handle, self.quiet, 2724 self.ENTRY_REGEX, self.COLUMNS, self.FILTER).generator()
2725
2726 - def getArchiveContentName(self, filePath):
2727 """ 2728 Function extracting the name of contained file from the zipped archive 2729 using the file name. 2730 Reimplement and adapt to own needs. 2731 2732 @type filePath: str 2733 @param filePath: path of file 2734 @rtype: str 2735 @return: name of file in archive 2736 """ 2737 fileName = os.path.basename(filePath) 2738 fileRoot, _ = os.path.splitext(fileName) 2739 return fileRoot
2740
2741 - def getFileHandle(self, filePath):
2742 """ 2743 Returns a handle to the give file. 2744 2745 The file can be either normal content, zip, tar, .tar.gz, tar.bz2 2746 2747 @type filePath: str 2748 @param filePath: path of file 2749 @rtype: file 2750 @return: handle to file's content 2751 """ 2752 import zipfile 2753 import tarfile 2754 2755 fileName = os.path.basename(filePath) 2756 2757 if zipfile.is_zipfile(filePath): 2758 import StringIO 2759 z = zipfile.ZipFile(filePath, 'r') 2760 archiveContent = self.getArchiveContentName(filePath) 2761 return StringIO.StringIO(z.read(archiveContent)\ 2762 .decode(self.ENCODING)) 2763 elif tarfile.is_tarfile(filePath): 2764 import StringIO 2765 mode = '' 2766 if filePath.endswith('bz2'): 2767 mode = ':bz2' 2768 elif filePath.endswith('gz'): 2769 mode = ':gz' 2770 z = tarfile.open(filePath, 'r' + mode) 2771 archiveContent = self.getArchiveContentName(filePath) 2772 file = z.extractfile(archiveContent) 2773 return StringIO.StringIO(file.read().decode(self.ENCODING)) 2774 elif filePath.endswith('.gz'): 2775 import gzip 2776 import StringIO 2777 z = gzip.GzipFile(filePath, 'r') 2778 return StringIO.StringIO(z.read().decode(self.ENCODING)) 2779 else: 2780 import codecs 2781 return codecs.open(filePath, 'r', self.ENCODING)
2782
2783 - def buildFTS3CreateTableStatement(self, table):
2784 """ 2785 Returns a SQL statement for creating a virtual table using FTS3 for 2786 SQLite. 2787 2788 @type table: object 2789 @param table: SQLAlchemy table object representing the FTS3 table 2790 @rtype: str 2791 @return: Create table statement 2792 """ 2793 preparer = self.db.engine.dialect.identifier_preparer 2794 2795 preparedColumns = [] 2796 for column in table.columns: 2797 preparedColumns.append(preparer.format_column(column)) 2798 preparedTableName = preparer.format_table(table) 2799 return text("CREATE VIRTUAL TABLE %s USING FTS3(%s);" \ 2800 % (preparedTableName, ', '.join(preparedColumns)))
2801
    def buildFTS3Tables(self, tableName, columns, columnTypeMap={},
        primaryKeys=[], fullTextColumns=[]):
        """
        Builds a FTS3 table construct for supporting full text search under
        SQLite.

        Creates a normal table C{tableName + '_Normal'} for the non-fulltext
        columns, a FTS3 virtual table C{tableName + '_Text'} for the fulltext
        columns, and a view named C{tableName} joining both on C{rowid} so
        consumers can treat the construct as one simple table.

        @type tableName: str
        @param tableName: name of table
        @type columns: list of str
        @param columns: column names
        @type columnTypeMap: dict of str and object
        @param columnTypeMap: mapping of column name to SQLAlchemy Column
        @type primaryKeys: list of str
        @param primaryKeys: list of primary key columns
        @type fullTextColumns: list of str
        @param fullTextColumns: list of fulltext columns
        """

        # table with non-FTS3 data
        simpleColumns = [column for column in columns \
            if column not in fullTextColumns]
        simpleTable = self.buildTableObject(tableName + '_Normal',
            simpleColumns, columnTypeMap, primaryKeys)
        simpleTable.create()

        # FTS3 table; virtual tables need a hand-written CREATE statement
        fts3Table = self.buildTableObject(tableName + '_Text', fullTextColumns,
            columnTypeMap)
        createFTS3Statement = self.buildFTS3CreateTableStatement(fts3Table)
        self.db.execute(createFTS3Statement)

        # view to mask FTS3 table construct as simple table
        view = Table(tableName, self.db.metadata)
        preparer = self.db.engine.dialect.identifier_preparer
        simpleTableName = preparer.format_table(simpleTable)
        fts3TableName = preparer.format_table(fts3Table)

        createViewStatement = text("""CREATE VIEW %s AS SELECT * FROM %s JOIN %s
            ON %s.rowid = %s.rowid;""" \
            % (preparer.format_table(view), simpleTableName, fts3TableName,
                simpleTableName, fts3TableName))
        self.db.execute(createViewStatement)
        # register view so processes depending on this succeed, see special
        # view handling in DatabaseBuilder.__init__, workaround for SQLalchemy
        # TODO Bug in SQLalchemy that doesn't reflect table on reload?
        #t = Table(tableName, self.db.metadata, autoload=True, useexisting=True)
        self.db.engine.reflecttable(view)
2849
2850 - def insertFTS3Tables(self, tableName, generator, columns=[], 2851 fullTextColumns=[]):
2852 2853 simpleColumns = [column for column in columns \ 2854 if column not in fullTextColumns] 2855 simpleTable = Table(tableName + '_Normal', self.db.metadata, 2856 autoload=True) 2857 fts3Table = Table(tableName + '_Text', self.db.metadata, 2858 autoload=True) 2859 fts3FullRows = ['rowid'] 2860 fts3FullRows.extend(fullTextColumns) 2861 2862 for newEntry in generator: 2863 try: 2864 if type(newEntry) == type([]): 2865 simpleData = [newEntry[i] \ 2866 for i, column in enumerate(columns) \ 2867 if column not in fullTextColumns] 2868 fts3Data = [newEntry[i] \ 2869 for i, column in enumerate(columns) \ 2870 if column in fullTextColumns] 2871 fts3Data.insert('rowid', 0) 2872 else: 2873 simpleData = dict([(key, value) \ 2874 for key, value in newEntry.items() \ 2875 if key in simpleColumns]) 2876 fts3Data = dict([(key, value) \ 2877 for key, value in newEntry.items() \ 2878 if key in fullTextColumns]) 2879 fts3Data['rowid'] = func.last_insert_rowid() 2880 2881 # table with non-FTS3 data 2882 simpleTable.insert(simpleData).execute() 2883 fts3Table.insert(fts3Data).execute() 2884 except sqlalchemy.exceptions.IntegrityError: 2885 warn(unicode(e)) 2886 #warn(unicode(insertStatement)) 2887 raise
2888
2889 - def testFTS3(self):
2890 """ 2891 Tests if the SQLite FTS3 extension is supported on the build system. 2892 2893 @rtype: bool 2894 @return: C{True} if the FTS3 extension exists, C{False} otherwise. 2895 """ 2896 # Until #3436 is fixed (http://www.sqlite.org/cvstrac/tktview?tn=3436,5) 2897 # do it the bad way 2898 try: 2899 dummyTable = Table('cjklib_test_fts3_presence', self.db.metadata, 2900 Column('dummy'), useexisting=True) 2901 createStatement = self.buildFTS3CreateTableStatement(dummyTable) 2902 self.db.execute(createStatement) 2903 try: 2904 dummyTable.drop() 2905 except sqlalchemy.exceptions.OperationalError: 2906 pass 2907 return True 2908 except sqlalchemy.exceptions.OperationalError: 2909 return False
2910
2911 - def build(self):
2912 """ 2913 Build the table provided by the TableBuilder. 2914 2915 A search index is created to allow for fulltext searching. 2916 """ 2917 # get generator, might raise an Exception if source not found 2918 generator = self.getGenerator() 2919 2920 hasFTS3 = self.db.engine.name == 'sqlite' and self.testFTS3() 2921 if not hasFTS3: 2922 warn("No SQLite FTS3 support found, fulltext search not supported.") 2923 # get create statement 2924 table = self.buildTableObject(self.PROVIDES, self.COLUMNS, 2925 self.COLUMN_TYPES, self.PRIMARY_KEYS) 2926 table.create() 2927 else: 2928 # get create statement 2929 self.buildFTS3Tables(self.PROVIDES, self.COLUMNS, self.COLUMN_TYPES, 2930 self.PRIMARY_KEYS, self.FULLTEXT_COLUMNS) 2931 2932 if not hasFTS3: 2933 # write table content 2934 #try: 2935 #entries = self.getEntryDict(generator) 2936 #self.db.execute(table.insert(), entries) 2937 #except sqlalchemy.exceptions.IntegrityError, e: 2938 #warn(unicode(e)) 2939 ##warn(unicode(insertStatement)) 2940 #raise 2941 for newEntry in generator: 2942 try: 2943 table.insert(newEntry).execute() 2944 except sqlalchemy.exceptions.IntegrityError: 2945 warn(unicode(e)) 2946 #warn(unicode(insertStatement)) 2947 raise 2948 else: 2949 # write table content 2950 self.insertFTS3Tables(self.PROVIDES, generator, self.COLUMNS, 2951 self.FULLTEXT_COLUMNS) 2952 2953 # get create index statement 2954 if not hasFTS3: 2955 for index in self.buildIndexObjects(self.PROVIDES, self.INDEX_KEYS): 2956 index.create() 2957 else: 2958 for index in self.buildIndexObjects(self.PROVIDES + '_Normal', 2959 self.INDEX_KEYS): 2960 index.create()
2961
2962 - def remove(self):
2963 # get drop table statement 2964 2965 hasFTS3 = self.db.engine.has_table(self.PROVIDES + '_Text') 2966 if not hasFTS3: 2967 table = Table(self.PROVIDES, self.db.metadata) 2968 table.drop() 2969 else: 2970 preparer = self.db.engine.dialect.identifier_preparer 2971 view = Table(self.PROVIDES, self.db.metadata) 2972 dropViewStatement = text("DROP VIEW %s" \ 2973 % preparer.format_table(view)) 2974 self.db.execute(dropViewStatement) 2975 table = Table(self.PROVIDES + '_Normal', self.db.metadata) 2976 table.drop() 2977 table = Table(self.PROVIDES + '_Text', self.db.metadata) 2978 table.drop()
2979
class WordIndexBuilder(EntryGeneratorBuilder):
    """
    Builds a translation word index for a given dictionary.

    Searching for a word will return a headword and reading. This allows to find
    several dictionary entries with same headword and reading, with only one
    including the translation word.

    @todo Fix: Word regex is specialised for HanDeDict.
    @todo Fix: Using a row_id for joining instead of Headword(Traditional) and
        Reading would maybe speed up table joins. Needs a workaround to include
        multiple rows for one actual headword entry though.
    """
    class WordEntryGenerator:
        """Generates words for a list of dictionary entries."""

        def __init__(self, entries):
            """
            Initialises the WordEntryGenerator.

            @type entries: list of tuple
            @param entries: a list of headword and its translation
            """
            self.entries = entries
            # TODO this regex is adapted to HanDeDict, might be not general
            #   enough
            self.wordRegex = re.compile(r'\([^\)]+\)|' \
                + r'(?:; Bsp.: [^/]+?--[^/]+)|([^/,\(\)\[\]\!\?]+)')

        def generator(self):
            """Provides all data of one word per entry."""
            # remember seen entries to prevent double entries
            seenWordEntries = set()

            for headword, reading, translation in self.entries:
                for word in self.wordRegex.findall(translation):
                    word = word.strip().lower()
                    if not word:
                        continue
                    if (headword, reading, word) not in seenWordEntries:
                        seenWordEntries.add((headword, reading, word))
                        # yield a fresh dict per entry; the original reused
                        # one mutable dict, corrupting consumers that keep
                        # references to yielded entries
                        yield {'Headword': headword, 'Reading': reading,
                            'Word': word}

    COLUMNS = ['Headword', 'Reading', 'Word']
    COLUMN_TYPES = {'Headword': String(255), 'Reading': String(255),
        'Word': String(255)}
    INDEX_KEYS = [['Word']]

    TABLE_SOURCE = None
    """Dictionary source"""
    HEADWORD_SOURCE = 'Headword'
    """Source of headword"""

    def __init__(self, dataPath, dbConnectInst, quiet=False):
        super(WordIndexBuilder, self).__init__(dataPath, dbConnectInst, quiet)

    def getGenerator(self):
        """Returns a generator yielding one word entry per index row."""
        table = self.db.tables[self.TABLE_SOURCE]
        entries = self.db.selectRows(
            select([table.c[self.HEADWORD_SOURCE], table.c.Reading,
                table.c.Translation]))
        return WordIndexBuilder.WordEntryGenerator(entries).generator()
3048
class EDICTBuilder(EDICTFormatBuilder):
    """
    Builds the EDICT dictionary.
    """
    # name of the table provided to the database
    PROVIDES = 'EDICT'
    # possible source file names, searched in the data path(s)
    FILE_NAMES = ['edict.gz', 'edict.zip', 'edict']
    # EDICT is distributed in EUC-JP encoding
    ENCODING = 'euc-jp'
    # skip the file's header line
    IGNORE_LINES = 1
3058
class EDICTWordIndexBuilder(WordIndexBuilder):
    """
    Builds the word index of the EDICT dictionary.
    """
    # name of the table provided to the database
    PROVIDES = 'EDICT_Words'
    # the index is generated from the EDICT table
    DEPENDS = ['EDICT']
    TABLE_SOURCE = 'EDICT'
3067
class CEDICTFormatBuilder(EDICTFormatBuilder):
    """
    Provides an abstract class for loading CEDICT formatted dictionaries.

    Two columns will be provided for the headword (one for traditional and
    simplified writings each), one for the reading (e.g. in CEDICT Pinyin) and
    one for the translation.
    @todo Impl: Proper collation for Translation and Reading columns.
    """
    COLUMNS = ['HeadwordTraditional', 'HeadwordSimplified', 'Reading',
        'Translation']
    INDEX_KEYS = [['HeadwordTraditional'], ['HeadwordSimplified'], ['Reading']]
    COLUMN_TYPES = {'HeadwordTraditional': String(255),
        'HeadwordSimplified': String(255), 'Reading': String(255),
        'Translation': Text()}

    def __init__(self, dataPath, dbConnectInst, quiet=False):
        # entry format: "TRADITIONAL SIMPLIFIED [READING] /TRANSLATION/";
        # the second (simplified) group is optional
        self.ENTRY_REGEX = \
            re.compile(r'\s*(\S+)(?:\s+(\S+))?\s*\[([^\]]*)\]\s*(/.*/)\s*$')
        super(CEDICTFormatBuilder, self).__init__(dataPath, dbConnectInst,
            quiet)
3090
class CEDICTBuilder(CEDICTFormatBuilder):
    """
    Builds the CEDICT dictionary.
    """
    def filterUmlaut(self, entry):
        """
        Converts the C{'u:'} to C{'ü'}.

        @type entry: tuple
        @param entry: a dictionary entry
        @rtype: tuple
        @return: the given entry with corrected ü vowel
        """
        # entries arrive either as dict or as positional list, mirror the
        # input form in the result
        if isinstance(entry, dict):
            entry['Reading'] = entry['Reading'].replace('u:', u'ü')
            return entry
        else:
            trad, simp, reading, translation = entry
            reading = reading.replace('u:', u'ü')
            return [trad, simp, reading, translation]

    # name of the table provided to the database
    PROVIDES = 'CEDICT'
    # possible source file names, searched in the data path(s)
    FILE_NAMES = ['cedict_1_0_ts_utf-8_mdbg.zip',
        'cedict_1_0_ts_utf-8_mdbg.txt.gz', 'cedictu8.zip', 'cedict_ts.u8',
        'cedict_1_0_ts_utf-8_mdbg.txt']
    ENCODING = 'utf-8'
    # apply the u: -> ü conversion to every entry read
    FILTER = filterUmlaut

    def getArchiveContentName(self, filePath):
        # fixed name of the dictionary file inside the distributed archives
        return 'cedict_ts.u8'
3122
class CEDICTWordIndexBuilder(WordIndexBuilder):
    """
    Builds the word index of the CEDICT dictionary.
    """
    # name of the table provided to the database
    PROVIDES = 'CEDICT_Words'
    # the index is generated from the CEDICT table
    DEPENDS = ['CEDICT']
    TABLE_SOURCE = 'CEDICT'
    # CEDICT has two headword columns; index by the traditional one
    HEADWORD_SOURCE = 'HeadwordTraditional'
3132
class CEDICTGRBuilder(EDICTFormatBuilder):
    """
    Builds the CEDICT-GR dictionary.
    """
    # name of the table provided to the database
    PROVIDES = 'CEDICTGR'
    # archive preferred, plain Big5 file as fallback
    FILE_NAMES = ['cedictgr.zip', 'cedictgr.b5']
    ENCODING = 'big5hkscs'

    def getArchiveContentName(self, filePath):
        # fixed name of the dictionary file inside the distributed archive
        return 'cedictgr.b5'
3144
class CEDICTGRWordIndexBuilder(WordIndexBuilder):
    """
    Builds the word index of the CEDICT-GR dictionary.
    """
    # name of the table provided to the database
    PROVIDES = 'CEDICTGR_Words'
    # the index is generated from the CEDICTGR table
    DEPENDS = ['CEDICTGR']
    TABLE_SOURCE = 'CEDICTGR'
    HEADWORD_SOURCE = 'Headword'
3154
class HanDeDictBuilder(CEDICTFormatBuilder):
    """
    Builds the HanDeDict dictionary.
    """
    def filterSpacing(self, entry):
        """
        Converts wrong spacing in readings of entries in HanDeDict.

        @type entry: tuple
        @param entry: a dictionary entry
        @rtype: tuple
        @return: the given entry with corrected spacing
        """
        if type(entry) == type({}):
            headword = entry['HeadwordTraditional']
            reading = entry['Reading']
        else:
            headword, headwordSimplified, reading, translation = entry

        readingEntities = []
        precedingIsNonReading = False
        for idx, entity in enumerate(reading.split(' ')):
            if idx < len(headword) and entity == headword[idx]:
                # for entities showing up in both strings, omit spaces
                # (e.g. "IC卡", "I C kǎ"); don't prepend a space at the very
                # start (the original inserted a spurious leading space here)
                if idx != 0 and not precedingIsNonReading:
                    readingEntities.append(' ')

                precedingIsNonReading = True
            elif idx != 0:
                readingEntities.append(' ')
                precedingIsNonReading = False

            readingEntities.append(entity)

        reading = ''.join(readingEntities)

        if type(entry) == type({}):
            entry['Reading'] = reading
            return entry
        else:
            return [headword, headwordSimplified, reading, translation]

    # name of the table provided to the database
    PROVIDES = 'HanDeDict'
    # timestamped archives preferred, plain file as fallback
    FILE_NAMES = ['handedict-*.zip', 'handedict-*.tar.bz2', 'handedict.u8']
    ENCODING = 'utf-8'
    # apply the spacing correction to every entry read
    FILTER = filterSpacing

    def extractTimeStamp(self, filePath):
        """
        Extracts the timestamp from a HanDeDict file name.

        @type filePath: str
        @param filePath: path of file
        @rtype: str
        @return: timestamp of format C{YYYYMMDD}, or C{None} if none found
        """
        fileName = os.path.basename(filePath)
        matchObj = re.match(r'handedict-(\d{8})\.', fileName)
        if matchObj:
            return matchObj.group(1)

    def getPreferredFile(self, filePaths):
        """
        Chooses the file with the newest timestamp from the given paths.

        @type filePaths: list of str
        @param filePaths: paths of files to choose from
        @rtype: str
        @return: path of the newest file, or the first given path if no
            timestamp could be extracted
        """
        timeStamps = []
        for filePath in filePaths:
            timeStamp = self.extractTimeStamp(filePath)
            if timeStamp:
                timeStamps.append((timeStamp, filePath))
        if timeStamps:
            _, filePath = max(timeStamps)
            return filePath
        else:
            # the original fell through here without returning, yielding None
            return filePaths[0]

    def getArchiveContentName(self, filePath):
        # archives contain the file under a timestamped directory
        timeStamp = self.extractTimeStamp(filePath)
        return 'handedict-' + timeStamp + '/handedict.u8'

    def findFile(self, fileGlobs, fileType=None):
        """
        Tries to locate a file with a given list of possible file names under
        the classes default data paths.

        Uses the newest version of all files found.

        @type fileGlobs: str/list of str
        @param fileGlobs: possible file names
        @type fileType: str
        @param fileType: textual type of file used in error msg
        @rtype: str
        @return: path to file of first match in search for existing file
        @raise IOError: if no file found
        """
        import glob

        if type(fileGlobs) != type([]):
            fileGlobs = [fileGlobs]
        foundFiles = []
        for fileGlob in fileGlobs:
            for path in self.dataPath:
                globPath = os.path.join(os.path.expanduser(path), fileGlob)
                for filePath in glob.glob(globPath):
                    if os.path.exists(filePath):
                        fileName = os.path.basename(filePath)
                        foundFiles.append((fileName, filePath))

        if foundFiles:
            if hasattr(self, 'getPreferredFile'):
                return self.getPreferredFile([path for _, path in foundFiles])
            else:
                # fall back to lexicographically newest file name
                _, newestPath = max(foundFiles)
                return newestPath
        else:
            if fileType == None:
                fileType = "file"
            raise IOError("No " + fileType + " found for '" + self.PROVIDES \
                + "' under path(s)'" + "', '".join(self.dataPath) \
                + "' for file names '" + "', '".join(fileGlobs) + "'")
3266
class HanDeDictWordIndexBuilder(WordIndexBuilder):
    """
    Builds the word index of the HanDeDict dictionary.
    """
    # name of the table provided to the database
    PROVIDES = 'HanDeDict_Words'
    # the index is generated from the HanDeDict table
    DEPENDS = ['HanDeDict']
    TABLE_SOURCE = 'HanDeDict'
    # index by the traditional headword column
    HEADWORD_SOURCE = 'HeadwordTraditional'
3276
#}
#{ DatabaseBuilder

class DatabaseBuilder:
    """
    DatabaseBuilder provides the main class for building up a database for the
    cjklib package.

    It contains all L{TableBuilder} classes and a dependency graph to handle
    build requests.
    """
    def __init__(self, databaseSettings={}, dbConnectInst=None, dataPath=[],
        quiet=False, rebuildDepending=True, rebuildExisting=True, noFail=False,
        prefer=[], additionalBuilders=[]):
        """
        Constructs the DatabaseBuilder.

        @type databaseSettings: dict
        @param databaseSettings: dictionary holding the database options for the
            dbconnector module.
        @type dbConnectInst: instance
        @param dbConnectInst: instance of a L{DatabaseConnector}
        @type dataPath: list of str
        @param dataPath: optional list of paths to the data file(s)
        @type quiet: bool
        @param quiet: if true no status information will be printed to stderr
        @type rebuildDepending: bool
        @param rebuildDepending: if true existing tables that depend on updated
            tables will be dropped and built from scratch
        @type rebuildExisting: bool
        @param rebuildExisting: if true existing tables will be dropped and
            built from scratch
        @type noFail: bool
        @param noFail: if true build process won't terminate even if one table
            fails to build
        @type prefer: list
        @param prefer: list of L{TableBuilder} names to prefer in conflicting
            cases
        @type additionalBuilders: list of classobj
        @param additionalBuilders: list of externally provided TableBuilders
        @raise Exception: if two builders claim to provide the same table
        """
        if not dataPath:
            # default to the 'data' directory shipped with the package
            buildModule = __import__("cjklib.build")
            self.dataPath = [os.path.join(buildModule.__path__[0], 'data')]
        else:
            if type(dataPath) == type([]):
                self.dataPath = dataPath
            else:
                # wrap as list
                self.dataPath = [dataPath]
        self.quiet = quiet
        self.rebuildDepending = rebuildDepending
        self.rebuildExisting = rebuildExisting
        self.noFail = noFail
        # get connector to database
        if dbConnectInst:
            self.db = dbConnectInst
        else:
            self.db = dbconnector.DatabaseConnector.getDBConnector(
                databaseSettings)

        # get TableBuilder classes
        tableBuilderClasses = DatabaseBuilder.getTableBuilderClasses(
            set(prefer), quiet=self.quiet,
            additionalBuilders=additionalBuilders)

        # build lookup from table name to the one builder providing it;
        # conflicts were already resolved in getTableBuilderClasses(), so a
        # duplicate here is a programming error
        self.tableBuilderLookup = {}
        for tableBuilder in tableBuilderClasses.values():
            if self.tableBuilderLookup.has_key(tableBuilder.PROVIDES):
                raise Exception("Table '" + tableBuilder.PROVIDES \
                    + "' provided by several builders")
            self.tableBuilderLookup[tableBuilder.PROVIDES] = tableBuilder
3350
3351 - def setDataPath(self, dataPath):
3352 """ 3353 Changes the data path. 3354 3355 @type dataPath: list of str 3356 @param dataPath: list of paths to the data file(s) 3357 """ 3358 if type(dataPath) == type([]): 3359 self.dataPath = dataPath 3360 else: 3361 # wrap as list 3362 self.dataPath = [dataPath]
3363
    def build(self, tables):
        """
        Builds the given tables.

        @type tables: list
        @param tables: list of tables to build
        @raise UnsupportedError: if a given table is not provided by any
            builder
        @raise IOError: if a data file is missing and C{noFail} is not set
        """
        if type(tables) != type([]):
            tables = [tables]

        warn("Building database '%s'" % self.db.databaseUrl)

        # remove tables that don't need to be rebuilt
        filteredTables = []
        for table in tables:
            if table not in self.tableBuilderLookup:
                raise exception.UnsupportedError("Table '%s' not provided" \
                    % table)

            if self.needsRebuild(table):
                filteredTables.append(table)
            else:
                if not self.quiet:
                    warn("Skipping table '%s' because it already exists" \
                        % table)
        tables = filteredTables

        # get depending tables that need to be updated when dependencies change
        dependingTables = []
        if self.rebuildDepending:
            dependingTables = self.getRebuiltDependingTables(tables)
            if dependingTables:
                warn("Tables rebuilt because of dependencies updated: '" \
                    + "', '".join(dependingTables) + "'")
                tables.extend(dependingTables)

        # get table list according to dependencies
        buildDependentTables = self.getBuildDependentTables(tables)
        buildTables = set(tables) | buildDependentTables
        # get build order and remove tables we don't need to build
        builderClasses = self.getClassesInBuildOrder(buildTables)

        # build tables; each table is wrapped in its own transaction so a
        # failure leaves previously built tables intact
        if not self.quiet and self.rebuildExisting:
            warn("Rebuilding tables and overwriting old ones...")
        builderClasses.reverse()
        instancesUnrequestedTable = set()
        while builderClasses:
            builder = builderClasses.pop()
            # check first if the table will only be created for resolving
            # dependencies and note it down for deletion
            transaction = self.db.connection.begin()
            try:
                instance = builder(self.dataPath, self.db, self.quiet)
                # mark tables as deletable if its only provided because of
                # dependencies and the table doesn't exists yet
                if builder.PROVIDES in buildDependentTables \
                    and not self.db.engine.has_table(builder.PROVIDES):
                    instancesUnrequestedTable.add(instance)

                if self.db:
                    if self.db.engine.has_table(builder.PROVIDES):
                        if not self.quiet:
                            warn("Removing previously built table '" \
                                + builder.PROVIDES + "'")
                        instance.remove()
                else:
                    instance.remove()

                if not self.quiet:
                    warn("Building table '" + builder.PROVIDES \
                        + "' with builder '" + builder.__name__ + "'...")

                instance.build()
                transaction.commit()
            except IOError, e:
                transaction.rollback()
                # data not available, can't build table
                if self.noFail:
                    if not self.quiet:
                        warn("Building table '" + builder.PROVIDES \
                            + "' failed: '" + str(e) + "', skipping")
                    # drop every queued builder that (transitively) depends
                    # on the failed table
                    dependingTables = [builder.PROVIDES]
                    remainingBuilderClasses = []
                    for clss in builderClasses:
                        if set(clss.DEPENDS) & set(dependingTables):
                            # this class depends on one being removed
                            dependingTables.append(clss.PROVIDES)
                        else:
                            remainingBuilderClasses.append(clss)
                    if not self.quiet and len(dependingTables) > 1:
                        warn("Ignoring depending table(s) '" \
                            + "', '".join(dependingTables[1:]) + "'")
                    builderClasses = remainingBuilderClasses
                else:
                    raise
            except Exception, e:
                transaction.rollback()
                raise

        # remove tables that were only created as build dependencies
        if instancesUnrequestedTable:
            for instance in instancesUnrequestedTable:
                if not self.quiet:
                    warn("Removing table '" + instance.PROVIDES \
                        + "' as it was only created to solve build " \
                        + "dependencies")
                instance.remove()
3472
3473 - def remove(self, tables):
3474 """ 3475 Removes the given tables. 3476 3477 @type tables: list 3478 @param tables: list of tables to remove 3479 """ 3480 if type(tables) != type([]): 3481 tables = [tables] 3482 3483 tableBuilderClasses = [] 3484 for table in set(tables): 3485 if not self.tableBuilderLookup.has_key(table): 3486 raise exception.UnsupportedError("table '" + table \ 3487 + "' not provided") 3488 tableBuilderClasses.append(self.tableBuilderLookup[table]) 3489 3490 for builder in tableBuilderClasses: 3491 instance = builder(self.dataPath, self.db, self.quiet) 3492 if self.db: 3493 if self.db.engine.has_table(builder.PROVIDES): 3494 if not self.quiet: 3495 warn("Removing previously built table '" \ 3496 + builder.PROVIDES + "'") 3497 instance.remove() 3498 else: 3499 instance.remove()
3500
3501 - def needsRebuild(self, tableName):
3502 """ 3503 Returns true if either rebuild is turned on by default or we build into 3504 database and the table doesn't exist yet. 3505 3506 @type tableName: classobj 3507 @param tableName: L{TableBuilder} class 3508 @rtype: bool 3509 @return: True, if table needs to be rebuilt 3510 """ 3511 if self.rebuildExisting: 3512 return True 3513 else: 3514 return not self.db.engine.has_table(tableName)
3515
    def getBuildDependentTables(self, tableNames):
        """
        Gets the name of the tables that needs to be built to resolve
        dependencies.

        @type tableNames: list of str
        @param tableNames: list of tables to build
        @rtype: list of str
        @return: names of tables needed to resolve dependencies
        @raise UnsupportedError: if a dependency is not provided by any builder
        """
        def solveDependencyRecursive(table):
            """
            Gets all tables on which the given table depends and that need to be
            rebuilt. Also will mark tables skipped which won't be rebuilt.

            Uses parent's variables to store data.

            @type table: str
            @param table: table name for which to solve dependencies
            """
            if table in tableNames:
                # don't add dependant tables if they are given explicitly
                return
            if self.db and self.db.engine.has_table(table):
                # already present in the database, no rebuild needed
                skippedTables.add(table)
                return

            dependedTablesNames.add(table)

            # add dependent tables if needed (recursively)
            if not self.tableBuilderLookup.has_key(table):
                # either we have no builder or the builder was removed in
                # favour of another builder that shares at least one table
                # with the removed one
                raise exception.UnsupportedError("table '" + table \
                    + "' not provided, might be related to conflicting " \
                    + "builders")
            builderClass = self.tableBuilderLookup[table]
            for dependantTable in builderClass.DEPENDS:
                solveDependencyRecursive(dependantTable)

        # the closure above reads tableNames and mutates the two result sets
        tableNames = set(tableNames)
        dependedTablesNames = set()
        skippedTables = set()

        for table in tableNames:
            builderClass = self.tableBuilderLookup[table]
            for depededTable in builderClass.DEPENDS:
                solveDependencyRecursive(depededTable)

        if not self.quiet and skippedTables:
            warn("Newly built tables depend on table(s) '" \
                + "', '".join(skippedTables) \
                + "' but skipping because they already exist")
        return dependedTablesNames
3571
    def getDependingTables(self, tableNames):
        """
        Gets the name of the tables that depend on the given tables to be built
        and are not included in the given set.

        Dependencies depend on the choice of table builders and thus may vary.

        @type tableNames: list of str
        @param tableNames: list of tables
        @rtype: list of str
        @return: names of tables that depend on given tables
        """
        # work queue of tables whose dependants still need to be found
        dependencyTables = set(tableNames)
        dependingTablesNames = set()
        # candidate tables not yet known to depend on the given set
        residualTables = self.getCurrentSupportedTables() - dependencyTables

        while dependencyTables:
            dependencyTable = dependencyTables.pop()
            for table in residualTables:
                builderClass = self.tableBuilderLookup[table]
                if dependencyTable in builderClass.DEPENDS:
                    # found a table that depends on the given table
                    dependingTablesNames.add(table)
                    # queue for check of depending tables
                    dependencyTables.add(table)
            # no need for further testing on the newly found table
            residualTables = residualTables - dependencyTables

        return dependingTablesNames
3601
3602 - def getRebuiltDependingTables(self, tableNames):
3603 """ 3604 Gets the name of the tables that depend on the given tables to be built 3605 and already exist, thus need to be rebuilt. 3606 3607 @type tableNames: list of str 3608 @param tableNames: list of tables 3609 @rtype: list of str 3610 @return: names of tables that need to be rebuilt because of dependencies 3611 """ 3612 dependingTables = self.getDependingTables(tableNames) 3613 3614 needRebuild = set() 3615 for tableName in dependingTables: 3616 if self.db.engine.has_table(tableName): 3617 needRebuild.add(tableName) 3618 return needRebuild
3619
3620 - def getClassesInBuildOrder(self, tableNames):
3621 """ 3622 Gets the build order for the given table names. 3623 3624 @type tableNames: list of str 3625 @param tableNames: list of names of tables to build 3626 @rtype: list of classobj 3627 @return: L{TableBuilder}s in build order 3628 """ 3629 # get dependencies and save order 3630 tableBuilderClasses = [] 3631 for table in set(tableNames): 3632 if not self.tableBuilderLookup.has_key(table): 3633 # either we have no builder or the builder was removed in favour 3634 # of another builder that shares at least one table with the 3635 # removed one 3636 raise exception.UnsupportedError("table '" + table \ 3637 + "' not provided, might be related to conflicting " \ 3638 + "builders") 3639 tableBuilderClasses.append(self.tableBuilderLookup[table]) 3640 return self.getBuildDependencyOrder(tableBuilderClasses)
3641 3642 @staticmethod
3643 - def getBuildDependencyOrder(tableBuilderClasses):
3644 """ 3645 Create order in which the tables have to be created. 3646 3647 @type tableBuilderClasses: list of classobj 3648 @param tableBuilderClasses: list of L{TableBuilder} classes 3649 @rtype: list of classobj 3650 @return: the given classes ordered in build dependency order 3651 """ 3652 dependencyOrder = [] 3653 providedTables = [bc.PROVIDES for bc in tableBuilderClasses] 3654 includedTableNames = set() 3655 while tableBuilderClasses: 3656 for builderClass in tableBuilderClasses: 3657 if set(builderClass.DEPENDS).intersection(providedTables) \ 3658 <= includedTableNames: 3659 # found a terminal class or one whose dependencies are 3660 # already covered (at least no dependency on one of the 3661 # tables in the list) 3662 break 3663 else: 3664 # one dependency can not be fulfilled, might be that no 3665 # TableBuilder is implemented, that it was removed due to 3666 # conflicting other builder, or that a cycle in DEPEND graph 3667 # exists 3668 raise Exception("Unfulfillable depend request, " \ 3669 + "might be related to conflicting builders or cycle. " \ 3670 + "Builders included: '" \ 3671 + "', '".join([clss.__name__ for clss in dependencyOrder]) \ 3672 + "'. Builders with open depends: '" \ 3673 + "', '".join([builder.PROVIDES \ 3674 for builder in tableBuilderClasses]) + "'") 3675 dependencyOrder.append(builderClass) 3676 includedTableNames.add(builderClass.PROVIDES) 3677 tableBuilderClasses.remove(builderClass) 3678 return dependencyOrder
    @staticmethod
    def getTableBuilderClasses(preferClassSet=set(), resolveConflicts=True,
        quiet=True, additionalBuilders=[]):
        """
        Gets all classes in module that implement L{TableBuilder}.

        @type preferClassSet: set of str
        @param preferClassSet: set of L{TableBuilder} names to prefer in
            conflicting cases, resolveConflicting must be True to take effect
            (default)
        @type resolveConflicts: bool
        @param resolveConflicts: if true conflicting builders will be removed
            so that only one builder is left per Table.
        @type quiet: bool
        @param quiet: if true no status information will be printed to stderr
        @type additionalBuilders: list of classobj
        @param additionalBuilders: list of externally provided TableBuilders
        @rtype: dict
        @return: dictionary of all classes inheriting form L{TableBuilder} that
            provide a table (i.d. non abstract implementations), with its name
            as key
        @raise Exception: if more than one preferred builder claims the same
            table
        """
        tableBuilderClasses = {}
        buildModule = __import__("cjklib.build")
        # get all classes that inherit from TableBuilder; an empty PROVIDES
        # marks an abstract builder and is excluded
        tableBuilderClasses = dict([(clss.__name__, clss) \
            for clss in buildModule.build.__dict__.values() \
            if type(clss) == types.TypeType \
            and issubclass(clss, buildModule.build.TableBuilder) \
            and clss.PROVIDES])
        # add additionally provided
        tableBuilderClasses.update(dict([(clss.__name__, clss) \
            for clss in additionalBuilders]))

        # check for conflicting builders and keep only one per conflicting group
        # group builders first
        tableToBuilderMapping = {}
        for clssName, clss in tableBuilderClasses.iteritems():
            if clss.PROVIDES not in tableToBuilderMapping:
                tableToBuilderMapping[clss.PROVIDES] = set()

            tableToBuilderMapping[clss.PROVIDES].add(clssName)

        if resolveConflicts:
            # now check conflicting and choose preferred if given
            for tableName, builderClssSet in tableToBuilderMapping.items():
                preferredBuilders = builderClssSet & preferClassSet
                if preferredBuilders:
                    if len(preferredBuilders) > 1:
                        # the user specified more than one preferred table that
                        # both provided at least one same table
                        raise Exception("More than one TableBuilder " \
                            + "preferred for conflicting table.")
                    preferred = preferredBuilders.pop()
                    builderClssSet.remove(preferred)
                else:
                    # no preference given, pick an arbitrary builder
                    preferred = builderClssSet.pop()
                if not quiet and builderClssSet:
                    warn("Removing conflicting builder(s) '" \
                        + "', '".join(builderClssSet) + "' in favour of '" \
                        + preferred + "'")
                # remove other conflicting
                for clssName in builderClssSet:
                    del tableBuilderClasses[clssName]
        return tableBuilderClasses
3745 3746 @staticmethod
3747 - def getSupportedTables():
3748 """ 3749 Gets names of supported tables. 3750 3751 @rtype: list of str 3752 @return: names of tables 3753 """ 3754 classDict = DatabaseBuilder.getTableBuilderClasses( 3755 resolveConflicts=False) 3756 return set([clss.PROVIDES for clss in classDict.values()])
3757
3758 - def getCurrentSupportedTables(self):
3759 """ 3760 Gets names of tables supported by this instance of the database builder. 3761 3762 This list can have more entries then L{getSupportedTables()} as 3763 additional external builders can be supplied on instantiation. 3764 3765 @rtype: list of str 3766 @return: names of tables 3767 """ 3768 return set(self.tableBuilderLookup.keys())
3769
3770 - def isOptimizable(self):
3771 """ 3772 Checks if the current database supports optimization. 3773 3774 @rtype: boolean 3775 @return: True if optimizable, False otherwise 3776 """ 3777 return self.db.engine.name in ['sqlite']
3778
3779 - def optimize(self):
3780 """ 3781 Optimizes the current database. 3782 3783 @raise Exception: if database does not support optimization 3784 @raise OperationalError: if optimization failed 3785 """ 3786 if self.db.engine.name == 'sqlite': 3787 self.db.execute('VACUUM') 3788 else: 3789 raise Exception('Database does not seem to support optimization')
3790
#}
#{ Global methods

def warn(message):
    """
    Prints the given message to stderr with the system's default encoding.

    @type message: str
    @param message: message to print
    """
    # Encode explicitly so characters unrepresentable in the terminal's
    # preferred encoding are replaced instead of raising UnicodeEncodeError.
    encoded = message.encode(locale.getpreferredencoding(), 'replace')
    sys.stderr.write(encoded + '\n')
3803