1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 """
19 Provides the building methods for the cjklib package.
20
21 Each table that needs to be created has to be implemented by a L{TableBuilder}.
22 The L{DatabaseBuilder} is the central instance for managing the build process.
23 As the creation of a table can depend on other tables the DatabaseBuilder keeps
24 track of dependencies to process a build in the correct order.
25
26 Building is tested on the following storage methods:
27 - SQLite
28 - MySQL
29
30 Some L{TableBuilder} implementations aren't used by the CJK library but are
31 provided here for additional usage.
32
33 For MS Windows the default versions provided seem to be a "X{narrow build}"
34 and do not support characters outside the BMP (see e.g.
35 U{http://wordaligned.org/articles/narrow-python}). Currently no Unicode
36 characters outside the BMP will thus be supported on Windows platforms.
37
38 Examples
39 ========
40 The following examples should give a quick view into how to use this
41 package.
42 - Create the DatabaseBuilder object with default settings (read from
43 cjklib.conf or using 'cjklib.db' in same directory as default):
44
45 >>> from cjklib import build
46 >>> dbBuilder = build.DatabaseBuilder(dataPath=['./cjklib/data/'])
47 Removing conflicting builder(s) 'CharacterVariantBMPBuilder' in favour
48 of 'CharacterVariantBuilder'
49 Removing conflicting builder(s) 'SlimUnihanBuilder', 'UnihanBuilder',
50 'UnihanBMPBuilder' in favour of 'SlimUnihanBMPBuilder'
51 Removing conflicting builder(s) 'StrokeCountBuilder' in favour of
52 'CombinedStrokeCountBuilder'
53 Removing conflicting builder(s) 'CharacterResidualStrokeCountBuilder' in
54 favour of 'CombinedCharacterResidualStrokeCountBuilder'
55
56 - Build the table of Jyutping syllables from a csv file:
57
58 >>> dbBuilder.build(['JyutpingSyllables'])
59 building table 'JyutpingSyllables' with builder
60 'JyutpingSyllablesBuilder'...
61 Reading table definition from file './cjklib/data/jyutpingsyllables.sql'
62 Reading table 'JyutpingSyllables' from file
63 './cjklib/data/jyutpingsyllables.csv'
64
65 @todo Impl: Further character domains: BIG5 (Taiwan), kIRG_GSource (Unicode,
66 Simplified Chinese), kIRG_JSource (Unicode, Japanese), kIRG_KPSource and
67 kIRG_KSource (Unicode, Korean), kIRG_TSource (Unicode, Traditional Chinese),
68 kIRG_VSource (Unicode, Vietnamese)
69 @todo Fix: On interruption (Ctrl+C) remove tables that were only created
70 because of dependencies.
71 """
72
73 import types
74 import locale
75 import sys
76 import re
77 import os.path
78 import xml.sax
79 import csv
80
81 from sqlalchemy import Table, Column, Integer, String, Text, Index
82 from sqlalchemy import select, union
83 from sqlalchemy.sql import text, func
84 from sqlalchemy.sql import and_, or_, not_
85 import sqlalchemy
86
87 from cjklib import dbconnector
88 from cjklib import characterlookup
89 from cjklib import exception
94 """
95 TableBuilder provides the abstract layout for classes that build a distinct
96 table.
97 """
98 PROVIDES = ''
99 """Contains the name of the table provided by this module."""
100 DEPENDS = []
101 """Contains the names of the tables needed for the build process."""
102
def __init__(self, dataPath=None, dbConnectInst=None, quiet=False):
    """
    Constructs the TableBuilder.

    @type dataPath: list of str
    @param dataPath: optional list of paths to the data file(s)
    @type dbConnectInst: instance
    @param dbConnectInst: instance of a L{DatabaseConnector}. If not given
        all sql code will be printed to stdout.
    @type quiet: bool
    @param quiet: if true no status information will be printed to stderr
    """
    # keep a reference to the database connection and the build options
    self.db = dbConnectInst
    self.quiet = quiet
    self.dataPath = dataPath
118
120 """
121 Build the table provided by the TableBuilder.
122
123 Methods should raise an IOError if reading a data source fails. The
124 L{DatabaseBuilder} knows how to handle this case and is able to proceed.
125 """
126 pass
127
129 """
130 Removes the table provided by the TableBuilder from the database.
131 """
132 pass
133
def findFile(self, fileNames, fileType=None):
    """
    Tries to locate a file with a given list of possible file names under
    the classes default data paths.

    For each file name every given path is checked and the first match is
    returned.

    @type fileNames: str/list of str
    @param fileNames: possible file names
    @type fileType: str
    @param fileType: textual type of file used in error msg
    @rtype: str
    @return: path to file of first match in search for existing file
    @raise IOError: if no file found
    """
    # accept a single file name as shorthand for a one-element list
    if not isinstance(fileNames, list):
        fileNames = [fileNames]
    for fileName in fileNames:
        for path in self.dataPath:
            filePath = os.path.join(os.path.expanduser(path), fileName)
            if os.path.exists(filePath):
                return filePath
    if fileType is None:
        fileType = "file"
    raise IOError("No " + fileType + " found for '" + self.PROVIDES \
        + "' under path(s) '" + "', '".join(self.dataPath) \
        + "' for file names '" + "', '".join(fileNames) + "'")
162
def buildTableObject(self, tableName, columns, columnTypeMap=None,
    primaryKeys=None):
    """
    Returns a SQLAlchemy Table object.

    @type tableName: str
    @param tableName: name of table
    @type columns: list of str
    @param columns: column names
    @type columnTypeMap: dict of str and object
    @param columnTypeMap: mapping of column name to a SQLAlchemy column
        type; columns not listed default to C{Text()}
    @type primaryKeys: list of str
    @param primaryKeys: list of primary key columns
    @rtype: object
    @return: SQLAlchemy Table
    """
    # avoid mutable default arguments; both parameters are only read, so
    # behaviour is unchanged
    if columnTypeMap is None:
        columnTypeMap = {}
    if primaryKeys is None:
        primaryKeys = []

    table = Table(tableName, self.db.metadata)
    for column in columns:
        if column in columnTypeMap:
            type_ = columnTypeMap[column]
        else:
            # fall back to a generic text column and tell the user
            type_ = Text()
            warn("column %s has no type, assuming default 'Text()'" \
                % column)
        table.append_column(Column(column, type_,
            primary_key=(column in primaryKeys)))

    return table
189
191 """
192 Returns a list of SQLAlchemy Index objects for the given table.
193
194 @type tableName: str
195 @param tableName: name of table
196 @type indexKeyList: list of list of str
197 @param indexKeyList: a list of key combinations
198 @rtype: object
199 @return: list of SQLAlchemy Index objects
200 """
201 indexList = []
202 table = Table(tableName, self.db.metadata, autoload=True)
203 for indexKeyColumns in indexKeyList:
204 indexName = tableName + '__' + '_'.join(indexKeyColumns)
205 indexList.append(Index(indexName,
206 *[table.c[column] for column in indexKeyColumns]))
207
208 return indexList
209
210
211 -class EntryGeneratorBuilder(TableBuilder):
212 """
213 Implements an abstract class for building a table from a generator
214 providing entries.
215 """
216 COLUMNS = []
217 """Columns that will be built"""
218 PRIMARY_KEYS = []
219 """Primary keys of the created table"""
220 INDEX_KEYS = []
221 """Index keys (not unique) of the created table"""
222 COLUMN_TYPES = {}
223 """Column types for created table"""
224
def getGenerator(self):
    """
    Returns the entry generator.

    Abstract method; child classes need to supply an iterator over the
    entries that should be inserted into the table.
    """
    pass
231
def getEntryDict(self, generator):
    """
    Materialises the given generator into a list of entry dictionaries.

    Entries may either be yielded as dicts, which are taken as is, or as
    sequences, which are mapped to dicts using L{COLUMNS} as keys.

    @type generator: generator
    @param generator: generator of table entries
    @rtype: list of dict
    @return: table entries as column-to-value dictionaries
    """
    entryList = []

    # inspect the first entry to decide how to treat the whole stream;
    # next() works on Python 2.6+ and 3, unlike generator.next()
    firstEntry = next(generator)
    if isinstance(firstEntry, dict):
        entryList.append(firstEntry)

        for newEntry in generator:
            entryList.append(newEntry)
    else:
        firstEntryDict = dict([(column, firstEntry[i]) \
            for i, column in enumerate(self.COLUMNS)])
        entryList.append(firstEntryDict)

        for newEntry in generator:
            entryDict = dict([(column, newEntry[i]) \
                for i, column in enumerate(self.COLUMNS)])
            entryList.append(entryDict)

    return entryList
252
254
255 generator = self.getGenerator()
256
257
258 table = self.buildTableObject(self.PROVIDES, self.COLUMNS,
259 self.COLUMN_TYPES, self.PRIMARY_KEYS)
260 table.create()
261
262
263
264
265
266
267
268
269
270
271 for newEntry in generator:
272 try:
273 table.insert(newEntry).execute()
274 except sqlalchemy.exceptions.IntegrityError, e:
275 warn(unicode(e))
276 raise
277
278 for index in self.buildIndexObjects(self.PROVIDES, self.INDEX_KEYS):
279 index.create()
280
282
283 table = Table(self.PROVIDES, self.db.metadata)
284 table.drop()
285
288 """A simple generator for a given list of elements."""
290 """
291 Initialises the ListGenerator.
292
293 @type entryList: list of str
294 @param entryList: user defined entry
295 """
296 self.entryList = entryList
297
299 for entry in self.entryList:
300 yield entry
301
306 """
307 Regular expression matching one entry in the Unihan database
308 (e.g. C{U+8682 kMandarin MA3 MA1 MA4}).
309 """
310 keySet = None
311 """Set of keys of the Unihan table."""
312
def __init__(self, fileName, useKeys=None, quiet=False):
    """
    Constructs the UnihanGenerator.

    @type fileName: str
    @param fileName: path to the Unihan database file
    @type useKeys: list
    @param useKeys: if given only these keys will be read from the table,
        otherwise all keys will be returned
    @type quiet: bool
    @param quiet: if true no status information will be printed to stderr
    """
    # pattern is pure ASCII, so a plain raw string matches unicode lines
    # just as well as the former ur"" literal
    self.ENTRY_REGEX = re.compile(r"U\+([0-9A-F]+)\s+(\w+)\s+(.+)\s*$")
    self.fileName = fileName
    self.quiet = quiet
    if useKeys is not None:
        self.limitKeys = True
        self.keySet = set(useKeys)
    else:
        self.limitKeys = False
333
335 """
336 Iterates over the Unihan entries.
337
338 The character definition is converted to the character's representation,
339 all other data is given as is. These are merged into one entry for each
340 character.
341 """
342
343
344 handle = self.getHandle()
345 entryIndex = -1
346 entry = {}
347 for line in handle:
348
349 if line.startswith('#'):
350 continue
351 resultObj = self.ENTRY_REGEX.match(line)
352 if not resultObj:
353 if not self.quiet:
354 warn("can't read line from Unihan.txt: '" + line + "'")
355 continue
356 unicodeHexIndex, key, value = resultObj.group(1, 2, 3)
357
358
359
360 if self.limitKeys and not key in self.keySet:
361 continue
362
363 if entryIndex != unicodeHexIndex and entryIndex != -1:
364 try:
365
366 char = unichr(int(entryIndex, 16))
367 yield(char, entry)
368 except ValueError:
369
370 pass
371
372 entry = {}
373 entryIndex = unicodeHexIndex
374 entry[key] = value
375
376 if entry:
377 try:
378
379 char = unichr(int(entryIndex, 16))
380 yield(char, entry)
381 except ValueError:
382
383 pass
384 handle.close()
385
387 """
388 Returns a handle of the Unihan database file.
389
390 @rtype: file
391 @return: file handle of the Unihan file
392 """
393 import zipfile
394 if zipfile.is_zipfile(self.fileName):
395 import StringIO
396 z = zipfile.ZipFile(self.fileName, "r")
397 handle = StringIO.StringIO(z.read("Unihan.txt").decode('utf-8'))
398 else:
399 import codecs
400 handle = codecs.open(self.fileName, 'r', 'utf-8')
401 return handle
402
404 """
405 Returns all keys read for the Unihan table.
406
407 If the whole table is read a seek through the file is needed first to
408 find all keys, otherwise the predefined set is returned.
409 @rtype: list
410 @return: list of column names
411 """
412 if not self.keySet:
413 if not self.quiet:
414 warn("looking for all keys in Unihan database...")
415 self.keySet = set()
416 handle = self.getHandle()
417 for line in handle:
418
419 if line.startswith('#'):
420 continue
421 resultObj = self.ENTRY_REGEX.match(line)
422 if not resultObj:
423 continue
424
425 unicodeHexIndex, key, value = resultObj.group(1, 2, 3)
426 self.keySet.add(key)
427 handle.close()
428 return list(self.keySet)
429
432 """Builds the Unihan database from the Unihan file provided by Unicode."""
434 """Generates the entries of the Unihan table."""
435
436 - def __init__(self, unihanGenerator):
437 """
438 Initialises the EntryGenerator.
439
440 @type unihanGenerator: instance
441 @param unihanGenerator: a L{UnihanGenerator} instance
442 """
443 self.unihanGenerator = unihanGenerator
444
def generator(self):
    """
    Provides all data of one character per entry.

    Missing columns are filled with C{None} so that every yielded dict has
    the same set of keys.
    """
    columns = self.unihanGenerator.keys()
    for char, entryDict in self.unihanGenerator.generator():
        newEntryDict = {UnihanBuilder.CHARACTER_COLUMN: char}
        for column in columns:
            # dict.get replaces the deprecated has_key idiom
            newEntryDict[column] = entryDict.get(column, None)
        yield newEntryDict
456
457 PROVIDES = 'Unihan'
458 CHARACTER_COLUMN = 'ChineseCharacter'
459 """Name of column for Chinese character key."""
460 COLUMN_TYPES = {CHARACTER_COLUMN: String(1), 'kCantonese': Text(),
461 'kFrequency': Integer(), 'kHangul': Text(), 'kHanyuPinlu': Text(),
462 'kJapaneseKun': Text(), 'kJapaneseOn': Text(), 'kKorean': Text(),
463 'kMandarin': Text(), 'kRSJapanese': Text(), 'kRSKanWa': Text(),
464 'kRSKangXi': Text(), 'kRSKorean': Text(),
465 'kSimplifiedVariant': Text(), 'kTotalStrokes': Integer(),
466 'kTraditionalVariant': Text(), 'kVietnamese': Text(),
467 'kZVariant': Text()}
468 unihanGenerator = None
469
470 - def __init__(self, dataPath, dbConnectInst, quiet=False):
473
475 """
476 Returns the L{UnihanGenerator}. Constructs it if needed.
477
478 @rtype: instance
479 @return: instance of a L{UnihanGenerator}
480 """
481 if not self.unihanGenerator:
482 path = self.findFile(['Unihan.txt', 'Unihan.zip'],
483 "Unihan database file")
484 self.unihanGenerator = UnihanGenerator(path)
485 if not self.quiet:
486 warn("reading file '" + path + "'")
487 return self.unihanGenerator
488
492
499
502 """
503 Builds the Unihan database from the Unihan file provided by Unicode for
504 characters from the Basic Multilingual Plane (BMP) with code values between
505 U+0000 and U+FFFF.
506
507 MySQL < 6 doesn't support true UTF-8, and uses a Version with max 3 bytes:
508 U{http://dev.mysql.com/doc/refman/6.0/en/charset-unicode.html}
509 """
511
512 - def __init__(self, unihanGenerator):
513 """
514 Initialises the EntryGenerator.
515
516 @type unihanGenerator: instance
517 @param unihanGenerator: a L{UnihanGenerator} instance
518 """
519 gen = unihanGenerator.generator()
520 self.entryGen = UnihanBuilder.EntryGenerator(unihanGenerator)\
521 .generator()
522
def generator(self):
    """
    Yields only those entries whose character code point lies below
    U+20000.

    NOTE(review): the class documentation speaks of the BMP
    (U+0000-U+FFFF) while the filter actually admits code points up to
    U+1FFFF - confirm which limit is intended.
    """
    upperLimit = int('20000', 16)
    for entryDict in self.entryGen:
        character = entryDict[UnihanBuilder.CHARACTER_COLUMN]
        if ord(character) < upperLimit:
            yield entryDict
530
531 - def __init__(self, dataPath, dbConnectInst, quiet=False):
534
538
541 """
542 Builds a slim version of the Unihan database.
543
544 Keys imported into the database are specified in L{INCLUDE_KEYS}.
545 """
546 INCLUDE_KEYS = ['kCompatibilityVariant', 'kCantonese', 'kFrequency',
547 'kHangul', 'kHanyuPinlu', 'kJapaneseKun', 'kJapaneseOn', 'kMandarin',
548 'kRSJapanese', 'kRSKanWa', 'kRSKangXi', 'kRSKorean', 'kSemanticVariant',
549 'kSimplifiedVariant', 'kSpecializedSemanticVariant', 'kTotalStrokes',
550 'kTraditionalVariant', 'kVietnamese', 'kXHC1983', 'kZVariant',
551 'kIICore', 'kGB0']
552 """Keys for that data is read into the Unihan table in database."""
553
562
565 """
566 Builds a slim version of the Unihan database from the Unihan file provided
567 by Unicode for characters from the Basic Multilingual Plane (BMP) with code
568 values between U+0000 and U+FFFF.
569
570 MySQL < 6 doesn't support true UTF-8, and uses a Version with max 3 bytes:
571 U{http://dev.mysql.com/doc/refman/6.0/en/charset-unicode.html}
572
573 Keys imported into the database are specified in L{INCLUDE_KEYS}.
574 """
575
576 pass
577
580 """
581 Builds the Kanjidic database from the Kanjidic2 XML file
582 U{http://www.csse.monash.edu.au/~jwb/kanjidic2/}.
583 """
585 """Extracts a list of given tags."""
def __init__(self, entryList, tagDict):
    """
    Initialises the XMLHandler.

    @type entryList: list
    @param entryList: list the extracted entry dicts are appended to
    @type tagDict: dict
    @param tagDict: mapping of tag keys to target tag name and conversion
        function
    """
    self.entryList = entryList
    self.tagDict = tagDict
    # parser state: path of currently open elements and active target tag
    self.currentElement = []
    self.targetTag = None
    self.targetTagTopElement = None
593
595 assert(len(self.currentElement) > 0)
596 assert(self.currentElement[-1] == name)
597 self.currentElement.pop()
598
599 if name == self.targetTagTopElement:
600 self.targetTag = None
601 self.targetTagTopElement = None
602
603 if name == 'character':
604 entryDict = {}
605 for tag, func in self.tagDict.values():
606 if tag in self.currentEntry:
607 entryDict[tag] = func(self.currentEntry[tag])
608 self.entryList.append(entryDict)
609
611 if self.targetTag:
612 if self.targetTag not in self.currentEntry:
613 self.currentEntry[self.targetTag] = []
614 self.currentEntry[self.targetTag].append(content)
615
617 self.currentElement.append(name)
618 if name == 'character':
619 self.currentEntry = {}
620 else:
621 if 'character' in self.currentElement:
622 idx = self.currentElement.index('character') + 1
623 tagHierachy = tuple(self.currentElement[idx:])
624
625 key = (tagHierachy, frozenset(attrs.items()))
626 if key in self.tagDict:
627 self.targetTagTopElement = name
628 self.targetTag, _ = self.tagDict[key]
629
631 """Generates the KANJIDIC table."""
633 """
634 Initialises the KanjidicGenerator.
635
636 @type dataPath: list of str
637 @param dataPath: optional list of paths to the data file(s)
638 """
639 self.dataPath = dataPath
640 self.tagDict = tagDict
641
643 """
644 Returns a handle of the KANJIDIC database file.
645
646 @rtype: file
647 @return: file handle of the KANJIDIC file
648 """
649 import gzip
650 if self.dataPath.endswith('.gz'):
651 import StringIO
652 z = gzip.GzipFile(self.dataPath, 'r')
653 handle = StringIO.StringIO(z.read())
654 else:
655 import codecs
656 handle = codecs.open(self.dataPath, 'r')
657 return handle
658
660 """Provides a pronunciation and a path to the audio file."""
661 entryList = []
662 xmlHandler = Kanjidic2Builder.XMLHandler(entryList, self.tagDict)
663
664 saxparser = xml.sax.make_parser()
665 saxparser.setContentHandler(xmlHandler)
666
667
668 saxparser.parse(self.getHandle())
669
670 for entry in entryList:
671 yield(entry)
672
673 PROVIDES = 'Kanjidic'
674 CHARACTER_COLUMN = 'ChineseCharacter'
675 """Name of column for Chinese character key."""
676 COLUMN_TYPES = {CHARACTER_COLUMN: String(1), 'NelsonRadical': Integer(),
677 'CharacterJapaneseOn': Text(), 'CharacterJapaneseKun': Text()}
678 KANJIDIC_TAG_MAPPING = {
679 (('literal', ), frozenset()): ('ChineseCharacter', lambda x: x[0]),
680 (('radical', 'rad_value'),
681 frozenset([('rad_type', 'nelson_c')])): ('NelsonCRadical',
682 lambda x: int(x[0])),
683 (('radical', 'rad_value'),
684 frozenset([('rad_type', 'nelson_n')])): ('NelsonNRadical',
685 lambda x: int(x[0])),
686
687
688
689
690 (('reading_meaning', 'rmgroup', 'reading'),
691 frozenset([('r_type', 'ja_on')])): ('CharacterJapaneseOn',
692 lambda x: ','.join(x)),
693 (('reading_meaning', 'rmgroup', 'reading'),
694 frozenset([('r_type', 'ja_kun')])): ('CharacterJapaneseKun',
695 lambda x: ','.join(x)),
696
697
698
699 (('misc', 'rad_name'), frozenset()): ('RadicalName',
700 lambda x: ','.join(x)),
701 (('reading_meaning', 'rmgroup', 'meaning'), frozenset()): ('Meaning_en',
702 lambda x: '/'.join(x)),
703 (('reading_meaning', 'rmgroup', 'meaning'),
704 frozenset([('m_lang', 'fr')])): ('Meaning_fr',
705 lambda x: '/'.join(x)),
706 (('reading_meaning', 'rmgroup', 'meaning'),
707 frozenset([('m_lang', 'es')])): ('Meaning_es',
708 lambda x: '/'.join(x)),
709 (('reading_meaning', 'rmgroup', 'meaning'),
710 frozenset([('m_lang', 'pt')])): ('Meaning_pt',
711 lambda x: '/'.join(x)),
712 }
713 """
714 Dictionary of tag keys mapping to a table column including a function
715 generating a string out of a list of entries given from the KANJIDIC entry.
716 The tag keys constist of a tuple giving the xml element hierarchy below the
717 'character' element and a set of attribute value pairs.
718 """
719
720 - def __init__(self, dataPath, dbConnectInst, quiet=False):
725
727 """
728 Returns the L{KanjidicGenerator}.
729
730 @rtype: instance
731 @return: instance of a L{KanjidicGenerator}
732 """
733 path = self.findFile(['kanjidic2.xml.gz', 'kanjidic2.xml'],
734 "KANJIDIC2 XML file")
735 if not self.quiet:
736 warn("reading file '" + path + "'")
737 return Kanjidic2Builder.KanjidicGenerator(path,
738 self.KANJIDIC_TAG_MAPPING).generator()
739
742 """
743 Provides an abstract class for building a table with a relation between a
744 Chinese character and another column using the Unihan database.
745 """
746 DEPENDS=['Unihan']
747 COLUMN_SOURCE = None
748 """
749 Unihan table column providing content for the table. Needs to be overwritten
750 in subclass.
751 """
752 COLUMN_TARGET = None
753 """
754 Column name for new data in created table. Needs to be overwritten in
755 subclass.
756 """
757 COLUMN_TARGET_TYPE = Text()
758 """
759 Type of column for new data in created table.
760 """
761 GENERATOR_CLASS = None
762 """
763 Class defining the iterator for creating the table's data. The constructor
764 needs to take two parameters for the list of entries from the Unihan
765 database and the 'quiet' flag. Needs to be overwritten in subclass.
766 """
767
768 - def __init__(self, dataPath, dbConnectInst, quiet=False):
777
785
791
794 """
795 Builds a mapping between characters and their stroke count using the Unihan
796 data.
797 """
799 """Extracts the character stroke count mapping."""
801 """
802 Initialises the StrokeCountExtractor.
803
804 @type entries: list of tuple
805 @param entries: character entries from the Unihan database
806 @type quiet: bool
807 @param quiet: if true no status information will be printed
808 """
809 self.entries = entries
810 self.quiet = quiet
811
813 """Provides one entry per radical and character."""
814 for character, strokeCount in self.entries:
815 yield(character, strokeCount)
816
817 PROVIDES = 'UnihanStrokeCount'
818 COLUMN_SOURCE = 'kTotalStrokes'
819 COLUMN_TARGET = 'StrokeCount'
820 COLUMN_TARGET_TYPE = Integer()
821 GENERATOR_CLASS = StrokeCountExtractor
822
825 """
826 Provides an abstract class for building a character radical mapping table
827 using the Unihan database.
828 """
830 """Generates the radical to character mapping from the Unihan table."""
832 """
833 Initialises the RadicalExtractor.
834
835 @type rsEntries: list of tuple
836 @param rsEntries: character radical entries from the Unihan database
837 @type quiet: bool
838 @param quiet: if true no status information will be printed
839 """
840 self.RADICAL_REGEX = re.compile(ur"(\d+)\.(\d+)")
841 self.rsEntries = rsEntries
842 self.quiet = quiet
843
845 """Provides one entry per radical and character."""
846 for character, radicalStroke in self.rsEntries:
847 matchObj = self.RADICAL_REGEX.match(radicalStroke)
848 if matchObj:
849 radical = matchObj.group(1)
850 yield(character, radical)
851 elif not self.quiet:
852 warn("unable to read radical information of character '" \
853 + character + "': '" + radicalStroke + "'")
854
855 COLUMN_TARGET = 'RadicalIndex'
856 COLUMN_TARGET_TYPE = Integer()
857 GENERATOR_CLASS = RadicalExtractor
858
861 """
862 Builds the character Kangxi radical mapping table from the Unihan database.
863 """
864 PROVIDES = 'CharacterKangxiRadical'
865 COLUMN_SOURCE = 'kRSKangXi'
866
869 """
870 Builds the character Dai Kan-Wa jiten radical mapping table from the Unihan
871 database.
872 """
873 PROVIDES = 'CharacterKanWaRadical'
874 COLUMN_SOURCE = 'kRSKanWa'
875
878 """
879 Builds the character Japanese radical mapping table from the Unihan
880 database.
881 """
882 PROVIDES = 'CharacterJapaneseRadical'
883 COLUMN_SOURCE = 'kRSJapanese'
884
887 """
888 Builds the character Korean radical mapping table from the Unihan
889 database.
890 """
891 PROVIDES = 'CharacterKoreanRadical'
892 COLUMN_SOURCE = 'kRSKorean'
893
896 """
897 Builds a character variant mapping table from the Unihan database.
898 """
900 """Generates the character to variant mapping from the Unihan table."""
901
902
903 HEX_INDEX_REGEX = re.compile(ur"\s*U\+([0-9A-F]+)\s*$")
904 MULT_HEX_INDEX_REGEX = re.compile(ur"\s*(U\+([0-9A-F]+)( |(?=$)))+\s*$")
905 MULT_HEX_INDEX_FIND_REGEX = re.compile(ur"U\+([0-9A-F]+)(?: |(?=$))")
906 SEMANTIC_REGEX = re.compile(ur"(U\+[0-9A-F]+(<\S+)?( |(?=$)))+$")
907 SEMANTIC_FIND_REGEX = re.compile(ur"U\+([0-9A-F]+)(?:<\S+)?(?: |(?=$))")
908 ZVARIANT_REGEX = re.compile(ur"\s*U\+([0-9A-F]+)(?:\:\S+)?\s*$")
909
910 VARIANT_REGEX_MAPPING = {'C': (HEX_INDEX_REGEX, HEX_INDEX_REGEX),
911 'M': (SEMANTIC_REGEX, SEMANTIC_FIND_REGEX),
912 'S': (MULT_HEX_INDEX_REGEX, MULT_HEX_INDEX_FIND_REGEX),
913 'P': (SEMANTIC_REGEX, SEMANTIC_FIND_REGEX),
914 'T': (MULT_HEX_INDEX_REGEX, MULT_HEX_INDEX_FIND_REGEX),
915 'Z': (ZVARIANT_REGEX, ZVARIANT_REGEX)}
916 """
917 Mapping of entry types to regular expression describing the entry's
918 pattern.
919 """
920
def __init__(self, variantEntries, typeList, quiet=False):
    """
    Initialises the VariantGenerator.

    @type variantEntries: list of tuple
    @param variantEntries: character variant entries from the Unihan
        database
    @type typeList: list of str
    @param typeList: variant types in the order given in tableEntries
    @type quiet: bool
    @param quiet: if true no status information will be printed
    """
    self.quiet = quiet
    self.typeList = typeList
    self.variantEntries = variantEntries
936
938 """Provides one entry per variant and character."""
939 for entries in self.variantEntries:
940 character = entries[0]
941 for i, variantType in enumerate(self.typeList):
942 variantInfo = entries[i+1]
943 if variantInfo:
944
945 matchR, findR = self.VARIANT_REGEX_MAPPING[variantType]
946 if matchR.match(variantInfo):
947
948 variantIndices = findR.findall(variantInfo)
949 for unicodeHexIndex in variantIndices:
950 try:
951 variant = unichr(int(unicodeHexIndex, 16))
952 yield(character, variant, variantType)
953 except ValueError:
954
955
956 pass
957 elif not self.quiet:
958
959 warn('unable to read variant information of ' \
960 + "character '" + character + "' for type '" \
961 + variantType + "': '" + variantInfo + "'")
962
963 PROVIDES = 'CharacterVariant'
964 DEPENDS=['Unihan']
965
966 COLUMN_SOURCE_ABBREV = {'kCompatibilityVariant': 'C',
967 'kSemanticVariant': 'M', 'kSimplifiedVariant': 'S',
968 'kSpecializedSemanticVariant': 'P', 'kTraditionalVariant': 'T',
969 'kZVariant': 'Z'}
970 """
971 Unihan table columns providing content for the table together with their
972 abbreviation used in the target table.
973 """
974 COLUMN_TYPES = {'ChineseCharacter': String(1), 'Variant': String(1),
975 'Type': String(1)}
976
977 - def __init__(self, dataPath, dbConnectInst, quiet=False):
983
996
1002
1005 """
1006 Builds a character variant mapping table from the Unihan database for
1007 characters from the Basic Multilingual Plane (BMP) with code values between
1008 U+0000 and U+FFFF.
1009
1010 MySQL < 6 doesn't support true UTF-8, and uses a Version with max 3 bytes:
1011 U{http://dev.mysql.com/doc/refman/6.0/en/charset-unicode.html}
1012 """
1014
def __init__(self, variantEntries, typeList, quiet=False):
    """
    Initialises the BMPVariantGenerator.

    @type variantEntries: list of tuple
    @param variantEntries: character variant entries from the Unihan
        database
    @type typeList: list of str
    @param typeList: variant types in the order given in tableEntries
    @type quiet: bool
    @param quiet: if true no status information will be printed
    """
    # wrap the full variant generator; filtering happens in generator()
    fullGenerator = CharacterVariantBuilder.VariantGenerator(
        variantEntries, typeList, quiet)
    self.variantGen = fullGenerator.generator()
1029
1031 for character, variant, variantType in self.variantGen:
1032
1033
1034 if ord(variant) < int('20000', 16):
1035 yield(character, variant, variantType)
1036
1037 - def __init__(self, dataPath, dbConnectInst, quiet=False):
1040
1053
1056 """
1057 Builds a simple list of characters that belong to a specific class using the
1058 Unihan data.
1059 """
1060 DEPENDS=['Unihan']
1061
1062 - def __init__(self, dataPath, dbConnectInst, quiet=False):
1070
1079
1085
1088 u"""
1089 Builds a simple list of all characters in X{IICore}
1090 (Unicode I{International Ideograph Core)}.
1091 @see: Chinese Wikipedia on IICore:
1092 U{http://zh.wikipedia.org/wiki/國際表意文字核心}
1093 """
1094 PROVIDES = 'IICoreSet'
1095 COLUMN_SOURCE = 'kIICore'
1096
1099 """
1100 Builds a simple list of all characters in the Chinese standard X{GB2312-80}.
1101 """
1102 PROVIDES = 'GB2312Set'
1103 COLUMN_SOURCE = 'kGB0'
1104
1109 """
1110 Provides an abstract class for building a character reading mapping table
1111 using the Unihan database.
1112 """
1114 """Generates the reading entities from the Unihan table."""
1115 SPLIT_REGEX = re.compile(r"(\S+)")
1116
def __init__(self, readingEntries, quiet=False):
    """
    Initialises the ReadingSplitter.

    @type readingEntries: list of tuple
    @param readingEntries: character reading entries from the Unihan
        database
    @type quiet: bool
    @param quiet: if true no status information will be printed
    """
    self.quiet = quiet
    self.readingEntries = readingEntries
1129
1131 """Provides one entry per reading entity and character."""
1132 for character, readings in self.readingEntries:
1133 readingList = self.SPLIT_REGEX.findall(readings)
1134 if not self.quiet and len(set(readingList)) < len(readingList):
1135 warn('reading information of character ' + character \
1136 + ' is inconsistent: ' + ', '.join(readingList))
1137 for reading in set(readingList):
1138 yield(character, reading.lower())
1139
1140 COLUMN_TARGET = 'Reading'
1141 COLUMN_TARGET_TYPE = Text()
1142 GENERATOR_CLASS = SimpleReadingSplitter
1143 DEPENDS=['Unihan']
1144
1147 """
1148 Builds the character Pinyin mapping table from the Unihan database.
1149 """
1150 PROVIDES = 'CharacterUnihanPinyin'
1151 COLUMN_SOURCE = 'kMandarin'
1152
1155 """Builds the character Jyutping mapping table from the Unihan database."""
1156 PROVIDES = 'CharacterJyutping'
1157 COLUMN_SOURCE = 'kCantonese'
1158
1161 """Builds the character Kun'yomi mapping table from the Unihan database."""
1162 PROVIDES = 'CharacterJapaneseKun'
1163 COLUMN_SOURCE = 'kJapaneseKun'
1164
1167 """Builds the character On'yomi mapping table from the Unihan database."""
1168 PROVIDES = 'CharacterJapaneseOn'
1169 COLUMN_SOURCE = 'kJapaneseOn'
1170
1173 """Builds the character Hangul mapping table from the Unihan database."""
1174 PROVIDES = 'CharacterHangul'
1175 COLUMN_SOURCE = 'kHangul'
1176
1179 """
1180 Builds the character Vietnamese mapping table from the Unihan database.
1181 """
1182 PROVIDES = 'CharacterVietnamese'
1183 COLUMN_SOURCE = 'kVietnamese'
1184
1187 """
1188 Builds the Xiandai Hanyu Pinlu Cidian Pinyin mapping table using the Unihan
1189 database.
1190 """
1192 """
1193 Generates the Xiandai Hanyu Pinlu Cidian Pinyin syllables from the
1194 Unihan table.
1195 """
1196 SPLIT_REGEX = re.compile(ur"([a-zü]+[1-5])\([0-9]+\)")
1197
1198 GENERATOR_CLASS = XHPCReadingSplitter
1199
1200 PROVIDES = 'CharacterXHPCPinyin'
1201 COLUMN_SOURCE = 'kHanyuPinlu'
1202
1205 """
1206 Builds the Xiandai Hanyu Cidian Pinyin mapping table using the Unihan
1207 database.
1208 """
1210 """
1211 Generates the Xiandai Hanyu Cidian Pinyin syllables from the Unihan
1212 table.
1213 """
1214 SPLIT_REGEX = re.compile(r"[0-9,.*]+:(\S+)")
1215
1216 TONEMARK_VOWELS = [u'a', u'e', u'i', u'o', u'u', u'ü', u'n', u'm', u'r',
1217 u'ê']
1218
1219 TONEMARK_MAP = {u'\u0304': 1, u'\u0301': 2, u'\u030c': 3, u'\u0300': 4}
1220
def __init__(self, readingEntries, quiet=False):
    """
    Initialises the XHCReadingSplitter.

    @type readingEntries: list of tuple
    @param readingEntries: character reading entries from the Unihan
        database
    @type quiet: bool
    @param quiet: if true no status information will be printed
    """
    CharacterReadingBuilder.SimpleReadingSplitter.__init__(self,
        readingEntries, quiet)
    # character class matching any of the known combining tone marks
    toneMarks = ''.join(self.TONEMARK_MAP.keys())
    self._toneMarkRegex = re.compile(u'[' + toneMarks + ']')
1235
            """
            Converts the entity with diacritics into an entity with tone mark
            as appended number.

            @type entity: str
            @param entity: entity with tonal information
            @rtype: str
            @return: plain entity without tone mark but with the tone's index
                appended as a digit (starting with 1, 5 for the neutral tone)
            """
            import unicodedata
            # Decompose (NFD) so combining diacritical marks become separate
            # characters that _toneMarkRegex can match.
            entity = unicodedata.normalize("NFD", unicode(entity))

            matchObj = self._toneMarkRegex.search(entity)
            if matchObj:
                diacriticalMark = matchObj.group(0)
                tone = self.TONEMARK_MAP[diacriticalMark]
                # Strip the combining mark from the entity.
                plainEntity = entity.replace(diacriticalMark, '')
                # Recompose (NFC) and append the tone index.
                return unicodedata.normalize("NFC", plainEntity) + str(tone)
            else:
                # No diacritic found: treat as neutral (fifth) tone.
                return unicodedata.normalize("NFC", entity) + '5'
1262
            """Provides one entry per reading entity and character."""
            for character, readings in self.readingEntries:
                readingList = self.SPLIT_REGEX.findall(readings)
                # Duplicate readings for one character hint at inconsistent
                # source data; report them unless running quietly.
                if not self.quiet and len(set(readingList)) < len(readingList):
                    warn('reading information of character ' + character \
                        + ' is inconsistent: ' + ', '.join(readingList))
                for reading in set(readingList):
                    yield(character, self.convertTonemark(reading.lower()))
1272
1273 GENERATOR_CLASS = XHCReadingSplitter
1274
1275 PROVIDES = 'CharacterXHCPinyin'
1276 COLUMN_SOURCE = 'kXHC1983'
1277
1280 """
1281 Builds the character Pinyin mapping table from the several sources.
1282 """
1283 PROVIDES = 'CharacterPinyin'
1284 DEPENDS=['CharacterUnihanPinyin', 'CharacterXHPCPinyin',
1285 'CharacterXHCPinyin']
1286
1287 - def __init__(self, dataPath, dbConnectInst, quiet=False):
1296
1298
        selectQueries = []
        # One SELECT per source table from DEPENDS; all share the same
        # column layout (self.COLUMNS).
        for tableName in self.DEPENDS:
            table = self.db.tables[tableName]
            selectQueries.append(
                select([table.c[column] for column in self.COLUMNS]))

        # UNION the per-table results into one entry stream (removes
        # duplicates across sources).
        tableEntries = self.db.selectRows(union(*selectQueries))
        return ListGenerator(tableEntries).generator()
1307
1312 """
1313 Builds a table by loading its data from a list of comma separated values
1314 (CSV).
1315 """
1316 TABLE_CSV_FILE_MAPPING = ''
1317 """csv file path"""
1318 TABLE_DECLARATION_FILE_MAPPING = ''
1319 """file path containing SQL create table code."""
1320 INDEX_KEYS = []
1321 """Index keys (not unique) of the created table"""
1322
1329
1330 - def __init__(self, dataPath, dbConnectInst, quiet=False):
1332
1333
1334
1335 @staticmethod
1343
1344 @staticmethod
1346 for line in unicode_csv_data:
1347 yield line.encode('utf-8')
1348
1349 @staticmethod
1359
1360 return ByteStringDialect(dialect)
1361
        """
        Returns a csv reader object for a given file name.

        The file can start with the character '#' to mark comments. These will
        be ignored. The first line after the leading comments will be used to
        guess the csv file's format.

        @type fileHandle: file
        @param fileHandle: file handle of the CSV file
        @rtype: instance
        @return: CSV reader object returning one entry per line
        """
        def prependLineGenerator(line, data):
            """
            The first line read for guessing format has to be reinserted.
            """
            yield line
            for nextLine in data:
                yield nextLine

        # Skip leading comment lines; the first non-comment line is used to
        # sniff the CSV dialect below.
        line = '#'
        try:
            while line.strip().startswith('#'):
                line = fileHandle.next()
        except StopIteration:
            # File holds only comments (or nothing): fall back to a plain
            # reader on the exhausted handle.
            return csv.reader(fileHandle)
        try:
            self.fileDialect = csv.Sniffer().sniff(line, ['\t', ','])
        except csv.Error:
            # Sniffing failed; use the loader's default dialect.
            self.fileDialect = CSVFileLoader.DefaultDialect()

        # Reinsert the line consumed for sniffing before handing the stream
        # over to the csv reader.
        content = prependLineGenerator(line, fileHandle)

        return CSVFileLoader.unicode_csv_reader(content, self.fileDialect)
1397
1399 import locale
1400 import codecs
1401
1402 definitionFile = self.findFile([self.TABLE_DECLARATION_FILE_MAPPING],
1403 "SQL table definition file")
1404 contentFile = self.findFile([self.TABLE_CSV_FILE_MAPPING], "table")
1405
1406
1407 if not self.quiet:
1408 warn("Reading table definition from file '" + definitionFile + "'")
1409
1410 fileHandle = codecs.open(definitionFile, 'r', 'utf-8')
1411 createStatement = ''.join(fileHandle.readlines()).strip("\n")
1412
1413 self.db.execute(text(createStatement))
1414 table = Table(self.PROVIDES, self.db.metadata, autoload=True)
1415
1416
1417 if not self.quiet:
1418 warn("Reading table '" + self.PROVIDES + "' from file '" \
1419 + contentFile + "'")
1420 fileHandle = codecs.open(contentFile, 'r', 'utf-8')
1421
1422 entries = []
1423 for line in self.getCSVReader(fileHandle):
1424 if len(line) == 1 and not line[0].strip():
1425 continue
1426 entryDict = dict([(column.name, line[i]) \
1427 for i, column in enumerate(table.columns)])
1428 entries.append(entryDict)
1429
1430 try:
1431 self.db.execute(table.insert(), entries)
1432 except sqlalchemy.exceptions.IntegrityError, e:
1433 warn(unicode(e))
1434
1435 raise
1436
1437
1438 for index in self.buildIndexObjects(self.PROVIDES, self.INDEX_KEYS):
1439 index.create()
1440
1442
        # A bare (non-reflected) Table object is sufficient to issue DROP.
        table = Table(self.PROVIDES, self.db.metadata)
        table.drop()
1445
1455
1465
1475
1485
1495
1505
1515
1525
1535
1538 """
1539 Builds a mapping of Cantonese syllable in the Yale romanisation
1540 system to the syllables' initial, nucleus and coda.
1541 """
1542 PROVIDES = 'CantoneseYaleInitialNucleusCoda'
1543
1544 TABLE_CSV_FILE_MAPPING = 'cantoneseyaleinitialnucleuscoda.csv'
1545 TABLE_DECLARATION_FILE_MAPPING = 'cantoneseyaleinitialnucleuscoda.sql'
1546
1557
1567
1577
1588
1599
1610
1621
1631
1634 """
1635 Builds a mapping between Kangxi radical index and radical equivalent
1636 characters without radical form.
1637 """
1638 PROVIDES = 'KangxiRadicalIsolatedCharacter'
1639
1640 TABLE_CSV_FILE_MAPPING = 'kangxiradicalisolatedcharacter.csv'
1641 TABLE_DECLARATION_FILE_MAPPING = 'kangxiradicalisolatedcharacter.sql'
1642
1645 """
1646 Builds a mapping between I{Unicode radical forms} and
1647 I{Unicode radical variants} on one side and I{equivalent characters} on the
1648 other side.
1649 """
1650 PROVIDES = 'RadicalEquivalentCharacter'
1651
1652 TABLE_CSV_FILE_MAPPING = 'radicalequivalentcharacter.csv'
1653 TABLE_DECLARATION_FILE_MAPPING = 'radicalequivalentcharacter.sql'
1654
1664
1674
1685
1695
1698 """
1699 Builds a mapping of Mandarin Chinese syllable initials in Pinyin to Braille
1700 characters.
1701 """
1702 PROVIDES = 'PinyinBrailleInitialMapping'
1703
1704 TABLE_CSV_FILE_MAPPING = 'pinyinbrailleinitialmapping.csv'
1705 TABLE_DECLARATION_FILE_MAPPING = 'pinyinbrailleinitialmapping.sql'
1706
1717
1723 """
1724 Builds a list of glyph indices for characters.
1725 @todo Impl: Check if all Z-variants in LocaleCharacterVariant are included.
1726 @todo Bug: Forms with two variants in CharacterDecomposition are missing,
1727 e.g. ⾓.
1728 """
1729 PROVIDES = 'ZVariants'
1730 DEPENDS = ['CharacterDecomposition', 'StrokeOrder', 'Unihan']
1731
1732
1733 COLUMNS = ['ChineseCharacter', 'ZVariant']
1734 PRIMARY_KEYS = ['ChineseCharacter', 'ZVariant']
1735 INDEX_KEYS = [['ChineseCharacter']]
1736 COLUMN_TYPES = {'ChineseCharacter': String(1), 'ZVariant': Integer()}
1737
1738 - def __init__(self, dataPath, dbConnectInst, quiet=False):
1740
1742 decompositionTable = self.db.tables['CharacterDecomposition']
1743 strokeOrderTable = self.db.tables['CharacterDecomposition']
1744 unihanTable = self.db.tables['Unihan']
1745
1746 characterSet = set(self.db.selectRows(
1747 select([decompositionTable.c.ChineseCharacter,
1748 decompositionTable.c.ZVariant], distinct=True)))
1749 characterSet.update(self.db.selectRows(
1750 select([strokeOrderTable.c.ChineseCharacter,
1751 strokeOrderTable.c.ZVariant])))
1752
1753
1754
1755
1756 unihanCharacters = self.db.selectScalars(
1757 select([unihanTable.c.ChineseCharacter],
1758 or_(unihanTable.c.kTotalStrokes != None,
1759 unihanTable.c.kRSKangXi != None)))
1760 characterSet.update([(char, 0) for char in unihanCharacters])
1761
1762 return ListGenerator(characterSet).generator()
1763
1766 """
1767 Builds a mapping between characters and their stroke count.
1768 """
1770 """Generates the character stroke count mapping."""
1771 - def __init__(self, dbConnectInst, characterSet, quiet=False):
1772 """
1773 Initialises the StrokeCountGenerator.
1774
1775 @type dbConnectInst: instance
1776 @param dbConnectInst: instance of a L{DatabaseConnector}.
1777 @type characterSet: set
1778 @param characterSet: set of characters to generate the table for
1779 @type quiet: bool
1780 @param quiet: if true no status information will be printed to
1781 stderr
1782 """
1783 self.characterSet = characterSet
1784 self.quiet = quiet
1785 self.cjk = characterlookup.CharacterLookup(
1786 dbConnectInst=dbConnectInst)
1787
1788 self.cjk.hasStrokeCount = False
1789
            """Provides one entry per character, z-Variant and locale subset."""
            for char, zVariant in self.characterSet:
                try:
                    # Compute the stroke count from the character's
                    # decomposition / stroke order data.
                    strokeCount = self.cjk.getStrokeCount(char,
                        zVariant=zVariant)
                    yield {'ChineseCharacter': char, 'StrokeCount': strokeCount,
                        'ZVariant': zVariant}
                except exception.NoInformationError:
                    # No stroke count derivable for this glyph; skip it.
                    pass
                except IndexError:
                    if not self.quiet:
                        warn("malformed IDS for character '" + char \
                            + "'")
1806
1807 PROVIDES = 'StrokeCount'
1808 DEPENDS = ['CharacterDecomposition', 'StrokeOrder']
1809
1810 COLUMNS = ['ChineseCharacter', 'StrokeCount', 'ZVariant']
1811 PRIMARY_KEYS = ['ChineseCharacter', 'ZVariant']
1812 COLUMN_TYPES = {'ChineseCharacter': String(1), 'StrokeCount': Integer(),
1813 'ZVariant': Integer()}
1814
1815 - def __init__(self, dataPath, dbConnectInst, quiet=False):
1817
        # Generate entries for the union of characters found in the
        # decomposition and stroke order tables (both carry Z-variants).
        decompositionTable = self.db.tables['CharacterDecomposition']
        strokeOrderTable = self.db.tables['StrokeOrder']

        characterSet = set(self.db.selectRows(
            select([decompositionTable.c.ChineseCharacter,
                decompositionTable.c.ZVariant], distinct=True)))
        characterSet.update(self.db.selectRows(
            select([strokeOrderTable.c.ChineseCharacter,
                strokeOrderTable.c.ZVariant])))
        return StrokeCountBuilder.StrokeCountGenerator(self.db, characterSet,
            self.quiet).generator()
1830
1833 """
1834 Builds a mapping between characters and their stroke count. Includes stroke
1835 count data from the Unihan database to make up for missing data in own data
1836 files.
1837 """
1839 """Generates the character stroke count mapping."""
1840 - def __init__(self, dbConnectInst, characterSet, tableEntries,
1841 preferredBuilder, quiet=False):
1842 """
1843 Initialises the CombinedStrokeCountGenerator.
1844
1845 @type dbConnectInst: instance
1846 @param dbConnectInst: instance of a L{DatabaseConnector}.
1847 @type characterSet: set
1848 @param characterSet: set of characters to generate the table for
1849 @type tableEntries: list of list
1850 @param tableEntries: list of characters with Z-variant
1851 @type preferredBuilder: instance
1852 @param preferredBuilder: TableBuilder which forms are preferred over
1853 entries from the Unihan table
1854 @type quiet: bool
1855 @param quiet: if true no status information will be printed to
1856 stderr
1857 """
1858 self.characterSet = characterSet
1859 self.tableEntries = tableEntries
1860 self.preferredBuilder = preferredBuilder
1861 self.quiet = quiet
1862 self.cjk = characterlookup.CharacterLookup(
1863 dbConnectInst=dbConnectInst)
1864 self.db = dbConnectInst
1865
1866 - def getStrokeCount(self, char, zVariant, strokeCountDict,
1867 unihanStrokeCountDict, decompositionDict):
1868 """
1869 Gets the stroke count of the given character by summing up the
1870 stroke count of its components and using the Unihan table as
1871 fallback.
1872
1873 For the sake of consistency this method doesn't take the stroke
1874 count given by Unihan directly but sums up the stroke counts of the
1875 components to make sure the sum of component's stroke count will
1876 always give the characters stroke count. The result yielded will be
1877 in many cases even more precise than the value given in Unihan (not
1878 depending on the actual glyph form).
1879
1880 Once calculated the stroke count will be cached in the given
1881 strokeCountDict object.
1882
1883 @type char: str
1884 @param char: Chinese character
1885 @type zVariant: int
1886 @param zVariant: Z-variant of character
1887 @rtype: int
1888 @return: stroke count
1889 @raise ValueError: if stroke count is ambiguous due to inconsistent
1890 values wrt Unihan vs. own data.
1891 @raise NoInformationError: if decomposition is incomplete
1892 """
1893 if char == u'?':
1894
1895 raise exception.NoInformationError("incomplete decomposition")
1896
1897 if (char, zVariant) not in strokeCountDict:
1898 lastStrokeCount = None
1899 if (char, zVariant) in decompositionDict:
1900
1901
1902 for decomposition in decompositionDict[(char, zVariant)]:
1903 try:
1904 accumulatedStrokeCount = 0
1905
1906 for entry in decomposition:
1907 if type(entry) == types.TupleType:
1908 component, componentZVariant = entry
1909
1910 accumulatedStrokeCount += \
1911 self.getStrokeCount(component,
1912 componentZVariant, strokeCountDict,
1913 unihanStrokeCountDict,
1914 decompositionDict)
1915
1916 if lastStrokeCount != None \
1917 and lastStrokeCount != accumulatedStrokeCount:
1918
1919
1920 raise ValueError("ambiguous stroke count " \
1921 + "information, due to various stroke " \
1922 + "count sources for " \
1923 + repr((char, ZVariant)))
1924 else:
1925
1926 lastStrokeCount = accumulatedStrokeCount
1927
1928 except exception.NoInformationError:
1929 continue
1930
1931 if lastStrokeCount != None:
1932 strokeCountDict[(char, zVariant)] = lastStrokeCount
1933 else:
1934
1935
1936 if (char, 0) in strokeCountDict:
1937
1938 strokeCountDict[(char, zVariant)] \
1939 = strokeCountDict[(char, 0)]
1940
1941 elif char in unihanStrokeCountDict:
1942
1943 strokeCountDict[(char, zVariant)] \
1944 = unihanStrokeCountDict[char]
1945
1946 else:
1947 strokeCountDict[(char, zVariant)] = None
1948
1949 if strokeCountDict[(char, zVariant)] == None:
1950 raise exception.NoInformationError(
1951 "missing stroke count information")
1952 else:
1953 return strokeCountDict[(char, zVariant)]
1954
            """Provides one entry per character, z-Variant and locale subset."""
            # Yield entries from the preferred builder first and remember
            # their stroke counts for the fallback computation below.
            strokeCountDict = {}
            for entry in self.preferredBuilder:
                yield entry

                key = (entry['ChineseCharacter'], entry['ZVariant'])
                strokeCountDict[key] = entry['StrokeCount']

            # Unihan stroke counts serve as fallback for characters not
            # already covered by the preferred builder.
            unihanStrokeCountDict = {}
            for char, strokeCount in self.tableEntries:
                if (char, 0) not in strokeCountDict:
                    unihanStrokeCountDict[char] = strokeCount

            # Remove glyphs already handled above.
            self.characterSet.difference_update(strokeCountDict.keys())

            decompositionDict = self.cjk.getDecompositionEntriesDict()

            for char, zVariant in self.characterSet:
                warningZVariants = []
                try:
                    # Try deriving the stroke count from the components,
                    # consulting the Unihan fallback where needed.
                    strokeCount = self.getStrokeCount(char, zVariant,
                        strokeCountDict, unihanStrokeCountDict,
                        decompositionDict)

                    yield {'ChineseCharacter': char, 'ZVariant': zVariant,
                        'StrokeCount': strokeCount}
                except ValueError, e:
                    # Conflicting counts from different sources; collect for
                    # a summarising warning below.
                    warningZVariants.append(zVariant)
                except exception.NoInformationError:
                    pass

                if not self.quiet and warningZVariants:
                    warn("ambiguous stroke count information (mixed sources) " \
                        "for character '" + char + "' for Z-variant(s) '" \
                        + ''.join([str(z) for z in warningZVariants]) + "'")
2002
2003 DEPENDS = ['CharacterDecomposition', 'StrokeOrder', 'Unihan']
2004 COLUMN_SOURCE = 'kTotalStrokes'
2005
        decompositionTable = self.db.tables['CharacterDecomposition']
        strokeOrderTable = self.db.tables['StrokeOrder']
        unihanTable = self.db.tables['Unihan']

        characterSet = set(self.db.selectRows(
            select([decompositionTable.c.ChineseCharacter,
                decompositionTable.c.ZVariant], distinct=True)))
        characterSet.update(self.db.selectRows(
            select([strokeOrderTable.c.ChineseCharacter,
                strokeOrderTable.c.ZVariant])))
        # Entries built from own data files take precedence over Unihan.
        preferredBuilder = \
            CombinedStrokeCountBuilder.StrokeCountGenerator(self.db,
                characterSet, self.quiet).generator()

        # Unihan's total stroke counts serve as fallback data.
        tableEntries = self.db.selectRows(
            select([unihanTable.c.ChineseCharacter,
                unihanTable.c[self.COLUMN_SOURCE]],
                unihanTable.c[self.COLUMN_SOURCE] != None))

        # Characters sourced solely from Unihan only carry the default
        # Z-variant 0.
        characterSet.update([(char, 0) for char, totalCount in tableEntries])

        return CombinedStrokeCountBuilder.CombinedStrokeCountGenerator(self.db,
            characterSet, tableEntries, preferredBuilder, self.quiet)\
            .generator()
2034
2037 """
2038 Builds a mapping between characters and their components.
2039 """
2041 """Generates the component to character mapping."""
2042
2043 - def __init__(self, dbConnectInst, characterSet):
2044 """
2045 Initialises the CharacterComponentGenerator.
2046
2047 @type dbConnectInst: instance
2048 @param dbConnectInst: instance of a L{DatabaseConnector}
2049 @type characterSet: set
2050 @param characterSet: set of characters to generate the table for
2051 """
2052 self.characterSet = characterSet
2053 self.cjk = characterlookup.CharacterLookup(
2054 dbConnectInst=dbConnectInst)
2055
2056 - def getComponents(self, char, zVariant, decompositionDict,
2057 componentDict):
2058 """
2059 Gets all character components for the given glyph.
2060
2061 @type char: str
2062 @param char: Chinese character
2063 @type zVariant: int
2064 @param zVariant: Z-variant of character
2065 @rtype: set
2066 @return: all components of the character
2067 """
2068 if (char, zVariant) not in componentDict:
2069 componentDict[(char, zVariant)] = set()
2070
2071 if (char, zVariant) in decompositionDict:
2072 for decomposition in decompositionDict[(char, zVariant)]:
2073 componentDict[(char, zVariant)].update(
2074 [entry for entry in decomposition \
2075 if type(entry) == types.TupleType])
2076
2077 componentSet = set()
2078 for component, componentZVariant in componentDict[(char, zVariant)]:
2079 componentSet.add((component, componentZVariant))
2080
2081 componentSet.update(self.getComponents(component,
2082 componentZVariant, decompositionDict, componentDict))
2083
2084 return componentSet
2085
            """Provides the component entries."""
            decompositionDict = self.cjk.getDecompositionEntriesDict()
            # Shared memoisation cache for getComponents().
            componentDict = {}
            for char, zVariant in self.characterSet:
                for component, componentZVariant \
                    in self.getComponents(char, zVariant, decompositionDict,
                        componentDict):
                    yield {'ChineseCharacter': char, 'ZVariant': zVariant,
                        'Component': component,
                        'ComponentZVariant': componentZVariant}
2097
2098 PROVIDES = 'ComponentLookup'
2099 DEPENDS = ['CharacterDecomposition']
2100
2101 COLUMNS = ['ChineseCharacter', 'ZVariant', 'Component', 'ComponentZVariant']
2102 PRIMARY_KEYS = COLUMNS
2103 INDEX_KEYS = [['Component']]
2104 COLUMN_TYPES = {'ChineseCharacter': String(1), 'ZVariant': Integer(),
2105 'Component': String(1), 'ComponentZVariant': Integer()}
2106
2107 - def __init__(self, dataPath, dbConnectInst, quiet=False):
2110
2118
2121 """
2122 Builds a mapping between characters and their radical with stroke count of
2123 residual components.
2124
2125 This class can be extended by inheriting
2126 L{CharacterRadicalStrokeCountGenerator} and overwriting
2127 L{CharacterRadicalStrokeCountGenerator.getFormRadicalIndex()} to implement
2128 which forms should be regarded as radicals as well as
2129 L{CharacterRadicalStrokeCountGenerator.filterForms()} to filter entries
2130 before creation.
2131 """
2133 """Generates the character to radical/residual stroke count mapping."""
2134
2135 - def __init__(self, dbConnectInst, characterSet, quiet=False):
2136 """
2137 Initialises the CharacterRadicalStrokeCountGenerator.
2138
2139 @type dbConnectInst: instance
2140 @param dbConnectInst: instance of a L{DatabaseConnector}
2141 @type characterSet: set
2142 @param characterSet: set of characters to generate the table for
2143 @type quiet: bool
2144 @param quiet: if true no status information will be printed to
2145 stderr
2146 """
2147 self.characterSet = characterSet
2148 self.quiet = quiet
2149 self.cjk = characterlookup.CharacterLookup(
2150 dbConnectInst=dbConnectInst)
2151 self.radicalForms = None
2152
2174
2190
2191 - def getEntries(self, char, zVariant, strokeCountDict, decompositionDict,
2192 entriesDict):
2193 u"""
2194 Gets all radical/residual stroke count combinations from the given
2195 decomposition.
2196
2197 @rtype: list
2198 @return: all radical/residual stroke count combinations for the
2199 character
2200 @raise ValueError: if IDS is malformed or ambiguous residual stroke
2201 count is calculated
2202 @todo Fix: Remove validity check, only needed as long
2203 decomposition entries aren't checked against stroke order
2204 entries.
2205 """
2206 def getCharLayout(mainCharacterLayout, mainLayoutPosition,
2207 subCharLayout, subLayoutPosition):
2208 u"""
2209 Returns the character layout for the radical form within the
2210 component with layout subCharLayout itself belonging to a parent
2211 char with layout mainCharacterLayout.
2212 E.g. 鸺 can be decomposed into ⿰休鸟 and 休 can be furthermore
2213 decomposed into ⿰亻木. 亻 is found in a lower layer of
2214 decomposition, but as the structure of 休 and 鸺 are the same,
2215 and 亻 is on the left side of 休 which is on the left side of 鸺
2216 one can deduce 亻 as being on the utmost left side of 鸺. Thus
2217 (⿰, 0) would be returned.
2218 """
2219 specialReturn = {
2220 (u'⿰', 0, u'⿰', 0): (u'⿰', 0),
2221 (u'⿰', 1, u'⿰', 1): (u'⿰', 1),
2222 (u'⿱', 0, u'⿱', 0): (u'⿱', 0),
2223 (u'⿱', 1, u'⿱', 1): (u'⿱', 1),
2224 (u'⿲', 0, u'⿲', 0): (u'⿰', 0),
2225 (u'⿲', 2, u'⿲', 2): (u'⿰', 1),
2226 (u'⿳', 0, u'⿳', 0): (u'⿱', 0),
2227 (u'⿳', 2, u'⿳', 2): (u'⿱', 0),
2228 (u'⿲', 0, u'⿰', 0): (u'⿰', 0),
2229 (u'⿲', 2, u'⿰', 1): (u'⿰', 1),
2230 (u'⿰', 0, u'⿲', 0): (u'⿰', 0),
2231 (u'⿰', 1, u'⿲', 1): (u'⿰', 1),
2232 (u'⿳', 0, u'⿱', 0): (u'⿱', 0),
2233 (u'⿳', 2, u'⿱', 1): (u'⿱', 1),
2234 (u'⿱', 0, u'⿳', 0): (u'⿱', 0),
2235 (u'⿱', 1, u'⿳', 2): (u'⿱', 1),
2236 }
2237 entry = (mainCharacterLayout, mainLayoutPosition, subCharLayout,
2238 subLayoutPosition)
2239 if entry in specialReturn:
2240 return specialReturn[entry]
2241 elif subCharLayout == u'⿻':
2242
2243 return (u'⿻', 0)
2244 elif mainCharacterLayout == None:
2245
2246 return subCharLayout, subLayoutPosition
2247 else:
2248
2249 return (u'⿻', 0)
2250
2251
2252 if (char, zVariant) not in decompositionDict:
2253 return []
2254
2255 if (char, zVariant) not in entriesDict:
2256 entriesDict[(char, zVariant)] = set()
2257
2258 for decomposition in decompositionDict[(char, zVariant)]:
2259 componentRadicalForms = []
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274 layoutStack = [(None, None)]
2275
2276 for entry in decomposition:
2277 try:
2278 layout, position = layoutStack.pop()
2279 except IndexError:
2280 raise ValueError("malformed IDS for character '" \
2281 + mainChar + "'")
2282
2283 if type(entry) != types.TupleType:
2284
2285
2286
2287
2288 if self.cjk.isTrinaryIDSOperator(entry):
2289 posRange = [2, 1, 0]
2290 else:
2291 posRange = [1, 0]
2292
2293 for componentPos in posRange:
2294
2295
2296 layoutStack.append(getCharLayout(layout,
2297 position, entry, componentPos))
2298 else:
2299
2300 componentChar, componentZVariant = entry
2301
2302
2303 radicalIndex \
2304 = self.getFormRadicalIndex(componentChar)
2305 if radicalIndex != None:
2306
2307
2308
2309 componentRadicalForms.append(
2310 {'Component': entry,
2311 'Form': componentChar,
2312 'Z-variant': componentZVariant,
2313 'ResidualStrokeCount': 0,
2314 'CharacterLayout': layout,
2315 'RadicalIndex': radicalIndex,
2316 'RadicalPosition': position})
2317
2318
2319
2320 for radicalEntry in self.getEntries(componentChar,
2321 componentZVariant, strokeCountDict,
2322 decompositionDict, entriesDict):
2323
2324
2325 charLayout, charPosition = getCharLayout(layout,
2326 position, radicalEntry['CharacterLayout'],
2327 radicalEntry['RadicalPosition'])
2328 componentEntry = radicalEntry.copy()
2329 componentEntry['Component'] = entry
2330 componentEntry['CharacterLayout'] = charLayout
2331 componentEntry['RadicalPosition'] = charPosition
2332 componentRadicalForms.append(componentEntry)
2333
2334
2335 residualCharacters = {}
2336 charactersSeen = []
2337 for entry in decomposition:
2338
2339 if type(entry) == types.TupleType:
2340
2341 for seenEntry in residualCharacters:
2342 residualCharacters[seenEntry].append(entry)
2343
2344
2345 residualCharacters[entry] = charactersSeen[:]
2346
2347 charactersSeen.append(entry)
2348
2349
2350 for componentEntry in componentRadicalForms:
2351
2352
2353
2354 for entry in \
2355 residualCharacters[componentEntry['Component']]:
2356
2357 if entry not in strokeCountDict:
2358 break
2359
2360 componentEntry['ResidualStrokeCount'] \
2361 += strokeCountDict[entry]
2362 else:
2363
2364 del componentEntry['Component']
2365 entriesDict[(char, zVariant)].add(
2366 frozenset(componentEntry.items()))
2367
2368
2369
2370 seenEntriesDict = {}
2371 for entry in [dict(d) for d in entriesDict[(char, zVariant)]]:
2372 keyEntry = (entry['Form'], entry['Z-variant'],
2373 entry['CharacterLayout'], entry['RadicalIndex'],
2374 entry['RadicalPosition'])
2375 if keyEntry in seenEntriesDict \
2376 and seenEntriesDict[keyEntry] \
2377 != entry['ResidualStrokeCount']:
2378 raise ValueError("ambiguous residual stroke count for " \
2379 + "character '" + mainChar + "' with entry '" \
2380 + "', '".join(list([unicode(column) \
2381 for column in keyEntry])) \
2382 + "': '" + str(seenEntriesDict[keyEntry]) + "'/'" \
2383 + str(entry['ResidualStrokeCount']) + "'")
2384 seenEntriesDict[keyEntry] = entry['ResidualStrokeCount']
2385
2386
2387 return self.filterForms(
2388 [dict(d) for d in entriesDict[(char, zVariant)]])
2389
            """Provides the radical/stroke count entries."""
            strokeCountDict = self.cjk.getStrokeCountDict()
            decompositionDict = self.cjk.getDecompositionEntriesDict()
            # Shared memoisation cache for getEntries().
            entryDict = {}

            for char, zVariant in self.characterSet:
                if self.cjk.isRadicalChar(char):
                    # Radical characters themselves are not processed.
                    continue

                for entry in self.getEntries(char, zVariant, strokeCountDict,
                    decompositionDict, entryDict):

                    yield [char, zVariant, entry['RadicalIndex'], entry['Form'],
                        entry['Z-variant'], entry['CharacterLayout'],
                        entry['RadicalPosition'], entry['ResidualStrokeCount']]
2407
2408 PROVIDES = 'CharacterRadicalResidualStrokeCount'
2409 DEPENDS = ['CharacterDecomposition', 'StrokeCount', 'KangxiRadical',
2410 'KangxiRadicalIsolatedCharacter', 'RadicalEquivalentCharacter',
2411 'CharacterKangxiRadical']
2412
2413 COLUMNS = ['ChineseCharacter', 'ZVariant', 'RadicalIndex', 'RadicalForm',
2414 'RadicalZVariant', 'MainCharacterLayout', 'RadicalRelativePosition',
2415 'ResidualStrokeCount']
2416 PRIMARY_KEYS = ['ChineseCharacter', 'ZVariant', 'RadicalForm',
2417 'RadicalZVariant', 'MainCharacterLayout', 'RadicalRelativePosition']
2418 COLUMN_TYPES = {'ChineseCharacter': String(1), 'RadicalIndex': Integer(),
2419 'RadicalForm': String(1), 'ZVariant': Integer(),
2420 'RadicalZVariant': Integer(), 'MainCharacterLayout': String(1),
2421 'RadicalRelativePosition': Integer(), 'ResidualStrokeCount': Integer()}
2422
2423 - def __init__(self, dataPath, dbConnectInst, quiet=False):
2426
2436
2439 """
2440 Builds a mapping between characters and their residual stroke count when
2441 splitting of the radical form. This is stripped off information gathered
2442 from table C{CharacterRadicalStrokeCount}.
2443 """
2445 """
2446 Generates the character to residual stroke count mapping from the
2447 C{CharacterRadicalResidualStrokeCount} table.
2448 """
2450 """
2451 Initialises the ResidualStrokeCountExtractor.
2452
2453 @type dbConnectInst: instance
2454 @param dbConnectInst: instance of a L{DatabaseConnector}
2455 @type characterSet: set
2456 @param characterSet: set of characters to generate the table for
2457 """
2458 self.characterSet = characterSet
2459 self.cjk = characterlookup.CharacterLookup(
2460 dbConnectInst=dbConnectInst)
2461
2463 u"""
2464 Gets a list of radical residual entries. For multiple radical
2465 occurrences (e.g. 伦) only returns the residual stroke count for the
2466 "main" radical form.
2467
2468 @type char: str
2469 @param char: Chinese character
2470 @type zVariant: int
2471 @param zVariant: I{Z-variant} of given character
2472 @rtype: list of tuple
2473 @return: list of residual stroke count entries
2474 @todo Lang: Implement, find a good algorithm to turn down unwanted
2475 forms, don't just choose random one. See the following list::
2476
2477 >>> from cjklib import characterlookup
2478 >>> cjk = characterlookup.CharacterLookup()
2479 >>> for char in cjk.db.selectSoleValue('CharacterRadicalResidualStrokeCount',
2480 ... 'ChineseCharacter', distinctValues=True):
2481 ... try:
2482 ... entries = cjk.getCharacterKangxiRadicalResidualStrokeCount(char, 'C')
2483 ... lastEntry = entries[0]
2484 ... for entry in entries[1:]:
2485 ... # print if diff. radical forms and diff. residual stroke count
2486 ... if lastEntry[0] != entry[0] and lastEntry[2] != entry[2]:
2487 ... print char
2488 ... break
2489 ... lastEntry = entry
2490 ... except:
2491 ... pass
2492 ...
2493 渌
2494 犾
2495 玺
2496 珏
2497 缧
2498 >>> cjk.getCharacterKangxiRadicalResidualStrokeCount(u'缧')
2499 [(u'\u7cf8', 0, u'\u2ffb', 0, 8), (u'\u7e9f', 0, u'\u2ff0', 0, 11)]
2500 """
2501
2502
2503 filteredEntries = []
2504 for radicalIdx in radicalDict[(char, zVariant)]:
2505 _, _, _, _, residualStrokeCount \
2506 = radicalDict[(char, zVariant)][radicalIdx][0]
2507 filteredEntries.append((radicalIdx, residualStrokeCount))
2508
2509 return filteredEntries
2510
2512 """Provides one entry per character, z-Variant and locale subset."""
2513 radicalDict = self.cjk.getCharacterRadicalResidualStrokeCountDict()
2514 for char, zVariant in self.characterSet:
2515 for radicalIndex, residualStrokeCount in self.getEntries(char,
2516 zVariant, radicalDict):
2517 yield [char, zVariant, radicalIndex, residualStrokeCount]
2518
2519 PROVIDES = 'CharacterResidualStrokeCount'
2520 DEPENDS = ['CharacterRadicalResidualStrokeCount']
2521
2522 COLUMNS = ['ChineseCharacter', 'ZVariant', 'RadicalIndex',
2523 'ResidualStrokeCount']
2524 PRIMARY_KEYS = ['ChineseCharacter', 'ZVariant', 'RadicalIndex']
2525 INDEX_KEYS = [['RadicalIndex']]
2526 COLUMN_TYPES = {'ChineseCharacter': String(1), 'RadicalIndex': Integer(),
2527 'ZVariant': Integer(), 'ResidualStrokeCount': Integer()}
2528
2529 - def __init__(self, dataPath, dbConnectInst, quiet=False):
2532
2540
2544 """
2545 Builds a mapping between characters and their residual stroke count when
2546 splitting of the radical form. Includes stroke count data from the Unihan
2547 database to make up for missing data in own data files.
2548 """
2550 """
2551 Generates the character to residual stroke count mapping.
2552 """
2554 """
2555 Initialises the CombinedResidualStrokeCountExtractor.
2556
2557 @type tableEntries: list of list
2558 @param tableEntries: list of characters with Z-variant
2559 @type preferredBuilder: instance
2560 @param preferredBuilder: TableBuilder which forms are preferred over
2561 entries from the Unihan table
2562 @type quiet: bool
2563 @param quiet: if true no status information will be printed
2564 """
2565 self.RADICAL_REGEX = re.compile(ur"(\d+)\.(\d+)")
2566 self.tableEntries = tableEntries
2567 self.preferredBuilder = preferredBuilder
2568 self.quiet = quiet
2569
2571 """Provides one entry per character and z-Variant."""
2572
2573 seenCharactersSet = set()
2574 for entry in self.preferredBuilder:
2575 yield entry
2576 char = entry[0]
2577 radicalIdx = entry[2]
2578 seenCharactersSet.add((char, radicalIdx))
2579
2580
2581 for char, radicalStroke in self.tableEntries:
2582 matchObj = self.RADICAL_REGEX.match(radicalStroke)
2583 if matchObj:
2584 try:
2585 radicalIndex = int(matchObj.group(1))
2586 residualStrokeCount = int(matchObj.group(2))
2587 if (char, radicalIndex) not in seenCharactersSet:
2588 yield [char, 0, radicalIndex, residualStrokeCount]
2589 except ValueError:
2590 if not self.quiet:
2591 warn("unable to read radical information of " \
2592 + "character '" + character + "': '" \
2593 + radicalStroke + "'")
2594 elif not self.quiet:
2595 warn("unable to read radical information of character '" \
2596 + character + "': '" + radicalStroke + "'")
2597
# built after the hand-edited table and the raw Unihan data it supplements
DEPENDS = ['CharacterRadicalResidualStrokeCount', 'Unihan']
# Unihan column holding "<radical>.<residual strokes>" (KangXi radicals)
COLUMN_SOURCE = 'kRSKangXi'
2600
2618
2979
2982 """
2983 Builds a translation word index for a given dictionary.
2984
Searching for a word will return a headword and reading. This makes it
possible to find several dictionary entries with the same headword and
reading, with only one of them including the translation word.
2988
2989 @todo Fix: Word regex is specialised for HanDeDict.
2990 @todo Fix: Using a row_id for joining instead of Headword(Traditional) and
2991 Reading would maybe speed up table joins. Needs a workaround to include
2992 multiple rows for one actual headword entry though.
2993 """
2995 """Generates words for a list of dictionary entries."""
2996
def __init__(self, entries):
    """
    Initialises the WordEntryGenerator.

    @type entries: list of tuple
    @param entries: a list of headword and its translation
    """
    self.entries = entries
    # Matches one translation word: parenthesised remarks and example
    # phrases ("; Bsp.: ...--...") are consumed without capturing, while
    # everything else up to a delimiter is captured as a word.
    # @todo Fix: this regex is specialised for HanDeDict.
    self.wordRegex = re.compile(
        r'\([^\)]+\)|'
        r'(?:; Bsp.: [^/]+?--[^/]+)|'
        r'([^/,\(\)\[\]\!\?]+)')
3009
def generator(self):
    """
    Provides all data of one word per entry.

    Yields one dict {'Headword': ..., 'Reading': ..., 'Word': ...} per
    translation word; every (headword, reading, word) triple is yielded
    at most once.
    """
    seenWordEntries = set()

    for headword, reading, translation in self.entries:
        for word in self.wordRegex.findall(translation):
            # get rid of surrounding whitespace, normalise case
            word = word.strip().lower()
            if not word:
                continue
            if (headword, reading, word) not in seenWordEntries:
                seenWordEntries.add((headword, reading, word))
                # fixed: yield a fresh dict per row; the previous
                # implementation reused and mutated a single dict, so
                # consumers holding earlier rows saw them overwritten
                yield {'Headword': headword, 'Reading': reading,
                    'Word': word}
3028
# index table layout: one row per translation word of a dictionary entry
COLUMNS = ['Headword', 'Reading', 'Word']
COLUMN_TYPES = {'Headword': String(255), 'Reading': String(255),
    'Word': String(255)}
# look-ups run against the translation word
INDEX_KEYS = [['Word']]

# name of the dictionary table to index; set by concrete subclasses
TABLE_SOURCE = None
"""Dictionary source"""
# column of TABLE_SOURCE holding the headword
HEADWORD_SOURCE = 'Headword'
"""Source of headword"""
3038
3039 - def __init__(self, dataPath, dbConnectInst, quiet=False):
3041
3048
3058
3061 """
3062 Builds the word index of the EDICT dictionary.
3063 """
3064 PROVIDES = 'EDICT_Words'
3065 DEPENDS = ['EDICT']
3066 TABLE_SOURCE = 'EDICT'
3067
3090
3093 """
3094 Builds the CEDICT dictionary.
3095 """
3097 """
3098 Converts the C{'u:'} to C{'ü'}.
3099
3100 @type entry: tuple
3101 @param entry: a dictionary entry
3102 @rtype: tuple
3103 @return: the given entry with corrected ü-voul
3104 """
3105 if type(entry) == type({}):
3106 entry['Reading'] = entry['Reading'].replace('u:', u'ü')
3107 return entry
3108 else:
3109 trad, simp, reading, translation = entry
3110 reading = reading.replace('u:', u'ü')
3111 return [trad, simp, reading, translation]
3112
PROVIDES = 'CEDICT'
# accepted download names, newest distribution format first
FILE_NAMES = ['cedict_1_0_ts_utf-8_mdbg.zip',
    'cedict_1_0_ts_utf-8_mdbg.txt.gz', 'cedictu8.zip', 'cedict_ts.u8',
    'cedict_1_0_ts_utf-8_mdbg.txt']
ENCODING = 'utf-8'
# normalise 'u:' to 'ü' in readings while importing
FILTER = filterUmlaut
3119
def getArchiveContentName(self, filePath):
    """Returns the name of the dictionary file inside the archive."""
    # the MDBG archives always ship the dictionary under this fixed name
    return 'cedict_ts.u8'
3122
3132
3135 """
3136 Builds the CEDICT-GR dictionary.
3137 """
3138 PROVIDES = 'CEDICTGR'
3139 FILE_NAMES = ['cedictgr.zip', 'cedictgr.b5']
3140 ENCODING = 'big5hkscs'
3141
def getArchiveContentName(self, filePath):
    """Returns the name of the dictionary file inside the archive."""
    # fixed file name within the cedictgr.zip distribution
    return 'cedictgr.b5'
3144
3154
3157 """
3158 Builds the HanDeDict dictionary.
3159 """
3161 """
3162 Converts wrong spacing in readings of entries in HanDeDict.
3163
3164 @type entry: tuple
3165 @param entry: a dictionary entry
3166 @rtype: tuple
3167 @return: the given entry with corrected spacing
3168 """
3169 if type(entry) == type({}):
3170 headword = entry['HeadwordTraditional']
3171 reading = entry['Reading']
3172 else:
3173 headword, headwordSimplified, reading, translation = entry
3174
3175 readingEntities = []
3176 precedingIsNonReading = False
3177 for idx, entity in enumerate(reading.split(' ')):
3178 if idx < len(headword) and entity == headword[idx]:
3179
3180
3181 if not precedingIsNonReading:
3182 readingEntities.append(' ')
3183
3184 precedingIsNonReading = True
3185 elif idx != 0:
3186 readingEntities.append(' ')
3187 precedingIsNonReading = False
3188
3189 readingEntities.append(entity)
3190
3191 reading = ''.join(readingEntities)
3192
3193 if type(entry) == type({}):
3194 entry['Reading'] = reading
3195 return entry
3196 else:
3197 return [headword, headwordSimplified, reading, translation]
3198
PROVIDES = 'HanDeDict'
# glob patterns: dated archives preferred, plain text file as fallback
FILE_NAMES = ['handedict-*.zip', 'handedict-*.tar.bz2', 'handedict.u8']
ENCODING = 'utf-8'
# re-space readings while importing
FILTER = filterSpacing
3203
def extractTimeStamp(self, filePath):
    """
    Extracts the eight-digit date stamp from a HanDeDict file name.

    @type filePath: str
    @param filePath: path of the dictionary file
    @rtype: str
    @return: date string (YYYYMMDD), or None if the name carries no stamp
    """
    fileName = os.path.basename(filePath)
    matchObj = re.match(r'handedict-(\d{8})\.', fileName)
    if matchObj:
        return matchObj.group(1)

def getPreferredFile(self, filePaths):
    """
    Picks the file with the newest date stamp from the given paths.

    @type filePaths: list of str
    @param filePaths: candidate dictionary file paths
    @rtype: str
    @return: path of the newest file, or the first path if no file name
        carries a date stamp
    """
    timeStamps = []
    for filePath in filePaths:
        ts = self.extractTimeStamp(filePath)
        if ts:
            timeStamps.append((ts, filePath))
    if timeStamps:
        # YYYYMMDD strings compare lexicographically == chronologically
        _, filePath = max(timeStamps)
        return filePath
    else:
        # fixed: the fallback lacked the 'return', so None was returned
        return filePaths[0]
3221
def getArchiveContentName(self, filePath):
    """Returns the path of the dictionary file inside the archive."""
    # the archive's top-level directory carries the release date stamp
    stamp = self.extractTimeStamp(filePath)
    return 'handedict-' + stamp + '/handedict.u8'
3225
def findFile(self, fileGlobs, fileType=None):
    """
    Tries to locate a file with a given list of possible file names under
    the classes default data paths.

    Uses the newest version of all files found.

    @type fileGlobs: str/list of str
    @param fileGlobs: possible file names
    @type fileType: str
    @param fileType: textual type of file used in error msg
    @rtype: str
    @return: path to file of first match in search for existing file
    @raise IOError: if no file found
    """
    import glob

    # accept a single glob as well as a list of globs
    if type(fileGlobs) != type([]):
        fileGlobs = [fileGlobs]

    # collect every (file name, path) match under all data paths
    candidates = []
    for pattern in fileGlobs:
        for searchPath in self.dataPath:
            fullPattern = os.path.join(os.path.expanduser(searchPath),
                pattern)
            candidates.extend([(os.path.basename(match), match)
                for match in glob.glob(fullPattern)
                if os.path.exists(match)])

    if not candidates:
        if fileType == None:
            fileType = "file"
        raise IOError("No " + fileType + " found for '" + self.PROVIDES \
            + "' under path(s)'" + "', '".join(self.dataPath) \
            + "' for file names '" + "', '".join(fileGlobs) + "'")

    # let a subclass pick its preferred file, otherwise the name that
    # sorts highest is considered the newest version
    if hasattr(self, 'getPreferredFile'):
        return self.getPreferredFile([path for _, path in candidates])
    _, newestPath = max(candidates)
    return newestPath
3266
3269 """
3270 Builds the word index of the HanDeDict dictionary.
3271 """
3272 PROVIDES = 'HanDeDict_Words'
3273 DEPENDS = ['HanDeDict']
3274 TABLE_SOURCE = 'HanDeDict'
3275 HEADWORD_SOURCE = 'HeadwordTraditional'
3276
3281 """
3282 DatabaseBuilder provides the main class for building up a database for the
3283 cjklib package.
3284
3285 It contains all L{TableBuilder} classes and a dependency graph to handle
3286 build requests.
3287 """
def __init__(self, databaseSettings={}, dbConnectInst=None, dataPath=[],
    quiet=False, rebuildDepending=True, rebuildExisting=True, noFail=False,
    prefer=[], additionalBuilders=[]):
    """
    Constructs the DatabaseBuilder.

    @type databaseSettings: dict
    @param databaseSettings: dictionary holding the database options for the
        dbconnector module.
    @type dbConnectInst: instance
    @param dbConnectInst: instance of a L{DatabaseConnector}
    @type dataPath: list of str
    @param dataPath: optional list of paths to the data file(s)
    @type quiet: bool
    @param quiet: if true no status information will be printed to stderr
    @type rebuildDepending: bool
    @param rebuildDepending: if true existing tables that depend on updated
        tables will be dropped and built from scratch
    @type rebuildExisting: bool
    @param rebuildExisting: if true existing tables will be dropped and
        built from scratch
    @type noFail: bool
    @param noFail: if true build process won't terminate even if one table
        fails to build
    @type prefer: list
    @param prefer: list of L{TableBuilder} names to prefer in conflicting
        cases
    @type additionalBuilders: list of classobj
    @param additionalBuilders: list of externally provided TableBuilders
    """
    # NOTE: the mutable default arguments ({}/[]) are only read, never
    # mutated below, so the shared-default pitfall does not apply here.
    if not dataPath:
        # fall back to the data directory shipped with the cjklib package
        buildModule = __import__("cjklib.build")
        self.dataPath = [os.path.join(buildModule.__path__[0], 'data')]
    elif isinstance(dataPath, list):
        self.dataPath = dataPath
    else:
        # a single path string was given
        self.dataPath = [dataPath]
    self.quiet = quiet
    self.rebuildDepending = rebuildDepending
    self.rebuildExisting = rebuildExisting
    self.noFail = noFail

    # get connector to database
    if dbConnectInst:
        self.db = dbConnectInst
    else:
        self.db = dbconnector.DatabaseConnector.getDBConnector(
            databaseSettings)

    # get all TableBuilder classes, resolving conflicting providers
    tableBuilderClasses = DatabaseBuilder.getTableBuilderClasses(
        set(prefer), quiet=self.quiet,
        additionalBuilders=additionalBuilders)

    # build lookup from provided table name to its builder class
    self.tableBuilderLookup = {}
    for tableBuilder in tableBuilderClasses.values():
        # was dict.has_key(), which is removed in Python 3
        if tableBuilder.PROVIDES in self.tableBuilderLookup:
            raise Exception("Table '" + tableBuilder.PROVIDES \
                + "' provided by several builders")
        self.tableBuilderLookup[tableBuilder.PROVIDES] = tableBuilder
3350
3352 """
3353 Changes the data path.
3354
3355 @type dataPath: list of str
3356 @param dataPath: list of paths to the data file(s)
3357 """
3358 if type(dataPath) == type([]):
3359 self.dataPath = dataPath
3360 else:
3361
3362 self.dataPath = [dataPath]
3363
def build(self, tables):
    """
    Builds the given tables.

    Resolves dependencies, instantiates the responsible L{TableBuilder}s
    in dependency order and runs each build inside its own transaction.

    @type tables: list
    @param tables: list of tables to build
    """
    # accept a single table name as well as a list
    if type(tables) != type([]):
        tables = [tables]

    # NOTE(review): printed even when self.quiet is set — confirm intended
    warn("Building database '%s'" % self.db.databaseUrl)

    # check that all tables are provided and filter out those that
    # already exist and need no rebuild
    filteredTables = []
    for table in tables:
        if table not in self.tableBuilderLookup:
            raise exception.UnsupportedError("Table '%s' not provided" \
                % table)

        if self.needsRebuild(table):
            filteredTables.append(table)
        else:
            if not self.quiet:
                warn("Skipping table '%s' because it already exists" \
                    % table)
    tables = filteredTables

    # add existing tables that depend on the rebuilt ones, as their
    # content would otherwise go stale
    dependingTables = []
    if self.rebuildDepending:
        dependingTables = self.getRebuiltDependingTables(tables)
        if dependingTables:
            warn("Tables rebuilt because of dependencies updated: '" \
                +"', '".join(dependingTables) + "'")
        tables.extend(dependingTables)

    # tables only built to satisfy dependencies; removed again at the end
    buildDependentTables = self.getBuildDependentTables(tables)
    buildTables = set(tables) | buildDependentTables
    # get build order
    builderClasses = self.getClassesInBuildOrder(buildTables)

    # build tables
    if not self.quiet and self.rebuildExisting:
        warn("Rebuilding tables and overwriting old ones...")
    builderClasses.reverse()
    instancesUnrequestedTable = set()
    while builderClasses:
        builder = builderClasses.pop()

        # each table is built in its own transaction so a failure only
        # rolls back the table currently being built
        transaction = self.db.connection.begin()
        try:
            instance = builder(self.dataPath, self.db, self.quiet)

            # remember tables that were only built as dependencies and
            # didn't exist before, so they can be removed afterwards
            if builder.PROVIDES in buildDependentTables \
                and not self.db.engine.has_table(builder.PROVIDES):
                instancesUnrequestedTable.add(instance)

            if self.db:
                if self.db.engine.has_table(builder.PROVIDES):
                    if not self.quiet:
                        warn("Removing previously built table '" \
                            + builder.PROVIDES + "'")
                    instance.remove()
            else:
                instance.remove()

            if not self.quiet:
                warn("Building table '" + builder.PROVIDES \
                    + "' with builder '" + builder.__name__ + "'...")

            instance.build()
            transaction.commit()
        except IOError, e:
            transaction.rollback()
            # data not available, either fail or skip this table and
            # everything that (transitively) depends on it
            if self.noFail:
                if not self.quiet:
                    warn("Building table '" + builder.PROVIDES \
                        + "' failed: '" + str(e) + "', skipping")
                dependingTables = [builder.PROVIDES]
                remainingBuilderClasses = []
                for clss in builderClasses:
                    if set(clss.DEPENDS) & set(dependingTables):
                        # this class depends on one being removed
                        dependingTables.append(clss.PROVIDES)
                    else:
                        remainingBuilderClasses.append(clss)
                if not self.quiet and len(dependingTables) > 1:
                    warn("Ignoring depending table(s) '" \
                        + "', '".join(dependingTables[1:]) + "'")
                builderClasses = remainingBuilderClasses
            else:
                raise
        except Exception, e:
            # any other error aborts the whole build
            transaction.rollback()
            raise

    # remove tables that were only built to satisfy dependencies
    if instancesUnrequestedTable:
        for instance in instancesUnrequestedTable:
            if not self.quiet:
                warn("Removing table '" + instance.PROVIDES \
                    + "' as it was only created to solve build " \
                    + "dependencies")
            instance.remove()
3472
3474 """
3475 Removes the given tables.
3476
3477 @type tables: list
3478 @param tables: list of tables to remove
3479 """
3480 if type(tables) != type([]):
3481 tables = [tables]
3482
3483 tableBuilderClasses = []
3484 for table in set(tables):
3485 if not self.tableBuilderLookup.has_key(table):
3486 raise exception.UnsupportedError("table '" + table \
3487 + "' not provided")
3488 tableBuilderClasses.append(self.tableBuilderLookup[table])
3489
3490 for builder in tableBuilderClasses:
3491 instance = builder(self.dataPath, self.db, self.quiet)
3492 if self.db:
3493 if self.db.engine.has_table(builder.PROVIDES):
3494 if not self.quiet:
3495 warn("Removing previously built table '" \
3496 + builder.PROVIDES + "'")
3497 instance.remove()
3498 else:
3499 instance.remove()
3500
3502 """
3503 Returns true if either rebuild is turned on by default or we build into
3504 database and the table doesn't exist yet.
3505
3506 @type tableName: classobj
3507 @param tableName: L{TableBuilder} class
3508 @rtype: bool
3509 @return: True, if table needs to be rebuilt
3510 """
3511 if self.rebuildExisting:
3512 return True
3513 else:
3514 return not self.db.engine.has_table(tableName)
3515
3517 """
3518 Gets the name of the tables that needs to be built to resolve
3519 dependencies.
3520
3521 @type tableNames: list of str
3522 @param tableNames: list of tables to build
3523 @rtype: list of str
3524 @return: names of tables needed to resolve dependencies
3525 """
3526 def solveDependencyRecursive(table):
3527 """
3528 Gets all tables on which the given table depends and that need to be
3529 rebuilt. Also will mark tables skipped which won't be rebuilt.
3530
3531 Uses parent's variables to store data.
3532
3533 @type table: str
3534 @param table: table name for which to solve dependencies
3535 """
3536 if table in tableNames:
3537
3538 return
3539 if self.db and self.db.engine.has_table(table):
3540 skippedTables.add(table)
3541 return
3542
3543 dependedTablesNames.add(table)
3544
3545
3546 if not self.tableBuilderLookup.has_key(table):
3547
3548
3549
3550 raise exception.UnsupportedError("table '" + table \
3551 + "' not provided, might be related to conflicting " \
3552 + "builders")
3553 builderClass = self.tableBuilderLookup[table]
3554 for dependantTable in builderClass.DEPENDS:
3555 solveDependencyRecursive(dependantTable)
3556
3557 tableNames = set(tableNames)
3558 dependedTablesNames = set()
3559 skippedTables = set()
3560
3561 for table in tableNames:
3562 builderClass = self.tableBuilderLookup[table]
3563 for depededTable in builderClass.DEPENDS:
3564 solveDependencyRecursive(depededTable)
3565
3566 if not self.quiet and skippedTables:
3567 warn("Newly built tables depend on table(s) '" \
3568 + "', '".join(skippedTables) \
3569 + "' but skipping because they already exist")
3570 return dependedTablesNames
3571
3573 """
3574 Gets the name of the tables that depend on the given tables to be built
3575 and are not included in the given set.
3576
3577 Dependencies depend on the choice of table builders and thus may vary.
3578
3579 @type tableNames: list of str
3580 @param tableNames: list of tables
3581 @rtype: list of str
3582 @return: names of tables that depend on given tables
3583 """
3584 dependencyTables = set(tableNames)
3585 dependingTablesNames = set()
3586 residualTables = self.getCurrentSupportedTables() - dependencyTables
3587
3588 while dependencyTables:
3589 dependencyTable = dependencyTables.pop()
3590 for table in residualTables:
3591 builderClass = self.tableBuilderLookup[table]
3592 if dependencyTable in builderClass.DEPENDS:
3593
3594 dependingTablesNames.add(table)
3595
3596 dependencyTables.add(table)
3597
3598 residualTables = residualTables - dependencyTables
3599
3600 return dependingTablesNames
3601
3603 """
3604 Gets the name of the tables that depend on the given tables to be built
3605 and already exist, thus need to be rebuilt.
3606
3607 @type tableNames: list of str
3608 @param tableNames: list of tables
3609 @rtype: list of str
3610 @return: names of tables that need to be rebuilt because of dependencies
3611 """
3612 dependingTables = self.getDependingTables(tableNames)
3613
3614 needRebuild = set()
3615 for tableName in dependingTables:
3616 if self.db.engine.has_table(tableName):
3617 needRebuild.add(tableName)
3618 return needRebuild
3619
3621 """
3622 Gets the build order for the given table names.
3623
3624 @type tableNames: list of str
3625 @param tableNames: list of names of tables to build
3626 @rtype: list of classobj
3627 @return: L{TableBuilder}s in build order
3628 """
3629
3630 tableBuilderClasses = []
3631 for table in set(tableNames):
3632 if not self.tableBuilderLookup.has_key(table):
3633
3634
3635
3636 raise exception.UnsupportedError("table '" + table \
3637 + "' not provided, might be related to conflicting " \
3638 + "builders")
3639 tableBuilderClasses.append(self.tableBuilderLookup[table])
3640 return self.getBuildDependencyOrder(tableBuilderClasses)
3641
@staticmethod
# NOTE(review): the "def getBuildDependencyOrder(tableBuilderClasses)"
# line is elided in this extract.
"""
Create order in which the tables have to be created.

@type tableBuilderClasses: list of classobj
@param tableBuilderClasses: list of L{TableBuilder} classes
@rtype: list of classobj
@return: the given classes ordered in build dependency order
"""
dependencyOrder = []
providedTables = [bc.PROVIDES for bc in tableBuilderClasses]
includedTableNames = set()
while tableBuilderClasses:
    for builderClass in tableBuilderClasses:
        # schedule a builder once all of its dependencies WITHIN the
        # given set are satisfied; dependencies on tables not provided
        # here are assumed to exist already
        if set(builderClass.DEPENDS).intersection(providedTables) \
            <= includedTableNames:
            break
    else:
        # no builder could be scheduled in this pass: dependencies are
        # unfulfillable or cyclic
        raise Exception("Unfulfillable depend request, " \
            + "might be related to conflicting builders or cycle. " \
            + "Builders included: '" \
            + "', '".join([clss.__name__ for clss in dependencyOrder]) \
            + "'. Builders with open depends: '" \
            + "', '".join([builder.PROVIDES \
                for builder in tableBuilderClasses]) + "'")
    dependencyOrder.append(builderClass)
    includedTableNames.add(builderClass.PROVIDES)
    tableBuilderClasses.remove(builderClass)
return dependencyOrder
3679
@staticmethod
def getTableBuilderClasses(preferClassSet=set(), resolveConflicts=True,
    quiet=True, additionalBuilders=[]):
    """
    Gets all classes in module that implement L{TableBuilder}.

    @type preferClassSet: set of str
    @param preferClassSet: set of L{TableBuilder} names to prefer in
        conflicting cases, resolveConflicting must be True to take effect
        (default)
    @type resolveConflicts: bool
    @param resolveConflicts: if true conflicting builders will be removed
        so that only one builder is left per Table.
    @type quiet: bool
    @param quiet: if true no status information will be printed to stderr
    @type additionalBuilders: list of classobj
    @param additionalBuilders: list of externally provided TableBuilders
    @rtype: dict
    @return: dictionary of all classes inheriting form L{TableBuilder} that
        provide a table (i.d. non abstract implementations), with its name
        as key
    """
    # NOTE: the mutable defaults (set()/[]) are only read, never mutated.
    tableBuilderClasses = {}
    buildModule = __import__("cjklib.build")
    # collect all concrete TableBuilder subclasses defined in this module
    tableBuilderClasses = dict([(clss.__name__, clss) \
        for clss in buildModule.build.__dict__.values() \
        if type(clss) == types.TypeType \
        and issubclass(clss, buildModule.build.TableBuilder) \
        and clss.PROVIDES])
    # add the externally supplied builders
    tableBuilderClasses.update(dict([(clss.__name__, clss) \
        for clss in additionalBuilders]))

    # group builder names by the table they provide to detect conflicts
    tableToBuilderMapping = {}
    for clssName, clss in tableBuilderClasses.iteritems():
        if clss.PROVIDES not in tableToBuilderMapping:
            tableToBuilderMapping[clss.PROVIDES] = set()

        tableToBuilderMapping[clss.PROVIDES].add(clssName)

    if resolveConflicts:
        # remove all but one builder per table, keeping the preferred one
        for tableName, builderClssSet in tableToBuilderMapping.items():
            preferredBuilders = builderClssSet & preferClassSet
            if preferredBuilders:
                if len(preferredBuilders) > 1:
                    # the caller supplied more than one preferred builder
                    # for this very table
                    raise Exception("More than one TableBuilder " \
                        + "preferred for conflicting table.")
                preferred = preferredBuilders.pop()
                builderClssSet.remove(preferred)
            else:
                # no preference given: keep an arbitrary one
                preferred = builderClssSet.pop()
            if not quiet and builderClssSet:
                warn("Removing conflicting builder(s) '" \
                    + "', '".join(builderClssSet) + "' in favour of '" \
                    + preferred + "'")
            # remove the losing builders from the result
            for clssName in builderClssSet:
                del tableBuilderClasses[clssName]
    return tableBuilderClasses
3745
@staticmethod
# NOTE(review): the "def getSupportedTables()" line is elided in this
# extract.
"""
Gets names of supported tables.

@rtype: list of str
@return: names of tables
"""
# no conflict resolution so EVERY buildable table is reported
classDict = DatabaseBuilder.getTableBuilderClasses(
    resolveConflicts=False)
return set([clss.PROVIDES for clss in classDict.values()])
3757
3759 """
3760 Gets names of tables supported by this instance of the database builder.
3761
3762 This list can have more entries then L{getSupportedTables()} as
3763 additional external builders can be supplied on instantiation.
3764
3765 @rtype: list of str
3766 @return: names of tables
3767 """
3768 return set(self.tableBuilderLookup.keys())
3769
3771 """
3772 Checks if the current database supports optimization.
3773
3774 @rtype: boolean
3775 @return: True if optimizable, False otherwise
3776 """
3777 return self.db.engine.name in ['sqlite']
3778
3780 """
3781 Optimizes the current database.
3782
3783 @raise Exception: if database does not support optimization
3784 @raise OperationalError: if optimization failed
3785 """
3786 if self.db.engine.name == 'sqlite':
3787 self.db.execute('VACUUM')
3788 else:
3789 raise Exception('Database does not seem to support optimization')
3790
3791
3792
3793
def warn(message):
    """
    Prints the given message to stderr with the system's default encoding.

    @type message: str
    @param message: message to print
    """
    # encode with replacement so unprintable characters never raise
    encoded = message.encode(locale.getpreferredencoding(), 'replace')
    sys.stderr.write(encoded + '\n')
3803