Package pyxb :: Package utils :: Module unicode
[hide private]
[frames | no frames]

Source Code for Module pyxb.utils.unicode

  1  # -*- coding: utf-8 -*- 
  2  # Copyright 2009-2013, Peter A. Bigot 
  3  # 
  4  # Licensed under the Apache License, Version 2.0 (the "License"); you may 
  5  # not use this file except in compliance with the License. You may obtain a 
  6  # copy of the License at: 
  7  # 
  8  #            http://www.apache.org/licenses/LICENSE-2.0 
  9  # 
 10  # Unless required by applicable law or agreed to in writing, software 
 11  # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 
 12  # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 
 13  # License for the specific language governing permissions and limitations 
 14  # under the License. 
 15   
 16  """This module contains support for Unicode characters as required to 
 17  support the regular expression syntax defined in U{annex F 
 18  <http://www.w3.org/TR/xmlschema-2/#regexs>} 
 19  of the XML Schema definition. 
 20   
 21  In particular, we need to be able to identify character properties and 
 22  block escapes, as defined in F.1.1, by name. 
 23   
 24   - Block data: U{http://www.unicode.org/Public/3.1-Update/Blocks-4.txt} 
 25   - Property list data: U{http://www.unicode.org/Public/3.1-Update/PropList-3.1.0.txt} 
 26   - Full dataset: U{http://www.unicode.org/Public/3.1-Update/UnicodeData-3.1.0.txt} 
 27   
 28  The Unicode database active at the time XML Schema 1.0 was defined is 
 29  archived at 
 30  U{http://www.unicode.org/Public/3.1-Update/UnicodeCharacterDatabase-3.1.0.html}, 
 31  and refers to U{Unicode Standard Annex #27: Unicode 3.1 
 32  <http://www.unicode.org/unicode/reports/tr27/>}. 
 33  """ 
 34   
 35  import re 
 36  import logging 
 37   
 38  _log = logging.getLogger(__name__) 
 39   
 40  SupportsWideUnicode = False 
 41  try: 
 42      re.compile(u'[\U0001d7ce-\U0001d7ff]') 
 43      SupportsWideUnicode = True 
 44  except: 
 45      pass 
 46   
 47  import bisect 
 48           
49 -class CodePointSetError (LookupError):
50 """Raised when some abuse of a L{CodePointSet} is detected.""" 51 pass
52
53 -class CodePointSet (object):
54 """Represent a set of Unicode code points. 55 56 Each code point is an integral value between 0 and 0x10FFFF. This 57 class is used to represent a set of code points in a manner 58 suitable for use as regular expression character sets.""" 59 60 MaxCodePoint = 0x10FFFF 61 """The maximum value for a code point in the Unicode code point 62 space. This is normally 0xFFFF, because wide unicode characters 63 are generally not enabled in Python builds. If, however, they are 64 enabled, this will be the full value of 0x10FFFF.""" 65 66 MaxShortCodePoint = 0xFFFF 67 if not SupportsWideUnicode: 68 MaxCodePoint = MaxShortCodePoint 69 70 # The internal representation of the codepoints is as a sorted 71 # list where values at an even index denote the first codepoint in 72 # a range that is in the set, and the immediately following value 73 # indicates the next following codepoint that is not in the set. 74 # A missing value at the end is interpreted as MaxCodePoint. For 75 # example, the sequence [ 12, 15, 200 ] denotes the set containing 76 # codepoints 12, 13, 14, and everything above 199. 77 __codepoints = None 78
79 - def _codepoints (self):
80 """For testing purrposes only, access to the codepoints 81 internal representation.""" 82 return self.__codepoints
83
84 - def __cmp__ (self, other):
85 """Equality is delegated to the codepoints list.""" 86 return cmp(self.__codepoints, other.__codepoints)
87
88 - def __init__ (self, *args):
89 self.__codepoints = [] 90 if 1 == len(args): 91 if isinstance(args[0], CodePointSet): 92 self.__codepoints.extend(args[0].__codepoints) 93 return 94 if isinstance(args[0], list): 95 args = args[0] 96 for a in args: 97 self.add(a)
98
99 - def __mutate (self, value, do_add):
100 # Identify the start (inclusive) and end (exclusive) code 101 # points of the value's range. 102 if isinstance(value, tuple): 103 (s, e) = value 104 e += 1 105 elif isinstance(value, basestring): 106 if 1 < len(value): 107 raise TypeError() 108 s = ord(value) 109 e = s+1 110 else: 111 s = int(value) 112 e = s+1 113 if s >= e: 114 raise ValueError('codepoint range value order') 115 116 # Validate the range for the code points supported by this 117 # Python interpreter. Recall that e is exclusive. 118 if s > self.MaxCodePoint: 119 return self 120 if e > self.MaxCodePoint: 121 e = self.MaxCodePoint+1 122 123 # Index of first code point equal to or greater than s 124 li = bisect.bisect_left(self.__codepoints, s) 125 # Index of last code point less than or equal to e 126 ri = bisect.bisect_right(self.__codepoints, e) 127 # There are four cases; if we're subtracting, they reflect. 128 case = ((li & 1) << 1) | (ri & 1) 129 if not do_add: 130 case = 3 - case 131 if 0x03 == case: 132 # Add: Incoming value begins and ends within existing ranges 133 del self.__codepoints[li:ri] 134 elif 0x02 == case: 135 # Add: Incoming value extends into an excluded range 136 del self.__codepoints[li+1:ri] 137 self.__codepoints[li] = e 138 elif 0x01 == case: 139 # Add: Incoming value begins in an excluded range 140 del self.__codepoints[li+1:ri] 141 self.__codepoints[li] = s 142 else: 143 # Add: Incoming value begins and ends within excluded ranges 144 self.__codepoints[li:ri] = [s, e] 145 return self
146
147 - def add (self, value):
148 """Add the given value to the code point set. 149 150 @param value: An integral value denoting a code point, or a 151 tuple C{(s,e)} denoting the start and end (inclusive) code 152 points in a range. 153 @return: C{self}""" 154 return self.__mutate(value, True)
155
156 - def extend (self, values):
157 """Add multiple values to a code point set. 158 159 @param values: Either a L{CodePointSet} instance, or an iterable 160 whose members are valid parameters to L{add}. 161 162 @return: C{self}""" 163 if isinstance(values, CodePointSet): 164 self.extend(values.asTuples()) 165 else: 166 for v in values: 167 self.__mutate(v, True) 168 return self
169
170 - def subtract (self, value):
171 """Remove the given value from the code point set. 172 173 @param value: An integral value denoting a code point, or a tuple 174 C{(s,e)} denoting the start and end (inclusive) code points in a 175 range, or a L{CodePointSet}. 176 177 @return: C{self}""" 178 if isinstance(value, CodePointSet): 179 for v in value.asTuples(): 180 self.subtract(v) 181 return self 182 return self.__mutate(value, False)
183 184 # Escape sequences for characters that must not appear unescaped in 185 # Python regular expression patterns. Maps each bad character to a safe 186 # escape sequence. 187 __XMLtoPythonREMap = { 188 u'\x00': u'\\x00', # From docs for Python's "re" module: Regular 189 # expression pattern strings may not contain null 190 # bytes 191 u'^': u'\\^', # Indicates negation if it happens to occur at the 192 # start of a character group 193 u'\\': u'\\\\', # Escape character 194 u'[': u'\\[', # Actually doesn't need to be escaped inside a Python 195 # character group, but escaping it is less confusing. 196 u']': u'\\]', # End of character group 197 u'-': u'\\-', # Indicates a range of characters 198 } 199 200 # Return the given code point as a unicode character suitable for 201 # use in a regular expression
202 - def __unichr (self, code_point):
203 rv = unichr(code_point) 204 rv = self.__XMLtoPythonREMap.get(rv, rv) 205 return rv
206
207 - def asPattern (self, with_brackets=True):
208 """Return the code point set as Unicode regular expression 209 character group consisting of a sequence of characters or 210 character ranges. 211 212 This returns a regular expression fragment using Python's 213 regular expression syntax. Note that different regular expression 214 syntaxes are not compatible, often in subtle ways. 215 216 @param with_brackets: If C{True} (default), square brackets 217 are added to enclose the returned character group.""" 218 rva = [] 219 if with_brackets: 220 rva.append(u'[') 221 for (s, e) in self.asTuples(): 222 if s == e: 223 rva.append(self.__unichr(s)) 224 else: 225 rva.extend([self.__unichr(s), '-', self.__unichr(e)]) 226 if with_brackets: 227 rva.append(u']') 228 return u''.join(rva)
229
230 - def asTuples (self):
231 """Return the codepoints as tuples denoting the ranges that are in 232 the set. 233 234 Each tuple C{(s, e)} indicates that the code points from C{s} 235 (inclusive) to C{e}) (inclusive) are in the set.""" 236 237 rv = [] 238 start = None 239 for ri in xrange(len(self.__codepoints)): 240 if start is not None: 241 rv.append( (start, self.__codepoints[ri]-1) ) 242 start = None 243 else: 244 start = self.__codepoints[ri] 245 if (start is not None) and (start <= self.MaxCodePoint): 246 rv.append( (start, self.MaxCodePoint) ) 247 return rv
248
249 - def negate (self):
250 """Return an instance that represents the inverse of this set.""" 251 rv = type(self)() 252 if (0 < len(self.__codepoints)) and (0 == self.__codepoints[0]): 253 rv.__codepoints.extend(self.__codepoints[1:]) 254 else: 255 rv.__codepoints.append(0) 256 rv.__codepoints.extend(self.__codepoints) 257 return rv
258
259 - def asSingleCharacter (self):
260 """If this set represents a single character, return it as its 261 unicode string value. Otherwise return C{None}.""" 262 if (2 != len(self.__codepoints)) or (1 < (self.__codepoints[1] - self.__codepoints[0])): 263 return None 264 return unichr(self.__codepoints[0])
265 266 from pyxb.utils.unicode_data import PropertyMap 267 from pyxb.utils.unicode_data import BlockMap 268
class XML1p0e2 (object):
    """Regular expression support for XML Schema Data Types.

    This class holds character classes and regular expressions used to
    constrain the lexical space of XML Schema datatypes derived from
    U{string<http://www.w3.org/TR/xmlschema-2/#string>}.  They are
    from U{XML 1.0 (Second
    Edition)<http://www.w3.org/TR/2000/WD-xml-2e-20000814>} and
    U{Namespaces in XML
    <http://www.w3.org/TR/1999/REC-xml-names-19990114/>}.

    Unlike the regular expressions used for pattern constraints in XML
    Schema, which are derived from the Unicode 3.1 specification,
    these are derived from the Unicode 2.0 specification.

    The XML Schema definition refers explicitly to the second edition
    of XML, so we have to use these code point sets and patterns.  Be
    aware that U{subsequent updates to the XML specification
    <http://www.w3.org/XML/xml-V10-4e-errata#E09>} have changed the
    corresponding patterns for other uses of XML.  One significant
    change is that the original specification, used here, does not
    allow wide unicode characters."""

    Char = CodePointSet(
        0x0009,
        0x000A,
        0x000D,
        (0x0020, 0xD7FF),
        (0xE000, 0xFFFD)
        )
    # Everything above the 16-bit range is added only when the interpreter
    # can represent it in a regular expression.
    if SupportsWideUnicode:
        Char.add((1+CodePointSet.MaxShortCodePoint, CodePointSet.MaxCodePoint))

    BaseChar = CodePointSet(
        (0x0041, 0x005A), (0x0061, 0x007A), (0x00C0, 0x00D6), (0x00D8, 0x00F6),
        (0x00F8, 0x00FF), (0x0100, 0x0131), (0x0134, 0x013E), (0x0141, 0x0148),
        (0x014A, 0x017E), (0x0180, 0x01C3), (0x01CD, 0x01F0), (0x01F4, 0x01F5),
        (0x01FA, 0x0217), (0x0250, 0x02A8), (0x02BB, 0x02C1), 0x0386,
        (0x0388, 0x038A), 0x038C, (0x038E, 0x03A1), (0x03A3, 0x03CE),
        (0x03D0, 0x03D6), 0x03DA, 0x03DC, 0x03DE, 0x03E0,
        (0x03E2, 0x03F3), (0x0401, 0x040C), (0x040E, 0x044F), (0x0451, 0x045C),
        (0x045E, 0x0481), (0x0490, 0x04C4), (0x04C7, 0x04C8), (0x04CB, 0x04CC),
        (0x04D0, 0x04EB), (0x04EE, 0x04F5), (0x04F8, 0x04F9), (0x0531, 0x0556),
        0x0559, (0x0561, 0x0586), (0x05D0, 0x05EA), (0x05F0, 0x05F2),
        (0x0621, 0x063A), (0x0641, 0x064A), (0x0671, 0x06B7), (0x06BA, 0x06BE),
        (0x06C0, 0x06CE), (0x06D0, 0x06D3), 0x06D5, (0x06E5, 0x06E6),
        (0x0905, 0x0939), 0x093D, (0x0958, 0x0961), (0x0985, 0x098C),
        (0x098F, 0x0990), (0x0993, 0x09A8), (0x09AA, 0x09B0), 0x09B2,
        (0x09B6, 0x09B9), (0x09DC, 0x09DD), (0x09DF, 0x09E1), (0x09F0, 0x09F1),
        (0x0A05, 0x0A0A), (0x0A0F, 0x0A10), (0x0A13, 0x0A28), (0x0A2A, 0x0A30),
        (0x0A32, 0x0A33), (0x0A35, 0x0A36), (0x0A38, 0x0A39), (0x0A59, 0x0A5C),
        0x0A5E, (0x0A72, 0x0A74), (0x0A85, 0x0A8B), 0x0A8D,
        (0x0A8F, 0x0A91), (0x0A93, 0x0AA8), (0x0AAA, 0x0AB0), (0x0AB2, 0x0AB3),
        (0x0AB5, 0x0AB9), 0x0ABD, 0x0AE0, (0x0B05, 0x0B0C),
        (0x0B0F, 0x0B10), (0x0B13, 0x0B28), (0x0B2A, 0x0B30), (0x0B32, 0x0B33),
        (0x0B36, 0x0B39), 0x0B3D, (0x0B5C, 0x0B5D), (0x0B5F, 0x0B61),
        (0x0B85, 0x0B8A), (0x0B8E, 0x0B90), (0x0B92, 0x0B95), (0x0B99, 0x0B9A),
        0x0B9C, (0x0B9E, 0x0B9F), (0x0BA3, 0x0BA4), (0x0BA8, 0x0BAA),
        (0x0BAE, 0x0BB5), (0x0BB7, 0x0BB9), (0x0C05, 0x0C0C), (0x0C0E, 0x0C10),
        (0x0C12, 0x0C28), (0x0C2A, 0x0C33), (0x0C35, 0x0C39), (0x0C60, 0x0C61),
        (0x0C85, 0x0C8C), (0x0C8E, 0x0C90), (0x0C92, 0x0CA8), (0x0CAA, 0x0CB3),
        (0x0CB5, 0x0CB9), 0x0CDE, (0x0CE0, 0x0CE1), (0x0D05, 0x0D0C),
        (0x0D0E, 0x0D10), (0x0D12, 0x0D28), (0x0D2A, 0x0D39), (0x0D60, 0x0D61),
        (0x0E01, 0x0E2E), 0x0E30, (0x0E32, 0x0E33), (0x0E40, 0x0E45),
        (0x0E81, 0x0E82), 0x0E84, (0x0E87, 0x0E88), 0x0E8A, 0x0E8D,
        (0x0E94, 0x0E97), (0x0E99, 0x0E9F), (0x0EA1, 0x0EA3), 0x0EA5, 0x0EA7,
        (0x0EAA, 0x0EAB), (0x0EAD, 0x0EAE), 0x0EB0, (0x0EB2, 0x0EB3), 0x0EBD,
        (0x0EC0, 0x0EC4), (0x0F40, 0x0F47), (0x0F49, 0x0F69), (0x10A0, 0x10C5),
        (0x10D0, 0x10F6), 0x1100, (0x1102, 0x1103), (0x1105, 0x1107), 0x1109,
        (0x110B, 0x110C), (0x110E, 0x1112), 0x113C, 0x113E, 0x1140,
        0x114C, 0x114E, 0x1150, (0x1154, 0x1155), 0x1159,
        (0x115F, 0x1161), 0x1163, 0x1165, 0x1167, 0x1169,
        (0x116D, 0x116E), (0x1172, 0x1173), 0x1175, 0x119E, 0x11A8, 0x11AB,
        (0x11AE, 0x11AF), (0x11B7, 0x11B8), 0x11BA, (0x11BC, 0x11C2),
        0x11EB, 0x11F0, 0x11F9, (0x1E00, 0x1E9B), (0x1EA0, 0x1EF9),
        (0x1F00, 0x1F15), (0x1F18, 0x1F1D), (0x1F20, 0x1F45), (0x1F48, 0x1F4D),
        (0x1F50, 0x1F57), 0x1F59, 0x1F5B, 0x1F5D, (0x1F5F, 0x1F7D),
        (0x1F80, 0x1FB4), (0x1FB6, 0x1FBC), 0x1FBE, (0x1FC2, 0x1FC4),
        (0x1FC6, 0x1FCC), (0x1FD0, 0x1FD3), (0x1FD6, 0x1FDB), (0x1FE0, 0x1FEC),
        (0x1FF2, 0x1FF4), (0x1FF6, 0x1FFC), 0x2126, (0x212A, 0x212B), 0x212E,
        (0x2180, 0x2182), (0x3041, 0x3094), (0x30A1, 0x30FA), (0x3105, 0x312C),
        (0xAC00, 0xD7A3)
        )

    Ideographic = CodePointSet(
        (0x4E00, 0x9FA5),
        0x3007,
        (0x3021, 0x3029)
        )

    Letter = CodePointSet(BaseChar).extend(Ideographic)

    CombiningChar = CodePointSet(
        (0x0300, 0x0345), (0x0360, 0x0361), (0x0483, 0x0486), (0x0591, 0x05A1),
        (0x05A3, 0x05B9), (0x05BB, 0x05BD), 0x05BF, (0x05C1, 0x05C2), 0x05C4,
        (0x064B, 0x0652), 0x0670, (0x06D6, 0x06DC), (0x06DD, 0x06DF),
        (0x06E0, 0x06E4), (0x06E7, 0x06E8), (0x06EA, 0x06ED), (0x0901, 0x0903),
        0x093C, (0x093E, 0x094C), 0x094D, (0x0951, 0x0954),
        (0x0962, 0x0963), (0x0981, 0x0983), 0x09BC, 0x09BE, 0x09BF,
        (0x09C0, 0x09C4), (0x09C7, 0x09C8), (0x09CB, 0x09CD), 0x09D7,
        (0x09E2, 0x09E3), 0x0A02, 0x0A3C, 0x0A3E, 0x0A3F,
        (0x0A40, 0x0A42), (0x0A47, 0x0A48), (0x0A4B, 0x0A4D), (0x0A70, 0x0A71),
        (0x0A81, 0x0A83), 0x0ABC, (0x0ABE, 0x0AC5), (0x0AC7, 0x0AC9),
        (0x0ACB, 0x0ACD), (0x0B01, 0x0B03), 0x0B3C, (0x0B3E, 0x0B43),
        (0x0B47, 0x0B48), (0x0B4B, 0x0B4D), (0x0B56, 0x0B57), (0x0B82, 0x0B83),
        (0x0BBE, 0x0BC2), (0x0BC6, 0x0BC8), (0x0BCA, 0x0BCD), 0x0BD7,
        (0x0C01, 0x0C03), (0x0C3E, 0x0C44), (0x0C46, 0x0C48), (0x0C4A, 0x0C4D),
        (0x0C55, 0x0C56), (0x0C82, 0x0C83), (0x0CBE, 0x0CC4), (0x0CC6, 0x0CC8),
        (0x0CCA, 0x0CCD), (0x0CD5, 0x0CD6), (0x0D02, 0x0D03), (0x0D3E, 0x0D43),
        (0x0D46, 0x0D48), (0x0D4A, 0x0D4D), 0x0D57, 0x0E31,
        (0x0E34, 0x0E3A), (0x0E47, 0x0E4E), 0x0EB1, (0x0EB4, 0x0EB9),
        (0x0EBB, 0x0EBC), (0x0EC8, 0x0ECD), (0x0F18, 0x0F19), 0x0F35,
        0x0F37, 0x0F39, 0x0F3E, 0x0F3F,
        (0x0F71, 0x0F84), (0x0F86, 0x0F8B), (0x0F90, 0x0F95), 0x0F97,
        (0x0F99, 0x0FAD), (0x0FB1, 0x0FB7), 0x0FB9, (0x20D0, 0x20DC),
        0x20E1, (0x302A, 0x302F), 0x3099, 0x309A
        )

    Digit = CodePointSet(
        (0x0030, 0x0039), (0x0660, 0x0669), (0x06F0, 0x06F9), (0x0966, 0x096F),
        (0x09E6, 0x09EF), (0x0A66, 0x0A6F), (0x0AE6, 0x0AEF), (0x0B66, 0x0B6F),
        (0x0BE7, 0x0BEF), (0x0C66, 0x0C6F), (0x0CE6, 0x0CEF), (0x0D66, 0x0D6F),
        (0x0E50, 0x0E59), (0x0ED0, 0x0ED9), (0x0F20, 0x0F29)
        )

    Extender = CodePointSet(
        0x00B7, 0x02D0, 0x02D1, 0x0387, 0x0640, 0x0E46, 0x0EC6, 0x3005,
        (0x3031, 0x3035), (0x309D, 0x309E), (0x30FC, 0x30FE)
        )

    # Not an explicit production, but used in Name production
    NameStartChar = CodePointSet(Letter)
    NameStartChar.add(ord('_'))
    NameStartChar.add(ord(':'))

    # Same as NameStartChar but without the colon.
    NCNameStartChar = CodePointSet(Letter)
    NCNameStartChar.add(ord('_'))

    NameChar = CodePointSet(Letter)
    NameChar.extend(Digit)
    NameChar.add(ord('.'))
    NameChar.add(ord('-'))
    NameChar.add(ord('_'))
    NameChar.add(ord(':'))
    NameChar.extend(CombiningChar)
    NameChar.extend(Extender)

    # Same as NameChar but without the colon.
    NCNameChar = CodePointSet(Letter)
    NCNameChar.extend(Digit)
    NCNameChar.add(ord('.'))
    NCNameChar.add(ord('-'))
    NCNameChar.add(ord('_'))
    NCNameChar.extend(CombiningChar)
    NCNameChar.extend(Extender)

    # Each *_pat is an unanchored pattern fragment; each *_re is the same
    # pattern compiled with ^...$ anchors.
    Name_pat = '%s%s*' % (NameStartChar.asPattern(), NameChar.asPattern())
    Name_re = re.compile('^%s$' % (Name_pat,))
    NmToken_pat = '%s+' % (NameChar.asPattern(),)
    NmToken_re = re.compile('^%s$' % (NmToken_pat,))
    NCName_pat = '%s%s*' % (NCNameStartChar.asPattern(), NCNameChar.asPattern())
    NCName_re = re.compile('^%s$' % (NCName_pat,))
    QName_pat = '(%s:)?%s' % (NCName_pat, NCName_pat)
    QName_re = re.compile('^%s$' % (QName_pat,))
# Production 24 : Single Character Escapes
SingleCharEsc = { 'n' : CodePointSet(0x0A),
                  'r' : CodePointSet(0x0D),
                  't' : CodePointSet(0x09) }
# Each of these characters escapes to itself.
for c in r'\|.-^?*+{}()[]':
    SingleCharEsc[c] = CodePointSet(ord(c))

# Production 25 : Category Escapes
# Production 26: Complemented Category Escapes
# Both the plain (p{...}) and complemented (P{...}) forms are stored in
# catEsc; complEsc is created but not populated here.
catEsc = { }
complEsc = { }
# items() rather than iteritems() so the loop runs on Python 2 and 3.
for k, v in PropertyMap.items():
    catEsc[u'p{%s}' % (k,)] = v
    catEsc[u'P{%s}' % (k,)] = v.negate()

# Production 36 : IsBlock escapes
IsBlockEsc = { }
for k, v in BlockMap.items():
    IsBlockEsc[u'p{Is%s}' % (k,)] = v
    IsBlockEsc[u'P{Is%s}' % (k,)] = v.negate()

# Production 37 : Multi-Character Escapes
# The wildcard matches everything except newline and carriage return.
WildcardEsc = CodePointSet(ord('\n'), ord('\r')).negate()
# Each upper-case escape is the negation of its lower-case counterpart,
# except that \W is defined directly and \w is its negation.
MultiCharEsc = { }
MultiCharEsc['s'] = CodePointSet(0x20, ord('\t'), ord('\n'), ord('\r'))
MultiCharEsc['S'] = MultiCharEsc['s'].negate()
MultiCharEsc['i'] = CodePointSet(XML1p0e2.Letter).add(ord('_')).add(ord(':'))
MultiCharEsc['I'] = MultiCharEsc['i'].negate()
MultiCharEsc['c'] = CodePointSet(XML1p0e2.NameChar)
MultiCharEsc['C'] = MultiCharEsc['c'].negate()
MultiCharEsc['d'] = PropertyMap['Nd']
MultiCharEsc['D'] = MultiCharEsc['d'].negate()
MultiCharEsc['W'] = CodePointSet(PropertyMap['P']).extend(PropertyMap['Z']).extend(PropertyMap['C'])
MultiCharEsc['w'] = MultiCharEsc['W'].negate()