Package pyxb :: Package utils :: Module activestate
[hide private]
[frames] | [no frames]

Source Code for Module pyxb.utils.activestate

 1  # -*- coding: utf-8 -*- 
 2  # This file contains code adopted from http://code.activestate.com/recipes/. 
 3  # Copyright and licensing information is associated with each incorporated 
 4  # recipe.  At this time, all code is licensed under PSF. 
 5   
 6  # http://code.activestate.com/recipes/363841/ 
 7  # by Lars Tiede, derivative from: 
 8  # http://code.activestate.com/recipes/52257/ 
 9  # by Paul Prescod 
10  # Licensed under PSF 
11  # No changes 
def detectXMLEncoding(fp):
    """Attempt to detect the character encoding of an XML file.

    Detection is tried in this order:

    - If the file starts with a known byte-order mark (BOM), the Python
      codec name of the corresponding Unicode encoding is returned
      (``utf_32_be``, ``utf_32_le``, ``utf_16_be``, ``utf_16_le`` or
      ``utf_8``).

    - Otherwise the first 2 KB are searched for an XML declaration and the
      value of its ``encoding`` attribute is returned verbatim.  The ``<``
      of the declaration has to be the very first character of the file.

    - If neither succeeds, ``None`` is returned.  According to XML 1.0 the
      document should be UTF-8 then, but that was not *detected* by the
      means offered here.

    Unlike the original recipe, this version copes with files shorter than
    four bytes (including empty files), with ``bytes`` data as returned by
    binary-mode files on Python 3, and it restores the file position even
    when reading fails.

    :param fp: seekable file object positioned anywhere in the file.  It
        should be opened in binary mode and must not be a codec-wrapped
        file object, otherwise the BOM cannot be seen.
    :return: the detected encoding name as a string, or ``None``.
    """
    import re

    # The BOMs we know, keyed by byte pattern.  ``None`` marks a position
    # that does not take part in the comparison for that BOM.
    bomDict = {
        (0x00, 0x00, 0xFE, 0xFF): "utf_32_be",
        (0xFF, 0xFE, 0x00, 0x00): "utf_32_le",
        (0xFE, 0xFF, None, None): "utf_16_be",
        (0xFF, 0xFE, None, None): "utf_16_le",
        (0xEF, 0xBB, 0xBF, None): "utf_8",
    }

    # Read the start of the file once and always put the caller's file
    # position back, even if seeking or reading raises.  We assume the XML
    # declaration fits into the first 2 KB.
    oldFP = fp.tell()
    try:
        fp.seek(0)
        prolog = fp.read(2048)
    finally:
        fp.seek(oldFP)

    # Turn the first (up to) four items into integers.  bytearray() yields
    # ints for byte strings on both Python 2 and 3; text strings fall back
    # to ord() per character.
    head = prolog[:4]
    if isinstance(head, bytes):
        lead = tuple(bytearray(head))
    else:
        lead = tuple(ord(ch) for ch in head)
    # Pad short files with None so they can still match the shorter BOMs.
    lead += (None,) * (4 - len(lead))

    # Try the 4-byte BOMs first so that UTF-32-LE (FF FE 00 00) is not
    # mistaken for UTF-16-LE (FF FE), then the 3-byte and 2-byte ones.
    for significant in (4, 3, 2):
        key = lead[:significant] + (None,) * (4 - significant)
        bomDetection = bomDict.get(key)
        if bomDetection:
            return bomDetection

    # Still here?  BOM detection failed, so assume a one-byte character
    # encoding that behaves like ASCII and look for the XML declaration.
    if isinstance(prolog, bytes) and not isinstance(prolog, str):
        # Python 3 bytes: latin-1 maps every byte to exactly one character,
        # so the text pattern below can be applied without decode errors.
        prolog = prolog.decode("latin-1")

    xmlDeclPattern = r"""
    ^<\?xml             # w/o BOM, xmldecl starts with <?xml at the first byte
    .+?                 # some chars (version info), matched minimal
    encoding=           # encoding attribute begins
    ["']                # attribute start delimiter
    (?P<encstr>         # what's matched in the brackets will be named encstr
     [^"']+             # every character not delimiter (not overly exact!)
    )                   # closes the brackets pair for the named group
    ["']                # attribute end delimiter
    .*?                 # some chars optionally (standalone decl or whitespace)
    \?>                 # xmldecl end
    """
    xmlDeclRE = re.compile(xmlDeclPattern, re.VERBOSE)

    match = xmlDeclRE.search(prolog)
    if match:
        return match.group("encstr")
    return None
96