Python LMF library
 All Classes Namespaces Files Functions Variables
xml_lmf.py
Go to the documentation of this file.
1 #! /usr/bin/env python
2 # -*- coding: utf-8 -*-
3 
4 """! @package output
5 """
6 
7 from utils.xml_format import write_result, Element, SubElement
8 from utils.io import ENCODING
9 
def xml_lmf_write(object, filename):
    """! @brief Serialize an LMF instance into an XML LMF file.
    @param object The LMF instance to serialize; its class name is used as tag of the root XML element.
    @param filename The name of the XML LMF file to produce with full path, for instance 'user/output.xml'.
    """
    # The root XML element is named after the class of the LMF instance
    root_tag = object.__class__.__name__
    xml_root = Element(root_tag)
    # Populate the XML tree from the attributes of the instance
    build_sub_elements(object, xml_root)
    # Dump the complete XML tree into the output file
    write_result(xml_root, filename)
21 
def build_sub_elements(object, element):
    """! @brief Create XML sub-elements to an existing XML element by parsing an LMF object instance.
    Public attributes (name not starting with '_') whose value is not None are processed as follows:
    - a list yields one sub-element per member, named after the member class and filled recursively;
    - a value which is neither a boolean, an integer nor a string yields one sub-element, filled recursively;
    - "dtdVersion", "id" and "targets" are set as XML attributes of the element itself;
    - any other value yields a <feat att="name" val="value"/> sub-element, post-processed by the layout handlers.
    @param object An LMF object instance.
    @param element XML element for which sub-elements have to be created according to LMF object attributes.
    """
    # Parse instance attributes
    for attr_name, attr_value in object.__dict__.items():
        # Only defined public attributes are serialized
        if attr_name.startswith('_') or attr_value is None:
            continue
        # Handle boolean values: they are written through their text form
        if type(attr_value) is bool:
            attr_value = unicode(attr_value)
        if type(attr_value) is list:
            # We suppose that a list always contains objects
            for member in attr_value:
                sub_element = SubElement(element, member.__class__.__name__)
                build_sub_elements(member, sub_element)
            continue
        if type(attr_value) not in [int, str, unicode]:
            # The attribute is itself a class instance: create an XML element and restart the same operation recursively on this object
            sub_element = SubElement(element, attr_value.__class__.__name__)
            build_sub_elements(attr_value, sub_element)
            continue
        # From here the value is a scalar. It must be text in both remaining cases:
        # XML attribute values are strings and the layout handlers apply regular expressions on "val"
        if type(attr_value) is int:
            attr_value = unicode(attr_value)
        if attr_name in ["dtdVersion", "id", "targets"]:
            # Special attributes must be inserted as XML element attributes
            element.attrib.update({attr_name: attr_value})
            if attr_name == "targets":
                add_link(object, element)
            continue
        # In all other cases, an XML sub-element must be created with the keyword name "feat"
        feat = SubElement(element, "feat", att=attr_name, val=attr_value)
        # Handle reserved characters and fonts
        handle_reserved(feat)
        handle_fv(feat)
        handle_fn(feat)
        handle_font(feat)
        # Special formatting
        handle_pinyin(feat)
        handle_tones(feat)
        handle_caps(feat)
66 
67 ## Functions to process XML/XHTML layout
68 
def add_link(object, element):
    """Insert an hyperlink <a href="xxx">xxx</a> as first child of an XML element.

    Nothing is inserted unless 'options.cross_references' is set and
    'object.get_lexical_entry().get_id()' provides an identifier.
    The link targets this identifier and displays the "targets" attribute of the element.
    @param object Instance from which the identifier is retrieved.
    @param element XML element owning a "targets" attribute, in which the link is inserted.
    @return The tuple (object, element).
    """
    # To access options: the name is bound at module level, hence the declaration,
    # which has to precede the import binding the name
    global options
    from pylmflib import options
    if options.cross_references:
        # Retrieve identifier
        try:
            entry_id = object.get_lexical_entry().get_id()
        except AttributeError:
            # No lexical entry can be reached from this object
            entry_id = None
        if entry_id is not None:
            # Create link
            link = Element("a")
            link.attrib["href"] = entry_id
            link.text = element.attrib["targets"]
            # Insert link in element
            element.insert(0, link)
    return (object, element)
89 
def handle_reserved(element):
    """Handle reserved characters.

    No processing is applied for now: the element is handed back untouched.
    @param element XML element to process.
    @return The same XML element.
    """
    return element
94 
def handle_fv(element):
    """Replace 'fv:xxx' and '|fv{xxx}' by '<span class="vernacular">xxx</span>'.

    The "val" attribute of the element is scanned from its beginning. For each marker found,
    the text located before it becomes the element text (first marker) or the tail of the
    previous span, and a span child is inserted. The "val" attribute itself is left unchanged.
    @param element XML element owning a "val" attribute.
    @return The same XML element.
    """
    import re
    # Find text to display in vernacular font.
    # DOTALL makes the trailing '(.*)' cross line breaks, otherwise text following a newline is lost
    pattern = re.compile(r"(([^:\|]*)fv:([^\s\.,)]*)(.*))|(([^:\|]*)\|fv{([^}]*)}(.*))", re.DOTALL)
    result = pattern.match(element.attrib["val"])
    # Initialize loop variables
    previous_span = None
    index = 0
    while result:
        if result.group(1) is not None:
            # 'fv:xxx' form
            before, vernacular, after = result.group(2, 3, 4)
        else:
            # '|fv{xxx}' form
            before, vernacular, after = result.group(6, 7, 8)
        # Handle previous span or element
        if previous_span is None:
            element.text = before
        else:
            previous_span.tail = before
        # Create span
        span = Element("span")
        span.attrib["class"] = "vernacular"
        span.text = vernacular
        # Insert span in element
        element.insert(index, span)
        # Look for the next marker in the remaining text
        result = pattern.match(after)
        if not result:
            span.tail = after
        # Update loop variables
        previous_span = span
        index += 1
    return element
133 
def handle_fn(element):
    """Replace 'fn:xxx' and '|fn{xxx}' by '<span class="national">xxx</span>'.

    The "val" attribute of the element is scanned from its beginning. For each marker found,
    the text located before it becomes the element text (first marker) or the tail of the
    previous span, and a span child is inserted. The "val" attribute itself is left unchanged.
    @param element XML element owning a "val" attribute.
    @return The same XML element.
    """
    import re
    # Find text to display in national font.
    # DOTALL makes the trailing '(.*)' cross line breaks, otherwise text following a newline is lost
    pattern = re.compile(r"([^:\|]*)((fn:([^\s\.,)]*)|(\|fn{([^}]*)})))(.*)", re.DOTALL)
    result = pattern.match(element.attrib["val"])
    # Initialize loop variables
    previous_span = None
    index = 0
    while result:
        before = result.group(1)
        # Group 4 is set by the 'fn:xxx' form, group 6 by the '|fn{xxx}' form
        national = result.group(4)
        if national is None:
            national = result.group(6)
        after = result.group(7)
        # Handle previous span or element
        if previous_span is None:
            element.text = before
        else:
            previous_span.tail = before
        # Create span
        span = Element("span")
        span.attrib["class"] = "national"
        span.text = national
        # Insert span in element
        element.insert(index, span)
        # Look for the next marker in the remaining text
        result = pattern.match(after)
        if not result:
            span.tail = after
        # Update loop variables
        previous_span = span
        index += 1
    return element
170 
def handle_font(element):
    """Replace '{xxx}' by '<span class="ipa">xxx</span>'.

    The "val" attribute of the element is scanned from its beginning. For each '{xxx}' found,
    the text located before it becomes the element text (first occurrence) or the tail of the
    previous span, and a span child is inserted. The "val" attribute itself is left unchanged.
    @param element XML element owning a "val" attribute.
    @return The same XML element.
    """
    import re
    # Find text to display in IPA.
    # DOTALL makes the trailing '(.*)' cross line breaks, otherwise text following a newline is lost
    pattern = re.compile(r"([^{}]*){([^}]*)}(.*)", re.DOTALL)
    result = pattern.match(element.attrib["val"])
    # Initialize loop variables
    previous_span = None
    index = 0
    while result:
        before, ipa, after = result.groups()
        # Handle previous span or element
        if previous_span is None:
            element.text = before
        else:
            previous_span.tail = before
        # Create span
        span = Element("span")
        span.attrib["class"] = "ipa"
        span.text = ipa
        # Insert span in element
        element.insert(index, span)
        # Look for the next occurrence in the remaining text
        result = pattern.match(after)
        if not result:
            span.tail = after
        # Update loop variables
        previous_span = span
        index += 1
    return element
204 
def handle_pinyin(element):
    """Replace '@xxx' by '<span class="pinyin">xxx</span>'.

    The "val" attribute of the element is scanned from its beginning. For each '@xxx' found,
    the text located before it becomes the element text (first occurrence) or the tail of the
    previous span, and a span child is inserted. The "val" attribute itself is left unchanged.
    @param element XML element owning a "val" attribute.
    @return The same XML element.
    """
    import re
    # Find pinyin.
    # UNICODE makes '\w' accept accented letters (tone diacritics) whatever the Python version;
    # DOTALL makes the trailing '(.*)' cross line breaks, otherwise text following a newline is lost
    pattern = re.compile(r"([^@]*)@(\w*)(.*)", re.UNICODE | re.DOTALL)
    result = pattern.match(element.attrib["val"])
    # Initialize loop variables
    previous_span = None
    index = 0
    while result:
        before, pinyin, after = result.groups()
        # Handle previous span or element
        if previous_span is None:
            element.text = before
        else:
            previous_span.tail = before
        # Create span
        span = Element("span")
        span.attrib["class"] = "pinyin"
        span.text = pinyin
        # Insert span in element
        element.insert(index, span)
        # Look for the next occurrence in the remaining text
        result = pattern.match(after)
        if not result:
            span.tail = after
        # Update loop variables
        previous_span = span
        index += 1
    return element
238 
def handle_caps(element):
    """Handle small caps.
    Replace '°xxx' by '<span class="sc">xxx</span>'.

    The "val" attribute of the element is scanned from its beginning. For each marker found,
    the text located before it becomes the element text (first marker) or the tail of the
    previous span, and a span child is inserted. The "val" attribute itself is left unchanged.
    Matching is performed on Unicode text: on encoded bytes, the degree sign is a 2-byte
    sequence, so a character class built on it rejects any character sharing one of these bytes.
    @param element XML element owning a "val" attribute (expected to be Unicode text).
    @return The same XML element.
    """
    import re
    # Find text to display in small caps (the marker is the degree sign U+00B0).
    # DOTALL makes the trailing '(.*)' cross line breaks, otherwise text following a newline is lost
    pattern = re.compile(u"([^\u00b0]*)\u00b0([^\\s\\.,)+/:]*)(.*)", re.DOTALL)
    result = pattern.match(element.attrib["val"])
    # Initialize loop variables
    previous_span = None
    index = 0
    while result:
        before, sc, after = result.groups()
        # Handle previous span or element
        if previous_span is None:
            element.text = before
        else:
            previous_span.tail = before
        # Create span
        span = Element("span")
        span.attrib["class"] = "sc"
        span.text = sc
        # Insert span in element
        element.insert(index, span)
        # Look for the next marker in the remaining text
        result = pattern.match(after)
        if not result:
            span.tail = after
        # Update loop variables
        previous_span = span
        index += 1
    return element
273 
def handle_tones(element):
    """Replace tones subscripts by '<sub>xxx</sub>'.

    Only elements whose "att" attribute is "tone" or "lexeme" are processed:
    - "tone": each character of "val" among 'abcd123' becomes a <sub> child, other characters
      are appended to the element text or to the tail of the previous <sub>;
    - "lexeme": "val" is matched against words of 1 to 5 syllables, each syllable being made of
      characters, tone letters and an optional subscript among 'abcd123'; only the subscript of
      the last syllable becomes a <sub> child, the rest of the word is put in the element text.
    When the resulting element text equals "val", it is reset to None.
    The "val" attribute itself is left unchanged.
    @param element XML element owning "att" and "val" attributes.
    @return The same XML element.
    """
    import re
    att = element.attrib["att"]
    val = element.attrib["val"]
    if att == "tone":
        subscripts = set("abcd123")
        # Initialize loop variables
        previous_sub = None
        if element.text is None:
            element.text = ""
        index = 0
        for c in val:
            if c in subscripts:
                # Create sub
                sub = Element("sub")
                sub.text = c
                sub.tail = ""
                # Insert sub in element
                element.insert(index, sub)
                # Update loop variables
                previous_sub = sub
                index += 1
            elif previous_sub is None:
                # No sub yet: plain characters belong to the element text
                element.text += c
            else:
                previous_sub.tail += c
        if element.text == val:
            # Reset if identical
            element.text = None
        return element
    if att != "lexeme":
        return element
    # Tone letters U+02E9, U+02E7 and U+02E5, written with escapes so that they depend
    # neither on the encoding of the source file nor on the output encoding
    tones = u"\u02e9\u02e7\u02e5"
    # Monosyllabic
    current_pattern = "([^" + tones + "#$]+)(#?[" + tones + "]{1,2}[$#]?)([abcd123]?)"
    result = re.match("^" + current_pattern + "$", val)
    if result:
        element.text = result.group(1) + result.group(2)
        subscript = result.group(3)
        if len(subscript) != 0:
            # Create sub
            sub = Element("sub")
            sub.text = subscript
            # Insert sub in element
            element.insert(0, sub)
        if element.text == val:
            # Reset if identical
            element.text = None
        return element
    # Disyllabic: add a constraint on other syllables which must have at least 2 characters (maximum 5)
    syllable = "([^" + tones + "#$]{2,5})(#?[" + tones + "]{1,2}[$#]?)([abcd123]?)"
    # Handle words composed of 2, 3, 4, 5 syllables
    for syllable_nb in range(2, 6):
        current_pattern += syllable
        result = re.match("^" + current_pattern + "$", val)
        if not result:
            continue
        groups = result.groups()
        # Only the subscript of the last syllable is rendered as <sub>;
        # everything located before it is kept as plain text
        subscript = groups[-1]
        if element.text is None:
            element.text = ""
        element.text += "".join(groups[:-1])
        if len(subscript) != 0:
            # Create sub
            sub = Element("sub")
            sub.text = subscript
            sub.tail = ""
            # Insert sub in element, at the rank of the last syllable
            element.insert(syllable_nb - 1, sub)
        if element.text == val:
            # Reset if identical
            element.text = None
        return element
    # No syllable structure recognized: element is left as is
    return element
def write_result
Write an XML element into a pretty XML output file.
Definition: xml_format.py:24
def xml_lmf_write
Write an XML LMF file.
Definition: xml_lmf.py:10
def add_link
Functions to process XML/XHTML layout.
Definition: xml_lmf.py:69
def build_sub_elements
Create XML sub-elements to an existing XML element by parsing an LMF object instance.
Definition: xml_lmf.py:22