Python LMF library
 All Classes Namespaces Files Functions Variables
mdf.py
Go to the documentation of this file.
1 #! /usr/bin/env python
2 
3 """! @package input
4 """
5 
6 from config.mdf import mdf_lmf
7 from core.lexicon import Lexicon
8 from core.lexical_entry import LexicalEntry
9 from utils.io import open_read, EOL, ENCODING
10 from utils.error_handling import Warning, Error
11 from utils.ipa2sampa import uni2sampa
12 
def mdf_read(filename=None, mdf2lmf=mdf_lmf, lexicon=None, id=None, encoding=ENCODING):
    """! @brief Read an MDF file.
    Every line made of a backslash-prefixed marker, an optional '<name=value ...>' attribute group and a value is dispatched to the function registered for that marker in 'mdf2lmf'. Markers 'lx' and 'se', as well as 'lf' fields of the form 'ComponentN = value', create a new lexical entry in the lexicon.
    @param filename The name of the MDF file to read with full path, for instance 'user/input.txt'. If None, the entry source of the lexicon is used instead.
    @param mdf2lmf A Python dictionary describing the mapping between MDF markers and LMF representation. Default value is 'mdf_lmf' dictionary defined in 'pylmflib/config/mdf.py'. Please refer to it as an example.
    @param lexicon An existing Lexicon to fill with lexical entries to read. If None, a new one is created.
    @param id A Python string identifying the lexicon to create.
    @param encoding Use 'utf-8' encoding by default. Otherwise, the user has to specify the native encoding of the document.
    @return A Lexicon instance containing all lexical entries.
    """
    import re
    # If not provided, create a Lexicon instance to contain all lexical entries
    if lexicon is None:
        lexicon = Lexicon(id)
    if filename is None:
        # Fall back on the source recorded in the lexicon
        filename = lexicon.get_entrySource()
    else:
        # Record the source in the lexicon
        lexicon.set_entrySource(filename)
    # MDF syntax is the following: '\marker value' or '\marker <attrs> value'.
    # Groups: 1 = marker, 3 = attributes (without angle brackets), 4 = value.
    # Raw string, compiled once instead of being looked up for every line.
    mdf_pattern = re.compile(r"^\\(\w*) (<(.*)>)? ?(.*)$")
    component_prefix = "Component"
    # Read in unicode
    mdf_file = open_read(filename, encoding=encoding)
    try:
        # Entry receiving the fields currently being read
        current_entry = None
        # Subentry just created, not yet promoted to current entry
        sub_entry = None
        # Entry introduced by the last 'lx' marker once a subentry has been met
        main_entry = None
        for line in mdf_file.readlines():
            # Do not parse empty lines
            if line == EOL:
                continue
            result = mdf_pattern.match(line)
            if result is None:
                # Line does not follow MDF syntax => continue parsing next line
                continue
            marker = result.group(1)
            attrs = result.group(3)
            value = result.group(4)
            # Skip lines without a marker name, and markers starting with an
            # underscore character (e.g. '_sh' and '_DateStampHasFourDigitYear')
            if not marker or marker.startswith('_'):
                continue
            # Remove trailing spaces and end-of-line characters
            value = value.rstrip(' \r\n')
            # Do not consider empty fields
            if value == "":
                continue
            # Check if the current field is a component of a multiword expression
            is_mwe = False
            if marker == "lf":
                lf = value.split(" = ")
                # Require both sides of ' = ', otherwise keep it as a plain 'lf' field
                if lf[0].startswith(component_prefix) and len(lf) > 1:
                    # Drop the prefix itself: 'lstrip' would strip a set of characters
                    component_nb = lf[0][len(component_prefix):]
                    value = lf[1]
                    is_mwe = True
            # 'lx' and 'se' markers indicate a new entry
            if marker == "lx" or marker == "se" or is_mwe:
                # Compute a unique identifier
                uid = uni2sampa(value)
                if marker == "se" or is_mwe:
                    # Create a subentry
                    sub_entry = LexicalEntry(uid)
                    # An MDF subentry corresponds to an LMF lexical entry
                    mdf2lmf["lx"](value, sub_entry)
                    # Add it to the lexicon
                    lexicon.add_lexical_entry(sub_entry)
                    # Manage main entry: remember it at the first subentry,
                    # come back to it for the following ones
                    if main_entry is None:
                        main_entry = current_entry
                    else:
                        current_entry = main_entry
                    # Link the subentry to its main entry
                    homonym_nb = current_entry.get_homonymNumber()
                    if homonym_nb is None:
                        homonym_nb = ""
                    if marker == "se":
                        sub_entry.create_and_add_related_form(current_entry.get_lexeme() + homonym_nb, "main entry")
                    elif is_mwe:
                        current_entry.create_and_add_component(component_nb, value)
                        sub_entry.create_and_add_related_form(current_entry.get_lexeme() + homonym_nb, "complex predicate")
                        sub_entry.set_independentWord(False)
                else:
                    # Create a new entry
                    current_entry = LexicalEntry(uid)
                    # Add it to the lexicon
                    lexicon.add_lexical_entry(current_entry)
                    # Reset main entry
                    main_entry = None
            # Map MDF marker and value to LMF representation
            try:
                if attrs is not None:
                    # There are attributes
                    attributes = {}
                    # Remove quotation marks from attributes if any
                    attrs = attrs.replace('"', '')
                    for attr in attrs.split(' '):
                        # Ignore tokens that are not 'name=value' pairs
                        # (e.g. empty strings produced by repeated spaces)
                        if '=' not in attr:
                            continue
                        name_and_value = attr.split('=')
                        attributes[name_and_value[0]] = name_and_value[1]
                    # A customized marker starts with '__' characters
                    mdf2lmf["__" + marker](attributes, value, current_entry)
                else:
                    mdf2lmf[marker](value, current_entry)
                # Following fields belong to the subentry that has just been created
                if sub_entry is not None:
                    current_entry = sub_entry
                    sub_entry = None
            except KeyError:
                # No entry has been created yet if the marker appears before any 'lx'
                if current_entry is not None:
                    lexeme = current_entry.get_lexeme()
                else:
                    lexeme = u""
                # When printing, we need to convert 'unicode' into 'str' using 'utf-8' encoding:
                print(Warning("MDF marker '%s' encountered for lexeme '%s' is not defined in configuration" % (marker.encode(ENCODING), lexeme.encode(ENCODING))))
            except Error as exception:
                exception.handle()
    finally:
        # Release the file even if parsing fails
        mdf_file.close()
    return lexicon
def mdf_read
Read an MDF file.
Definition: mdf.py:13
def open_read
Open file in read mode (automatically decode file in unicode).
Definition: io.py:36