6 from config.mdf
import mdf_lmf
7 from core.lexicon
import Lexicon
8 from core.lexical_entry
import LexicalEntry
9 from utils.io
import open_read, EOL, ENCODING
10 from utils.error_handling
import Warning, Error
11 from utils.ipa2sampa
import uni2sampa
13 def mdf_read(filename=None, mdf2lmf=mdf_lmf, lexicon=None, id=None, encoding=ENCODING):
14 """! @brief Read an MDF file.
15 @param filename The name of the MDF file to read with full path, for instance 'user/input.txt'.
16 @param mdf2lmf A Python dictionary describing the mapping between MDF markers and their LMF representation. The default value is the 'mdf_lmf' dictionary defined in 'pylmflib/config/mdf.py'; please refer to it as an example.
17 @param lexicon An existing Lexicon to fill with lexical entries to read.
18 @param id A Python string identifying the lexicon to create.
19 @param encoding The 'utf-8' encoding is used by default. Otherwise, the user has to specify the native encoding of the document.
20 @return A Lexicon instance containing all lexical entries.
28 filename = lexicon.get_entrySource()
31 lexicon.set_entrySource(filename)
33 mdf_file =
open_read(filename, encoding=encoding)
35 mdf_pattern =
"""^\\\(\w*) (<(.*)>)? ?(.*)$"""
40 for line
in mdf_file.readlines():
43 result = re.match(mdf_pattern, line)
47 marker = result.group(1)
48 attrs = result.group(3)
49 value = result.group(4)
54 value = value.rstrip(
' \r\n')
61 lf = value.split(
" = ")
62 if lf[0].startswith(
"Component"):
63 component_nb = lf[0].lstrip(
"Component")
67 if marker ==
"lx" or marker ==
"se" or is_mwe:
70 if marker ==
"se" or is_mwe:
72 sub_entry = LexicalEntry(uid)
74 mdf2lmf[
"lx"](value, sub_entry)
76 lexicon.add_lexical_entry(sub_entry)
78 if main_entry
is None:
79 main_entry = current_entry
81 current_entry = main_entry
83 homonym_nb = current_entry.get_homonymNumber()
84 if homonym_nb
is None:
87 sub_entry.create_and_add_related_form(current_entry.get_lexeme() + homonym_nb,
"main entry")
89 current_entry.create_and_add_component(component_nb, value)
90 sub_entry.create_and_add_related_form(current_entry.get_lexeme() + homonym_nb,
"complex predicate")
91 sub_entry.set_independentWord(
False)
94 current_entry = LexicalEntry(uid)
96 lexicon.add_lexical_entry(current_entry)
101 if attrs
is not None:
105 attrs = attrs.replace(
'"',
'')
106 for attr
in attrs.split(
' '):
107 attributes.update({attr.split(
'=')[0] : attr.split(
'=')[1]})
109 mdf2lmf[
"__" + marker](attributes, value, current_entry)
111 mdf2lmf[marker](value, current_entry)
112 if sub_entry
is not None:
113 current_entry = sub_entry
117 print Warning(
"MDF marker '%s' encountered for lexeme '%s' is not defined in configuration" % (marker.encode(ENCODING), current_entry.get_lexeme().encode(ENCODING)))
118 except Error
as exception:
def open_read
Open a file in read mode (the file is automatically decoded to unicode).