# Source code for gramps.gen.datehandler._dateparser

# -*- coding: utf-8 -*-
#
# Gramps - a GTK+/GNOME based genealogy program
#
# Copyright (C) 2004-2006  Donald N. Allingham
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
#

"""
Date parsing class. Serves as the base class for any localized
date parsing class. The default base class provides parsing for English.
"""

#-------------------------------------------------------------------------
#
# Python modules
#
#-------------------------------------------------------------------------
from __future__ import print_function, unicode_literals

import re
import calendar

#-------------------------------------------------------------------------
#
# set up logging
#
#-------------------------------------------------------------------------
import logging
log = logging.getLogger(".DateParser")

#-------------------------------------------------------------------------
#
# GRAMPS modules
#
#-------------------------------------------------------------------------
from ..lib.date import Date, DateError, Today
from . import _grampslocale
from ..utils.grampslocale import GrampsLocale
from ._datestrings import DateStrings

#-------------------------------------------------------------------------
#
# Top-level module functions
#
#-------------------------------------------------------------------------
# Number of days in each month (index 0 is January) for a common year
# and for a leap year respectively.
_max_days  = [ 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 ]
_leap_days = [ 31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 ]

def gregorian_valid(date_tuple):
    """
    Check if date_tuple is a valid date in the Gregorian calendar.

    :param date_tuple: sequence starting with (day, month, year)
    :return: False if the month is above 12, if the day exceeds the length
             of the month (February has 29 days in Gregorian leap years),
             or if the components cannot be evaluated at all (for example
             the year is None); True otherwise
    """
    day = date_tuple[0]
    month = date_tuple[1]
    try:
        if month > 12:
            return False
        if calendar.isleap(date_tuple[2]):
            month_lengths = _leap_days
        else:
            month_lengths = _max_days
        # month 0 (incomplete date) indexes the last entry, as before
        if day > month_lengths[month-1]:
            return False
    except (TypeError, IndexError):
        # TypeError: a component is None or otherwise not a number;
        # IndexError: month is out of range or the tuple is too short
        return False
    return True

def julian_valid(date_tuple):
    """
    Check if date_tuple is a valid date in the Julian calendar.

    :param date_tuple: sequence starting with (day, month, year)
    :return: False if the month is above 12, if the day exceeds the length
             of the month, or if the components cannot be evaluated at all
             (for example the year is None); True otherwise
    """
    day = date_tuple[0]
    month = date_tuple[1]
    try:
        if month > 12:
            return False
        # the Julian calendar has a leap year every fourth year,
        # without the Gregorian century exceptions
        if date_tuple[2] % 4 == 0:
            month_lengths = _leap_days
        else:
            month_lengths = _max_days
        if day > month_lengths[month-1]:
            return False
    except (TypeError, IndexError):
        # TypeError: a component is None or otherwise not a number;
        # IndexError: month is out of range or the tuple is too short
        return False
    return True

def swedish_valid(date_tuple):
    """
    Check if date_tuple is a valid date in the Swedish calendar.

    :param date_tuple: sequence starting with (day, month, year)
    :return: True for 1712-02-30 and for every Julian-valid date from
             1700-03-01 up to (not including) 1712-03-01; False otherwise,
             including for dates whose components cannot be compared
             (for example the year is None)
    """
    valid_J = julian_valid(date_tuple)
    ymd = (date_tuple[2], date_tuple[1], date_tuple[0])
    # Swedish calendar starts as Julian 1700-03-01 and ends 1712-03-01 as Julian
    try:
        in_period = (1700, 2, 29) <= ymd < (1712, 3, 1)
    except TypeError:
        # incomplete date (e.g. year is None): tuples of None and int
        # cannot be ordered on Python 3, so treat the date as invalid
        # like the other *_valid functions do
        return False
    if not in_period:
        return False
    if ymd == (1712, 2, 30):  # extra day was inserted 1712, not valid Julian
        return True
    if ymd == (1700, 2, 29):  # leapday 1700 was skipped
        return False
    return valid_J

def french_valid(date_tuple):
    """
    Check if date_tuple is a valid date in the French Republican calendar.

    Only the year is checked: it must be at least 1.

    :param date_tuple: sequence starting with (day, month, year)
    :return: True if the year is 1 or later; False otherwise, including
             when the year cannot be compared (for example it is None)
    """
    # year 1 starts on 22.9.1792
    try:
        return not date_tuple[2] < 1
    except TypeError:
        # incomplete date without a year: None cannot be ordered against
        # an int on Python 3, report it as invalid instead of raising
        return False

def _build_prefix_table(month_to_int, month_variants):
    """
    Populate a DateParser.month_to_int-like dict
    with all the prefixes found in month_variants.
    """

    month_variants = list(month_variants) # drain the generator, if any
    month_to_int_new = {}

    # Populate with full names first, w/o prefixes
    log.debug("Mapping full names...")
    for i in range(0,len(month_variants)):
        for month in month_variants[i]:
            m = month.lower()
            log.debug("Mapping {} -> {}".format(m, i))
            month_to_int_new[m] = i
    month_to_int.update(month_to_int_new)

    log.debug("Mapping new prefixes...")
    months_sorted = list(month_to_int_new.keys())
    months_sorted.sort(key=len, reverse=True)
    for m in months_sorted:
        for prefixlen in reversed(range(1,len(m))):
            mp = m[:prefixlen]
            if mp.strip() != mp:
                continue
            if mp in month_to_int:
                break
            else:
                i = month_to_int[m]
                log.debug("Mapping {} -> {}".format(mp, i))
                month_to_int[mp] = i

def _generate_variants(months):
    """
    Generate all month variants for passing to _build_prefix_table
    :param months: an iterable ordered collection, 1st item is empty, the rest
                   1..N, for a calendar with N months overall, contain, each,
                   an iterable of alternative specifications.
                   Each such specification can be:
                     1) a Lexeme, supporting .variants() to return the list of
                        variants underneath
                     2) a literal string
                     3) a |-separated string of alternatives
                   Empty strings are discarded.
    :return: generator of lists per month with all variants listed once only
             the 1st item will be empty
    """

    for month_lexemes_and_alternatives in months:
        v = []
        for m in month_lexemes_and_alternatives:
            try:
                # Lexeme? ask it to compute the variants it knows
                mv = list(m.variants())
            except AttributeError:
                # plain string, not a lexeme with inflections...
                # Maybe a '|'-separated list of alternatives, maybe empty,
                # maybe a single string. Suppress empty strings!
                mv = (s for s in m.split('|') if s)
            v.extend(mv)
        yield(list(set(v)))

#-------------------------------------------------------------------------
#
# DateParser class
#
#-------------------------------------------------------------------------
class DateParser(object):
    """
    Convert a text string into a :class:`.Date` object. If the date cannot be
    converted, the text string is assigned.
    """

    _locale = GrampsLocale(lang='en', languages='en')

    # picks the three directive letters out of a strftime-like format
    _fmt_parse = re.compile(r".*%(\S).*%(\S).*%(\S).*")

    # RFC-2822 only uses capitalized English abbreviated names, no locales.
    _rfc_days = ('Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat')
    _rfc_mons_to_int = {
        'Jan' : 1, 'Feb' : 2, 'Mar' : 3, 'Apr' : 4,
        'May' : 5, 'Jun' : 6, 'Jul' : 7, 'Aug' : 8,
        'Sep' : 9, 'Oct' : 10, 'Nov' : 11, 'Dec' : 12,
        }

    # seeded with __init_prefix_tables
    swedish_to_int = month_to_int = {}
    """
    Map Gregorian month names and their prefixes, wherever unambiguous,
    to the relevant integer index (1..12).
    """

    # modifiers before the date
    # (overridden if a locale-specific date parser exists)
    modifier_to_int = {
        'before' : Date.MOD_BEFORE, 'bef'    : Date.MOD_BEFORE,
        'bef.'   : Date.MOD_BEFORE, 'after'  : Date.MOD_AFTER,
        'aft'    : Date.MOD_AFTER,  'aft.'   : Date.MOD_AFTER,
        'about'  : Date.MOD_ABOUT,  'abt.'   : Date.MOD_ABOUT,
        'abt'    : Date.MOD_ABOUT,  'circa'  : Date.MOD_ABOUT,
        'c.'     : Date.MOD_ABOUT,  'around' : Date.MOD_ABOUT,
        }
    # in some languages some of above listed modifiers are after the date,
    # in that case the subclass should put them into this dictionary instead
    modifier_after_to_int = {}

    hebrew_to_int = {
        "tishri"  : 1,   "heshvan" : 2,   "kislev"  : 3,
        "tevet"   : 4,   "shevat"  : 5,   "adari"   : 6,
        "adarii"  : 7,   "nisan"   : 8,   "iyyar"   : 9,
        "sivan"   : 10,  "tammuz"  : 11,  "av"      : 12,
        "elul"    : 13,
        #alternative spelling
        "cheshvan": 2,   "adar sheni":  7, "iyar"    : 9,
        #GEDCOM months
        # "csh" is Cheshvan, i.e. month 2 like "heshvan"/"cheshvan" above
        # (month 5, Shevat, is "shv")
        "tsh" : 1, "csh": 2, "ksl": 3, "tvt": 4, "shv": 5, "adr": 6,
        "ads" : 7, "nsn": 8, "iyr": 9, "svn":10, "tmz":11, "aav":12,
        "ell":13,
        }

    french_to_int = {
        # the long ones are seeded with __init_prefix_tables
        #GEDCOM months
        'vend' : 1,  'brum' : 2,  'frim' : 3,  'nivo' : 4,  'pluv' : 5,
        'vent' : 6,  'germ' : 7,  'flor' : 8,  'prai' : 9,  'mess' : 10,
        'ther' : 11, 'fruc' : 12, 'comp' : 13,
        }

    islamic_to_int = {
        # some are already seeded with __init_prefix_tables,
        # but it is a pain to separate them out from the variants...
        "muharram"           : 1,  "muharram ul haram"  : 1,
        "safar"              : 2,  "rabi`al-awwal"      : 3,
        "rabi'l"             : 3,  "rabi`ul-akhir"      : 4,
        "rabi`ath-thani"     : 4,  "rabi` ath-thani"    : 4,
        "rabi`al-thaany"     : 4,  "rabi` al-thaany"    : 4,
        "rabi' ii"           : 4,  "jumada l-ula"       : 5,
        "jumaada-ul-awwal"   : 5,  "jumaada i"          : 5,
        "jumada t-tania"     : 6,  "jumaada-ul-akhir"   : 6,
        # "jumaada ii" is the second Jumada, i.e. month 6 like the three
        # spellings just above (month 5 is "jumaada i")
        "jumaada al-thaany"  : 6,  "jumaada ii"         : 6,
        "rajab"              : 7,  "sha`ban"            : 8,
        "sha`aban"           : 8,  "ramadan"            : 9,
        "ramadhan"           : 9,  "shawwal"            : 10,
        "dhu l-qa`da"        : 11, "dhu qadah"          : 11,
        "thw al-qi`dah"      : 11, "dhu l-hijja"        : 12,
        "dhu hijja"          : 12, "thw al-hijjah"      : 12,
        }

    # seeded with __init_prefix_tables
    persian_to_int = { }

    bce = ["B.C.E.", "B.C.E", "BCE", "B.C.", "B.C", "BC" ]
    # (overridden if a locale-specific date parser exists)

    # seeded with __init_prefix_tables
    calendar_to_int = { }
    # (probably overridden if a locale-specific date parser exists)

    newyear_to_int = {
        "jan1":  Date.NEWYEAR_JAN1,
        "mar1":  Date.NEWYEAR_MAR1,
        "mar25": Date.NEWYEAR_MAR25,
        "sep1" : Date.NEWYEAR_SEP1,
        }

    quality_to_int = {
        'estimated'  : Date.QUAL_ESTIMATED,
        'est.'       : Date.QUAL_ESTIMATED,
        'est'        : Date.QUAL_ESTIMATED,
        'calc.'      : Date.QUAL_CALCULATED,
        'calc'       : Date.QUAL_CALCULATED,
        'calculated' : Date.QUAL_CALCULATED,
        }
    # (overridden if a locale-specific date parser exists)

    today = ["$T",]
    # Override with a list of *synonyms* for "today" in your language.
    # Note: the word "today" itself will already be pulled in from your
    # translation DB, see init_strings, so there is no need to override this
    # if you have no aliases for "today".
    # We also secretly support "$T" like in some reports.

    # languages whose prefix tables were already merged into the class dicts
    _langs = set()

    def __init_prefix_tables(self):
        """
        Seed the class-level month/calendar lookup dicts with the names
        (and their prefixes) for this parser's locale, once per language.
        """
        lang = self._locale.lang
        if lang in DateParser._langs:
            log.debug("Prefix tables for %s already built", lang)
            return
        DateParser._langs.add(lang)

        ds = DateStrings(self._locale)
        log.debug("Begin building parser prefix tables for %s", lang)
        _build_prefix_table(DateParser.month_to_int,
            _generate_variants(
                zip(ds.long_months, ds.short_months,
                    ds.swedish_SV, ds.alt_long_months)))
        _build_prefix_table(DateParser.hebrew_to_int,
            _generate_variants(zip(ds.hebrew)))
        _build_prefix_table(DateParser.french_to_int,
            _generate_variants(zip(ds.french)))
        _build_prefix_table(DateParser.islamic_to_int,
            _generate_variants(zip(ds.islamic)))
        _build_prefix_table(DateParser.persian_to_int,
            _generate_variants(zip(ds.persian)))
        _build_prefix_table(DateParser.calendar_to_int,
            _generate_variants(zip(ds.calendar)))

    def __init__(self):
        self.init_strings()
        # calendar id -> method parsing a textual date in that calendar
        self.parser = {
            Date.CAL_GREGORIAN : self._parse_gregorian,
            Date.CAL_JULIAN    : self._parse_julian,
            Date.CAL_FRENCH    : self._parse_french,
            Date.CAL_PERSIAN   : self._parse_persian,
            Date.CAL_HEBREW    : self._parse_hebrew,
            Date.CAL_ISLAMIC   : self._parse_islamic,
            Date.CAL_SWEDISH   : self._parse_swedish,
            }

        # derive the field order of all-numeric dates from the locale's
        # date format; fall back to day-month-year if it cannot be read
        fmt = _grampslocale.tformat
        match = self._fmt_parse.match(fmt.lower())
        if match:
            order = match.groups()
            self.dmy = order in (('d', 'm', 'y'), ('d', 'b', 'y'))
            self.ymd = order in (('y', 'm', 'd'), ('y', 'b', 'd'))
        else:
            self.dmy = True
            self.ymd = False

    def re_longest_first(self, keys):
        """
        Return a string for a RE group which contains the given keys
        sorted so that longest keys match first.  Any special characters
        (such as '.') are quoted.

        The argument itself is left untouched.
        """
        # sorted() rather than keys.sort(): callers pass shared class-level
        # lists (e.g. self.bce) that should not be reordered as a side effect
        ordered = sorted(keys, key=len, reverse=True)
        return '(' + '|'.join([re.escape(key) for key in ordered]) + ')'

    def init_strings(self):
        """
        This method compiles regular expression strings for matching dates.

        Most of the re's in most languages can stay as is. span and range
        most likely will need to change. Whatever change is done, this method
        may be called first as DateParser.init_strings(self) so that the
        invariant expressions don't need to be repeatedly coded. All
        differences can be coded after DateParser.init_strings(self) call,
        that way they override stuff from this method.

        .. seealso:: :class:`.DateParserRU` as an example.
        """
        _ = self._locale.translation.gettext
        self.__init_prefix_tables()

        self._rfc_mon_str = '(' + '|'.join(
            list(self._rfc_mons_to_int.keys())) + ')'
        self._rfc_day_str = '(' + '|'.join(self._rfc_days) + ')'

        self._bce_str = self.re_longest_first(self.bce)
        self._qual_str = self.re_longest_first(
            list(self.quality_to_int.keys()))
        self._mod_str = self.re_longest_first(
            list(self.modifier_to_int.keys()))
        self._mod_after_str = self.re_longest_first(
            list(self.modifier_after_to_int.keys()))

        self._mon_str  = self.re_longest_first(list(self.month_to_int.keys()))
        self._jmon_str = self.re_longest_first(list(self.hebrew_to_int.keys()))
        self._fmon_str = self.re_longest_first(list(self.french_to_int.keys()))
        self._pmon_str = self.re_longest_first(list(self.persian_to_int.keys()))
        self._imon_str = self.re_longest_first(list(self.islamic_to_int.keys()))
        self._smon_str = self.re_longest_first(list(self.swedish_to_int.keys()))
        self._cal_str  = self.re_longest_first(list(self.calendar_to_int.keys()))
        self._ny_str   = self.re_longest_first(list(self.newyear_to_int.keys()))

        self._today_str = self.re_longest_first(self.today + [_("today"),])

        # bce, calendar type and quality may be either at the end or at
        # the beginning of the given date string, therefore they will
        # be parsed from the middle and will be in match.group(2).
        self._bce_re = re.compile(r"(.*)\s+%s( ?.*)" % self._bce_str)

        self._cal = re.compile(r"(.*)\s+\(%s\)( ?.*)" % self._cal_str,
                               re.IGNORECASE)
        self._calny = re.compile(r"(.*)\s+\(%s,\s*%s\)( ?.*)"
                                 % (self._cal_str, self._ny_str),
                                 re.IGNORECASE)
        self._calny_iso = re.compile(
            r"(.*)\s+\(%s,\s*(\d{1,2}-\d{1,2})\)( ?.*)" % self._cal_str,
            re.IGNORECASE)

        self._ny = re.compile(r"(.*)\s+\(%s\)( ?.*)" % self._ny_str,
                              re.IGNORECASE)
        self._ny_iso = re.compile(r"(.*)\s+\((\d{1,2}-\d{1,2})\)( ?.*)")

        self._qual = re.compile(r"(.* ?)%s\s+(.+)" % self._qual_str,
                                re.IGNORECASE)

        self._span = re.compile(
            r"(from)\s+(?P<start>.+)\s+to\s+(?P<stop>.+)",
            re.IGNORECASE)
        # the dot of the abbreviation "bet." is a literal dot
        self._range = re.compile(
            r"(bet|bet\.|between)\s+(?P<start>.+)\s+and\s+(?P<stop>.+)",
            re.IGNORECASE)
        self._modifier = re.compile(r'%s\s+(.*)' % self._mod_str,
                                    re.IGNORECASE)
        self._modifier_after = re.compile(r'(.*)\s+%s' % self._mod_after_str,
                                          re.IGNORECASE)
        self._abt2 = re.compile('<(.*)>', re.IGNORECASE)

        # Textual dates, two layouts per calendar:
        #   "<month> [day][,] [year[/alt]]"  and  "[day] <month> [year[/alt]]"
        # The Gregorian and Swedish variants additionally accept a dot
        # right after the month name.
        month_first     = r'%s\s+(\d+)?\s*,?\s*((\d+)(/\d+)?)?\s*$'
        month_first_dot = r'%s\.?\s+(\d+)?\s*,?\s*((\d+)(/\d+)?)?\s*$'
        day_first       = r'(\d+)?\s+?%s\s*((\d+)(/\d+)?)?\s*$'
        day_first_dot   = r'(\d+)?\s+?%s\.?\s*((\d+)(/\d+)?)?\s*$'

        self._text   = re.compile(month_first_dot % self._mon_str,
                                  re.IGNORECASE)
        self._text2  = re.compile(day_first_dot % self._mon_str,
                                  re.IGNORECASE)
        self._jtext  = re.compile(month_first % self._jmon_str, re.IGNORECASE)
        self._jtext2 = re.compile(day_first % self._jmon_str, re.IGNORECASE)
        self._ftext  = re.compile(month_first % self._fmon_str, re.IGNORECASE)
        self._ftext2 = re.compile(day_first % self._fmon_str, re.IGNORECASE)
        self._ptext  = re.compile(month_first % self._pmon_str, re.IGNORECASE)
        self._ptext2 = re.compile(day_first % self._pmon_str, re.IGNORECASE)
        self._itext  = re.compile(month_first % self._imon_str, re.IGNORECASE)
        self._itext2 = re.compile(day_first % self._imon_str, re.IGNORECASE)
        self._stext  = re.compile(month_first_dot % self._smon_str,
                                  re.IGNORECASE)
        self._stext2 = re.compile(day_first_dot % self._smon_str,
                                  re.IGNORECASE)

        self._numeric = re.compile(
            r"((\d+)[/\.]\s*)?((\d+)[/\.]\s*)?(\d+)\s*$")
        self._iso = re.compile(r"(\d+)(/(\d+))?-(\d+)-(\d+)\s*$")
        self._isotimestamp = re.compile(
            r"^\s*?(\d{4})([01]\d)([0123]\d)"
            r"(?:(?:[012]\d[0-5]\d[0-5]\d)|(?:\s+[012]\d:[0-5]\d(?::[0-5]\d)?))?"
            r"\s*?$")
        self._rfc = re.compile(
            r"(%s,)?\s+(\d|\d\d)\s+%s\s+(\d+)\s+\d\d:\d\d(:\d\d)?\s+(\+|-)\d\d\d\d"
            % (self._rfc_day_str, self._rfc_mon_str))
        self._today = re.compile(r"^\s*%s\s*$" % self._today_str,
                                 re.IGNORECASE)

    def _get_int(self, val):
        """
        Convert the string to an integer if the value is not None. If the
        value is None, a zero is returned
        """
        if val is None:
            return 0
        return int(val)

    def _parse_hebrew(self, text):
        return self._parse_calendar(text, self._jtext, self._jtext2,
                                    self.hebrew_to_int)

    def _parse_islamic(self, text):
        return self._parse_calendar(text, self._itext, self._itext2,
                                    self.islamic_to_int)

    def _parse_persian(self, text):
        return self._parse_calendar(text, self._ptext, self._ptext2,
                                    self.persian_to_int)

    def _parse_french(self, text):
        return self._parse_calendar(text, self._ftext, self._ftext2,
                                    self.french_to_int, french_valid)

    def _parse_gregorian(self, text):
        return self._parse_calendar(text, self._text, self._text2,
                                    self.month_to_int, gregorian_valid)

    def _parse_julian(self, text):
        return self._parse_calendar(text, self._text, self._text2,
                                    self.month_to_int, julian_valid)

    def _parse_swedish(self, text):
        return self._parse_calendar(text, self._stext, self._stext2,
                                    self.swedish_to_int, swedish_valid)

    def _parse_calendar(self, text, regex1, regex2, mmap, check=None):
        """
        Parse a textual (month-name) date.

        :param text: the date text; it is lower-cased before matching
        :param regex1: compiled "month first" pattern
        :param regex2: compiled "day first" pattern
        :param mmap: dict mapping lower-case month names to month numbers
        :param check: optional callable taking (day, month, year) and
                      returning whether that date is valid
        :return: (day, month, year, slash) or Date.EMPTY if neither pattern
                 matches or the check rejects the date
        """
        lowered = text.lower()

        match = regex1.match(lowered)
        if match:
            groups = match.groups()
            if groups[0] is None:
                m = 0
            else:
                m = mmap[groups[0].lower()]

            if groups[2] is None:
                # only one number follows the month: it is the year
                y = self._get_int(groups[1])
                d = 0
                s = False
            else:
                d = self._get_int(groups[1])
                if groups[4] is not None: # slash year "/80"
                    y = int(groups[3]) + 1 # fullyear + 1
                    s = True
                else: # regular, non-slash date
                    y = int(groups[3])
                    s = False
            value = (d, m, y, s)
            if check and not check((d, m, y)):
                value = Date.EMPTY
            return value

        match = regex2.match(lowered)
        if match:
            groups = match.groups()
            if groups[1] is None:
                m = 0
            else:
                m = mmap[groups[1].lower()]

            d = self._get_int(groups[0])

            if groups[2] is None:
                y = None
                s = False
            else:
                if groups[4] is not None: # slash year digit
                    y = int(groups[3]) + 1 # fullyear + 1
                    s = True
                else:
                    y = int(groups[3])
                    s = False
            value = (d, m, y, s)
            if check and not check((d, m, y)):
                value = Date.EMPTY
            return value

        return Date.EMPTY

    def _parse_subdate(self, text, subparser=None, cal=None):
        """
        Convert only the date portion of a date.

        The textual subparser is tried first, then the ISO, database
        timestamp, RFC, numeric and "today" formats, in that order.

        :param text: the date portion to convert
        :param subparser: one of the self._parse_<calendar> methods;
                          defaults to self._parse_gregorian
        :param cal: calendar id the value of "today" is converted to
        :return: (day, month, year, slash) or Date.EMPTY
        """
        if subparser is None:
            subparser = self._parse_gregorian

        # validity check matching the subparser; calendars without one
        # (Hebrew, Islamic, Persian) are not validated
        if subparser == self._parse_gregorian:
            check = gregorian_valid
        elif subparser == self._parse_julian:
            check = julian_valid
        elif subparser == self._parse_swedish:
            check = swedish_valid
        elif subparser == self._parse_french:
            check = french_valid
        else:
            check = None

        value = subparser(text)
        if value != Date.EMPTY:
            return value

        match = self._iso.match(text)
        if match:
            groups = match.groups()
            y = self._get_int(groups[0])
            m = self._get_int(groups[3])
            d = self._get_int(groups[4])
            if check and not check((d, m, y)):
                return Date.EMPTY
            if groups[2]: # slash year digit
                return (d, m, y + 1, True)
            else:
                return (d, m, y, False)

        # Database datetime format, used in ex. MSSQL
        # YYYYMMDD HH:MM:SS or YYYYMMDD or YYYYMMDDHHMMSS
        match = self._isotimestamp.match(text)
        if match:
            groups = match.groups()
            y = self._get_int(groups[0])
            m = self._get_int(groups[1])
            d = self._get_int(groups[2])
            value = (d, m, y, False)
            # check may be None, guard it like every other branch does
            if check and not check((d, m, y)):
                value = Date.EMPTY
            return value

        match = self._rfc.match(text)
        if match:
            groups = match.groups()
            d = self._get_int(groups[2])
            m = self._rfc_mons_to_int[groups[3]]
            y = self._get_int(groups[4])
            value = (d, m, y, False)
            if check and not check((d, m, y)):
                value = Date.EMPTY
            return value

        match = self._numeric.match(text)
        if match:
            groups = match.groups()
            if self.ymd:
                # '1789' and ymd: incomplete date
                if groups[1] is None:
                    y = self._get_int(groups[4])
                    m = 0
                    d = 0
                else:
                    y = self._get_int(groups[1])
                    m = self._get_int(groups[3])
                    d = self._get_int(groups[4])
            else:
                y = self._get_int(groups[4])
                if self.dmy:
                    m = self._get_int(groups[3])
                    d = self._get_int(groups[1])
                else:
                    m = self._get_int(groups[1])
                    d = self._get_int(groups[3])
            value = (d, m, y, False)
            if check and not check((d, m, y)):
                value = Date.EMPTY
            return value

        match = self._today.match(text)
        if match:
            today = Today()
            if cal:
                today = today.to_calendar(cal)
            return today.get_dmy(get_slash=True)

        return Date.EMPTY

    def match_calendar(self, text, cal):
        """
        Try parsing calendar.

        Return the text with the calendar removed and the calendar index;
        both are returned unchanged if no calendar is found.
        """
        match = self._cal.match(text)
        if match:
            cal = self.calendar_to_int[match.group(2).lower()]
            text = match.group(1) + match.group(3)
        return (text, cal)

    def match_calendar_newyear(self, text, cal, newyear):
        """
        Try parsing calendar and newyear code given together, as in
        "(calendar, newyear)".

        Return the text with that part removed, the calendar index and the
        newyear value (an index from newyear_to_int, or a tuple of two ints
        for the numeric "N-N" form); all three are returned unchanged if
        nothing is found.
        """
        match = self._calny.match(text)
        if match:
            cal = self.calendar_to_int[match.group(2).lower()]
            newyear = self.newyear_to_int[match.group(3).lower()]
            text = match.group(1) + match.group(4)
        else:
            match = self._calny_iso.match(text)
            if match:
                cal = self.calendar_to_int[match.group(2).lower()]
                newyear = tuple(map(int, match.group(3).split("-")))
                text = match.group(1) + match.group(4)
        return (text, cal, newyear)

    def match_newyear(self, text, newyear):
        """
        Try parsing a newyear code given on its own, as in "(newyear)".

        Return the text with the newyear code removed and the newyear value
        (an index from newyear_to_int, or a tuple of two ints for the
        numeric "N-N" form); both are returned unchanged if nothing is found.
        """
        match = self._ny.match(text)
        if match:
            newyear = self.newyear_to_int[match.group(2).lower()]
            text = match.group(1) + match.group(3)
        else:
            match = self._ny_iso.match(text)
            if match:
                newyear = tuple(map(int, match.group(2).split("-")))
                text = match.group(1) + match.group(3)
        return (text, newyear)

    def match_quality(self, text, qual):
        """
        Try matching quality.

        Return the text with the quality removed and the quality index;
        both are returned unchanged if no quality is found.
        """
        match = self._qual.match(text)
        if match:
            qual = self.quality_to_int[match.group(2).lower()]
            text = match.group(1) + match.group(3)
        return (text, qual)

    def _match_compound(self, regex, modifier, text, cal, ny, qual, date):
        """
        Shared implementation of match_span and match_range: parse the
        'start' and 'stop' groups of regex and set date with modifier.
        Return 1 on success, 0 on failure.
        """
        match = regex.match(text)
        if not match:
            return 0

        text_parser = self.parser[cal]
        ends = []
        for group_name in ('start', 'stop'):
            (subtext, bc) = self.match_bce(match.group(group_name))
            subdate = self._parse_subdate(subtext, text_parser, cal)
            if subdate == Date.EMPTY and subtext != "":
                return 0
            if bc:
                subdate = self.invert_year(subdate)
            ends.append(subdate)

        date.set(qual, modifier, cal, ends[0] + ends[1], newyear=ny)
        return 1

    def match_span(self, text, cal, ny, qual, date):
        """
        Try matching span date.

        On success, set the date and return 1. On failure return 0.
        """
        return self._match_compound(self._span, Date.MOD_SPAN,
                                    text, cal, ny, qual, date)

    def match_range(self, text, cal, ny, qual, date):
        """
        Try matching range date.

        On success, set the date and return 1. On failure return 0.
        """
        return self._match_compound(self._range, Date.MOD_RANGE,
                                    text, cal, ny, qual, date)

    def match_bce(self, text):
        """
        Try matching BCE qualifier.

        Return the text with the matched part removed and BCE (True/False).
        """
        match = self._bce_re.match(text)
        bc = False
        if match:
            # bce is in the match.group(2)
            try:
                text = match.group(1) + match.group(3)
            except (IndexError, TypeError):
                # an overriding _bce_re may lack group 3 or leave a group
                # unmatched; keep the text as is but still flag BCE
                log.warning("ERROR MATCH: %s", match.groups())
            bc = True
        return (text, bc)

    def _set_modified(self, date, start, mod, text, cal, ny, qual, bc):
        """
        Store the parsed subdate start with modifier mod into date, or make
        date text-only (keeping text) when start is Date.EMPTY.
        """
        if start == Date.EMPTY:
            date.set_modifier(Date.MOD_TEXTONLY)
            date.set_text_value(text)
        elif bc:
            date.set(qual, mod, cal, self.invert_year(start), newyear=ny)
        else:
            date.set(qual, mod, cal, start, newyear=ny)

    def match_modifier(self, text, cal, ny, qual, bc, date):
        """
        Try matching date with modifier.

        On success, set the date and return True. On failure return False.
        """
        # modifiers before the date
        match = self._modifier.match(text)
        if match:
            grps = match.groups()
            start = self._parse_subdate(grps[1], self.parser[cal], cal)
            mod = self.modifier_to_int.get(grps[0].lower(), Date.MOD_NONE)
            self._set_modified(date, start, mod, text, cal, ny, qual, bc)
            return True

        # modifiers after the date
        if self.modifier_after_to_int:
            match = self._modifier_after.match(text)
            if match:
                grps = match.groups()
                start = self._parse_subdate(grps[0], self.parser[cal], cal)
                mod = self.modifier_after_to_int.get(grps[1].lower(),
                                                     Date.MOD_NONE)
                self._set_modified(date, start, mod, text, cal, ny, qual, bc)
                return True

        # "<date>" is another way of writing "about date"
        match = self._abt2.match(text)
        if match:
            grps = match.groups()
            # NOTE(review): unlike the two branches above, no subparser is
            # passed here, so the text is parsed with Gregorian month names
            # whatever cal is -- confirm whether that is intended
            start = self._parse_subdate(grps[0], cal=cal)
            mod = Date.MOD_ABOUT
            self._set_modified(date, start, mod, text, cal, ny, qual, bc)
            return True

        return False

    def set_date(self, date, text):
        """
        Parses the text and sets the date according to the parsing.
        """
        text = text.strip() # otherwise spaces can make it a bad date
        date.set_text_value(text)
        qual = Date.QUAL_NONE
        cal  = Date.CAL_GREGORIAN
        newyear = Date.NEWYEAR_JAN1

        # strip the decorations first, most specific form first
        (text, cal, newyear) = self.match_calendar_newyear(text, cal, newyear)
        (text, newyear) = self.match_newyear(text, newyear)
        (text, cal) = self.match_calendar(text, cal)
        (text, qual) = self.match_quality(text, qual)

        if self.match_span(text, cal, newyear, qual, date):
            return
        if self.match_range(text, cal, newyear, qual, date):
            return

        (text, bc) = self.match_bce(text)
        if self.match_modifier(text, cal, newyear, qual, bc, date):
            return

        try:
            subdate = self._parse_subdate(text, self.parser[cal], cal)
        except Exception:
            # best effort: whatever goes wrong while parsing, keep the
            # input as a text-only date (but let KeyboardInterrupt and
            # SystemExit through)
            date.set_as_text(text)
            return
        if subdate == Date.EMPTY and text != "":
            date.set_as_text(text)
            return

        if bc:
            date.set(qual, Date.MOD_NONE, cal, self.invert_year(subdate),
                     newyear=newyear)
        else:
            date.set(qual, Date.MOD_NONE, cal, subdate, newyear=newyear)

    def invert_year(self, subdate):
        """
        Return the (day, month, year, slash) subdate with the year negated.
        """
        return (subdate[0], subdate[1], -subdate[2], subdate[3])

    def parse(self, text):
        """
        Parses the text, returning a :class:`.Date` object.

        If parsing raises a DateError, a text-only date holding the input
        is returned.
        """
        new_date = Date()
        try:
            self.set_date(new_date, text)
        except DateError:
            new_date.set_as_text(text)
        return new_date