Package DateHandler :: Module _DateParser
[frames | no frames]

Source Code for Module DateHandler._DateParser

  1  # -*- coding: utf-8 -*- 
  2  # 
  3  # Gramps - a GTK+/GNOME based genealogy program 
  4  # 
  5  # Copyright (C) 2004-2006  Donald N. Allingham 
  6  # 
  7  # This program is free software; you can redistribute it and/or modify 
  8  # it under the terms of the GNU General Public License as published by 
  9  # the Free Software Foundation; either version 2 of the License, or 
 10  # (at your option) any later version. 
 11  # 
 12  # This program is distributed in the hope that it will be useful, 
 13  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 14  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 15  # GNU General Public License for more details. 
 16  # 
 17  # You should have received a copy of the GNU General Public License 
 18  # along with this program; if not, write to the Free Software 
 19  # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
 20  # 
 21   
 22  # $Id: _DateParser.py 8054 2007-02-04 21:40:27Z dallingham $ 
 23   
 24  """ 
 25  Date parsing class. Serves as the base class for any localized 
 26  date parsing class. The default, base class provides parsing for 
 27  English. 
 28  """ 
 29   
 30  __author__ = "Donald N. Allingham" 
 31  __version__ = "$Revision: 8054 $" 
 32   
 33  #------------------------------------------------------------------------- 
 34  # 
 35  # Python modules 
 36  # 
 37  #------------------------------------------------------------------------- 
 38  import re 
 39  import calendar 
 40   
 41  #------------------------------------------------------------------------- 
 42  # 
 43  # set up logging 
 44  # 
 45  #------------------------------------------------------------------------- 
 46  import logging 
 47  log = logging.getLogger(".DateParser") 
 48   
 49  #------------------------------------------------------------------------- 
 50  # 
 51  # GRAMPS modules 
 52  # 
 53  #------------------------------------------------------------------------- 
 54  from RelLib import Date, DateError 
 55  import GrampsLocale 
 56   
 57  #------------------------------------------------------------------------- 
 58  # 
 59  # Top-level module functions 
 60  # 
 61  #------------------------------------------------------------------------- 
 62  _max_days  = [ 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 ] 
 63  _leap_days = [ 31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 ] 
 64   
65 -def gregorian_valid(date_tuple):
66 day = date_tuple[0] 67 month = date_tuple[1] 68 valid = True 69 try: 70 if month > 12: 71 valid = False 72 elif calendar.isleap(date_tuple[2]): 73 if day > _leap_days[month-1]: 74 valid = False 75 elif day > _max_days[month-1]: 76 valid = False 77 except: 78 valid = False 79 return valid

#-------------------------------------------------------------------------
#
# Parser class
#
#-------------------------------------------------------------------------
class DateParser:
    """
    Converts a text string into a Date object. If the date cannot be
    converted, the text string is assigned.

    The class-level tables below map the (lower-case) words recognised in
    a date string to the integer codes used by Date.  Localized parsers
    are expected to override them in a subclass.
    """

    # Captures the three single-character format codes that follow the
    # '%' signs of a time format string, in order of appearance.
    # Raw string: '\S' is a regex escape, not a Python string escape.
    _fmt_parse = re.compile(r".*%(\S).*%(\S).*%(\S).*")

    # RFC-2822 only uses capitalized English abbreviated names, no locales.
    _rfc_days = ('Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat')
    _rfc_mons_to_int = {
        'Jan' : 1,  'Feb' : 2,  'Mar' : 3,  'Apr' : 4,
        'May' : 5,  'Jun' : 6,  'Jul' : 7,  'Aug' : 8,
        'Sep' : 9,  'Oct' : 10, 'Nov' : 11, 'Dec' : 12,
        }

    # Month names of the current locale, supplied by GrampsLocale.
    month_to_int = GrampsLocale.month_to_int

    # modifiers before the date
    modifier_to_int = {
        'before' : Date.MOD_BEFORE, 'bef'    : Date.MOD_BEFORE,
        'bef.'   : Date.MOD_BEFORE, 'after'  : Date.MOD_AFTER,
        'aft'    : Date.MOD_AFTER,  'aft.'   : Date.MOD_AFTER,
        'about'  : Date.MOD_ABOUT,  'abt.'   : Date.MOD_ABOUT,
        'abt'    : Date.MOD_ABOUT,  'circa'  : Date.MOD_ABOUT,
        'c.'     : Date.MOD_ABOUT,  'around' : Date.MOD_ABOUT,
        }
    # in some languages some of above listed modifiers are after the date,
    # in that case the subclass should put them into this dictionary instead
    modifier_after_to_int = {}

    hebrew_to_int = {
        "tishri"  : 1,   "heshvan" : 2,   "kislev"  : 3,
        "tevet"   : 4,   "shevat"  : 5,   "adari"   : 6,
        "adarii"  : 7,   "nisan"   : 8,   "iyyar"   : 9,
        "sivan"   : 10,  "tammuz"  : 11,  "av"      : 12,
        "elul"    : 13,
        }

    french_to_int = {
        u'vendémiaire'  : 1,    u'brumaire'   : 2,
        u'frimaire'     : 3,    u'nivôse'     : 4,
        u'pluviôse'     : 5,    u'ventôse'    : 6,
        u'germinal'     : 7,    u'floréal'    : 8,
        u'prairial'     : 9,    u'messidor'   : 10,
        u'thermidor'    : 11,   u'fructidor'  : 12,
        u'extra'        : 13
        }

    islamic_to_int = {
        "muharram"           : 1,  "muharram ul haram"  : 1,
        "safar"              : 2,  "rabi`al-awwal"      : 3,
        "rabi'l"             : 3,  "rabi`ul-akhir"      : 4,
        "rabi`ath-thani"     : 4,  "rabi` ath-thani"    : 4,
        "rabi`al-thaany"     : 4,  "rabi` al-thaany"    : 4,
        "rabi' ii"           : 4,  "jumada l-ula"       : 5,
        "jumaada-ul-awwal"   : 5,  "jumaada i"          : 5,
        "jumada t-tania"     : 6,  "jumaada-ul-akhir"   : 6,
        # "jumaada ii" used to map to 5, colliding with "jumaada i";
        # Jumada II is the sixth month, like the other spellings above.
        "jumaada al-thaany"  : 6,  "jumaada ii"         : 6,
        "rajab"              : 7,  "sha`ban"            : 8,
        "sha`aban"           : 8,  "ramadan"            : 9,
        "ramadhan"           : 9,  "shawwal"            : 10,
        "dhu l-qa`da"        : 11, "dhu qadah"          : 11,
        "thw al-qi`dah"      : 11, "dhu l-hijja"        : 12,
        "dhu hijja"          : 12, "thw al-hijjah"      : 12,
        }

    persian_to_int = {
        "farvardin"   : 1,  "ordibehesht" : 2,
        "khordad"     : 3,  "tir"         : 4,
        "mordad"      : 5,  "shahrivar"   : 6,
        "mehr"        : 7,  "aban"        : 8,
        "azar"        : 9,  "dey"         : 10,
        "bahman"      : 11, "esfand"      : 12,
        }

    # Spellings recognised as a "before common era" marker.
    bce = ["B.C.E.", "B.C.E", "BCE", "B.C.", "B.C", "BC" ]

    # Calendar names and their one-letter abbreviations.
    calendar_to_int = {
        'gregorian'        : Date.CAL_GREGORIAN,
        'g'                : Date.CAL_GREGORIAN,
        'julian'           : Date.CAL_JULIAN,
        'j'                : Date.CAL_JULIAN,
        'hebrew'           : Date.CAL_HEBREW,
        'h'                : Date.CAL_HEBREW,
        'islamic'          : Date.CAL_ISLAMIC,
        'i'                : Date.CAL_ISLAMIC,
        'french'           : Date.CAL_FRENCH,
        'french republican': Date.CAL_FRENCH,
        'f'                : Date.CAL_FRENCH,
        'persian'          : Date.CAL_PERSIAN,
        'p'                : Date.CAL_PERSIAN,
        }

    # Words qualifying how a date was obtained.
    quality_to_int = {
        'estimated'  : Date.QUAL_ESTIMATED,
        'est.'       : Date.QUAL_ESTIMATED,
        'est'        : Date.QUAL_ESTIMATED,
        'calc.'      : Date.QUAL_CALCULATED,
        'calc'       : Date.QUAL_CALCULATED,
        'calculated' : Date.QUAL_CALCULATED,
        }

def __init__(self):
    """
    Prepare the parser for use.

    Compiles the regular expressions via init_strings(), builds the
    self.parser table that maps each Date.CAL_* calendar code to the
    method parsing that calendar, and derives the self.dmy / self.ymd
    flags from the field order of GrampsLocale.tformat.
    """
    self.init_strings()

    # Gregorian and Julian dates are handled by the same routine.
    western = self._parse_greg_julian
    self.parser = {
        Date.CAL_GREGORIAN: western,
        Date.CAL_JULIAN: western,
        Date.CAL_FRENCH: self._parse_french,
        Date.CAL_PERSIAN: self._parse_persian,
        Date.CAL_HEBREW: self._parse_hebrew,
        Date.CAL_ISLAMIC: self._parse_islamic,
    }

    layout = self._fmt_parse.match(GrampsLocale.tformat.lower())
    if layout is None:
        # Format string without three recognisable fields: fall back to
        # day-month-year.
        self.dmy, self.ymd = True, False
    else:
        order = layout.groups()
        self.dmy = order == ('d', 'm', 'y')
        self.ymd = order == ('y', 'm', 'd')
208
def re_longest_first(self, keys):
    """
    Return a string for a RE group which contains the given keys,
    ordered so that the longest keys match first.

    Within a regular-expression alternation the first alternative that
    matches wins, so a key such as 'est.' has to come before its prefix
    'est'.  Keys of equal length keep their incoming relative order.
    Any '.' characters are quoted; every other character is inserted
    verbatim.

    keys: any iterable of strings (list, tuple, dict keys, ...).  It is
          not modified.
    Returns a string of the form '(key1|key2|...)'; '()' for no keys.
    """
    # sorted() works on any iterable and leaves the caller's sequence
    # untouched; the previous in-place cmp-style sort reordered shared
    # lists such as the class-level 'bce' and exists only in Python 2.
    ordered = sorted(keys, key=len, reverse=True)
    return '(' + '|'.join([key.replace('.', r'\.') for key in ordered]) + ')'
217
def init_strings(self):
    """
    This method compiles regular expression strings for matching dates.

    Most of the re's in most languages can stay as is. span and range
    most likely will need to change. Whatever change is done, this method
    may be called first as DateParser.init_strings(self) so that the
    invariant expressions don't need to be repeatedly coded. All
    differences can be coded after the DateParser.init_strings(self) call,
    that way they override stuff from this method. See DateParserRU() as
    an example.

    Reads the word tables of the instance (bce, quality_to_int,
    modifier_to_int, modifier_after_to_int, month_to_int, hebrew_to_int,
    french_to_int, persian_to_int, islamic_to_int, calendar_to_int,
    _rfc_days, _rfc_mons_to_int) and stores the resulting alternation
    strings (self._*_str) and compiled patterns on the instance.
    """
    self._rfc_mon_str = '(' + '|'.join(self._rfc_mons_to_int.keys()) + ')'
    self._rfc_day_str = '(' + '|'.join(self._rfc_days) + ')'

    # Key collections are copied into fresh lists so that
    # re_longest_first() always receives a real list, whatever kind of
    # object .keys() returns.
    self._bce_str = self.re_longest_first(self.bce)
    self._qual_str = self.re_longest_first(list(self.quality_to_int.keys()))
    self._mod_str = self.re_longest_first(list(self.modifier_to_int.keys()))
    self._mod_after_str = self.re_longest_first(
        list(self.modifier_after_to_int.keys()))

    self._mon_str = self.re_longest_first(list(self.month_to_int.keys()))
    self._jmon_str = self.re_longest_first(list(self.hebrew_to_int.keys()))
    self._fmon_str = self.re_longest_first(list(self.french_to_int.keys()))
    self._pmon_str = self.re_longest_first(list(self.persian_to_int.keys()))
    self._imon_str = self.re_longest_first(list(self.islamic_to_int.keys()))
    self._cal_str = self.re_longest_first(list(self.calendar_to_int.keys()))

    # bce, calendar type and quality may be either at the end or at
    # the beginning of the given date string, therefore they will
    # be parsed from the middle and will be in match.group(2).
    self._bce_re = re.compile(r"(.*)\s+%s( ?.*)" % self._bce_str)

    self._cal = re.compile(r"(.*)\s+\(%s\)( ?.*)" % self._cal_str,
                           re.IGNORECASE)
    self._qual = re.compile(r"(.* ?)%s\s+(.+)" % self._qual_str,
                            re.IGNORECASE)

    self._span = re.compile(
        r"(from)\s+(?P<start>.+)\s+to\s+(?P<stop>.+)",
        re.IGNORECASE)
    # The dot of the abbreviation 'bet.' is escaped: unescaped it matched
    # any character, so e.g. 'betx 1900 and 1910' was taken for a range.
    self._range = re.compile(
        r"(bet|bet\.|between)\s+(?P<start>.+)\s+and\s+(?P<stop>.+)",
        re.IGNORECASE)
    self._modifier = re.compile(r'%s\s+(.*)' % self._mod_str,
                                re.IGNORECASE)
    self._modifier_after = re.compile(r'(.*)\s+%s' % self._mod_after_str,
                                      re.IGNORECASE)
    self._abt2 = re.compile(r'<(.*)>', re.IGNORECASE)

    # Two textual layouts are compiled per calendar; %s is replaced by
    # that calendar's month-name alternation:
    #   month_first:  <month> [number] [,] [number[/number]]
    #   day_first:    [number] <month> [number[/number]]
    month_first = r'%s\s+(\d+)?\s*,?\s*((\d+)(/\d+)?)?\s*$'
    day_first = r'(\d+)?\s+?%s\s*((\d+)(/\d+)?)?\s*$'

    self._text = re.compile(month_first % self._mon_str, re.IGNORECASE)
    self._text2 = re.compile(day_first % self._mon_str, re.IGNORECASE)
    self._jtext = re.compile(month_first % self._jmon_str, re.IGNORECASE)
    self._jtext2 = re.compile(day_first % self._jmon_str, re.IGNORECASE)
    self._ftext = re.compile(month_first % self._fmon_str, re.IGNORECASE)
    self._ftext2 = re.compile(day_first % self._fmon_str, re.IGNORECASE)
    self._ptext = re.compile(month_first % self._pmon_str, re.IGNORECASE)
    self._ptext2 = re.compile(day_first % self._pmon_str, re.IGNORECASE)
    self._itext = re.compile(month_first % self._imon_str, re.IGNORECASE)
    self._itext2 = re.compile(day_first % self._imon_str, re.IGNORECASE)

    # Up to three numbers separated by '/' or '.'.
    self._numeric = re.compile(r"((\d+)[/\.]\s*)?((\d+)[/\.]\s*)?(\d+)\s*$")
    # number[/number]-number-number
    self._iso = re.compile(r"(\d+)(/(\d+))?-(\d+)-(\d+)\s*$")
    # [Day,] DD Mon YYYY HH:MM[:SS] +ZZZZ
    self._rfc = re.compile(
        r"(%s,)?\s+(\d|\d\d)\s+%s\s+(\d+)\s+\d\d:\d\d(:\d\d)?\s+(\+|-)\d\d\d\d"
        % (self._rfc_day_str, self._rfc_mon_str))
288
289 - def _get_int(self,val):
290 """ 291 Converts the string to an integer if the value is not None. If the 292 value is None, a zero is returned 293 """ 294 if val == None: 295 return 0 296 else: 297 return int(val)
298
299 - def _parse_hebrew(self,text):
300 return self._parse_calendar(text,self._jtext,self._jtext2, 301 self.hebrew_to_int)
302
303 - def _parse_islamic(self,text):
304 return self._parse_calendar(text,self._itext,self._itext2, 305 self.islamic_to_int)
306
307 - def _parse_persian(self,text):
308 return self._parse_calendar(text,self._ptext,self._ptext2, 309 self.persian_to_int)
310
311 - def _parse_french(self,text):
312 return self._parse_calendar(text,self._ftext,self._ftext2, 313 self.french_to_int)
314
def _parse_greg_julian(self, text):
    """
    Parse text using the locale month-name patterns.

    Hands text to _parse_calendar together with the two compiled patterns
    built from month_to_int (_text, _text2), the month_to_int table
    itself, and gregorian_valid as the check argument; returns the result
    unchanged.  __init__ registers this method for both the Gregorian and
    the Julian calendar.
    """
    month_first = self._text
    day_first = self._text2
    return self._parse_calendar(text, month_first, day_first,
                                self.month_to_int, gregorian_valid)
318
319 - def _parse_calendar(self,text,regex1,regex2,mmap,check=None):