# -*- coding: utf-8 -*-
'''
Time Parser is yet another human readable time parser.

It currently supports 6 languages: English, Russian, Hebrew, Arabic,
German and Spanish.

Here are a few usage examples (the exact results depend on the current
time, the values below were produced on 2010-08-30):

>>> import datetime
>>> import timeparser
>>> datetime.datetime.now()
datetime.datetime(2010, 8, 30, 18, 17, 13, 266687)
>>> timeparser.parse_phrase('20 minutes ago')
datetime.datetime(2010, 8, 30, 17, 57, 26, 259396)
>>> timeparser.parse_phrase('15 Jul')
datetime.datetime(2010, 7, 15, 0, 0)
>>> timeparser.parse_phrase('3rd of November')
datetime.datetime(2010, 11, 3, 0, 0)
>>> timeparser.parse_phrase('2007 5 jul')
datetime.datetime(2007, 7, 5, 0, 0)

IMPLEMENTATION DETAILS:

* The script uses one big dictionary with all the words in all the
  languages.
* If a scale word like "weeks" or "months" appears together with a number
  without colons ("15:45" is skipped and "5" works) then a timedelta is
  calculated from now.  A word that means "past" like "ago" makes the
  delta negative, otherwise the delta is added to now.
* If that fails, the algorithm looks for a name of a month and the first
  number found (excluding colon numbers and 4 digit numbers) is
  considered the day of the month.  A 4 digit number, if present, is
  taken as the year.

KNOWN_ISSUES:

* Spanish "Agosto" is abbreviated to "ago" which gives "ago" a double
  meaning: the dictionary entry is both the english past marker and a
  month name.  A phrase that also contains a scale word is treated as a
  delta, otherwise "ago" is read as August.
* The algorithm doesn't really know "1st", "2nd", etc, it just uses the
  number so it will accept things like "5rd" or "5asdf" as "5".
* Only a single trailing character is stripped when looking for plurals,
  so forms like the German "Tagen" are not recognized.
* Months and years are approximated as 30 and 365 days.
* Certain formats will confuse this parser. A lot of formats will.
'''

import datetime
import re
import unittest


# Per language vocabulary.  'deltas' MUST stay in the order
# second, minute, hour, day, week, month, year because
# build_big_dictionary() pairs the words with time scales by position.
LANGUAGES = {
    'en': {
        'months': ('january', 'february', 'march', 'april', 'may', 'june',
                   'july', 'august', 'september', 'october', 'november',
                   'december'),
        'deltas': ('second', 'minute', 'hour', 'day', 'week', 'month',
                   'year'),
        'past': ('ago',),
        'future': ('in',),
    },
    'de': {
        'months': ('januar', 'februar', u'märz', 'april', 'mai', 'juni',
                   'juli', 'august', 'september', 'oktober', 'november',
                   'dezember'),
        # NOTE: the first word used to be 'zweiten', which is the ordinal
        # "second" (2nd) and not the unit of time.
        'deltas': ('sekunde', 'minute', 'stunde', 'tag', 'woche', 'monat',
                   'jahr'),
        'past': ('vor',),
        'future': ('in',),
    },
    'ar': {
        'months': (u'يناير', u'فبراير', u'مارس', u'أبريل', u'مايو',
                   u'يونيو', u'يوليو', u'أغسطس', u'سبتمبر', u'أكتوبر',
                   u'نوفمبر', u'ديسمبر'),
        # NOTE: the first word used to be the ordinal "the second" and not
        # the unit of time.
        'deltas': (u'ثانية', u'دقيقة', u'ساعة', u'اليوم', u'الأسبوع',
                   u'شهر', u'السنة'),
        'past': (u'منذ',),
        'future': (),
    },
    'es': {
        'months': ('enero', 'febrero', 'marzo', 'abril', 'mayo', 'junio',
                   'julio', 'agosto', 'septiembre', 'octubre', 'noviembre',
                   'diciembre'),
        'deltas': ('segundo', 'minuto', 'hora', u'día', 'semana', 'mes',
                   u'años'),
        'past': ('hace',),
        'future': ('en',),
    },
    'ru': {
        'months': (u'январь', u'февраль', u'март', u'апрель', u'май',
                   u'июнь', u'июль', u'август', u'сентябрь', u'октябрь',
                   u'ноябрь', u'декабрь'),
        # NOTE: u'месяц' used to carry a trailing space, which made it
        # impossible to match against a word taken from the text.
        'deltas': (u'секунды', u'минуты', u'час', u'день', u'неделя',
                   u'месяц', u'год'),
        'past': (u'назад',),
        'future': (u'в',),
    },
    'he': {
        'months': (u'ינואר', u'פברואר', u'מרץ', u'אפריל', u'מאי', u'יוני',
                   u'יולי', u'אוגוסט', u'ספטמבר', u'אוקטובר', u'נובמבר',
                   u'דצמבר'),
        'deltas': (u'שניות', u'דקות', u'שעות', u'ימים', u'שבועות',
                   u'חודשים', u'שנים'),
        'past': (u'לפני', u'מלפני'),
        'future': (u'בעוד', u'עוד'),
    },
}

# Filled by build_big_dictionary(): maps a lower case word to its Word.
BIG_DICTIONARY = {}

# "16:45" or "16:45:10" - clock times, never an amount or a day of month.
_COLON_NUMBERS_RE = re.compile(r'\d+(?::\d+)+')
# A run of exactly 4 digits - taken as a year.
_QUAD_NUMBERS_RE = re.compile(r'(?<!\d)\d{4}(?!\d)')
_NUMBERS_RE = re.compile(r'\d+')
_WORDS_RE = re.compile(r'\w+', re.UNICODE)


class TimeParseError(ValueError):
    '''
    Raised when a phrase can't be turned into a datetime.

    Derives from ValueError so that invalid dates (eg: "45 july") are
    reported with the same exception family datetime itself uses.
    '''
    pass


class InaccurateDateTime(datetime.datetime):
    '''
    A datetime that compares equal to any datetime that is less than
    "max_error" away from it.  Useful for testing against "now".
    '''
    max_error = datetime.timedelta(seconds=30)

    def __eq__(self, other):
        if not isinstance(other, datetime.datetime):
            return NotImplemented
        return abs(self - other) < self.max_error

    def __ne__(self, other):
        # datetime implements its own exact "!=", so it has to be
        # overridden to stay consistent with the fuzzy "==".
        result = self.__eq__(other)
        if result is NotImplemented:
            return result
        return not result

    # Approximate equality can't be made consistent with hashing.
    __hash__ = None

    @classmethod
    def copy(cls, other):
        '''
        Useful when one wants to create an InaccurateDateTime from a
        datetime
        '''
        return cls.combine(other.date(), other.time())


class Word(object):
    '''
    A word is an object you can ask questions about, eg:
        myword.is_month or myword.month_number
    If an invalid question is asked - None is returned.

    NOTE: A word can have double meaning by containing an attribute
    "double_meaning" which is a list of all the words this can mean.
    The word should answer all the answers any subword can answer,
    although contradicting double meanings aren't supported.
    '''

    def __init__(self, text, **kwargs):
        self.text = text
        self.double_meaning = []
        for key, value in kwargs.items():
            setattr(self, key, value)

    def __getattr__(self, key):
        # Only called after the normal lookup failed, so the question is
        # unknown.  Special (dunder) names must still fail properly or
        # protocol probes like hasattr(word, '__iter__') would succeed.
        if key.startswith('__') and key.endswith('__'):
            raise AttributeError(key)
        return None

    def __repr__(self):
        template = "Word( %s )"
        fields = []
        for key, value in self.__dict__.items():
            fields.append(' %s = %s ' % (key, repr(value)))
        return template % ', '.join(fields)

    def add_double_meaning(self, other):
        '''
        puts the word in the list of double meanings and applies the
        fields that word have to this one

        Raises ValueError if "other" was already added.
        '''
        if other in self.double_meaning:
            raise ValueError('added redundant double meaning')
        self.double_meaning.append(other)
        # NOTE: this forever corrupts the current word - maybe there's
        # a better way to do this...
        for key, value in other.__dict__.items():
            # copying these would wipe the list that was just updated
            if key in ('text', 'double_meaning'):
                continue
            # keep the language of the first meaning, the others are
            # reachable through double_meaning
            if key == 'language' and self.language is not None:
                continue
            setattr(self, key, value)


class MultiLingualDictionary(dict):
    '''
    A dict that never overwrites: assigning to an existing key merges the
    new Word into the stored one as a double meaning.
    '''

    def __setitem__(self, key, value):
        if key in self:
            return self[key].add_double_meaning(value)
        return dict.__setitem__(self, key, value)


class TimeParser(object):

    def parse_time(self, text):
        '''
        Returns a datetime for a human readable phrase, eg:
        "5 minutes ago" or "3rd of November".

        Raises TimeParseError if the phrase has no usable number, no scale
        word or month name, or describes an impossible date.
        '''
        text = text.lower()
        now = datetime.datetime.now()

        # search for known (ago/month/week/second/etc) words
        scale = None
        is_in_past = False
        month_word = None
        # NOTE: the last scale and month are the ones that count.
        for word_obj in self.find_recognized_words(text):
            if word_obj.is_scale:
                scale = word_obj.scale
            if word_obj.means_in_past:
                is_in_past = True
            if word_obj.is_name_of_month:
                month_word = word_obj

        # find the relevant numbers - separate "16:45" from "5 minutes ago"
        without_clock = _COLON_NUMBERS_RE.sub('', text)
        quad_numbers = _QUAD_NUMBERS_RE.findall(without_clock)
        numbers = _NUMBERS_RE.findall(_QUAD_NUMBERS_RE.sub('', without_clock))

        if scale is not None and not numbers:
            # "in 1000 seconds": the only number looks like a year but it
            # is the amount.
            numbers = quad_numbers

        if not numbers:
            raise TimeParseError(
                'Failed to parse a datetime because no numbers were given')

        if scale is not None:
            amount = int(numbers[0])
            if is_in_past:
                amount = -amount
            try:
                return now + scale * amount
            except OverflowError as error:
                raise TimeParseError(
                    'Failed parsing a datetime: %s' % error)

        if month_word is not None:
            # maybe it's a month + day type of date.
            day_of_month = int(numbers[0])
            if quad_numbers:
                year = int(quad_numbers[0])
            else:
                year = now.year
            try:
                return datetime.datetime(year=year,
                                         month=month_word.month_number,
                                         day=day_of_month)
            except ValueError as error:
                # eg: "45 july"
                raise TimeParseError(
                    'Failed parsing a datetime: %s' % error)

        raise TimeParseError('Failed parsing a datetime completely')

    def find_recognized_words(self, text):
        '''
        Returns the Word objects of all the known words in the text, in
        the order they appear.  The text is expected to be lower case.
        '''
        recognized = []
        for word in _WORDS_RE.findall(text):
            if word in BIG_DICTIONARY:
                recognized.append(BIG_DICTIONARY[word])
            elif len(word) > 2 and word[:-1] in BIG_DICTIONARY:
                # given "minutes" test for "minute"
                recognized.append(BIG_DICTIONARY[word[:-1]])
        return recognized

    def detect_language(self, text):
        '''
        Returns the name of the language most of the recognized words
        belong to, or None if no word was recognized.

        A word with double meanings gives a point to every language it
        belongs to.
        '''
        points = {}
        for word_obj in self.find_recognized_words(text.lower()):
            for meaning in [word_obj] + word_obj.double_meaning:
                language = meaning.language
                if language is not None:
                    points[language] = points.get(language, 0) + 1
        if not points:
            return None
        return max_in_dict(points)


def build_big_dictionary():
    '''
    Builds BIG_DICTIONARY out of LANGUAGES: month names (and their 3
    letter abbreviations), scale words and past/future markers.
    '''
    global BIG_DICTIONARY
    temp_dict = MultiLingualDictionary()

    # Same order as the 'deltas' of every language:
    # 'second', 'minute', 'hour', 'day', 'week', 'month', 'year'
    # NOTE: months and years is not 100% accurate because it
    # doesn't take leap years or 28/31 day months into account.
    scale_times = (
        datetime.timedelta(seconds=1),
        datetime.timedelta(minutes=1),
        datetime.timedelta(hours=1),
        datetime.timedelta(days=1),
        datetime.timedelta(weeks=1),
        datetime.timedelta(days=30),
        datetime.timedelta(days=365),
    )

    for lang_name, lang_data in LANGUAGES.items():
        for i, word in enumerate(lang_data['months']):
            month_number = i + 1  # months are 1-indexed
            temp_dict[word] = Word(word, language=lang_name,
                                   is_name_of_month=True,
                                   month_number=month_number)
            if len(word) > 3:
                abbreviation = word[:3]
                temp_dict[abbreviation] = Word(abbreviation,
                                               language=lang_name,
                                               is_name_of_month=True,
                                               month_number=month_number)

        # second, minute, hour, etc...
        for delta_word, scale_time in zip(lang_data['deltas'], scale_times):
            temp_dict[delta_word] = Word(delta_word, language=lang_name,
                                         is_scale=True, scale=scale_time)

        for word in lang_data['past']:
            temp_dict[word] = Word(word, language=lang_name,
                                   means_in_past=True)

        for word in lang_data['future']:
            temp_dict[word] = Word(word, language=lang_name,
                                   means_in_future=True)

    BIG_DICTIONARY = temp_dict


def max_in_dict(dictionary):
    '''
    Returns the key that holds the biggest value, the first one found in
    case of a tie.  Raises ValueError for an empty dictionary.
    '''
    if not dictionary:
        raise ValueError('max_in_dict() needs a non-empty dictionary')
    return max(dictionary, key=dictionary.get)


def parse_phrase(text):
    ''' Shortcut for TimeParser().parse_time(text) '''
    tt = TimeParser()
    return tt.parse_time(text)


class TimeParserTests(unittest.TestCase):

    def test_month_day(self):
        now = datetime.datetime.now()
        a = parse_phrase('5th of july')
        b = InaccurateDateTime(year=now.year, month=7, day=5)
        self.assertEqual(b, a)

    def test_german(self):
        now = datetime.datetime.now()
        a = parse_phrase('Vor 4 Stunden')
        b = now - datetime.timedelta(hours=4)
        b = InaccurateDateTime.copy(b)
        self.assertEqual(b, a)

        # "Tagen" is not a recognized scale word so the explicit date part
        # decides.  (This used to be compared with "4 days ago", which was
        # only true when the tests ran on the 30th of August.)
        a = parse_phrase('26. Aug. (Vor 4 Tagen)')
        b = InaccurateDateTime(year=now.year, month=8, day=26)
        self.assertEqual(b, a)

    def test_arabic(self):
        now = datetime.datetime.now()
        a = parse_phrase(u'03:33 ص (منذ 11 ساعة / ساعات)')
        b = now - datetime.timedelta(hours=11)
        b = InaccurateDateTime.copy(b)
        self.assertEqual(b, a)

        a = parse_phrase(u'16 أغسطس')
        b = InaccurateDateTime(year=now.year, month=8, day=16)
        self.assertEqual(b, a)

    def test_other_languages(self):
        now = datetime.datetime.now()
        a = parse_phrase(u'10:50 (לפני 5 שעות)')
        b = now - datetime.timedelta(hours=5)
        b = InaccurateDateTime.copy(b)
        self.assertEqual(b, a)

    def test_abbreviation_month_day(self):
        now = datetime.datetime.now()
        a = parse_phrase('aug 15')
        b = InaccurateDateTime(year=now.year, month=8, day=15)
        self.assertEqual(b, a)

    def test_time_ago(self):
        a = parse_phrase('5 minutes ago')
        b = datetime.datetime.now() - datetime.timedelta(minutes=5)
        b = InaccurateDateTime.copy(b)
        self.assertEqual(b, a)

    def test_throws_errors(self):
        self.assertRaises(TimeParseError, parse_phrase, ('bla bla bla'))


def init():
    build_big_dictionary()


init()

if __name__ == '__main__':
    unittest.main()