From 2af64898263fb1299119b80b8453f83afd56235c Mon Sep 17 00:00:00 2001
From: Konstantin Lopuhin
Date: Tue, 25 May 2021 10:00:26 +0300
Subject: [PATCH] WIP add find_date function: finds the first date similar to
 search_dates, but gets only the first date, and more suitable for short
 strings.

It uses a brute-force approach, but has more predictable performance (at least
on shorter strings - we limit the length to 100), and better quality in our
tests, although still not perfect.

Not sure if it should be exposed in current form, but may be useful for
future development.
---
 dateparser/find_date.py |  92 +++++++++++++++
 tests/test_find_date.py | 249 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 341 insertions(+)
 create mode 100644 dateparser/find_date.py
 create mode 100644 tests/test_find_date.py

diff --git a/dateparser/find_date.py b/dateparser/find_date.py
new file mode 100644
index 000000000..6b0b33144
--- /dev/null
+++ b/dateparser/find_date.py
@@ -0,0 +1,92 @@
+from datetime import datetime
+import logging
+import re
+from typing import List, Optional
+
+from dateparser.conf import apply_settings
+from dateparser.date import DateDataParser
+from dateparser.languages.loader import LocaleDataLoader
+from dateparser.search.search import DateSearchWithDetection
+
+
+logger = logging.getLogger(__name__)
+
+LANGUAGES = set(LocaleDataLoader().get_locale_map())
+_bad_date_re = re.compile(
+    # whole dates we black-list (can still be parts of valid dates)
+    '^(' + '|'.join([
+        r'\d{1,3}',  # less than 4 digits
+        r'#\d+',  # this is a sequence number
+        # some common false positives
+        # (https://github.com/scrapinghub/dateparser/issues/568)
+        r'[-/.]+',  # bare separators parsed as current date
+        r'\w\.?',  # one letter (with optional dot)
+        'an',
+    ]) + ')$')
+_date_separator = re.compile(r'[ ,|\(\)@]')  # never part of the date
+_drop_words = {'on', 'at', 'of', 'a'}  # cause annoying false positives
+_date_search = DateSearchWithDetection()
+
+
+@apply_settings
+def find_date(
+        text: str, *,
+        languages: Optional[List[str]] = None,
+        settings=None,
+        max_join: int = 7,
+        ) -> Optional[datetime]:
+    """ Look for a date in the string, return the first date that is parsed.
+
+    This is used instead of search_dates from dateparser, because it has more
+    predictable performance and gets more dates correct, although it's still
+    not perfect.
+
+    Approach:
+    - split the text into tokens using _date_separator
+    - move over tokens and try to parse multiple tokens joined with a space,
+      and return the first date. At each position start with the longest
+      n-gram, to parse the most complete date (max_join sets the maximum
+      length of the n-gram)
+
+    Up to len(tokens) * max_join parse attempts are made, so callers should
+    keep the text short.
+
+    :param text: string to search; an empty string or None gives None.
+    :param languages: language codes to try. Codes missing from LANGUAGES
+        are ignored; if none are left the language is detected from the
+        text. 'en' is always added to the languages that are tried.
+    :param settings: dateparser settings, passed on to DateDataParser.
+    :param max_join: maximum number of tokens joined into one candidate;
+        nothing is parsed when it is less than 1.
+    :return: the first parsed date, or None if no candidate was parsed.
+    """
+    if not text:
+        return None
+
+    known = [lang for lang in (languages or []) if lang in LANGUAGES]
+    if not known:
+        detected = _date_search.detect_language(text=text, languages=known)
+        if detected:
+            known = [detected]
+    if 'en' not in known:
+        known.append('en')
+    parser = DateDataParser(languages=known, settings=settings)
+    tokens = [token for token in _date_separator.split(text)
+              if token and token not in _drop_words]
+    for start in range(len(tokens)):
+        # Longest n-gram first, so that the most complete date wins.
+        for size in range(min(max_join, len(tokens) - start), 0, -1):
+            candidate = ' '.join(tokens[start:start + size])
+            if _bad_date_re.match(candidate):
+                continue
+
+            try:
+                found = parser.get_date_data(candidate)['date_obj']
+            except Exception:
+                # Best effort: a failure on one candidate must not hide a
+                # date in another one, so log it and keep looking.
+                logger.exception('Failed to parse %r', candidate)
+                continue
+            if found:
+                return found
+    return None
diff --git a/tests/test_find_date.py b/tests/test_find_date.py
new file mode 100644
index 000000000..ce0bd4943
--- /dev/null
+++ b/tests/test_find_date.py
@@ -0,0 +1,249 @@
+from datetime import datetime
+
+import pytest
+
+from dateparser.find_date import find_date
+
+
+DATE_TEST_BASE = datetime(2018, 6, 28)
+DATE_CASES_FAILURES = [
+    ('+38 (097) 34-23-083', ['ru', 'en', 'mt'], '+38 (097) 34-23-083', None),
+    ('01.09 – 03.09.2017', ['en', 'de'], '01.09 – 03.09.2017', '2018-09-03T00:00:00'),
+    ('0800 022 26 26', ['nl'], '0800 022 26 26', None),
+    ('09 شباط/فبراير 2017', ['ar'], '09 شباط/فبراير 2017', '2017-09-09T00:00:00'),
+    ('10:00 - 01:30', ['nl'], '10:00 - 01:30', None),
+    ('2012 - 2016', ['de', 'en'], '2012 - 2016', '2012-01-01T00:00:00'),
+    ('2015-2017 Audio Sermons', ['en'], '2015-2017 Audio Sermons', '2015-01-01T00:00:00'),
+    ('2017.02.23更新', ['ja'], '2017.02.23更新', '2017-02-23T00:00:00'),
+    ('2018 Maiatzak 25', ['eu'], '2018 Maiatzak 25', '2018-05-25T00:00:00'),
+    ('2802', ['en'], '2802', None),
+    ('05 آبان 1394', ['fa'], '05 آبان 1394', '1394-11-05T00:00:00'),
+    ('Apr 25th 2018 at 7.57pm', ['en'], 'Apr 25th 2018 at 7.57pm', '2018-04-25T19:57:00'),
+    ('Año: 2011', ['es'], '2011', '2011-01-01T00:00:00'),
+    ('Mis à jour : 28 août 2015', ['fr'], '28 août 2015', '2015-09-28T00:00:00'),
+    ('Published 12:05 p.m. UTC Apr 26, 2018', ['en'], 'Published 12:05 p.m. UTC Apr 26, 2018', '2018-04-26T12:05:00'),
+    # ('SUMMER 2014', ['en', 'sw'], ???),
+    ('Published: 02:09 BST, 26 April 2018', ['en'], '02:09 BST, 26 April 2018', '2018-04-26T02:09:00'),
+    ('Updated on 2017년 2월 13일', ['ko', 'en', 'pt'], 'Updated on 2017년 2월 13일', '2018-02-13T00:00:00'),
+    ('Year composed: 2017', ['en'], '2017', '2017-01-01T00:00:00'),
+    ('Создано: 06.02.2017 10:49', ['ru'], '06.02.2017 10:49', '2017-02-06T10:49:00'),
+    ('نشر بتاريخ: 01 أيلول/سبتمبر 2016', ['ar'], '01 أيلول/سبتمبر 2016', '2016-09-01T00:00:00'),
+    ('نشر بتاريخ: 16 أيار 2016', ['ar'], '16 أيار 2016', '2016-05-16T00:00:00'),
+    ('發佈日期:2015-12-21', ['zh-Hant', 'da'], '發佈日期:2015-12-21', '2015-12-21T00:00:00'),
+    ('April 26, 2018 02:00 AM Eastern Daylight Time', ['en'],
+     'April 26, 2018 02:00 AM Eastern Daylight Time', '2018-04-26T02:00:00-04:00'),
+    ('1 month ago (25, Apr 2018 7:18:54 PM)', ['en'], '1 month ago (25, Apr 2018 7:18:54 PM)', '2018-04-25T19:18:54'),
+    ('Publication : 9 novembre 2017', ['en'], '9 novembre 2017', '2017-11-09T00:00:00'),
+    ('11/12', ['es'], '11/12', '2018-12-11T00:00:00'),
+    ('05 49 27 01 11', ['fr'], '05 49 27 01 11', None),
+    ('09 65 35 95 16', ['fr'], '09 65 35 95 16', None),
+    ('4 days ago / May 3, 2019 / 5:46 AM', ['en'], 'May 3, 2019 / 5:46 AM / 4 days ago', '2019-05-03T05:46:00'),
+]
+DATE_CASES_SUCCESS = [
+    ('- 24/04/2018', ['en'], '- 24/04/2018', '2018-04-24T00:00:00'),
+    ('- Jun 17, 2018 10:58 pm', ['en'], '- Jun 17, 2018 10:58 pm', '2018-06-17T22:58:00'),
+    ('/ 30 June, 2017', ['en'], '/ 30 June, 2017', '2017-06-30T00:00:00'),
+    ('01 Aprile 2015', ['it'], '01 Aprile 2015', '2015-04-01T00:00:00'),
+    ('01 June 2016', ['en'], '01 June 2016', '2016-06-01T00:00:00'),
+    ('01 Июня, 2015', ['ru'], '01 Июня, 2015', '2015-06-01T00:00:00'),
+    ('01/18/2018', ['en'], '01/18/2018', '2018-01-18T00:00:00'),
+    ('04.25.18 10:02 PM ET', ['en'], '04.25.18 10:02 PM ET', '2018-04-25T22:02:00-05:00'),
+    ('04.25.2018', ['en'], '04.25.2018', '2018-04-25T00:00:00'),
+    ('04/25/18 09:45 PM EDT', ['en'], '04/25/18 09:45 PM EDT', '2018-04-25T21:45:00-04:00'),
+    ('05 Junio 2017', ['es'], '05 Junio 2017', '2017-06-05T00:00:00'),
+    ('05 июля 2016', ['ru'], '05 июля 2016', '2016-07-05T00:00:00'),
+    ('06 lipiec 2015', ['pl', 'en'], '06 lipiec 2015', '2015-07-06T00:00:00'),
+    ('07 Jun, 2018', ['en'], '07 Jun, 2018', '2018-06-07T00:00:00'),
+    ('07 Maggio 2014', ['it'], '07 Maggio 2014', '2014-05-07T00:00:00'),
+    ('07 Mayo 2017', ['es', 'en'], '07 Mayo 2017', '2017-05-07T00:00:00'),
+    ('07 November 2017', ['en'], '07 November 2017', '2017-11-07T00:00:00'),
+    ('07 de noviembre de 2017', ['es'], '07 de noviembre de 2017', '2017-11-07T00:00:00'),
+    ('1 abril 2013 13:03:00', ['es'], '1 abril 2013 13:03:00', '2013-04-01T13:03:00'),
+    ('1 año hace', ['es'], '1 año hace', '2017-06-28T00:00:00'),
+    ('1 mayo, 2017', ['es'], '1 mayo, 2017', '2017-05-01T00:00:00'),
+    ('1 oktober 2015', [], '1 oktober 2015', '2015-10-01T00:00:00'),
+    ('1 year ago', ['ar'], '1 year ago', '2017-06-28T00:00:00'),
+    ('1-15', ['en'], '1-15', '2018-01-15T00:00:00'),
+    ('1. Mai 2005', ['de'], '1. Mai 2005', '2005-05-01T00:00:00'),
+    ('10 Styczeń 2017', ['pl'], '10 Styczeń 2017', '2017-01-10T00:00:00'),
+    ('10. Juli 2010', ['de'], '10. Juli 2010', '2010-07-10T00:00:00'),
+    ('10. marec, 2017', ['sl'], '10. marec, 2017', '2017-03-10T00:00:00'),
+    ('10/05/2018', ['en'], '10/05/2018', '2018-10-05T00:00:00'),
+    ('11 lutego 2018', ['pl'], '11 lutego 2018', '2018-02-11T00:00:00'),
+    ('11 marzo, 2017', ['es'], '11 marzo, 2017', '2017-03-11T00:00:00'),
+    ('11 noviembre, 2017', ['es'], '11 noviembre, 2017', '2017-11-11T00:00:00'),
+    ('11 septembre 2017', ['fr'], '11 septembre 2017', '2017-09-11T00:00:00'),
+    ('11. März 2017', ['en', 'de'], '11. März 2017', '2017-03-11T00:00:00'),
+    ('11. srpen 2016', ['cs'], '11. srpen 2016', '2016-08-11T00:00:00'),
+    ('12 Julio, 2017', ['es'], '12 Julio, 2017', '2017-07-12T00:00:00'),
+    ('12 luglio 2017', ['it'], '12 luglio 2017', '2017-07-12T00:00:00'),
+    ('12 maja 2017', ['pl'], '12 maja 2017', '2017-05-12T00:00:00'),
+    ('12 mois', ['fr'], '12 mois', '2017-06-28T00:00:00'),  # ago?
+    ('12. Juni 2014', ['de'], '12. Juni 2014', '2014-06-12T00:00:00'),
+    ('12. Mai 2011', ['de'], '12. Mai 2011', '2011-05-12T00:00:00'),
+    ('12th September 2017', ['en'], '12th September 2017', '2017-09-12T00:00:00'),
+    ('13 - Sep - 2017', ['en'], '13 - Sep - 2017', '2017-09-13T00:00:00'),
+    ('13 Mar 2014', ['en'], '13 Mar 2014', '2014-03-13T00:00:00'),
+    ('13 enero, 2011', ['es'], '13 enero, 2011', '2011-01-13T00:00:00'),
+    ('13 novembre 2017', ['fr'], '13 novembre 2017', '2017-11-13T00:00:00'),
+    ('13 noviembre, 2017', ['es'], '13 noviembre, 2017', '2017-11-13T00:00:00'),
+    ('13 septembre', ['fr'], '13 septembre', '2018-09-13T00:00:00'),
+    ('13 мая 2017', ['ru', 'sr'], '13 мая 2017', '2017-05-13T00:00:00'),
+    ('13F Report Date: 9/30/2016', ['en'], '13F Report Date: 9/30/2016', '2016-09-30T00:00:00'),
+    ('14 Novembre 2015', ['it', 'en'], '14 Novembre 2015', '2015-11-14T00:00:00'),
+    ('15 марта 2017, в 08:30', ['ru'], '15 марта 2017, в 08:30', '2017-03-15T08:30:00'),
+    ('16 gennaio 2014', ['it'], '16 gennaio 2014', '2014-01-16T00:00:00'),
+    ('16 juny 2015', ['ca'], '16 juny 2015', '2015-06-16T00:00:00'),
+    ('18 ianuarie 2013', ['ro', 'en'], '18 ianuarie 2013', '2013-01-18T00:00:00'),
+    ('18. říjen 2017', ['cs'], '18. říjen 2017', '2017-10-18T00:00:00'),
+    ('1:55 AM 04/26/2018', ['en'], '1:55 AM 04/26/2018', '2018-04-26T01:55:00'),
+    ('2 months ago', ['en'], '2 months ago', '2018-04-28T00:00:00'),
+    ('20 Apr 2018 at 22:06', ['en'], '20 Apr 2018 at 22:06', '2018-04-20T22:06:00'),
+    ('2009. július 13. hétfő', ['hu', 'en'], '2009. július 13. hétfő', '2009-07-13T00:00:00'),
+    ('2014. július 31. 10:00', ['hu'], '2014. július 31. 10:00', '2014-07-31T10:00:00'),
+    ('2015. november 25. (szerda) 22:53', ['hu', 'en'], '2015. november 25. (szerda) 22:53', '2015-11-25T22:53:00'),
+    ('2017-06-22 08:54:58', ['en'], '2017-06-22 08:54:58', '2017-06-22T08:54:58'),
+    ('2018 06 18', ['en'], '2018 06 18', '2018-06-18T00:00:00'),
+    ('2018-04-24T21:07:58Z', [], '2018-04-24T21:07:58Z', '2018-04-24T21:07:58+00:00'),
+    ('22 augustus 2016', ['nl'], '22 augustus 2016', '2016-08-22T00:00:00'),
+    ('230 days ago', ['en'], '230 days ago', '2017-11-10T00:00:00'),
+    ('24 abril 2018', ['es', 'en'], '24 abril 2018', '2018-04-24T00:00:00'),
+    ('24 mayo, 2011', ['es'], '24 mayo, 2011', '2011-05-24T00:00:00'),
+    ('25 Apr 2018', ['en'], '25 Apr 2018', '2018-04-25T00:00:00'),
+    ('25 April 2018', ['en'], '25 April 2018', '2018-04-25T00:00:00'),
+    ('25 Οκτ,2017', ['el'], '25 Οκτ,2017', '2017-10-25T00:00:00'),
+    ('25.04.2018 | 10:47', ['en'], '25.04.2018 | 10:47', '2018-04-25T10:47:00'),
+    ('26 Apr 2018 07:00 GMT', ['en'], '26 Apr 2018 07:00 GMT', '2018-04-26T07:00:00+00:00'),
+    ('26 april 2018 04:00', ['sv', 'no', 'en'], '26 april 2018 04:00', '2018-04-26T04:00:00'),
+    ('26.04.2018 - 09:33 Uhr', ['de', 'en'], '26.04.2018 - 09:33 Uhr', '2018-04-26T09:33:00'),
+    ('26.09.2017', ['en'], '26.09.2017', '2017-09-26T00:00:00'),
+    ('26/04 - 00:14', ['en'], '26/04 - 00:14', '2018-04-26T00:14:00'),
+    ('26/04/2018 01:40 | Actualizado a 26/04/2018 03:36', ['es'], '26/04/2018 01:40 | Actualizado a 26/04/2018 03:36',
+     '2018-04-26T01:40:00'),
+    ('26th April 2018', ['en'], '26th April 2018', '2018-04-26T00:00:00'),
+    ('29th April 2015', ['en'], '29th April 2015', '2015-04-29T00:00:00'),
+    ('2nd November 2017 12:26 pm', ['en'], '2nd November 2017 12:26 pm', '2017-11-02T12:26:00'),
+    ('30 September 2017, 11:37 pm', ['en'], '30 September 2017, 11:37 pm', '2017-09-30T23:37:00'),
+    ('3月 06, 2011', ['ja', 'en', 'da'], '3月 06, 2011', '2011-03-06T00:00:00'),
+    ('4/21/17 11:41am', ['en'], '4/21/17 11:41am', '2017-04-21T11:41:00'),
+    ('4/25/2018 08:17:00 PM', ['en'], '4/25/2018 08:17:00 PM', '2018-04-25T20:17:00'),
+    ('7 tweets', ['en'], '7 tweets', None),
+    ('Apr 18, 2018', ['fr'], 'Apr 18, 2018', '2018-04-18T00:00:00'),
+    ('Apr 25', ['en'], 'Apr 25', '2018-04-25T00:00:00'),
+    ('Apr 25, 2018', ['en'], 'Apr 25, 2018', '2018-04-25T00:00:00'),
+    ('Apr 25, 2018 11:53 p.m. ET', ['en'], 'Apr 25, 2018 11:53 p.m. ET', '2018-04-25T23:53:00-05:00'),
+    ('Apr 26, 2018 00:00 IST', ['en'], 'Apr 26, 2018 00:00 IST', '2018-04-26T00:00:00+02:00'),
+    ('Apr 27, 2018 - 6:32 PM', ['en'], 'Apr 27, 2018 - 6:32 PM', '2018-04-27T18:32:00'),
+    ('Apr. 26, 2018', ['en'], 'Apr. 26, 2018', '2018-04-26T00:00:00'),
+    ('April 07, 2017', ['en'], 'April 07, 2017', '2017-04-07T00:00:00'),
+    ('April 24, 2018 05:29 PM', ['en'], 'April 24, 2018 05:29 PM', '2018-04-24T17:29:00'),
+    ('April 25 2018, 10:00pm,', ['en'], 'April 25 2018, 10:00pm,', '2018-04-25T22:00:00'),
+    ('April 25, 2018 9:56 PM EDT', ['en'], 'April 25, 2018 9:56 PM EDT', '2018-04-25T21:56:00-04:00'),
+    ('April 25, 2018 @10:11 PM', ['en'], 'April 25, 2018 @10:11 PM', '2018-04-25T22:11:00'),
+    ('April 25, 2018 at 11:31 pm', ['en'], 'April 25, 2018 at 11:31 pm', '2018-04-25T23:31:00'),
+    ('April 25, 2018 | 10:26pm', ['en'], 'April 25, 2018 | 10:26pm', '2018-04-25T22:26:00'),
+    ('April 25, 2018 | 4:27 PM', ['en'], 'April 25, 2018 | 4:27 PM', '2018-04-25T16:27:00'),
+    ('April 25, 2018, 6:08 PM', ['en'], 'April 25, 2018, 6:08 PM', '2018-04-25T18:08:00'),
+    ('April 26th, 2018 | Author: Gerry', ['en'], 'April 26th, 2018 | Author: Gerry', '2018-04-26T00:00:00'),
+    ('April 27, 2018', ['en'], 'April 27, 2018', '2018-04-27T00:00:00'),
+    ('August 22, 1939', ['en'], 'August 22, 1939', '1939-08-22T00:00:00'),
+    ('Breaking News April 25, 2018 11:14', ['en'], 'Breaking News April 25, 2018 11:14', '2018-04-25T11:14:00'),
+    ('By Francis Arinze Iloani | Publish Date: Apr 26 2018 4:00AM', ['en'],
+     'Apr 26 2018 4:00AM', '2018-04-26T04:00:00'),
+    ('Creado: Lunes, 04 Noviembre 2013 17:16', ['es'], 'Lunes, 04 Noviembre 2013 17:16', '2013-11-04T17:16:00'),
+    ('Created: 06 November 2017', ['en'], '06 November 2017', '2017-11-06T00:00:00'),
+    ('Danny Bird | Published 26 April 2017', ['en'], 'Danny Bird | Published 26 April 2017', '2017-04-26T00:00:00'),
+    ('Ditayangkan: 02 Desember 2010', ['id'], '02 Desember 2010', '2010-12-02T00:00:00'),
+    ('Euan Andrews , April 26th, 2018 06:40', ['en'], 'Euan Andrews , April 26th, 2018 06:40', '2018-04-26T06:40:00'),
+    ('Kreirano: 04 Veljača 2016', ['hr'], '04 Veljača 2016', '2016-02-04T00:00:00'),
+    ('Laatst bijgewerkt: 17 oktober 2016', ['nl'], '17 oktober 2016', '2016-10-17T00:00:00'),
+    ('Last Updated: 02 January 2018', ['en'], '02 January 2018', '2018-01-02T00:00:00'),
+    ('Last modified: 18 Oct 2017', ['en'], '18 Oct 2017', '2017-10-18T00:00:00'),
+    ('Last updated 26 Apr 2018, 12:12 pm', ['en'], 'Last updated 26 Apr 2018, 12:12 pm', '2018-04-26T12:12:00'),
+    ('Latest update : 2018-04-26', ['en'], '2018-04-26', '2018-04-26T00:00:00'),
+    ('Lundi 05 septembre 2016', ['fr'], 'Lundi 05 septembre 2016', '2016-09-05T00:00:00'),
+    ('Megjelent: 2017. május 12.', ['hu'], '2017. május 12.', '2017-05-12T00:00:00'),
+    ('Monday, April 17, 2017', ['en'], 'Monday, April 17, 2017', '2017-04-17T00:00:00'),
+    ('Neil Macdonald , April 15th, 2013 05:04', ['en'], 'Neil Macdonald , April 15th, 2013 05:04', '2013-04-15T05:04:00'),
+    ('Objavljeno: 13 Srpanj 2017', ['hr'], '13 Srpanj 2017', '2017-07-13T00:00:00'),
+    ('On April 25, 2018', ['en'], 'On April 25, 2018', '2018-04-25T00:00:00'),
+    ('Opublikowano: 02 marzec 2017', ['pl'], '02 marzec 2017', '2017-03-02T00:00:00'),
+    ('POSTED April 26, 2018', ['en'], 'POSTED April 26, 2018', '2018-04-26T00:00:00'),
+    ('Paskelbta: 2017 spalio 09', ['lt'], '2017 spalio 09', '2017-10-09T00:00:00'),
+    ('Posted 2018-04-16', ['en'], 'Posted 2018-04-16', '2018-04-16T00:00:00'),
+    ('Posted: 04/25/2018 06:44:37 PM MDT', ['en'], '04/25/2018 06:44:37 PM MDT', '2018-04-25T18:44:37-06:00'),
+    ('Pubblicato: 30 Ottobre 2017', ['it'], '30 Ottobre 2017', '2017-10-30T00:00:00'),
+    ('Publicado: 05 Noviembre 2015', ['es'], '05 Noviembre 2015', '2015-11-05T00:00:00'),
+    ('Publicat: 06 Decembrie 2016', ['ro'], '06 Decembrie 2016', '2016-12-06T00:00:00'),
+    ('Publicerad 16 mars 2015', ['sv'], 'Publicerad 16 mars 2015', '2015-03-16T00:00:00'),
+    ('Published 29 August, 2015', [], 'Published 29 August, 2015', '2015-08-29T00:00:00'),
+    ('Published April 26, 2018 by Lisa Campbell', ['en'],
+     'Published April 26, 2018 by Lisa Campbell', '2018-04-26T00:00:00'),
+    ('Sabato 1 Aprile 2017', ['it'], 'Sabato 1 Aprile 2017', '2017-04-01T00:00:00'),
+    ('Scris pe 16 februarie 2017 17 Comments', ['ro'], 'Scris pe 16 februarie 2017 17 Comments', '2017-02-16T00:00:00'),
+    ('Sunday, 23 July 2017', ['en'], 'Sunday, 23 July 2017', '2017-07-23T00:00:00'),
+    ('Thu, 04/26/2018 - 05:22', ['en'], 'Thu, 04/26/2018 - 05:22', '2018-04-26T05:22:00'),
+    ('Thursday 26 April 2018 10:00 UTC', ['en'], 'Thursday 26 April 2018 10:00 UTC', '2018-04-26T10:00:00+00:00'),
+    ('Thursday, April 26, 2018', ['en'], 'Thursday, April 26, 2018', '2018-04-26T00:00:00'),
+    ('Ultima modifica: 18 Ottobre 2010', ['it'], '18 Ottobre 2010', '2010-10-18T00:00:00'),
+    ('Updated: Apr 25, 2018 09:44 PM PDT', ['en'], 'Apr 25, 2018 09:44 PM PDT', '2018-04-25T21:44:00-07:00'),
+    ('Utworzono: 08 listopad 2017', [], '08 listopad 2017', '2017-11-08T00:00:00'),
+    ('Wed 5:18 PM, Apr 25, 2018', ['en'], 'Wed 5:18 PM, Apr 25, 2018', '2018-04-25T17:18:00'),
+    ('Wednesday 25 April 2018 - 5:01pm', ['en'], 'Wednesday 25 April 2018 - 5:01pm', '2018-04-25T17:01:00'),
+    ('Zuletzt aktualisiert: 12. Februar 2016', ['de'], '12. Februar 2016', '2016-02-12T00:00:00'),
+    ('Zveřejněno: 12. duben 2017', ['cs'], '12. duben 2017', '2017-04-12T00:00:00'),
+    ('by Joseph A. Wulfsohn | 11:06 pm, April 25th, 2018', ['en'],
+     'by Joseph A. Wulfsohn | 11:06 pm, April 25th, 2018', '2018-04-25T23:06:00'),
+    ('on August 11, 2015', ['en'], 'on August 11, 2015', '2015-08-11T00:00:00'),
+    ('| 7:45 pm', ['en'], '| 7:45 pm', '2018-06-28T19:45:00'),
+    ('| April 25, 2018 08:15 PM', ['en'], '| April 25, 2018 08:15 PM', '2018-04-25T20:15:00'),
+    ('| Updated July 27, 2017', ['en'], '| Updated July 27, 2017', '2017-07-27T00:00:00'),
+    ('Última actualización: Lunes, 06 Julio 2015 18:00', ['es'],
+     'Lunes, 06 Julio 2015 18:00', '2015-07-06T18:00:00'),
+    ('Được đăng: 14 Tháng 7 2017', ['vi'], '14 Tháng 7 2017', '2017-07-14T00:00:00'),
+    ('Được đăng: 17 Tháng 10 2016', ['vi'], '17 Tháng 10 2016', '2016-10-17T00:00:00'),
+    ('Πρώτη καταχώρηση: Τρίτη, 16 Αυγούστου 2016, 14:26', ['el'],
+     'Τρίτη, 16 Αυγούστου 2016, 14:26', '2016-08-16T14:26:00'),
+    ('Τελευταία ενημέρωση : 08 Αύγουστος 2017', ['el', 'es'],
+     '08 Αύγουστος 2017', '2017-08-08T00:00:00'),
+    ('Дата: 01 серпня 2006', ['uk', 'en'], '01 серпня 2006', '2006-08-01T00:00:00'),
+    ('Опубликовано: 07 января 2017', ['ru'], '07 января 2017', '2017-01-07T00:00:00'),
+    ('Публикувана на 07 Юни 2018', ['bg'], 'Публикувана на 07 Юни 2018', '2018-06-07T00:00:00'),
+    ('от Administrator · Сентябрь 25, 2017', ['ru', 'en'],
+     'от Administrator · Сентябрь 25, 2017', '2017-09-25T00:00:00'),
+    ('מאי 11, 2015', ['iw'], 'מאי 11, 2015', '2015-05-11T00:00:00'),
+    ('ธันวาคม 31, 2016', ['th'], 'ธันวาคม 31, 2016', '2016-12-31T00:00:00'),
+    ('เผยแพร่เมื่อ: วันอังคาร, 18 เมษายน 2560 11:14', ['th'],
+     'วันอังคาร, 18 เมษายน 2560 11:14', '2560-04-18T11:14:00'),
+    ('— 26 Apr, 2018', ['en'], '— 26 Apr, 2018', '2018-04-26T00:00:00'),
+    ('Publiceret: 24. juli 2017', ['da'], '24. juli 2017', '2017-07-24T00:00:00'),
+    ('Veröffentlicht: 24. August 2017', ['de'], '24. August 2017', '2017-08-24T00:00:00'),
+    ('11/12', ['en'], '11/12', '2018-11-12T00:00:00'),
+    ('(201) 254-0596', ['en'], '(201) 254-0596', None),
+    ('01', ['en'], '01', None),
+    ('0823.1543014', ['it'], '0823.1543014', None),
+    ('100 głosów', ['pl'], '100 głosów', None),
+    ('16年12月22日', ['ja'], '16年12月22日', '2016-12-22T00:00:00'),
+    ('2 meses ago', ['es'], '2 meses ago', '2018-04-28T00:00:00'),
+    ('日期:2020年2月1日 下午6:25', ['zh'], '2020年2月1日 下午6:25', '2020-02-01T18:25:00'),
+    ('约会᠄ 2020年2月1日 下午6:25', ['zh'], '2020年2月1日 下午6:25', '2020-02-01T18:25:00'),
+    ('416 Pages / Published: 03/05/2018', ['en'], '416 Pages / Published: 03/05/2018', '2018-03-05T00:00:00'),
+    ('Posted on 07.23.16', ['en'], 'Posted on 07.23.16', '2016-07-23T00:00:00'),
+    ('May 3, 2019 / 5:46 AM / 4 days ago', ['en'], 'May 3, 2019 / 5:46 AM / 4 days ago', '2019-05-03T05:46:00'),
+    ('Oct 1, 2018 4:40 PM EST —', ['en'], 'Oct 1, 2018 4:40 PM EST —', '2018-10-01T16:40:00-05:00'),
+    ]
+DATE_CASES = [pytest.param(*row) for row in DATE_CASES_SUCCESS] + \
+    [pytest.param(*row, marks=pytest.mark.xfail) for row in DATE_CASES_FAILURES]
+
+
+@pytest.mark.parametrize(['value', 'languages', 'expected_raw', 'expected'], DATE_CASES)
+def test_find_date(value, languages, expected_raw, expected):
+    # value was used to test pre-cleanup step, using expected_raw instead here
+    # dt_raw = clean_date_attr_prefix(value)
+    # assert dt_raw == expected_raw
+    dt = find_date(
+        expected_raw, languages=languages, settings={'RELATIVE_BASE': DATE_TEST_BASE})
+    if expected is None:
+        assert dt is None
+    else:
+        assert dt.isoformat() == expected