diff --git a/README.md b/README.md index 9aabc9c..901b663 100644 --- a/README.md +++ b/README.md @@ -12,6 +12,7 @@ Parmi les outils et ressources sur lesquelles s'appuie l'agenda culturel, on peu - [Selenium](https://www.selenium.dev/) - [Feather icons](https://feathericons.com/) - [Pico CSS](https://picocss.com/) +- [chronostring](https://forge.chapril.org/jmtrivial/chronostring) (des mêmes auteurs) ## Installation diff --git a/src/agenda_culturel/import_tasks/custom_extractors/mille_formes.py b/src/agenda_culturel/import_tasks/custom_extractors/mille_formes.py index 1bef3c2..9261865 100644 --- a/src/agenda_culturel/import_tasks/custom_extractors/mille_formes.py +++ b/src/agenda_culturel/import_tasks/custom_extractors/mille_formes.py @@ -1,9 +1,10 @@ -from datetime import date +from datetime import date, datetime from urllib.parse import urlparse +from chronostring import parse_dates +import re from bs4 import BeautifulSoup -from ..extractor import Extractor from ..twosteps_extractor import TwoStepsExtractorNoPause @@ -53,33 +54,6 @@ class CExtractor(TwoStepsExtractorNoPause): return result - # this method is not perfect, but dates and hours are not structured - def parse_dates(self, date): - dl = date.replace(" à ", "\n").split("\n") - result = [] - - for d in dl: - # only lines with a digit - if sum(c.isdigit() for c in d) != 0: - # split subparts - for d2 in d.replace(" et ", ", ").split(", "): - d2 = d2.strip() - dd = Extractor.parse_french_date( - d2, default_year_by_proximity=self.today - ) - if dd is None: - hh = Extractor.parse_french_time(d2) - for i, r in enumerate(result): - result[i][1].append(hh) - else: - result.append([dd, []]) - - if "De" in date and " à " in date: - for i, r in enumerate(result): - result[i].append(True) - - return result - def build_event_url_list(self, content, infuture_days=180): soup = BeautifulSoup(content, "html.parser") links = soup.select(".cell a.evenement") @@ -118,104 +92,64 @@ class 
CExtractor(TwoStepsExtractorNoPause): soup.select_one(".champ.taxo-age").text category = self.parse_category(soup.select_one(".champ.categorie").text) - date = soup.select_one(".champ.date-libre").text + dt = soup.select_one(".champ.date-libre").text description = "\n\n".join( - [x for x in [soustitre, description, date, infos] if x is not None] + [x for x in [soustitre, description, dt, infos] if x is not None] ) if ( - " au " in date - or date.startswith("Du") - or date.lower().strip() == "en continu" - or date.startswith("Les") + " au " in dt + or dt.startswith("Du") + or dt.lower().strip() == "en continu" + or dt.startswith("Les") ): return - dates = self.parse_dates(date) + dates = [] + for dl in dt.split("\n"): + if re.match(r".* ans\s*:.*", dl): + dates += parse_dates(dl.split(":", 1)[1]) + else: + dates += parse_dates(dl) for d in dates: - if len(d) >= 2: - start_day = d[0] + start_day = None + start_time = None + end_day = None + end_time = None + if isinstance(d, datetime): + start_day = d.date() + start_time = d.time() + elif isinstance(d, date): + start_day = d + elif isinstance(d, list) and len(d) == 2: + start_day = d[0].date() + start_time = d[0].time() + end_day = d[1].date() + end_time = d[1].time() - if len(d) == 3 and len(d[1]) == 2: - start_time = d[1][0] - end_time = d[1][1] - uuid = ( - event_url - + "?date=" - + str(start_day) - + "&hour=" - + str(start_time) - ) - self.add_event_with_props( - default_values, - event_url, - title, - category, - start_day, - location, - description, - [], - recurrences=None, - uuids=[uuid], - url_human=event_url, - start_time=start_time, - end_day=start_day, - end_time=end_time, - published=published, - image=image, - image_alt=image_alt, - ) - else: - end_time = None - if len(d[1]) == 0: - start_time = None - uuid = event_url + "?date=" + str(start_day) - self.add_event_with_props( - default_values, - event_url, - title, - category, - start_day, - location, - description, - [], - recurrences=None, - 
uuids=[uuid], - url_human=event_url, - start_time=start_time, - end_day=start_day, - end_time=end_time, - published=published, - image=image, - image_alt=image_alt, - ) - for t in d[1]: - start_time = t - uuid = ( - event_url - + "?date=" - + str(start_day) - + "&hour=" - + str(start_time) - ) - self.add_event_with_props( - default_values, - event_url, - title, - category, - start_day, - location, - description, - [], - recurrences=None, - uuids=[uuid], - url_human=event_url, - start_time=start_time, - end_day=start_day, - end_time=end_time, - published=published, - image=image, - image_alt=image_alt, - ) + if start_day is not None: + uuid = event_url + "?date=" + str(start_day) + if start_time is not None: + uuid = uuid + "&hour=" + str(start_time) + + self.add_event_with_props( + default_values, + event_url, + title, + category, + start_day, + location, + description, + [], + recurrences=None, + uuids=[uuid], + url_human=event_url, + start_time=start_time, + end_day=end_day, + end_time=end_time, + published=published, + image=image, + image_alt=image_alt, + ) diff --git a/src/requirements.txt b/src/requirements.txt index 678cbac..82d9d91 100644 --- a/src/requirements.txt +++ b/src/requirements.txt @@ -51,3 +51,4 @@ django-cleanup==9.0.0 django-unused-media==0.2.2 django-resized==1.0.3 django-solo==2.4.0 +chronostring==0.1.2