From 938ece53262be1054a46b16f0a5a25eed6234847 Mon Sep 17 00:00:00 2001 From: Jean-Marie Favreau Date: Sun, 9 Feb 2025 10:30:08 +0100 Subject: [PATCH] =?UTF-8?q?Am=C3=A9lioration=20de=20l'import=20des=20?= =?UTF-8?q?=C3=A9v=C3=A9nements=20avec=20google=20calendar?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../import_tasks/extractor_ggcal_link.py | 26 +++++++++++++++++-- .../import_tasks/generic_extractors.py | 11 +++++--- 2 files changed, 31 insertions(+), 6 deletions(-) diff --git a/src/agenda_culturel/import_tasks/extractor_ggcal_link.py b/src/agenda_culturel/import_tasks/extractor_ggcal_link.py index da5344e..5ce6ead 100644 --- a/src/agenda_culturel/import_tasks/extractor_ggcal_link.py +++ b/src/agenda_culturel/import_tasks/extractor_ggcal_link.py @@ -14,7 +14,27 @@ logger = logging.getLogger(__name__) class GoogleCalendarLinkEventExtractor(Extractor): def __init__(self): super().__init__() - self.possible_urls = ["https://calendar.google.com/calendar/", "https://addtocalendar.com/"] + self.possible_urls = ["https://calendar.google.com/calendar/", "https://addtocalendar.com/", "https://www.google.com/calendar/event"] + + + def guess_image(self, soup, url): + image = soup.find("meta", property="og:image") + + if image is None: + for img in soup.select('img'): + if img.find_parent(name='nav'): + continue + image = img["src"] + break + else: + image = image["content"] + + if image.startswith("/"): + root_url = "https://" + urlparse(url).netloc + "/" + image = root_url + image + + return image + def extract( self, content, url, url_human=None, default_values=None, published=False @@ -40,6 +60,8 @@ class GoogleCalendarLinkEventExtractor(Extractor): self.set_header(url) + image = self.guess_image(soup, url) + category = None self.add_event( @@ -57,7 +79,7 @@ class GoogleCalendarLinkEventExtractor(Extractor): end_day=end_day, end_time=end_time, published=published, - image=None, + image=image, ) break diff --git a/src/agenda_culturel/import_tasks/generic_extractors.py b/src/agenda_culturel/import_tasks/generic_extractors.py index 6380e3f..532c2ef 100644 --- a/src/agenda_culturel/import_tasks/generic_extractors.py +++ b/src/agenda_culturel/import_tasks/generic_extractors.py @@ -1,6 +1,8 @@ from abc import abstractmethod from urllib.parse import urlparse from urllib.parse import parse_qs +from bs4 import BeautifulSoup + import logging @@ -38,10 +40,11 @@ class GGCalendar: params = GGCalendar.filter_keys(params) - self.location = params["location"][0] if "location" in params else None - self.title = params["text"][0] if "text" in params else params["title"][0] if "title" in params else None - self.description = params["description"][0] if "description" in params else None - self.location = params["location"][0] if "location" in params else None + self.location = params["location"][0] if "location" in params else "" + self.title = params["text"][0] if "text" in params else params["title"][0] if "title" in params else "" + self.description = params["description"][0] if "description" in params else params["details"][0] if "details" in params else "" + if self.description != "": + self.description = BeautifulSoup(self.description, "html.parser").text if "dates" in params: dates = [x.replace(" ", "+") for x in params["dates"][0].split("/")] if len(dates) > 0: