From ae26f3630c2ae823374a9756b3401cb9a5a5cf3c Mon Sep 17 00:00:00 2001
From: Jean-Marie Favreau
Date: Sun, 9 Mar 2025 15:54:28 +0100
Subject: [PATCH] On ajoute l'import de pages de l'agenda associations CF

Fix #277
---
Review note: the new-file hunks were edited after the commit was made, so
the blob ids on their "index" lines no longer describe the content below.

 experimentations/get_associations_cf.py       |  42 +++++++++++
 .../custom_extractors/associations_cf.py      | 103 +++++++++++++++++++++++++++
 src/agenda_culturel/import_tasks/extractor.py |   2 +
 3 files changed, 147 insertions(+)
 create mode 100755 experimentations/get_associations_cf.py
 create mode 100644 src/agenda_culturel/import_tasks/custom_extractors/associations_cf.py

diff --git a/experimentations/get_associations_cf.py b/experimentations/get_associations_cf.py
new file mode 100755
index 0000000..4551519
--- /dev/null
+++ b/experimentations/get_associations_cf.py
@@ -0,0 +1,42 @@
+#!/usr/bin/python3
+# coding: utf-8
+"""Try the associations.clermont-ferrand.fr extractor on one event page.
+
+The extracted events are saved to ``event-asso_cf.json``.
+"""
+
+import json
+import os
+import sys
+
+# getting the name of the directory
+# where this file is present.
+current = os.path.dirname(os.path.realpath(__file__))
+
+# Getting the parent directory name
+# where the current directory is present.
+parent = os.path.dirname(current)
+
+# adding the parent directory to
+# the sys.path.
+sys.path.append(parent)
+sys.path.append(parent + "/src")
+
+from src.agenda_culturel.import_tasks.downloader import (
+    ChromiumHeadlessDownloader,
+)
+from src.agenda_culturel.import_tasks.custom_extractors.associations_cf import (
+    CExtractor,
+)
+from src.agenda_culturel.import_tasks.importer import URL2Events
+
+if __name__ == "__main__":
+    u2e = URL2Events(ChromiumHeadlessDownloader(), CExtractor())
+    url = "https://associations.clermont-ferrand.fr/evenement/week-end-multi-culturel"
+
+    events = u2e.process(url, cache="asso_cf.html", published=True)
+
+    exportfile = "event-asso_cf.json"
+    print(f"Saving events to file {exportfile}")
+    with open(exportfile, "w", encoding="utf-8") as f:
+        json.dump(events, f, indent=4, default=str)
diff --git a/src/agenda_culturel/import_tasks/custom_extractors/associations_cf.py b/src/agenda_culturel/import_tasks/custom_extractors/associations_cf.py
new file mode 100644
index 0000000..8c65bcf
--- /dev/null
+++ b/src/agenda_culturel/import_tasks/custom_extractors/associations_cf.py
@@ -0,0 +1,103 @@
+import logging
+import re
+from urllib.parse import urljoin, urlparse
+
+from bs4 import BeautifulSoup
+
+from ..extractor import Extractor
+
+logger = logging.getLogger(__name__)
+
+
+class CExtractor(Extractor):
+    """Extract one event from an associations.clermont-ferrand.fr page."""
+
+    # Only pages served by this host are handled by this extractor.
+    NETLOC = "associations.clermont-ferrand.fr"
+
+    def extract(
+        self,
+        content,
+        url,
+        url_human=None,
+        default_values=None,
+        published=False,
+    ):
+        """Parse an event page and register the event it describes.
+
+        Args:
+            content: HTML source of the page.
+            url: address of the page; also used as the event's only uuid
+                and as the base for resolving a relative image source.
+            url_human: forwarded unchanged to ``add_event``.
+            default_values: forwarded unchanged to ``add_event``.
+            published: kept for interface compatibility; unused here.
+
+        Returns:
+            The value of ``get_structure()``. No event is added when
+            ``url`` is not on ``NETLOC`` or the page has no h1 title.
+        """
+        self.set_header(url)
+        if urlparse(url).netloc != self.NETLOC:
+            return self.get_structure()
+
+        soup = BeautifulSoup(content, "html.parser")
+
+        heading = soup.select_one("h1")
+        if heading is None:
+            # Without a title there is nothing usable to import: skip the
+            # page instead of raising AttributeError on None.
+            logger.warning("No h1 title found in %s, event skipped", url)
+            return self.get_structure()
+        title = heading.text.strip()
+
+        image = None
+        image_alt = None
+        img = soup.select_one("img.image-style-event")
+        if img is not None:
+            src = img.get("src")
+            if src:
+                # urljoin leaves absolute URLs alone and also resolves
+                # protocol-relative ("//host/x") and path-relative sources.
+                image = urljoin(url, src)
+            image_alt = img.get("alt")
+
+        # NOTE(review): a missing block now yields None instead of a crash;
+        # confirm add_event accepts None for description and location.
+        body = soup.select_one("div.field--name-body")
+        description = body.get_text(separator="\n") if body is not None else None
+        place = soup.select_one("div.c-location__holder .c-desc")
+        location = place.text.strip() if place is not None else None
+
+        start_day = None
+        day = soup.select_one("div.o-date")
+        if day is not None:
+            # Collapse line breaks (and the spaces around them) before parsing.
+            day_text = re.sub(r"[ ]*\n[ ]*", " ", day.get_text(separator=" "))
+            start_day = Extractor.parse_french_date(day_text)
+
+        start_time = None
+        hours = soup.select_one("div.c-hours__holder .c-desc")
+        if hours is not None:
+            start_time = Extractor.parse_french_time(hours.text)
+
+        category = None
+        tags = []
+        uuids = [url]
+
+        self.add_event(
+            default_values,
+            title,
+            category,
+            start_day,
+            location,
+            description,
+            tags,
+            uuids,
+            url_human=url_human,
+            start_time=start_time,
+            image=image,
+            image_alt=image_alt,
+        )
+
+        return self.get_structure()
diff --git a/src/agenda_culturel/import_tasks/extractor.py b/src/agenda_culturel/import_tasks/extractor.py
index 11add78..305b2f8 100644
--- a/src/agenda_culturel/import_tasks/extractor.py
+++ b/src/agenda_culturel/import_tasks/extractor.py
@@ -315,11 +315,13 @@ class Extractor(ABC):
             CExtractor as GoogleCalendarLinkEventExtractor,
         )
         from .generic_extractors.ical import ICALExtractor
+        from .custom_extractors.associations_cf import CExtractor as AssociationsCF
 
         if single_event:
             return [
                 FacebookEventExtractor(),
                 GoogleCalendarLinkEventExtractor(),
+                AssociationsCF(),
                 EventNotFoundExtractor(),
             ]
         else: