Add import of pages from the associations CF agenda

Fix #277
Jean-Marie Favreau 2025-03-09 15:54:28 +01:00
parent d401f533bd
commit ae26f3630c
3 changed files with 112 additions and 0 deletions


@@ -0,0 +1,38 @@
#!/usr/bin/python3
# coding: utf-8

import json
import os
import sys

# Directory where this file is located.
current = os.path.dirname(os.path.realpath(__file__))
# Parent directory of the current directory.
parent = os.path.dirname(current)
# Add the parent directory (and its src/ subdirectory) to sys.path
# so the project modules can be imported.
sys.path.append(parent)
sys.path.append(parent + "/src")

from src.agenda_culturel.import_tasks.downloader import (
    ChromiumHeadlessDownloader,
)
from src.agenda_culturel.import_tasks.custom_extractors.associations_cf import (
    CExtractor,
)
from src.agenda_culturel.import_tasks.importer import URL2Events

if __name__ == "__main__":
    # Download and extract a single event page from the associations CF agenda.
    u2e = URL2Events(ChromiumHeadlessDownloader(), CExtractor())
    url = "https://associations.clermont-ferrand.fr/evenement/week-end-multi-culturel"
    events = u2e.process(url, cache="asso_cf.html", published=True)

    exportfile = "event-asso_cf.json"
    print("Saving events to file {}".format(exportfile))
    with open(exportfile, "w") as f:
        json.dump(events, f, indent=4, default=str)


@@ -0,0 +1,72 @@
import logging
import re
from urllib.parse import urlparse

from bs4 import BeautifulSoup

from ..extractor import Extractor

logger = logging.getLogger(__name__)


# Custom extractor for event pages of associations.clermont-ferrand.fr
class CExtractor(Extractor):
    def __init__(self):
        super().__init__()

    def extract(
        self,
        content,
        url,
        url_human=None,
        default_values=None,
        published=False,
    ):
        self.set_header(url)

        u = urlparse(url)
        if u.netloc == "associations.clermont-ferrand.fr":
            soup = BeautifulSoup(content, "html.parser")

            title = soup.select_one("h1").text.strip()

            # Illustration image and its alt text; relative URLs are made absolute.
            img = soup.select_one("img.image-style-event")
            image = None
            image_alt = None
            if img:
                image = img["src"]
                if not image.startswith("http"):
                    image = "https://" + u.netloc + image
                image_alt = img["alt"]

            description = soup.select_one("div.field--name-body").get_text(
                separator="\n"
            )
            location = soup.select_one("div.c-location__holder .c-desc").text

            # Date and time, parsed from their French textual representation.
            start_day = soup.select_one("div.o-date")
            if start_day is not None:
                start_day = Extractor.parse_french_date(
                    re.sub("[ ]*\n[ ]*", " ", start_day.get_text(separator=" "))
                )
            start_time = soup.select_one("div.c-hours__holder .c-desc")
            if start_time is not None:
                start_time = Extractor.parse_french_time(start_time.text)

            category = None
            tags = []
            uuids = [url]

            self.add_event(
                default_values,
                title,
                category,
                start_day,
                location,
                description,
                tags,
                uuids,
                url_human=url_human,
                start_time=start_time,
                image=image,
                image_alt=image_alt,
            )

        return self.get_structure()


@@ -315,11 +315,13 @@ class Extractor(ABC):
             CExtractor as GoogleCalendarLinkEventExtractor,
         )
         from .generic_extractors.ical import ICALExtractor
+        from .custom_extractors.associations_cf import CExtractor as AssociationsCF
 
         if single_event:
             return [
                 FacebookEventExtractor(),
                 GoogleCalendarLinkEventExtractor(),
+                AssociationsCF(),
                 EventNotFoundExtractor(),
             ]
         else:
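
For reference, a minimal sketch (not part of the commit) of how the new extractor could be exercised directly on an already-downloaded page, without going through URL2Events. The import path and the extract() signature are taken from the diff above; the local file name asso_cf.html reuses the cache file written by the test script, and whether extract() can be called outside URL2Events is an assumption.

# Sketch: run the new CExtractor on a cached copy of an event page.
# Assumes the page was previously saved as "asso_cf.html" (the cache file
# used by the test script in this commit).
from src.agenda_culturel.import_tasks.custom_extractors.associations_cf import (
    CExtractor,
)

url = "https://associations.clermont-ferrand.fr/evenement/week-end-multi-culturel"

with open("asso_cf.html") as f:
    content = f.read()

# Returns the structure built by add_event()/get_structure().
structure = CExtractor().extract(content, url)
print(structure)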