From ae26f3630c2ae823374a9756b3401cb9a5a5cf3c Mon Sep 17 00:00:00 2001
From: Jean-Marie Favreau <jean-marie.favreau@logiroad-center.com>
Date: Sun, 9 Mar 2025 15:54:28 +0100
Subject: [PATCH] On ajoute l'import de pages de l'agenda associations CF

Fix #277
---
 experimentations/get_associations_cf.py       | 38 ++++++++++
 .../custom_extractors/associations_cf.py      | 72 +++++++++++++++++++
 src/agenda_culturel/import_tasks/extractor.py |  2 +
 3 files changed, 112 insertions(+)
 create mode 100755 experimentations/get_associations_cf.py
 create mode 100644 src/agenda_culturel/import_tasks/custom_extractors/associations_cf.py

diff --git a/experimentations/get_associations_cf.py b/experimentations/get_associations_cf.py
new file mode 100755
index 0000000..4551519
--- /dev/null
+++ b/experimentations/get_associations_cf.py
@@ -0,0 +1,38 @@
+#!/usr/bin/python3
+# coding: utf-8
+
+import json
+import os
+import sys
+
+# Directory containing this script; os.path.realpath resolves any
+# symbolic links in the path first.
+current = os.path.dirname(os.path.realpath(__file__))
+
+# One level above the script directory. The imports further down reach
+# the project through this directory as `src.*`.
+parent = os.path.dirname(current)
+
+# Append that directory, then its "src" subdirectory, to sys.path.
+sys.path.extend(
+    [parent, parent + "/src"]
+)
+
+from src.agenda_culturel.import_tasks.downloader import (
+    ChromiumHeadlessDownloader,
+)
+from src.agenda_culturel.import_tasks.custom_extractors.associations_cf import (
+    CExtractor,
+)
+from src.agenda_culturel.import_tasks.importer import URL2Events
+
+if __name__ == "__main__":
+    # Process one sample event URL and save the resulting events as indented JSON.
+    url = "https://associations.clermont-ferrand.fr/evenement/week-end-multi-culturel"
+    pipeline = URL2Events(ChromiumHeadlessDownloader(), CExtractor())
+    events = pipeline.process(url, cache="asso_cf.html", published=True)
+
+    out_path = "event-asso_cf.json"
+    print("Saving events to file {}".format(out_path))
+    with open(out_path, "w") as out:
+        json.dump(events, out, indent=4, default=str)
diff --git a/src/agenda_culturel/import_tasks/custom_extractors/associations_cf.py b/src/agenda_culturel/import_tasks/custom_extractors/associations_cf.py
new file mode 100644
index 0000000..8c65bcf
--- /dev/null
+++ b/src/agenda_culturel/import_tasks/custom_extractors/associations_cf.py
@@ -0,0 +1,72 @@
+import logging
+import re
+from urllib.parse import urljoin, urlparse
+
+from bs4 import BeautifulSoup
+
+from ..extractor import Extractor
+
+logger = logging.getLogger(__name__)
+
+
+class CExtractor(Extractor):
+    """Single-event extractor for associations.clermont-ferrand.fr pages."""
+
+    def extract(
+        self,
+        content,
+        url,
+        url_human=None,
+        default_values=None,
+        published=False,
+    ):
+        """Parse the HTML ``content`` of ``url`` and return ``get_structure()``.
+
+        One event is added only for associations.clermont-ferrand.fr pages that
+        hold a title, body and location node; ``published`` is unused here.
+        """
+        self.set_header(url)
+        if urlparse(url).netloc != "associations.clermont-ferrand.fr":
+            return self.get_structure()
+
+        soup = BeautifulSoup(content, "html.parser")
+        title = soup.select_one("h1")
+        body = soup.select_one("div.field--name-body")
+        place = soup.select_one("div.c-location__holder .c-desc")
+        if title is None or body is None or place is None:
+            # Unexpected layout: add no event rather than raise AttributeError.
+            logger.warning("Unexpected page layout, no event found: %s", url)
+            return self.get_structure()
+
+        image = image_alt = None
+        img = soup.select_one("img.image-style-event")
+        if img is not None and img.get("src"):
+            # urljoin keeps absolute URLs and resolves relative ones against url.
+            image = urljoin(url, img["src"])
+            image_alt = img.get("alt")
+
+        start_day = soup.select_one("div.o-date")
+        if start_day is not None:
+            start_day = Extractor.parse_french_date(
+                re.sub("[ ]*\n[ ]*", " ", start_day.get_text(separator=" "))
+            )
+
+        start_time = soup.select_one("div.c-hours__holder .c-desc")
+        if start_time is not None:
+            start_time = Extractor.parse_french_time(start_time.text)
+
+        self.add_event(
+            default_values,
+            title.text.strip(),
+            None,  # category
+            start_day,
+            place.text,
+            body.get_text(separator="\n"),
+            [],  # tags
+            [url],  # uuids
+            url_human=url_human,
+            start_time=start_time,
+            image=image,
+            image_alt=image_alt,
+        )
+        return self.get_structure()
diff --git a/src/agenda_culturel/import_tasks/extractor.py b/src/agenda_culturel/import_tasks/extractor.py
index 11add78..305b2f8 100644
--- a/src/agenda_culturel/import_tasks/extractor.py
+++ b/src/agenda_culturel/import_tasks/extractor.py
@@ -315,11 +315,13 @@ class Extractor(ABC):
             CExtractor as GoogleCalendarLinkEventExtractor,
         )
         from .generic_extractors.ical import ICALExtractor
+        from .custom_extractors.associations_cf import CExtractor as AssociationsCF
 
         if single_event:
             return [
                 FacebookEventExtractor(),
                 GoogleCalendarLinkEventExtractor(),
+                AssociationsCF(),
                 EventNotFoundExtractor(),
             ]
         else: