From 9bb3373f99bab72230dc262c45639084279eca51 Mon Sep 17 00:00:00 2001
From: Jean-Marie Favreau
Date: Wed, 4 Sep 2024 11:42:31 +0200
Subject: [PATCH] =?UTF-8?q?Ajout=20(pas=20finalis=C3=A9)=20de=20l'import?=
 =?UTF-8?q?=20Cour=20des=203=20Coquins?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
Review notes (this commentary sits between the "---" marker and the diffstat,
so it is not part of the commit message and is skipped when the patch is
applied):

* Layout: the patch text was restored to one diff line per physical line and
  the Python indentation was rebuilt from the syntax. The absolute
  indentation of the unchanged context lines in the extractor.py hunk could
  not be checked against the target file; it is an assumption that must be
  confirmed before applying.
* c3c.py, build_event_url_list: a "div.fiche-info" card without an
  "a.btn.lien_savoir_plus" link (or with a link lacking "href") used to raise
  on the subscript of None; such cards are now skipped.
* c3c.py, add_event_from_content: still work in progress (dates are not
  parsed yet). The debug output referenced the undefined name "dates", which
  raised NameError on every event, and called len() on a description that is
  None when ".presentation" is missing. Both are fixed. The code after the
  bare "return" is unreachable and uses names that are not assigned yet
  (start_day, start_time, end_day, end_time); it is kept as a scaffold and
  marked as such.
* extractor.py, hours-only pattern: "([0-9]+) [Hh:.]" required a space
  between the number and the unit, so "2h" stopped matching. The space is now
  optional, in the same way as in the minutes pattern added by this patch.
* Added comments were translated to English; hunk sizes and the diffstat are
  unchanged. The blob ids on the "index" lines are those of the original
  commit.

 experimentations/get_c3c_events.py            |  43 ++++++++
 .../import_tasks/custom_extractors/c3c.py     | 100 ++++++++++++++++++
 src/agenda_culturel/import_tasks/extractor.py |  11 +-
 3 files changed, 152 insertions(+), 2 deletions(-)
 create mode 100755 experimentations/get_c3c_events.py
 create mode 100644 src/agenda_culturel/import_tasks/custom_extractors/c3c.py

diff --git a/experimentations/get_c3c_events.py b/experimentations/get_c3c_events.py
new file mode 100755
index 0000000..7e6fee4
--- /dev/null
+++ b/experimentations/get_c3c_events.py
@@ -0,0 +1,43 @@
+#!/usr/bin/python3
+# coding: utf-8
+
+import os
+import json
+import sys
+
+# getting the name of the directory
+# where this file is present.
+current = os.path.dirname(os.path.realpath(__file__))
+
+# Getting the parent directory name
+# where the current directory is present.
+parent = os.path.dirname(current)
+
+# adding the parent directory to
+# the sys.path.
+sys.path.append(parent)
+
+from src.agenda_culturel.import_tasks.downloader import *
+from src.agenda_culturel.import_tasks.extractor import *
+from src.agenda_culturel.import_tasks.importer import *
+from src.agenda_culturel.import_tasks.custom_extractors import *
+
+
+
+
+
+if __name__ == "__main__":
+
+    u2e = URL2Events(SimpleDownloader(), c3c.CExtractor())
+    url = "https://billetterie-c3c.clermont-ferrand.fr/"
+    url_human = "https://billetterie-c3c.clermont-ferrand.fr/"
+
+    try:
+        events = u2e.process(url, url_human, cache = "cache-c3c.html", default_values = {"location": "La Cour des 3 Coquins"}, published = True)
+
+        exportfile = "events-c3c.json"
+        print("Saving events to file {}".format(exportfile))
+        with open(exportfile, "w") as f:
+            json.dump(events, f, indent=4, default=str)
+    except Exception as e:
+        print("Exception: " + str(e))
diff --git a/src/agenda_culturel/import_tasks/custom_extractors/c3c.py b/src/agenda_culturel/import_tasks/custom_extractors/c3c.py
new file mode 100644
index 0000000..6f28001
--- /dev/null
+++ b/src/agenda_culturel/import_tasks/custom_extractors/c3c.py
@@ -0,0 +1,100 @@
+from ..generic_extractors import *
+from bs4 import BeautifulSoup
+
+# A class dedicated to get events from La Cour des 3 Coquins
+# URL: https://billetterie-c3c.clermont-ferrand.fr//
+class CExtractor(TwoStepsExtractor):
+    nom_lieu = "La Cour des 3 Coquins"
+
+    def category_c3c2agenda(self, category):
+        if not category:
+            return None
+        mapping = {"Théâtre": "Théâtre", "Concert": "Concert", "Projection": "Cinéma"}
+        if category in mapping:
+            return mapping[category]
+        else:
+            return None
+
+    def build_event_url_list(self, content):
+        soup = BeautifulSoup(content, "html.parser")
+
+        events = soup.select("div.fiche-info")
+
+        for e in events:
+            link = e.select_one("a.btn.lien_savoir_plus")
+            if link is not None and link.get("href"):
+                e_url = self.url + "/" + link["href"]
+                self.add_event_url(e_url)
+
+    def add_event_from_content(
+        self,
+        event_content,
+        event_url,
+        url_human=None,
+        default_values=None,
+        published=False,
+    ):
+        soup = BeautifulSoup(event_content, "html.parser")
+
+        title = soup.select_one("h1")
+        if title:
+            title = title.text
+
+        image = soup.select_one("#media .swiper-slide img")
+        if image:
+            image = image["src"]
+        else:
+            image = None
+
+        description = soup.select_one(".presentation")
+        duree = soup.select_one("#criteres .DUREE-V .valeur-critere li")
+        if duree is not None:
+            duree = self.parse_french_time(duree.text)
+
+        location = self.nom_lieu
+        tags = []
+        for t in soup.select(".sous-titre span"):
+            classes = t.get("class")
+            if classes and len(classes) > 0:
+                if classes[0].startswith("LIEU-"):
+                    location = t.text
+                elif classes[0].startswith("THEMATIQUE-"):
+                    tag = self.category_c3c2agenda(t.text)
+                    if tag is not None:
+                        tags.append(tag)
+
+        # TODO: parse the dates and retrieve the times ()
+
+
+        # Temporary debug output while the import is not finalized.
+        print("EVENT ", event_url)
+        print("- ", title)
+        print("- ", image)
+        print("- ", None if description is None else len(description))
+        print("- ", duree)
+        print("- ", location)
+        print("- ", tags)
+        # (the parsed dates will be printed here once the TODO above is done)
+
+        return
+        # NOTE: everything below is unreachable until the dates are parsed:
+        # start_day, start_time, end_day and end_time are not assigned yet.
+        url_human = event_url
+
+        self.add_event_with_props(
+            event_url,
+            None,
+            None,
+            start_day,
+            location,
+            description,
+            tags,
+            recurrences=None,
+            uuids=[event_url],
+            url_human=url_human,
+            start_time=start_time,
+            end_day=end_day,
+            end_time=end_time,
+            published=published,
+            image=image,
+        )
diff --git a/src/agenda_culturel/import_tasks/extractor.py b/src/agenda_culturel/import_tasks/extractor.py
index 21d96a1..7eb6f80 100644
--- a/src/agenda_culturel/import_tasks/extractor.py
+++ b/src/agenda_culturel/import_tasks/extractor.py
@@ -97,13 +97,20 @@ class Extractor(ABC):
                 s = "0"
             else:
                 # format heures
-                m = re.search("([0-9]+)[ Hh:.]", text)
+                m = re.search("([0-9]+)[ ]*[Hh:.]", text)
                 if m:
                     h = m.group(1)
                     m = "0"
                     s = "0"
                 else:
-                    return None
+                    # minutes-only format
+                    m = re.search("([0-9]+)[ ]*(?:mn|min|Min|Mn)", text)
+                    if m:
+                        h = "0"
+                        m = m.group(1)
+                        s = "0"
+                    else:
+                        return None
 
         try:
             h = int(h)