From 8547a671c4476cc62e6eb2fe2f4c61270f80a299 Mon Sep 17 00:00:00 2001
From: Jean-Marie Favreau
Date: Sun, 2 Jun 2024 14:29:05 +0200
Subject: [PATCH] Support des imports plugin MEC de Wordpress

---
Review notes: line breaks and indentation were rebuilt from a
whitespace-collapsed copy of this patch, so verify the context lines against
the tree (or apply with --ignore-whitespace). The blob ids on the "index"
lines predate the review edits (debug prints removed, None-safe lookup of
default_values, comment typos).

 experimentations/get_le_poulailler.py         | 43 ++++++++
 experimentations/get_les_vinzelles.py         | 43 ++++++++
 src/agenda_culturel/celery.py                 |  2 +
 .../custom_extractors/wordpress_mec.py        | 98 +++++++++++++++++++
 .../import_tasks/generic_extractors.py        | 20 +++-
 .../0064_alter_recurrentimport_processor.py   | 18 ++++
 src/agenda_culturel/models.py                 |  1 +
 7 files changed, 220 insertions(+), 5 deletions(-)
 create mode 100755 experimentations/get_le_poulailler.py
 create mode 100755 experimentations/get_les_vinzelles.py
 create mode 100644 src/agenda_culturel/import_tasks/custom_extractors/wordpress_mec.py
 create mode 100644 src/agenda_culturel/migrations/0064_alter_recurrentimport_processor.py

diff --git a/experimentations/get_le_poulailler.py b/experimentations/get_le_poulailler.py
new file mode 100755
index 0000000..e7fd697
--- /dev/null
+++ b/experimentations/get_le_poulailler.py
@@ -0,0 +1,43 @@
+#!/usr/bin/python3
+# coding: utf-8
+
+import os
+import json
+import sys
+
+# getting the name of the directory
+# where this file is present.
+current = os.path.dirname(os.path.realpath(__file__))
+
+# Getting the parent directory name
+# where the current directory is present.
+parent = os.path.dirname(current)
+
+# adding the parent directory to
+# the sys.path.
+sys.path.append(parent)
+
+from src.agenda_culturel.import_tasks.downloader import *
+from src.agenda_culturel.import_tasks.extractor import *
+from src.agenda_culturel.import_tasks.importer import *
+from src.agenda_culturel.import_tasks.custom_extractors import *
+
+
+
+
+
+if __name__ == "__main__":
+
+    u2e = URL2Events(SimpleDownloader(), wordpress_mec.CExtractor())
+    url = "https://www.cabaretlepoulailler.fr/agenda/tout-lagenda/"
+    url_human = "https://www.cabaretlepoulailler.fr/agenda/tout-lagenda/"
+
+    try:
+        events = u2e.process(url, url_human, cache = "cache-le-poulailler.html", default_values = {"location": "Le Poulailler"}, published = True)
+
+        exportfile = "events-le-poulailler.json"
+        print("Saving events to file {}".format(exportfile))
+        with open(exportfile, "w") as f:
+            json.dump(events, f, indent=4, default=str)
+    except Exception as e:
+        print("Exception: " + str(e))
diff --git a/experimentations/get_les_vinzelles.py b/experimentations/get_les_vinzelles.py
new file mode 100755
index 0000000..f7534e8
--- /dev/null
+++ b/experimentations/get_les_vinzelles.py
@@ -0,0 +1,43 @@
+#!/usr/bin/python3
+# coding: utf-8
+
+import os
+import json
+import sys
+
+# getting the name of the directory
+# where this file is present.
+current = os.path.dirname(os.path.realpath(__file__))
+
+# Getting the parent directory name
+# where the current directory is present.
+parent = os.path.dirname(current)
+
+# adding the parent directory to
+# the sys.path.
+sys.path.append(parent)
+
+from src.agenda_culturel.import_tasks.downloader import *
+from src.agenda_culturel.import_tasks.extractor import *
+from src.agenda_culturel.import_tasks.importer import *
+from src.agenda_culturel.import_tasks.custom_extractors import *
+
+
+
+
+
+if __name__ == "__main__":
+
+    u2e = URL2Events(SimpleDownloader(), wordpress_mec.CExtractor())
+    url = "https://www.lesvinzelles.com/index.php/programme/"
+    url_human = "https://www.lesvinzelles.com/index.php/programme/"
+
+    try:
+        events = u2e.process(url, url_human, cache = "cache-les-vinzelles.html", default_values = {"location": "Les Vinzelles"}, published = True)
+
+        exportfile = "events-les-vinzelles.json"
+        print("Saving events to file {}".format(exportfile))
+        with open(exportfile, "w") as f:
+            json.dump(events, f, indent=4, default=str)
+    except Exception as e:
+        print("Exception: " + str(e))
diff --git a/src/agenda_culturel/celery.py b/src/agenda_culturel/celery.py
index eec6dfa..09c0b8e 100644
--- a/src/agenda_culturel/celery.py
+++ b/src/agenda_culturel/celery.py
@@ -112,6 +112,8 @@ def run_recurrent_import(self, pk):
         extractor = lefotomat.CExtractor()
     elif rimport.processor == RecurrentImport.PROCESSOR.LAPUCEALOREILLE:
         extractor = lapucealoreille.CExtractor()
+    elif rimport.processor == RecurrentImport.PROCESSOR.MECWORDPRESS:
+        extractor = wordpress_mec.CExtractor()
     else:
         extractor = None
 
diff --git a/src/agenda_culturel/import_tasks/custom_extractors/wordpress_mec.py b/src/agenda_culturel/import_tasks/custom_extractors/wordpress_mec.py
new file mode 100644
index 0000000..e102cea
--- /dev/null
+++ b/src/agenda_culturel/import_tasks/custom_extractors/wordpress_mec.py
@@ -0,0 +1,98 @@
+from ..generic_extractors import *
+from bs4 import BeautifulSoup
+
+
+# A class dedicated to get events from MEC Wordpress plugin
+# URL: https://webnus.net/modern-events-calendar/
+class CExtractor(TwoStepsExtractor):
+
+    def local2agendaCategory(self, category):
+        """Map a MEC category label to an agenda category (None if unknown)."""
+        mapping = {
+            "Musique": "Concert",
+            "CONCERT": "Concert",
+            "VISITE": "Autre",
+            "Spectacle": "Théâtre",
+            "Rencontre": "Autre",
+            "Atelier": "Autre",
+            "Projection": "Autre",
+        }
+        return mapping.get(category)
+
+    def build_event_url_list(self, content):
+        """Collect event URLs, titles and categories from a listing page."""
+        soup = BeautifulSoup(content, "xml")
+
+        events = soup.select("div.mec-tile-event-content")
+        for e in events:
+            link = e.select("h4.mec-event-title a")
+            if len(link) == 1:
+                url = link[0]["href"]
+                title = link[0].get_text()
+
+                if self.add_event_url(url):
+                    self.add_event_title(url, title)
+
+                    categories = e.select(".mec-label-normal")
+                    if not categories:
+                        categories = e.select(".mec-category")
+                    if categories:
+                        category = self.local2agendaCategory(categories[0].get_text())
+                        if category is not None:
+                            self.add_event_category(url, category)
+
+    def add_event_from_content(
+        self,
+        event_content,
+        event_url,
+        url_human=None,
+        default_values=None,
+        published=False,
+    ):
+        """Parse one event page and register the event it describes."""
+        soup = BeautifulSoup(event_content, "xml")
+
+        start_day = soup.select(".mec-start-date-label")
+        if start_day:
+            start_day = self.parse_french_date(start_day[0].get_text())
+        else:
+            start_day = None
+        t = soup.select(".mec-single-event-time .mec-events-abbr")
+        if t:
+            t = t[0].get_text().split("-")
+            start_time = self.parse_french_time(t[0])
+            if len(t) > 1:
+                end_time = self.parse_french_time(t[1])
+            else:
+                end_time = None
+        else:
+            start_time = None
+            end_time = None
+
+        image = soup.select(".mec-events-event-image img")
+        if image:
+            image = image[0]["src"]
+        else:
+            image = None
+        description = soup.select(".mec-event-content")[0].get_text()
+
+        url_human = event_url
+
+        self.add_event_with_props(
+            event_url,
+            None,
+            None,
+            start_day,
+            # default_values defaults to None, so guard before the lookup
+            (default_values or {}).get("location"),
+            description,
+            None,
+            recurrences=None,
+            uuids=[event_url],
+            url_human=url_human,
+            start_time=start_time,
+            end_day=None,
+            end_time=end_time,
+            published=published,
+            image=image,
+        )
diff --git a/src/agenda_culturel/import_tasks/generic_extractors.py b/src/agenda_culturel/import_tasks/generic_extractors.py
index d212624..0ba6fc8 100644
--- a/src/agenda_culturel/import_tasks/generic_extractors.py
+++ b/src/agenda_culturel/import_tasks/generic_extractors.py
@@ -2,6 +2,10 @@ from abc import abstractmethod
 from urllib.parse import urlparse
 from urllib.parse import parse_qs
 
+import logging
+
+logger = logging.getLogger(__name__)
+
 from .extractor import *
 from django.utils.translation import gettext_lazy as _
 
@@ -171,6 +175,7 @@ class TwoStepsExtractor(Extractor):
         default_values=None,
         published=False,
         only_future=True,
+        ignore_404=True,
     ):
         self.only_future = only_future
         self.now = datetime.datetime.now().date()
@@ -195,10 +200,15 @@ class TwoStepsExtractor(Extractor):
             # first download the content associated with this link
             content_event = self.downloader.get_content(event_url)
             if content_event is None:
-                raise Exception(_("Cannot extract event from url {}").format(event_url))
-            # then extract event information from this html document
-            self.add_event_from_content(
-                content_event, event_url, url_human, default_values, published
-            )
+                msg = "Cannot extract event from url {}".format(event_url)
+                if ignore_404:
+                    logger.error(msg)
+                else:
+                    raise Exception(msg)
+            else:
+                # then extract event information from this html document
+                self.add_event_from_content(
+                    content_event, event_url, url_human, default_values, published
+                )
 
         return self.get_structure()
diff --git a/src/agenda_culturel/migrations/0064_alter_recurrentimport_processor.py b/src/agenda_culturel/migrations/0064_alter_recurrentimport_processor.py
new file mode 100644
index 0000000..33ac0a4
--- /dev/null
+++ b/src/agenda_culturel/migrations/0064_alter_recurrentimport_processor.py
@@ -0,0 +1,18 @@
+# Generated by Django 4.2.7 on 2024-06-02 12:01
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('agenda_culturel', '0063_alter_event_exact_location'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='recurrentimport',
+            name='processor',
+            field=models.CharField(choices=[('ical', 'ical'), ('icalnobusy', 'ical no busy'), ('icalnovc', 'ical no VC'), ('lacoope', 'lacoope.org'), ('lacomedie', 'la comédie'), ('lefotomat', 'le fotomat'), ('lapucealoreille', 'la puce à loreille'), ('Plugin wordpress MEC', 'Plugin wordpress MEC')], default='ical', max_length=20, verbose_name='Processor'),
+        ),
+    ]
diff --git a/src/agenda_culturel/models.py b/src/agenda_culturel/models.py
index b92f9b6..771f42f 100644
--- a/src/agenda_culturel/models.py
+++ b/src/agenda_culturel/models.py
@@ -1194,6 +1194,7 @@ class RecurrentImport(models.Model):
         LACOMEDIE = "lacomedie", _("la comédie")
         LEFOTOMAT = "lefotomat", _("le fotomat")
         LAPUCEALOREILLE = "lapucealoreille", _("la puce à l" "oreille")
+        MECWORDPRESS = "Plugin wordpress MEC", _("Plugin wordpress MEC")
 
     class DOWNLOADER(models.TextChoices):
         SIMPLE = "simple", _("simple")