From 9f0a1a33cf88240f2447cfd3b6e923dd363df222 Mon Sep 17 00:00:00 2001 From: Jean-Marie Favreau Date: Sat, 19 Oct 2024 15:36:50 +0200 Subject: [PATCH] =?UTF-8?q?Ajout=20d'un=20extracteur=20pour=20Arachn=C3=A9?= =?UTF-8?q?e=20Concerts?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- experimentations/get_arachnee_events.py | 40 +++++++ src/agenda_culturel/celery.py | 2 + .../custom_extractors/arachnee.py | 109 ++++++++++++++++++ .../import_tasks/generic_extractors.py | 20 ++++ .../0090_alter_recurrentimport_processor.py | 18 +++ src/agenda_culturel/models.py | 1 + 6 files changed, 190 insertions(+) create mode 100755 experimentations/get_arachnee_events.py create mode 100644 src/agenda_culturel/import_tasks/custom_extractors/arachnee.py create mode 100644 src/agenda_culturel/migrations/0090_alter_recurrentimport_processor.py diff --git a/experimentations/get_arachnee_events.py b/experimentations/get_arachnee_events.py new file mode 100755 index 0000000..6c10908 --- /dev/null +++ b/experimentations/get_arachnee_events.py @@ -0,0 +1,40 @@ +#!/usr/bin/python3 +# coding: utf-8 + +import os +import json +import sys + +# getting the name of the directory +# where this file is present. +current = os.path.dirname(os.path.realpath(__file__)) + +# Getting the parent directory name +# where the current directory is present. +parent = os.path.dirname(current) + +# adding the parent directory to +# the sys.path. 
if __name__ == "__main__":
    # Manual experiment: run the Arachnée Concerts extractor on the
    # Clermont-Fd agenda and dump whatever it returns to a JSON file.
    u2e = URL2Events(ChromiumHeadlessDownloader(), arachnee.CExtractor())

    # Listing endpoint queried by the extractor. The last two parameters are
    # `list_all_events` and `current_page`; the separator before the latter
    # must be a literal "&" (an HTML-entity decoding of "&curren" would turn
    # "&current_page=" into "¤t_page=" and drop the parameter).
    url = (
        "https://www.arachnee-concerts.com/wp-admin/admin-ajax.php"
        "?action=movies-filter&per_page=9999&date=NaN.NaN.NaN"
        "&theatres=Clermont-Fd&cat=&sorting=&list_all_events=&current_page="
    )
    # Page shown to a human visitor for the same listing.
    url_human = "https://www.arachnee-concerts.com/agenda-des-concerts/Clermont-Fd/"

    try:
        events = u2e.process(
            url,
            url_human,
            cache="cache-arachnee.html",
            default_values={},
            published=True,
        )

        exportfile = "events-arachnee.json"
        print("Saving events to file {}".format(exportfile))
        with open(exportfile, "w") as f:
            # default=str: fall back to str() for values json cannot encode.
            json.dump(events, f, indent=4, default=str)
    except Exception as e:
        # Deliberately best-effort: this is a throw-away experiment script,
        # so report the failure instead of showing a traceback.
        print("Exception: " + str(e))
def build_event_url_list(self, content, infuture_days=180):
    """Register the event pages found in the agenda listing.

    Parses ``content`` as HTML and walks every ``ul.event_container>li``
    entry. An entry is kept when its ``.event_auditory`` text starts with
    ``self.theater`` (no venue filter when ``self.theater`` is ``None``) and
    its date is earlier than today + ``infuture_days``. For each kept entry
    the ``"<date> <time>"`` string is appended to
    ``self.possible_dates[<event url>]`` and the URL is passed to
    ``self.add_event_url``.

    Entries lacking one of the expected nodes, or whose date could not be
    parsed, are skipped instead of aborting the whole listing.

    Args:
        content: HTML of the listing.
        infuture_days: size, in days, of the accepted time window.
    """
    soup = BeautifulSoup(content, "html.parser")

    # Upper bound of the accepted window; constant for the whole listing.
    limit = datetime.date.today() + timedelta(days=infuture_days)

    for entry in soup.select("ul.event_container>li"):
        date_node = entry.select_one(".date")
        venue_node = entry.select_one(".event_auditory")
        time_node = entry.select_one(".time")
        link = entry.select_one(".info a")

        # select_one() returns None when nothing matches: ignore entries
        # that do not have the expected structure.
        if date_node is None or venue_node is None or time_node is None:
            continue
        if link is None or not link.get("href"):
            continue

        venue = venue_node.text
        if self.theater is not None and not venue.startswith(self.theater):
            continue

        day = Extractor.parse_french_date(date_node.text)
        # NOTE(review): assumes parse_french_date yields None when the text
        # is not recognised -- confirm against Extractor.
        if day is None or not day < limit:
            continue

        start = Extractor.parse_french_time(time_node.text)
        e_url = link["href"]

        # One page may be listed several times (one entry per date): keep
        # every listed "date time" so the page parser can filter on them.
        self.possible_dates.setdefault(e_url, []).append(str(day) + " " + str(start))
        # NOTE(review): called once per listed date, as before; whether
        # add_event_url de-duplicates is not visible from here.
        self.add_event_url(e_url)
class TwoStepsExtractorNoPause(TwoStepsExtractor):
    """TwoStepsExtractor that runs with the downloader's ``pause`` flag off.

    ``extract`` sets ``self.downloader.pause`` to ``False`` for the duration
    of the parent extraction and then puts the previous value back.
    """

    def extract(
        self,
        content,
        url,
        url_human=None,
        default_values=None,
        published=False,
        only_future=True,
        ignore_404=True
    ):
        """Run the parent ``extract`` with ``self.downloader.pause`` disabled.

        Takes the same arguments as ``TwoStepsExtractor.extract`` and returns
        its result unchanged. The previous ``pause`` value is restored even
        when the parent extraction raises, so a failed import does not leave
        the downloader in no-pause mode.
        """
        previous_pause = self.downloader.pause
        self.downloader.pause = False
        try:
            return super().extract(
                content,
                url,
                url_human,
                default_values,
                published,
                only_future,
                ignore_404,
            )
        finally:
            # Always restore the flag, whether the extraction succeeded or not.
            self.downloader.pause = previous_pause
class Migration(migrations.Migration):
    """Alter the ``processor`` field of the ``recurrentimport`` model.

    The field is redeclared with its complete list of choices, which ends
    with the ``('arachnee', 'Arachnée concert')`` entry.
    """

    # Applied after migration 0089 of the same app.
    dependencies = [
        ('agenda_culturel', '0089_alter_recurrentimport_defaultcategory'),
    ]

    operations = [
        # Only the field definition is replaced: a CharField with
        # max_length=20, defaulting to 'ical'.
        migrations.AlterField(
            model_name='recurrentimport',
            name='processor',
            field=models.CharField(choices=[('ical', 'ical'), ('icalnobusy', 'ical no busy'), ('icalnovc', 'ical no VC'), ('lacoope', 'lacoope.org'), ('lacomedie', 'la comédie'), ('lefotomat', 'le fotomat'), ('lapucealoreille', "la puce à l'oreille"), ('Plugin wordpress MEC', 'Plugin wordpress MEC'), ('Facebook events', "Événements d'une page FB"), ('cour3coquins', 'la cour des 3 coquins'), ('arachnee', 'Arachnée concert')], default='ical', max_length=20, verbose_name='Processor'),
        ),
    ]