Ajout d'un extracteur pour Arachnée Concerts
This commit is contained in:
		
							
								
								
									
										40
									
								
								experimentations/get_arachnee_events.py
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										40
									
								
								experimentations/get_arachnee_events.py
									
									
									
									
									
										Executable file
									
								
							@@ -0,0 +1,40 @@
 | 
			
		||||
#!/usr/bin/python3
# coding: utf-8

"""Fetch events from Arachnée Concerts (Clermont-Fd) and dump them to JSON.

Standalone experimentation script: it adds the project root to sys.path,
scrapes the Arachnée agenda with a headless Chromium downloader, and
writes the extracted events to events-arachnee.json.
"""

import os
import json
import sys

# Directory where this file is located.
current = os.path.dirname(os.path.realpath(__file__))

# Parent directory (project root), added to sys.path so the project's
# import_tasks modules can be imported from this experimentation script.
parent = os.path.dirname(current)
sys.path.append(parent)

from src.agenda_culturel.import_tasks.downloader import *
from src.agenda_culturel.import_tasks.extractor import *
from src.agenda_culturel.import_tasks.importer import *
from src.agenda_culturel.import_tasks.custom_extractors import *


if __name__ == "__main__":

    u2e = URL2Events(ChromiumHeadlessDownloader(), arachnee.CExtractor())
    # NOTE(review): the scraped source showed "¤t_page=" — the literal
    # "&curren" of "&current_page=" was mangled by HTML-entity decoding;
    # restored here to the intended query parameter.
    url = "https://www.arachnee-concerts.com/wp-admin/admin-ajax.php?action=movies-filter&per_page=9999&date=NaN.NaN.NaN&theatres=Clermont-Fd&cat=&sorting=&list_all_events=&current_page="
    url_human = "https://www.arachnee-concerts.com/agenda-des-concerts/Clermont-Fd/"

    try:
        events = u2e.process(
            url,
            url_human,
            cache="cache-arachnee.html",
            default_values={},
            published=True,
        )

        exportfile = "events-arachnee.json"
        print("Saving events to file {}".format(exportfile))
        with open(exportfile, "w") as f:
            json.dump(events, f, indent=4, default=str)
    except Exception as e:
        # Best-effort script: report the failure instead of crashing.
        print("Exception: " + str(e))
 | 
			
		||||
@@ -145,6 +145,8 @@ def run_recurrent_import_internal(rimport, downloader, req_id):
 | 
			
		||||
        extractor = fbevents.CExtractor()
 | 
			
		||||
    elif rimport.processor == RecurrentImport.PROCESSOR.C3C:
 | 
			
		||||
        extractor = c3c.CExtractor()
 | 
			
		||||
    elif rimport.processor == RecurrentImport.PROCESSOR.ARACHNEE:
 | 
			
		||||
        extractor = arachnee.CExtractor()
 | 
			
		||||
    else:
 | 
			
		||||
        extractor = None
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										109
									
								
								src/agenda_culturel/import_tasks/custom_extractors/arachnee.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										109
									
								
								src/agenda_culturel/import_tasks/custom_extractors/arachnee.py
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,109 @@
 | 
			
		||||
from ..generic_extractors import *
 | 
			
		||||
from bs4 import BeautifulSoup
 | 
			
		||||
 | 
			
		||||
# A class dedicated to get events from Arachnée Concerts
# URL: https://www.arachnee-concerts.com/agenda-des-concerts/
class CExtractor(TwoStepsExtractorNoPause):
    """Two-steps extractor for Arachnée Concerts.

    First step collects event URLs (and their announced dates) from the
    agenda listing; second step extracts each event's details from its
    own page.
    """

    # Mapping from the website's category labels to a pair
    # (agenda category, extra tags). Unknown labels fall back to ("", []).
    CATEGORY_MAPPING = {
        "Grand Spectacle": ("Danse", []),
        "Théâtre": ("Théâtre", []),
        "Chanson française": ("Concert", []),
        "Musique du monde": ("Concert", []),
        "Pop / Rock": ("Concert", []),
        "Rap, RnB": ("Concert", []),
        "Raggae": ("Concert", []),  # sic: spelling as used by the website
        "Variété": ("Concert", []),
        "Comédie Musicale": ("Art du spectacle", []),
        "Humour / One Man Show": ("Art du spectacle", []),
        "Spectacle équestre": ("Art du spectacle", []),
        "Spectacle pour enfant": (None, ["jeune public"]),
    }

    def __init__(self):
        super().__init__()
        # Maps an event URL to the list of "date time" strings seen for
        # it on the agenda listing; used to filter the dates offered on
        # the event's own page.
        self.possible_dates = {}
        # Theater name taken from the "theatres" query parameter of the
        # source URL; None means "keep every theater".
        self.theater = None

    def extract(
        self,
        content,
        url,
        url_human=None,
        default_values=None,
        published=False,
        only_future=True,
        ignore_404=True
    ):
        """Extract events, remembering the theater filter from the URL."""
        match = re.match(r".*\&theatres=([^&]*)&.*", url)
        if match:
            self.theater = match[1]

        return super().extract(content, url, url_human, default_values, published, only_future, ignore_404)

    def build_event_url_list(self, content, infuture_days=180):
        """Collect event page URLs and their dates from the agenda listing.

        Only events at the selected theater happening within the next
        *infuture_days* days are kept.
        """
        soup = BeautifulSoup(content, "html.parser")

        containers = soup.select("ul.event_container>li")
        if containers:
            for c in containers:
                d = Extractor.parse_french_date(c.select_one(".date").text)
                l = c.select_one(".event_auditory").text
                if (self.theater is None or l.startswith(self.theater)) and d < datetime.date.today() + timedelta(days=infuture_days):
                    t = Extractor.parse_french_time(c.select_one(".time").text)
                    e_url = c.select_one(".info a")["href"]
                    # Record this date as a valid occurrence for the event.
                    self.possible_dates.setdefault(e_url, []).append(str(d) + " " + str(t))
                    self.add_event_url(e_url)

    def add_event_from_content(
        self,
        event_content,
        event_url,
        url_human=None,
        default_values=None,
        published=False,
    ):
        """Parse one event page and register one event per retained date."""
        soup = BeautifulSoup(event_content, "html.parser")
        # Title is the page title plus the artist subtitle when present.
        title = ", ".join([x.text for x in [soup.select_one(y) for y in [".page_title", ".artiste-subtitle"]] if x])

        image = soup.select_one(".entry-image .image_wrapper img")
        if image is not None:
            image = image["src"]

        descs = soup.select(".entry-content p")
        if descs:
            description = "\n".join([d.text for d in descs])
        else:
            description = None

        raw_category = soup.select_one(".event_category").text
        category, tags = self.CATEGORY_MAPPING.get(raw_category, ("", []))
        # Copy so the class-level mapping is never mutated downstream.
        tags = list(tags)

        dates = soup.select("#event_ticket_content>ul>li")
        for d in dates:
            dt = datetime.datetime.fromisoformat(d.select_one(".date")["content"])
            date = dt.date()
            time = dt.time()
            # Keep only the occurrences that were listed on the agenda page.
            if str(date) + " " + str(time) in self.possible_dates.get(event_url, []):
                location = d.select_one(".event_auditory").text

                self.add_event_with_props(
                    default_values,
                    event_url,
                    title,
                    category,
                    date,
                    location,
                    description,
                    tags,
                    recurrences=None,
                    uuids=[event_url + "?d=" + str(date) + "&t=" + str(time)],
                    url_human=url_human,
                    start_time=time,
                    end_day=None,
                    end_time=None,
                    published=published,
                    image=image,
                )
 | 
			
		||||
@@ -250,3 +250,23 @@ class TwoStepsExtractor(Extractor):
 | 
			
		||||
                )
 | 
			
		||||
 | 
			
		||||
        return self.get_structure()
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class TwoStepsExtractorNoPause(TwoStepsExtractor):
    """A TwoStepsExtractor that disables the downloader's pause between
    requests for the duration of the extraction, then restores it."""

    def extract(
        self,
        content,
        url,
        url_human=None,
        default_values=None,
        published=False,
        only_future=True,
        ignore_404=True
    ):
        # Remember the downloader's pause setting and restore it in a
        # finally block, so it is not left disabled if extraction raises.
        pause = self.downloader.pause
        self.downloader.pause = False
        try:
            return super().extract(
                content,
                url,
                url_human,
                default_values,
                published,
                only_future,
                ignore_404,
            )
        finally:
            self.downloader.pause = pause
 | 
			
		||||
@@ -0,0 +1,18 @@
 | 
			
		||||
# Generated by Django 4.2.9 on 2024-10-19 13:24

from django.db import migrations, models


class Migration(migrations.Migration):
    # Adds the new 'arachnee' ("Arachnée concert") choice to the
    # RecurrentImport.processor field; no schema change beyond the
    # updated choices list.

    dependencies = [
        ('agenda_culturel', '0089_alter_recurrentimport_defaultcategory'),
    ]

    operations = [
        migrations.AlterField(
            model_name='recurrentimport',
            name='processor',
            field=models.CharField(choices=[('ical', 'ical'), ('icalnobusy', 'ical no busy'), ('icalnovc', 'ical no VC'), ('lacoope', 'lacoope.org'), ('lacomedie', 'la comédie'), ('lefotomat', 'le fotomat'), ('lapucealoreille', "la puce à l'oreille"), ('Plugin wordpress MEC', 'Plugin wordpress MEC'), ('Facebook events', "Événements d'une page FB"), ('cour3coquins', 'la cour des 3 coquins'), ('arachnee', 'Arachnée concert')], default='ical', max_length=20, verbose_name='Processor'),
        ),
    ]
 | 
			
		||||
@@ -1307,6 +1307,7 @@ class RecurrentImport(models.Model):
 | 
			
		||||
        MECWORDPRESS = "Plugin wordpress MEC", _("Plugin wordpress MEC")
 | 
			
		||||
        FBEVENTS = "Facebook events", _("Événements d'une page FB")
 | 
			
		||||
        C3C = "cour3coquins", _("la cour des 3 coquins")
 | 
			
		||||
        ARACHNEE = "arachnee", _("Arachnée concert")
 | 
			
		||||
 | 
			
		||||
    class DOWNLOADER(models.TextChoices):
 | 
			
		||||
        SIMPLE = "simple", _("simple")
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user