parent
55a0094e2f
commit
2fe2611788
44
experimentations/get_milleformes_events.py
Executable file
44
experimentations/get_milleformes_events.py
Executable file
@ -0,0 +1,44 @@
|
|||||||
|
#!/usr/bin/python3
# coding: utf-8

"""Fetch events from the Mille formes website and save them as JSON."""

import os
import json
import sys

# Make the project root and its src/ directory importable: this script
# lives in a subdirectory (experimentations/) of the project.
current = os.path.dirname(os.path.realpath(__file__))
parent = os.path.dirname(current)
sys.path.append(parent)
sys.path.append(parent + "/src")

from src.agenda_culturel.import_tasks.downloader import *
from src.agenda_culturel.import_tasks.extractor import *
from src.agenda_culturel.import_tasks.importer import *
from src.agenda_culturel.import_tasks.custom_extractors import *


if __name__ == "__main__":
    # Downloader + site-specific extractor for the Mille formes programme page.
    u2e = URL2Events(SimpleDownloader(), mille_formes.CExtractor())
    url = "https://www.milleformes.fr/programme"
    url_human = "https://www.milleformes.fr/programme"

    try:
        events = u2e.process(
            url,
            url_human,
            cache="cache-1000formes.html",
            default_values={},
            published=True,
        )

        exportfile = "events-1000formes.json"
        print("Saving events to file {}".format(exportfile))
        with open(exportfile, "w") as f:
            json.dump(events, f, indent=4, default=str)
    except Exception as e:
        # Best-effort script: report the failure and exit normally.
        print("Exception: " + str(e))
|
@ -156,6 +156,8 @@ def run_recurrent_import_internal(rimport, downloader, req_id):
|
|||||||
extractor = apidae_tourisme.CExtractor()
|
extractor = apidae_tourisme.CExtractor()
|
||||||
elif rimport.processor == RecurrentImport.PROCESSOR.IGUANA:
|
elif rimport.processor == RecurrentImport.PROCESSOR.IGUANA:
|
||||||
extractor = iguana_agenda.CExtractor()
|
extractor = iguana_agenda.CExtractor()
|
||||||
|
elif rimport.processor == RecurrentImport.PROCESSOR.MILLEFORMES:
|
||||||
|
extractor = mille_formes.CExtractor()
|
||||||
else:
|
else:
|
||||||
extractor = None
|
extractor = None
|
||||||
|
|
||||||
|
@ -0,0 +1,193 @@
|
|||||||
|
from ..generic_extractors import *
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from datetime import datetime, date
|
||||||
|
|
||||||
|
# A class dedicated to get events from Mille formes
|
||||||
|
# URL: https://www.milleformes.fr/programme
|
||||||
|
class CExtractor(TwoStepsExtractorNoPause):
    """Two-steps extractor dedicated to the Mille formes agenda.

    URL: https://www.milleformes.fr/programme
    """

    def extract(
        self,
        content,
        url,
        url_human=None,
        default_values=None,
        published=False,
        only_future=True,
        ignore_404=True):
        """Remember the site root and today's date, then delegate to the
        generic two-steps extraction."""
        self.root_address = "https://" + urlparse(url).netloc + "/"
        # Reference date used to resolve dates written without a year.
        self.today = date.today()
        return super().extract(content, url, url_human, default_values, published, only_future, ignore_404)

    def parse_category(self, cat):
        """Map a free-text category from the website to one of our categories.

        Unknown categories fall back to 'Sans catégorie'.
        """
        cat = cat.replace("\n", "").strip()
        if "exposition" in cat or "dispositif artistique interactif" in cat:
            result = 'Visites & Expositions'
        elif "atelier" in cat:
            result = 'Animations & Ateliers'
        elif cat in ["buffet"]:
            result = 'Rendez-vous locaux'
        elif "ciné" in cat:
            result = 'Cinéma'
        elif "concert" in cat:
            result = 'Fêtes & Concerts'
        elif "rencontre" in cat:
            result = 'Rencontres & Débats'
        elif "spectacle" in cat:
            result = 'Spectacles'
        else:
            result = 'Sans catégorie'

        return result

    # this method is not perfect, but dates and hours are not structured
    def parse_dates(self, date):
        """Parse a free-text French date description.

        Returns a list of entries ``[day, [hours...]]``; when the text looks
        like a "De ... à ..." hour range, a trailing ``True`` flag is appended
        to each entry.
        """
        dl = date.replace(' à ', '\n').split('\n')
        result = []

        for d in dl:
            # only lines with a digit are candidate dates/hours
            if sum(c.isdigit() for c in d) != 0:
                # split subparts ("X, Y et Z")
                for d2 in d.replace(' et ', ', ').split(', '):
                    d2 = d2.strip()
                    dd = Extractor.parse_french_date(d2, default_year_by_proximity=self.today)
                    if dd is None:
                        # not a date: assume it is an hour, and attach it to
                        # every date collected so far
                        hh = Extractor.parse_french_time(d2)
                        for i, r in enumerate(result):
                            result[i][1].append(hh)
                    else:
                        result.append([dd, []])

        if "De" in date and " à " in date:
            # mark each entry as an hour range (start/end) rather than a
            # list of independent occurrences
            for i, r in enumerate(result):
                result[i].append(True)

        return result

    def build_event_url_list(self, content, infuture_days=180):
        """Collect the per-event page URLs from the programme page."""
        soup = BeautifulSoup(content, "html.parser")
        links = soup.select('.cell a.evenement')
        for l in links:
            # hrefs are relative: prefix them with the site root
            self.add_event_url(self.root_address + l["href"])

    def _add_single_event(self, default_values, event_url, title, category,
                          start_day, location, description, uuid,
                          start_time, end_time, published, image, image_alt):
        """Register one occurrence of an event.

        The uuid encodes the date (and possibly the hour) so that several
        occurrences extracted from the same page remain distinct.
        """
        self.add_event_with_props(
            default_values,
            event_url,
            title,
            category,
            start_day,
            location,
            description,
            [],
            recurrences=None,
            uuids=[uuid],
            url_human=event_url,
            start_time=start_time,
            end_day=start_day,
            end_time=end_time,
            published=published,
            image=image,
            image_alt=image_alt
        )

    def add_event_from_content(
        self,
        event_content,
        event_url,
        url_human=None,
        default_values=None,
        published=False,
    ):
        """Parse one event page and register its occurrence(s)."""
        soup = BeautifulSoup(event_content, "html.parser")
        title = soup.select_one('h1').text.replace("\n", "").strip().title()

        image = soup.select_one('.slide img')
        if image is None:
            image_alt = ''
        else:
            image_alt = image["alt"]
            image = self.root_address + image["src"]

        soustitre = soup.select_one('.sous-titre')
        if soustitre is not None:
            soustitre = soustitre.text.strip()

        description = soup.select_one('.texte-full').text.strip()
        infos = soup.select_one('.champ .infos')
        if infos is not None:
            infos = infos.text

        location = soup.select_one('.champ .taxo.espace').text.strip()

        # NOTE(review): `age` is currently unused, but the lookup also acts
        # as a sanity check that the page has the expected structure.
        age = soup.select_one('.champ.taxo-age').text
        category = self.parse_category(soup.select_one('.champ.categorie').text)

        date = soup.select_one('.champ.date-libre').text

        description = '\n\n'.join([x for x in [soustitre, description, date, infos] if x is not None])

        # date ranges ("du ... au ...") and recurring descriptions are not
        # supported: skip those events
        if " au " in date or date.startswith("Du") or date.lower().strip() == "en continu" or date.startswith("Les"):
            return

        dates = self.parse_dates(date)

        for d in dates:
            if len(d) >= 2:
                start_day = d[0]

                if len(d) == 3 and len(d[1]) == 2:
                    # "De X à Y" entry: one occurrence with start and end hour
                    start_time = d[1][0]
                    end_time = d[1][1]
                    uuid = event_url + "?date=" + str(start_day) + "&hour=" + str(start_time)
                    self._add_single_event(
                        default_values, event_url, title, category, start_day,
                        location, description, uuid, start_time, end_time,
                        published, image, image_alt)
                else:
                    end_time = None
                    if len(d[1]) == 0:
                        # no hour at all: single all-day occurrence
                        start_time = None
                        uuid = event_url + "?date=" + str(start_day)
                        self._add_single_event(
                            default_values, event_url, title, category, start_day,
                            location, description, uuid, start_time, end_time,
                            published, image, image_alt)
                    # one occurrence per listed hour
                    for t in d[1]:
                        start_time = t
                        uuid = event_url + "?date=" + str(start_day) + "&hour=" + str(start_time)
                        self._add_single_event(
                            default_values, event_url, title, category, start_day,
                            location, description, uuid, start_time, end_time,
                            published, image, image_alt)
|
||||||
|
|
@ -54,7 +54,7 @@ class Extractor(ABC):
|
|||||||
return i + 1
|
return i + 1
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def parse_french_date(text, default_year=None):
|
def parse_french_date(text, default_year=None, default_year_by_proximity=None):
|
||||||
# format NomJour Numero Mois Année
|
# format NomJour Numero Mois Année
|
||||||
m = re.search(
|
m = re.search(
|
||||||
"[a-zA-ZéÉûÛ:.]+[ ]*([0-9]+)[er]*[ ]*([a-zA-ZéÉûÛ:.]+)[ ]*([0-9]+)", text
|
"[a-zA-ZéÉûÛ:.]+[ ]*([0-9]+)[er]*[ ]*([a-zA-ZéÉûÛ:.]+)[ ]*([0-9]+)", text
|
||||||
@ -92,13 +92,25 @@ class Extractor(ABC):
|
|||||||
return None
|
return None
|
||||||
try:
|
try:
|
||||||
day = int(day)
|
day = int(day)
|
||||||
year = int(year)
|
if not year is None:
|
||||||
|
year = int(year)
|
||||||
except:
|
except:
|
||||||
return None
|
return None
|
||||||
if year < 100:
|
|
||||||
year = 2000 + year
|
|
||||||
if day >= 32:
|
if day >= 32:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
# by proximity
|
||||||
|
if year is None and not default_year_by_proximity is None:
|
||||||
|
dates = [date(default_year_by_proximity.year + x, month, day) for x in [-1, 0, 1]]
|
||||||
|
dates = [(abs((d - default_year_by_proximity).days), d) for d in dates]
|
||||||
|
d = min(dates, key=lambda x: x[0])
|
||||||
|
return d[1]
|
||||||
|
|
||||||
|
if year is None:
|
||||||
|
return None
|
||||||
|
|
||||||
|
if year < 100:
|
||||||
|
year = 2000 + year
|
||||||
return date(year, month, day)
|
return date(year, month, day)
|
||||||
|
|
||||||
def parse_french_time(text):
|
def parse_french_time(text):
|
||||||
|
@ -0,0 +1,18 @@
|
|||||||
|
# Generated by Django 4.2.9 on 2025-02-02 14:18
|
||||||
|
|
||||||
|
from django.db import migrations, models
|
||||||
|
|
||||||
|
|
||||||
|
class Migration(migrations.Migration):
    """Extend RecurrentImport.processor choices with the Mille formes processor."""

    dependencies = [
        ('agenda_culturel', '0141_alter_recurrentimport_processor'),
    ]

    operations = [
        migrations.AlterField(
            model_name='recurrentimport',
            name='processor',
            field=models.CharField(
                choices=[
                    ('ical', 'ical'),
                    ('icalnobusy', 'ical no busy'),
                    ('icalnovc', 'ical no VC'),
                    ('lacoope', 'lacoope.org'),
                    ('lacomedie', 'la comédie'),
                    ('lefotomat', 'le fotomat'),
                    ('lapucealoreille', "la puce à l'oreille"),
                    ('Plugin wordpress MEC', 'Plugin wordpress MEC'),
                    ('Facebook events', "Événements d'une page FB"),
                    ('Billetterie CF', 'Billetterie Clermont-Ferrand'),
                    ('arachnee', 'Arachnée concert'),
                    ('rio', 'Le Rio'),
                    ('raymonde', 'La Raymonde'),
                    ('apidae', 'Agenda apidae tourisme'),
                    ('iguana', 'Agenda iguana (médiathèques)'),
                    ('Mille formes', 'Mille formes'),
                ],
                default='ical',
                max_length=20,
                verbose_name='Processor',
            ),
        ),
    ]
|
@ -2104,6 +2104,7 @@ class RecurrentImport(models.Model):
|
|||||||
LARAYMONDE = "raymonde", _('La Raymonde')
|
LARAYMONDE = "raymonde", _('La Raymonde')
|
||||||
APIDAE = 'apidae', _('Agenda apidae tourisme')
|
APIDAE = 'apidae', _('Agenda apidae tourisme')
|
||||||
IGUANA = 'iguana', _('Agenda iguana (médiathèques)')
|
IGUANA = 'iguana', _('Agenda iguana (médiathèques)')
|
||||||
|
MILLEFORMES = 'Mille formes', _('Mille formes')
|
||||||
|
|
||||||
class DOWNLOADER(models.TextChoices):
|
class DOWNLOADER(models.TextChoices):
|
||||||
SIMPLE = "simple", _("simple")
|
SIMPLE = "simple", _("simple")
|
||||||
|
Loading…
x
Reference in New Issue
Block a user