parent
023a7fee8d
commit
2f146172da
44
experimentations/get_amisdutempsdescerises.py
Executable file
44
experimentations/get_amisdutempsdescerises.py
Executable file
@ -0,0 +1,44 @@
|
||||
#!/usr/bin/python3
# coding: utf-8

"""Fetch events from https://amisdutempsdescerises.org/ and dump them to JSON.

Standalone experimentation script: it wires a SimpleDownloader to the
dedicated "Amis du Temps des Cerises" extractor and writes the harvested
events to a local JSON file.
"""

import json
import os
import sys

# Make the project importable when this script is launched directly from
# the experimentations/ directory: add the repository root and its src/
# subdirectory to sys.path.
current = os.path.dirname(os.path.realpath(__file__))
parent = os.path.dirname(current)
sys.path.append(parent)
sys.path.append(parent + "/src")

from src.agenda_culturel.import_tasks.downloader import *
from src.agenda_culturel.import_tasks.extractor import *
from src.agenda_culturel.import_tasks.importer import *
from src.agenda_culturel.import_tasks.custom_extractors import *


if __name__ == "__main__":
    u2e = URL2Events(SimpleDownloader(), amisdutempsdescerises.CExtractor())
    url = "https://amisdutempsdescerises.org/page.php"
    url_human = "https://amisdutempsdescerises.org/"

    try:
        events = u2e.process(
            url,
            url_human,
            cache="cache-amiscerices.xml",
            default_values={"category": "Rencontres & Débats"},
            published=True,
        )

        exportfile = "events-amiscerices.json"
        print("Saving events to file {}".format(exportfile))
        with open(exportfile, "w") as f:
            # default=str lets non-JSON-native values (dates) serialize.
            json.dump(events, f, indent=4, default=str)
    except Exception as e:
        # Best-effort script: report the failure instead of a traceback.
        print("Exception: " + str(e))
|
@ -158,6 +158,8 @@ def run_recurrent_import_internal(rimport, downloader, req_id):
|
||||
extractor = iguana_agenda.CExtractor()
|
||||
elif rimport.processor == RecurrentImport.PROCESSOR.MILLEFORMES:
|
||||
extractor = mille_formes.CExtractor()
|
||||
elif rimport.processor == RecurrentImport.PROCESSOR.AMISCERISES:
|
||||
extractor = amisdutempsdescerises.CExtractor()
|
||||
else:
|
||||
extractor = None
|
||||
|
||||
|
@ -0,0 +1,72 @@
|
||||
from ..extractor import *
import json
from bs4 import BeautifulSoup
from urllib.parse import urlparse, unquote
import pytz
import html


# Extractor dedicated to "Les Amis du Temps des Cerises"
# (https://amisdutempsdescerises.org/), whose agenda endpoint answers a
# multipart/form-data POST with a JSON list of events.
class CExtractor(Extractor):

    def __init__(self):
        super().__init__()
        # POST body and content type the downloader must send to request
        # the upcoming ("futur") events from page.php.
        self.data = b'------toto\r\nContent-Disposition: form-data; name="p"\r\n\r\nfutur\r\n------toto--\r\n'
        self.content_type = 'multipart/form-data; boundary=----toto'

    def extract(
        self, content, url, url_human=None, default_values=None, published=False
    ):
        """Parse the JSON event list in *content* and register each event."""
        self.set_header(url)
        self.clear_events()

        site_root = "https://" + urlparse(url).netloc + "/"
        image_prefix = site_root + "images/"
        tz_src = pytz.utc
        tz_paris = pytz.timezone("Europe/Paris")

        for entry in json.loads(content):
            tags = []
            start_day = entry["ev_date"].split(' ')[0]
            start_time = entry["ev_time"]
            title = html.unescape(entry["ev_titre"])
            subtitle = entry.get("ev_sstitre", '')
            if subtitle != '':
                title = title + ' - ' + html.unescape(subtitle)

            # ev_info is HTML; keep only its visible text.
            description = BeautifulSoup(entry["ev_info"], "html.parser").text
            location = entry["li_nom"] if "li_nom" in entry else None
            if entry.get("ev_canceled", '0') != '0':
                tags += ["annulé"]

            image = None
            if entry.get("ev_img", '') != '':
                image = image_prefix + entry["ev_img"]

            # Build a per-event anchor URL from the event's timestamp
            # (milliseconds), converting the raw date into Paris time.
            naive_dt = datetime.strptime(entry["ev_date"], "%Y-%m-%d %H:%M:%S")
            paris_dt = tz_paris.normalize(tz_src.localize(naive_dt))
            millis = int(datetime.timestamp(paris_dt)) * 1000
            event_url = site_root + "#" + str(millis)

            self.add_event(
                default_values,
                title,
                None,
                start_day,
                location,
                description,
                tags,
                uuids=[event_url],
                recurrences=None,
                url_human=event_url,
                start_time=start_time,
                published=published,
                image=image,
            )

        return self.get_structure()
|
@ -17,13 +17,13 @@ class Downloader(ABC):
|
||||
def download(self, url, post=None):
|
||||
pass
|
||||
|
||||
def get_content(self, url, cache=None, referer=None, post=None):
|
||||
def get_content(self, url, cache=None, referer=None, post=None, content_type=None, data=None):
|
||||
if cache and os.path.exists(cache):
|
||||
print("Loading cache ({})".format(cache))
|
||||
with open(cache) as f:
|
||||
content = "\n".join(f.readlines())
|
||||
else:
|
||||
content = self.download(url, referer=referer, post=post)
|
||||
content = self.download(url, referer=referer, post=post, content_type=content_type, data=data)
|
||||
|
||||
if cache:
|
||||
print("Saving cache ({})".format(cache))
|
||||
@ -39,7 +39,7 @@ class SimpleDownloader(Downloader):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
|
||||
def download(self, url, referer=None, post=None):
|
||||
def download(self, url, referer=None, post=None, content_type=None, data=None):
|
||||
print("Downloading {} referer: {} post: {}".format(url, referer, post))
|
||||
try:
|
||||
headers = {
|
||||
@ -47,7 +47,9 @@ class SimpleDownloader(Downloader):
|
||||
}
|
||||
if referer is not None:
|
||||
headers["Referer"] = referer
|
||||
req = Request(url, headers=headers)
|
||||
if content_type is not None:
|
||||
headers["Content-Type"] = content_type
|
||||
req = Request(url, headers=headers, data=data)
|
||||
if post:
|
||||
post_args = urlencode(post).encode("utf-8")
|
||||
resource = urllib.request.urlopen(req, post_args)
|
||||
@ -109,9 +111,15 @@ class ChromiumHeadlessDownloader(Downloader):
|
||||
|
||||
return True
|
||||
|
||||
def download(self, url, referer=None, post=None):
|
||||
def download(self, url, referer=None, post=None, content_type=None, data=None):
|
||||
if post:
|
||||
raise Exception("POST method with Chromium headless not yet implemented")
|
||||
if referer:
|
||||
raise Exception("Referer parameter with Chromium headless not yet implemented")
|
||||
if data:
|
||||
raise Exception("Data content with Chromium headless not yet implemented")
|
||||
if content_type:
|
||||
raise Exception("Content-type parameter with Chromium headless not yet implemented")
|
||||
print("Download {}".format(url))
|
||||
|
||||
try:
|
||||
|
@ -19,7 +19,11 @@ class Extractor(ABC):
|
||||
self.events = []
|
||||
self.downloader = None
|
||||
self.has_2nd_method = False
|
||||
|
||||
# parameters used by the downloader to get the content
|
||||
self.referer = ""
|
||||
self.data = None
|
||||
self.content_type = None
|
||||
|
||||
def prepare_2nd_extract(self):
|
||||
pass
|
||||
@ -169,7 +173,7 @@ class Extractor(ABC):
|
||||
|
||||
@abstractmethod
|
||||
def clean_url(url):
|
||||
pass
|
||||
return url
|
||||
|
||||
def is_known_url(url):
|
||||
return False
|
||||
|
@ -20,9 +20,13 @@ class URL2Events:
|
||||
first=True
|
||||
):
|
||||
referer = ""
|
||||
data = None
|
||||
content_type = None
|
||||
if self.extractor:
|
||||
referer = self.extractor.url_referer
|
||||
content = self.downloader.get_content(url, cache, referer=referer)
|
||||
data = self.extractor.data
|
||||
content_type = self.extractor.content_type
|
||||
content = self.downloader.get_content(url, cache, referer=referer, content_type=content_type, data=data)
|
||||
|
||||
if content is None:
|
||||
return None
|
||||
|
@ -0,0 +1,18 @@
|
||||
# Generated by Django 4.2.9 on 2025-02-08 13:33

from django.db import migrations, models


class Migration(migrations.Migration):
    # Extends RecurrentImport.processor choices with the new
    # 'Amis cerises' processor ("Les Amis du Temps des Cerises").

    dependencies = [
        ('agenda_culturel', '0145_revert_pause'),
    ]

    operations = [
        migrations.AlterField(
            model_name='recurrentimport',
            name='processor',
            field=models.CharField(choices=[('ical', 'ical'), ('icalnobusy', 'ical no busy'), ('icalnovc', 'ical no VC'), ('lacoope', 'lacoope.org'), ('lacomedie', 'la comédie'), ('lefotomat', 'le fotomat'), ('lapucealoreille', "la puce à l'oreille"), ('Plugin wordpress MEC', 'Plugin wordpress MEC'), ('Facebook events', "Événements d'une page FB"), ('Billetterie CF', 'Billetterie Clermont-Ferrand'), ('arachnee', 'Arachnée concert'), ('rio', 'Le Rio'), ('raymonde', 'La Raymonde'), ('apidae', 'Agenda apidae tourisme'), ('iguana', 'Agenda iguana (médiathèques)'), ('Mille formes', 'Mille formes'), ('Amis cerises', 'Les Amis du Temps des Cerises')], default='ical', max_length=20, verbose_name='Processor'),
        ),
    ]
|
@ -2106,6 +2106,7 @@ class RecurrentImport(models.Model):
|
||||
APIDAE = 'apidae', _('Agenda apidae tourisme')
|
||||
IGUANA = 'iguana', _('Agenda iguana (médiathèques)')
|
||||
MILLEFORMES = 'Mille formes', _('Mille formes')
|
||||
AMISCERISES = 'Amis cerises', _('Les Amis du Temps des Cerises')
|
||||
|
||||
class DOWNLOADER(models.TextChoices):
|
||||
SIMPLE = "simple", _("simple")
|
||||
|
Loading…
x
Reference in New Issue
Block a user