diff --git a/experimentations/get_amisdutempsdescerises.py b/experimentations/get_amisdutempsdescerises.py
new file mode 100755
index 0000000..9dd11c8
--- /dev/null
+++ b/experimentations/get_amisdutempsdescerises.py
@@ -0,0 +1,44 @@
+#!/usr/bin/python3
+# coding: utf-8
+
+import os
+import json
+import sys
+
+# getting the name of the directory
+# where this file is present.
+current = os.path.dirname(os.path.realpath(__file__))
+
+# Getting the parent directory name
+# where the current directory is present.
+parent = os.path.dirname(current)
+
+# adding the parent directory to
+# the sys.path.
+sys.path.append(parent)
+sys.path.append(parent + "/src")
+
+from src.agenda_culturel.import_tasks.downloader import *
+from src.agenda_culturel.import_tasks.extractor import *
+from src.agenda_culturel.import_tasks.importer import *
+from src.agenda_culturel.import_tasks.custom_extractors import *
+
+
+
+
+
+if __name__ == "__main__":
+
+    u2e = URL2Events(SimpleDownloader(), amisdutempsdescerises.CExtractor())
+    url = "https://amisdutempsdescerises.org/page.php"
+    url_human = "https://amisdutempsdescerises.org/"
+
+    try:
+        events = u2e.process(url, url_human, cache = "cache-amiscerices.xml", default_values = {"category": "Rencontres & Débats"}, published = True)
+
+        exportfile = "events-amiscerices.json"
+        print("Saving events to file {}".format(exportfile))
+        with open(exportfile, "w") as f:
+            json.dump(events, f, indent=4, default=str)
+    except Exception as e:
+        print("Exception: " + str(e))
diff --git a/src/agenda_culturel/celery.py b/src/agenda_culturel/celery.py
index dc3567c..cf84220 100644
--- a/src/agenda_culturel/celery.py
+++ b/src/agenda_culturel/celery.py
@@ -158,6 +158,8 @@ def run_recurrent_import_internal(rimport, downloader, req_id):
         extractor = iguana_agenda.CExtractor()
     elif rimport.processor == RecurrentImport.PROCESSOR.MILLEFORMES:
         extractor = mille_formes.CExtractor()
+    elif rimport.processor == RecurrentImport.PROCESSOR.AMISCERISES:
+        extractor = amisdutempsdescerises.CExtractor()
     else:
         extractor = None
 
diff --git a/src/agenda_culturel/import_tasks/custom_extractors/amisdutempsdescerises.py b/src/agenda_culturel/import_tasks/custom_extractors/amisdutempsdescerises.py
new file mode 100644
index 0000000..2c43c29
--- /dev/null
+++ b/src/agenda_culturel/import_tasks/custom_extractors/amisdutempsdescerises.py
@@ -0,0 +1,72 @@
+from ..extractor import *
+import json
+from bs4 import BeautifulSoup
+from urllib.parse import urlparse, unquote
+import pytz
+import html
+
+
+# A class dedicated to getting events from Les Amis du Temps des Cerises
+# Website https://amisdutempsdescerises.org/
+class CExtractor(Extractor):
+
+    def __init__(self):
+        super().__init__()
+        self.data = b'------toto\r\nContent-Disposition: form-data; name="p"\r\n\r\nfutur\r\n------toto--\r\n'
+        self.content_type = 'multipart/form-data; boundary=----toto'
+
+
+    def extract(
+        self, content, url, url_human=None, default_values=None, published=False
+    ):
+        self.set_header(url)
+        self.clear_events()
+
+        root_url = "https://" + urlparse(url).netloc + "/"
+        images_basename = root_url + "images/"
+        from_timezone = pytz.utc
+        to_timezone = pytz.timezone("Europe/Paris")
+
+        events = json.loads(content)
+        for e in events:
+            tags = []
+            start_day = e["ev_date"].split(' ')[0]
+            start_time = e["ev_time"]
+            title = html.unescape(e["ev_titre"]) # TODO: decode
+            if "ev_sstitre" in e and e["ev_sstitre"] != '':
+                title = title + ' - ' + html.unescape(e["ev_sstitre"])
+
+            soup = BeautifulSoup(e["ev_info"], "html.parser")
+            description = soup.text
+            location = e["li_nom"] if "li_nom" in e else None
+            if "ev_canceled" in e and e["ev_canceled"] != '0':
+                tags += ["annulé"]
+
+            image = None
+            if "ev_img" in e and e["ev_img"] != '':
+                image = images_basename + e["ev_img"]
+
+            naive_dt = datetime.strptime(e["ev_date"], "%Y-%m-%d %H:%M:%S")
+
+            from_dt = from_timezone.localize(naive_dt)
+            dt = to_timezone.normalize(from_dt)
+            ts = int(datetime.timestamp(dt)) * 1000
+
+            event_url = root_url + "#" + str(ts)
+
+            self.add_event(
+                default_values,
+                title,
+                None,
+                start_day,
+                location,
+                description,
+                tags,
+                uuids=[event_url],
+                recurrences=None,
+                url_human=event_url,
+                start_time=start_time,
+                published=published,
+                image=image)
+
+        return self.get_structure()
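Note: the extractor above does not fetch anything itself; it expects content to be the JSON list returned by page.php when the multipart body defined in __init__ is POSTed. A minimal standalone sketch of that exchange follows; the endpoint returning a JSON list with keys such as ev_date and ev_titre is an assumption based on the extractor code above.

# Standalone sketch: POST the same multipart body the extractor defines and
# parse the JSON response. Endpoint behaviour is assumed, not guaranteed.
import json
from urllib.request import Request, urlopen

url = "https://amisdutempsdescerises.org/page.php"
data = b'------toto\r\nContent-Disposition: form-data; name="p"\r\n\r\nfutur\r\n------toto--\r\n'
headers = {"Content-Type": "multipart/form-data; boundary=----toto"}

req = Request(url, headers=headers, data=data)  # passing data makes urllib issue a POST
with urlopen(req) as resource:
    events = json.loads(resource.read().decode("utf-8"))

for e in events:
    # key names taken from the extractor above
    print(e.get("ev_date"), e.get("ev_titre"))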
diff --git a/src/agenda_culturel/import_tasks/downloader.py b/src/agenda_culturel/import_tasks/downloader.py
index 5636c3e..aff45ab 100644
--- a/src/agenda_culturel/import_tasks/downloader.py
+++ b/src/agenda_culturel/import_tasks/downloader.py
@@ -17,13 +17,13 @@ class Downloader(ABC):
     def download(self, url, post=None):
         pass
 
-    def get_content(self, url, cache=None, referer=None, post=None):
+    def get_content(self, url, cache=None, referer=None, post=None, content_type=None, data=None):
         if cache and os.path.exists(cache):
             print("Loading cache ({})".format(cache))
             with open(cache) as f:
                 content = "\n".join(f.readlines())
         else:
-            content = self.download(url, referer=referer, post=post)
+            content = self.download(url, referer=referer, post=post, content_type=content_type, data=data)
 
             if cache:
                 print("Saving cache ({})".format(cache))
@@ -39,7 +39,7 @@ class SimpleDownloader(Downloader):
     def __init__(self):
         super().__init__()
 
-    def download(self, url, referer=None, post=None):
+    def download(self, url, referer=None, post=None, content_type=None, data=None):
         print("Downloading {} referer: {} post: {}".format(url, referer, post))
         try:
             headers = {
@@ -47,7 +47,9 @@
             }
             if referer is not None:
                 headers["Referer"] = referer
-            req = Request(url, headers=headers)
+            if content_type is not None:
+                headers["Content-Type"] = content_type
+            req = Request(url, headers=headers, data=data)
             if post:
                 post_args = urlencode(post).encode("utf-8")
                 resource = urllib.request.urlopen(req, post_args)
@@ -109,9 +111,15 @@ class ChromiumHeadlessDownloader(Downloader):
         return True
 
-    def download(self, url, referer=None, post=None):
+    def download(self, url, referer=None, post=None, content_type=None, data=None):
         if post:
             raise Exception("POST method with Chromium headless not yet implemented")
+        if referer:
+            raise Exception("Referer parameter with Chromium headless not yet implemented")
+        if data:
+            raise Exception("Data content with Chromium headless not yet implemented")
+        if content_type:
+            raise Exception("Content-type parameter with Chromium headless not yet implemented")
 
         print("Download {}".format(url))
 
         try:
diff --git a/src/agenda_culturel/import_tasks/extractor.py b/src/agenda_culturel/import_tasks/extractor.py
index 895f6a8..ded6b8f 100644
--- a/src/agenda_culturel/import_tasks/extractor.py
+++ b/src/agenda_culturel/import_tasks/extractor.py
@@ -19,7 +19,11 @@ class Extractor(ABC):
         self.events = []
         self.downloader = None
         self.has_2nd_method = False
+
+        # parameters used by the downloader to get the content
         self.referer = ""
+        self.data = None
+        self.content_type = None
 
     def prepare_2nd_extract(self):
         pass
@@ -169,7 +173,7 @@ class Extractor(ABC):
 
     @abstractmethod
     def clean_url(url):
-        pass
+        return url
 
     def is_known_url(url):
         return False
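The downloader and extractor changes above are two halves of one mechanism: an extractor declares data and content_type, and the downloader forwards them to urllib. A minimal sketch of exercising the extended signature directly, assuming the same sys.path setup as the experimentation script:

# Sketch only: call the extended get_content() with the values the new
# extractor sets in __init__. Import paths assume the sys.path tweak used
# in experimentations/get_amisdutempsdescerises.py.
from src.agenda_culturel.import_tasks.downloader import SimpleDownloader
from src.agenda_culturel.import_tasks.custom_extractors import amisdutempsdescerises

extractor = amisdutempsdescerises.CExtractor()
downloader = SimpleDownloader()

content = downloader.get_content(
    "https://amisdutempsdescerises.org/page.php",
    cache="cache-amiscerices.xml",          # optional, mirrors the experimentation script
    content_type=extractor.content_type,    # multipart/form-data; boundary=----toto
    data=extractor.data,                    # form field p=futur, presumably upcoming events
)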
diff --git a/src/agenda_culturel/import_tasks/importer.py b/src/agenda_culturel/import_tasks/importer.py
index 4127474..e10c439 100644
--- a/src/agenda_culturel/import_tasks/importer.py
+++ b/src/agenda_culturel/import_tasks/importer.py
@@ -20,9 +20,13 @@ class URL2Events:
         first=True
     ):
         referer = ""
+        data = None
+        content_type = None
         if self.extractor:
             referer = self.extractor.url_referer
-        content = self.downloader.get_content(url, cache, referer=referer)
+            data = self.extractor.data
+            content_type = self.extractor.content_type
+        content = self.downloader.get_content(url, cache, referer=referer, content_type=content_type, data=data)
 
         if content is None:
             return None
diff --git a/src/agenda_culturel/migrations/0146_alter_recurrentimport_processor.py b/src/agenda_culturel/migrations/0146_alter_recurrentimport_processor.py
new file mode 100644
index 0000000..fb7f064
--- /dev/null
+++ b/src/agenda_culturel/migrations/0146_alter_recurrentimport_processor.py
@@ -0,0 +1,18 @@
+# Generated by Django 4.2.9 on 2025-02-08 13:33
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('agenda_culturel', '0145_revert_pause'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='recurrentimport',
+            name='processor',
+            field=models.CharField(choices=[('ical', 'ical'), ('icalnobusy', 'ical no busy'), ('icalnovc', 'ical no VC'), ('lacoope', 'lacoope.org'), ('lacomedie', 'la comédie'), ('lefotomat', 'le fotomat'), ('lapucealoreille', "la puce à l'oreille"), ('Plugin wordpress MEC', 'Plugin wordpress MEC'), ('Facebook events', "Événements d'une page FB"), ('Billetterie CF', 'Billetterie Clermont-Ferrand'), ('arachnee', 'Arachnée concert'), ('rio', 'Le Rio'), ('raymonde', 'La Raymonde'), ('apidae', 'Agenda apidae tourisme'), ('iguana', 'Agenda iguana (médiathèques)'), ('Mille formes', 'Mille formes'), ('Amis cerises', 'Les Amis du Temps des Cerises')], default='ical', max_length=20, verbose_name='Processor'),
+        ),
+    ]
diff --git a/src/agenda_culturel/models.py b/src/agenda_culturel/models.py
index 8e53dd0..15d4209 100644
--- a/src/agenda_culturel/models.py
+++ b/src/agenda_culturel/models.py
@@ -2106,6 +2106,7 @@ class RecurrentImport(models.Model):
         APIDAE = 'apidae', _('Agenda apidae tourisme')
         IGUANA = 'iguana', _('Agenda iguana (médiathèques)')
         MILLEFORMES = 'Mille formes', _('Mille formes')
+        AMISCERISES = 'Amis cerises', _('Les Amis du Temps des Cerises')
 
     class DOWNLOADER(models.TextChoices):
         SIMPLE = "simple", _("simple")
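With the migration and model entry in place, the new source is selected like any other recurrent import. A hypothetical Django shell sketch follows; only the choice classes visible in this diff are used, the downloader field name is assumed, and any other required RecurrentImport fields are omitted. Note that the pairing matters: ChromiumHeadlessDownloader rejects the data and content_type parameters, so this processor has to be used with the simple downloader.

# Hypothetical shell snippet: enum members come from the model above; the
# "downloader" field name is assumed from the DOWNLOADER choices class, and
# other required fields are intentionally left out of this sketch.
from agenda_culturel.models import RecurrentImport

rimport = RecurrentImport(
    processor=RecurrentImport.PROCESSOR.AMISCERISES,
    downloader=RecurrentImport.DOWNLOADER.SIMPLE,
)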