#!/usr/bin/python3
# coding: utf-8

import os
import json
import sys

# Make the directory above this script's directory importable: the ``src``
# package imported below is looked up from there.
current = os.path.dirname(os.path.realpath(__file__))
parent = os.path.dirname(current)
sys.path.append(parent)

from src.agenda_culturel.import_tasks.downloader import *
from src.agenda_culturel.import_tasks.extractor import *
from src.agenda_culturel.import_tasks.importer import *
from src.agenda_culturel.import_tasks.custom_extractors import *


def main():
    """Extract the events of lacoope.org and dump them to a JSON file.

    The calendar page is processed by a URL2Events pipeline built from a
    SimpleDownloader and a LaCoopeExtractor; the resulting structure is
    written to ``events-lacoope.json``. Any exception raised while
    processing or saving is printed instead of being propagated.
    """
    # The same address is used as the source URL and as the human-facing URL.
    calendar_url = "https://www.lacoope.org/concerts-calendrier/"
    pipeline = URL2Events(SimpleDownloader(), LaCoopeExtractor())

    try:
        events = pipeline.process(
            calendar_url,
            calendar_url,
            cache="cache-lacoope.ical",
            default_values={"category": "Concert", "location": "La Coopérative"},
            published=True,
        )

        exportfile = "events-lacoope.json"
        print("Saving events to file {}".format(exportfile))
        with open(exportfile, "w") as f:
            json.dump(events, f, indent=4, default=str)
    except Exception as e:
        print("Exception: " + str(e))


if __name__ == "__main__":
    main()
"jour=2024-04-19" "https://lacomediedeclermont.com/saison23-24/wp-admin/admin-ajax.php"``` +La donnée retournée est du html assez succinct, avec l'essentiel dedans. + + +## La coopé + +Dans le source de https://www.lacoope.org/concerts-calendrier/ on trouve un tableau javascript qui contient les urls des événements. Ce tableau peut contenir "Gratuit" en tag. Il n'y a pas l'heure de l'événement. +Sur chaque page d'événement, il y a : +- meta name="description" +- une url https://calendar.google.com/calendar/ avec la plupart des données + +## Le caveau de la michodière + +L'adresse https://www.lecaveaudelamichodiere.com/concerts/ donne les concerts du mois en cours. +La page est peuplée par une requête javascript qui semble difficile à rejouer indépendamment, car on se prend une erreur 403 (fucking plugin propriétaire eventon). + +Si on récupère l'identifiant de l'événement (type event_11377_0), on peut forger une url du type +```https://www.lecaveaudelamichodiere.com/wp-admin/admin-ajax.php?action=eventon_ics_download&event_id=11377&ri=0``` pour récupérer un ical de l'événement. + +## La petite gaillarde + +Le flux RSS https://lapetitegaillarde.fr/?feed=rss2 est à peu près bien structuré. 
# A class dedicated to get events from La Coopérative de Mai:
# URL: https://www.lacoope.org/concerts-calendrier/
class LaCoopeExtractor(TwoStepsExtractor):
    """Two-step extractor for the concert calendar of La Coopérative de Mai.

    The calendar page embeds a javascript assignment to
    ``window.fullCalendarContent`` that lists the events. Each event page
    is then read for its title, image, description and Google Calendar
    link, the latter providing the dates and times.
    """

    # Location given to every event built by this extractor.
    nom_lieu = "La Coopérative de Mai"

    def build_event_url_list(self, content):
        """Fill ``self.event_urls`` from the HTML of the calendar page.

        Events whose ``tag`` entry is "Gratuit" additionally get the
        ``gratuit`` tag registered through ``add_event_tag``.

        Raises:
            Exception: if the results container or its script element is
                missing, or if the javascript assignment cannot be found.
        """
        soup = BeautifulSoup(content, "html.parser")
        results = soup.find('div', class_="js-filter__results")
        # find() returns None when the container is absent: report it with
        # the same message as a container holding no script.
        scripts = results.findChildren('script') if results is not None else []
        if len(scripts) == 0:
            raise Exception("Cannot find events in the first page")

        # str(tag) serialises the element together with its closing tag, so
        # the capture stops before an optional trailing ";" and "</script>";
        # only the javascript object literal is handed to the JSON5 parser.
        search = re.search(
            r"window\.fullCalendarContent = (.*?);?\s*(?:</script>|$)",
            str(scripts[0]),
            re.S,
        )
        if not search:
            raise Exception('Cannot extract events from javascript')

        data = json5.loads(search.group(1))
        self.event_urls = [e['url'] for e in data['events']]
        for e in data['events']:
            # .get(): an event without a "tag" entry is simply not free
            if e.get('tag') == "Gratuit":
                self.add_event_tag(e['url'], 'gratuit')

    def add_event_from_content(self, event_content, event_url, url_human = None, default_values = None, published = False):
        """Build one event from the HTML of its page and register it.

        The event page URL is used both as uuid and as human-facing URL
        (the ``url_human`` argument is overridden); ``default_values`` is
        not read by this extractor.

        Raises:
            Exception: if the page has no usable ``<h1>`` title or no link
                to Google Calendar.
        """
        soup = BeautifulSoup(event_content, "html.parser")

        heading = soup.find("h1")
        if heading is None or len(heading.contents) == 0:
            raise Exception('Cannot find the title of the event')
        title = heading.contents[0]
        category = "Concert"

        image = soup.find("meta", property="og:image")
        if image:
            # None (no image) rather than a KeyError when "content" is absent
            image = image.get("content")

        # The description sits in nested containers; a missing level yields
        # an empty description.
        description = soup.find("div", class_="grid-concert-content")
        for css_class in ("content-striped", "wysiwyg"):
            if description is None:
                break
            description = description.find('div', class_=css_class)
        description = "" if description is None else description.get_text()

        tags = []

        link_calendar = soup.select('a[href^="https://calendar.google.com/calendar/"]')
        if len(link_calendar) == 0:
            raise Exception('Cannot find the google calendar url')

        # Dates and times are decoded from the parameters of the calendar link.
        gg_cal = GGCalendar(link_calendar[0]["href"])
        start_day = gg_cal.start_day
        start_time = gg_cal.start_time
        end_day = gg_cal.end_day
        end_time = gg_cal.end_time
        location = LaCoopeExtractor.nom_lieu
        url_human = event_url

        self.add_event_with_props(event_url, title, category, start_day, location, description, tags, recurrences=None, uuid=event_url, url_human=url_human, start_time=start_time, end_day=end_day, end_time=end_time, published=published, image=image)
def get_content(self, url, cache = None):
    """Return the content found at *url*, optionally going through a file cache.

    Args:
        url: address handed to ``self.download`` when the cache is not used.
        cache: optional path of a cache file. When the file exists its text
            is returned and nothing is downloaded; otherwise the downloaded
            content is written to it (missing parent directories are
            created).

    Returns:
        The cached or downloaded content, or None when ``self.download``
        returned None.
    """
    if cache and os.path.exists(cache):
        print("Loading cache ({})".format(cache))
        with open(cache) as f:
            # read() returns the text verbatim; joining readlines() with
            # "\n" doubled every line break of the cached document.
            return f.read()

    content = self.download(url)

    # A failed download (None) is reported to the caller but never cached.
    if cache and content is not None:
        print("Saving cache ({})".format(cache))
        cache_dir = os.path.dirname(cache)
        if cache_dir != "":
            os.makedirs(cache_dir, exist_ok=True)
        with open(cache, "w") as text_file:
            text_file.write(content)
    return content
# A class to extract events from URL with two steps:
# - first build a list of urls where the events will be found
# - then for each document downloaded from these urls, build the events
# This class is an abstract class
class TwoStepsExtractor(Extractor):
    """Abstract extractor that lists event URLs, then reads each of them.

    Subclasses implement ``build_event_url_list`` (which must set
    ``self.event_urls``) and ``add_event_from_content``.
    """

    def __init__(self):
        super().__init__()
        # URLs of the event pages, set by build_event_url_list()
        self.event_urls = None
        # per-URL extra properties gathered during the first step
        self.event_properties = {}

    # NOTE(review): declared without ``self``, like the abstract declaration
    # it implements in Extractor.
    def clean_url(url):
        return url

    def add_event_tag(self, url, tag):
        """Remember that *tag* must be attached to the event found at *url*."""
        properties = self.event_properties.setdefault(url, {})
        properties.setdefault("tags", []).append(tag)

    def add_event_with_props(self, event_url, title, category, start_day, location, description, tags, uuid, recurrences=None, url_human=None, start_time=None, end_day=None, end_time=None, last_modified=None, published=False, image=None, image_alt=None):
        """Register an event, appending the tags recorded for *event_url*."""
        known = self.event_properties.get(event_url)
        if known is not None and 'tags' in known:
            tags = tags + known['tags']

        self.add_event(title, category, start_day, location, description, tags, uuid, recurrences, url_human, start_time, end_day, end_time, last_modified, published, image, image_alt)

    @abstractmethod
    def build_event_url_list(self, content):
        pass

    @abstractmethod
    def add_event_from_content(self, event_content, event_url, url_human = None, default_values = None, published = False):
        pass

    def extract(self, content, url, url_human = None, default_values = None, published = False):
        """Run both steps on *content* and return ``self.get_structure()``.

        Raises:
            Exception: if no event list was built, if no downloader is
                set, or if an event page cannot be downloaded.
        """
        self.set_header(url)
        self.clear_events()

        # forget everything gathered by a previous run
        self.event_urls = None
        self.event_properties.clear()

        # step 1: collect the urls of the event pages
        self.build_event_url_list(content)

        if self.event_urls is None:
            raise Exception('Unable to find the event list from the main document')

        if self.downloader is None:
            raise Exception('The downloader is not defined')

        # step 2: download each event page and turn it into an event
        for event_url in self.event_urls:
            content_event = self.downloader.get_content(event_url)
            if content_event is None:
                raise Exception(_('Cannot extract event from url {}').format(event_url))
            self.add_event_from_content(content_event, event_url, url_human, default_values, published)

        return self.get_structure()
published = False): - content = self.get_content(url, cache) + content = self.downloader.get_content(url, cache) if content is None: return None if self.extractor is not None: + self.extractor.set_downloader(self.downloader) return self.extractor.extract(content, url, url_human, default_values, published) else: # if the extractor is not defined, use a list of default extractors for e in Extractor.get_default_extractors(self.single_event): - #try: + e.set_downloader(self.downloader) events = e.extract(content, url, url_human, default_values, published) if events is not None: return events - #except: - # continue return None diff --git a/src/agenda_culturel/migrations/0049_alter_recurrentimport_processor.py b/src/agenda_culturel/migrations/0049_alter_recurrentimport_processor.py new file mode 100644 index 0000000..cd3d6d4 --- /dev/null +++ b/src/agenda_culturel/migrations/0049_alter_recurrentimport_processor.py @@ -0,0 +1,18 @@ +# Generated by Django 4.2.7 on 2024-04-19 12:07 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('agenda_culturel', '0048_auto_20240417_1212'), + ] + + operations = [ + migrations.AlterField( + model_name='recurrentimport', + name='processor', + field=models.CharField(choices=[('ical', 'ical'), ('icalnobusy', 'ical no busy'), ('icalnovc', 'ical no VC'), ('lacoope', 'lacoope.org')], default='ical', max_length=20, verbose_name='Processor'), + ), + ] diff --git a/src/agenda_culturel/models.py b/src/agenda_culturel/models.py index 5c324b7..bbd3677 100644 --- a/src/agenda_culturel/models.py +++ b/src/agenda_culturel/models.py @@ -754,6 +754,7 @@ class RecurrentImport(models.Model): ICAL = "ical", _("ical") ICALNOBUSY = "icalnobusy", _("ical no busy") ICALNOVC = "icalnovc", _("ical no VC") + LACOOPE = "lacoope", _('lacoope.org') class DOWNLOADER(models.TextChoices): SIMPLE = "simple", _("simple") diff --git a/src/requirements.txt b/src/requirements.txt index bf62e49..8c09489 100644 --- 
a/src/requirements.txt +++ b/src/requirements.txt @@ -36,4 +36,5 @@ django-recurrence==1.11.1 icalendar==5.0.11 lxml==5.1.0 bbcode==1.1.0 +json5==0.9.25