#!/usr/bin/python3
# coding: utf-8
"""Experimentation script: run LaPuceALOreilleExtractor and dump the events to JSON."""

import os
import json
import sys

# Directory that contains this file.
current = os.path.dirname(os.path.realpath(__file__))

# Parent of that directory.
parent = os.path.dirname(current)

# Add the parent directory to sys.path so that the `src` package used by the
# imports below can be resolved when the script is run directly.
sys.path.append(parent)

from src.agenda_culturel.import_tasks.downloader import *
from src.agenda_culturel.import_tasks.extractor import *
from src.agenda_culturel.import_tasks.importer import *
from src.agenda_culturel.import_tasks.custom_extractors import *


def main():
    """Process the programme page and save the resulting events as JSON.

    The events returned by ``URL2Events.process`` are written to
    ``events-lapucealoreille.json`` in the current working directory. Any
    exception is reported on standard output instead of being propagated.
    """
    u2e = URL2Events(SimpleDownloader(), LaPuceALOreilleExtractor())
    url = "https://www.lapucealoreille63.fr/programmation/"
    url_human = "https://www.lapucealoreille63.fr/programmation/"

    try:
        events = u2e.process(url, url_human, cache="cache-lapucealoreille.xml", default_values={}, published=True)

        exportfile = "events-lapucealoreille.json"
        print(f"Saving events to file {exportfile}")
        # Explicit encoding so the result does not depend on the locale.
        with open(exportfile, "w", encoding="utf-8") as f:
            # default=str: values json cannot serialise natively are stringified.
            json.dump(events, f, indent=4, default=str)
    except Exception as e:
        # Best-effort script: report the failure rather than show a traceback.
        print(f"Exception: {e}")


if __name__ == "__main__":
    main()
# A class dedicated to get events from La puce à l'oreille
# URL: https://www.lapucealoreille63.fr/
class LaPuceALOreilleExtractor(TwoStepsExtractor):
    """Two-step extractor for the La Puce à l'Oreille web site.

    ``build_event_url_list`` collects the event links (and titles, when
    present) from the listing page; ``add_event_from_content`` parses one
    event page and registers the event through ``add_event_with_props``.
    """

    # Venue name used when the event page has no "LIEU :" entry.
    nom_lieu = "La Puce à l'Oreille"

    @staticmethod
    def _value_after_label(text):
        """Return what follows the first colon of ``text``.

        Only the first colon is treated as the label separator, so a value
        that itself contains colons (for example the time "20:30") is kept
        whole. When ``text`` has no colon, it is returned unchanged.
        """
        _label, separator, value = text.partition(":")
        return value if separator else text

    def build_event_url_list(self, content):
        """Register the URL, and the title when found, of every listed event.

        ``content`` is the HTML of the listing page.
        """
        soup = BeautifulSoup(content, "html.parser")

        containers = soup.select("div.SPY_vo div[data-testid=mesh-container-content]")
        for container in containers:
            # Only an anchor that carries an href can lead to an event page.
            link = container.find("a", href=True)
            if not link:
                continue
            href = link["href"]
            # A falsy result from add_event_url means nothing more to record
            # for this link (e.g. the URL was already registered).
            if not self.add_event_url(href):
                continue
            title_spans = container.select("div[data-testid=richTextElement] h1.font_0 span")
            # Guard against an empty span, which has no first child.
            if title_spans and title_spans[0].contents:
                title = title_spans[0].contents[0].get_text().replace("\n", " ")
                # Collapse runs of spaces left by the line-break replacement.
                title = re.sub(" +", " ", title)
                self.add_event_title(href, title)

    def add_event_from_content(self, event_content, event_url, url_human=None, default_values=None, published=False):
        """Parse one event page and register the event it describes.

        ``event_content`` is the HTML of the page and ``event_url`` its
        address, which is also used as the event's uuid and human URL (the
        ``url_human`` argument is overridden). ``default_values`` is not used
        by this extractor. ``published`` is forwarded unchanged.
        """
        soup = BeautifulSoup(event_content, "html.parser")

        # Not perfect: the date is read from the first <h2> of the page,
        # because the site's markup is poorly structured.
        # NOTE(review): a page without any <h2> raises AttributeError here.
        start_day = self.parse_french_date(soup.find("h2").get_text())

        start_time = None
        end_time = None
        location = None

        for span in soup.select("div[data-testid=richTextElement] span"):
            txt = span.get_text()
            label = txt.lstrip()
            if label.startswith("DÉBUT"):
                start_time = self.parse_french_time(self._value_after_label(txt))
                end_time = None
            elif label.startswith("HORAIRES :"):
                # "start - end"; the end part is optional.
                hours = self._value_after_label(txt).split("-")
                start_time = self.parse_french_time(hours[0])
                end_time = self.parse_french_time(hours[1]) if len(hours) > 1 else None
            elif label.startswith("LIEU :") and not location:
                # Only the first "LIEU :" entry is kept.
                location = self._value_after_label(txt).lstrip()

        if not location:
            location = self.nom_lieu
        end_day = self.guess_end_day(start_day, start_time, end_time)

        url_human = event_url
        tags = []

        images = soup.select("wow-image img[fetchpriority=high]")
        # .get() yields None instead of raising when the tag has no src.
        image = images[0].get("src") if images else None

        descriptions = soup.select("div[data-testid=mesh-container-content] div[data-testid=inline-content] div[data-testid=mesh-container-content] div[data-testid=richTextElement]")
        if descriptions:
            # Several rich-text blocks match; the longest is used as description.
            description = max((d.get_text() for d in descriptions), key=len)
        else:
            description = None

        self.add_event_with_props(
            event_url,
            None,
            "Concert",
            start_day,
            location,
            description,
            tags,
            recurrences=None,
            uuid=event_url,
            url_human=url_human,
            start_time=start_time,
            end_day=end_day,
            end_time=end_time,
            published=published,
            image=image,
        )
def add_event_url(self, url):
    """Register ``url`` as an event URL unless it is already known.

    Returns True when the URL was appended to ``self.event_urls`` and
    False when it was already present (the list is then left untouched).
    """
    if url in self.event_urls:
        return False
    self.event_urls.append(url)
    return True
# Processor choice members visible in this part of RecurrentImport; each one
# pairs the stored value with its label passed through `_`.
LACOOPE = "lacoope", _('lacoope.org')
LACOMEDIE = "lacomedie", _('la comédie')
LEFOTOMAT = "lefotomat", _('le fotomat')
# Double-quoted so the apostrophe is part of the label: the former
# 'la puce à l''oreille' spelling was two adjacent string literals, which
# Python concatenates into "la puce à loreille".
# NOTE(review): migration 0054 records the label without the apostrophe;
# regenerate the migrations after this change.
LAPUCEALOREILLE = "lapucealoreille", _("la puce à l'oreille")