parent
55a0094e2f
commit
2fe2611788
44
experimentations/get_milleformes_events.py
Executable file
44
experimentations/get_milleformes_events.py
Executable file
@ -0,0 +1,44 @@
|
|||||||
|
#!/usr/bin/python3
# coding: utf-8

"""Fetch events from the Mille formes website and save them as JSON."""

import os
import json
import sys

# Make the project root and its src/ directory importable: this script
# lives in a subdirectory (experimentations/) of the project.
current = os.path.dirname(os.path.realpath(__file__))
parent = os.path.dirname(current)
sys.path.append(parent)
sys.path.append(parent + "/src")

from src.agenda_culturel.import_tasks.downloader import *
from src.agenda_culturel.import_tasks.extractor import *
from src.agenda_culturel.import_tasks.importer import *
from src.agenda_culturel.import_tasks.custom_extractors import *


if __name__ == "__main__":
    # Downloader + site-specific extractor for the Mille formes programme page.
    u2e = URL2Events(SimpleDownloader(), mille_formes.CExtractor())
    url = "https://www.milleformes.fr/programme"
    url_human = "https://www.milleformes.fr/programme"

    try:
        events = u2e.process(
            url,
            url_human,
            cache="cache-1000formes.html",
            default_values={},
            published=True,
        )

        exportfile = "events-1000formes.json"
        print("Saving events to file {}".format(exportfile))
        with open(exportfile, "w") as f:
            json.dump(events, f, indent=4, default=str)
    except Exception as e:
        # Best-effort script: report the failure and exit normally.
        print("Exception: " + str(e))
|
@ -156,6 +156,8 @@ def run_recurrent_import_internal(rimport, downloader, req_id):
|
|||||||
extractor = apidae_tourisme.CExtractor()
|
extractor = apidae_tourisme.CExtractor()
|
||||||
elif rimport.processor == RecurrentImport.PROCESSOR.IGUANA:
|
elif rimport.processor == RecurrentImport.PROCESSOR.IGUANA:
|
||||||
extractor = iguana_agenda.CExtractor()
|
extractor = iguana_agenda.CExtractor()
|
||||||
|
elif rimport.processor == RecurrentImport.PROCESSOR.MILLEFORMES:
|
||||||
|
extractor = mille_formes.CExtractor()
|
||||||
else:
|
else:
|
||||||
extractor = None
|
extractor = None
|
||||||
|
|
||||||
|
@ -0,0 +1,193 @@
|
|||||||
|
from ..generic_extractors import *
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from datetime import datetime, date
|
||||||
|
|
||||||
|
# A class dedicated to get events from Mille formes
|
||||||
|
# URL: https://www.milleformes.fr/programme
|
||||||
|
class CExtractor(TwoStepsExtractorNoPause):
    """Two-steps extractor dedicated to the Mille formes agenda.

    URL: https://www.milleformes.fr/programme
    """

    def extract(
        self,
        content,
        url,
        url_human=None,
        default_values=None,
        published=False,
        only_future=True,
        ignore_404=True):
        """Remember the site root and today's date, then delegate to the
        generic two-steps extraction."""
        self.root_address = "https://" + urlparse(url).netloc + "/"
        # Reference date used to resolve dates written without a year.
        self.today = date.today()
        return super().extract(content, url, url_human, default_values, published, only_future, ignore_404)

    def parse_category(self, cat):
        """Map a free-text category from the website to one of our categories.

        Unknown categories fall back to 'Sans catégorie'.
        """
        cat = cat.replace("\n", "").strip()
        if "exposition" in cat or "dispositif artistique interactif" in cat:
            result = 'Visites & Expositions'
        elif "atelier" in cat:
            result = 'Animations & Ateliers'
        elif cat in ["buffet"]:
            result = 'Rendez-vous locaux'
        elif "ciné" in cat:
            result = 'Cinéma'
        elif "concert" in cat:
            result = 'Fêtes & Concerts'
        elif "rencontre" in cat:
            result = 'Rencontres & Débats'
        elif "spectacle" in cat:
            result = 'Spectacles'
        else:
            result = 'Sans catégorie'

        return result

    # this method is not perfect, but dates and hours are not structured
    def parse_dates(self, date):
        """Parse a free-text French date description.

        Returns a list of entries ``[day, [hours...]]``; when the text looks
        like a "De ... à ..." hour range, a trailing ``True`` flag is appended
        to each entry.
        """
        dl = date.replace(' à ', '\n').split('\n')
        result = []

        for d in dl:
            # only lines with a digit are candidate dates/hours
            if sum(c.isdigit() for c in d) != 0:
                # split subparts ("X, Y et Z")
                for d2 in d.replace(' et ', ', ').split(', '):
                    d2 = d2.strip()
                    dd = Extractor.parse_french_date(d2, default_year_by_proximity=self.today)
                    if dd is None:
                        # not a date: assume it is an hour, and attach it to
                        # every date collected so far
                        hh = Extractor.parse_french_time(d2)
                        for i, r in enumerate(result):
                            result[i][1].append(hh)
                    else:
                        result.append([dd, []])

        if "De" in date and " à " in date:
            # mark each entry as an hour range (start/end) rather than a
            # list of independent occurrences
            for i, r in enumerate(result):
                result[i].append(True)

        return result

    def build_event_url_list(self, content, infuture_days=180):
        """Collect the per-event page URLs from the programme page."""
        soup = BeautifulSoup(content, "html.parser")
        links = soup.select('.cell a.evenement')
        for l in links:
            # hrefs are relative: prefix them with the site root
            self.add_event_url(self.root_address + l["href"])

    def _add_single_event(self, default_values, event_url, title, category,
                          start_day, location, description, uuid,
                          start_time, end_time, published, image, image_alt):
        """Register one occurrence of an event.

        The uuid encodes the date (and possibly the hour) so that several
        occurrences extracted from the same page remain distinct.
        """
        self.add_event_with_props(
            default_values,
            event_url,
            title,
            category,
            start_day,
            location,
            description,
            [],
            recurrences=None,
            uuids=[uuid],
            url_human=event_url,
            start_time=start_time,
            end_day=start_day,
            end_time=end_time,
            published=published,
            image=image,
            image_alt=image_alt
        )

    def add_event_from_content(
        self,
        event_content,
        event_url,
        url_human=None,
        default_values=None,
        published=False,
    ):
        """Parse one event page and register its occurrence(s)."""
        soup = BeautifulSoup(event_content, "html.parser")
        title = soup.select_one('h1').text.replace("\n", "").strip().title()

        image = soup.select_one('.slide img')
        if image is None:
            image_alt = ''
        else:
            image_alt = image["alt"]
            image = self.root_address + image["src"]

        soustitre = soup.select_one('.sous-titre')
        if soustitre is not None:
            soustitre = soustitre.text.strip()

        description = soup.select_one('.texte-full').text.strip()
        infos = soup.select_one('.champ .infos')
        if infos is not None:
            infos = infos.text

        location = soup.select_one('.champ .taxo.espace').text.strip()

        # NOTE(review): `age` is currently unused, but the lookup also acts
        # as a sanity check that the page has the expected structure.
        age = soup.select_one('.champ.taxo-age').text
        category = self.parse_category(soup.select_one('.champ.categorie').text)

        date = soup.select_one('.champ.date-libre').text

        description = '\n\n'.join([x for x in [soustitre, description, date, infos] if x is not None])

        # date ranges ("du ... au ...") and recurring descriptions are not
        # supported: skip those events
        if " au " in date or date.startswith("Du") or date.lower().strip() == "en continu" or date.startswith("Les"):
            return

        dates = self.parse_dates(date)

        for d in dates:
            if len(d) >= 2:
                start_day = d[0]

                if len(d) == 3 and len(d[1]) == 2:
                    # "De X à Y" entry: one occurrence with start and end hour
                    start_time = d[1][0]
                    end_time = d[1][1]
                    uuid = event_url + "?date=" + str(start_day) + "&hour=" + str(start_time)
                    self._add_single_event(
                        default_values, event_url, title, category, start_day,
                        location, description, uuid, start_time, end_time,
                        published, image, image_alt)
                else:
                    end_time = None
                    if len(d[1]) == 0:
                        # no hour at all: single all-day occurrence
                        start_time = None
                        uuid = event_url + "?date=" + str(start_day)
                        self._add_single_event(
                            default_values, event_url, title, category, start_day,
                            location, description, uuid, start_time, end_time,
                            published, image, image_alt)
                    # one occurrence per listed hour
                    for t in d[1]:
                        start_time = t
                        uuid = event_url + "?date=" + str(start_day) + "&hour=" + str(start_time)
                        self._add_single_event(
                            default_values, event_url, title, category, start_day,
                            location, description, uuid, start_time, end_time,
                            published, image, image_alt)
|
||||||
|
|
@ -54,7 +54,7 @@ class Extractor(ABC):
|
|||||||
return i + 1
|
return i + 1
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def parse_french_date(text, default_year=None):
|
def parse_french_date(text, default_year=None, default_year_by_proximity=None):
|
||||||
# format NomJour Numero Mois Année
|
# format NomJour Numero Mois Année
|
||||||
m = re.search(
|
m = re.search(
|
||||||
"[a-zA-ZéÉûÛ:.]+[ ]*([0-9]+)[er]*[ ]*([a-zA-ZéÉûÛ:.]+)[ ]*([0-9]+)", text
|
"[a-zA-ZéÉûÛ:.]+[ ]*([0-9]+)[er]*[ ]*([a-zA-ZéÉûÛ:.]+)[ ]*([0-9]+)", text
|
||||||
@ -92,13 +92,25 @@ class Extractor(ABC):
|
|||||||
return None
|
return None
|
||||||
try:
|
try:
|
||||||
day = int(day)
|
day = int(day)
|
||||||
year = int(year)
|
if not year is None:
|
||||||
|
year = int(year)
|
||||||
except:
|
except:
|
||||||
return None
|
return None
|
||||||
if year < 100:
|
|
||||||
year = 2000 + year
|
|
||||||
if day >= 32:
|
if day >= 32:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
# by proximity
|
||||||
|
if year is None and not default_year_by_proximity is None:
|
||||||
|
dates = [date(default_year_by_proximity.year + x, month, day) for x in [-1, 0, 1]]
|
||||||
|
dates = [(abs((d - default_year_by_proximity).days), d) for d in dates]
|
||||||
|
d = min(dates, key=lambda x: x[0])
|
||||||
|
return d[1]
|
||||||
|
|
||||||
|
if year is None:
|
||||||
|
return None
|
||||||
|
|
||||||
|
if year < 100:
|
||||||
|
year = 2000 + year
|
||||||
return date(year, month, day)
|
return date(year, month, day)
|
||||||
|
|
||||||
def parse_french_time(text):
|
def parse_french_time(text):
|
||||||
|
@ -0,0 +1,18 @@
|
|||||||
|
# Generated by Django 4.2.9 on 2025-02-02 14:18
|
||||||
|
|
||||||
|
from django.db import migrations, models
|
||||||
|
|
||||||
|
|
||||||
|
class Migration(migrations.Migration):
    """Extend RecurrentImport.processor choices with the Mille formes processor."""

    dependencies = [
        ('agenda_culturel', '0141_alter_recurrentimport_processor'),
    ]

    operations = [
        migrations.AlterField(
            model_name='recurrentimport',
            name='processor',
            field=models.CharField(
                choices=[
                    ('ical', 'ical'),
                    ('icalnobusy', 'ical no busy'),
                    ('icalnovc', 'ical no VC'),
                    ('lacoope', 'lacoope.org'),
                    ('lacomedie', 'la comédie'),
                    ('lefotomat', 'le fotomat'),
                    ('lapucealoreille', "la puce à l'oreille"),
                    ('Plugin wordpress MEC', 'Plugin wordpress MEC'),
                    ('Facebook events', "Événements d'une page FB"),
                    ('Billetterie CF', 'Billetterie Clermont-Ferrand'),
                    ('arachnee', 'Arachnée concert'),
                    ('rio', 'Le Rio'),
                    ('raymonde', 'La Raymonde'),
                    ('apidae', 'Agenda apidae tourisme'),
                    ('iguana', 'Agenda iguana (médiathèques)'),
                    ('Mille formes', 'Mille formes'),
                ],
                default='ical',
                max_length=20,
                verbose_name='Processor',
            ),
        ),
    ]
|
@ -2104,6 +2104,7 @@ class RecurrentImport(models.Model):
|
|||||||
LARAYMONDE = "raymonde", _('La Raymonde')
|
LARAYMONDE = "raymonde", _('La Raymonde')
|
||||||
APIDAE = 'apidae', _('Agenda apidae tourisme')
|
APIDAE = 'apidae', _('Agenda apidae tourisme')
|
||||||
IGUANA = 'iguana', _('Agenda iguana (médiathèques)')
|
IGUANA = 'iguana', _('Agenda iguana (médiathèques)')
|
||||||
|
MILLEFORMES = 'Mille formes', _('Mille formes')
|
||||||
|
|
||||||
class DOWNLOADER(models.TextChoices):
|
class DOWNLOADER(models.TextChoices):
|
||||||
SIMPLE = "simple", _("simple")
|
SIMPLE = "simple", _("simple")
|
||||||
|
Loading…
x
Reference in New Issue
Block a user