Add import of pages from the associations CF agenda

Fix #277
Jean-Marie Favreau 2025-03-09 15:54:28 +01:00
parent d401f533bd
commit ae26f3630c
3 changed files with 112 additions and 0 deletions


@@ -0,0 +1,38 @@
#!/usr/bin/python3
# coding: utf-8

import json
import os
import sys

# Directory where this file is located.
current = os.path.dirname(os.path.realpath(__file__))
# Parent directory of the current directory.
parent = os.path.dirname(current)
# Add the parent directory (and its src/ subdirectory) to sys.path
# so the project modules can be imported.
sys.path.append(parent)
sys.path.append(parent + "/src")

from src.agenda_culturel.import_tasks.downloader import (
    ChromiumHeadlessDownloader,
)
from src.agenda_culturel.import_tasks.custom_extractors.associations_cf import (
    CExtractor,
)
from src.agenda_culturel.import_tasks.importer import URL2Events

if __name__ == "__main__":
    # Download and extract a single event page from the associations CF agenda.
    u2e = URL2Events(ChromiumHeadlessDownloader(), CExtractor())
    url = "https://associations.clermont-ferrand.fr/evenement/week-end-multi-culturel"
    events = u2e.process(url, cache="asso_cf.html", published=True)

    exportfile = "event-asso_cf.json"
    print("Saving events to file {}".format(exportfile))
    with open(exportfile, "w") as f:
        json.dump(events, f, indent=4, default=str)


@@ -0,0 +1,72 @@
import logging
import re
from urllib.parse import urlparse

from bs4 import BeautifulSoup

from ..extractor import Extractor

logger = logging.getLogger(__name__)


# Custom extractor for event pages of associations.clermont-ferrand.fr
class CExtractor(Extractor):
    def __init__(self):
        super().__init__()

    def extract(
        self,
        content,
        url,
        url_human=None,
        default_values=None,
        published=False,
    ):
        self.set_header(url)

        u = urlparse(url)
        if u.netloc == "associations.clermont-ferrand.fr":
            soup = BeautifulSoup(content, "html.parser")

            title = soup.select_one("h1").text.strip()

            # Illustration image and its alt text; relative URLs are made absolute.
            img = soup.select_one("img.image-style-event")
            image = None
            image_alt = None
            if img:
                image = img["src"]
                if not image.startswith("http"):
                    image = "https://" + u.netloc + image
                image_alt = img["alt"]

            description = soup.select_one("div.field--name-body").get_text(
                separator="\n"
            )
            location = soup.select_one("div.c-location__holder .c-desc").text

            # Date and time, parsed from their French textual representation.
            start_day = soup.select_one("div.o-date")
            if start_day is not None:
                start_day = Extractor.parse_french_date(
                    re.sub("[ ]*\n[ ]*", " ", start_day.get_text(separator=" "))
                )
            start_time = soup.select_one("div.c-hours__holder .c-desc")
            if start_time is not None:
                start_time = Extractor.parse_french_time(start_time.text)

            category = None
            tags = []
            uuids = [url]

            self.add_event(
                default_values,
                title,
                category,
                start_day,
                location,
                description,
                tags,
                uuids,
                url_human=url_human,
                start_time=start_time,
                image=image,
                image_alt=image_alt,
            )

        return self.get_structure()


@@ -315,11 +315,13 @@ class Extractor(ABC):
             CExtractor as GoogleCalendarLinkEventExtractor,
         )
         from .generic_extractors.ical import ICALExtractor
+        from .custom_extractors.associations_cf import CExtractor as AssociationsCF
 
         if single_event:
             return [
                 FacebookEventExtractor(),
                 GoogleCalendarLinkEventExtractor(),
+                AssociationsCF(),
                 EventNotFoundExtractor(),
             ]
         else:
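
For reference, a minimal sketch (not part of the commit) of how the new extractor could be exercised directly on an already-downloaded page, without going through URL2Events. The import path and the extract() signature are taken from the diff above; the local file name asso_cf.html reuses the cache file written by the test script, and whether extract() can be called outside URL2Events is an assumption.

# Sketch: run the new CExtractor on a cached copy of an event page.
# Assumes the page was previously saved as "asso_cf.html" (the cache file
# used by the test script in this commit).
from src.agenda_culturel.import_tasks.custom_extractors.associations_cf import (
    CExtractor,
)

url = "https://associations.clermont-ferrand.fr/evenement/week-end-multi-culturel"

with open("asso_cf.html") as f:
    content = f.read()

# Returns the structure built by add_event()/get_structure().
structure = CExtractor().extract(content, url)
print(structure)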