parent
d401f533bd
commit
ae26f3630c
38
experimentations/get_associations_cf.py
Executable file
38
experimentations/get_associations_cf.py
Executable file
@ -0,0 +1,38 @@
|
|||||||
|
#!/usr/bin/python3
|
||||||
|
# coding: utf-8
|
||||||
|
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
|
||||||
|
# getting the name of the directory
|
||||||
|
# where this file is present.
|
||||||
|
# Locate the directory that holds this script (symlinks resolved), then the
# directory one level above it.
current = os.path.dirname(os.path.realpath(__file__))
parent = os.path.dirname(current)

# Make both that parent directory and its "src" sub-directory importable, so
# the "src.agenda_culturel..." imports below resolve when the script is run
# directly.
for extra_path in (parent, parent + "/src"):
    sys.path.append(extra_path)
|
||||||
|
|
||||||
|
from src.agenda_culturel.import_tasks.downloader import (
|
||||||
|
ChromiumHeadlessDownloader,
|
||||||
|
)
|
||||||
|
from src.agenda_culturel.import_tasks.custom_extractors.associations_cf import (
|
||||||
|
CExtractor,
|
||||||
|
)
|
||||||
|
from src.agenda_culturel.import_tasks.importer import URL2Events
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Build the importer from the downloader and extractor imported above.
    importer = URL2Events(ChromiumHeadlessDownloader(), CExtractor())
    event_url = (
        "https://associations.clermont-ferrand.fr/evenement/week-end-multi-culturel"
    )

    # Process the single event page; "asso_cf.html" is passed as the cache name.
    events = importer.process(event_url, cache="asso_cf.html", published=True)

    output_name = "event-asso_cf.json"
    message = "Saving events to file {}".format(output_name)
    print(message)
    with open(output_name, "w") as out:
        # default=str stringifies any value json cannot serialise natively.
        json.dump(events, out, indent=4, default=str)
|
@ -0,0 +1,72 @@
|
|||||||
|
import logging
|
||||||
|
import re
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
from ..extractor import Extractor
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class CExtractor(Extractor):
    """Event extractor for pages hosted on associations.clermont-ferrand.fr.

    ``extract`` parses one event page and registers at most one event through
    ``add_event``. For URLs on any other host no event is added.
    """

    def __init__(self):
        super().__init__()

    @staticmethod
    def _node_text(node):
        """Return the stripped text of *node*, or None when *node* is None."""
        return None if node is None else node.text.strip()

    def extract(
        self,
        content,
        url,
        url_human=None,
        default_values=None,
        published=False,
    ):
        """Parse *content* and register the event it describes.

        Args:
            content: HTML source of the page, handed to BeautifulSoup.
            url: Address of the page; used to check the host, to resolve a
                relative image path, and as the event's only uuid.
            url_human: Forwarded unchanged to ``add_event``.
            default_values: Forwarded unchanged to ``add_event``.
            published: Accepted for interface compatibility; not read here.

        Returns:
            Whatever ``self.get_structure()`` returns.
        """
        # Local import: keeps this block independent of the module's
        # top-of-file import list.
        from urllib.parse import urljoin

        self.set_header(url)
        u = urlparse(url)
        if u.netloc != "associations.clermont-ferrand.fr":
            return self.get_structure()

        soup = BeautifulSoup(content, "html.parser")

        # A page without <h1> used to raise AttributeError; report it and
        # skip the event instead.
        title = self._node_text(soup.select_one("h1"))
        if title is None:
            logger.warning("No title found on %s, event skipped", url)
            return self.get_structure()

        image = None
        image_alt = None
        img = soup.select_one("img.image-style-event")
        if img is not None:
            # .get() tolerates a missing attribute where img["..."] raises.
            src = img.get("src")
            if src:
                # urljoin leaves absolute URLs alone and also resolves
                # protocol-relative ("//host/x") and page-relative paths.
                image = urljoin(url, src)
            image_alt = img.get("alt")

        # NOTE(review): description and location are None when the matching
        # element is absent -- confirm add_event accepts None for them.
        body = soup.select_one("div.field--name-body")
        description = None if body is None else body.get_text(separator="\n")
        location = self._node_text(
            soup.select_one("div.c-location__holder .c-desc")
        )

        start_day = soup.select_one("div.o-date")
        if start_day is not None:
            # Collapse line breaks (and the spaces around them) before
            # handing the text to the date parser.
            start_day = Extractor.parse_french_date(
                re.sub("[ ]*\n[ ]*", " ", start_day.get_text(separator=" "))
            )

        start_time = soup.select_one("div.c-hours__holder .c-desc")
        if start_time is not None:
            start_time = Extractor.parse_french_time(start_time.text)

        category = None
        tags = []
        uuids = [url]

        self.add_event(
            default_values,
            title,
            category,
            start_day,
            location,
            description,
            tags,
            uuids,
            url_human=url_human,
            start_time=start_time,
            image=image,
            image_alt=image_alt,
        )

        return self.get_structure()
|
@ -315,11 +315,13 @@ class Extractor(ABC):
|
|||||||
CExtractor as GoogleCalendarLinkEventExtractor,
|
CExtractor as GoogleCalendarLinkEventExtractor,
|
||||||
)
|
)
|
||||||
from .generic_extractors.ical import ICALExtractor
|
from .generic_extractors.ical import ICALExtractor
|
||||||
|
from .custom_extractors.associations_cf import CExtractor as AssociationsCF
|
||||||
|
|
||||||
if single_event:
|
if single_event:
|
||||||
return [
|
return [
|
||||||
FacebookEventExtractor(),
|
FacebookEventExtractor(),
|
||||||
GoogleCalendarLinkEventExtractor(),
|
GoogleCalendarLinkEventExtractor(),
|
||||||
|
AssociationsCF(),
|
||||||
EventNotFoundExtractor(),
|
EventNotFoundExtractor(),
|
||||||
]
|
]
|
||||||
else:
|
else:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user