From 6c86a8fc189afe6c4f91df16bcb292582ad34046 Mon Sep 17 00:00:00 2001 From: Jean-Marie Favreau Date: Sat, 14 Sep 2024 15:43:16 +0200 Subject: [PATCH] =?UTF-8?q?Fix=20nouveau=20site=20puce=20=C3=A0=20l'oreill?= =?UTF-8?q?e?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- experimentations/get_lapucealoreille_events.py | 4 ++-- .../custom_extractors/lapucealoreille.py | 12 +++++------- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/experimentations/get_lapucealoreille_events.py b/experimentations/get_lapucealoreille_events.py index 9dbbdc1..d7a0abf 100755 --- a/experimentations/get_lapucealoreille_events.py +++ b/experimentations/get_lapucealoreille_events.py @@ -29,8 +29,8 @@ from src.agenda_culturel.import_tasks.custom_extractors import * if __name__ == "__main__": u2e = URL2Events(SimpleDownloader(), lapucealoreille.CExtractor()) - url = "https://www.lapucealoreille63.fr/programmation/" - url_human = "https://www.lapucealoreille63.fr/programmation/" + url = "https://www.lapucealoreille63.fr/agenda" + url_human = "https://www.lapucealoreille63.fr/agenda" try: events = u2e.process(url, url_human, cache = "cache-lapucealoreille.xml", default_values = {}, published = True) diff --git a/src/agenda_culturel/import_tasks/custom_extractors/lapucealoreille.py b/src/agenda_culturel/import_tasks/custom_extractors/lapucealoreille.py index 717bb82..ac1bcc5 100644 --- a/src/agenda_culturel/import_tasks/custom_extractors/lapucealoreille.py +++ b/src/agenda_culturel/import_tasks/custom_extractors/lapucealoreille.py @@ -15,11 +15,6 @@ class CExtractor(TwoStepsExtractor): e_url = e.find("a") if e_url: if self.add_event_url(e_url["href"]): - title = e.select("div[data-testid=richTextElement] h1.font_0 span") - if title: - title = title[0].contents[0].get_text().replace("\n", " ") - title = re.sub(" +", " ", title) - self.add_event_title(e_url["href"], title) def add_event_from_content( self, @@ -31,9 +26,12 @@ class CExtractor(TwoStepsExtractor): ): soup = BeautifulSoup(event_content, "html.parser") + title = soup.select("h2")[0].get_text() + start_day = self.parse_french_date( - soup.find("h2").get_text() + soup.select("h2")[1].get_text() ) # pas parfait, mais bordel que ce site est mal construit + print(soup.select("h2")[1].get_text()) spans = soup.select("div[data-testid=richTextElement] span") start_time = None @@ -79,7 +77,7 @@ class CExtractor(TwoStepsExtractor): self.add_event_with_props( event_url, - None, + title, "Concert", start_day, location,