From 629a8509cbe40ebf7e9d1165a31fa9f805289975 Mon Sep 17 00:00:00 2001 From: Jean-Marie Favreau Date: Sun, 9 Mar 2025 14:53:34 +0100 Subject: [PATCH] On corrige l'import poulailler Fix #132 --- .../import_tasks/generic_extractors/wordpress_mec.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/agenda_culturel/import_tasks/generic_extractors/wordpress_mec.py b/src/agenda_culturel/import_tasks/generic_extractors/wordpress_mec.py index d927580..058af06 100644 --- a/src/agenda_culturel/import_tasks/generic_extractors/wordpress_mec.py +++ b/src/agenda_culturel/import_tasks/generic_extractors/wordpress_mec.py @@ -1,3 +1,4 @@ +import re from bs4 import BeautifulSoup from ..extractor import Extractor @@ -33,17 +34,16 @@ class CExtractor(TwoStepsExtractor): return None, None def build_event_url_list(self, content): - soup = BeautifulSoup(content, "xml") + soup = BeautifulSoup(content, "html.parser") events = soup.select("div.mec-tile-event-content") for e in events: link = e.select("h4.mec-event-title a") if len(link) == 1: url = link[0]["href"] - title = link[0].get_text() + title = link[0].get_text().replace("’", "'") if self.add_event_url(url): - print(url, title) self.add_event_title(url, title) categories = e.select(".mec-label-normal") @@ -64,7 +64,7 @@ class CExtractor(TwoStepsExtractor): default_values=None, published=False, ): - soup = BeautifulSoup(event_content, "xml") + soup = BeautifulSoup(event_content, "html.parser") start_day = soup.select(".mec-start-date-label") if start_day and len(start_day) > 0: @@ -91,6 +91,7 @@ class CExtractor(TwoStepsExtractor): description = soup.select(".mec-event-content .mec-single-event-description")[ 0 ].get_text(separator=" ") + description = re.sub(r"([ ]*\n)+", "\n", description).strip().replace("’", "'") url_human = event_url