From 629a8509cbe40ebf7e9d1165a31fa9f805289975 Mon Sep 17 00:00:00 2001
From: Jean-Marie Favreau <jean-marie.favreau@logiroad-center.com>
Date: Sun, 9 Mar 2025 14:53:34 +0100
Subject: [PATCH] On corrige l'import poulailler

Fix #132
---
 .../import_tasks/generic_extractors/wordpress_mec.py     | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/agenda_culturel/import_tasks/generic_extractors/wordpress_mec.py b/src/agenda_culturel/import_tasks/generic_extractors/wordpress_mec.py
index d927580..058af06 100644
--- a/src/agenda_culturel/import_tasks/generic_extractors/wordpress_mec.py
+++ b/src/agenda_culturel/import_tasks/generic_extractors/wordpress_mec.py
@@ -1,3 +1,4 @@
+import re
 from bs4 import BeautifulSoup
 
 from ..extractor import Extractor
@@ -33,17 +34,16 @@ class CExtractor(TwoStepsExtractor):
             return None, None
 
     def build_event_url_list(self, content):
-        soup = BeautifulSoup(content, "xml")
+        soup = BeautifulSoup(content, "html.parser")
 
         events = soup.select("div.mec-tile-event-content")
         for e in events:
             link = e.select("h4.mec-event-title a")
             if len(link) == 1:
                 url = link[0]["href"]
-                title = link[0].get_text()
+                title = link[0].get_text().replace("’", "'")
 
                 if self.add_event_url(url):
-                    print(url, title)
                     self.add_event_title(url, title)
 
                 categories = e.select(".mec-label-normal")
@@ -64,7 +64,7 @@ class CExtractor(TwoStepsExtractor):
         default_values=None,
         published=False,
     ):
-        soup = BeautifulSoup(event_content, "xml")
+        soup = BeautifulSoup(event_content, "html.parser")
 
         start_day = soup.select(".mec-start-date-label")
         if start_day and len(start_day) > 0:
@@ -91,6 +91,7 @@ class CExtractor(TwoStepsExtractor):
         description = soup.select(".mec-event-content .mec-single-event-description")[
             0
         ].get_text(separator=" ")
+        description = re.sub(r"([ ]*\n)+", "\n", description).strip().replace("’", "'")
 
         url_human = event_url