From 55e8c1a3233786ba852e7c674b842324f9aae054 Mon Sep 17 00:00:00 2001 From: Jean-Marie Favreau Date: Wed, 12 Mar 2025 23:01:49 +0100 Subject: [PATCH] =?UTF-8?q?Am=C3=A9lioration=20de=20l'import=20puce=20?= =?UTF-8?q?=C3=A0=20l'oreille?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix #347 --- .../import_tasks/custom_extractors/lapucealoreille.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/agenda_culturel/import_tasks/custom_extractors/lapucealoreille.py b/src/agenda_culturel/import_tasks/custom_extractors/lapucealoreille.py index c639cda..9da1e6b 100644 --- a/src/agenda_culturel/import_tasks/custom_extractors/lapucealoreille.py +++ b/src/agenda_culturel/import_tasks/custom_extractors/lapucealoreille.py @@ -1,5 +1,6 @@ from bs4 import BeautifulSoup + from ..extractor import Extractor from ..twosteps_extractor import TwoStepsExtractor @@ -33,7 +34,6 @@ class CExtractor(TwoStepsExtractor): start_day = Extractor.parse_french_date( soup.select("h2")[1].get_text() ) # pas parfait, mais bordel que ce site est mal construit - print(soup.select("h2")[1].get_text()) spans = soup.select("div[data-testid=richTextElement] span") start_time = None @@ -62,18 +62,21 @@ class CExtractor(TwoStepsExtractor): url_human = event_url tags = ["🎵 concert"] - image = soup.select("wow-image img[fetchpriority=high]") + image = soup.select("section wow-image img[fetchpriority=high]") if image: image = image[0]["src"] else: image = None descriptions = soup.select( - "div[data-testid=mesh-container-content] div[data-testid=inline-content] div[data-testid=mesh-container-content] div[data-testid=richTextElement]" + "main div[data-testid=mesh-container-content] div[data-testid=inline-content] div[data-testid=inline-content] div[data-testid=mesh-container-content] div[data-testid=richTextElement]" ) if descriptions: + prefixes = ["TARIF", "OUVER", "SPECT", "HORAI", "LIEU\n", "conce"] descriptions = [d.get_text() for d in descriptions] - description = max(descriptions, key=len) + main_descs = [d for d in descriptions if d[:5] not in prefixes] + other_descs = [d for d in descriptions if d[:5] in prefixes] + description = "\n".join(main_descs + ["\n"] + other_descs) else: description = None