diff --git a/src/agenda_culturel/import_tasks/custom_extractors/lapucealoreille.py b/src/agenda_culturel/import_tasks/custom_extractors/lapucealoreille.py index c639cda..9da1e6b 100644 --- a/src/agenda_culturel/import_tasks/custom_extractors/lapucealoreille.py +++ b/src/agenda_culturel/import_tasks/custom_extractors/lapucealoreille.py @@ -1,5 +1,6 @@ from bs4 import BeautifulSoup + from ..extractor import Extractor from ..twosteps_extractor import TwoStepsExtractor @@ -33,7 +34,6 @@ class CExtractor(TwoStepsExtractor): start_day = Extractor.parse_french_date( soup.select("h2")[1].get_text() ) # pas parfait, mais bordel que ce site est mal construit - print(soup.select("h2")[1].get_text()) spans = soup.select("div[data-testid=richTextElement] span") start_time = None @@ -62,18 +62,21 @@ class CExtractor(TwoStepsExtractor): url_human = event_url tags = ["🎵 concert"] - image = soup.select("wow-image img[fetchpriority=high]") + image = soup.select("section wow-image img[fetchpriority=high]") if image: image = image[0]["src"] else: image = None descriptions = soup.select( - "div[data-testid=mesh-container-content] div[data-testid=inline-content] div[data-testid=mesh-container-content] div[data-testid=richTextElement]" + "main div[data-testid=mesh-container-content] div[data-testid=inline-content] div[data-testid=inline-content] div[data-testid=mesh-container-content] div[data-testid=richTextElement]" ) if descriptions: + prefixes = ["TARIF", "OUVER", "SPECT", "HORAI", "LIEU\n", "conce"] descriptions = [d.get_text() for d in descriptions] - description = max(descriptions, key=len) + main_descs = [d for d in descriptions if d[:5] not in prefixes] + other_descs = [d for d in descriptions if d[:5] in prefixes] + description = "\n".join(main_descs + ["\n"] + other_descs) else: description = None