Amélioration de l'import puce à l'oreille

Fix #347
This commit is contained in:
Jean-Marie Favreau 2025-03-12 23:01:49 +01:00
parent a885d1a064
commit 55e8c1a323

View File

@@ -1,5 +1,6 @@
from bs4 import BeautifulSoup
from ..extractor import Extractor
from ..twosteps_extractor import TwoStepsExtractor
@@ -33,7 +34,6 @@ class CExtractor(TwoStepsExtractor):
start_day = Extractor.parse_french_date(
soup.select("h2")[1].get_text()
) # pas parfait, mais bordel que ce site est mal construit
print(soup.select("h2")[1].get_text())
spans = soup.select("div[data-testid=richTextElement] span")
start_time = None
@@ -62,18 +62,21 @@ class CExtractor(TwoStepsExtractor):
url_human = event_url
tags = ["🎵 concert"]
image = soup.select("wow-image img[fetchpriority=high]")
image = soup.select("section wow-image img[fetchpriority=high]")
if image:
image = image[0]["src"]
else:
image = None
descriptions = soup.select(
"div[data-testid=mesh-container-content] div[data-testid=inline-content] div[data-testid=mesh-container-content] div[data-testid=richTextElement]"
"main div[data-testid=mesh-container-content] div[data-testid=inline-content] div[data-testid=inline-content] div[data-testid=mesh-container-content] div[data-testid=richTextElement]"
)
if descriptions:
prefixes = ["TARIF", "OUVER", "SPECT", "HORAI", "LIEU\n", "conce"]
descriptions = [d.get_text() for d in descriptions]
description = max(descriptions, key=len)
main_descs = [d for d in descriptions if d[:5] not in prefixes]
other_descs = [d for d in descriptions if d[:5] in prefixes]
description = "\n".join(main_descs + ["\n"] + other_descs)
else:
description = None