parent
a885d1a064
commit
55e8c1a323
@ -1,5 +1,6 @@
|
|||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
|
||||||
from ..extractor import Extractor
|
from ..extractor import Extractor
|
||||||
from ..twosteps_extractor import TwoStepsExtractor
|
from ..twosteps_extractor import TwoStepsExtractor
|
||||||
|
|
||||||
@ -33,7 +34,6 @@ class CExtractor(TwoStepsExtractor):
|
|||||||
start_day = Extractor.parse_french_date(
|
start_day = Extractor.parse_french_date(
|
||||||
soup.select("h2")[1].get_text()
|
soup.select("h2")[1].get_text()
|
||||||
) # pas parfait, mais bordel que ce site est mal construit
|
) # pas parfait, mais bordel que ce site est mal construit
|
||||||
print(soup.select("h2")[1].get_text())
|
|
||||||
|
|
||||||
spans = soup.select("div[data-testid=richTextElement] span")
|
spans = soup.select("div[data-testid=richTextElement] span")
|
||||||
start_time = None
|
start_time = None
|
||||||
@ -62,18 +62,21 @@ class CExtractor(TwoStepsExtractor):
|
|||||||
url_human = event_url
|
url_human = event_url
|
||||||
tags = ["🎵 concert"]
|
tags = ["🎵 concert"]
|
||||||
|
|
||||||
image = soup.select("wow-image img[fetchpriority=high]")
|
image = soup.select("section wow-image img[fetchpriority=high]")
|
||||||
if image:
|
if image:
|
||||||
image = image[0]["src"]
|
image = image[0]["src"]
|
||||||
else:
|
else:
|
||||||
image = None
|
image = None
|
||||||
|
|
||||||
descriptions = soup.select(
|
descriptions = soup.select(
|
||||||
"div[data-testid=mesh-container-content] div[data-testid=inline-content] div[data-testid=mesh-container-content] div[data-testid=richTextElement]"
|
"main div[data-testid=mesh-container-content] div[data-testid=inline-content] div[data-testid=inline-content] div[data-testid=mesh-container-content] div[data-testid=richTextElement]"
|
||||||
)
|
)
|
||||||
if descriptions:
|
if descriptions:
|
||||||
|
prefixes = ["TARIF", "OUVER", "SPECT", "HORAI", "LIEU\n", "conce"]
|
||||||
descriptions = [d.get_text() for d in descriptions]
|
descriptions = [d.get_text() for d in descriptions]
|
||||||
description = max(descriptions, key=len)
|
main_descs = [d for d in descriptions if d[:5] not in prefixes]
|
||||||
|
other_descs = [d for d in descriptions if d[:5] in prefixes]
|
||||||
|
description = "\n".join(main_descs + ["\n"] + other_descs)
|
||||||
else:
|
else:
|
||||||
description = None
|
description = None
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user