parent
9dada3e6c2
commit
629a8509cb
@ -1,3 +1,4 @@
|
|||||||
|
import re
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
from ..extractor import Extractor
|
from ..extractor import Extractor
|
||||||
@ -33,17 +34,16 @@ class CExtractor(TwoStepsExtractor):
|
|||||||
return None, None
|
return None, None
|
||||||
|
|
||||||
def build_event_url_list(self, content):
|
def build_event_url_list(self, content):
|
||||||
soup = BeautifulSoup(content, "xml")
|
soup = BeautifulSoup(content, "html.parser")
|
||||||
|
|
||||||
events = soup.select("div.mec-tile-event-content")
|
events = soup.select("div.mec-tile-event-content")
|
||||||
for e in events:
|
for e in events:
|
||||||
link = e.select("h4.mec-event-title a")
|
link = e.select("h4.mec-event-title a")
|
||||||
if len(link) == 1:
|
if len(link) == 1:
|
||||||
url = link[0]["href"]
|
url = link[0]["href"]
|
||||||
title = link[0].get_text()
|
title = link[0].get_text().replace("’", "'")
|
||||||
|
|
||||||
if self.add_event_url(url):
|
if self.add_event_url(url):
|
||||||
print(url, title)
|
|
||||||
self.add_event_title(url, title)
|
self.add_event_title(url, title)
|
||||||
|
|
||||||
categories = e.select(".mec-label-normal")
|
categories = e.select(".mec-label-normal")
|
||||||
@ -64,7 +64,7 @@ class CExtractor(TwoStepsExtractor):
|
|||||||
default_values=None,
|
default_values=None,
|
||||||
published=False,
|
published=False,
|
||||||
):
|
):
|
||||||
soup = BeautifulSoup(event_content, "xml")
|
soup = BeautifulSoup(event_content, "html.parser")
|
||||||
|
|
||||||
start_day = soup.select(".mec-start-date-label")
|
start_day = soup.select(".mec-start-date-label")
|
||||||
if start_day and len(start_day) > 0:
|
if start_day and len(start_day) > 0:
|
||||||
@ -91,6 +91,7 @@ class CExtractor(TwoStepsExtractor):
|
|||||||
description = soup.select(".mec-event-content .mec-single-event-description")[
|
description = soup.select(".mec-event-content .mec-single-event-description")[
|
||||||
0
|
0
|
||||||
].get_text(separator=" ")
|
].get_text(separator=" ")
|
||||||
|
description = re.sub(r"([ ]*\n)+", "\n", description).strip().replace("’", "'")
|
||||||
|
|
||||||
url_human = event_url
|
url_human = event_url
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user