Fix nouveau site puce à l'oreille

2024-09-14 15:43:16 +02:00
parent 62060925cd
commit 6c86a8fc18
2 changed files with 7 additions and 9 deletions
--- a/experimentations/get_lapucealoreille_events.py
+++ b/experimentations/get_lapucealoreille_events.py
@@ -29,8 +29,8 @@ from src.agenda_culturel.import_tasks.custom_extractors import *
 if __name__ == "__main__":

    u2e = URL2Events(SimpleDownloader(), lapucealoreille.CExtractor())
-    url = "https://www.lapucealoreille63.fr/programmation/"
-    url_human = "https://www.lapucealoreille63.fr/programmation/"
+    url = "https://www.lapucealoreille63.fr/agenda"
+    url_human = "https://www.lapucealoreille63.fr/agenda"

    try:
        events = u2e.process(url, url_human, cache = "cache-lapucealoreille.xml", default_values = {}, published = True)
--- a/src/agenda_culturel/import_tasks/custom_extractors/lapucealoreille.py
+++ b/src/agenda_culturel/import_tasks/custom_extractors/lapucealoreille.py
@@ -15,11 +15,6 @@ class CExtractor(TwoStepsExtractor):
            e_url = e.find("a")
            if e_url:
                if self.add_event_url(e_url["href"]):
-                    title = e.select("div[data-testid=richTextElement] h1.font_0 span")
-                    if title:
-                        title = title[0].contents[0].get_text().replace("\n", " ")
-                        title = re.sub(" +", " ", title)
-                        self.add_event_title(e_url["href"], title)

    def add_event_from_content(
        self,
@@ -31,9 +26,12 @@ class CExtractor(TwoStepsExtractor):
    ):
        soup = BeautifulSoup(event_content, "html.parser")

+        title = soup.select("h2")[0].get_text()
+
        start_day = self.parse_french_date(
-            soup.find("h2").get_text()
+            soup.select("h2")[1].get_text()
        )  # pas parfait, mais bordel que ce site est mal construit
+        print(soup.select("h2")[1].get_text())

        spans = soup.select("div[data-testid=richTextElement] span")
        start_time = None
@@ -79,7 +77,7 @@ class CExtractor(TwoStepsExtractor):

        self.add_event_with_props(
            event_url,
-            None,
+            title,
            "Concert",
            start_day,
            location,