Ajout d'un détecteur de pages pas complètement chargées

2025-02-07 11:55:32 +01:00 · 2025-02-07 11:55:32 +01:00 · 7648d1d794
commit 7648d1d794
parent bb2a6b04e5
2 changed files with 27 additions and 17 deletions
--- a/src/agenda_culturel/import_tasks/custom_extractors/fbevents.py
+++ b/src/agenda_culturel/import_tasks/custom_extractors/fbevents.py
@@ -17,30 +17,28 @@ logger = logging.getLogger(__name__)
 # such as https://www.facebook.com/laJeteeClermont/events
 class CExtractor(TwoStepsExtractor):

-    def find_event_id_fragment_in_array(self, array, first=True):
-        found = False
+    def find_event_id_fragment_in_array(self, array):
        if isinstance(array, dict):
            if "__typename" in array and array["__typename"] == "Event" and "id" in array:
                self.add_event_url("https://www.facebook.com/events/" + array["id"] + "/")
-                found = True
-            if not found:
+                self.found = True
+            else:
                for k in array:
-                    found = self.find_event_id_fragment_in_array(array[k], False) or found
+                    if k == "pageItems":
+                        self.has_page_items = True
+                    self.find_event_id_fragment_in_array(array[k])
        elif isinstance(array, list):
            for e in array:
-                found = self.find_event_id_fragment_in_array(e, False) or found
-        return found
+                self.find_event_id_fragment_in_array(e)


    def find_in_js(self, soup):
-        found = False

        for json_script in soup.find_all("script", type="application/json"):
            json_txt = json_script.get_text()
            json_struct = json.loads(json_txt)
-            found = self.find_event_id_fragment_in_array(json_struct) or found
+            self.find_event_id_fragment_in_array(json_struct)

-        return found


    def build_event_url_list(self, content):
@@ -48,16 +46,20 @@ class CExtractor(TwoStepsExtractor):

        debug = False

-        found = False
+        self.found = False
        links = soup.find_all("a")
        for link in links:
            if link.get("href").startswith('https://www.facebook.com/events/'):
                self.add_event_url(link.get('href').split('?')[0])
-                found = True
+                self.found = True

-        found = self.find_in_js(soup) or found
+        self.has_page_items = False
+        self.find_in_js(soup)

-        if not found and debug:
+        if not self.has_page_items:
+            raise Exception(_("the page was not yet populated with events, so the loading time was probably too short"))
+
+        if not self.found and debug:
            directory = "errors/"
            if not os.path.exists(directory):
                os.makedirs(directory)
--- a/src/agenda_culturel/locale/fr/LC_MESSAGES/django.po
+++ b/src/agenda_culturel/locale/fr/LC_MESSAGES/django.po
@@ -2,13 +2,13 @@
 # Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
 # This file is distributed under the same license as the PACKAGE package.
 # Jean-Marie Favreau <jeanmarie.favreau@free.fr>, 2023.
-#
+# 
 #, fuzzy
 msgid ""
 msgstr ""
 "Project-Id-Version: agenda_culturel\n"
 "Report-Msgid-Bugs-To: \n"
-"POT-Creation-Date: 2025-02-07 10:17+0100\n"
+"POT-Creation-Date: 2025-02-07 11:55+0100\n"
 "PO-Revision-Date: 2023-10-29 14:16+0000\n"
 "Last-Translator: Jean-Marie Favreau <jeanmarie.favreau@free.fr>\n"
 "Language-Team: Jean-Marie Favreau <jeanmarie.favreau@free.fr>\n"
@@ -352,7 +352,15 @@ msgstr "Informations"
 msgid "Add a comment"
 msgstr "Ajouter un commentaire"

-#: agenda_culturel/import_tasks/custom_extractors/fbevents.py:99
+#: agenda_culturel/import_tasks/custom_extractors/fbevents.py:60
+msgid ""
+"the page was not yet populated with events, so the loading time was probably "
+"too short"
+msgstr ""
+"la page n'était pas encore peuplée des événements, le temps de chargement a "
+"sans doute été trop court"
+
+#: agenda_culturel/import_tasks/custom_extractors/fbevents.py:101
 msgid "Cannot get Facebook event from {}"
 msgstr "Impossible de récupérer un événement Facebook depuis {}"