Ajout d'un détecteur de pages pas complètement chargées

This commit is contained in:
Jean-Marie Favreau 2025-02-07 11:55:32 +01:00
parent bb2a6b04e5
commit 7648d1d794
2 changed files with 27 additions and 17 deletions

View File

@ -17,30 +17,28 @@ logger = logging.getLogger(__name__)
# such as https://www.facebook.com/laJeteeClermont/events # such as https://www.facebook.com/laJeteeClermont/events
class CExtractor(TwoStepsExtractor): class CExtractor(TwoStepsExtractor):
def find_event_id_fragment_in_array(self, array, first=True): def find_event_id_fragment_in_array(self, array):
found = False
if isinstance(array, dict): if isinstance(array, dict):
if "__typename" in array and array["__typename"] == "Event" and "id" in array: if "__typename" in array and array["__typename"] == "Event" and "id" in array:
self.add_event_url("https://www.facebook.com/events/" + array["id"] + "/") self.add_event_url("https://www.facebook.com/events/" + array["id"] + "/")
found = True self.found = True
if not found: else:
for k in array: for k in array:
found = self.find_event_id_fragment_in_array(array[k], False) or found if k == "pageItems":
self.has_page_items = True
self.find_event_id_fragment_in_array(array[k])
elif isinstance(array, list): elif isinstance(array, list):
for e in array: for e in array:
found = self.find_event_id_fragment_in_array(e, False) or found self.find_event_id_fragment_in_array(e)
return found
def find_in_js(self, soup): def find_in_js(self, soup):
found = False
for json_script in soup.find_all("script", type="application/json"): for json_script in soup.find_all("script", type="application/json"):
json_txt = json_script.get_text() json_txt = json_script.get_text()
json_struct = json.loads(json_txt) json_struct = json.loads(json_txt)
found = self.find_event_id_fragment_in_array(json_struct) or found self.find_event_id_fragment_in_array(json_struct)
return found
def build_event_url_list(self, content): def build_event_url_list(self, content):
@ -48,16 +46,20 @@ class CExtractor(TwoStepsExtractor):
debug = False debug = False
found = False self.found = False
links = soup.find_all("a") links = soup.find_all("a")
for link in links: for link in links:
if link.get("href").startswith('https://www.facebook.com/events/'): if link.get("href").startswith('https://www.facebook.com/events/'):
self.add_event_url(link.get('href').split('?')[0]) self.add_event_url(link.get('href').split('?')[0])
found = True self.found = True
found = self.find_in_js(soup) or found self.has_page_items = False
self.find_in_js(soup)
if not found and debug: if not self.has_page_items:
raise Exception(_("the page was not yet populated with events, so the loading time was probably too short"))
if not self.found and debug:
directory = "errors/" directory = "errors/"
if not os.path.exists(directory): if not os.path.exists(directory):
os.makedirs(directory) os.makedirs(directory)

View File

@ -8,7 +8,7 @@ msgid ""
msgstr "" msgstr ""
"Project-Id-Version: agenda_culturel\n" "Project-Id-Version: agenda_culturel\n"
"Report-Msgid-Bugs-To: \n" "Report-Msgid-Bugs-To: \n"
"POT-Creation-Date: 2025-02-07 10:17+0100\n" "POT-Creation-Date: 2025-02-07 11:55+0100\n"
"PO-Revision-Date: 2023-10-29 14:16+0000\n" "PO-Revision-Date: 2023-10-29 14:16+0000\n"
"Last-Translator: Jean-Marie Favreau <jeanmarie.favreau@free.fr>\n" "Last-Translator: Jean-Marie Favreau <jeanmarie.favreau@free.fr>\n"
"Language-Team: Jean-Marie Favreau <jeanmarie.favreau@free.fr>\n" "Language-Team: Jean-Marie Favreau <jeanmarie.favreau@free.fr>\n"
@ -352,7 +352,15 @@ msgstr "Informations"
msgid "Add a comment" msgid "Add a comment"
msgstr "Ajouter un commentaire" msgstr "Ajouter un commentaire"
#: agenda_culturel/import_tasks/custom_extractors/fbevents.py:99 #: agenda_culturel/import_tasks/custom_extractors/fbevents.py:60
msgid ""
"the page was not yet populated with events, so the loading time was probably "
"too short"
msgstr ""
"la page n'était pas encore peuplée des événements, le temps de chargement a "
"sans doute été trop court"
#: agenda_culturel/import_tasks/custom_extractors/fbevents.py:101
msgid "Cannot get Facebook event from {}" msgid "Cannot get Facebook event from {}"
msgstr "Impossible de récupérer un événement Facebook depuis {}" msgstr "Impossible de récupérer un événement Facebook depuis {}"