From 81601ec5daec2980babf369d0bee2e2bbba3a130 Mon Sep 17 00:00:00 2001 From: Jean-Marie Favreau Date: Wed, 28 Aug 2024 23:50:29 +0200 Subject: [PATCH] =?UTF-8?q?Ajout=20de=20l'extraction=20des=20=C3=A9v=C3=A9?= =?UTF-8?q?nements=20d'une=20page=20facebook?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- experimentations/get_facebook_events.py | 43 +++ src/agenda_culturel/celery.py | 2 + .../custom_extractors/fbevents.py | 48 +++ .../import_tasks/downloader.py | 13 +- .../import_tasks/extractor_facebook.py | 361 +++++++++--------- .../0068_alter_recurrentimport_processor.py | 18 + src/agenda_culturel/models.py | 3 +- 7 files changed, 301 insertions(+), 187 deletions(-) create mode 100755 experimentations/get_facebook_events.py create mode 100644 src/agenda_culturel/import_tasks/custom_extractors/fbevents.py create mode 100644 src/agenda_culturel/migrations/0068_alter_recurrentimport_processor.py diff --git a/experimentations/get_facebook_events.py b/experimentations/get_facebook_events.py new file mode 100755 index 0000000..9d016aa --- /dev/null +++ b/experimentations/get_facebook_events.py @@ -0,0 +1,43 @@ +#!/usr/bin/python3 +# coding: utf-8 + +import os +import json +import sys + +# getting the name of the directory +# where the this file is present. +current = os.path.dirname(os.path.realpath(__file__)) + +# Getting the parent directory name +# where the current directory is present. +parent = os.path.dirname(current) + +# adding the parent directory to +# the sys.path. +sys.path.append(parent) + +from src.agenda_culturel.import_tasks.downloader import * +from src.agenda_culturel.import_tasks.extractor import * +from src.agenda_culturel.import_tasks.importer import * +from src.agenda_culturel.import_tasks.custom_extractors import * + + + + + +if __name__ == "__main__": + + u2e = URL2Events(ChromiumHeadlessDownloader(), fbevents.CExtractor()) + url = "https://www.facebook.com/laJeteeClermont/events" + url_human = "https://www.facebook.com/laJeteeClermont/events" + + try: + events = u2e.process(url, url_human, cache = "cache-lajetee-fb.html", default_values = {"location": "La Jetée"}, published = True) + + exportfile = "events-lajetee-fb.json" + print("Saving events to file {}".format(exportfile)) + with open(exportfile, "w") as f: + json.dump(events, f, indent=4, default=str) + except Exception as e: + print("Exception: " + str(e)) diff --git a/src/agenda_culturel/celery.py b/src/agenda_culturel/celery.py index 59a0c6f..1ed40ca 100644 --- a/src/agenda_culturel/celery.py +++ b/src/agenda_culturel/celery.py @@ -124,6 +124,8 @@ def run_recurrent_import(self, pk): extractor = lapucealoreille.CExtractor() elif rimport.processor == RecurrentImport.PROCESSOR.MECWORDPRESS: extractor = wordpress_mec.CExtractor() + elif rimport.processor == RecurrentImport.PROCESSOR.FBEVENTS: + extractor = fbevents.CExtractor() else: extractor = None diff --git a/src/agenda_culturel/import_tasks/custom_extractors/fbevents.py b/src/agenda_culturel/import_tasks/custom_extractors/fbevents.py new file mode 100644 index 0000000..6a4cbaf --- /dev/null +++ b/src/agenda_culturel/import_tasks/custom_extractors/fbevents.py @@ -0,0 +1,48 @@ +from ..generic_extractors import * +from ..extractor_facebook import FacebookEvent +import json5 +from bs4 import BeautifulSoup +import json + + +# A class dedicated to get events from a facebook events page +# such as https://www.facebook.com/laJeteeClermont/events +class CExtractor(TwoStepsExtractor): + + + def build_event_url_list(self, content): + soup = BeautifulSoup(content, "html.parser") + + links = soup.find_all("a") + for link in links: + if link.get("href").startswith('https://www.facebook.com/events/'): + self.add_event_url(link.get('href').split('?')[0]) + + + def add_event_from_content( + self, + event_content, + event_url, + url_human=None, + default_values=None, + published=False, + ): + + fevent = None + soup = BeautifulSoup(event_content, "html.parser") + for json_script in soup.find_all("script", type="application/json"): + json_txt = json_script.get_text() + json_struct = json.loads(json_txt) + fevent = FacebookEvent.find_event_fragment_in_array( + json_struct, fevent + ) + + if fevent is not None: + event = fevent.build_event(event_url) + event["published"] = published + print([e.elements for e in fevent.neighbor_events]) + + if "category" in default_values: + event["category"] = default_values["category"] + self.add_event(**event) + diff --git a/src/agenda_culturel/import_tasks/downloader.py b/src/agenda_culturel/import_tasks/downloader.py index a2f74a5..306f67f 100644 --- a/src/agenda_culturel/import_tasks/downloader.py +++ b/src/agenda_culturel/import_tasks/downloader.py @@ -5,9 +5,9 @@ import os from selenium import webdriver from selenium.webdriver.chrome.service import Service from selenium.webdriver.chrome.options import Options +from selenium.common.exceptions import * from abc import ABC, abstractmethod - class Downloader(ABC): def __init__(self): pass @@ -90,19 +90,20 @@ class ChromiumHeadlessDownloader(Downloader): self.driver.get(url) doc = self.driver.page_source - except exceptions.StaleElementReferenceException as e: + + except StaleElementReferenceException as e: print(f">> {type(e).__name__}: {e.args}") return None - except exceptions.NoSuchElementException as e: + except NoSuchElementException as e: print(f">> {type(e).__name__}: {e.args}") return None - except exceptions.TimeoutException as e: + except TimeoutException as e: print(f">> {type(e).__name__}: {e.args}") return None - except exceptions.WebDriverException as e: + except WebDriverException as e: print(f">> {type(e).__name__}: {e.args}") return None - except exceptions.SessionNotCreatedException as e: + except SessionNotCreatedException as e: print(f">> {type(e).__name__}: {e.args}") return None except Exception as e: diff --git a/src/agenda_culturel/import_tasks/extractor_facebook.py b/src/agenda_culturel/import_tasks/extractor_facebook.py index d394160..7f5f374 100644 --- a/src/agenda_culturel/import_tasks/extractor_facebook.py +++ b/src/agenda_culturel/import_tasks/extractor_facebook.py @@ -9,193 +9,194 @@ import logging logger = logging.getLogger(__name__) +class SimpleFacebookEvent: + def __init__(self, data): + self.elements = {} -class FacebookEventExtractor(Extractor): - class SimpleFacebookEvent: - def __init__(self, data): - self.elements = {} + for key in ["id", "start_timestamp", "end_timestamp"]: + self.elements[key] = data[key] if key in data else None - for key in ["id", "start_timestamp", "end_timestamp"]: - self.elements[key] = data[key] if key in data else None + if "parent_event" in data: + self.parent = SimpleFacebookEvent( + data["parent_event"] + ) - if "parent_event" in data: - self.parent = FacebookEventExtractor.SimpleFacebookEvent( - data["parent_event"] +class FacebookEvent: + name = "event" + keys = [ + [ + "start_time_formatted", + "start_timestamp", + "is_past", + "name", + "price_info", + "cover_media_renderer", + "id", + "day_time_sentence", + "event_place", + "comet_neighboring_siblings", + ], + ["event_description"], + ["start_timestamp", "end_timestamp"], + ] + rules = { + "event_description": {"description": ["text"]}, + "cover_media_renderer": { + "image_alt": ["cover_photo", "photo", "accessibility_caption"], + "image": ["cover_photo", "photo", "full_image", "uri"], + }, + "event_creator": { + "event_creator_name": ["name"], + "event_creator_url": ["url"], + }, + "event_place": {"event_place_name": ["name"]}, + } + + def __init__(self, i, event): + self.fragments = {} + self.elements = {} + self.neighbor_events = None + self.possible_end_timestamp = [] + self.add_fragment(i, event) + + def get_element(self, key): + return self.elements[key] if key in self.elements else None + + def get_element_date(self, key): + v = self.get_element(key) + return ( + datetime.fromtimestamp(v).date() if v is not None and v != 0 else None + ) + + def get_element_time(self, key): + v = self.get_element(key) + return ( + datetime.fromtimestamp(v).strftime("%H:%M") + if v is not None and v != 0 + else None + ) + + def add_fragment(self, i, event): + self.fragments[i] = event + + if FacebookEvent.keys[i] == [ + "start_timestamp", + "end_timestamp", + ]: + self.get_possible_end_timestamp(i, event) + else: + for k in FacebookEvent.keys[i]: + if k == "comet_neighboring_siblings": + self.get_neighbor_events(event[k]) + elif k in FacebookEvent.rules: + for nk, rule in FacebookEvent.rules[ + k + ].items(): + error = False + c = event[k] + for ki in rule: + if c is not None: + c = c[ki] + else: + error = True + if not error: + self.elements[nk] = c + else: + self.elements[k] = event[k] + + def get_possible_end_timestamp(self, i, data): + self.possible_end_timestamp.append( + dict((k, data[k]) for k in FacebookEvent.keys[i]) + ) + + def get_neighbor_events(self, data): + self.neighbor_events = [ + SimpleFacebookEvent(d) for d in data + ] + + def __str__(self): + return ( + str(self.elements) + + "\n Neighbors: " + + ", ".join([ne.elements["id"] for ne in self.neighbor_events]) + ) + + def consolidate_current_event(self): + if ( + self.neighbor_events is not None + and "id" in self.elements + and "end_timestamp" not in self.elements + ): + if self.neighbor_events is not None and "id" in self.elements: + id = self.elements["id"] + for ne in self.neighbor_events: + if ne.elements["id"] == id: + self.elements["end_timestamp"] = ne.elements[ + "end_timestamp" + ] + + if ( + "end_timestamp" not in self.elements + and len(self.possible_end_timestamp) != 0 + ): + for s in self.possible_end_timestamp: + if ( + "start_timestamp" in s + and "start_timestamp" in self.elements + and s["start_timestamp"] == self.elements["start_timestamp"] + ): + self.elements["end_timestamp"] = s["end_timestamp"] + break + + def find_event_fragment_in_array(array, event, first=True): + if isinstance(array, dict): + seen = False + for i, ks in enumerate(FacebookEvent.keys): + # DEBUG: print([k for k in ks if k in array], "il manque", [k for k in ks if k not in array]) + if len(ks) == len([k for k in ks if k in array]): + seen = True + if event is None: + event = FacebookEvent(i, array) + else: + event.add_fragment(i, array) + # only consider the first of FacebookEvent.keys + break + if not seen: + for k in array: + event = FacebookEvent.find_event_fragment_in_array( + array[k], event, False + ) + elif isinstance(array, list): + for e in array: + event = FacebookEvent.find_event_fragment_in_array( + e, event, False ) - class FacebookEvent: - name = "event" - keys = [ - [ - "start_time_formatted", - "start_timestamp", - "is_past", - "name", - "price_info", - "cover_media_renderer", - "id", - "day_time_sentence", - "event_place", - "comet_neighboring_siblings", - ], - ["event_description"], - ["start_timestamp", "end_timestamp"], - ] - rules = { - "event_description": {"description": ["text"]}, - "cover_media_renderer": { - "image_alt": ["cover_photo", "photo", "accessibility_caption"], - "image": ["cover_photo", "photo", "full_image", "uri"], - }, - "event_creator": { - "event_creator_name": ["name"], - "event_creator_url": ["url"], - }, - "event_place": {"event_place_name": ["name"]}, + if event is not None and first: + event.consolidate_current_event() + return event + + def build_event(self, url): + self.get_element("image") + + return { + "title": self.get_element("name"), + "category": None, + "start_day": self.get_element_date("start_timestamp"), + "location": self.get_element("event_place_name"), + "description": self.get_element("description"), + "tags": [], + "uuids": [url], + "url_human": url, + "start_time": self.get_element_time("start_timestamp"), + "end_day": self.get_element_date("end_timestamp"), + "end_time": self.get_element_time("end_timestamp"), + "image": self.get_element("image"), + "image_alt": self.get_element("image"), } - def __init__(self, i, event): - self.fragments = {} - self.elements = {} - self.neighbor_events = None - self.possible_end_timestamp = [] - self.add_fragment(i, event) - def get_element(self, key): - return self.elements[key] if key in self.elements else None - - def get_element_date(self, key): - v = self.get_element(key) - return ( - datetime.fromtimestamp(v).date() if v is not None and v != 0 else None - ) - - def get_element_time(self, key): - v = self.get_element(key) - return ( - datetime.fromtimestamp(v).strftime("%H:%M") - if v is not None and v != 0 - else None - ) - - def add_fragment(self, i, event): - self.fragments[i] = event - - if FacebookEventExtractor.FacebookEvent.keys[i] == [ - "start_timestamp", - "end_timestamp", - ]: - self.get_possible_end_timestamp(i, event) - else: - for k in FacebookEventExtractor.FacebookEvent.keys[i]: - if k == "comet_neighboring_siblings": - self.get_neighbor_events(event[k]) - elif k in FacebookEventExtractor.FacebookEvent.rules: - for nk, rule in FacebookEventExtractor.FacebookEvent.rules[ - k - ].items(): - error = False - c = event[k] - for ki in rule: - if c is not None: - c = c[ki] - else: - error = True - if not error: - self.elements[nk] = c - else: - self.elements[k] = event[k] - - def get_possible_end_timestamp(self, i, data): - self.possible_end_timestamp.append( - dict((k, data[k]) for k in FacebookEventExtractor.FacebookEvent.keys[i]) - ) - - def get_neighbor_events(self, data): - self.neighbor_events = [ - FacebookEventExtractor.SimpleFacebookEvent(d) for d in data - ] - - def __str__(self): - return ( - str(self.elements) - + "\n Neighbors: " - + ", ".join([ne.elements["id"] for ne in self.neighbor_events]) - ) - - def consolidate_current_event(self): - if ( - self.neighbor_events is not None - and "id" in self.elements - and "end_timestamp" not in self.elements - ): - if self.neighbor_events is not None and "id" in self.elements: - id = self.elements["id"] - for ne in self.neighbor_events: - if ne.elements["id"] == id: - self.elements["end_timestamp"] = ne.elements[ - "end_timestamp" - ] - - if ( - "end_timestamp" not in self.elements - and len(self.possible_end_timestamp) != 0 - ): - for s in self.possible_end_timestamp: - if ( - "start_timestamp" in s - and "start_timestamp" in self.elements - and s["start_timestamp"] == self.elements["start_timestamp"] - ): - self.elements["end_timestamp"] = s["end_timestamp"] - break - - def find_event_fragment_in_array(array, event, first=True): - if isinstance(array, dict): - seen = False - for i, ks in enumerate(FacebookEventExtractor.FacebookEvent.keys): - # DEBUG: print([k for k in ks if k in array], "il manque", [k for k in ks if k not in array]) - if len(ks) == len([k for k in ks if k in array]): - seen = True - if event is None: - event = FacebookEventExtractor.FacebookEvent(i, array) - else: - event.add_fragment(i, array) - # only consider the first of FacebookEvent.keys - break - if not seen: - for k in array: - event = FacebookEventExtractor.FacebookEvent.find_event_fragment_in_array( - array[k], event, False - ) - elif isinstance(array, list): - for e in array: - event = FacebookEventExtractor.FacebookEvent.find_event_fragment_in_array( - e, event, False - ) - - if event is not None and first: - event.consolidate_current_event() - return event - - def build_event(self, url): - self.get_element("image") - - return { - "title": self.get_element("name"), - "category": None, - "start_day": self.get_element_date("start_timestamp"), - "location": self.get_element("event_place_name"), - "description": self.get_element("description"), - "tags": [], - "uuids": [url], - "url_human": url, - "start_time": self.get_element_time("start_timestamp"), - "end_day": self.get_element_date("end_timestamp"), - "end_time": self.get_element_time("end_timestamp"), - "image": self.get_element("image"), - "image_alt": self.get_element("image"), - } +class FacebookEventExtractor(Extractor): def __init__(self, single_event=False): self.single_event = single_event @@ -223,7 +224,7 @@ class FacebookEventExtractor(Extractor): for json_script in soup.find_all("script", type="application/json"): json_txt = json_script.get_text() json_struct = json.loads(json_txt) - fevent = FacebookEventExtractor.FacebookEvent.find_event_fragment_in_array( + fevent = FacebookEvent.find_event_fragment_in_array( json_struct, fevent ) diff --git a/src/agenda_culturel/migrations/0068_alter_recurrentimport_processor.py b/src/agenda_culturel/migrations/0068_alter_recurrentimport_processor.py new file mode 100644 index 0000000..64054f3 --- /dev/null +++ b/src/agenda_culturel/migrations/0068_alter_recurrentimport_processor.py @@ -0,0 +1,18 @@ +# Generated by Django 4.2.7 on 2024-08-28 21:42 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('agenda_culturel', '0067_categorisationrule_place'), + ] + + operations = [ + migrations.AlterField( + model_name='recurrentimport', + name='processor', + field=models.CharField(choices=[('ical', 'ical'), ('icalnobusy', 'ical no busy'), ('icalnovc', 'ical no VC'), ('lacoope', 'lacoope.org'), ('lacomedie', 'la comédie'), ('lefotomat', 'le fotomat'), ('lapucealoreille', "la puce à l'oreille"), ('Plugin wordpress MEC', 'Plugin wordpress MEC'), ('Facebook events', "Événements d'une page")], default='ical', max_length=20, verbose_name='Processor'), + ), + ] diff --git a/src/agenda_culturel/models.py b/src/agenda_culturel/models.py index be59f39..be9500b 100644 --- a/src/agenda_culturel/models.py +++ b/src/agenda_culturel/models.py @@ -1237,8 +1237,9 @@ class RecurrentImport(models.Model): LACOOPE = "lacoope", _("lacoope.org") LACOMEDIE = "lacomedie", _("la comédie") LEFOTOMAT = "lefotomat", _("le fotomat") - LAPUCEALOREILLE = "lapucealoreille", _("la puce à l" "oreille") + LAPUCEALOREILLE = "lapucealoreille", _("la puce à l'oreille") MECWORDPRESS = "Plugin wordpress MEC", _("Plugin wordpress MEC") + FBEVENTS = "Facebook events", _("Événements d'une page") class DOWNLOADER(models.TextChoices): SIMPLE = "simple", _("simple")