Ajout de l'extraction des événements d'une page facebook

This commit is contained in:
Jean-Marie Favreau 2024-08-28 23:50:29 +02:00
parent cbb34190cf
commit 81601ec5da
7 changed files with 301 additions and 187 deletions

View File

@ -0,0 +1,43 @@
#!/usr/bin/python3
# coding: utf-8
import os
import json
import sys
# Get the name of the directory
# where this file is located.
current = os.path.dirname(os.path.realpath(__file__))
# Getting the parent directory name
# where the current directory is present.
parent = os.path.dirname(current)
# adding the parent directory to
# the sys.path.
sys.path.append(parent)
from src.agenda_culturel.import_tasks.downloader import *
from src.agenda_culturel.import_tasks.extractor import *
from src.agenda_culturel.import_tasks.importer import *
from src.agenda_culturel.import_tasks.custom_extractors import *
if __name__ == "__main__":
    # Manual test: extract the events listed on one Facebook page and dump
    # them to a JSON file in the current working directory.
    converter = URL2Events(ChromiumHeadlessDownloader(), fbevents.CExtractor())

    # The same address is passed both as the URL to process and as the
    # human-facing URL.
    page_url = "https://www.facebook.com/laJeteeClermont/events"
    output_path = "events-lajetee-fb.json"

    try:
        extracted = converter.process(
            page_url,
            page_url,
            cache="cache-lajetee-fb.html",
            default_values={"location": "La Jetée"},
            published=True,
        )

        print(f"Saving events to file {output_path}")
        with open(output_path, "w") as out:
            # default=str stringifies any value json cannot serialize natively.
            json.dump(extracted, out, indent=4, default=str)
    except Exception as err:
        # Top-level boundary of a throwaway script: report and exit normally.
        print(f"Exception: {err}")

View File

@ -124,6 +124,8 @@ def run_recurrent_import(self, pk):
extractor = lapucealoreille.CExtractor()
elif rimport.processor == RecurrentImport.PROCESSOR.MECWORDPRESS:
extractor = wordpress_mec.CExtractor()
elif rimport.processor == RecurrentImport.PROCESSOR.FBEVENTS:
extractor = fbevents.CExtractor()
else:
extractor = None

View File

@ -0,0 +1,48 @@
from ..generic_extractors import *
from ..extractor_facebook import FacebookEvent
import json5
from bs4 import BeautifulSoup
import json
# A class dedicated to get events from a facebook events page
# such as https://www.facebook.com/laJeteeClermont/events
class CExtractor(TwoStepsExtractor):
    """Two-step extractor for the events listing of a Facebook page.

    Step one collects the URLs of the individual events linked from the
    listing page; step two builds one event from the content of each of
    those event pages.
    """

    def build_event_url_list(self, content):
        """Register every Facebook event URL linked from the listing page.

        Args:
            content: HTML of the events listing page.
        """
        soup = BeautifulSoup(content, "html.parser")
        # href=True skips anchors that carry no href attribute; for those,
        # link.get("href") is None and calling .startswith() on it raises.
        for link in soup.find_all("a", href=True):
            href = link["href"]
            if href.startswith("https://www.facebook.com/events/"):
                # Keep only the part of the URL before the query string.
                self.add_event_url(href.split("?")[0])

    def add_event_from_content(
        self,
        event_content,
        event_url,
        url_human=None,
        default_values=None,
        published=False,
    ):
        """Build an event from a single Facebook event page and add it.

        Every ``<script type="application/json">`` element of the page is
        parsed and handed to ``FacebookEvent.find_event_fragment_in_array``,
        which accumulates the fragments into one ``FacebookEvent``. Nothing
        is added when no fragment is found.

        Args:
            event_content: HTML of the event page.
            event_url: URL of the event page, passed to ``build_event``.
            url_human: Not used by this extractor.
            default_values: Optional mapping; only its ``"category"`` entry
                is applied to the event. ``None`` means no defaults.
            published: Value stored under the event's ``"published"`` key.

        Raises:
            json.JSONDecodeError: If one of the JSON scripts is not valid
                JSON.
        """
        fevent = None
        soup = BeautifulSoup(event_content, "html.parser")
        for json_script in soup.find_all("script", type="application/json"):
            json_struct = json.loads(json_script.get_text())
            # The same fevent is threaded through every script so that
            # fragments spread over several scripts end up merged.
            fevent = FacebookEvent.find_event_fragment_in_array(
                json_struct, fevent
            )

        if fevent is None:
            return

        event = fevent.build_event(event_url)
        event["published"] = published

        # Debug output of the neighbouring events found on the page.
        # NOTE(review): neighbor_events is only assigned when a
        # "comet_neighboring_siblings" fragment is seen; presumably it is
        # None otherwise, hence the fallback to an empty list -- confirm.
        print([e.elements for e in (fevent.neighbor_events or [])])

        # default_values defaults to None, so test it before the lookup.
        if default_values and "category" in default_values:
            event["category"] = default_values["category"]
        self.add_event(**event)

View File

@ -5,9 +5,9 @@ import os
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import *
from abc import ABC, abstractmethod
class Downloader(ABC):
def __init__(self):
pass
@ -90,19 +90,20 @@ class ChromiumHeadlessDownloader(Downloader):
self.driver.get(url)
doc = self.driver.page_source
except exceptions.StaleElementReferenceException as e:
except StaleElementReferenceException as e:
print(f">> {type(e).__name__}: {e.args}")
return None
except exceptions.NoSuchElementException as e:
except NoSuchElementException as e:
print(f">> {type(e).__name__}: {e.args}")
return None
except exceptions.TimeoutException as e:
except TimeoutException as e:
print(f">> {type(e).__name__}: {e.args}")
return None
except exceptions.WebDriverException as e:
except WebDriverException as e:
print(f">> {type(e).__name__}: {e.args}")
return None
except exceptions.SessionNotCreatedException as e:
except SessionNotCreatedException as e:
print(f">> {type(e).__name__}: {e.args}")
return None
except Exception as e:

View File

@ -9,9 +9,7 @@ import logging
logger = logging.getLogger(__name__)
class FacebookEventExtractor(Extractor):
class SimpleFacebookEvent:
class SimpleFacebookEvent:
def __init__(self, data):
self.elements = {}
@ -19,11 +17,11 @@ class FacebookEventExtractor(Extractor):
self.elements[key] = data[key] if key in data else None
if "parent_event" in data:
self.parent = FacebookEventExtractor.SimpleFacebookEvent(
self.parent = SimpleFacebookEvent(
data["parent_event"]
)
class FacebookEvent:
class FacebookEvent:
name = "event"
keys = [
[
@ -81,17 +79,17 @@ class FacebookEventExtractor(Extractor):
def add_fragment(self, i, event):
self.fragments[i] = event
if FacebookEventExtractor.FacebookEvent.keys[i] == [
if FacebookEvent.keys[i] == [
"start_timestamp",
"end_timestamp",
]:
self.get_possible_end_timestamp(i, event)
else:
for k in FacebookEventExtractor.FacebookEvent.keys[i]:
for k in FacebookEvent.keys[i]:
if k == "comet_neighboring_siblings":
self.get_neighbor_events(event[k])
elif k in FacebookEventExtractor.FacebookEvent.rules:
for nk, rule in FacebookEventExtractor.FacebookEvent.rules[
elif k in FacebookEvent.rules:
for nk, rule in FacebookEvent.rules[
k
].items():
error = False
@ -108,12 +106,12 @@ class FacebookEventExtractor(Extractor):
def get_possible_end_timestamp(self, i, data):
self.possible_end_timestamp.append(
dict((k, data[k]) for k in FacebookEventExtractor.FacebookEvent.keys[i])
dict((k, data[k]) for k in FacebookEvent.keys[i])
)
def get_neighbor_events(self, data):
self.neighbor_events = [
FacebookEventExtractor.SimpleFacebookEvent(d) for d in data
SimpleFacebookEvent(d) for d in data
]
def __str__(self):
@ -153,24 +151,24 @@ class FacebookEventExtractor(Extractor):
def find_event_fragment_in_array(array, event, first=True):
if isinstance(array, dict):
seen = False
for i, ks in enumerate(FacebookEventExtractor.FacebookEvent.keys):
for i, ks in enumerate(FacebookEvent.keys):
# DEBUG: print([k for k in ks if k in array], "il manque", [k for k in ks if k not in array])
if len(ks) == len([k for k in ks if k in array]):
seen = True
if event is None:
event = FacebookEventExtractor.FacebookEvent(i, array)
event = FacebookEvent(i, array)
else:
event.add_fragment(i, array)
# only consider the first of FacebookEvent.keys
break
if not seen:
for k in array:
event = FacebookEventExtractor.FacebookEvent.find_event_fragment_in_array(
event = FacebookEvent.find_event_fragment_in_array(
array[k], event, False
)
elif isinstance(array, list):
for e in array:
event = FacebookEventExtractor.FacebookEvent.find_event_fragment_in_array(
event = FacebookEvent.find_event_fragment_in_array(
e, event, False
)
@ -197,6 +195,9 @@ class FacebookEventExtractor(Extractor):
"image_alt": self.get_element("image"),
}
class FacebookEventExtractor(Extractor):
def __init__(self, single_event=False):
self.single_event = single_event
super().__init__()
@ -223,7 +224,7 @@ class FacebookEventExtractor(Extractor):
for json_script in soup.find_all("script", type="application/json"):
json_txt = json_script.get_text()
json_struct = json.loads(json_txt)
fevent = FacebookEventExtractor.FacebookEvent.find_event_fragment_in_array(
fevent = FacebookEvent.find_event_fragment_in_array(
json_struct, fevent
)

View File

@ -0,0 +1,18 @@
# Generated by Django 4.2.7 on 2024-08-28 21:42
from django.db import migrations, models
# (stored value, human-readable label) pairs declared for the ``processor``
# field of ``recurrentimport`` by this migration.
_PROCESSOR_CHOICES = [
    ('ical', 'ical'),
    ('icalnobusy', 'ical no busy'),
    ('icalnovc', 'ical no VC'),
    ('lacoope', 'lacoope.org'),
    ('lacomedie', 'la comédie'),
    ('lefotomat', 'le fotomat'),
    ('lapucealoreille', "la puce à l'oreille"),
    ('Plugin wordpress MEC', 'Plugin wordpress MEC'),
    ('Facebook events', "Événements d'une page"),
]


class Migration(migrations.Migration):
    """Redeclare ``recurrentimport.processor`` with its list of choices."""

    dependencies = [
        ('agenda_culturel', '0067_categorisationrule_place'),
    ]

    operations = [
        migrations.AlterField(
            model_name='recurrentimport',
            name='processor',
            field=models.CharField(
                choices=_PROCESSOR_CHOICES,
                default='ical',
                max_length=20,
                verbose_name='Processor',
            ),
        ),
    ]

View File

@ -1237,8 +1237,9 @@ class RecurrentImport(models.Model):
LACOOPE = "lacoope", _("lacoope.org")
LACOMEDIE = "lacomedie", _("la comédie")
LEFOTOMAT = "lefotomat", _("le fotomat")
LAPUCEALOREILLE = "lapucealoreille", _("la puce à l" "oreille")
LAPUCEALOREILLE = "lapucealoreille", _("la puce à l'oreille")
MECWORDPRESS = "Plugin wordpress MEC", _("Plugin wordpress MEC")
FBEVENTS = "Facebook events", _("Événements d'une page")
class DOWNLOADER(models.TextChoices):
SIMPLE = "simple", _("simple")