Restructuration des fichiers d'import

This commit is contained in:
Jean-Marie Favreau 2025-03-01 15:24:35 +01:00
parent 571a6775c4
commit 3a78972391
32 changed files with 206 additions and 198 deletions

View File

@ -43,7 +43,7 @@ On peut activer à la main (pour l'instant) un proxy type socket pour l'import d
### Ajout d'une nouvelle source *custom*
Pour ajouter une nouvelle source custom:
- ajouter un fichier dans ```src/agenda_culturel/import_tasks/custom_extractors``` en s'inspirant des autres fichiers présents. Il existe de nombreuses facilités dans les classes mères correspondantes
- ajouter un fichier dans ```src/agenda_culturel/import_tasks/custom_extractors``` (ou ```src/agenda_culturel/import_tasks/generic_extractors``` s'il s'agit d'un format de source qui est réutilisable) en s'inspirant des autres fichiers présents. Il existe de nombreuses facilités dans les classes mères correspondantes
- s'inspirer des scripts présents dans ```experimentations/``` pour créer son propre script de test
- quand l'import fonctionne de manière indépendante dans ces expérimentations, il est temps de l'ajouter au site internet:
- ajouter à la classe ```RecurrentImport.PROCESSOR``` présente dans le fichier ```src/agenda_culturel/models.py``` une entrée correspondant à cette source pour qu'elle soit proposée aux utilisateurs

View File

@ -21,7 +21,7 @@ sys.path.append(parent + "/src")
from src.agenda_culturel.import_tasks.downloader import *
from src.agenda_culturel.import_tasks.extractor import *
from src.agenda_culturel.import_tasks.importer import *
from src.agenda_culturel.import_tasks.extractor_facebook import *
from src.agenda_culturel.import_tasks.generic_extractors.fbevent import *
@ -29,7 +29,7 @@ from src.agenda_culturel.import_tasks.extractor_facebook import *
if __name__ == "__main__":
u2e = URL2Events(ChromiumHeadlessDownloader(), FacebookEventExtractor())
u2e = URL2Events(ChromiumHeadlessDownloader(), CExtractor())
url="https://www.facebook.com/events/3575802569389184/3575802576055850/?active_tab=about"
events = u2e.process(url, cache = "fb.html", published = True)

View File

@ -21,7 +21,7 @@ sys.path.append(parent + "/src")
from src.agenda_culturel.import_tasks.downloader import *
from src.agenda_culturel.import_tasks.extractor import *
from src.agenda_culturel.import_tasks.importer import *
from src.agenda_culturel.import_tasks.custom_extractors import *
from src.agenda_culturel.import_tasks.generic_extractors import *

View File

@ -21,7 +21,7 @@ sys.path.append(parent + "/src")
from src.agenda_culturel.import_tasks.downloader import *
from src.agenda_culturel.import_tasks.extractor import *
from src.agenda_culturel.import_tasks.importer import *
from src.agenda_culturel.import_tasks.extractor_ical import *
from src.agenda_culturel.import_tasks.generic_extractors.ical import *

View File

@ -21,7 +21,7 @@ sys.path.append(parent + "/src")
from src.agenda_culturel.import_tasks.downloader import *
from src.agenda_culturel.import_tasks.extractor import *
from src.agenda_culturel.import_tasks.importer import *
from src.agenda_culturel.import_tasks.custom_extractors import *
from src.agenda_culturel.import_tasks.generic_extractors import *

View File

@ -21,7 +21,7 @@ sys.path.append(parent + "/src")
from src.agenda_culturel.import_tasks.downloader import *
from src.agenda_culturel.import_tasks.extractor import *
from src.agenda_culturel.import_tasks.importer import *
from src.agenda_culturel.import_tasks.custom_extractors import *
from src.agenda_culturel.import_tasks.generic_extractors import *

2
experimentations/get_mobilizon.py Normal file → Executable file
View File

@ -21,7 +21,7 @@ sys.path.append(parent + "/src")
from src.agenda_culturel.import_tasks.downloader import *
from src.agenda_culturel.import_tasks.extractor import *
from src.agenda_culturel.import_tasks.importer import *
from src.agenda_culturel.import_tasks.custom_extractors import *
from src.agenda_culturel.import_tasks.generic_extractors import *

View File

@ -21,7 +21,7 @@ sys.path.append(parent + "/src")
from src.agenda_culturel.import_tasks.downloader import *
from src.agenda_culturel.import_tasks.extractor import *
from src.agenda_culturel.import_tasks.importer import *
from src.agenda_culturel.import_tasks.custom_extractors import *
from src.agenda_culturel.import_tasks.generic_extractors import *

View File

@ -14,8 +14,8 @@ from contextlib import contextmanager
from .import_tasks.downloader import *
from .import_tasks.extractor import *
from .import_tasks.importer import *
from .import_tasks.extractor_ical import *
from .import_tasks.custom_extractors import *
from .import_tasks.generic_extractors import *
# Set the default Django settings module for the 'celery' program.
@ -140,13 +140,13 @@ def run_recurrent_import_internal(rimport, downloader, req_id):
if rimport.processor == RecurrentImport.PROCESSOR.ICAL:
extractor = ICALExtractor()
extractor = ical.ICALExtractor()
elif rimport.processor == RecurrentImport.PROCESSOR.ICALNOBUSY:
extractor = ICALNoBusyExtractor()
extractor = ical.ICALNoBusyExtractor()
elif rimport.processor == RecurrentImport.PROCESSOR.ICALNOVC:
extractor = ICALNoVCExtractor()
extractor = ical.ICALNoVCExtractor()
elif rimport.processor == RecurrentImport.PROCESSOR.ICALNAIVETZ:
extractor = ICALNaiveTimezone()
extractor = ical.ICALNaiveTimezone()
elif rimport.processor == RecurrentImport.PROCESSOR.LACOOPE:
extractor = lacoope.CExtractor()
elif rimport.processor == RecurrentImport.PROCESSOR.LACOMEDIE:

View File

@ -1,4 +1,4 @@
from ..generic_extractors import *
from ..twosteps_extractor import *
from bs4 import BeautifulSoup
# A class dedicated to get events from Arachnée Concert

View File

@ -1,4 +1,4 @@
from ..generic_extractors import *
from ..twosteps_extractor import *
from bs4 import BeautifulSoup
from datetime import timedelta

View File

@ -1,4 +1,4 @@
from ..generic_extractors import *
from ..twosteps_extractor import *
import json5
from bs4 import BeautifulSoup

View File

@ -1,4 +1,5 @@
from ..generic_extractors import *
from ..twosteps_extractor import *
from ..generic_extractors.ggcal_link import GGCalendar
import re
import json5
from bs4 import BeautifulSoup

View File

@ -1,4 +1,4 @@
from ..generic_extractors import *
from ..twosteps_extractor import *
import re
from bs4 import BeautifulSoup

View File

@ -1,4 +1,4 @@
from ..generic_extractors import *
from ..twosteps_extractor import *
from bs4 import BeautifulSoup
from datetime import datetime

View File

@ -1,4 +1,4 @@
from ..generic_extractors import *
from ..twosteps_extractor import *
from bs4 import BeautifulSoup
# A class dedicated to get events from Le Fotomat'

View File

@ -1,4 +1,4 @@
from ..generic_extractors import *
from ..twosteps_extractor import *
from bs4 import BeautifulSoup
from datetime import datetime

View File

@ -1,4 +1,4 @@
from ..generic_extractors import *
from ..twosteps_extractor import *
from bs4 import BeautifulSoup
from datetime import datetime, date

View File

@ -284,8 +284,8 @@ class Extractor(ABC):
return {"header": self.header, "events": self.events}
def clean_url(url):
from .extractor_ical import ICALExtractor
from .extractor_facebook import FacebookEventExtractor
from .generic_extractors.ical import ICALExtractor
from .generic_extractors.fbevent import CExtractor as FacebookEventExtractor
result = url
for e in [ICALExtractor, FacebookEventExtractor]:
@ -293,9 +293,9 @@ class Extractor(ABC):
return result
def get_default_extractors(single_event=False):
from .extractor_ical import ICALExtractor
from .extractor_facebook import FacebookEventExtractor
from .extractor_ggcal_link import GoogleCalendarLinkEventExtractor
from .generic_extractors.ical import ICALExtractor
from .generic_extractors.fbevent import CExtractor as FacebookEventExtractor
from .generic_extractors.ggcal_link import CExtractor as GoogleCalendarLinkEventExtractor
if single_event:
return [FacebookEventExtractor(), GoogleCalendarLinkEventExtractor(), EventNotFoundExtractor()]

View File

@ -1,88 +0,0 @@
from datetime import datetime
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from .extractor import *
from .generic_extractors import *
import json
import logging
logger = logging.getLogger(__name__)
class GoogleCalendarLinkEventExtractor(Extractor):
def __init__(self):
super().__init__()
self.possible_urls = ["https://calendar.google.com/calendar/", "https://addtocalendar.com/", "https://www.google.com/calendar/event"]
def guess_image(self, soup, url):
image = soup.find("meta", property="og:image")
if image is None:
for img in soup.select('img'):
if img.find_parent(name='nav'):
continue
image = img["src"]
break
else:
image = image["content"]
if image.startswith("/"):
root_url = "https://" + urlparse(url).netloc + "/"
image = root_url + image
return image
def extract(
self, content, url, url_human=None, default_values=None, published=False
):
soup = BeautifulSoup(content, "html.parser")
for ggu in self.possible_urls:
link_calendar = soup.select('a[href^="' + ggu + '"]')
if len(link_calendar) != 0:
gg_cal = GGCalendar(link_calendar[0]["href"])
if gg_cal.is_valid_event():
start_day = gg_cal.start_day
start_time = gg_cal.start_time
description = gg_cal.description.replace(' ', '')
end_day = gg_cal.end_day
end_time = gg_cal.end_time
location = gg_cal.location
title = gg_cal.title
url_human = url
self.set_header(url)
image = self.guess_image(soup, url)
category = None
self.add_event(
default_values,
title=title,
category=category,
start_day=start_day,
location=location,
description=description,
tags=[],
uuids=[url],
recurrences=None,
url_human=url_human,
start_time=start_time,
end_day=end_day,
end_time=end_time,
published=published,
image=image,
)
break
return self.get_structure()

View File

@ -0,0 +1,7 @@
from os.path import dirname, basename, isfile, join
import glob
modules = glob.glob(join(dirname(__file__), "*.py"))
__all__ = [
basename(f)[:-3] for f in modules if isfile(f) and not f.endswith("__init__.py")
]

View File

@ -1,4 +1,4 @@
from ..generic_extractors import *
from ..twosteps_extractor import *
from bs4 import BeautifulSoup
from datetime import datetime

View File

@ -5,7 +5,7 @@ import time as t
from django.utils.translation import gettext_lazy as _
from .extractor import *
from ..extractor import *
import json
import logging
@ -231,7 +231,7 @@ class FacebookEvent:
result.append(clone.build_event(url_base + nb_e.elements["id"] + "/"))
return result
class FacebookEventExtractor(Extractor):
class CExtractor(Extractor):
def __init__(self):
super().__init__()
@ -259,11 +259,11 @@ class FacebookEventExtractor(Extractor):
t.sleep(5)
def prepare_2nd_extract(self):
FacebookEventExtractor.prepare_2nd_extract_dler(self.downloader)
CExtractor.prepare_2nd_extract_dler(self.downloader)
def clean_url(url):
if FacebookEventExtractor.is_known_url(url, False):
if CExtractor.is_known_url(url, False):
u = urlparse(url)
result = "https://www.facebook.com" + u.path

View File

@ -1,5 +1,5 @@
from ..generic_extractors import *
from ..extractor_facebook import FacebookEvent, FacebookEventExtractor
from ..twosteps_extractor import *
from .fbevent import FacebookEvent
import json5
from bs4 import BeautifulSoup
import json

View File

@ -0,0 +1,158 @@
from datetime import datetime
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from ..extractor import *
from ..twosteps_extractor import *
import json
import logging
logger = logging.getLogger(__name__)
class GGCalendar:
def __init__(self, url):
self.url = url
self.extract_info()
def filter_keys(params):
result = {}
for k, v in params.items():
if k.startswith('e[0]'):
result[k.replace('e[0][', '')[:-1]] = v
else:
result[k] = v
return result
def is_valid_event(self):
return self.start_day is not None and self.title is not None
def extract_info(self):
parsed_url = urlparse(self.url.replace("#", "%23"))
params = parse_qs(parsed_url.query)
params = GGCalendar.filter_keys(params)
self.location = params["location"][0] if "location" in params else ""
self.title = params["text"][0] if "text" in params else params["title"][0] if "title" in params else ""
self.description = params["description"][0] if "description" in params else params["details"][0] if "details" in params else ""
if self.description != "":
self.description = BeautifulSoup(self.description, "html.parser").text
if "dates" in params:
dates = [x.replace(" ", "+") for x in params["dates"][0].split("/")]
if len(dates) > 0:
date = parser.parse(dates[0])
self.start_day = date.date()
self.start_time = date.time()
if len(dates) == 2:
date = parser.parse(dates[1])
self.end_day = date.date()
self.end_time = date.time()
else:
self.end_day = None
self.end_time = None
elif "date_start" in params:
date = parser.parse(params["date_start"][0])
self.start_day = date.date()
self.start_time = date.time()
if "date_end" in params:
dateend = parser.parse(params["date_end"][0])
if dateend != date:
self.end_day = dateend.date()
self.end_time = dateend.time()
else:
self.end_day = None
self.end_time = None
if self.start_time == datetime.time(0):
self.start_time = None
else:
self.end_day = None
self.end_time = None
else:
raise Exception("Unable to find a date in google calendar URL")
self.start_day = None
self.start_time = None
self.end_day = None
self.end_time = None
class CExtractor(Extractor):
def __init__(self):
super().__init__()
self.possible_urls = ["https://calendar.google.com/calendar/", "https://addtocalendar.com/", "https://www.google.com/calendar/event"]
def guess_image(self, soup, url):
image = soup.find("meta", property="og:image")
if image is None:
for img in soup.select('img'):
if img.find_parent(name='nav'):
continue
image = img["src"]
break
else:
image = image["content"]
if image.startswith("/"):
root_url = "https://" + urlparse(url).netloc + "/"
image = root_url + image
return image
def extract(
self, content, url, url_human=None, default_values=None, published=False
):
soup = BeautifulSoup(content, "html.parser")
for ggu in self.possible_urls:
link_calendar = soup.select('a[href^="' + ggu + '"]')
if len(link_calendar) != 0:
gg_cal = GGCalendar(link_calendar[0]["href"])
if gg_cal.is_valid_event():
start_day = gg_cal.start_day
start_time = gg_cal.start_time
description = gg_cal.description.replace(' ', '')
end_day = gg_cal.end_day
end_time = gg_cal.end_time
location = gg_cal.location
title = gg_cal.title
url_human = url
self.set_header(url)
image = self.guess_image(soup, url)
category = None
self.add_event(
default_values,
title=title,
category=category,
start_day=start_day,
location=location,
description=description,
tags=[],
uuids=[url],
recurrences=None,
url_human=url_human,
start_time=start_time,
end_day=end_day,
end_time=end_time,
published=published,
image=image,
)
break
return self.get_structure()

View File

@ -8,7 +8,7 @@ from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning
import pytz
from .extractor import *
from ..extractor import *
from celery.utils.log import get_task_logger

View File

@ -1,4 +1,4 @@
from ..generic_extractors import *
from ..twosteps_extractor import *
from bs4 import BeautifulSoup
from datetime import datetime
from urllib.parse import urlparse

View File

@ -1,4 +1,4 @@
from ..generic_extractors import *
from ..twosteps_extractor import *
from bs4 import BeautifulSoup

View File

@ -1,6 +1,6 @@
from .downloader import *
from .extractor import *
from .extractor_facebook import FacebookEventExtractor
from .generic_extractors.fbevent import CExtractor as FacebookEventExtractor
import logging

View File

@ -14,76 +14,6 @@ from django.utils.translation import gettext_lazy as _
from dateutil import parser
import datetime
class GGCalendar:
def __init__(self, url):
self.url = url
self.extract_info()
def filter_keys(params):
result = {}
for k, v in params.items():
if k.startswith('e[0]'):
result[k.replace('e[0][', '')[:-1]] = v
else:
result[k] = v
return result
def is_valid_event(self):
return self.start_day is not None and self.title is not None
def extract_info(self):
parsed_url = urlparse(self.url.replace("#", "%23"))
params = parse_qs(parsed_url.query)
params = GGCalendar.filter_keys(params)
self.location = params["location"][0] if "location" in params else ""
self.title = params["text"][0] if "text" in params else params["title"][0] if "title" in params else ""
self.description = params["description"][0] if "description" in params else params["details"][0] if "details" in params else ""
if self.description != "":
self.description = BeautifulSoup(self.description, "html.parser").text
if "dates" in params:
dates = [x.replace(" ", "+") for x in params["dates"][0].split("/")]
if len(dates) > 0:
date = parser.parse(dates[0])
self.start_day = date.date()
self.start_time = date.time()
if len(dates) == 2:
date = parser.parse(dates[1])
self.end_day = date.date()
self.end_time = date.time()
else:
self.end_day = None
self.end_time = None
elif "date_start" in params:
date = parser.parse(params["date_start"][0])
self.start_day = date.date()
self.start_time = date.time()
if "date_end" in params:
dateend = parser.parse(params["date_end"][0])
if dateend != date:
self.end_day = dateend.date()
self.end_time = dateend.time()
else:
self.end_day = None
self.end_time = None
if self.start_time == datetime.time(0):
self.start_time = None
else:
self.end_day = None
self.end_time = None
else:
raise Exception("Unable to find a date in google calendar URL")
self.start_day = None
self.start_time = None
self.end_day = None
self.end_time = None
# A class to extract events from URL with two steps:
# - first build a list of urls where the events will be found
# - then for each document downloaded from these urls, build the events

View File

@ -36,7 +36,7 @@ import recurrence
import copy
import unicodedata
from collections import defaultdict
from .import_tasks.extractor_facebook import FacebookEventExtractor
from .import_tasks.generic_extractors.fbevent import CExtractor as FacebookEventExtractor
from .import_tasks.extractor import Extractor
from django.template.defaultfilters import date as _date