From 3a78972391d273f0635fe74974b8866a8681d13c Mon Sep 17 00:00:00 2001 From: Jean-Marie Favreau Date: Sat, 1 Mar 2025 15:24:35 +0100 Subject: [PATCH] Restructuration des fichiers d'import --- README.md | 2 +- experimentations/get_facebook_event.py | 4 +- experimentations/get_facebook_events.py | 2 +- experimentations/get_ical_events.py | 2 +- experimentations/get_les_vinzelles.py | 2 +- experimentations/get_meditheques_clermont.py | 2 +- experimentations/get_mobilizon.py | 2 +- experimentations/get_puydedome.py | 2 +- src/agenda_culturel/celery.py | 10 +- .../custom_extractors/arachnee.py | 2 +- .../custom_extractors/billetterie_cf.py | 2 +- .../custom_extractors/lacomedie.py | 2 +- .../import_tasks/custom_extractors/lacoope.py | 3 +- .../custom_extractors/lapucealoreille.py | 2 +- .../custom_extractors/laraymonde.py | 2 +- .../custom_extractors/lefotomat.py | 2 +- .../import_tasks/custom_extractors/lerio.py | 2 +- .../custom_extractors/mille_formes.py | 2 +- src/agenda_culturel/import_tasks/extractor.py | 10 +- .../import_tasks/extractor_ggcal_link.py | 88 ---------- .../generic_extractors/__init__.py | 7 + .../apidae_tourisme.py | 2 +- .../fbevent.py} | 8 +- .../fbevents.py | 4 +- .../generic_extractors/ggcal_link.py | 158 ++++++++++++++++++ .../ical.py} | 2 +- .../iguana_agenda.py | 2 +- .../mobilizon.py | 0 .../wordpress_mec.py | 2 +- src/agenda_culturel/import_tasks/importer.py | 2 +- ...ic_extractors.py => twosteps_extractor.py} | 70 -------- src/agenda_culturel/models.py | 2 +- 32 files changed, 206 insertions(+), 198 deletions(-) mode change 100644 => 100755 experimentations/get_mobilizon.py delete mode 100644 src/agenda_culturel/import_tasks/extractor_ggcal_link.py create mode 100644 src/agenda_culturel/import_tasks/generic_extractors/__init__.py rename src/agenda_culturel/import_tasks/{custom_extractors => generic_extractors}/apidae_tourisme.py (99%) rename src/agenda_culturel/import_tasks/{extractor_facebook.py => generic_extractors/fbevent.py} (98%) rename src/agenda_culturel/import_tasks/{custom_extractors => generic_extractors}/fbevents.py (97%) create mode 100644 src/agenda_culturel/import_tasks/generic_extractors/ggcal_link.py rename src/agenda_culturel/import_tasks/{extractor_ical.py => generic_extractors/ical.py} (99%) rename src/agenda_culturel/import_tasks/{custom_extractors => generic_extractors}/iguana_agenda.py (99%) rename src/agenda_culturel/import_tasks/{custom_extractors => generic_extractors}/mobilizon.py (100%) rename src/agenda_culturel/import_tasks/{custom_extractors => generic_extractors}/wordpress_mec.py (99%) rename src/agenda_culturel/import_tasks/{generic_extractors.py => twosteps_extractor.py} (74%) diff --git a/README.md b/README.md index 6b3c7e8..9997d69 100644 --- a/README.md +++ b/README.md @@ -43,7 +43,7 @@ On peut activer à la main (pour l'instant) un proxy type socket pour l'import d ### Ajout d'une nouvelle source *custom* Pour ajouter une nouvelle source custom: -- ajouter un fichier dans ```src/agenda_culturel/import_tasks/custom_extractors``` en s'inspirant des autres fichiers présents. Il existe de nombreuses facilités dans les classes mères correspondantes +- ajouter un fichier dans ```src/agenda_culturel/import_tasks/custom_extractors``` (ou ```src/agenda_culturel/import_tasks/generic_extractors``` s'il s'agit d'un format de source qui est réutilisable) en s'inspirant des autres fichiers présents. Il existe de nombreuses facilités dans les classes mères correspondantes - s'inspirer des scripts présents dans ```experimentations/``` pour créer son propre script de test - quand l'import fonctionne de manière indépendante dans ces expérimentations, il est tant de l'ajouter au site internet: - ajouter à la classe ```RecurrentImport.PROCESSOR``` présente dans le fichier ```src/agenda_culturel/models.py``` une entrée correspondant à cette source pour qu'elle soit proposée aux utilisateurs diff --git a/experimentations/get_facebook_event.py b/experimentations/get_facebook_event.py index 0f16c7e..0da957f 100755 --- a/experimentations/get_facebook_event.py +++ b/experimentations/get_facebook_event.py @@ -21,7 +21,7 @@ sys.path.append(parent + "/src") from src.agenda_culturel.import_tasks.downloader import * from src.agenda_culturel.import_tasks.extractor import * from src.agenda_culturel.import_tasks.importer import * -from src.agenda_culturel.import_tasks.extractor_facebook import * +from src.agenda_culturel.import_tasks.generic_extractors.fbevent import * @@ -29,7 +29,7 @@ from src.agenda_culturel.import_tasks.extractor_facebook import * if __name__ == "__main__": - u2e = URL2Events(ChromiumHeadlessDownloader(), FacebookEventExtractor()) + u2e = URL2Events(ChromiumHeadlessDownloader(), CExtractor()) url="https://www.facebook.com/events/3575802569389184/3575802576055850/?active_tab=about" events = u2e.process(url, cache = "fb.html", published = True) diff --git a/experimentations/get_facebook_events.py b/experimentations/get_facebook_events.py index ea6981e..8760c3d 100755 --- a/experimentations/get_facebook_events.py +++ b/experimentations/get_facebook_events.py @@ -21,7 +21,7 @@ sys.path.append(parent + "/src") from src.agenda_culturel.import_tasks.downloader import * from src.agenda_culturel.import_tasks.extractor import * from src.agenda_culturel.import_tasks.importer import * -from src.agenda_culturel.import_tasks.custom_extractors import * +from src.agenda_culturel.import_tasks.generic_extractors import * diff --git a/experimentations/get_ical_events.py b/experimentations/get_ical_events.py index 2fc1854..fcc6dfd 100755 --- a/experimentations/get_ical_events.py +++ b/experimentations/get_ical_events.py @@ -21,7 +21,7 @@ sys.path.append(parent + "/src") from src.agenda_culturel.import_tasks.downloader import * from src.agenda_culturel.import_tasks.extractor import * from src.agenda_culturel.import_tasks.importer import * -from src.agenda_culturel.import_tasks.extractor_ical import * +from src.agenda_culturel.import_tasks.generic_extractors.ical import * diff --git a/experimentations/get_les_vinzelles.py b/experimentations/get_les_vinzelles.py index 9945d1e..95a8878 100755 --- a/experimentations/get_les_vinzelles.py +++ b/experimentations/get_les_vinzelles.py @@ -21,7 +21,7 @@ sys.path.append(parent + "/src") from src.agenda_culturel.import_tasks.downloader import * from src.agenda_culturel.import_tasks.extractor import * from src.agenda_culturel.import_tasks.importer import * -from src.agenda_culturel.import_tasks.custom_extractors import * +from src.agenda_culturel.import_tasks.generic_extractors import * diff --git a/experimentations/get_meditheques_clermont.py b/experimentations/get_meditheques_clermont.py index 1e43c82..026f1a4 100755 --- a/experimentations/get_meditheques_clermont.py +++ b/experimentations/get_meditheques_clermont.py @@ -21,7 +21,7 @@ sys.path.append(parent + "/src") from src.agenda_culturel.import_tasks.downloader import * from src.agenda_culturel.import_tasks.extractor import * from src.agenda_culturel.import_tasks.importer import * -from src.agenda_culturel.import_tasks.custom_extractors import * +from src.agenda_culturel.import_tasks.generic_extractors import * diff --git a/experimentations/get_mobilizon.py b/experimentations/get_mobilizon.py old mode 100644 new mode 100755 index e5a29c3..36d1145 --- a/experimentations/get_mobilizon.py +++ b/experimentations/get_mobilizon.py @@ -21,7 +21,7 @@ sys.path.append(parent + "/src") from src.agenda_culturel.import_tasks.downloader import * from src.agenda_culturel.import_tasks.extractor import * from src.agenda_culturel.import_tasks.importer import * -from src.agenda_culturel.import_tasks.custom_extractors import * +from src.agenda_culturel.import_tasks.generic_extractors import * diff --git a/experimentations/get_puydedome.py b/experimentations/get_puydedome.py index 1069070..b54d10b 100755 --- a/experimentations/get_puydedome.py +++ b/experimentations/get_puydedome.py @@ -21,7 +21,7 @@ sys.path.append(parent + "/src") from src.agenda_culturel.import_tasks.downloader import * from src.agenda_culturel.import_tasks.extractor import * from src.agenda_culturel.import_tasks.importer import * -from src.agenda_culturel.import_tasks.custom_extractors import * +from src.agenda_culturel.import_tasks.generic_extractors import * diff --git a/src/agenda_culturel/celery.py b/src/agenda_culturel/celery.py index 6eca002..b95d6c3 100644 --- a/src/agenda_culturel/celery.py +++ b/src/agenda_culturel/celery.py @@ -14,8 +14,8 @@ from contextlib import contextmanager from .import_tasks.downloader import * from .import_tasks.extractor import * from .import_tasks.importer import * -from .import_tasks.extractor_ical import * from .import_tasks.custom_extractors import * +from .import_tasks.generic_extractors import * # Set the default Django settings module for the 'celery' program. @@ -140,13 +140,13 @@ def run_recurrent_import_internal(rimport, downloader, req_id): if rimport.processor == RecurrentImport.PROCESSOR.ICAL: - extractor = ICALExtractor() + extractor = ical.ICALExtractor() elif rimport.processor == RecurrentImport.PROCESSOR.ICALNOBUSY: - extractor = ICALNoBusyExtractor() + extractor = ical.ICALNoBusyExtractor() elif rimport.processor == RecurrentImport.PROCESSOR.ICALNOVC: - extractor = ICALNoVCExtractor() + extractor = ical.ICALNoVCExtractor() elif rimport.processor == RecurrentImport.PROCESSOR.ICALNAIVETZ: - extractor = ICALNaiveTimezone() + extractor = ical.ICALNaiveTimezone() elif rimport.processor == RecurrentImport.PROCESSOR.LACOOPE: extractor = lacoope.CExtractor() elif rimport.processor == RecurrentImport.PROCESSOR.LACOMEDIE: diff --git a/src/agenda_culturel/import_tasks/custom_extractors/arachnee.py b/src/agenda_culturel/import_tasks/custom_extractors/arachnee.py index 043f6f7..cf17be7 100644 --- a/src/agenda_culturel/import_tasks/custom_extractors/arachnee.py +++ b/src/agenda_culturel/import_tasks/custom_extractors/arachnee.py @@ -1,4 +1,4 @@ -from ..generic_extractors import * +from ..twosteps_extractor import * from bs4 import BeautifulSoup # A class dedicated to get events from Arachnée Concert diff --git a/src/agenda_culturel/import_tasks/custom_extractors/billetterie_cf.py b/src/agenda_culturel/import_tasks/custom_extractors/billetterie_cf.py index f4a227f..24e78fb 100644 --- a/src/agenda_culturel/import_tasks/custom_extractors/billetterie_cf.py +++ b/src/agenda_culturel/import_tasks/custom_extractors/billetterie_cf.py @@ -1,4 +1,4 @@ -from ..generic_extractors import * +from ..twosteps_extractor import * from bs4 import BeautifulSoup from datetime import timedelta diff --git a/src/agenda_culturel/import_tasks/custom_extractors/lacomedie.py b/src/agenda_culturel/import_tasks/custom_extractors/lacomedie.py index bd8aadb..ab6747e 100644 --- a/src/agenda_culturel/import_tasks/custom_extractors/lacomedie.py +++ b/src/agenda_culturel/import_tasks/custom_extractors/lacomedie.py @@ -1,4 +1,4 @@ -from ..generic_extractors import * +from ..twosteps_extractor import * import json5 from bs4 import BeautifulSoup diff --git a/src/agenda_culturel/import_tasks/custom_extractors/lacoope.py b/src/agenda_culturel/import_tasks/custom_extractors/lacoope.py index abb641a..71f672d 100644 --- a/src/agenda_culturel/import_tasks/custom_extractors/lacoope.py +++ b/src/agenda_culturel/import_tasks/custom_extractors/lacoope.py @@ -1,4 +1,5 @@ -from ..generic_extractors import * +from ..twosteps_extractor import * +from ..generic_extractors.ggcal_link import GGCalendar import re import json5 from bs4 import BeautifulSoup diff --git a/src/agenda_culturel/import_tasks/custom_extractors/lapucealoreille.py b/src/agenda_culturel/import_tasks/custom_extractors/lapucealoreille.py index 17935d4..c9bb1db 100644 --- a/src/agenda_culturel/import_tasks/custom_extractors/lapucealoreille.py +++ b/src/agenda_culturel/import_tasks/custom_extractors/lapucealoreille.py @@ -1,4 +1,4 @@ -from ..generic_extractors import * +from ..twosteps_extractor import * import re from bs4 import BeautifulSoup diff --git a/src/agenda_culturel/import_tasks/custom_extractors/laraymonde.py b/src/agenda_culturel/import_tasks/custom_extractors/laraymonde.py index 2a74e95..ceb62df 100644 --- a/src/agenda_culturel/import_tasks/custom_extractors/laraymonde.py +++ b/src/agenda_culturel/import_tasks/custom_extractors/laraymonde.py @@ -1,4 +1,4 @@ -from ..generic_extractors import * +from ..twosteps_extractor import * from bs4 import BeautifulSoup from datetime import datetime diff --git a/src/agenda_culturel/import_tasks/custom_extractors/lefotomat.py b/src/agenda_culturel/import_tasks/custom_extractors/lefotomat.py index 653c084..c385662 100644 --- a/src/agenda_culturel/import_tasks/custom_extractors/lefotomat.py +++ b/src/agenda_culturel/import_tasks/custom_extractors/lefotomat.py @@ -1,4 +1,4 @@ -from ..generic_extractors import * +from ..twosteps_extractor import * from bs4 import BeautifulSoup # A class dedicated to get events from Le Fotomat' diff --git a/src/agenda_culturel/import_tasks/custom_extractors/lerio.py b/src/agenda_culturel/import_tasks/custom_extractors/lerio.py index 0d859f2..9420dea 100644 --- a/src/agenda_culturel/import_tasks/custom_extractors/lerio.py +++ b/src/agenda_culturel/import_tasks/custom_extractors/lerio.py @@ -1,4 +1,4 @@ -from ..generic_extractors import * +from ..twosteps_extractor import * from bs4 import BeautifulSoup from datetime import datetime diff --git a/src/agenda_culturel/import_tasks/custom_extractors/mille_formes.py b/src/agenda_culturel/import_tasks/custom_extractors/mille_formes.py index 3859cb7..038cee8 100644 --- a/src/agenda_culturel/import_tasks/custom_extractors/mille_formes.py +++ b/src/agenda_culturel/import_tasks/custom_extractors/mille_formes.py @@ -1,4 +1,4 @@ -from ..generic_extractors import * +from ..twosteps_extractor import * from bs4 import BeautifulSoup from datetime import datetime, date diff --git a/src/agenda_culturel/import_tasks/extractor.py b/src/agenda_culturel/import_tasks/extractor.py index a73ae22..3a71410 100644 --- a/src/agenda_culturel/import_tasks/extractor.py +++ b/src/agenda_culturel/import_tasks/extractor.py @@ -284,8 +284,8 @@ class Extractor(ABC): return {"header": self.header, "events": self.events} def clean_url(url): - from .extractor_ical import ICALExtractor - from .extractor_facebook import FacebookEventExtractor + from .generic_extractors.ical import ICALExtractor + from .generic_extractors.fbevent import CExtractor as FacebookEventExtractor result = url for e in [ICALExtractor, FacebookEventExtractor]: @@ -293,9 +293,9 @@ class Extractor(ABC): return result def get_default_extractors(single_event=False): - from .extractor_ical import ICALExtractor - from .extractor_facebook import FacebookEventExtractor - from .extractor_ggcal_link import GoogleCalendarLinkEventExtractor + from .generic_extractors.ical import ICALExtractor + from .generic_extractors.fbevent import CExtractor as FacebookEventExtractor + from .generic_extractors.ggcal_link import CExtractor as GoogleCalendarLinkEventExtractor if single_event: return [FacebookEventExtractor(), GoogleCalendarLinkEventExtractor(), EventNotFoundExtractor()] diff --git a/src/agenda_culturel/import_tasks/extractor_ggcal_link.py b/src/agenda_culturel/import_tasks/extractor_ggcal_link.py deleted file mode 100644 index 5ce6ead..0000000 --- a/src/agenda_culturel/import_tasks/extractor_ggcal_link.py +++ /dev/null @@ -1,88 +0,0 @@ -from datetime import datetime -from bs4 import BeautifulSoup -from urllib.parse import urlparse - -from .extractor import * -from .generic_extractors import * - -import json - -import logging - -logger = logging.getLogger(__name__) - -class GoogleCalendarLinkEventExtractor(Extractor): - def __init__(self): - super().__init__() - self.possible_urls = ["https://calendar.google.com/calendar/", "https://addtocalendar.com/", "https://www.google.com/calendar/event"] - - - def guess_image(self, soup, url): - image = soup.find("meta", property="og:image") - - if image is None: - for img in soup.select('img'): - if img.find_parent(name='nav'): - continue - image = img["src"] - break - else: - image = image["content"] - - if image.startswith("/"): - root_url = "https://" + urlparse(url).netloc + "/" - image = root_url + image - - return image - - - def extract( - self, content, url, url_human=None, default_values=None, published=False - ): - soup = BeautifulSoup(content, "html.parser") - - for ggu in self.possible_urls: - - link_calendar = soup.select('a[href^="' + ggu + '"]') - if len(link_calendar) != 0: - - gg_cal = GGCalendar(link_calendar[0]["href"]) - - if gg_cal.is_valid_event(): - start_day = gg_cal.start_day - start_time = gg_cal.start_time - description = gg_cal.description.replace(' ', '') - end_day = gg_cal.end_day - end_time = gg_cal.end_time - location = gg_cal.location - title = gg_cal.title - url_human = url - - self.set_header(url) - - image = self.guess_image(soup, url) - - category = None - - self.add_event( - default_values, - title=title, - category=category, - start_day=start_day, - location=location, - description=description, - tags=[], - uuids=[url], - recurrences=None, - url_human=url_human, - start_time=start_time, - end_day=end_day, - end_time=end_time, - published=published, - image=image, - ) - - break - - - return self.get_structure() \ No newline at end of file diff --git a/src/agenda_culturel/import_tasks/generic_extractors/__init__.py b/src/agenda_culturel/import_tasks/generic_extractors/__init__.py new file mode 100644 index 0000000..ecf5ec1 --- /dev/null +++ b/src/agenda_culturel/import_tasks/generic_extractors/__init__.py @@ -0,0 +1,7 @@ +from os.path import dirname, basename, isfile, join +import glob + +modules = glob.glob(join(dirname(__file__), "*.py")) +__all__ = [ + basename(f)[:-3] for f in modules if isfile(f) and not f.endswith("__init__.py") +] diff --git a/src/agenda_culturel/import_tasks/custom_extractors/apidae_tourisme.py b/src/agenda_culturel/import_tasks/generic_extractors/apidae_tourisme.py similarity index 99% rename from src/agenda_culturel/import_tasks/custom_extractors/apidae_tourisme.py rename to src/agenda_culturel/import_tasks/generic_extractors/apidae_tourisme.py index 4cb3aff..ec5adf1 100644 --- a/src/agenda_culturel/import_tasks/custom_extractors/apidae_tourisme.py +++ b/src/agenda_culturel/import_tasks/generic_extractors/apidae_tourisme.py @@ -1,4 +1,4 @@ -from ..generic_extractors import * +from ..twosteps_extractor import * from bs4 import BeautifulSoup from datetime import datetime diff --git a/src/agenda_culturel/import_tasks/extractor_facebook.py b/src/agenda_culturel/import_tasks/generic_extractors/fbevent.py similarity index 98% rename from src/agenda_culturel/import_tasks/extractor_facebook.py rename to src/agenda_culturel/import_tasks/generic_extractors/fbevent.py index 079b973..cddbc76 100644 --- a/src/agenda_culturel/import_tasks/extractor_facebook.py +++ b/src/agenda_culturel/import_tasks/generic_extractors/fbevent.py @@ -5,7 +5,7 @@ import time as t from django.utils.translation import gettext_lazy as _ -from .extractor import * +from ..extractor import * import json import logging @@ -231,7 +231,7 @@ class FacebookEvent: result.append(clone.build_event(url_base + nb_e.elements["id"] + "/")) return result -class FacebookEventExtractor(Extractor): +class CExtractor(Extractor): def __init__(self): super().__init__() @@ -259,11 +259,11 @@ class FacebookEventExtractor(Extractor): t.sleep(5) def prepare_2nd_extract(self): - FacebookEventExtractor.prepare_2nd_extract_dler(self.downloader) + CExtractor.prepare_2nd_extract_dler(self.downloader) def clean_url(url): - if FacebookEventExtractor.is_known_url(url, False): + if CExtractor.is_known_url(url, False): u = urlparse(url) result = "https://www.facebook.com" + u.path diff --git a/src/agenda_culturel/import_tasks/custom_extractors/fbevents.py b/src/agenda_culturel/import_tasks/generic_extractors/fbevents.py similarity index 97% rename from src/agenda_culturel/import_tasks/custom_extractors/fbevents.py rename to src/agenda_culturel/import_tasks/generic_extractors/fbevents.py index d20677a..f806060 100644 --- a/src/agenda_culturel/import_tasks/custom_extractors/fbevents.py +++ b/src/agenda_culturel/import_tasks/generic_extractors/fbevents.py @@ -1,5 +1,5 @@ -from ..generic_extractors import * -from ..extractor_facebook import FacebookEvent, FacebookEventExtractor +from ..twosteps_extractor import * +from .fbevent import FacebookEvent import json5 from bs4 import BeautifulSoup import json diff --git a/src/agenda_culturel/import_tasks/generic_extractors/ggcal_link.py b/src/agenda_culturel/import_tasks/generic_extractors/ggcal_link.py new file mode 100644 index 0000000..4eb21c9 --- /dev/null +++ b/src/agenda_culturel/import_tasks/generic_extractors/ggcal_link.py @@ -0,0 +1,158 @@ +from datetime import datetime +from bs4 import BeautifulSoup +from urllib.parse import urlparse + +from ..extractor import * +from ..twosteps_extractor import * + +import json + +import logging + +logger = logging.getLogger(__name__) + +class GGCalendar: + def __init__(self, url): + self.url = url + self.extract_info() + + def filter_keys(params): + result = {} + + for k, v in params.items(): + if k.startswith('e[0]'): + result[k.replace('e[0][', '')[:-1]] = v + else: + result[k] = v + + return result + + def is_valid_event(self): + return self.start_day is not None and self.title is not None + + def extract_info(self): + parsed_url = urlparse(self.url.replace("#", "%23")) + params = parse_qs(parsed_url.query) + + params = GGCalendar.filter_keys(params) + + self.location = params["location"][0] if "location" in params else "" + self.title = params["text"][0] if "text" in params else params["title"][0] if "title" in params else "" + self.description = params["description"][0] if "description" in params else params["details"][0] if "details" in params else "" + if self.description != "": + self.description = BeautifulSoup(self.description, "html.parser").text + if "dates" in params: + dates = [x.replace(" ", "+") for x in params["dates"][0].split("/")] + if len(dates) > 0: + date = parser.parse(dates[0]) + self.start_day = date.date() + self.start_time = date.time() + if len(dates) == 2: + date = parser.parse(dates[1]) + self.end_day = date.date() + self.end_time = date.time() + else: + self.end_day = None + self.end_time = None + elif "date_start" in params: + date = parser.parse(params["date_start"][0]) + self.start_day = date.date() + self.start_time = date.time() + if "date_end" in params: + dateend = parser.parse(params["date_end"][0]) + if dateend != date: + self.end_day = dateend.date() + self.end_time = dateend.time() + else: + self.end_day = None + self.end_time = None + if self.start_time == datetime.time(0): + self.start_time = None + + else: + self.end_day = None + self.end_time = None + else: + raise Exception("Unable to find a date in google calendar URL") + self.start_day = None + self.start_time = None + self.end_day = None + self.end_time = None + + + +class CExtractor(Extractor): + def __init__(self): + super().__init__() + self.possible_urls = ["https://calendar.google.com/calendar/", "https://addtocalendar.com/", "https://www.google.com/calendar/event"] + + + def guess_image(self, soup, url): + image = soup.find("meta", property="og:image") + + if image is None: + for img in soup.select('img'): + if img.find_parent(name='nav'): + continue + image = img["src"] + break + else: + image = image["content"] + + if image.startswith("/"): + root_url = "https://" + urlparse(url).netloc + "/" + image = root_url + image + + return image + + + def extract( + self, content, url, url_human=None, default_values=None, published=False + ): + soup = BeautifulSoup(content, "html.parser") + + for ggu in self.possible_urls: + + link_calendar = soup.select('a[href^="' + ggu + '"]') + if len(link_calendar) != 0: + + gg_cal = GGCalendar(link_calendar[0]["href"]) + + if gg_cal.is_valid_event(): + start_day = gg_cal.start_day + start_time = gg_cal.start_time + description = gg_cal.description.replace(' ', '') + end_day = gg_cal.end_day + end_time = gg_cal.end_time + location = gg_cal.location + title = gg_cal.title + url_human = url + + self.set_header(url) + + image = self.guess_image(soup, url) + + category = None + + self.add_event( + default_values, + title=title, + category=category, + start_day=start_day, + location=location, + description=description, + tags=[], + uuids=[url], + recurrences=None, + url_human=url_human, + start_time=start_time, + end_day=end_day, + end_time=end_time, + published=published, + image=image, + ) + + break + + + return self.get_structure() \ No newline at end of file diff --git a/src/agenda_culturel/import_tasks/extractor_ical.py b/src/agenda_culturel/import_tasks/generic_extractors/ical.py similarity index 99% rename from src/agenda_culturel/import_tasks/extractor_ical.py rename to src/agenda_culturel/import_tasks/generic_extractors/ical.py index 8b88df0..9b85211 100644 --- a/src/agenda_culturel/import_tasks/extractor_ical.py +++ b/src/agenda_culturel/import_tasks/generic_extractors/ical.py @@ -8,7 +8,7 @@ from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning import pytz -from .extractor import * +from ..extractor import * from celery.utils.log import get_task_logger diff --git a/src/agenda_culturel/import_tasks/custom_extractors/iguana_agenda.py b/src/agenda_culturel/import_tasks/generic_extractors/iguana_agenda.py similarity index 99% rename from src/agenda_culturel/import_tasks/custom_extractors/iguana_agenda.py rename to src/agenda_culturel/import_tasks/generic_extractors/iguana_agenda.py index 313870c..ac184bd 100644 --- a/src/agenda_culturel/import_tasks/custom_extractors/iguana_agenda.py +++ b/src/agenda_culturel/import_tasks/generic_extractors/iguana_agenda.py @@ -1,4 +1,4 @@ -from ..generic_extractors import * +from ..twosteps_extractor import * from bs4 import BeautifulSoup from datetime import datetime from urllib.parse import urlparse diff --git a/src/agenda_culturel/import_tasks/custom_extractors/mobilizon.py b/src/agenda_culturel/import_tasks/generic_extractors/mobilizon.py similarity index 100% rename from src/agenda_culturel/import_tasks/custom_extractors/mobilizon.py rename to src/agenda_culturel/import_tasks/generic_extractors/mobilizon.py diff --git a/src/agenda_culturel/import_tasks/custom_extractors/wordpress_mec.py b/src/agenda_culturel/import_tasks/generic_extractors/wordpress_mec.py similarity index 99% rename from src/agenda_culturel/import_tasks/custom_extractors/wordpress_mec.py rename to src/agenda_culturel/import_tasks/generic_extractors/wordpress_mec.py index 0d1d66e..7ecd8a9 100644 --- a/src/agenda_culturel/import_tasks/custom_extractors/wordpress_mec.py +++ b/src/agenda_culturel/import_tasks/generic_extractors/wordpress_mec.py @@ -1,4 +1,4 @@ -from ..generic_extractors import * +from ..twosteps_extractor import * from bs4 import BeautifulSoup diff --git a/src/agenda_culturel/import_tasks/importer.py b/src/agenda_culturel/import_tasks/importer.py index 9f854db..150f0c9 100644 --- a/src/agenda_culturel/import_tasks/importer.py +++ b/src/agenda_culturel/import_tasks/importer.py @@ -1,6 +1,6 @@ from .downloader import * from .extractor import * -from .extractor_facebook import FacebookEventExtractor +from .generic_extractors.fbevent import CExtractor as FacebookEventExtractor import logging diff --git a/src/agenda_culturel/import_tasks/generic_extractors.py b/src/agenda_culturel/import_tasks/twosteps_extractor.py similarity index 74% rename from src/agenda_culturel/import_tasks/generic_extractors.py rename to src/agenda_culturel/import_tasks/twosteps_extractor.py index 646faa8..40d6488 100644 --- a/src/agenda_culturel/import_tasks/generic_extractors.py +++ b/src/agenda_culturel/import_tasks/twosteps_extractor.py @@ -14,76 +14,6 @@ from django.utils.translation import gettext_lazy as _ from dateutil import parser import datetime - -class GGCalendar: - def __init__(self, url): - self.url = url - self.extract_info() - - def filter_keys(params): - result = {} - - for k, v in params.items(): - if k.startswith('e[0]'): - result[k.replace('e[0][', '')[:-1]] = v - else: - result[k] = v - - return result - - def is_valid_event(self): - return self.start_day is not None and self.title is not None - - def extract_info(self): - parsed_url = urlparse(self.url.replace("#", "%23")) - params = parse_qs(parsed_url.query) - - params = GGCalendar.filter_keys(params) - - self.location = params["location"][0] if "location" in params else "" - self.title = params["text"][0] if "text" in params else params["title"][0] if "title" in params else "" - self.description = params["description"][0] if "description" in params else params["details"][0] if "details" in params else "" - if self.description != "": - self.description = BeautifulSoup(self.description, "html.parser").text - if "dates" in params: - dates = [x.replace(" ", "+") for x in params["dates"][0].split("/")] - if len(dates) > 0: - date = parser.parse(dates[0]) - self.start_day = date.date() - self.start_time = date.time() - if len(dates) == 2: - date = parser.parse(dates[1]) - self.end_day = date.date() - self.end_time = date.time() - else: - self.end_day = None - self.end_time = None - elif "date_start" in params: - date = parser.parse(params["date_start"][0]) - self.start_day = date.date() - self.start_time = date.time() - if "date_end" in params: - dateend = parser.parse(params["date_end"][0]) - if dateend != date: - self.end_day = dateend.date() - self.end_time = dateend.time() - else: - self.end_day = None - self.end_time = None - if self.start_time == datetime.time(0): - self.start_time = None - - else: - self.end_day = None - self.end_time = None - else: - raise Exception("Unable to find a date in google calendar URL") - self.start_day = None - self.start_time = None - self.end_day = None - self.end_time = None - - # A class to extract events from URL with two steps: # - first build a list of urls where the events will be found # - then for each document downloaded from these urls, build the events diff --git a/src/agenda_culturel/models.py b/src/agenda_culturel/models.py index e154863..ec0765a 100644 --- a/src/agenda_culturel/models.py +++ b/src/agenda_culturel/models.py @@ -36,7 +36,7 @@ import recurrence import copy import unicodedata from collections import defaultdict -from .import_tasks.extractor_facebook import FacebookEventExtractor +from .import_tasks.generic_extractors.fbevent import CExtractor as FacebookEventExtractor from .import_tasks.extractor import Extractor from django.template.defaultfilters import date as _date