Restructuring of the import files

commit 3a78972391
parent 571a6775c4
@@ -43,7 +43,7 @@ A socket-type proxy can (for now) be enabled by hand for the import of
 ### Adding a new *custom* source

 To add a new custom source:

-- add a file in ```src/agenda_culturel/import_tasks/custom_extractors```, taking inspiration from the other files present. The corresponding parent classes provide many conveniences
+- add a file in ```src/agenda_culturel/import_tasks/custom_extractors``` (or ```src/agenda_culturel/import_tasks/generic_extractors``` if it is a reusable source format), taking inspiration from the other files present. The corresponding parent classes provide many conveniences
 - take inspiration from the scripts in ```experimentations/``` to create your own test script
 - once the import works on its own in these experiments, it is time to add it to the website:
 - add an entry for this source to the ```RecurrentImport.PROCESSOR``` class in the file ```src/agenda_culturel/models.py``` so that it is offered to users
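For illustration, a minimal sketch of such a custom source, following the `Extractor` API visible elsewhere in this commit (`extract`, `set_header`, `add_event`, `get_structure`); the module name, CSS selectors, and field parsing are hypothetical placeholders, not part of this commit:

```python
# src/agenda_culturel/import_tasks/custom_extractors/myvenue.py (hypothetical)
from ..extractor import *
from bs4 import BeautifulSoup


class CExtractor(Extractor):
    def extract(
        self, content, url, url_human=None, default_values=None, published=False
    ):
        soup = BeautifulSoup(content, "html.parser")
        self.set_header(url)
        # Hypothetical selectors: adapt to the page structure of the source
        for item in soup.select("div.event"):
            self.add_event(
                default_values,
                title=item.select_one("h2").text,
                category=None,
                start_day=None,  # a real extractor parses the date here
                location=None,
                description=item.text,
                tags=[],
                uuids=[url],
                recurrences=None,
                url_human=url_human,
                start_time=None,
                end_day=None,
                end_time=None,
                published=published,
                image=None,
            )
        return self.get_structure()
```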
@@ -21,7 +21,7 @@ sys.path.append(parent + "/src")
 from src.agenda_culturel.import_tasks.downloader import *
 from src.agenda_culturel.import_tasks.extractor import *
 from src.agenda_culturel.import_tasks.importer import *
-from src.agenda_culturel.import_tasks.extractor_facebook import *
+from src.agenda_culturel.import_tasks.generic_extractors.fbevent import *
@@ -29,7 +29,7 @@ from src.agenda_culturel.import_tasks.extractor_facebook import *

 if __name__ == "__main__":

-    u2e = URL2Events(ChromiumHeadlessDownloader(), FacebookEventExtractor())
+    u2e = URL2Events(ChromiumHeadlessDownloader(), CExtractor())
     url="https://www.facebook.com/events/3575802569389184/3575802576055850/?active_tab=about"

     events = u2e.process(url, cache = "fb.html", published = True)
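The experimentation scripts all follow the pattern visible in this hunk; a self-contained sketch of such a test script, assuming it lives in ```experimentations/``` (the ```myvenue``` module is hypothetical, and the computation of `parent` is an assumption since it is not shown in this diff):

```python
#!/usr/bin/env python3
# Hypothetical experimentations/get_myvenue.py, mirroring the scripts above.
import os
import sys

# Make the src/ package importable (assumed; only the append line is shown in the diff)
parent = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(parent + "/src")

from src.agenda_culturel.import_tasks.downloader import *
from src.agenda_culturel.import_tasks.extractor import *
from src.agenda_culturel.import_tasks.importer import *
from src.agenda_culturel.import_tasks.custom_extractors import *  # binds the (hypothetical) myvenue module

if __name__ == "__main__":
    u2e = URL2Events(ChromiumHeadlessDownloader(), myvenue.CExtractor())
    url = "https://www.example.com/agenda"  # hypothetical source URL
    events = u2e.process(url, cache="myvenue.html", published=True)
    print(events)
```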
@@ -21,7 +21,7 @@ sys.path.append(parent + "/src")
 from src.agenda_culturel.import_tasks.downloader import *
 from src.agenda_culturel.import_tasks.extractor import *
 from src.agenda_culturel.import_tasks.importer import *
-from src.agenda_culturel.import_tasks.custom_extractors import *
+from src.agenda_culturel.import_tasks.generic_extractors import *
@@ -21,7 +21,7 @@ sys.path.append(parent + "/src")
 from src.agenda_culturel.import_tasks.downloader import *
 from src.agenda_culturel.import_tasks.extractor import *
 from src.agenda_culturel.import_tasks.importer import *
-from src.agenda_culturel.import_tasks.extractor_ical import *
+from src.agenda_culturel.import_tasks.generic_extractors.ical import *
@@ -21,7 +21,7 @@ sys.path.append(parent + "/src")
 from src.agenda_culturel.import_tasks.downloader import *
 from src.agenda_culturel.import_tasks.extractor import *
 from src.agenda_culturel.import_tasks.importer import *
-from src.agenda_culturel.import_tasks.custom_extractors import *
+from src.agenda_culturel.import_tasks.generic_extractors import *
@@ -21,7 +21,7 @@ sys.path.append(parent + "/src")
 from src.agenda_culturel.import_tasks.downloader import *
 from src.agenda_culturel.import_tasks.extractor import *
 from src.agenda_culturel.import_tasks.importer import *
-from src.agenda_culturel.import_tasks.custom_extractors import *
+from src.agenda_culturel.import_tasks.generic_extractors import *
experimentations/get_mobilizon.py (Normal file → Executable file)
@@ -21,7 +21,7 @@ sys.path.append(parent + "/src")
 from src.agenda_culturel.import_tasks.downloader import *
 from src.agenda_culturel.import_tasks.extractor import *
 from src.agenda_culturel.import_tasks.importer import *
-from src.agenda_culturel.import_tasks.custom_extractors import *
+from src.agenda_culturel.import_tasks.generic_extractors import *
@@ -21,7 +21,7 @@ sys.path.append(parent + "/src")
 from src.agenda_culturel.import_tasks.downloader import *
 from src.agenda_culturel.import_tasks.extractor import *
 from src.agenda_culturel.import_tasks.importer import *
-from src.agenda_culturel.import_tasks.custom_extractors import *
+from src.agenda_culturel.import_tasks.generic_extractors import *
@@ -14,8 +14,8 @@ from contextlib import contextmanager
 from .import_tasks.downloader import *
 from .import_tasks.extractor import *
 from .import_tasks.importer import *
-from .import_tasks.extractor_ical import *
 from .import_tasks.custom_extractors import *
+from .import_tasks.generic_extractors import *

 # Set the default Django settings module for the 'celery' program.
@@ -140,13 +140,13 @@ def run_recurrent_import_internal(rimport, downloader, req_id):

     if rimport.processor == RecurrentImport.PROCESSOR.ICAL:
-        extractor = ICALExtractor()
+        extractor = ical.ICALExtractor()
     elif rimport.processor == RecurrentImport.PROCESSOR.ICALNOBUSY:
-        extractor = ICALNoBusyExtractor()
+        extractor = ical.ICALNoBusyExtractor()
     elif rimport.processor == RecurrentImport.PROCESSOR.ICALNOVC:
-        extractor = ICALNoVCExtractor()
+        extractor = ical.ICALNoVCExtractor()
     elif rimport.processor == RecurrentImport.PROCESSOR.ICALNAIVETZ:
-        extractor = ICALNaiveTimezone()
+        extractor = ical.ICALNaiveTimezone()
     elif rimport.processor == RecurrentImport.PROCESSOR.LACOOPE:
         extractor = lacoope.CExtractor()
     elif rimport.processor == RecurrentImport.PROCESSOR.LACOMEDIE:
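Wiring a new source into this dispatch follows the same pattern; a hedged sketch, where `MYVENUE`/`myvenue` are hypothetical names, the `PROCESSOR` entry is assumed to exist, and the import paths are assumed from the package layout used in this commit:

```python
# Hypothetical wiring of a new custom source into the recurrent-import dispatch.
from agenda_culturel.models import RecurrentImport  # assumed app import path
from agenda_culturel.import_tasks.custom_extractors import lacoope, myvenue  # myvenue is hypothetical
from agenda_culturel.import_tasks.generic_extractors import ical


def build_extractor(processor):
    # Mirrors the if/elif chain of run_recurrent_import_internal above
    if processor == RecurrentImport.PROCESSOR.ICAL:
        return ical.ICALExtractor()
    elif processor == RecurrentImport.PROCESSOR.LACOOPE:
        return lacoope.CExtractor()
    elif processor == RecurrentImport.PROCESSOR.MYVENUE:  # hypothetical new entry
        return myvenue.CExtractor()
    return None
```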
@@ -1,4 +1,4 @@
-from ..generic_extractors import *
+from ..twosteps_extractor import *
 from bs4 import BeautifulSoup

 # A class dedicated to get events from Arachnée Concert
@@ -1,4 +1,4 @@
-from ..generic_extractors import *
+from ..twosteps_extractor import *
 from bs4 import BeautifulSoup
 from datetime import timedelta
@@ -1,4 +1,4 @@
-from ..generic_extractors import *
+from ..twosteps_extractor import *
 import json5
 from bs4 import BeautifulSoup
@@ -1,4 +1,5 @@
-from ..generic_extractors import *
+from ..twosteps_extractor import *
+from ..generic_extractors.ggcal_link import GGCalendar
 import re
 import json5
 from bs4 import BeautifulSoup
@@ -1,4 +1,4 @@
-from ..generic_extractors import *
+from ..twosteps_extractor import *
 import re
 from bs4 import BeautifulSoup
@@ -1,4 +1,4 @@
-from ..generic_extractors import *
+from ..twosteps_extractor import *
 from bs4 import BeautifulSoup
 from datetime import datetime
@@ -1,4 +1,4 @@
-from ..generic_extractors import *
+from ..twosteps_extractor import *
 from bs4 import BeautifulSoup

 # A class dedicated to get events from Le Fotomat'
@@ -1,4 +1,4 @@
-from ..generic_extractors import *
+from ..twosteps_extractor import *
 from bs4 import BeautifulSoup
 from datetime import datetime
@@ -1,4 +1,4 @@
-from ..generic_extractors import *
+from ..twosteps_extractor import *
 from bs4 import BeautifulSoup
 from datetime import datetime, date
@@ -284,8 +284,8 @@ class Extractor(ABC):
         return {"header": self.header, "events": self.events}

     def clean_url(url):
-        from .extractor_ical import ICALExtractor
-        from .extractor_facebook import FacebookEventExtractor
+        from .generic_extractors.ical import ICALExtractor
+        from .generic_extractors.fbevent import CExtractor as FacebookEventExtractor

         result = url
         for e in [ICALExtractor, FacebookEventExtractor]:
@@ -293,9 +293,9 @@ class Extractor(ABC):
         return result

     def get_default_extractors(single_event=False):
-        from .extractor_ical import ICALExtractor
-        from .extractor_facebook import FacebookEventExtractor
-        from .extractor_ggcal_link import GoogleCalendarLinkEventExtractor
+        from .generic_extractors.ical import ICALExtractor
+        from .generic_extractors.fbevent import CExtractor as FacebookEventExtractor
+        from .generic_extractors.ggcal_link import CExtractor as GoogleCalendarLinkEventExtractor

         if single_event:
             return [FacebookEventExtractor(), GoogleCalendarLinkEventExtractor(), EventNotFoundExtractor()]
@@ -1,88 +0,0 @@
-from datetime import datetime
-from bs4 import BeautifulSoup
-from urllib.parse import urlparse
-
-from .extractor import *
-from .generic_extractors import *
-
-import json
-
-import logging
-
-logger = logging.getLogger(__name__)
-
-
-class GoogleCalendarLinkEventExtractor(Extractor):
-    def __init__(self):
-        super().__init__()
-        self.possible_urls = ["https://calendar.google.com/calendar/", "https://addtocalendar.com/", "https://www.google.com/calendar/event"]
-
-    def guess_image(self, soup, url):
-        image = soup.find("meta", property="og:image")
-
-        if image is None:
-            for img in soup.select('img'):
-                if img.find_parent(name='nav'):
-                    continue
-                image = img["src"]
-                break
-        else:
-            image = image["content"]
-
-        if image.startswith("/"):
-            root_url = "https://" + urlparse(url).netloc + "/"
-            image = root_url + image
-
-        return image
-
-    def extract(
-        self, content, url, url_human=None, default_values=None, published=False
-    ):
-        soup = BeautifulSoup(content, "html.parser")
-
-        for ggu in self.possible_urls:
-            link_calendar = soup.select('a[href^="' + ggu + '"]')
-            if len(link_calendar) != 0:
-                gg_cal = GGCalendar(link_calendar[0]["href"])
-
-                if gg_cal.is_valid_event():
-                    start_day = gg_cal.start_day
-                    start_time = gg_cal.start_time
-                    description = gg_cal.description.replace(' ', '')
-                    end_day = gg_cal.end_day
-                    end_time = gg_cal.end_time
-                    location = gg_cal.location
-                    title = gg_cal.title
-                    url_human = url
-
-                    self.set_header(url)
-                    image = self.guess_image(soup, url)
-                    category = None
-
-                    self.add_event(
-                        default_values,
-                        title=title,
-                        category=category,
-                        start_day=start_day,
-                        location=location,
-                        description=description,
-                        tags=[],
-                        uuids=[url],
-                        recurrences=None,
-                        url_human=url_human,
-                        start_time=start_time,
-                        end_day=end_day,
-                        end_time=end_time,
-                        published=published,
-                        image=image,
-                    )
-
-                    break
-
-        return self.get_structure()
@@ -0,0 +1,7 @@
+from os.path import dirname, basename, isfile, join
+import glob
+
+modules = glob.glob(join(dirname(__file__), "*.py"))
+__all__ = [
+    basename(f)[:-3] for f in modules if isfile(f) and not f.endswith("__init__.py")
+]
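This new ```generic_extractors/__init__.py``` lists every sibling module in ```__all__```, so a star import binds each extractor module by name, which is what the updated tasks code relies on; a short sketch of the effect (module and class names taken from this commit):

```python
# With module names in __all__, a star import pulls in the submodules themselves:
from src.agenda_culturel.import_tasks.generic_extractors import *

extractor = ical.ICALExtractor()  # the ical module is bound by the star import
facebook = fbevent.CExtractor()   # likewise for fbevent
```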
@@ -1,4 +1,4 @@
-from ..generic_extractors import *
+from ..twosteps_extractor import *
 from bs4 import BeautifulSoup
 from datetime import datetime
@@ -5,7 +5,7 @@ import time as t
 from django.utils.translation import gettext_lazy as _

-from .extractor import *
+from ..extractor import *
 import json

 import logging
@@ -231,7 +231,7 @@ class FacebookEvent:
         result.append(clone.build_event(url_base + nb_e.elements["id"] + "/"))
         return result

-class FacebookEventExtractor(Extractor):
+class CExtractor(Extractor):

     def __init__(self):
         super().__init__()
@@ -259,11 +259,11 @@ class FacebookEventExtractor(Extractor):
             t.sleep(5)

     def prepare_2nd_extract(self):
-        FacebookEventExtractor.prepare_2nd_extract_dler(self.downloader)
+        CExtractor.prepare_2nd_extract_dler(self.downloader)

     def clean_url(url):
-        if FacebookEventExtractor.is_known_url(url, False):
+        if CExtractor.is_known_url(url, False):
             u = urlparse(url)
             result = "https://www.facebook.com" + u.path
@@ -1,5 +1,5 @@
-from ..generic_extractors import *
-from ..extractor_facebook import FacebookEvent, FacebookEventExtractor
+from ..twosteps_extractor import *
+from .fbevent import FacebookEvent
 import json5
 from bs4 import BeautifulSoup
 import json
@@ -0,0 +1,158 @@
+from datetime import datetime
+from bs4 import BeautifulSoup
+from urllib.parse import urlparse
+
+from ..extractor import *
+from ..twosteps_extractor import *
+
+import json
+
+import logging
+
+logger = logging.getLogger(__name__)
+
+
+class GGCalendar:
+    def __init__(self, url):
+        self.url = url
+        self.extract_info()
+
+    def filter_keys(params):
+        result = {}
+
+        for k, v in params.items():
+            if k.startswith('e[0]'):
+                result[k.replace('e[0][', '')[:-1]] = v
+            else:
+                result[k] = v
+
+        return result
+
+    def is_valid_event(self):
+        return self.start_day is not None and self.title is not None
+
+    def extract_info(self):
+        parsed_url = urlparse(self.url.replace("#", "%23"))
+        params = parse_qs(parsed_url.query)
+
+        params = GGCalendar.filter_keys(params)
+
+        self.location = params["location"][0] if "location" in params else ""
+        self.title = params["text"][0] if "text" in params else params["title"][0] if "title" in params else ""
+        self.description = params["description"][0] if "description" in params else params["details"][0] if "details" in params else ""
+        if self.description != "":
+            self.description = BeautifulSoup(self.description, "html.parser").text
+        if "dates" in params:
+            dates = [x.replace(" ", "+") for x in params["dates"][0].split("/")]
+            if len(dates) > 0:
+                date = parser.parse(dates[0])
+                self.start_day = date.date()
+                self.start_time = date.time()
+                if len(dates) == 2:
+                    date = parser.parse(dates[1])
+                    self.end_day = date.date()
+                    self.end_time = date.time()
+                else:
+                    self.end_day = None
+                    self.end_time = None
+        elif "date_start" in params:
+            date = parser.parse(params["date_start"][0])
+            self.start_day = date.date()
+            self.start_time = date.time()
+            if "date_end" in params:
+                dateend = parser.parse(params["date_end"][0])
+                if dateend != date:
+                    self.end_day = dateend.date()
+                    self.end_time = dateend.time()
+                else:
+                    self.end_day = None
+                    self.end_time = None
+                if self.start_time == datetime.time(0):
+                    self.start_time = None
+
+            else:
+                self.end_day = None
+                self.end_time = None
+        else:
+            raise Exception("Unable to find a date in google calendar URL")
+            self.start_day = None
+            self.start_time = None
+            self.end_day = None
+            self.end_time = None
+
+
+class CExtractor(Extractor):
+    def __init__(self):
+        super().__init__()
+        self.possible_urls = ["https://calendar.google.com/calendar/", "https://addtocalendar.com/", "https://www.google.com/calendar/event"]
+
+    def guess_image(self, soup, url):
+        image = soup.find("meta", property="og:image")
+
+        if image is None:
+            for img in soup.select('img'):
+                if img.find_parent(name='nav'):
+                    continue
+                image = img["src"]
+                break
+        else:
+            image = image["content"]
+
+        if image.startswith("/"):
+            root_url = "https://" + urlparse(url).netloc + "/"
+            image = root_url + image
+
+        return image
+
+    def extract(
+        self, content, url, url_human=None, default_values=None, published=False
+    ):
+        soup = BeautifulSoup(content, "html.parser")
+
+        for ggu in self.possible_urls:
+            link_calendar = soup.select('a[href^="' + ggu + '"]')
+            if len(link_calendar) != 0:
+                gg_cal = GGCalendar(link_calendar[0]["href"])
+
+                if gg_cal.is_valid_event():
+                    start_day = gg_cal.start_day
+                    start_time = gg_cal.start_time
+                    description = gg_cal.description.replace(' ', '')
+                    end_day = gg_cal.end_day
+                    end_time = gg_cal.end_time
+                    location = gg_cal.location
+                    title = gg_cal.title
+                    url_human = url
+
+                    self.set_header(url)
+                    image = self.guess_image(soup, url)
+                    category = None
+
+                    self.add_event(
+                        default_values,
+                        title=title,
+                        category=category,
+                        start_day=start_day,
+                        location=location,
+                        description=description,
+                        tags=[],
+                        uuids=[url],
+                        recurrences=None,
+                        url_human=url_human,
+                        start_time=start_time,
+                        end_day=end_day,
+                        end_time=end_time,
+                        published=published,
+                        image=image,
+                    )
+
+                    break
+
+        return self.get_structure()
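To see what `GGCalendar` recovers from such links, a hedged usage sketch; the URL is a made-up "add to calendar" template link of the shape the class parses (`text`, `dates`, `location`, `details` query parameters):

```python
# Hypothetical Google Calendar template link; the event data in it is invented.
url = ("https://calendar.google.com/calendar/render?action=TEMPLATE"
       "&text=Concert&dates=20240601T190000/20240601T210000"
       "&location=Clermont-Ferrand&details=Plein+air")

cal = GGCalendar(url)
assert cal.is_valid_event()  # both start_day and title were found
print(cal.title, cal.start_day, cal.start_time)  # Concert 2024-06-01 19:00:00
print(cal.end_day, cal.end_time, cal.location)   # 2024-06-01 21:00:00 Clermont-Ferrand
```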
@@ -8,7 +8,7 @@ from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning
 import pytz

-from .extractor import *
+from ..extractor import *

 from celery.utils.log import get_task_logger
@@ -1,4 +1,4 @@
-from ..generic_extractors import *
+from ..twosteps_extractor import *
 from bs4 import BeautifulSoup
 from datetime import datetime
 from urllib.parse import urlparse
@@ -1,4 +1,4 @@
-from ..generic_extractors import *
+from ..twosteps_extractor import *
 from bs4 import BeautifulSoup
@@ -1,6 +1,6 @@
 from .downloader import *
 from .extractor import *
-from .extractor_facebook import FacebookEventExtractor
+from .generic_extractors.fbevent import CExtractor as FacebookEventExtractor

 import logging
@@ -14,76 +14,6 @@ from django.utils.translation import gettext_lazy as _
 from dateutil import parser
 import datetime

-
-class GGCalendar:
-    def __init__(self, url):
-        self.url = url
-        self.extract_info()
-
-    def filter_keys(params):
-        result = {}
-
-        for k, v in params.items():
-            if k.startswith('e[0]'):
-                result[k.replace('e[0][', '')[:-1]] = v
-            else:
-                result[k] = v
-
-        return result
-
-    def is_valid_event(self):
-        return self.start_day is not None and self.title is not None
-
-    def extract_info(self):
-        parsed_url = urlparse(self.url.replace("#", "%23"))
-        params = parse_qs(parsed_url.query)
-
-        params = GGCalendar.filter_keys(params)
-
-        self.location = params["location"][0] if "location" in params else ""
-        self.title = params["text"][0] if "text" in params else params["title"][0] if "title" in params else ""
-        self.description = params["description"][0] if "description" in params else params["details"][0] if "details" in params else ""
-        if self.description != "":
-            self.description = BeautifulSoup(self.description, "html.parser").text
-        if "dates" in params:
-            dates = [x.replace(" ", "+") for x in params["dates"][0].split("/")]
-            if len(dates) > 0:
-                date = parser.parse(dates[0])
-                self.start_day = date.date()
-                self.start_time = date.time()
-                if len(dates) == 2:
-                    date = parser.parse(dates[1])
-                    self.end_day = date.date()
-                    self.end_time = date.time()
-                else:
-                    self.end_day = None
-                    self.end_time = None
-        elif "date_start" in params:
-            date = parser.parse(params["date_start"][0])
-            self.start_day = date.date()
-            self.start_time = date.time()
-            if "date_end" in params:
-                dateend = parser.parse(params["date_end"][0])
-                if dateend != date:
-                    self.end_day = dateend.date()
-                    self.end_time = dateend.time()
-                else:
-                    self.end_day = None
-                    self.end_time = None
-                if self.start_time == datetime.time(0):
-                    self.start_time = None
-
-            else:
-                self.end_day = None
-                self.end_time = None
-        else:
-            raise Exception("Unable to find a date in google calendar URL")
-            self.start_day = None
-            self.start_time = None
-            self.end_day = None
-            self.end_time = None
-
-
 # A class to extract events from URL with two steps:
 # - first build a list of urls where the events will be found
 # - then for each document downloaded from these urls, build the events
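The two-step pattern described by this trailing comment can be summarized as follows; a purely illustrative sketch, not the actual `TwoStepsExtractor` API from ```twosteps_extractor.py```:

```python
# Illustrative two-step extraction: all names and callbacks are placeholders.
def two_steps_extract(agenda_url, download, list_event_urls, build_event):
    events = []
    # Step 1: from the agenda page, build the list of per-event URLs
    for event_url in list_event_urls(download(agenda_url)):
        # Step 2: build one event from each downloaded event page
        events.append(build_event(download(event_url), event_url))
    return events
```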
@@ -36,7 +36,7 @@ import recurrence
 import copy
 import unicodedata
 from collections import defaultdict
-from .import_tasks.extractor_facebook import FacebookEventExtractor
+from .import_tasks.generic_extractors.fbevent import CExtractor as FacebookEventExtractor
 from .import_tasks.extractor import Extractor

 from django.template.defaultfilters import date as _date