* Add the recurring import (the cron part is still missing)
* Fix the French texts. Fix #73
@@ -1,249 +1,29 @@
#!/usr/bin/python3
# coding: utf-8

from abc import ABC, abstractmethod
from urllib.parse import urlparse
import urllib.request
import os
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
import icalendar
from icalendar import vDatetime
from datetime import datetime, date
import json
from bs4 import BeautifulSoup
import pickle
import sys

# getting the name of the directory
# where this file is present.
current = os.path.dirname(os.path.realpath(__file__))

# Getting the parent directory name
# where the current directory is present.
parent = os.path.dirname(current)

# adding the parent directory to
# the sys.path.
sys.path.append(parent)


class Downloader(ABC):

    def __init__(self):
        pass

    @abstractmethod
    def download(self, url):
        pass


class SimpleDownloader(Downloader):

    def __init__(self):
        super().__init__()

    def download(self, url):
        print("Downloading {}".format(url))

        try:
            resource = urllib.request.urlopen(url)
            data = resource.read().decode(resource.headers.get_content_charset())
            return data
        except Exception:
            # download or decoding failed: the caller gets None
            return None
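
# A minimal sketch of the simple downloader in use (illustrative only,
# not part of this commit; the URL is made up):
#
#     data = SimpleDownloader().download("https://example.org/agenda.ics")
#     if data is None:
#         ...  # download failed, nothing to extract
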
from src.agenda_culturel.import_tasks.downloader import *
from src.agenda_culturel.import_tasks.extractor import *
from src.agenda_culturel.import_tasks.importer import *
from src.agenda_culturel.import_tasks.extractor_ical import *

class ChromiumHeadlessDownloader(Downloader):

    def __init__(self):
        super().__init__()
        options = Options()
        options.add_argument("--headless=new")
        service = Service("/usr/bin/chromedriver")
        self.driver = webdriver.Chrome(service=service, options=options)

    def download(self, url):
        print("Download {}".format(url))

        self.driver.get(url)
        return self.driver.page_source
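
# A minimal usage sketch (illustrative, not part of this commit): the
# headless browser is useful for pages that build their content with
# JavaScript, which urllib alone cannot render.
#
#     downloader = ChromiumHeadlessDownloader()
#     html = downloader.download("https://example.org/agenda")  # hypothetical URL
#
# Note that the Chrome driver is never released here; a long-running
# caller would eventually call downloader.driver.quit().
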
class Extractor(ABC):

    def __init__(self):
        self.header = {}
        self.events = []

    @abstractmethod
    def extract(self, content, url, url_human = None):
        pass

    def set_header(self, url):
        self.header["url"] = url
        self.header["date"] = datetime.now()

    def clear_events(self):
        self.events = []

    def add_event(self, title, category, start_day, location, description, tags, uuid, recurrences=None, url_human=None, start_time=None, end_day=None, end_time=None, last_modified=None, published=False):
        if title is None:
            print("ERROR: cannot import an event without a name")
            return
        if start_day is None:
            print("ERROR: cannot import an event without a start day")
            return

        event = {
            "title": title,
            "category": category,
            "start_day": start_day,
            "uuid": uuid,
            "location": location,
            "description": description,
            "tags": tags,
            "published": published
        }
        if url_human is not None:
            event["url_human"] = url_human
        if start_time is not None:
            event["start_time"] = start_time
        if end_day is not None:
            event["end_day"] = end_day
        if end_time is not None:
            event["end_time"] = end_time

        if last_modified is not None:
            event["last_modified"] = last_modified

        if recurrences is not None:
            event["recurrences"] = recurrences

        self.events.append(event)

    def default_value_if_exists(self, default_values, key):
        return default_values[key] if default_values is not None and key in default_values else None

    def get_structure(self):
        return {"header": self.header, "events": self.events}
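
# For reference (derived from the code above, not part of this commit):
# get_structure() returns a plain dict of the form
#
#     {"header": {"url": ..., "date": ...},
#      "events": [{"title": ..., "category": ..., "start_day": ..., ...}, ...]}
#
# where the optional keys (start_time, end_day, end_time, url_human,
# last_modified, recurrences) are only present when data was found.
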
class ICALExtractor(Extractor):

    def __init__(self):
        super().__init__()

    def get_item_from_vevent(self, event, name, raw = False):
        try:
            r = event.decoded(name)
            if raw:
                return r
            else:
                return r.decode()
        except Exception:
            # missing or undecodable property: treat it as absent
            return None

    def get_dt_item_from_vevent(self, event, name):
        item = self.get_item_from_vevent(event, name, raw = True)

        day = None
        time = None

        if item is not None:
            if isinstance(item, datetime):
                day = item.date()
                time = item.time()
            elif isinstance(item, date):
                day = item
                time = None

        return day, time

    def extract(self, content, url, url_human = None, default_values = None, published = False):
        print("Extracting ical events from {}".format(url))
        self.set_header(url)
        self.clear_events()
        self.uuids = {}

        calendar = icalendar.Calendar.from_ical(content)

        for event in calendar.walk('VEVENT'):
            title = self.get_item_from_vevent(event, "SUMMARY")
            category = self.default_value_if_exists(default_values, "category")

            start_day, start_time = self.get_dt_item_from_vevent(event, "DTSTART")

            end_day, end_time = self.get_dt_item_from_vevent(event, "DTEND")

            location = self.get_item_from_vevent(event, "LOCATION")
            if location is None:
                location = self.default_value_if_exists(default_values, "location")

            description = self.get_item_from_vevent(event, "DESCRIPTION")
            if description is not None:
                soup = BeautifulSoup(description, "html.parser")
                delimiter = '\n'
                for line_break in soup.find_all('br'):
                    line_break.replace_with(delimiter)
                description = soup.get_text()

            uuid = self.get_item_from_vevent(event, "UID")

            if uuid is not None:
                if uuid in self.uuids:
                    self.uuids[uuid] += 1
                    uuid += ":{:04}".format(self.uuids[uuid] - 1)
                else:
                    self.uuids[uuid] = 1
            # fall back to the bare URL when the event has no UID
            event_url = url + "#" + uuid if uuid is not None else url

            tags = self.default_value_if_exists(default_values, "tags")

            last_modified = self.get_item_from_vevent(event, "LAST-MODIFIED", raw = True)

            recurrence_entries = {}
            for e in ["RRULE", "EXRULE", "EXDATE", "RDATE"]:
                i = self.get_item_from_vevent(event, e, raw = True)
                if i is not None:
                    recurrence_entries[e] = i

            if start_day is not None and len(recurrence_entries) != 0:
                recurrences = ""

                for k, r in recurrence_entries.items():
                    if isinstance(r, list):
                        recurrences += "\n".join([k + ":" + e.to_ical().decode() for e in r]) + "\n"
                    else:
                        recurrences += k + ":" + r.to_ical().decode() + "\n"
            else:
                recurrences = None

            self.add_event(title, category, start_day, location, description, tags, recurrences=recurrences, uuid=event_url, url_human=url_human, start_time=start_time, end_day=end_day, end_time=end_time, last_modified=last_modified, published=published)

        return self.get_structure()
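
# Example of the recurrence text built above (illustrative values; the
# real content depends on the source calendar): each recurrence property
# is serialized back to its iCalendar form, one property per line, e.g.
#
#     RRULE:FREQ=WEEKLY;BYDAY=TU
#     EXDATE:20240220T190000
#
# so that a consumer can re-parse it with any RFC 5545 aware library.
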
class URL2Events:

    def __init__(self, downloader, extractor):
        self.downloader = downloader
        self.extractor = extractor

    def process(self, url, url_human = None, cache = None, default_values = None, published = False):
        if cache and os.path.exists(cache):
            print("Loading cache ({})".format(cache))
            with open(cache) as f:
                content = f.read()
        else:
            content = self.downloader.download(url)

            # only cache a successful download
            if cache and content is not None:
                print("Saving cache ({})".format(cache))
                dir = os.path.dirname(cache)
                if dir != "" and not os.path.exists(dir):
                    os.makedirs(dir)
                with open(cache, "w") as text_file:
                    text_file.write(content)

        return self.extractor.extract(content, url, url_human, default_values, published)
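
# How the pieces fit together (a minimal sketch, not part of this commit;
# the URL and cache path are made up for illustration):
#
#     u2e = URL2Events(SimpleDownloader(), ICALExtractor())
#     structure = u2e.process("https://example.org/agenda.ics",
#                             cache="cache/agenda.ics", published=False)
#     print(len(structure["events"]), "events found")
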
if __name__ == "__main__":