L'import facebook partage maintenant son code avec les autres imports
Fix #80
This commit is contained in:
		
							
								
								
									
										3
									
								
								experimentations/.gitignore
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										3
									
								
								experimentations/.gitignore
									
									
									
									
										vendored
									
									
										Normal file
									
								
							@@ -0,0 +1,3 @@
 | 
			
		||||
*.json
 | 
			
		||||
*.html
 | 
			
		||||
*.ical
 | 
			
		||||
@@ -1,171 +1,40 @@
 | 
			
		||||
#!/usr/bin/python3
 | 
			
		||||
# coding: utf-8
 | 
			
		||||
 | 
			
		||||
import requests
 | 
			
		||||
import hashlib
 | 
			
		||||
import os
 | 
			
		||||
from selenium import webdriver
 | 
			
		||||
from selenium.webdriver.chrome.service import Service
 | 
			
		||||
from selenium.webdriver.chrome.options import Options
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
from bs4 import BeautifulSoup
 | 
			
		||||
 | 
			
		||||
import json
 | 
			
		||||
import sys
 | 
			
		||||
 | 
			
		||||
class SimpleEvent:
 | 
			
		||||
# getting the name of the directory
 | 
			
		||||
# where the this file is present.
 | 
			
		||||
current = os.path.dirname(os.path.realpath(__file__))
 | 
			
		||||
 
 | 
			
		||||
# Getting the parent directory name
 | 
			
		||||
# where the current directory is present.
 | 
			
		||||
parent = os.path.dirname(current)
 | 
			
		||||
 
 | 
			
		||||
# adding the parent directory to 
 | 
			
		||||
# the sys.path.
 | 
			
		||||
sys.path.append(parent)
 | 
			
		||||
 | 
			
		||||
    def __init__(self, data):
 | 
			
		||||
        self.elements = {}
 | 
			
		||||
 | 
			
		||||
        for key in ["id", "start_timestamp", "end_timestamp"]:
 | 
			
		||||
            self.elements[key] = data[key] if key in data else None
 | 
			
		||||
 | 
			
		||||
        if "parent_event" in data:
 | 
			
		||||
            self.parent = SimpleEvent(data["parent_event"])
 | 
			
		||||
from src.agenda_culturel.import_tasks.downloader import *
 | 
			
		||||
from src.agenda_culturel.import_tasks.extractor import *
 | 
			
		||||
from src.agenda_culturel.import_tasks.importer import *
 | 
			
		||||
from src.agenda_culturel.import_tasks.extractor_facebook import *
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class Event:
 | 
			
		||||
 | 
			
		||||
    name = "event"
 | 
			
		||||
    keys = [
 | 
			
		||||
            ["start_time_formatted", 'start_timestamp', 
 | 
			
		||||
             'is_past', 
 | 
			
		||||
             "name", 
 | 
			
		||||
             "price_info", 
 | 
			
		||||
             "cover_media_renderer", 
 | 
			
		||||
             "event_creator", 
 | 
			
		||||
             "id", 
 | 
			
		||||
             "day_time_sentence", 
 | 
			
		||||
             "event_place", 
 | 
			
		||||
             "comet_neighboring_siblings"],
 | 
			
		||||
            ["event_description"],
 | 
			
		||||
            ["start_timestamp", "end_timestamp"]
 | 
			
		||||
    ]
 | 
			
		||||
    rules = {
 | 
			
		||||
        "event_description": { "description": ["text"]},
 | 
			
		||||
        "cover_media_renderer": {"image_alt": ["cover_photo", "photo", "accessibility_caption"], "image": ["cover_photo", "photo", "full_image", "uri"]},
 | 
			
		||||
        "event_creator": { "event_creator_name": ["name"], "event_creator_url": ["url"] },
 | 
			
		||||
        "event_place": {"event_place_name": ["name"] }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    def __init__(self, i, event):
 | 
			
		||||
        self.fragments = {}
 | 
			
		||||
        self.elements = {}
 | 
			
		||||
        self.neighbor_events = None
 | 
			
		||||
        self.possible_end_timestamp = []
 | 
			
		||||
        self.add_fragment(i, event)
 | 
			
		||||
 | 
			
		||||
    def add_fragment(self, i, event):
 | 
			
		||||
        self.fragments[i] = event
 | 
			
		||||
 | 
			
		||||
        if Event.keys[i] == ["start_timestamp", "end_timestamp"]:
 | 
			
		||||
            self.get_possible_end_timestamp(i, event)
 | 
			
		||||
        else:
 | 
			
		||||
            for k in Event.keys[i]:
 | 
			
		||||
                if k == "comet_neighboring_siblings":
 | 
			
		||||
                    self.get_neighbor_events(event[k])
 | 
			
		||||
                elif k in Event.rules:
 | 
			
		||||
                    for nk, rule in Event.rules[k].items():
 | 
			
		||||
                        c = event[k]
 | 
			
		||||
                        for ki in rule:
 | 
			
		||||
                            c = c[ki]
 | 
			
		||||
                        self.elements[nk] = c
 | 
			
		||||
                else:
 | 
			
		||||
                    self.elements[k] = event[k]
 | 
			
		||||
 | 
			
		||||
    def get_possible_end_timestamp(self, i, data):
 | 
			
		||||
        self.possible_end_timestamp.append(dict((k, data[k]) for k in Event.keys[i]))
 | 
			
		||||
 | 
			
		||||
    def get_neighbor_events(self, data):
 | 
			
		||||
        self.neighbor_events = [SimpleEvent(d) for d in data]
 | 
			
		||||
 | 
			
		||||
    def __str__(self):
 | 
			
		||||
        return str(self.elements) + "\n Neighbors: " + ", ".join([ne.elements["id"] for ne in self.neighbor_events])
 | 
			
		||||
 | 
			
		||||
    def consolidate_current_event(self):
 | 
			
		||||
        if self.neighbor_events is not None and "id" in self.elements and "end_timestamp" not in self.elements:
 | 
			
		||||
            id = self.elements["id"]
 | 
			
		||||
            for ne in self.neighbor_events:
 | 
			
		||||
                if ne.elements["id"] == id:
 | 
			
		||||
                    self.elements["end_timestamp"] = ne.elements["end_timestamp"]
 | 
			
		||||
        
 | 
			
		||||
        if "end_timestamp" not in self.elements and len(self.possible_end_timestamp) != 0:
 | 
			
		||||
            for s in self.possible_end_timestamp:
 | 
			
		||||
                if s["start_timestamp"] == self.elements["start_timestamp"]:
 | 
			
		||||
                    self.elements["end_timestamp"] = s["end_timestamp"]
 | 
			
		||||
                    break
 | 
			
		||||
 | 
			
		||||
    def find_event_fragment_in_array(array, event, first = True):
 | 
			
		||||
        if isinstance(array, dict):
 | 
			
		||||
 | 
			
		||||
            seen = False
 | 
			
		||||
            for i, ks in enumerate(Event.keys):
 | 
			
		||||
                if len(ks) == len([k for k in ks if k in array]):
 | 
			
		||||
                    seen = True
 | 
			
		||||
                    if event is None:
 | 
			
		||||
                            event = Event(i, array)
 | 
			
		||||
                    else:
 | 
			
		||||
                        event.add_fragment(i, array)
 | 
			
		||||
                    # only consider the first of Event.keys
 | 
			
		||||
                    break
 | 
			
		||||
            if not seen:
 | 
			
		||||
                for k in array:
 | 
			
		||||
                    event = Event.find_event_fragment_in_array(array[k], event, False)
 | 
			
		||||
        elif isinstance(array, list):
 | 
			
		||||
            for e in array:
 | 
			
		||||
                event = Event.find_event_fragment_in_array(e, event, False)
 | 
			
		||||
 | 
			
		||||
        if event is not None and first:
 | 
			
		||||
            event.consolidate_current_event()
 | 
			
		||||
        return event
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#url="https://www.facebook.com/events/ical/export/?eid=2294200007432315"
 | 
			
		||||
#url="https://www.facebook.com/events/2294199997432316/2294200007432315/"
 | 
			
		||||
#url="https://www.facebook.com/events/635247792092358/"
 | 
			
		||||
url="https://www.facebook.com/events/872781744074648"
 | 
			
		||||
url="https://www.facebook.com/events/1432798543943663?"
 | 
			
		||||
#url_cal = "https://www.facebook.com/events/ical/export/?eid=993406668581410"
 | 
			
		||||
#url="https://jmtrivial.info"
 | 
			
		||||
 | 
			
		||||
cachedir = "cache"
 | 
			
		||||
result = hashlib.md5(url.encode())
 | 
			
		||||
hash = result.hexdigest()
 | 
			
		||||
if __name__ == "__main__":
 | 
			
		||||
 | 
			
		||||
filename = os.path.join(cachedir, hash + ".html")
 | 
			
		||||
    u2e = URL2Events(ChromiumHeadlessDownloader(), FacebookEventExtractor(single_event=True))
 | 
			
		||||
    url="https://www.facebook.com/events/872781744074648"
 | 
			
		||||
 | 
			
		||||
if os.path.isfile(filename):
 | 
			
		||||
    # print("Use cache")
 | 
			
		||||
    with open(filename) as f:
 | 
			
		||||
        doc = "\n".join(f.readlines())
 | 
			
		||||
else:
 | 
			
		||||
    print("Download page")
 | 
			
		||||
    events = u2e.process(url, cache = "fb.html", published = True)
 | 
			
		||||
 | 
			
		||||
    options = Options()
 | 
			
		||||
    options.add_argument("--headless=new")
 | 
			
		||||
    service = Service("/usr/bin/chromedriver")
 | 
			
		||||
 | 
			
		||||
    driver = webdriver.Chrome(service=service, options=options)
 | 
			
		||||
    driver.get(url)
 | 
			
		||||
    doc = driver.page_source
 | 
			
		||||
    driver.quit()
 | 
			
		||||
 | 
			
		||||
    dir = os.path.dirname(filename)
 | 
			
		||||
    if not os.path.exists(dir):
 | 
			
		||||
        os.makedirs(dir)
 | 
			
		||||
    with open(filename, "w") as text_file:
 | 
			
		||||
        text_file.write(doc)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
soup = BeautifulSoup(doc)
 | 
			
		||||
 | 
			
		||||
event = None
 | 
			
		||||
for json_script in soup.find_all('script', type="application/json"):
 | 
			
		||||
    json_txt = json_script.get_text()
 | 
			
		||||
    json_struct = json.loads(json_txt)
 | 
			
		||||
 | 
			
		||||
    event = Event.find_event_fragment_in_array(json_struct, event)
 | 
			
		||||
 | 
			
		||||
print(event)
 | 
			
		||||
    exportfile = "event-facebook.json"
 | 
			
		||||
    print("Saving events to file {}".format(exportfile))
 | 
			
		||||
    with open(exportfile, "w") as f:
 | 
			
		||||
        json.dump(events, f, indent=4, default=str)
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -5,8 +5,6 @@ from celery import Celery
 | 
			
		||||
from celery.schedules import crontab
 | 
			
		||||
from celery.utils.log import get_task_logger
 | 
			
		||||
 | 
			
		||||
from .extractors import ExtractorAllURLs 
 | 
			
		||||
 | 
			
		||||
from .import_tasks.downloader import *
 | 
			
		||||
from .import_tasks.extractor import *
 | 
			
		||||
from .import_tasks.importer import *
 | 
			
		||||
@@ -53,7 +51,7 @@ def close_import_task(taskid, success, error_message, importer):
 | 
			
		||||
@app.task(bind=True)
 | 
			
		||||
def import_events_from_json(self, json):
 | 
			
		||||
    from agenda_culturel.models import Event, BatchImportation
 | 
			
		||||
    from .importation import EventsImporter
 | 
			
		||||
    from .db_importer import DBImporterEvents
 | 
			
		||||
 | 
			
		||||
    # create a batch importation
 | 
			
		||||
    importation = BatchImportation(celery_id=self.request.id)
 | 
			
		||||
@@ -63,7 +61,7 @@ def import_events_from_json(self, json):
 | 
			
		||||
 | 
			
		||||
    logger.info("Import events from json: {}".format(self.request.id))
 | 
			
		||||
 | 
			
		||||
    importer = EventsImporter(self.request.id)
 | 
			
		||||
    importer = DBImporterEvents(self.request.id)
 | 
			
		||||
 | 
			
		||||
    #try:
 | 
			
		||||
    success, error_message = importer.import_events(json)
 | 
			
		||||
@@ -78,7 +76,7 @@ def import_events_from_json(self, json):
 | 
			
		||||
@app.task(bind=True)
 | 
			
		||||
def run_recurrent_import(self, pk):
 | 
			
		||||
    from agenda_culturel.models import RecurrentImport, BatchImportation
 | 
			
		||||
    from .importation import EventsImporter
 | 
			
		||||
    from .db_importer import DBImporterEvents
 | 
			
		||||
    from django.shortcuts import get_object_or_404
 | 
			
		||||
 | 
			
		||||
    logger.info("Run recurrent import: {}".format(self.request.id))
 | 
			
		||||
@@ -92,7 +90,7 @@ def run_recurrent_import(self, pk):
 | 
			
		||||
    importation.save()
 | 
			
		||||
 | 
			
		||||
    # create an importer
 | 
			
		||||
    importer = EventsImporter(self.request.id)
 | 
			
		||||
    importer = DBImporterEvents(self.request.id)
 | 
			
		||||
 | 
			
		||||
    # prepare downloading and extracting processes
 | 
			
		||||
    downloader = SimpleDownloader() if rimport.downloader == RecurrentImport.DOWNLOADER.SIMPLE else ChromiumHeadlessDownloader()
 | 
			
		||||
 
 | 
			
		||||
@@ -7,7 +7,7 @@ import logging
 | 
			
		||||
logger = logging.getLogger(__name__)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class EventsImporter:
 | 
			
		||||
class DBImporterEvents:
 | 
			
		||||
 | 
			
		||||
    def __init__(self, celery_id):
 | 
			
		||||
        self.celery_id = celery_id
 | 
			
		||||
@@ -37,14 +37,18 @@ class ChromiumHeadlessDownloader(Downloader):
 | 
			
		||||
 | 
			
		||||
    def __init__(self):
 | 
			
		||||
        super().__init__()
 | 
			
		||||
        options = Options()
 | 
			
		||||
        options.add_argument("--headless=new")
 | 
			
		||||
        service = Service("/usr/bin/chromedriver")
 | 
			
		||||
        self.driver = webdriver.Chrome(service=service, options=options)
 | 
			
		||||
        self.options = Options()
 | 
			
		||||
        self.options.add_argument("--headless=new")
 | 
			
		||||
        self.options.add_argument("--disable-dev-shm-usage")
 | 
			
		||||
        self.options.add_argument("--no-sandbox")
 | 
			
		||||
        self.service = Service("/usr/bin/chromedriver")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    def download(self, url):
 | 
			
		||||
        print("Download {}".format(url))
 | 
			
		||||
        self.driver = webdriver.Chrome(service=self.service, options=self.options)
 | 
			
		||||
 | 
			
		||||
        self.driver.get(url)
 | 
			
		||||
        return driver.page_source
 | 
			
		||||
        doc = self.driver.page_source
 | 
			
		||||
        self.driver.quit()
 | 
			
		||||
        return doc
 | 
			
		||||
 
 | 
			
		||||
@@ -13,6 +13,10 @@ class Extractor(ABC):
 | 
			
		||||
    def extract(self, content, url, url_human = None):
 | 
			
		||||
        pass
 | 
			
		||||
 | 
			
		||||
    @abstractmethod
 | 
			
		||||
    def clean_url(url):
 | 
			
		||||
        pass
 | 
			
		||||
 | 
			
		||||
    def set_header(self, url):
 | 
			
		||||
        self.header["url"] = url
 | 
			
		||||
        self.header["date"] = datetime.now()
 | 
			
		||||
@@ -20,7 +24,7 @@ class Extractor(ABC):
 | 
			
		||||
    def clear_events(self):
 | 
			
		||||
        self.events = []
 | 
			
		||||
 | 
			
		||||
    def add_event(self, title, category, start_day, location, description, tags, uuid, recurrences=None, url_human=None, start_time=None, end_day=None, end_time=None, last_modified=None, published=False):
 | 
			
		||||
    def add_event(self, title, category, start_day, location, description, tags, uuid, recurrences=None, url_human=None, start_time=None, end_day=None, end_time=None, last_modified=None, published=False, image=None, image_alt=None):
 | 
			
		||||
        if title is None:
 | 
			
		||||
            print("ERROR: cannot import an event without name")
 | 
			
		||||
            return
 | 
			
		||||
@@ -36,8 +40,11 @@ class Extractor(ABC):
 | 
			
		||||
            "location": location,
 | 
			
		||||
            "description": description,
 | 
			
		||||
            "tags": tags,
 | 
			
		||||
            "published": published
 | 
			
		||||
            "published": published,
 | 
			
		||||
            "image": image,
 | 
			
		||||
            "image_alt": image_alt
 | 
			
		||||
        }
 | 
			
		||||
        # TODO: pourquoi url_human et non reference_url
 | 
			
		||||
        if url_human is not None:
 | 
			
		||||
            event["url_human"] = url_human
 | 
			
		||||
        if start_time is not None:
 | 
			
		||||
@@ -60,3 +67,21 @@ class Extractor(ABC):
 | 
			
		||||
 | 
			
		||||
    def get_structure(self):
 | 
			
		||||
        return { "header": self.header, "events": self.events}
 | 
			
		||||
 | 
			
		||||
    def clean_url(url):
 | 
			
		||||
        from .extractor_ical import ICALExtractor
 | 
			
		||||
        from .extractor_facebook import FacebookEventExtractor
 | 
			
		||||
 | 
			
		||||
        result = url
 | 
			
		||||
        for e in [ICALExtractor, FacebookEventExtractor]:
 | 
			
		||||
            result = e.clean_url(result)
 | 
			
		||||
        return result
 | 
			
		||||
 | 
			
		||||
    def get_default_extractors(single_event=False):
 | 
			
		||||
        from .extractor_ical import ICALExtractor
 | 
			
		||||
        from .extractor_facebook import FacebookEventExtractor
 | 
			
		||||
 | 
			
		||||
        if single_event:
 | 
			
		||||
            return [FacebookEventExtractor(single_event=True)]
 | 
			
		||||
        else:
 | 
			
		||||
            return [ICALExtractor(), FacebookEventExtractor(single_event=False)]
 | 
			
		||||
@@ -1,65 +1,18 @@
 | 
			
		||||
from abc import ABC, abstractmethod
 | 
			
		||||
import icalendar
 | 
			
		||||
import warnings
 | 
			
		||||
 | 
			
		||||
from django.db import models
 | 
			
		||||
 | 
			
		||||
from selenium import webdriver
 | 
			
		||||
from selenium.webdriver.chrome.service import Service
 | 
			
		||||
from selenium.webdriver.chrome.options import Options
 | 
			
		||||
 | 
			
		||||
import urllib.request
 | 
			
		||||
from tempfile import NamedTemporaryFile
 | 
			
		||||
from urllib.parse import urlparse
 | 
			
		||||
import os
 | 
			
		||||
 | 
			
		||||
from bs4 import BeautifulSoup
 | 
			
		||||
 | 
			
		||||
import json
 | 
			
		||||
from datetime import datetime, date
 | 
			
		||||
from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning
 | 
			
		||||
from urllib.parse import urlparse
 | 
			
		||||
 | 
			
		||||
from .extractor import *
 | 
			
		||||
import json
 | 
			
		||||
 | 
			
		||||
import logging
 | 
			
		||||
logger = logging.getLogger(__name__)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class Extractor:
 | 
			
		||||
 | 
			
		||||
    name = None
 | 
			
		||||
 | 
			
		||||
    @abstractmethod
 | 
			
		||||
    def is_known_url(url):
 | 
			
		||||
        pass
 | 
			
		||||
 | 
			
		||||
    @abstractmethod
 | 
			
		||||
    def extract(url):
 | 
			
		||||
        pass
 | 
			
		||||
 | 
			
		||||
    @abstractmethod
 | 
			
		||||
    def clean_url(url):
 | 
			
		||||
        pass
 | 
			
		||||
 | 
			
		||||
    def download(url):
 | 
			
		||||
        try:
 | 
			
		||||
            options = Options()
 | 
			
		||||
            options.add_argument("--headless=new")
 | 
			
		||||
            options.add_argument("--disable-dev-shm-usage")
 | 
			
		||||
            options.add_argument("--no-sandbox")
 | 
			
		||||
            service = Service("/usr/bin/chromedriver")
 | 
			
		||||
 | 
			
		||||
            driver = webdriver.Chrome(service=service, options=options)
 | 
			
		||||
            driver.get(url)
 | 
			
		||||
            doc = driver.page_source
 | 
			
		||||
            driver.quit()
 | 
			
		||||
            return doc
 | 
			
		||||
        except Exception as e:
 | 
			
		||||
            logger.error(e)
 | 
			
		||||
            return None
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class ExtractorFacebook(Extractor):
 | 
			
		||||
 | 
			
		||||
    name = "Facebook"
 | 
			
		||||
class FacebookEventExtractor(Extractor):
 | 
			
		||||
 | 
			
		||||
    class SimpleFacebookEvent:
 | 
			
		||||
 | 
			
		||||
@@ -70,7 +23,7 @@ class ExtractorFacebook(Extractor):
 | 
			
		||||
                self.elements[key] = data[key] if key in data else None
 | 
			
		||||
 | 
			
		||||
            if "parent_event" in data:
 | 
			
		||||
                self.parent = ExtractorFacebook.SimpleFacebookEvent(data["parent_event"])
 | 
			
		||||
                self.parent = FacebookEventExtractor.SimpleFacebookEvent(data["parent_event"])
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    class FacebookEvent:
 | 
			
		||||
@@ -119,14 +72,14 @@ class ExtractorFacebook(Extractor):
 | 
			
		||||
        def add_fragment(self, i, event):
 | 
			
		||||
            self.fragments[i] = event
 | 
			
		||||
 | 
			
		||||
            if ExtractorFacebook.FacebookEvent.keys[i] == ["start_timestamp", "end_timestamp"]:
 | 
			
		||||
            if FacebookEventExtractor.FacebookEvent.keys[i] == ["start_timestamp", "end_timestamp"]:
 | 
			
		||||
                self.get_possible_end_timestamp(i, event)
 | 
			
		||||
            else:
 | 
			
		||||
                for k in ExtractorFacebook.FacebookEvent.keys[i]:
 | 
			
		||||
                for k in FacebookEventExtractor.FacebookEvent.keys[i]:
 | 
			
		||||
                    if k == "comet_neighboring_siblings":
 | 
			
		||||
                        self.get_neighbor_events(event[k])
 | 
			
		||||
                    elif k in ExtractorFacebook.FacebookEvent.rules:
 | 
			
		||||
                        for nk, rule in ExtractorFacebook.FacebookEvent.rules[k].items():
 | 
			
		||||
                    elif k in FacebookEventExtractor.FacebookEvent.rules:
 | 
			
		||||
                        for nk, rule in FacebookEventExtractor.FacebookEvent.rules[k].items():
 | 
			
		||||
                            error = False
 | 
			
		||||
                            c = event[k]
 | 
			
		||||
                            for ki in rule:
 | 
			
		||||
@@ -141,11 +94,11 @@ class ExtractorFacebook(Extractor):
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
        def get_possible_end_timestamp(self, i, data):
 | 
			
		||||
            self.possible_end_timestamp.append(dict((k, data[k]) for k in ExtractorFacebook.FacebookEvent.keys[i]))
 | 
			
		||||
            self.possible_end_timestamp.append(dict((k, data[k]) for k in FacebookEventExtractor.FacebookEvent.keys[i]))
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
        def get_neighbor_events(self, data):
 | 
			
		||||
            self.neighbor_events = [ExtractorFacebook.SimpleFacebookEvent(d) for d in data]
 | 
			
		||||
            self.neighbor_events = [FacebookEventExtractor.SimpleFacebookEvent(d) for d in data]
 | 
			
		||||
 | 
			
		||||
        def __str__(self):
 | 
			
		||||
            return str(self.elements) + "\n Neighbors: " + ", ".join([ne.elements["id"] for ne in self.neighbor_events])
 | 
			
		||||
@@ -168,21 +121,21 @@ class ExtractorFacebook(Extractor):
 | 
			
		||||
            if isinstance(array, dict):
 | 
			
		||||
 | 
			
		||||
                seen = False
 | 
			
		||||
                for i, ks in enumerate(ExtractorFacebook.FacebookEvent.keys):
 | 
			
		||||
                for i, ks in enumerate(FacebookEventExtractor.FacebookEvent.keys):
 | 
			
		||||
                    if len(ks) == len([k for k in ks if k in array]):
 | 
			
		||||
                        seen = True
 | 
			
		||||
                        if event is None:
 | 
			
		||||
                                event = ExtractorFacebook.FacebookEvent(i, array)
 | 
			
		||||
                                event = FacebookEventExtractor.FacebookEvent(i, array)
 | 
			
		||||
                        else:
 | 
			
		||||
                            event.add_fragment(i, array)
 | 
			
		||||
                        # only consider the first of FacebookEvent.keys
 | 
			
		||||
                        break
 | 
			
		||||
                if not seen:
 | 
			
		||||
                    for k in array:
 | 
			
		||||
                        event = ExtractorFacebook.FacebookEvent.find_event_fragment_in_array(array[k], event, False)
 | 
			
		||||
                        event = FacebookEventExtractor.FacebookEvent.find_event_fragment_in_array(array[k], event, False)
 | 
			
		||||
            elif isinstance(array, list):
 | 
			
		||||
                for e in array:
 | 
			
		||||
                    event = ExtractorFacebook.FacebookEvent.find_event_fragment_in_array(e, event, False)
 | 
			
		||||
                    event = FacebookEventExtractor.FacebookEvent.find_event_fragment_in_array(e, event, False)
 | 
			
		||||
 | 
			
		||||
            if event is not None and first:
 | 
			
		||||
                event.consolidate_current_event()
 | 
			
		||||
@@ -190,28 +143,33 @@ class ExtractorFacebook(Extractor):
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
        def build_event(self, url):
 | 
			
		||||
            from .models import Event
 | 
			
		||||
 | 
			
		||||
            image = self.get_element("image")
 | 
			
		||||
 | 
			
		||||
            return {
 | 
			
		||||
                "title": self.get_element("name"), 
 | 
			
		||||
                "category": None, 
 | 
			
		||||
                "start_day": self.get_element_date("start_timestamp"), 
 | 
			
		||||
                "location": self.get_element("event_place_name"), 
 | 
			
		||||
                "description": self.get_element("description"), 
 | 
			
		||||
                "tags": [], 
 | 
			
		||||
                "uuid": url, 
 | 
			
		||||
                "url_human": url, 
 | 
			
		||||
                "start_time": self.get_element_time("start_timestamp"), 
 | 
			
		||||
                "end_day": self.get_element_date("end_timestamp"), 
 | 
			
		||||
                "end_time": self.get_element_time("end_timestamp"), 
 | 
			
		||||
                "image": self.get_element("image"),
 | 
			
		||||
                "image_alt": self.get_element("image"),
 | 
			
		||||
            }
 | 
			
		||||
 | 
			
		||||
            return Event(title=self.get_element("name"), 
 | 
			
		||||
                status=Event.STATUS.DRAFT,
 | 
			
		||||
                start_day=self.get_element_date("start_timestamp"),
 | 
			
		||||
                start_time=self.get_element_time("start_timestamp"),
 | 
			
		||||
                end_day=self.get_element_date("end_timestamp"),
 | 
			
		||||
                end_time=self.get_element_time("end_timestamp"),
 | 
			
		||||
                location=self.get_element("event_place_name"),
 | 
			
		||||
                description=self.get_element("description"),
 | 
			
		||||
                image=self.get_element("image"),
 | 
			
		||||
                image_alt=self.get_element("image_alt"),
 | 
			
		||||
                uuids=[url],
 | 
			
		||||
                reference_urls=[url])
 | 
			
		||||
 | 
			
		||||
    def __init__(self, single_event=False):
 | 
			
		||||
        self.single_event = single_event
 | 
			
		||||
        super().__init__()
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    def clean_url(url):
 | 
			
		||||
 | 
			
		||||
        if ExtractorFacebook.is_known_url(url):
 | 
			
		||||
        if FacebookEventExtractor.is_known_url(url):
 | 
			
		||||
            u = urlparse(url)
 | 
			
		||||
            return "https://www.facebook.com" + u.path
 | 
			
		||||
        else:
 | 
			
		||||
@@ -222,46 +180,23 @@ class ExtractorFacebook(Extractor):
 | 
			
		||||
        return u.netloc in ["facebook.com", "www.facebook.com", "m.facebook.com"]
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    def process_page(txt, url):
 | 
			
		||||
    def extract(self, content, url, url_human = None, default_values = None, published = False):
 | 
			
		||||
        # NOTE: this method does not use url_human = None and default_values = None
 | 
			
		||||
 | 
			
		||||
        # get step by step all information from the content
 | 
			
		||||
        fevent = None
 | 
			
		||||
        soup = BeautifulSoup(txt, "html.parser")
 | 
			
		||||
        soup = BeautifulSoup(content, "html.parser")
 | 
			
		||||
        for json_script in soup.find_all('script', type="application/json"):
 | 
			
		||||
            json_txt = json_script.get_text()
 | 
			
		||||
            json_struct = json.loads(json_txt)
 | 
			
		||||
            fevent = ExtractorFacebook.FacebookEvent.find_event_fragment_in_array(json_struct, fevent)
 | 
			
		||||
            fevent = FacebookEventExtractor.FacebookEvent.find_event_fragment_in_array(json_struct, fevent)
 | 
			
		||||
 | 
			
		||||
        if fevent is not None:
 | 
			
		||||
            logger.info("Facebook event: " + str(fevent))
 | 
			
		||||
            result = fevent.build_event(url)
 | 
			
		||||
            return result
 | 
			
		||||
            self.set_header(url)
 | 
			
		||||
            event = fevent.build_event(url)
 | 
			
		||||
            logger.warning("published: " + str(published))
 | 
			
		||||
            event["published"] = published
 | 
			
		||||
            self.add_event(**event)
 | 
			
		||||
            return self.get_structure()
 | 
			
		||||
            
 | 
			
		||||
        return None
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class ExtractorAllURLs:
 | 
			
		||||
 | 
			
		||||
    extractors = [ExtractorFacebook]
 | 
			
		||||
 | 
			
		||||
    def clean_url(url):
 | 
			
		||||
        result = url
 | 
			
		||||
        for e in ExtractorAllURLs.extractors:
 | 
			
		||||
            result = e.clean_url(result)
 | 
			
		||||
        return result
 | 
			
		||||
 | 
			
		||||
    def extract(url):
 | 
			
		||||
        logger.info("Run extraction")
 | 
			
		||||
 | 
			
		||||
        txt = Extractor.download(url)
 | 
			
		||||
        if txt is None:
 | 
			
		||||
            logger.info("Cannot download url")
 | 
			
		||||
            return None
 | 
			
		||||
 | 
			
		||||
        for e in ExtractorAllURLs.extractors:
 | 
			
		||||
            result = e.process_page(txt, url)
 | 
			
		||||
            if result is not None:
 | 
			
		||||
                return result
 | 
			
		||||
            else:
 | 
			
		||||
                logger.info("Not a " + e.name + " link")
 | 
			
		||||
 | 
			
		||||
        return None
 | 
			
		||||
        return None
 | 
			
		||||
@@ -39,6 +39,9 @@ class ICALExtractor(Extractor):
 | 
			
		||||
 | 
			
		||||
        return day, time
 | 
			
		||||
 | 
			
		||||
    def clean_url(url):
 | 
			
		||||
        return url
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    def extract(self, content, url, url_human = None, default_values = None, published = False):
 | 
			
		||||
        warnings.filterwarnings("ignore", category=MarkupResemblesLocatorWarning)
 | 
			
		||||
 
 | 
			
		||||
@@ -6,13 +6,13 @@ from .extractor import *
 | 
			
		||||
 | 
			
		||||
class URL2Events:
 | 
			
		||||
 | 
			
		||||
    def __init__(self, downloader, extractor):
 | 
			
		||||
    def __init__(self, downloader = SimpleDownloader(), extractor = None, single_event=False):
 | 
			
		||||
 | 
			
		||||
        self.downloader = downloader
 | 
			
		||||
        self.extractor = extractor
 | 
			
		||||
        self.single_event = single_event
 | 
			
		||||
 | 
			
		||||
    def process(self, url, url_human = None, cache = None, default_values = None, published = False):
 | 
			
		||||
 | 
			
		||||
    def get_content(self, url, cache = None):
 | 
			
		||||
        if cache and os.path.exists(cache):
 | 
			
		||||
            print("Loading cache ({})".format(cache))
 | 
			
		||||
            with open(cache) as f:
 | 
			
		||||
@@ -27,5 +27,25 @@ class URL2Events:
 | 
			
		||||
                    os.makedirs(dir)
 | 
			
		||||
                with open(cache, "w") as text_file:
 | 
			
		||||
                    text_file.write(content)
 | 
			
		||||
        return content
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    def process(self, url, url_human = None, cache = None, default_values = None, published = False):
 | 
			
		||||
        content = self.get_content(url, cache)
 | 
			
		||||
 | 
			
		||||
        if content is None:
 | 
			
		||||
            return None
 | 
			
		||||
 | 
			
		||||
        if self.extractor is not None:
 | 
			
		||||
            return self.extractor.extract(content, url, url_human, default_values, published)
 | 
			
		||||
        else:
 | 
			
		||||
            # if the extractor is not defined, use a list of default extractors
 | 
			
		||||
            for e in Extractor.get_default_extractors(self.single_event):
 | 
			
		||||
                #try:
 | 
			
		||||
                    events = e.extract(content, url, url_human, default_values, published)
 | 
			
		||||
                    if events is not None:
 | 
			
		||||
                        return events
 | 
			
		||||
                #except:
 | 
			
		||||
                #    continue
 | 
			
		||||
        return None
 | 
			
		||||
 | 
			
		||||
        return self.extractor.extract(content, url, url_human, default_values, published)
 | 
			
		||||
 
 | 
			
		||||
@@ -345,6 +345,9 @@ article#filters {
 | 
			
		||||
.helptext, .subentry-search, .remarque {
 | 
			
		||||
    font-size: 80%;
 | 
			
		||||
    margin-top: -0.7em;
 | 
			
		||||
    ul {
 | 
			
		||||
        font-size: 100%;
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
.django-ckeditor-widget {
 | 
			
		||||
@@ -648,6 +651,9 @@ aside nav a.badge {
 | 
			
		||||
 | 
			
		||||
/* mise en forme pour les récurrences */
 | 
			
		||||
.container-fluid article form p .recurrence-widget {
 | 
			
		||||
    @extend article;
 | 
			
		||||
    width: 100%;
 | 
			
		||||
    border: 0;
 | 
			
		||||
 | 
			
		||||
    .header a, .add-button {
 | 
			
		||||
        @extend [role="button"];
 | 
			
		||||
 
 | 
			
		||||
@@ -27,11 +27,14 @@
 | 
			
		||||
 | 
			
		||||
{% load static_content_extra %}
 | 
			
		||||
 | 
			
		||||
{% if object %}
 | 
			
		||||
<h1>Édition de l'événement {{ object.title }} ({{ object.start_day }})</h1>
 | 
			
		||||
{% else %}
 | 
			
		||||
<h1>Édition de l'événement importé</h1>
 | 
			
		||||
{% endif %}
 | 
			
		||||
<article>
 | 
			
		||||
    <header>
 | 
			
		||||
        {% if object %}
 | 
			
		||||
        <h1>Édition de l'événement {{ object.title }} ({{ object.start_day }})</h1>
 | 
			
		||||
        {% else %}
 | 
			
		||||
        <h1>Édition de l'événement importé</h1>
 | 
			
		||||
        {% endif %}
 | 
			
		||||
    </header>
 | 
			
		||||
 | 
			
		||||
<div id="container"></div>
 | 
			
		||||
<form method="post">{% csrf_token %}
 | 
			
		||||
@@ -42,5 +45,37 @@
 | 
			
		||||
        <input type="submit" value="Enregistrer">
 | 
			
		||||
    </div>
 | 
			
		||||
</form>
 | 
			
		||||
{% if object %}
 | 
			
		||||
<footer class="remarque">
 | 
			
		||||
    Informations complémentaires non éditables :
 | 
			
		||||
    <ul>
 | 
			
		||||
        {% if object.created_date %}<li>Création : {{ object.created_date }}</li>{% endif %}
 | 
			
		||||
        {% if object.modified_date %}<li>Dernière modification : {{ object.modified_date }}</li>{% endif %}
 | 
			
		||||
        {% if object.imported_date %}<li>Dernière importation : {{ object.imported_date }}</li>{% endif %}
 | 
			
		||||
        {% if object.uuids %}
 | 
			
		||||
            {% if object.uuids|length > 0 %}
 | 
			
		||||
                <li>UUIDs (identifiants uniques d'événements dans les sources) :
 | 
			
		||||
                <ul>
 | 
			
		||||
                    {% for u in object.uuids %}
 | 
			
		||||
                        <li>{{ u }}</li>
 | 
			
		||||
                    {% endfor %}
 | 
			
		||||
                </ul></li>
 | 
			
		||||
            {% endif %}
 | 
			
		||||
        {% endif %}
 | 
			
		||||
        {% if object.import_sources %}
 | 
			
		||||
            {% if object.import_sources|length > 0 %}
 | 
			
		||||
                <li>Sources d'import :
 | 
			
		||||
                <ul>
 | 
			
		||||
                    {% for u in object.import_sources %}
 | 
			
		||||
                        <li><a href="{{ u }}">{{ u }}</a></li>
 | 
			
		||||
                    {% endfor %}
 | 
			
		||||
                </ul>
 | 
			
		||||
                </li>
 | 
			
		||||
            {% endif %}
 | 
			
		||||
        {% endif %}
 | 
			
		||||
    </ul>
 | 
			
		||||
</footer>
 | 
			
		||||
{% endif %}
 | 
			
		||||
</article>
 | 
			
		||||
 | 
			
		||||
{% endblock %}
 | 
			
		||||
@@ -30,7 +30,10 @@ from django.contrib import messages
 | 
			
		||||
from django.contrib.messages.views import SuccessMessageMixin
 | 
			
		||||
 | 
			
		||||
from .calendar import CalendarMonth, CalendarWeek, CalendarDay
 | 
			
		||||
from .extractors import ExtractorAllURLs
 | 
			
		||||
 | 
			
		||||
from .import_tasks.importer import URL2Events
 | 
			
		||||
from .import_tasks.extractor import Extractor
 | 
			
		||||
from .import_tasks.downloader import ChromiumHeadlessDownloader
 | 
			
		||||
 | 
			
		||||
from .celery import app as celery_app, import_events_from_json, run_recurrent_import
 | 
			
		||||
 | 
			
		||||
@@ -262,7 +265,7 @@ def import_from_url(request):
 | 
			
		||||
    logger = logging.getLogger(__name__)
 | 
			
		||||
 | 
			
		||||
    if request.method == 'POST' and "title" in request.POST:
 | 
			
		||||
        form = EventForm(request.POST)
 | 
			
		||||
        form = EventForm(request.POST, is_authenticated=request.user.is_authenticated)
 | 
			
		||||
        if form.is_valid():
 | 
			
		||||
            new_event = form.save()
 | 
			
		||||
            if request.user.is_authenticated:
 | 
			
		||||
@@ -284,25 +287,32 @@ def import_from_url(request):
 | 
			
		||||
        form_event = EventForm(initial=initial)
 | 
			
		||||
 | 
			
		||||
        if request.method == 'POST':
 | 
			
		||||
            form = EventSubmissionForm(request.POST)
 | 
			
		||||
 | 
			
		||||
            form = EventSubmissionForm(request.POST)
 | 
			
		||||
 | 
			
		||||
            if form.is_valid():
 | 
			
		||||
                cd = form.cleaned_data
 | 
			
		||||
                url = cd.get('url')
 | 
			
		||||
 | 
			
		||||
                url = ExtractorAllURLs.clean_url(url)
 | 
			
		||||
                url = Extractor.clean_url(url)
 | 
			
		||||
 | 
			
		||||
                existing = Event.objects.filter(uuids__contains=[url])
 | 
			
		||||
 | 
			
		||||
                if len(existing) == 0:
 | 
			
		||||
                    event = ExtractorAllURLs.extract(url)
 | 
			
		||||
                    event = None
 | 
			
		||||
 | 
			
		||||
                    u2e = URL2Events(ChromiumHeadlessDownloader(), single_event=True)
 | 
			
		||||
                    events_structure = u2e.process(url, published=request.user.is_authenticated)
 | 
			
		||||
                    if events_structure is not None and "events" in events_structure and len(events_structure["events"]) > 0:
 | 
			
		||||
                        event = Event.from_structure(events_structure["events"][0], events_structure["header"]["url"])
 | 
			
		||||
                        # TODO: use celery to import the other events
 | 
			
		||||
 | 
			
		||||
                    if event != None:
 | 
			
		||||
                        form = EventForm(instance=event)
 | 
			
		||||
                        form = EventForm(instance=event, is_authenticated=request.user.is_authenticated)
 | 
			
		||||
                        messages.success(request, _("The event has been successfully extracted, and you can now submit it after modifying it if necessary."))
 | 
			
		||||
                        return render(request, 'agenda_culturel/event_form.html', context={'form': form })
 | 
			
		||||
                    else:
 | 
			
		||||
                        form = EventForm(initial={'reference_urls': [url]})
 | 
			
		||||
                        form = EventForm(initial={'reference_urls': [url]}, is_authenticated=request.user.is_authenticated)
 | 
			
		||||
                        messages.error(request, _("Unable to extract an event from the proposed URL. Please use the form below to submit the event."))
 | 
			
		||||
                        return render(request, 'agenda_culturel/import.html', context={'form': form, 'form_event': form_event})
 | 
			
		||||
                else:
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user