L'import facebook partage maintenant son code avec les autres imports
Fix #80
This commit is contained in:
3
experimentations/.gitignore
vendored
Normal file
3
experimentations/.gitignore
vendored
Normal file
@@ -0,0 +1,3 @@
|
||||
*.json
|
||||
*.html
|
||||
*.ical
|
@@ -1,171 +1,40 @@
|
||||
#!/usr/bin/python3
|
||||
# coding: utf-8
|
||||
|
||||
import requests
|
||||
import hashlib
|
||||
import os
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.chrome.service import Service
|
||||
from selenium.webdriver.chrome.options import Options
|
||||
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
import json
|
||||
import sys
|
||||
|
||||
class SimpleEvent:
|
||||
# getting the name of the directory
|
||||
# where the this file is present.
|
||||
current = os.path.dirname(os.path.realpath(__file__))
|
||||
|
||||
# Getting the parent directory name
|
||||
# where the current directory is present.
|
||||
parent = os.path.dirname(current)
|
||||
|
||||
# adding the parent directory to
|
||||
# the sys.path.
|
||||
sys.path.append(parent)
|
||||
|
||||
def __init__(self, data):
|
||||
self.elements = {}
|
||||
|
||||
for key in ["id", "start_timestamp", "end_timestamp"]:
|
||||
self.elements[key] = data[key] if key in data else None
|
||||
|
||||
if "parent_event" in data:
|
||||
self.parent = SimpleEvent(data["parent_event"])
|
||||
from src.agenda_culturel.import_tasks.downloader import *
|
||||
from src.agenda_culturel.import_tasks.extractor import *
|
||||
from src.agenda_culturel.import_tasks.importer import *
|
||||
from src.agenda_culturel.import_tasks.extractor_facebook import *
|
||||
|
||||
|
||||
class Event:
    """Event rebuilt from several JSON fragments found in a parsed page.

    A *fragment* is any JSON object (``dict``) that contains every key of
    one of the key sets listed in ``Event.keys``.  Fragments are merged
    into ``self.elements``; ``consolidate_current_event`` then tries to
    fill in a missing ``end_timestamp`` from the other collected data.
    """

    name = "event"

    # keys[i] is the set of keys a dict must ALL contain to be recognised
    # as a fragment of kind i.  Order matters: the first matching set wins.
    keys = [
        ["start_time_formatted", 'start_timestamp',
         'is_past',
         "name",
         "price_info",
         "cover_media_renderer",
         "event_creator",
         "id",
         "day_time_sentence",
         "event_place",
         "comet_neighboring_siblings"],
        ["event_description"],
        ["start_timestamp", "end_timestamp"]
    ]

    # For a fragment key, maps each output element name to the path of
    # nested keys to follow inside the fragment value.
    rules = {
        "event_description": { "description": ["text"]},
        "cover_media_renderer": {"image_alt": ["cover_photo", "photo", "accessibility_caption"], "image": ["cover_photo", "photo", "full_image", "uri"]},
        "event_creator": { "event_creator_name": ["name"], "event_creator_url": ["url"] },
        "event_place": {"event_place_name": ["name"] }
    }

    def __init__(self, i, event):
        """Create the event from a first fragment *event* of kind *i*.

        *i* is an index into ``Event.keys``.
        """
        self.fragments = {}
        self.elements = {}
        self.neighbor_events = None
        self.possible_end_timestamp = []
        self.add_fragment(i, event)

    @staticmethod
    def _follow_path(node, path):
        """Return the value reached by following *path* inside *node*.

        Returns ``None`` instead of raising when an intermediate node is
        missing or is not subscriptable (e.g. a JSON ``null``).
        """
        for step in path:
            try:
                node = node[step]
            except (KeyError, IndexError, TypeError):
                return None
        return node

    def add_fragment(self, i, event):
        """Merge fragment *event* of kind *i* (index into ``Event.keys``).

        The raw fragment is kept in ``self.fragments[i]``.  A
        start/end-timestamp fragment is only recorded as a candidate end
        time; for the other kinds each key is either handed to
        ``get_neighbor_events``, flattened through ``Event.rules``, or
        copied verbatim into ``self.elements``.
        """
        self.fragments[i] = event

        if Event.keys[i] == ["start_timestamp", "end_timestamp"]:
            self.get_possible_end_timestamp(i, event)
            return

        for k in Event.keys[i]:
            if k == "comet_neighboring_siblings":
                self.get_neighbor_events(event[k])
            elif k in Event.rules:
                for new_key, path in Event.rules[k].items():
                    # A missing nested node yields None rather than
                    # aborting the whole import.
                    self.elements[new_key] = Event._follow_path(event[k], path)
            else:
                self.elements[k] = event[k]

    def get_possible_end_timestamp(self, i, data):
        """Record the ``Event.keys[i]`` entries of *data* as a candidate."""
        self.possible_end_timestamp.append({k: data[k] for k in Event.keys[i]})

    def get_neighbor_events(self, data):
        """Store the sibling events listed in *data* as ``SimpleEvent``s.

        A ``None`` *data* is treated as an empty list.
        """
        self.neighbor_events = [SimpleEvent(d) for d in (data or [])]

    def __str__(self):
        """Return the elements followed by the ids of neighbor events."""
        # neighbor_events stays None until a fragment provides siblings,
        # and SimpleEvent ids may be None, hence the guards.
        neighbor_ids = [
            str(ne.elements["id"]) for ne in (self.neighbor_events or [])
        ]
        return str(self.elements) + "\n Neighbors: " + ", ".join(neighbor_ids)

    def consolidate_current_event(self):
        """Try to fill in a missing ``end_timestamp``.

        First looks for a neighbor event with the same id that has a
        non-null end time, then falls back to the first candidate in
        ``possible_end_timestamp`` with the same ``start_timestamp``.
        Does nothing when ``end_timestamp`` is already set.
        """
        if "end_timestamp" in self.elements:
            return

        if self.neighbor_events is not None and "id" in self.elements:
            event_id = self.elements["id"]
            for ne in self.neighbor_events:
                end = ne.elements["end_timestamp"]
                # Skip null end times so the fallback below can still run.
                if ne.elements["id"] == event_id and end is not None:
                    self.elements["end_timestamp"] = end
                    return

        if "start_timestamp" not in self.elements:
            return
        start = self.elements["start_timestamp"]
        for candidate in self.possible_end_timestamp:
            if candidate["start_timestamp"] == start:
                self.elements["end_timestamp"] = candidate["end_timestamp"]
                break

    @staticmethod
    def find_event_fragment_in_array(array, event, first=True):
        """Recursively collect event fragments from a JSON structure.

        *array* is any decoded JSON value.  Each dict matching one of the
        ``Event.keys`` sets is merged into *event* (created on the first
        match when *event* is ``None``); a matching dict is not searched
        any deeper.  Non-matching dicts and lists are traversed.

        *first* marks the outermost call: only then is the event
        consolidated before being returned.  Returns the (possibly new)
        event, or ``None`` when nothing was found.
        """
        if isinstance(array, dict):
            matched = False
            for i, ks in enumerate(Event.keys):
                if all(k in array for k in ks):
                    matched = True
                    if event is None:
                        event = Event(i, array)
                    else:
                        event.add_fragment(i, array)
                    # only consider the first of Event.keys
                    break
            if not matched:
                for value in array.values():
                    event = Event.find_event_fragment_in_array(value, event, False)
        elif isinstance(array, list):
            for item in array:
                event = Event.find_event_fragment_in_array(item, event, False)

        if event is not None and first:
            event.consolidate_current_event()
        return event
|
||||
|
||||
|
||||
#url="https://www.facebook.com/events/ical/export/?eid=2294200007432315"
|
||||
#url="https://www.facebook.com/events/2294199997432316/2294200007432315/"
|
||||
#url="https://www.facebook.com/events/635247792092358/"
|
||||
url="https://www.facebook.com/events/872781744074648"
|
||||
url="https://www.facebook.com/events/1432798543943663?"
|
||||
#url_cal = "https://www.facebook.com/events/ical/export/?eid=993406668581410"
|
||||
#url="https://jmtrivial.info"
|
||||
|
||||
cachedir = "cache"
|
||||
result = hashlib.md5(url.encode())
|
||||
hash = result.hexdigest()
|
||||
if __name__ == "__main__":
|
||||
|
||||
filename = os.path.join(cachedir, hash + ".html")
|
||||
u2e = URL2Events(ChromiumHeadlessDownloader(), FacebookEventExtractor(single_event=True))
|
||||
url="https://www.facebook.com/events/872781744074648"
|
||||
|
||||
if os.path.isfile(filename):
|
||||
# print("Use cache")
|
||||
with open(filename) as f:
|
||||
doc = "\n".join(f.readlines())
|
||||
else:
|
||||
print("Download page")
|
||||
events = u2e.process(url, cache = "fb.html", published = True)
|
||||
|
||||
options = Options()
|
||||
options.add_argument("--headless=new")
|
||||
service = Service("/usr/bin/chromedriver")
|
||||
|
||||
driver = webdriver.Chrome(service=service, options=options)
|
||||
driver.get(url)
|
||||
doc = driver.page_source
|
||||
driver.quit()
|
||||
|
||||
dir = os.path.dirname(filename)
|
||||
if not os.path.exists(dir):
|
||||
os.makedirs(dir)
|
||||
with open(filename, "w") as text_file:
|
||||
text_file.write(doc)
|
||||
|
||||
|
||||
soup = BeautifulSoup(doc)
|
||||
|
||||
event = None
|
||||
for json_script in soup.find_all('script', type="application/json"):
|
||||
json_txt = json_script.get_text()
|
||||
json_struct = json.loads(json_txt)
|
||||
|
||||
event = Event.find_event_fragment_in_array(json_struct, event)
|
||||
|
||||
print(event)
|
||||
exportfile = "event-facebook.json"
|
||||
print("Saving events to file {}".format(exportfile))
|
||||
with open(exportfile, "w") as f:
|
||||
json.dump(events, f, indent=4, default=str)
|
||||
|
||||
|
Reference in New Issue
Block a user