WIP (le clic marche mais ne déverrouille pas le bousin)
This commit is contained in:
parent
9647f77c00
commit
52a355e95b
@ -1,5 +1,5 @@
|
|||||||
from ..generic_extractors import *
|
from ..generic_extractors import *
|
||||||
from ..extractor_facebook import FacebookEvent
|
from ..extractor_facebook import FacebookEvent, FacebookEventExtractor
|
||||||
import json5
|
import json5
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
import json
|
import json
|
||||||
@ -15,6 +15,11 @@ logger = logging.getLogger(__name__)
|
|||||||
# such as https://www.facebook.com/laJeteeClermont/events
|
# such as https://www.facebook.com/laJeteeClermont/events
|
||||||
class CExtractor(TwoStepsExtractor):
|
class CExtractor(TwoStepsExtractor):
|
||||||
|
|
||||||
|
|
||||||
|
def __init__(self):
    """Initialise the parent extractor, then switch on the in-list second method flag."""
    super().__init__()
    # Set after the parent initialiser has run, so the value assigned here
    # is the one left on the instance.
    self.has_2nd_method_in_list = True
|
||||||
|
|
||||||
def find_event_id_fragment_in_array(self, array, first=True):
|
def find_event_id_fragment_in_array(self, array, first=True):
|
||||||
found = False
|
found = False
|
||||||
if isinstance(array, dict):
|
if isinstance(array, dict):
|
||||||
@ -40,6 +45,9 @@ class CExtractor(TwoStepsExtractor):
|
|||||||
|
|
||||||
return found
|
return found
|
||||||
|
|
||||||
|
def prepare_2nd_extract_in_list(self):
    """Run the Facebook second-extraction preparation on this extractor's downloader."""
    dler = self.downloader
    FacebookEventExtractor.prepare_2nd_extract_dler(dler)
|
||||||
|
|
||||||
|
|
||||||
def build_event_url_list(self, content):
|
def build_event_url_list(self, content):
|
||||||
soup = BeautifulSoup(content, "html.parser")
|
soup = BeautifulSoup(content, "html.parser")
|
||||||
@ -49,8 +57,9 @@ class CExtractor(TwoStepsExtractor):
|
|||||||
found = False
|
found = False
|
||||||
links = soup.find_all("a")
|
links = soup.find_all("a")
|
||||||
for link in links:
|
for link in links:
|
||||||
if link.get("href").startswith('https://www.facebook.com/events/'):
|
href = link.get('href')
|
||||||
self.add_event_url(link.get('href').split('?')[0])
|
if not href is None and href.startswith('https://www.facebook.com/events/'):
|
||||||
|
self.add_event_url(href.split('?')[0])
|
||||||
found = True
|
found = True
|
||||||
|
|
||||||
found = self.find_in_js(soup) or found
|
found = self.find_in_js(soup) or found
|
||||||
|
@ -11,7 +11,7 @@ import time
|
|||||||
|
|
||||||
class Downloader(ABC):
|
class Downloader(ABC):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
pass
|
self.support_2nd_extract = False
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def download(self, url, post=None):
|
def download(self, url, post=None):
|
||||||
@ -68,6 +68,8 @@ class SimpleDownloader(Downloader):
|
|||||||
class ChromiumHeadlessDownloader(Downloader):
|
class ChromiumHeadlessDownloader(Downloader):
|
||||||
def __init__(self, pause=True, noimage=True):
|
def __init__(self, pause=True, noimage=True):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
|
self.support_2nd_extract = True
|
||||||
|
|
||||||
self.pause = pause
|
self.pause = pause
|
||||||
self.options = Options()
|
self.options = Options()
|
||||||
self.options.add_argument("--headless=new")
|
self.options.add_argument("--headless=new")
|
||||||
@ -78,6 +80,7 @@ class ChromiumHeadlessDownloader(Downloader):
|
|||||||
self.options.add_argument("--disable-dev-shm-usage")
|
self.options.add_argument("--disable-dev-shm-usage")
|
||||||
self.options.add_argument("--disable-browser-side-navigation")
|
self.options.add_argument("--disable-browser-side-navigation")
|
||||||
self.options.add_argument("--disable-gpu")
|
self.options.add_argument("--disable-gpu")
|
||||||
|
self.options.add_argument("--proxy-server=socks5://127.0.0.1:12345")
|
||||||
if noimage:
|
if noimage:
|
||||||
self.options.add_experimental_option(
|
self.options.add_experimental_option(
|
||||||
"prefs", {
|
"prefs", {
|
||||||
|
@ -18,8 +18,12 @@ class Extractor(ABC):
|
|||||||
self.header = {}
|
self.header = {}
|
||||||
self.events = []
|
self.events = []
|
||||||
self.downloader = None
|
self.downloader = None
|
||||||
|
self.has_2nd_method = False
|
||||||
self.referer = ""
|
self.referer = ""
|
||||||
|
|
||||||
|
def prepare_2nd_extract(self):
    """Hook run before a second extraction attempt; this implementation is a no-op."""
    return None
|
||||||
|
|
||||||
def remove_accents(input_str):
|
def remove_accents(input_str):
|
||||||
nfkd_form = unicodedata.normalize("NFKD", input_str)
|
nfkd_form = unicodedata.normalize("NFKD", input_str)
|
||||||
return "".join([c for c in nfkd_form if not unicodedata.combining(c)])
|
return "".join([c for c in nfkd_form if not unicodedata.combining(c)])
|
||||||
@ -167,6 +171,9 @@ class Extractor(ABC):
|
|||||||
def clean_url(url):
|
def clean_url(url):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
def is_known_url(url):
    """Tell whether ``url`` is recognised; this implementation always answers False."""
    recognised = False
    return recognised
|
||||||
|
|
||||||
def set_header(self, url):
|
def set_header(self, url):
|
||||||
self.header["url"] = url
|
self.header["url"] = url
|
||||||
self.header["date"] = datetime.now()
|
self.header["date"] = datetime.now()
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
|
import time as t
|
||||||
|
|
||||||
from .extractor import *
|
from .extractor import *
|
||||||
import json
|
import json
|
||||||
@ -232,6 +233,27 @@ class FacebookEventExtractor(Extractor):
|
|||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
|
self.has_2nd_method = True
|
||||||
|
|
||||||
|
def prepare_2nd_extract_dler(downloader):
    """Dismiss the "Allow all cookies" dialog in the downloader's browser.

    Invoked through the class
    (``FacebookEventExtractor.prepare_2nd_extract_dler(dl)``), so it takes
    no ``self``.

    When ``downloader.support_2nd_extract`` is false, nothing happens beyond
    the entry log line. Otherwise the function waits up to 10 seconds for the
    consent button to become visible in ``downloader.driver``, clicks it and
    then pauses for 3 seconds.

    If the button never becomes visible, a warning is logged and the function
    returns, instead of letting selenium's ``TimeoutException`` abort the
    whole extraction.
    """
    logger.warning("prepare_2nd_extract_dler")
    if not downloader.support_2nd_extract:
        return

    # Local imports: selenium is only touched on this browser-backed path.
    from selenium.common.exceptions import TimeoutException
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.ui import WebDriverWait

    path = './/div[not(@aria-hidden)]/div[@aria-label="Allow all cookies"]'
    try:
        # until() hands back the located element, so a second lookup with
        # find_element() is unnecessary.
        button = WebDriverWait(downloader.driver, 10).until(
            EC.visibility_of_element_located((By.XPATH, path))
        )
    except TimeoutException:
        logger.warning("prepare_2nd_extract_dler: cookie button not found")
        return

    logger.warning("button")
    logger.warning(button)
    button.click()
    # Presumably leaves the page time to react to the click -- TODO confirm
    # whether an explicit wait on a page condition could replace this delay.
    t.sleep(3)
    # The full page dump is a debugging aid; keep it out of warning-level logs.
    logger.debug(downloader.driver.page_source)
|
||||||
|
|
||||||
|
def prepare_2nd_extract(self):
    """Prepare a second extraction attempt by delegating to prepare_2nd_extract_dler."""
    dler = self.downloader
    FacebookEventExtractor.prepare_2nd_extract_dler(dler)
|
||||||
|
|
||||||
|
|
||||||
def clean_url(url):
|
def clean_url(url):
|
||||||
if FacebookEventExtractor.is_known_url(url):
|
if FacebookEventExtractor.is_known_url(url):
|
||||||
|
@ -89,6 +89,7 @@ class TwoStepsExtractor(Extractor):
|
|||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
|
self.has_2nd_method_in_list = False
|
||||||
self.event_urls = None
|
self.event_urls = None
|
||||||
self.event_properties = {}
|
self.event_properties = {}
|
||||||
|
|
||||||
@ -204,6 +205,9 @@ class TwoStepsExtractor(Extractor):
|
|||||||
):
|
):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
def prepare_2nd_extract_in_list(self):
    """Hook run before retrying an event page from the list; this implementation is a no-op."""
    return None
|
||||||
|
|
||||||
def extract(
|
def extract(
|
||||||
self,
|
self,
|
||||||
content,
|
content,
|
||||||
@ -212,9 +216,12 @@ class TwoStepsExtractor(Extractor):
|
|||||||
default_values=None,
|
default_values=None,
|
||||||
published=False,
|
published=False,
|
||||||
only_future=True,
|
only_future=True,
|
||||||
ignore_404=True
|
ignore_404=True,
|
||||||
|
first=True
|
||||||
|
|
||||||
):
|
):
|
||||||
|
|
||||||
|
first = True
|
||||||
self.only_future = only_future
|
self.only_future = only_future
|
||||||
self.now = datetime.datetime.now().date()
|
self.now = datetime.datetime.now().date()
|
||||||
self.set_header(url)
|
self.set_header(url)
|
||||||
@ -249,6 +256,16 @@ class TwoStepsExtractor(Extractor):
|
|||||||
self.add_event_from_content(
|
self.add_event_from_content(
|
||||||
content_event, event_url, url_human, default_values, published
|
content_event, event_url, url_human, default_values, published
|
||||||
)
|
)
|
||||||
|
# some websites (e.g. Facebook) sometimes need a second step
|
||||||
|
if first and len(self.events) == 0 and self.has_2nd_method_in_list and self.downloader.support_2nd_extract:
|
||||||
|
first = False
|
||||||
|
self.prepare_2nd_extract_in_list()
|
||||||
|
content_event = self.downloader.get_content(event_url)
|
||||||
|
if not content_event is None:
|
||||||
|
self.add_event_from_content(
|
||||||
|
content_event, event_url, url_human, default_values, published
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
return self.get_structure()
|
return self.get_structure()
|
||||||
|
|
||||||
|
@ -16,7 +16,8 @@ class URL2Events:
|
|||||||
self.single_event = single_event
|
self.single_event = single_event
|
||||||
|
|
||||||
def process(
|
def process(
|
||||||
self, url, url_human=None, cache=None, default_values=None, published=False
|
self, url, url_human=None, cache=None, default_values=None, published=False,
|
||||||
|
first=True
|
||||||
):
|
):
|
||||||
referer = ""
|
referer = ""
|
||||||
if self.extractor:
|
if self.extractor:
|
||||||
@ -37,6 +38,13 @@ class URL2Events:
|
|||||||
logger.warning('Extractor::' + type(e).__name__)
|
logger.warning('Extractor::' + type(e).__name__)
|
||||||
e.set_downloader(self.downloader)
|
e.set_downloader(self.downloader)
|
||||||
events = e.extract(content, url, url_human, default_values, published)
|
events = e.extract(content, url, url_human, default_values, published)
|
||||||
if events is not None and len(events) > 0:
|
if events is not None:
|
||||||
return events
|
if len(events) > 0:
|
||||||
|
return events
|
||||||
|
else:
|
||||||
|
logger.warning("cas sans event")
|
||||||
|
if first and FacebookEventExtractor.is_known_url(url) and self.downloader.support_2nd_extract and e.has_2nd_method:
|
||||||
|
logger.warning("on avance")
|
||||||
|
e.prepare_2nd_extract()
|
||||||
|
return self.process(url, url_human, cache, default_values, published, False)
|
||||||
return None
|
return None
|
||||||
|
Loading…
x
Reference in New Issue
Block a user