WIP (the click works but does not unlock the thing yet)
parent 9647f77c00
commit 52a355e95b
@@ -1,5 +1,5 @@
 from ..generic_extractors import *
-from ..extractor_facebook import FacebookEvent
+from ..extractor_facebook import FacebookEvent, FacebookEventExtractor
 import json5
 from bs4 import BeautifulSoup
 import json
@@ -15,6 +15,11 @@ logger = logging.getLogger(__name__)
 # such as https://www.facebook.com/laJeteeClermont/events
 class CExtractor(TwoStepsExtractor):
 
+
+    def __init__(self):
+        super().__init__()
+        self.has_2nd_method_in_list = True
+
     def find_event_id_fragment_in_array(self, array, first=True):
         found = False
         if isinstance(array, dict):
@@ -40,6 +45,9 @@ class CExtractor(TwoStepsExtractor):
 
         return found
 
+    def prepare_2nd_extract_in_list(self):
+        FacebookEventExtractor.prepare_2nd_extract_dler(self.downloader)
+
 
     def build_event_url_list(self, content):
         soup = BeautifulSoup(content, "html.parser")
@@ -49,8 +57,9 @@ class CExtractor(TwoStepsExtractor):
         found = False
         links = soup.find_all("a")
         for link in links:
-            if link.get("href").startswith('https://www.facebook.com/events/'):
-                self.add_event_url(link.get('href').split('?')[0])
+            href = link.get('href')
+            if not href is None and href.startswith('https://www.facebook.com/events/'):
+                self.add_event_url(href.split('?')[0])
                 found = True
 
         found = self.find_in_js(soup) or found
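
For context on the change above: BeautifulSoup's link.get("href") returns None for an <a> tag that has no href attribute, which is what the new guard protects against before calling startswith(). A minimal standalone illustration, not taken from the project:

from bs4 import BeautifulSoup

html = '<a>no href</a><a href="https://www.facebook.com/events/123/?ref=1">event</a>'
soup = BeautifulSoup(html, "html.parser")
for link in soup.find_all("a"):
    href = link.get("href")
    # without the None check, href.startswith() would raise AttributeError here
    if href is not None and href.startswith("https://www.facebook.com/events/"):
        print(href.split("?")[0])  # https://www.facebook.com/events/123/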
@@ -11,7 +11,7 @@ import time
 
 class Downloader(ABC):
     def __init__(self):
-        pass
+        self.support_2nd_extract = False
 
     @abstractmethod
     def download(self, url, post=None):
@@ -68,6 +68,8 @@ class SimpleDownloader(Downloader):
 class ChromiumHeadlessDownloader(Downloader):
     def __init__(self, pause=True, noimage=True):
         super().__init__()
+        self.support_2nd_extract = True
+
         self.pause = pause
         self.options = Options()
         self.options.add_argument("--headless=new")
@@ -78,6 +80,7 @@ class ChromiumHeadlessDownloader(Downloader):
         self.options.add_argument("--disable-dev-shm-usage")
         self.options.add_argument("--disable-browser-side-navigation")
         self.options.add_argument("--disable-gpu")
+        self.options.add_argument("--proxy-server=socks5://127.0.0.1:12345")
         if noimage:
             self.options.add_experimental_option(
                 "prefs", {
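
The support_2nd_extract attribute introduced in the two hunks above is a capability flag: the plain downloader declares it False and only the Chromium/Selenium downloader switches it to True, so callers can test the flag before attempting the interactive second pass. A reduced sketch of the pattern, with made-up class names:

class BaseDownloader:
    def __init__(self):
        # plain HTTP downloaders cannot drive a browser
        self.support_2nd_extract = False

class BrowserDownloader(BaseDownloader):
    def __init__(self):
        super().__init__()
        # the Selenium-backed downloader opts in to the interactive second pass
        self.support_2nd_extract = True

downloader = BrowserDownloader()
if downloader.support_2nd_extract:
    print("a browser-driven retry is possible")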
@@ -18,7 +18,11 @@ class Extractor(ABC):
         self.header = {}
         self.events = []
         self.downloader = None
+        self.has_2nd_method = False
         self.referer = ""
 
+    def prepare_2nd_extract(self):
+        pass
+
     def remove_accents(input_str):
         nfkd_form = unicodedata.normalize("NFKD", input_str)
@@ -167,6 +171,9 @@ class Extractor(ABC):
     def clean_url(url):
         pass
 
+    def is_known_url(url):
+        return False
+
     def set_header(self, url):
         self.header["url"] = url
         self.header["date"] = datetime.now()
@@ -1,6 +1,7 @@
 from datetime import datetime
 from bs4 import BeautifulSoup
 from urllib.parse import urlparse
+import time as t
 
 from .extractor import *
 import json
@@ -232,6 +233,27 @@ class FacebookEventExtractor(Extractor):
 
     def __init__(self):
         super().__init__()
+        self.has_2nd_method = True
+
+    def prepare_2nd_extract_dler(downloader):
+        logger.warning("prepare_2nd_extract_dler")
+        if downloader.support_2nd_extract:
+            from selenium.webdriver.common.by import By
+            from selenium.webdriver.support.ui import WebDriverWait
+            from selenium.webdriver.support import expected_conditions as EC
+
+            path = './/div[not(@aria-hidden)]/div[@aria-label="Allow all cookies"]'
+            element = WebDriverWait(downloader.driver, 10).until(EC.visibility_of_element_located((By.XPATH, path)))
+            button = downloader.driver.find_element(By.XPATH, path)
+            logger.warning("button")
+            logger.warning(button)
+            button.click()
+            t.sleep(3)
+            logger.warning(downloader.driver.page_source)
+
+    def prepare_2nd_extract(self):
+        FacebookEventExtractor.prepare_2nd_extract_dler(self.downloader)
+
 
     def clean_url(url):
         if FacebookEventExtractor.is_known_url(url):
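
The hunk above is the heart of the commit: when the first fetch of a Facebook page yields nothing, the Chromium driver waits for the "Allow all cookies" dialog and clicks it before the page is downloaded again. A standalone sketch of the same wait-and-click pattern, assuming an already created selenium.webdriver instance named driver; it uses element_to_be_clickable instead of the visibility wait plus second lookup seen in the diff:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def accept_facebook_cookies(driver, timeout=10):
    # same XPath as in the diff: the cookie button of the currently visible dialog
    path = './/div[not(@aria-hidden)]/div[@aria-label="Allow all cookies"]'
    button = WebDriverWait(driver, timeout).until(
        EC.element_to_be_clickable((By.XPATH, path))
    )
    button.click()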
@@ -89,6 +89,7 @@ class TwoStepsExtractor(Extractor):
 
     def __init__(self):
         super().__init__()
+        self.has_2nd_method_in_list = False
         self.event_urls = None
         self.event_properties = {}
 
@@ -204,6 +205,9 @@ class TwoStepsExtractor(Extractor):
     ):
         pass
 
+    def prepare_2nd_extract_in_list(self):
+        pass
+
     def extract(
         self,
         content,
@@ -212,9 +216,12 @@ class TwoStepsExtractor(Extractor):
         default_values=None,
         published=False,
         only_future=True,
-        ignore_404=True
+        ignore_404=True,
+        first=True
+
     ):
 
+        first = True
         self.only_future = only_future
         self.now = datetime.datetime.now().date()
         self.set_header(url)
@@ -249,6 +256,16 @@ class TwoStepsExtractor(Extractor):
                 self.add_event_from_content(
                     content_event, event_url, url_human, default_values, published
                 )
+                # some website (FB) sometime need a second step
+                if first and len(self.events) == 0 and self.has_2nd_method_in_list and self.downloader.support_2nd_extract:
+                    first = False
+                    self.prepare_2nd_extract_in_list()
+                    content_event = self.downloader.get_content(event_url)
+                    if not content_event is None:
+                        self.add_event_from_content(
+                            content_event, event_url, url_human, default_values, published
+                        )
+
 
         return self.get_structure()
 
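
Schematically, the block added above retries each event page at most once: the second attempt is only made when the first pass produced no event and both the extractor and the downloader opted in. A simplified sketch with hypothetical names (fetch_event and parse are not the project's functions):

def fetch_event(extractor, downloader, event_url, parse, first=True):
    # parse is any callable turning page content into a list of events
    content = downloader.get_content(event_url)
    events = parse(content) if content is not None else []
    if not events and first and extractor.has_2nd_method_in_list and downloader.support_2nd_extract:
        # e.g. click the cookie banner, then download the page once more
        extractor.prepare_2nd_extract_in_list()
        return fetch_event(extractor, downloader, event_url, parse, first=False)
    return events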
@@ -16,7 +16,8 @@ class URL2Events:
         self.single_event = single_event
 
     def process(
-        self, url, url_human=None, cache=None, default_values=None, published=False
+        self, url, url_human=None, cache=None, default_values=None, published=False,
+        first=True
     ):
         referer = ""
         if self.extractor:
@@ -37,6 +38,13 @@ class URL2Events:
             logger.warning('Extractor::' + type(e).__name__)
             e.set_downloader(self.downloader)
             events = e.extract(content, url, url_human, default_values, published)
-            if events is not None and len(events) > 0:
-                return events
+            if events is not None:
+                if len(events) > 0:
+                    return events
+                else:
+                    logger.warning("cas sans event")
+                    if first and FacebookEventExtractor.is_known_url(url) and self.downloader.support_2nd_extract and e.has_2nd_method:
+                        logger.warning("on avance")
+                        e.prepare_2nd_extract()
+                        return self.process(url, url_human, cache, default_values, published, False)
         return None
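
At the top level the same guard appears in URL2Events.process: on a Facebook URL that produced zero events, the extractor's prepare_2nd_extract() (the cookie click above) is run and process() calls itself exactly once more with first=False, so the retry cannot loop. A hypothetical call site; the constructor arguments and the event URL are assumptions, not taken from the diff:

u2e = URL2Events(downloader=ChromiumHeadlessDownloader(), single_event=True)
# first call: normal extraction; if it finds no event on a facebook.com/events
# URL, process() clicks the cookie dialog and retries itself with first=False
events = u2e.process("https://www.facebook.com/events/1234567890/")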