WIP (le clic marche mais ne déverrouille pas le bousin)

This commit is contained in:
Jean-Marie Favreau 2025-02-06 19:18:32 +01:00
parent 9647f77c00
commit 52a355e95b
6 changed files with 74 additions and 8 deletions

View File

@@ -1,5 +1,5 @@
from ..generic_extractors import *
from ..extractor_facebook import FacebookEvent
from ..extractor_facebook import FacebookEvent, FacebookEventExtractor
import json5
from bs4 import BeautifulSoup
import json
@@ -15,6 +15,11 @@ logger = logging.getLogger(__name__)
# such as https://www.facebook.com/laJeteeClermont/events
class CExtractor(TwoStepsExtractor):
def __init__(self):
super().__init__()
self.has_2nd_method_in_list = True
def find_event_id_fragment_in_array(self, array, first=True):
found = False
if isinstance(array, dict):
@@ -40,6 +45,9 @@ class CExtractor(TwoStepsExtractor):
return found
def prepare_2nd_extract_in_list(self):
FacebookEventExtractor.prepare_2nd_extract_dler(self.downloader)
def build_event_url_list(self, content):
soup = BeautifulSoup(content, "html.parser")
@@ -49,8 +57,9 @@ class CExtractor(TwoStepsExtractor):
found = False
links = soup.find_all("a")
for link in links:
if link.get("href").startswith('https://www.facebook.com/events/'):
self.add_event_url(link.get('href').split('?')[0])
href = link.get('href')
if not href is None and href.startswith('https://www.facebook.com/events/'):
self.add_event_url(href.split('?')[0])
found = True
found = self.find_in_js(soup) or found

View File

@@ -11,7 +11,7 @@ import time
class Downloader(ABC):
def __init__(self):
pass
self.support_2nd_extract = False
@abstractmethod
def download(self, url, post=None):
@@ -68,6 +68,8 @@ class SimpleDownloader(Downloader):
class ChromiumHeadlessDownloader(Downloader):
def __init__(self, pause=True, noimage=True):
super().__init__()
self.support_2nd_extract = True
self.pause = pause
self.options = Options()
self.options.add_argument("--headless=new")
@@ -78,6 +80,7 @@ class ChromiumHeadlessDownloader(Downloader):
self.options.add_argument("--disable-dev-shm-usage")
self.options.add_argument("--disable-browser-side-navigation")
self.options.add_argument("--disable-gpu")
self.options.add_argument("--proxy-server=socks5://127.0.0.1:12345")
if noimage:
self.options.add_experimental_option(
"prefs", {

View File

@@ -18,7 +18,11 @@ class Extractor(ABC):
self.header = {}
self.events = []
self.downloader = None
self.has_2nd_method = False
self.referer = ""
def prepare_2nd_extract(self):
pass
def remove_accents(input_str):
nfkd_form = unicodedata.normalize("NFKD", input_str)
@@ -167,6 +171,9 @@ class Extractor(ABC):
def clean_url(url):
pass
def is_known_url(url):
return False
def set_header(self, url):
self.header["url"] = url
self.header["date"] = datetime.now()

View File

@@ -1,6 +1,7 @@
from datetime import datetime
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import time as t
from .extractor import *
import json
@@ -232,6 +233,27 @@ class FacebookEventExtractor(Extractor):
def __init__(self):
super().__init__()
self.has_2nd_method = True
def prepare_2nd_extract_dler(downloader):
logger.warning("prepare_2nd_extract_dler")
if downloader.support_2nd_extract:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
path = './/div[not(@aria-hidden)]/div[@aria-label="Allow all cookies"]'
element = WebDriverWait(downloader.driver, 10).until(EC.visibility_of_element_located((By.XPATH, path)))
button = downloader.driver.find_element(By.XPATH, path)
logger.warning("button")
logger.warning(button)
button.click()
t.sleep(3)
logger.warning(downloader.driver.page_source)
def prepare_2nd_extract(self):
FacebookEventExtractor.prepare_2nd_extract_dler(self.downloader)
def clean_url(url):
if FacebookEventExtractor.is_known_url(url):

View File

@@ -89,6 +89,7 @@ class TwoStepsExtractor(Extractor):
def __init__(self):
super().__init__()
self.has_2nd_method_in_list = False
self.event_urls = None
self.event_properties = {}
@@ -204,6 +205,9 @@ class TwoStepsExtractor(Extractor):
):
pass
def prepare_2nd_extract_in_list(self):
pass
def extract(
self,
content,
@@ -212,9 +216,12 @@ class TwoStepsExtractor(Extractor):
default_values=None,
published=False,
only_future=True,
ignore_404=True
ignore_404=True,
first=True
):
first = True
self.only_future = only_future
self.now = datetime.datetime.now().date()
self.set_header(url)
@@ -249,6 +256,16 @@ class TwoStepsExtractor(Extractor):
self.add_event_from_content(
content_event, event_url, url_human, default_values, published
)
# some website (FB) sometime need a second step
if first and len(self.events) == 0 and self.has_2nd_method_in_list and self.downloader.support_2nd_extract:
first = False
self.prepare_2nd_extract_in_list()
content_event = self.downloader.get_content(event_url)
if not content_event is None:
self.add_event_from_content(
content_event, event_url, url_human, default_values, published
)
return self.get_structure()

View File

@@ -16,7 +16,8 @@ class URL2Events:
self.single_event = single_event
def process(
self, url, url_human=None, cache=None, default_values=None, published=False
self, url, url_human=None, cache=None, default_values=None, published=False,
first=True
):
referer = ""
if self.extractor:
@@ -37,6 +38,13 @@ class URL2Events:
logger.warning('Extractor::' + type(e).__name__)
e.set_downloader(self.downloader)
events = e.extract(content, url, url_human, default_values, published)
if events is not None and len(events) > 0:
return events
if events is not None:
if len(events) > 0:
return events
else:
logger.warning("cas sans event")
if first and FacebookEventExtractor.is_known_url(url) and self.downloader.support_2nd_extract and e.has_2nd_method:
logger.warning("on avance")
e.prepare_2nd_extract()
return self.process(url, url_human, cache, default_values, published, False)
return None