From 52a355e95bd5a0404f28f89b08fa8e15f84a19c5 Mon Sep 17 00:00:00 2001 From: Jean-Marie Favreau Date: Thu, 6 Feb 2025 19:18:32 +0100 Subject: [PATCH] =?UTF-8?q?WIP=20(le=20clic=20marche=20mais=20ne=20d=C3=A9?= =?UTF-8?q?verrouille=20pas=20le=20bousin)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../custom_extractors/fbevents.py | 15 ++++++++++--- .../import_tasks/downloader.py | 5 ++++- src/agenda_culturel/import_tasks/extractor.py | 7 ++++++ .../import_tasks/extractor_facebook.py | 22 +++++++++++++++++++ .../import_tasks/generic_extractors.py | 19 +++++++++++++++- src/agenda_culturel/import_tasks/importer.py | 14 +++++++++--- 6 files changed, 74 insertions(+), 8 deletions(-) diff --git a/src/agenda_culturel/import_tasks/custom_extractors/fbevents.py b/src/agenda_culturel/import_tasks/custom_extractors/fbevents.py index c47c8ba..2d5c20e 100644 --- a/src/agenda_culturel/import_tasks/custom_extractors/fbevents.py +++ b/src/agenda_culturel/import_tasks/custom_extractors/fbevents.py @@ -1,5 +1,5 @@ from ..generic_extractors import * -from ..extractor_facebook import FacebookEvent +from ..extractor_facebook import FacebookEvent, FacebookEventExtractor import json5 from bs4 import BeautifulSoup import json @@ -15,6 +15,11 @@ logger = logging.getLogger(__name__) # such as https://www.facebook.com/laJeteeClermont/events class CExtractor(TwoStepsExtractor): + + def __init__(self): + super().__init__() + self.has_2nd_method_in_list = True + def find_event_id_fragment_in_array(self, array, first=True): found = False if isinstance(array, dict): @@ -40,6 +45,9 @@ class CExtractor(TwoStepsExtractor): return found + def prepare_2nd_extract_in_list(self): + FacebookEventExtractor.prepare_2nd_extract_dler(self.downloader) + def build_event_url_list(self, content): soup = BeautifulSoup(content, "html.parser") @@ -49,8 +57,9 @@ class CExtractor(TwoStepsExtractor): found = False links = soup.find_all("a") for link in links: - if link.get("href").startswith('https://www.facebook.com/events/'): - self.add_event_url(link.get('href').split('?')[0]) + href = link.get('href') + if not href is None and href.startswith('https://www.facebook.com/events/'): + self.add_event_url(href.split('?')[0]) found = True found = self.find_in_js(soup) or found diff --git a/src/agenda_culturel/import_tasks/downloader.py b/src/agenda_culturel/import_tasks/downloader.py index 905c130..b7371b1 100644 --- a/src/agenda_culturel/import_tasks/downloader.py +++ b/src/agenda_culturel/import_tasks/downloader.py @@ -11,7 +11,7 @@ import time class Downloader(ABC): def __init__(self): - pass + self.support_2nd_extract = False @abstractmethod def download(self, url, post=None): @@ -68,6 +68,8 @@ class SimpleDownloader(Downloader): class ChromiumHeadlessDownloader(Downloader): def __init__(self, pause=True, noimage=True): super().__init__() + self.support_2nd_extract = True + self.pause = pause self.options = Options() self.options.add_argument("--headless=new") @@ -78,6 +80,7 @@ class ChromiumHeadlessDownloader(Downloader): self.options.add_argument("--disable-dev-shm-usage") self.options.add_argument("--disable-browser-side-navigation") self.options.add_argument("--disable-gpu") + self.options.add_argument("--proxy-server=socks5://127.0.0.1:12345") if noimage: self.options.add_experimental_option( "prefs", { diff --git a/src/agenda_culturel/import_tasks/extractor.py b/src/agenda_culturel/import_tasks/extractor.py index 4efd504..895f6a8 100644 --- a/src/agenda_culturel/import_tasks/extractor.py +++ b/src/agenda_culturel/import_tasks/extractor.py @@ -18,7 +18,11 @@ class Extractor(ABC): self.header = {} self.events = [] self.downloader = None + self.has_2nd_method = False self.referer = "" + + def prepare_2nd_extract(self): + pass def remove_accents(input_str): nfkd_form = unicodedata.normalize("NFKD", input_str) @@ -167,6 +171,9 @@ class Extractor(ABC): def clean_url(url): pass + def is_known_url(url): + return False + def set_header(self, url): self.header["url"] = url self.header["date"] = datetime.now() diff --git a/src/agenda_culturel/import_tasks/extractor_facebook.py b/src/agenda_culturel/import_tasks/extractor_facebook.py index b7970ab..f8b7fad 100644 --- a/src/agenda_culturel/import_tasks/extractor_facebook.py +++ b/src/agenda_culturel/import_tasks/extractor_facebook.py @@ -1,6 +1,7 @@ from datetime import datetime from bs4 import BeautifulSoup from urllib.parse import urlparse +import time as t from .extractor import * import json @@ -232,6 +233,27 @@ class FacebookEventExtractor(Extractor): def __init__(self): super().__init__() + self.has_2nd_method = True + + def prepare_2nd_extract_dler(downloader): + logger.warning("prepare_2nd_extract_dler") + if downloader.support_2nd_extract: + from selenium.webdriver.common.by import By + from selenium.webdriver.support.ui import WebDriverWait + from selenium.webdriver.support import expected_conditions as EC + + path = './/div[not(@aria-hidden)]/div[@aria-label="Allow all cookies"]' + element = WebDriverWait(downloader.driver, 10).until(EC.visibility_of_element_located((By.XPATH, path))) + button = downloader.driver.find_element(By.XPATH, path) + logger.warning("button") + logger.warning(button) + button.click() + t.sleep(3) + logger.warning(downloader.driver.page_source) + + def prepare_2nd_extract(self): + FacebookEventExtractor.prepare_2nd_extract_dler(self.downloader) + def clean_url(url): if FacebookEventExtractor.is_known_url(url): diff --git a/src/agenda_culturel/import_tasks/generic_extractors.py b/src/agenda_culturel/import_tasks/generic_extractors.py index d20893a..bc20357 100644 --- a/src/agenda_culturel/import_tasks/generic_extractors.py +++ b/src/agenda_culturel/import_tasks/generic_extractors.py @@ -89,6 +89,7 @@ class TwoStepsExtractor(Extractor): def __init__(self): super().__init__() + self.has_2nd_method_in_list = False self.event_urls = None self.event_properties = {} @@ -204,6 +205,9 @@ class TwoStepsExtractor(Extractor): ): pass + def prepare_2nd_extract_in_list(self): + pass + def extract( self, content, @@ -212,9 +216,12 @@ class TwoStepsExtractor(Extractor): default_values=None, published=False, only_future=True, - ignore_404=True + ignore_404=True, + first=True + ): + first = True self.only_future = only_future self.now = datetime.datetime.now().date() self.set_header(url) @@ -249,6 +256,16 @@ class TwoStepsExtractor(Extractor): self.add_event_from_content( content_event, event_url, url_human, default_values, published ) + # some website (FB) sometime need a second step + if first and len(self.events) == 0 and self.has_2nd_method_in_list and self.downloader.support_2nd_extract: + first = False + self.prepare_2nd_extract_in_list() + content_event = self.downloader.get_content(event_url) + if not content_event is None: + self.add_event_from_content( + content_event, event_url, url_human, default_values, published + ) + return self.get_structure() diff --git a/src/agenda_culturel/import_tasks/importer.py b/src/agenda_culturel/import_tasks/importer.py index f9c0b0a..5a2e264 100644 --- a/src/agenda_culturel/import_tasks/importer.py +++ b/src/agenda_culturel/import_tasks/importer.py @@ -16,7 +16,8 @@ class URL2Events: self.single_event = single_event def process( - self, url, url_human=None, cache=None, default_values=None, published=False + self, url, url_human=None, cache=None, default_values=None, published=False, + first=True ): referer = "" if self.extractor: @@ -37,6 +38,13 @@ class URL2Events: logger.warning('Extractor::' + type(e).__name__) e.set_downloader(self.downloader) events = e.extract(content, url, url_human, default_values, published) - if events is not None and len(events) > 0: - return events + if events is not None: + if len(events) > 0: + return events + else: + logger.warning("cas sans event") + if first and FacebookEventExtractor.is_known_url(url) and self.downloader.support_2nd_extract and e.has_2nd_method: + logger.warning("on avance") + e.prepare_2nd_extract() + return self.process(url, url_human, cache, default_values, published, False) return None