From becce291afaefc712be9880d03ae50b7cf54a0ea Mon Sep 17 00:00:00 2001 From: Jean-Marie Favreau Date: Sat, 1 Jun 2024 19:58:38 +0200 Subject: [PATCH] =?UTF-8?q?On=20ajoute=20un=20referer=20pour=20toutes=20le?= =?UTF-8?q?s=20requ=C3=AAtes?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../import_tasks/custom_extractors/lacomedie.py | 1 + src/agenda_culturel/import_tasks/downloader.py | 16 ++++++++-------- .../import_tasks/generic_extractors.py | 2 ++ src/agenda_culturel/import_tasks/importer.py | 2 +- 4 files changed, 12 insertions(+), 9 deletions(-) diff --git a/src/agenda_culturel/import_tasks/custom_extractors/lacomedie.py b/src/agenda_culturel/import_tasks/custom_extractors/lacomedie.py index f5f72cb..7c97151 100644 --- a/src/agenda_culturel/import_tasks/custom_extractors/lacomedie.py +++ b/src/agenda_culturel/import_tasks/custom_extractors/lacomedie.py @@ -7,6 +7,7 @@ from bs4 import BeautifulSoup # URL pour les humains: https://lacomediedeclermont.com/saison23-24/ class CExtractor(TwoStepsExtractor): nom_lieu = "La Comédie de Clermont" + url_referer = "https://lacomediedeclermont.com/saison23-24/" def category_comedie2agenda(self, category): mapping = { diff --git a/src/agenda_culturel/import_tasks/downloader.py b/src/agenda_culturel/import_tasks/downloader.py index 8ef3104..c6a9cdf 100644 --- a/src/agenda_culturel/import_tasks/downloader.py +++ b/src/agenda_culturel/import_tasks/downloader.py @@ -39,19 +39,19 @@ class SimpleDownloader(Downloader): super().__init__() def download(self, url, referer=None, post=None): - print("Downloading {}".format(url)) + print("Downloading {} referer: {} post: {}".format(url, referer, post)) try: + headers = { + "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:126.0) Gecko/20100101 Firefox/126.0", + } + if referer is not None: + headers["Referer"] = referer + req = Request(url, headers=headers) if post: post_args = urlencode(post).encode("utf-8") - headers = { - "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:126.0) Gecko/20100101 Firefox/126.0", - } - if referer is not None: - headers["Referer"] = referer - req = Request(url, headers=headers) resource = urllib.request.urlopen(req, post_args) else: - resource = urllib.request.urlopen(url) + resource = urllib.request.urlopen(req) data = resource.read().decode(resource.headers.get_content_charset()) return data except Exception as e: diff --git a/src/agenda_culturel/import_tasks/generic_extractors.py b/src/agenda_culturel/import_tasks/generic_extractors.py index c058477..62758a0 100644 --- a/src/agenda_culturel/import_tasks/generic_extractors.py +++ b/src/agenda_culturel/import_tasks/generic_extractors.py @@ -47,6 +47,8 @@ class GGCalendar: # - then for each document downloaded from these urls, build the events # This class is an abstract class class TwoStepsExtractor(Extractor): + url_referer=None + def __init__(self): super().__init__() self.event_urls = None diff --git a/src/agenda_culturel/import_tasks/importer.py b/src/agenda_culturel/import_tasks/importer.py index f3bc829..960f93c 100644 --- a/src/agenda_culturel/import_tasks/importer.py +++ b/src/agenda_culturel/import_tasks/importer.py @@ -13,7 +13,7 @@ class URL2Events: def process( self, url, url_human=None, cache=None, default_values=None, published=False ): - content = self.downloader.get_content(url, cache) + content = self.downloader.get_content(url, cache, referer=self.extractor.url_referer) if content is None: return None