diff --git a/src/agenda_culturel/import_tasks/custom_extractors/lacomedie.py b/src/agenda_culturel/import_tasks/custom_extractors/lacomedie.py index e20737a..f5f72cb 100644 --- a/src/agenda_culturel/import_tasks/custom_extractors/lacomedie.py +++ b/src/agenda_culturel/import_tasks/custom_extractors/lacomedie.py @@ -28,7 +28,9 @@ class CExtractor(TwoStepsExtractor): for d in list(set(dates)): if not self.only_future or self.now <= datetime.date.fromisoformat(d): events = self.downloader.get_content( - url, post={"action": "load_evenements_jour", "jour": d} + url, + post={"action": "load_evenements_jour", "jour": d}, + referer="https://lacomediedeclermont.com/saison23-24/" ) if events: events = json5.loads(events) diff --git a/src/agenda_culturel/import_tasks/downloader.py b/src/agenda_culturel/import_tasks/downloader.py index ae6e5f5..8ef3104 100644 --- a/src/agenda_culturel/import_tasks/downloader.py +++ b/src/agenda_culturel/import_tasks/downloader.py @@ -1,5 +1,6 @@ from urllib.parse import urlencode import urllib.request +from urllib.request import Request import os from selenium import webdriver from selenium.webdriver.chrome.service import Service @@ -15,13 +16,13 @@ class Downloader(ABC): def download(self, url, post=None): pass - def get_content(self, url, cache=None, post=None): + def get_content(self, url, cache=None, referer=None, post=None): if cache and os.path.exists(cache): print("Loading cache ({})".format(cache)) with open(cache) as f: content = "\n".join(f.readlines()) else: - content = self.download(url, post) + content = self.download(url, referer=referer, post=post) if cache: print("Saving cache ({})".format(cache)) @@ -37,13 +38,18 @@ class SimpleDownloader(Downloader): def __init__(self): super().__init__() - def download(self, url, post=None): + def download(self, url, referer=None, post=None): print("Downloading {}".format(url)) - try: if post: - post_args = urlencode(post).encode() - resource = urllib.request.urlopen(url, post_args) + post_args = urlencode(post).encode("utf-8") + headers = { + "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:126.0) Gecko/20100101 Firefox/126.0", + } + if referer is not None: + headers["Referer"] = referer + req = Request(url, headers=headers) + resource = urllib.request.urlopen(req, post_args) else: resource = urllib.request.urlopen(url) data = resource.read().decode(resource.headers.get_content_charset())