# Recovered from git patch 55a0094e2f2094dd1917f1be92451e550a738d1c
# ("Oups", Jean-Marie Favreau, Sat, 1 Feb 2025 15:10:26 +0100), which creates
# src/agenda_culturel/import_tasks/custom_extractors/billetterie_cf.py

from ..generic_extractors import *
from bs4 import BeautifulSoup
from datetime import timedelta

# Explicit standard-library imports for names that were previously only
# reachable through the star import above. They are placed after it so that
# these bindings win whatever the star import exports.
import datetime
import re
from urllib.parse import urlparse


class CExtractor(TwoStepsExtractor):
    """Extractor dedicated to events from La Cour des 3 Coquins and Graines
    de spectacle.

    URL: https://billetterie-c3c.clermont-ferrand.fr//

    The listing page provides one URL per event; each event page is then
    parsed, and two extra booking requests per date are issued to obtain the
    start times.
    """

    # Site "THEMATIQUE" label -> (agenda category, tag or None).
    _CATEGORY_MAPPING = {
        "Théâtre": ("Spectacles", "🎭 théâtre"),
        "Concert": ("Fêtes & Concerts", "🎵 concert"),
        "Projection": ("Cinéma", None),
    }

    # Patterns locating the booking identifiers inside inline <script> tags.
    # They are anchored with a greedy leading ".*" and used with match(), so
    # when a script contains several occurrences the last one is captured.
    _PATTERN_CODE_SITE = re.compile(
        r'.*gsw_vars\["CODEPRESTATAIRE"\] = "(.*?)";.*', flags=re.DOTALL
    )
    _PATTERN_CODE_OBJECT = re.compile(
        r'.*gsw_vars\["CODEPRESTATION"\] = "(.*?)";.*', flags=re.DOTALL
    )
    _PATTERN_CODE_MOTEUR = re.compile(
        r'.*Resa.init_moteur_resa\(\'([0-9]+)\'\);.*', flags=re.DOTALL
    )

    def extract(
        self,
        content,
        url,
        url_human=None,
        default_values=None,
        published=False,
        only_future=True,
        ignore_404=True,
    ):
        """Record the site root address, then delegate to the parent class.

        The root address is built from the network location of ``url`` with
        an ``https://`` scheme and a trailing slash; it is used later to
        build the booking requests. All arguments are forwarded unchanged to
        ``TwoStepsExtractor.extract`` and its result is returned.
        """
        self.root_address = "https://" + urlparse(url).netloc + "/"
        return super().extract(
            content,
            url,
            url_human,
            default_values,
            published,
            only_future,
            ignore_404,
        )

    def category_agenda(self, category):
        """Map a site category label to an ``(agenda category, tag)`` pair.

        Returns ``(None, None)`` when the label is empty or unknown, so the
        result can always be unpacked into two values. The tag alone may be
        ``None`` for a known category.
        """
        if not category:
            return None, None
        return self._CATEGORY_MAPPING.get(category, (None, None))

    def build_event_url_list(self, content):
        """Register the detail-page URL of every event card in ``content``.

        Cards without a "more information" link, or whose link has no
        (or an empty) ``href``, are skipped.
        """
        soup = BeautifulSoup(content, "html.parser")

        for card in soup.select("div.fiche-info"):
            link = card.select_one("a.btn.lien_savoir_plus")
            href = link.get("href") if link is not None else None
            if not href:
                continue
            self.add_event_url(self.url + "/" + href)

    def _extract_booking_codes(self, soup):
        """Return ``(code_site, id_object, moteur)`` found in inline scripts.

        Each value is an empty string when the corresponding pattern is not
        found in any script.
        """
        code_site = ""
        id_object = ""
        moteur = ""
        for script in soup.find_all("script"):
            text = script.string
            if not text:
                # External or empty script: nothing to search.
                continue
            found = self._PATTERN_CODE_SITE.match(text)
            if found:
                code_site = found.group(1)
            found = self._PATTERN_CODE_OBJECT.match(text)
            if found:
                id_object = found.group(1)
            found = self._PATTERN_CODE_MOTEUR.match(text)
            if found:
                moteur = found.group(1)
        return code_site, id_object, moteur

    def _fetch_schedule(self, dates, code_site, id_object, moteur, duration):
        """Return ``(start_day, start_time, end_day, end_time)`` tuples.

        Two requests are issued per date: the first tells the server which
        date is selected, the second returns the form listing the available
        hours. End day and time are ``None`` when ``duration`` is ``None``.
        """
        # NOTE(review): root_address already ends with "/", so the URLs
        # below contain "//", like the site URL quoted in the class
        # docstring. Kept unchanged; confirm the server relies on it or
        # tolerates it.
        prestation = "V-" + code_site + "-" + id_object
        offset = None
        if duration is not None:
            offset = timedelta(
                hours=duration.hour,
                minutes=duration.minute,
                seconds=duration.second,
            )

        slots = []
        for date in dates:
            # The first page is required so that the server knows the
            # selected date; its content itself is not used.
            self.downloader.get_content(
                self.root_address
                + "/booking?action=searchAjax&cid="
                + moteur
                + "&afficheDirectDispo="
                + date
                + "&type_prestataire=V&cle_fiche=PRESTATION-"
                + prestation
                + "&datedeb="
                + date
            )
            # Then we get the form with the hours.
            page = self.downloader.get_content(
                self.root_address
                + "/booking?action=detailTarifsPrestationAjax&prestation="
                + prestation
            )
            if not page:
                # Nothing usable came back for this date.
                continue

            # Presumably returns None on unparsable input; TODO confirm.
            start_day = Extractor.parse_french_date(date)
            if start_day is None:
                continue

            form = BeautifulSoup(page, "html.parser")
            for option in form.select("#quart_en_cours_spec option"):
                start_time = Extractor.parse_french_time(option.text)
                if start_time is None:
                    continue
                end_day = None
                end_time = None
                if offset is not None:
                    end = datetime.datetime.combine(start_day, start_time) + offset
                    end_day = end.date()
                    end_time = end.time()
                slots.append((start_day, start_time, end_day, end_time))
        return slots

    def add_event_from_content(
        self,
        event_content,
        event_url,
        url_human=None,
        default_values=None,
        published=False,
    ):
        """Parse one event page and register one event per showing.

        Title, image, description, duration, location, category and tags
        are read from the page. The exact schedule is then fetched from the
        booking endpoints, and ``add_event_with_props`` is called once per
        (date, time) pair found. Nothing is registered when the booking
        identifiers or the schedule cannot be found.
        """
        soup = BeautifulSoup(event_content, "html.parser")

        title = soup.select_one("h1")
        if title is not None:
            title = title.text

        image = soup.select_one("#media .swiper-slide img")
        if image is not None:
            image = image.get("src")

        description = soup.select_one(".presentation")
        if description is not None:
            description = description.get_text()

        duration = soup.select_one("#criteres .DUREE-V .valeur-critere li")
        if duration is not None:
            duration = Extractor.parse_french_time(duration.text)

        location = soup.select_one("#criteres .LIEU-V .valeur-critere li")
        if location is not None:
            location = location.text

        categories = []
        tags = []
        for span in soup.select(".sous-titre span"):
            classes = span.get("class")
            if not classes:
                continue
            if classes[0].startswith("LIEU-"):
                # The subtitle location overrides the one from the criteria.
                location = span.text
            elif classes[0].startswith("THEMATIQUE-"):
                cat, tag = self.category_agenda(span.text)
                if cat:
                    categories.append(cat)
                if tag:
                    tags.append(tag)

        # TODO: parse the dates, fetch the times
        # Options without a usable value (e.g. a placeholder entry) cannot
        # be turned into a request and are dropped.
        dates = [
            value
            for value in (
                option.get("value")
                for option in soup.select("select.datedleb_resa option")
            )
            if value
        ]

        code_site, id_object, moteur = self._extract_booking_codes(soup)

        # Getting the exact schedule needs two supplementary requests per
        # date; the downloader pause is disabled meanwhile and restored even
        # if one of the requests fails.
        datetimes = []
        if code_site != "" and id_object != "" and moteur != "":
            pause = self.downloader.pause
            self.downloader.pause = False
            try:
                datetimes = self._fetch_schedule(
                    dates, code_site, id_object, moteur, duration
                )
            finally:
                self.downloader.pause = pause

        category = categories[0] if categories else None

        # NOTE(review): every showing shares the same uuid (the event URL);
        # confirm this is what the importer expects.
        for start_day, start_time, end_day, end_time in datetimes:
            self.add_event_with_props(
                default_values,
                event_url,
                title,
                category,
                start_day,
                location,
                description,
                tags,
                recurrences=None,
                uuids=[event_url],
                url_human=url_human,
                start_time=start_time,
                end_day=end_day,
                end_time=end_time,
                published=published,
                image=image,
            )