Oops
from ..generic_extractors import *
from bs4 import BeautifulSoup
from datetime import timedelta
import datetime
import re
from urllib.parse import urlparse


# A class dedicated to getting events from La Cour des 3 Coquins and Graines de spectacle
# URL: https://billetterie-c3c.clermont-ferrand.fr//
class CExtractor(TwoStepsExtractor):
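    # Extraction follows the two-steps scheme of TwoStepsExtractor:
    # build_event_url_list() first collects the URL of each event from the
    # agenda page, then add_event_from_content() is called on each event page.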
    def extract(
        self,
        content,
        url,
        url_human=None,
        default_values=None,
        published=False,
        only_future=True,
        ignore_404=True,
    ):
        # remember the root of the ticketing website to build booking URLs later
        self.root_address = "https://" + urlparse(url).netloc + "/"
        return super().extract(
            content, url, url_human, default_values, published, only_future, ignore_404
        )
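
    # Map a category name from the website to an agenda (category, tag) pair;
    # unknown categories map to (None, None).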
    def category_agenda(self, category):
        if not category:
            return None, None
        mapping = {"Théâtre": "Spectacles", "Concert": "Fêtes & Concerts", "Projection": "Cinéma"}
        mapping_tag = {"Théâtre": "🎭 théâtre", "Concert": "🎵 concert", "Projection": None}
        if category in mapping:
            return mapping[category], mapping_tag[category]
        else:
            return None, None
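
    # First step: collect the URL of each event from the agenda page.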
    def build_event_url_list(self, content):
        soup = BeautifulSoup(content, "html.parser")

        events = soup.select("div.fiche-info")

        for e in events:
            link = e.select_one("a.btn.lien_savoir_plus")
            if link is None:
                continue
            e_url = link.get("href", "")
            if e_url != "":
                e_url = self.url + "/" + e_url
                self.add_event_url(e_url)
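
    # Second step: parse a single event page and register the corresponding events.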
    def add_event_from_content(
        self,
        event_content,
        event_url,
        url_human=None,
        default_values=None,
        published=False,
    ):
        soup = BeautifulSoup(event_content, "html.parser")
        title = soup.select_one("h1")
        if title:
            title = title.text

        image = soup.select_one("#media .swiper-slide img")
        if image:
            image = image["src"]
        else:
            image = None

        description = soup.select_one(".presentation")
        if description is not None:
            description = description.get_text()
        duration = soup.select_one("#criteres .DUREE-V .valeur-critere li")
        if duration is not None:
            duration = Extractor.parse_french_time(duration.text)

        location = soup.select_one("#criteres .LIEU-V .valeur-critere li")
        if location is not None:
            location = location.text
        categories = []
        tags = []
        for t in soup.select(".sous-titre span"):
            classes = t.get("class")
            if classes:
                if classes[0].startswith("LIEU-"):
                    location = t.text
                elif classes[0].startswith("THEMATIQUE-"):
                    cat, tag = self.category_agenda(t.text)
                    if cat:
                        categories.append(cat)
                    if tag:
                        tags.append(tag)
        # TODO: parse the dates, retrieve the times ()
        dates = [o.get("value") for o in soup.select("select.datedleb_resa option")]
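
        # The booking codes are only exposed in inline JavaScript (gsw_vars
        # variables and the Resa.init_moteur_resa call), so they are extracted
        # from the script tags with regular expressions.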
        patternCodeSite = re.compile(r'.*gsw_vars\["CODEPRESTATAIRE"\] = "(.*?)";.*', flags=re.DOTALL)
        patternCodeObject = re.compile(r'.*gsw_vars\["CODEPRESTATION"\] = "(.*?)";.*', flags=re.DOTALL)
        patternCodeMoteur = re.compile(r'.*Resa.init_moteur_resa\(\'([0-9]+)\'\);.*', flags=re.DOTALL)
        scripts = soup.find_all("script")
        codeSite = ""
        idObject = ""
        moteur = ""
        for script in scripts:
            text = str(script.string)
            data = patternCodeSite.match(text)
            if data:
                codeSite = data.groups()[0]
            data = patternCodeObject.match(text)
            if data:
                idObject = data.groups()[0]
            data = patternCodeMoteur.match(text)
            if data:
                moteur = data.groups()[0]
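
        # temporarily disable the pause between requests while fetching the
        # schedule pages; it is restored afterwards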
        pause = self.downloader.pause
        self.downloader.pause = False

        # getting the exact schedule requires two extra requests per date
        datetimes = []
        if codeSite != "" and idObject != "" and moteur != "":
            for date in dates:
                # the first request is required so that the server registers the selected date
                page1 = self.downloader.get_content(self.root_address + "booking?action=searchAjax&cid=" + moteur + "&afficheDirectDispo=" + date + "&type_prestataire=V&cle_fiche=PRESTATION-V-" + codeSite + "-" + idObject + "&datedeb=" + date)
                # the second request returns the form that lists the available times
                page2 = self.downloader.get_content(self.root_address + "booking?action=detailTarifsPrestationAjax&prestation=V-" + codeSite + "-" + idObject)
                soup2 = BeautifulSoup(page2, "html.parser")
                times = [o.text for o in soup2.select("#quart_en_cours_spec option")]
                for t in times:
                    startdate = Extractor.parse_french_date(date)
                    starttime = Extractor.parse_french_time(t)
                    start = datetime.datetime.combine(startdate, starttime)
                    enddate = None
                    endtime = None
                    if duration is not None:
                        end = start + timedelta(hours=duration.hour, minutes=duration.minute, seconds=duration.second)
                        enddate = end.date()
                        endtime = end.time()
                    datetimes.append((startdate, starttime, enddate, endtime))
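        # restore the downloader's pause setting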
        self.downloader.pause = pause

        category = None
        if len(categories) > 0:
            category = categories[0]
        for dt in datetimes:
            self.add_event_with_props(
                default_values,
                event_url,
                title,
                category,
                dt[0],
                location,
                description,
                tags,
                recurrences=None,
                uuids=[event_url],
                url_human=url_human,
                start_time=dt[1],
                end_day=dt[2],
                end_time=dt[3],
                published=published,
            )