Refactoring extractors
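Each custom extractor moves out of the single custom_extractors.py module into its own file under custom_extractors/ (lacomedie.py, lacoope.py, lapucealoreille.py, lefotomat.py). Every module exposes a class named CExtractor, and the package __init__.py discovers the modules with a glob, so call sites switch from per-venue class names to per-venue modules:

    # before
    u2e = URL2Events(SimpleDownloader(), LaCoopeExtractor())
    # after
    u2e = URL2Events(SimpleDownloader(), lacoope.CExtractor())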
@@ -28,7 +28,7 @@ from src.agenda_culturel.import_tasks.custom_extractors import *
 
 if __name__ == "__main__":
 
-    u2e = URL2Events(SimpleDownloader(), LaComedieExtractor())
+    u2e = URL2Events(SimpleDownloader(), lacomedie.CExtractor())
     url = "https://lacomediedeclermont.com/saison23-24/wp-admin/admin-ajax.php?action=load_dates_existantes"
     url_human = "https://lacomediedeclermont.com/saison23-24/"
 
@@ -28,7 +28,7 @@ from src.agenda_culturel.import_tasks.custom_extractors import *
 
 if __name__ == "__main__":
 
-    u2e = URL2Events(SimpleDownloader(), LaCoopeExtractor())
+    u2e = URL2Events(SimpleDownloader(), lacoope.CExtractor())
     url = "https://www.lacoope.org/concerts-calendrier/"
     url_human = "https://www.lacoope.org/concerts-calendrier/"
 
@@ -28,7 +28,7 @@ from src.agenda_culturel.import_tasks.custom_extractors import *
 
 if __name__ == "__main__":
 
-    u2e = URL2Events(SimpleDownloader(), LaPuceALOreilleExtractor())
+    u2e = URL2Events(SimpleDownloader(), lapucealoreille.CExtractor())
     url = "https://www.lapucealoreille63.fr/programmation/"
     url_human = "https://www.lapucealoreille63.fr/programmation/"
 
@@ -28,7 +28,7 @@ from src.agenda_culturel.import_tasks.custom_extractors import *
 
 if __name__ == "__main__":
 
-    u2e = URL2Events(SimpleDownloader(), LeFotomatExtractor())
+    u2e = URL2Events(SimpleDownloader(), lefotomat.CExtractor())
    url = "https://www.lefotomat.com/feed"
     url_human = "https://www.lefotomat.com/"
 
@@ -102,13 +102,13 @@ def run_recurrent_import(self, pk):
     elif rimport.processor == RecurrentImport.PROCESSOR.ICALNOVC:
         extractor = ICALNoVCExtractor()
     elif rimport.processor == RecurrentImport.PROCESSOR.LACOOPE:
-        extractor = LaCoopeExtractor()
+        extractor = lacoope.CExtractor()
     elif rimport.processor == RecurrentImport.PROCESSOR.LACOMEDIE:
-        extractor = LaComedieExtractor()
+        extractor = lacomedie.CExtractor()
     elif rimport.processor == RecurrentImport.PROCESSOR.LEFOTOMAT:
-        extractor = LeFotomatExtractor()
+        extractor = lefotomat.CExtractor()
     elif rimport.processor == RecurrentImport.PROCESSOR.LAPUCEALOREILLE:
-        extractor = LaPuceALOreilleExtractor()
+        extractor = lapucealoreille.CExtractor()
     else:
         extractor = None
 
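Review note (a sketch, not part of the commit): run_recurrent_import still dispatches on the processor with an elif chain. A table-driven equivalent is sketched below; it assumes only the PROCESSOR values and extractor modules visible in this diff.

    EXTRACTORS = {
        RecurrentImport.PROCESSOR.LACOOPE: lacoope.CExtractor,
        RecurrentImport.PROCESSOR.LACOMEDIE: lacomedie.CExtractor,
        RecurrentImport.PROCESSOR.LEFOTOMAT: lefotomat.CExtractor,
        RecurrentImport.PROCESSOR.LAPUCEALOREILLE: lapucealoreille.CExtractor,
    }

    def make_extractor(processor):
        # .get() falls back to None, matching the else branch above
        cls = EXTRACTORS.get(processor)
        return cls() if cls is not None else None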
@@ -1,266 +0,0 @@
-from .generic_extractors import *
-import re
-import json5
-from datetime import timedelta
-
-# A class dedicated to get events from La Coopérative de Mai:
-# URL: https://www.lacoope.org/concerts-calendrier/
-class LaCoopeExtractor(TwoStepsExtractor):
-
-    nom_lieu = "La Coopérative de Mai"
-
-    def build_event_url_list(self, content):
-        soup = BeautifulSoup(content, "html.parser")
-        script = soup.find('div', class_="js-filter__results").findChildren('script')
-        if len(script) == 0:
-            raise Exception("Cannot find events in the first page")
-        script = script[0]
-        search = re.search(r"window.fullCalendarContent = (.*)</script>", str(script), re.S)
-        if search:
-            data = json5.loads(search.group(1))
-            for e in data['events']:
-                self.add_event_url(e['url'])
-                if e['tag'] == "Gratuit":
-                    self.add_event_tag(e['url'], 'gratuit')
-
-        else:
-            raise Exception('Cannot extract events from javascript')
-
-
-    def add_event_from_content(self, event_content, event_url, url_human = None, default_values = None, published = False):
-        soup = BeautifulSoup(event_content, "html.parser")
-
-        title = soup.find("h1").contents[0]
-        category = "Concert"
-        image = soup.find("meta", property="og:image")
-        if image:
-            image = image["content"]
-
-        description = soup.find("div", class_="grid-concert-content")
-        if description:
-            description = description.find('div', class_="content-striped")
-            if description:
-                description = description.find('div', class_='wysiwyg')
-                if description:
-                    description = description.get_text()
-        if description is None:
-            description = ""
-
-        tags = []
-
-        link_calendar = soup.select('a[href^="https://calendar.google.com/calendar/"]')
-        if len(link_calendar) == 0:
-            raise Exception('Cannot find the google calendar url')
-
-        gg_cal = GGCalendar(link_calendar[0]["href"])
-        start_day = gg_cal.start_day
-        start_time = gg_cal.start_time
-        end_day = gg_cal.end_day
-        end_time = gg_cal.end_time
-        location = LaCoopeExtractor.nom_lieu
-        url_human = event_url
-
-        self.add_event_with_props(event_url, title, category, start_day, location, description, tags, recurrences=None, uuid=event_url, url_human=url_human, start_time=start_time, end_day=end_day, end_time=end_time, published=published, image=image)
-
-
-# A class dedicated to get events from La Comédie de Clermont:
-# URL: https://lacomediedeclermont.com/saison23-24/wp-admin/admin-ajax.php?action=load_dates_existantes
-# URL for humans: https://lacomediedeclermont.com/saison23-24/
-class LaComedieExtractor(TwoStepsExtractor):
-
-    nom_lieu = "La Comédie de Clermont"
-
-    def category_comedie2agenda(self, category):
-        mapping = { "Théâtre": "Théâtre", "Danse": "Danse", "Rencontre": "Autre", "Sortie de résidence": "Autre", "PopCorn Live": "Autre"}
-        if category in mapping:
-            return mapping[category]
-        else:
-            return None
-
-
-    def build_event_url_list(self, content):
-        dates = json5.loads(content)["data"][0]
-
-        url = self.url.split("?")[0]
-        for d in list(set(dates)):
-            if not self.only_future or self.now <= datetime.date.fromisoformat(d):
-                events = self.downloader.get_content(url, post={'action': "load_evenements_jour", "jour": d})
-                if events:
-                    events = json5.loads(events)
-                    if "data" in events:
-                        events = events["data"][0]
-                        soup = BeautifulSoup(events, "html.parser")
-                        events = soup.select("div.unedatedev")
-                        for e in events:
-                            e_url = e.select('a')[0]["href"] + "#" + d # a "fake" url specific for each day of this show
-                            self.add_event_url(e_url)
-                            self.add_event_start_day(e_url, d)
-                            t = str(e.select('div#datecal')[0]).split(' ')[-1].split('<')[0]
-                            self.add_event_start_time(e_url, t)
-                            title = e.select('a')[0].contents[0]
-                            self.add_event_title(e_url, title)
-                            category = e.select("div#lieuevtcal span")
-                            if len(category) > 0:
-                                category = self.category_comedie2agenda(category[-1].contents[0])
-                                if category is not None:
-                                    self.add_event_category(e_url, category)
-                            location = e.select("div#lieuevtcal")[0].contents[-1].split("•")[-1]
-                            self.add_event_location(e_url, location)
-
-
-    def add_event_from_content(self, event_content, event_url, url_human = None, default_values = None, published = False):
-        soup = BeautifulSoup(event_content, "html.parser")
-
-        image = soup.select("#imgspec img")
-        if image:
-            image = image[0]["src"]
-        else:
-            image = None
-
-        description = soup.select("#descspec")[0].get_text().replace("Lire plus...", "")
-
-        url_human = event_url
-
-        self.add_event_with_props(event_url, None, None, None, None, description, [], recurrences=None, uuid=event_url, url_human=url_human, published=published, image=image)
-
-
-# A class dedicated to get events from Le Fotomat'
-# URL: https://www.lefotomat.com/
-class LeFotomatExtractor(TwoStepsExtractor):
-
-    nom_lieu = "Le Fotomat'"
-
-    def category_fotomat2agenda(self, category):
-        if not category:
-            return None
-        mapping = { "Concerts": "Concert"}
-        if category in mapping:
-            return mapping[category]
-        else:
-            return None
-
-
-    def build_event_url_list(self, content):
-        soup = BeautifulSoup(content, "xml")
-
-        events = soup.select("item")
-        for e in events:
-            e_url = e.find("link").contents[0]
-            self.add_event_url(e_url)
-
-            title = e.find("title").contents[0]
-            self.add_event_title(e_url, title)
-
-            category = self.category_fotomat2agenda(e.find("category").contents[0])
-            if category:
-                self.add_event_category(e_url, category)
-
-
-    def add_event_from_content(self, event_content, event_url, url_human = None, default_values = None, published = False):
-        soup = BeautifulSoup(event_content, "html.parser")
-        image = soup.select("div.post-content img.wp-post-image")
-        if image:
-            image = image[0]["src"]
-        else:
-            image = None
-        desc = soup.select("head meta[name=description]")[0]["content"]
-        start_day = self.parse_french_date(desc.split("-")[0])
-        start_time = self.parse_french_time(desc.split("-")[1])
-        end_time = self.parse_french_time(desc.split("-")[2])
-        end_day = self.guess_end_day(start_day, start_time, end_time)
-
-        location = self.nom_lieu
-        descriptions = soup.select("div.vce-col-content")
-        if descriptions:
-            descriptions = [d.get_text() for d in descriptions]
-            description = max(descriptions, key=len)
-        else:
-            description = None
-
-        article = soup.select("article.post")
-        tags = []
-        for c in article[0]["class"]:
-            if c.startswith("category-"):
-                tag = '-'.join(c.split("-")[1:])
-                if tag != "concerts":
-                    tags.append(tag)
-
-        url_human = event_url
-
-        self.add_event_with_props(event_url, None, None, start_day, location, description, tags, recurrences=None, uuid=event_url, url_human=url_human, start_time=start_time, end_day=end_day, end_time=end_time, published=published, image=image)
-
-
-# A class dedicated to get events from La puce à l'oreille
-# URL: https://www.lapucealoreille63.fr/
-class LaPuceALOreilleExtractor(TwoStepsExtractor):
-
-    nom_lieu = "La Puce à l'Oreille"
-
-    def build_event_url_list(self, content):
-        soup = BeautifulSoup(content, "html.parser")
-
-        events = soup.select("div.SPY_vo div[data-testid=mesh-container-content]")
-        for e in events:
-            e_url = e.find("a")
-            if e_url:
-                if self.add_event_url(e_url["href"]):
-                    title = e.select("div[data-testid=richTextElement] h1.font_0 span")
-                    if title:
-                        title = title[0].contents[0].get_text().replace("\n", " ")
-                        title = re.sub(" +", " ", title)
-                        self.add_event_title(e_url["href"], title)
-
-
-    def add_event_from_content(self, event_content, event_url, url_human = None, default_values = None, published = False):
-        soup = BeautifulSoup(event_content, "html.parser")
-
-        start_day = self.parse_french_date(soup.find("h2").get_text()) # not perfect, but damn, this site is badly built
-
-        spans = soup.select("div[data-testid=richTextElement] span")
-        start_time = None
-        end_time = None
-        location = None
-
-        for span in spans:
-            txt = span.get_text()
-            if txt.lstrip().startswith("DÉBUT"):
-                start_time = self.parse_french_time(txt.split(":")[-1])
-                end_time = None
-            elif txt.lstrip().startswith("HORAIRES :"):
-                hs = txt.split(":")[-1].split("-")
-                start_time = self.parse_french_time(hs[0])
-                if len(hs) > 1:
-                    end_time = self.parse_french_time(hs[1])
-                else:
-                    end_time = None
-            elif txt.lstrip().startswith("LIEU :") and not location:
-                location = txt.split(":")[-1].lstrip()
-
-        if not location:
-            location = self.nom_lieu
-        end_day = self.guess_end_day(start_day, start_time, end_time)
-
-        url_human = event_url
-        tags = []
-
-        image = soup.select("wow-image img[fetchpriority=high]")
-        if image:
-            image = image[0]["src"]
-        else:
-            image = None
-
-        descriptions = soup.select("div[data-testid=mesh-container-content] div[data-testid=inline-content] div[data-testid=mesh-container-content] div[data-testid=richTextElement]")
-        if descriptions:
-            descriptions = [d.get_text() for d in descriptions]
-            description = max(descriptions, key=len)
-        else:
-            description = None
-
-        self.add_event_with_props(event_url, None, "Concert", start_day, location, description, tags, recurrences=None, uuid=event_url, url_human=url_human, start_time=start_time, end_day=end_day, end_time=end_time, published=published, image=image)
@@ -0,0 +1,4 @@
+from os.path import dirname, basename, isfile, join
+import glob
+modules = glob.glob(join(dirname(__file__), "*.py"))
+__all__ = [ basename(f)[:-3] for f in modules if isfile(f) and not f.endswith('__init__.py')]
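Review note: because this __init__.py fills __all__ with the sibling module names, a star import of the package binds each venue module by name; glob order is filesystem-dependent, but the resulting set is exactly the four modules added below. A minimal sketch of the effect:

    # __all__ -> {"lacomedie", "lacoope", "lapucealoreille", "lefotomat"}
    from src.agenda_culturel.import_tasks.custom_extractors import *

    extractor = lacoope.CExtractor()  # each module exposes its venue's CExtractor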
@@ -0,0 +1,69 @@
+from ..generic_extractors import *
+import re
+import json5
+from datetime import timedelta
+
+
+# A class dedicated to get events from La Comédie de Clermont:
+# URL: https://lacomediedeclermont.com/saison23-24/wp-admin/admin-ajax.php?action=load_dates_existantes
+# URL for humans: https://lacomediedeclermont.com/saison23-24/
+class CExtractor(TwoStepsExtractor):
+
+    nom_lieu = "La Comédie de Clermont"
+
+    def category_comedie2agenda(self, category):
+        mapping = { "Théâtre": "Théâtre", "Danse": "Danse", "Rencontre": "Autre", "Sortie de résidence": "Autre", "PopCorn Live": "Autre"}
+        if category in mapping:
+            return mapping[category]
+        else:
+            return None
+
+
+    def build_event_url_list(self, content):
+        dates = json5.loads(content)["data"][0]
+
+        url = self.url.split("?")[0]
+        for d in list(set(dates)):
+            if not self.only_future or self.now <= datetime.date.fromisoformat(d):
+                events = self.downloader.get_content(url, post={'action': "load_evenements_jour", "jour": d})
+                if events:
+                    events = json5.loads(events)
+                    if "data" in events:
+                        events = events["data"][0]
+                        soup = BeautifulSoup(events, "html.parser")
+                        events = soup.select("div.unedatedev")
+                        for e in events:
+                            e_url = e.select('a')[0]["href"] + "#" + d # a "fake" url specific for each day of this show
+                            self.add_event_url(e_url)
+                            self.add_event_start_day(e_url, d)
+                            t = str(e.select('div#datecal')[0]).split(' ')[-1].split('<')[0]
+                            self.add_event_start_time(e_url, t)
+                            title = e.select('a')[0].contents[0]
+                            self.add_event_title(e_url, title)
+                            category = e.select("div#lieuevtcal span")
+                            if len(category) > 0:
+                                category = self.category_comedie2agenda(category[-1].contents[0])
+                                if category is not None:
+                                    self.add_event_category(e_url, category)
+                            location = e.select("div#lieuevtcal")[0].contents[-1].split("•")[-1]
+                            self.add_event_location(e_url, location)
+
+
+    def add_event_from_content(self, event_content, event_url, url_human = None, default_values = None, published = False):
+        soup = BeautifulSoup(event_content, "html.parser")
+
+        image = soup.select("#imgspec img")
+        if image:
+            image = image[0]["src"]
+        else:
+            image = None
+
+        description = soup.select("#descspec")[0].get_text().replace("Lire plus...", "")
+
+        url_human = event_url
+
+        self.add_event_with_props(event_url, None, None, None, None, description, [], recurrences=None, uuid=event_url, url_human=url_human, published=published, image=image)
@@ -0,0 +1,64 @@
+from ..generic_extractors import *
+import re
+import json5
+from datetime import timedelta
+
+# A class dedicated to get events from La Coopérative de Mai:
+# URL: https://www.lacoope.org/concerts-calendrier/
+class CExtractor(TwoStepsExtractor):
+
+    nom_lieu = "La Coopérative de Mai"
+
+    def build_event_url_list(self, content):
+        soup = BeautifulSoup(content, "html.parser")
+        script = soup.find('div', class_="js-filter__results").findChildren('script')
+        if len(script) == 0:
+            raise Exception("Cannot find events in the first page")
+        script = script[0]
+        search = re.search(r"window.fullCalendarContent = (.*)</script>", str(script), re.S)
+        if search:
+            data = json5.loads(search.group(1))
+            for e in data['events']:
+                self.add_event_url(e['url'])
+                if e['tag'] == "Gratuit":
+                    self.add_event_tag(e['url'], 'gratuit')
+
+        else:
+            raise Exception('Cannot extract events from javascript')
+
+
+    def add_event_from_content(self, event_content, event_url, url_human = None, default_values = None, published = False):
+        soup = BeautifulSoup(event_content, "html.parser")
+
+        title = soup.find("h1").contents[0]
+        category = "Concert"
+        image = soup.find("meta", property="og:image")
+        if image:
+            image = image["content"]
+
+        description = soup.find("div", class_="grid-concert-content")
+        if description:
+            description = description.find('div', class_="content-striped")
+            if description:
+                description = description.find('div', class_='wysiwyg')
+                if description:
+                    description = description.get_text()
+        if description is None:
+            description = ""
+
+        tags = []
+
+        link_calendar = soup.select('a[href^="https://calendar.google.com/calendar/"]')
+        if len(link_calendar) == 0:
+            raise Exception('Cannot find the google calendar url')
+
+        gg_cal = GGCalendar(link_calendar[0]["href"])
+        start_day = gg_cal.start_day
+        start_time = gg_cal.start_time
+        end_day = gg_cal.end_day
+        end_time = gg_cal.end_time
+        location = CExtractor.nom_lieu
+        url_human = event_url
+
+        self.add_event_with_props(event_url, title, category, start_day, location, description, tags, recurrences=None, uuid=event_url, url_human=url_human, start_time=start_time, end_day=end_day, end_time=end_time, published=published, image=image)
@@ -0,0 +1,73 @@
+from ..generic_extractors import *
+import re
+import json5
+from datetime import timedelta
+
+
+# A class dedicated to get events from La puce à l'oreille
+# URL: https://www.lapucealoreille63.fr/
+class CExtractor(TwoStepsExtractor):
+
+    nom_lieu = "La Puce à l'Oreille"
+
+    def build_event_url_list(self, content):
+        soup = BeautifulSoup(content, "html.parser")
+
+        events = soup.select("div.SPY_vo div[data-testid=mesh-container-content]")
+        for e in events:
+            e_url = e.find("a")
+            if e_url:
+                if self.add_event_url(e_url["href"]):
+                    title = e.select("div[data-testid=richTextElement] h1.font_0 span")
+                    if title:
+                        title = title[0].contents[0].get_text().replace("\n", " ")
+                        title = re.sub(" +", " ", title)
+                        self.add_event_title(e_url["href"], title)
+
+
+    def add_event_from_content(self, event_content, event_url, url_human = None, default_values = None, published = False):
+        soup = BeautifulSoup(event_content, "html.parser")
+
+        start_day = self.parse_french_date(soup.find("h2").get_text()) # not perfect, but damn, this site is badly built
+
+        spans = soup.select("div[data-testid=richTextElement] span")
+        start_time = None
+        end_time = None
+        location = None
+
+        for span in spans:
+            txt = span.get_text()
+            if txt.lstrip().startswith("DÉBUT"):
+                start_time = self.parse_french_time(txt.split(":")[-1])
+                end_time = None
+            elif txt.lstrip().startswith("HORAIRES :"):
+                hs = txt.split(":")[-1].split("-")
+                start_time = self.parse_french_time(hs[0])
+                if len(hs) > 1:
+                    end_time = self.parse_french_time(hs[1])
+                else:
+                    end_time = None
+            elif txt.lstrip().startswith("LIEU :") and not location:
+                location = txt.split(":")[-1].lstrip()
+
+        if not location:
+            location = self.nom_lieu
+        end_day = self.guess_end_day(start_day, start_time, end_time)
+
+        url_human = event_url
+        tags = []
+
+        image = soup.select("wow-image img[fetchpriority=high]")
+        if image:
+            image = image[0]["src"]
+        else:
+            image = None
+
+        descriptions = soup.select("div[data-testid=mesh-container-content] div[data-testid=inline-content] div[data-testid=mesh-container-content] div[data-testid=richTextElement]")
+        if descriptions:
+            descriptions = [d.get_text() for d in descriptions]
+            description = max(descriptions, key=len)
+        else:
+            description = None
+
+        self.add_event_with_props(event_url, None, "Concert", start_day, location, description, tags, recurrences=None, uuid=event_url, url_human=url_human, start_time=start_time, end_day=end_day, end_time=end_time, published=published, image=image)
@@ -0,0 +1,72 @@
+from ..generic_extractors import *
+import re
+import json5
+from datetime import timedelta
+
+
+# A class dedicated to get events from Le Fotomat'
+# URL: https://www.lefotomat.com/
+class CExtractor(TwoStepsExtractor):
+
+    nom_lieu = "Le Fotomat'"
+
+    def category_fotomat2agenda(self, category):
+        if not category:
+            return None
+        mapping = { "Concerts": "Concert"}
+        if category in mapping:
+            return mapping[category]
+        else:
+            return None
+
+
+    def build_event_url_list(self, content):
+        soup = BeautifulSoup(content, "xml")
+
+        events = soup.select("item")
+        for e in events:
+            e_url = e.find("link").contents[0]
+            self.add_event_url(e_url)
+
+            title = e.find("title").contents[0]
+            self.add_event_title(e_url, title)
+
+            category = self.category_fotomat2agenda(e.find("category").contents[0])
+            if category:
+                self.add_event_category(e_url, category)
+
+
+    def add_event_from_content(self, event_content, event_url, url_human = None, default_values = None, published = False):
+        soup = BeautifulSoup(event_content, "html.parser")
+        image = soup.select("div.post-content img.wp-post-image")
+        if image:
+            image = image[0]["src"]
+        else:
+            image = None
+        desc = soup.select("head meta[name=description]")[0]["content"]
+        start_day = self.parse_french_date(desc.split("-")[0])
+        start_time = self.parse_french_time(desc.split("-")[1])
+        end_time = self.parse_french_time(desc.split("-")[2])
+        end_day = self.guess_end_day(start_day, start_time, end_time)
+
+        location = self.nom_lieu
+        descriptions = soup.select("div.vce-col-content")
+        if descriptions:
+            descriptions = [d.get_text() for d in descriptions]
+            description = max(descriptions, key=len)
+        else:
+            description = None
+
+        article = soup.select("article.post")
+        tags = []
+        for c in article[0]["class"]:
+            if c.startswith("category-"):
+                tag = '-'.join(c.split("-")[1:])
+                if tag != "concerts":
+                    tags.append(tag)
+
+        url_human = event_url
+
+        self.add_event_with_props(event_url, None, None, start_day, location, description, tags, recurrences=None, uuid=event_url, url_human=url_human, start_time=start_time, end_day=end_day, end_time=end_time, published=published, image=image)
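Review note: the four modules above all follow the same two-step shape inherited from TwoStepsExtractor (defined in generic_extractors, outside this diff). A minimal skeleton for adding another venue, with the method signatures taken from the call sites above and the selector and venue name purely hypothetical:

    from ..generic_extractors import *

    class CExtractor(TwoStepsExtractor):

        nom_lieu = "Nom du lieu"  # hypothetical venue name

        def build_event_url_list(self, content):
            # step 1: register one URL per event found on the listing page
            soup = BeautifulSoup(content, "html.parser")
            for a in soup.select("a.event"):  # hypothetical selector
                self.add_event_url(a["href"])

        def add_event_from_content(self, event_content, event_url, url_human = None, default_values = None, published = False):
            # step 2: build the event itself from its detail page
            soup = BeautifulSoup(event_content, "html.parser")
            title = soup.find("h1").get_text()
            self.add_event_with_props(event_url, title, "Concert", None, self.nom_lieu, "", [], recurrences=None, uuid=event_url, url_human=event_url, published=published)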
@@ -3,6 +3,7 @@
             <tr>
                 <th rowspan="2">Identifiant</th>
                 <th rowspan="2">Date</th>
                 <th rowspan="2">Source</th>
                 <th rowspan="2">Status</th>
+                <th rowspan="2">Action</th>
                 <th colspan="4">événements</th>
@@ -19,6 +20,7 @@
         <tr>
             <td>{{ obj.id }}</a></td>
             <td>{{ obj.created_date }}</td>
             <td>{% if obj.recurrentImport %}<a href="{{ obj.recurrentImport.get_absolute_url }}">{{ obj.recurrentImport.name }}</a>{% else %}-{% endif %} </td>
             <td><span{% if obj.status == "failed" %} data-tooltip="{{ obj.error_message }}"{% endif %}>{{ obj.status }}</span></td>
+            <td>{% if obj.status == "running" %}<a href="{% url 'cancel_import' obj.id %}">Annuler</a>{% endif %}</td>
             <td>{% if obj.status == "success" %}{{ obj.nb_initial }}{% endif %}</td>
 