Ajout (pas finalisé) de l'import Cour des 3 Coquins
This commit is contained in:
		
							
								
								
									
										43
									
								
								experimentations/get_c3c_events.py
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										43
									
								
								experimentations/get_c3c_events.py
									
									
									
									
									
										Executable file
									
								
							@@ -0,0 +1,43 @@
 | 
			
		||||
#!/usr/bin/python3
 | 
			
		||||
# coding: utf-8
 | 
			
		||||
 | 
			
		||||
import os
 | 
			
		||||
import json
 | 
			
		||||
import sys
 | 
			
		||||
 | 
			
		||||
# Make the `src` package importable: append the parent of the directory
# holding this script to the module search path before the project imports.
current = os.path.dirname(os.path.realpath(__file__))
parent = os.path.split(current)[0]
sys.path.append(parent)

from src.agenda_culturel.import_tasks.downloader import *
from src.agenda_culturel.import_tasks.extractor import *
from src.agenda_culturel.import_tasks.importer import *
from src.agenda_culturel.import_tasks.custom_extractors import *


if __name__ == "__main__":
    # The ticketing home page is both the page to crawl and the
    # human-facing link attached to the import.
    url = "https://billetterie-c3c.clermont-ferrand.fr/"
    url_human = url
    exportfile = "events-c3c.json"

    u2e = URL2Events(SimpleDownloader(), c3c.CExtractor())

    try:
        events = u2e.process(
            url,
            url_human,
            cache="cache-c3c.html",
            default_values={"location": "La Cour des 3 Coquins"},
            published=True,
        )

        print("Saving events to file {}".format(exportfile))
        with open(exportfile, "w") as out:
            # default=str stringifies any value json cannot encode natively.
            json.dump(events, out, indent=4, default=str)
    except Exception as err:
        # Top-level boundary of an experimentation script: report and stop.
        print("Exception: " + str(err))
 | 
			
		||||
							
								
								
									
										100
									
								
								src/agenda_culturel/import_tasks/custom_extractors/c3c.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										100
									
								
								src/agenda_culturel/import_tasks/custom_extractors/c3c.py
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,100 @@
 | 
			
		||||
from ..generic_extractors import *
 | 
			
		||||
from bs4 import BeautifulSoup
 | 
			
		||||
 | 
			
		||||
# A class dedicated to get events from La Cour des 3 Coquins
 | 
			
		||||
# URL: https://billetterie-c3c.clermont-ferrand.fr/
 | 
			
		||||
class CExtractor(TwoStepsExtractor):
    """Extract events from the La Cour des 3 Coquins ticketing site.

    Work in progress: the listing page is turned into a list of event URLs
    and each event page is parsed for its title, image, description,
    duration, location and tags, but dates are not parsed yet, so no event
    is registered for now.
    """

    # Default venue name, used when the event page does not name a location.
    nom_lieu = "La Cour des 3 Coquins"

    # Site category label -> agenda category label.
    _C3C_CATEGORIES = {
        "Théâtre": "Théâtre",
        "Concert": "Concert",
        "Projection": "Cinéma",
    }

    def category_c3c2agenda(self, category):
        """Return the agenda category for a site category, or None if unknown."""
        if not category:
            return None
        return self._C3C_CATEGORIES.get(category)

    def build_event_url_list(self, content):
        """Register the URL of every event card found in the listing page.

        Cards without a "more info" link, or with an empty href, are skipped
        instead of aborting the whole import.
        """
        soup = BeautifulSoup(content, "html.parser")

        for card in soup.select("div.fiche-info"):
            link = card.select_one("a.btn.lien_savoir_plus")
            href = link.get("href") if link is not None else None
            if not href:
                continue
            # Join with exactly one slash, whether or not the base URL ends
            # with "/" or the href starts with one.
            self.add_event_url(self.url.rstrip("/") + "/" + href.lstrip("/"))

    def add_event_from_content(
        self,
        event_content,
        event_url,
        url_human=None,
        default_values=None,
        published=False,
    ):
        """Parse one event page and register the event once dates are known.

        Date parsing is not implemented yet, so this currently only prints
        the extracted fields and returns without registering anything.
        """
        soup = BeautifulSoup(event_content, "html.parser")

        title = soup.select_one("h1")
        if title is not None:
            title = title.text

        image = soup.select_one("#media .swiper-slide img")
        image = image["src"] if image is not None else None

        description = soup.select_one(".presentation")

        duree = soup.select_one("#criteres .DUREE-V .valeur-critere li")
        if duree is not None:
            duree = self.parse_french_time(duree.text)

        # The sub-title spans carry the venue (LIEU-*) and themes (THEMATIQUE-*)
        # in their first CSS class.
        location = self.nom_lieu
        tags = []
        for span in soup.select(".sous-titre span"):
            classes = span.get("class")
            if not classes:
                continue
            if classes[0].startswith("LIEU-"):
                location = span.text
            elif classes[0].startswith("THEMATIQUE-"):
                tag = self.category_c3c2agenda(span.text)
                if tag is not None:
                    tags.append(tag)

        # TODO: parse the dates and retrieve the times. Until then these stay
        # unset and the event cannot be registered.
        dates = None
        start_day = start_time = end_day = end_time = None

        print("EVENT ", event_url)
        print("- ", title)
        print("- ", image)
        # The description block may be missing from the page.
        print("- ", len(description) if description is not None else None)
        print("- ", duree)
        print("- ", location)
        print("- ", tags)
        print("- ", dates)

        if start_day is None:
            return

        url_human = event_url

        # NOTE(review): the second and third positional arguments are passed
        # as None although a title was parsed above, and `description` is
        # still a parsed element, not text -- confirm against the
        # add_event_with_props signature when finishing this extractor.
        self.add_event_with_props(
            event_url,
            None,
            None,
            start_day,
            location,
            description,
            tags,
            recurrences=None,
            uuids=[event_url],
            url_human=url_human,
            start_time=start_time,
            end_day=end_day,
            end_time=end_time,
            published=published,
            image=image,
        )
 | 
			
		||||
@@ -97,13 +97,20 @@ class Extractor(ABC):
 | 
			
		||||
                s = "0"
 | 
			
		||||
            else:
 | 
			
		||||
                # format heures
 | 
			
		||||
                m = re.search("([0-9]+)[ Hh:.]", text)
 | 
			
		||||
                m = re.search("([0-9]+) [Hh:.]", text)
 | 
			
		||||
                if m:
 | 
			
		||||
                    h = m.group(1)
 | 
			
		||||
                    m = "0"
 | 
			
		||||
                    s = "0"
 | 
			
		||||
                else:
 | 
			
		||||
                    return None
 | 
			
		||||
                    # format minutes
 | 
			
		||||
                    m = re.search("([0-9]+)[ ]*(?:mn|min|Min|Mn)", text)
 | 
			
		||||
                    if m:
 | 
			
		||||
                        h = "0"
 | 
			
		||||
                        m = m.group(1)
 | 
			
		||||
                        s = "0"
 | 
			
		||||
                    else:
 | 
			
		||||
                        return None
 | 
			
		||||
 | 
			
		||||
        try:
 | 
			
		||||
            h = int(h)
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user