Import nature events from Puy-de-Dôme
experimentations/get_puydedome.py | 44 (new executable file)
@@ -0,0 +1,44 @@
#!/usr/bin/python3
# coding: utf-8

import os
import json
import sys

# Get the directory where this file is located.
current = os.path.dirname(os.path.realpath(__file__))

# Get its parent directory.
parent = os.path.dirname(current)

# Add the parent directory and its src/ subdirectory
# to sys.path so the project modules can be imported.
sys.path.append(parent)
sys.path.append(parent + "/src")

from src.agenda_culturel.import_tasks.downloader import *
from src.agenda_culturel.import_tasks.extractor import *
from src.agenda_culturel.import_tasks.importer import *
from src.agenda_culturel.import_tasks.custom_extractors import *


if __name__ == "__main__":

    u2e = URL2Events(SimpleDownloader(), apidae_tourisme.CExtractor())
    url = "https://widgets.apidae-tourisme.com/filter.js?widget[id]=48"
    url_human = "https://ens.puy-de-dome.fr/agenda.html"

    try:
        events = u2e.process(url, url_human, cache="cache-puydedome.html", default_values={}, published=True)

        exportfile = "events-puydedome.json"
        print("Saving events to file {}".format(exportfile))
        with open(exportfile, "w") as f:
            json.dump(events, f, indent=4, default=str)
    except Exception as e:
        print("Exception: " + str(e))
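As a quick sanity check after the script runs, the export can be read back. A minimal sketch; it assumes nothing beyond the json.dump call above:

import json

with open("events-puydedome.json") as f:
    events = json.load(f)

# The exact structure depends on what URL2Events.process returns;
# confirming the file parses as JSON is enough here.
print(type(events))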
@@ -152,6 +152,8 @@ def run_recurrent_import_internal(rimport, downloader, req_id):
        extractor = lerio.CExtractor()
    elif rimport.processor == RecurrentImport.PROCESSOR.LARAYMONDE:
        extractor = laraymonde.CExtractor()
    elif rimport.processor == RecurrentImport.PROCESSOR.APIDAE:
        extractor = apidae_tourisme.CExtractor()
    else:
        extractor = None

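For context, the two added lines above extend the existing one-branch-per-processor dispatch. The same idea expressed as a mapping, as a self-contained toy (all names below are illustrative stand-ins, not the project's classes):

class ApidaeExtractor:
    pass

# One entry per processor key; .get() returns None for unknown keys,
# mirroring the else branch above.
EXTRACTORS = {"apidae": ApidaeExtractor}

def make_extractor(processor):
    cls = EXTRACTORS.get(processor)
    return cls() if cls is not None else None

print(type(make_extractor("apidae")).__name__)  # ApidaeExtractor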
src/agenda_culturel/import_tasks/custom_extractors/apidae_tourisme.py (new file)
@@ -0,0 +1,103 @@
from ..generic_extractors import *
from bs4 import BeautifulSoup
from datetime import datetime


# A class dedicated to getting events from apidae-tourisme widgets
class CExtractor(TwoStepsExtractorNoPause):

    def build_event_url_list(self, content, infuture_days=180):

        # Find the line starting with
        # wrapper.querySelector(".results_agenda").innerHTML = "
        # keep everything after the third quote, undo the JavaScript
        # escapes (\", \n, \/), and parse the embedded HTML.
        for line in content.split("\n"):
            if line.startswith('wrapper.querySelector(".results_agenda").innerHTML = "'):
                html = ('"'.join(line.split('"')[3:])).replace('\\"', '"').replace('\\n', "\n").replace('\\/', '/')
                soup = BeautifulSoup(html, "html.parser")
                links = soup.select('a.widgit_result')
                for l in links:
                    self.add_event_url(l["data-w-href"])
                break

    def add_event_from_content(
        self,
        event_content,
        event_url,
        url_human=None,
        default_values=None,
        published=False,
    ):
        # Look for the window.location.hash line to get the event reference
        ref = None
        for line in event_content.split("\n"):
            if line.strip().startswith("window.location.hash"):
                ref = line.split('"')[1]
                break

        # Look for the line holding the event detail HTML
        for line in event_content.split("\n"):
            if line.startswith('detailsWrapper.innerHTML ='):
                html = ('"'.join(line.split('"')[1:])).replace('\\"', '"').replace('\\n', "\n").replace('\\/', '/')

                soup = BeautifulSoup(html, "html.parser")
                title = soup.select_one('h2.widgit_title').text.strip()
                image = soup.select_one('img')
                image_alt = image["alt"] if image else None
                image = image["src"] if image else None
                description = '\n'.join([d.text for d in soup.select('div.desc')])
                openings = soup.select_one('.openings .mts').text.strip().split("\n")[0]
                start_time = None
                end_time = None
                if "tous les" in openings:
                    # Recurring opening hours: no single start day
                    start_day = None
                else:
                    start_day = Extractor.parse_french_date(openings)
                    details = openings.split("de")
                    if len(details) > 1:
                        hours = details[1].split("à")
                        start_time = Extractor.parse_french_time(hours[0])
                        if len(hours) > 1:
                            end_time = Extractor.parse_french_time(hours[1])

                # Collect the paragraphs between the "Adresse" heading
                # and the next heading of the contact block.
                contact = soup.select_one(".contact")
                in_address = False
                location = []
                for c in contact.children:
                    if c.name == 'h2' and c.text.strip() == "Adresse":
                        in_address = True
                    elif c.name == 'h2' and in_address:
                        break
                    elif c.name == 'p' and in_address:
                        e = c.text.strip()
                        if e != "":
                            location.append(e)

                location = ', '.join(location)

                websites = soup.select("a.website")  # collected but not used yet
                event_url = url_human + "#" + ref

                self.add_event_with_props(
                    default_values,
                    event_url,
                    title,
                    None,
                    start_day,
                    location,
                    description,
                    [],
                    recurrences=None,
                    uuids=[event_url],
                    url_human=event_url,
                    start_time=start_time,
                    end_day=start_day,
                    end_time=end_time,
                    published=published,
                    image=image,
                    image_alt=image_alt,
                )
                return
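Two parsing tricks in the extractor above are worth illustrating. First, the string surgery that un-quotes the HTML embedded in the widget's JavaScript assignment; a standalone run on a made-up line (hypothetical data, not real widget output):

line = 'wrapper.querySelector(".results_agenda").innerHTML = "<a class=\\"widgit_result\\" data-w-href=\\"abc\\">Event<\\/a>";'

# Keep everything after the third quote, then undo the JavaScript escapes,
# exactly as build_event_url_list does.
html = ('"'.join(line.split('"')[3:])).replace('\\"', '"').replace('\\n', "\n").replace('\\/', '/')
print(html)  # <a class="widgit_result" data-w-href="abc">Event</a>";
# The trailing '";' survives; BeautifulSoup simply treats it as stray text.

Second, the openings field is split by hand around "de" and "à"; on a fabricated phrase (Extractor.parse_french_date and parse_french_time belong to the project's generic extractors and are not reproduced here):

openings = "samedi 14 juin de 10h à 12h"  # fabricated example
details = openings.split("de")   # ['samedi 14 juin ', ' 10h à 12h']
hours = details[1].split("à")    # [' 10h ', ' 12h']
print(hours[0].strip(), hours[1].strip())  # 10h 12h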
@@ -2010,6 +2010,7 @@ class RecurrentImport(models.Model):
        ARACHNEE = "arachnee", _("Arachnée concert")
        LERIO = "rio", _('Le Rio')
        LARAYMONDE = "raymonde", _('La Raymonde')
        APIDAE = 'apidae', _('Agenda apidae tourisme')

    class DOWNLOADER(models.TextChoices):
        SIMPLE = "simple", _("simple")

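The new APIDAE entry stores the short value 'apidae' with a human-readable label. A minimal standalone sketch of how such a TextChoices member behaves (assumes Django is installed; it only mirrors the enum line above, not the whole model):

from django.db import models
from django.utils.translation import gettext_lazy as _

class PROCESSOR(models.TextChoices):
    APIDAE = 'apidae', _('Agenda apidae tourisme')

# Members compare equal to their stored value, which is what the
# dispatch in run_recurrent_import_internal relies on.
print(PROCESSOR.APIDAE == 'apidae')  # True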