Ajout de la puce à l'oreille
This commit is contained in:
		
							
								
								
									
										43
									
								
								experimentations/get_lapucealoreille_events.py
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										43
									
								
								experimentations/get_lapucealoreille_events.py
									
									
									
									
									
										Executable file
									
								
							@@ -0,0 +1,43 @@
 | 
			
		||||
#!/usr/bin/python3
# coding: utf-8

"""Fetch events from "La Puce à l'Oreille" and dump them to a JSON file.

Standalone experimentation script: it downloads the venue's program page,
extracts events with the project's LaPuceALOreilleExtractor, and writes the
result to events-lapucealoreille.json in the current working directory.
"""

import os
import json
import sys

# Directory where this file is located.
current = os.path.dirname(os.path.realpath(__file__))

# Parent directory, which contains the src package.
parent = os.path.dirname(current)

# Make the project package importable from this experimentation script.
sys.path.append(parent)

from src.agenda_culturel.import_tasks.downloader import *
from src.agenda_culturel.import_tasks.extractor import *
from src.agenda_culturel.import_tasks.importer import *
from src.agenda_culturel.import_tasks.custom_extractors import *


if __name__ == "__main__":

    u2e = URL2Events(SimpleDownloader(), LaPuceALOreilleExtractor())
    # Machine-readable and human-readable URLs are the same page here.
    url = "https://www.lapucealoreille63.fr/programmation/"
    url_human = url

    try:
        events = u2e.process(url, url_human, cache="cache-lapucealoreille.xml", default_values={}, published=True)

        exportfile = "events-lapucealoreille.json"
        print(f"Saving events to file {exportfile}")
        # Explicit UTF-8: event titles/descriptions contain accented French
        # text and must not depend on the platform's default encoding.
        with open(exportfile, "w", encoding="utf-8") as f:
            json.dump(events, f, indent=4, default=str)
    except Exception as e:
        # Best-effort script: report the failure rather than crash with a
        # full traceback.
        print("Exception: " + str(e))
 | 
			
		||||
@@ -107,6 +107,8 @@ def run_recurrent_import(self, pk):
 | 
			
		||||
        extractor = LaComedieExtractor()
 | 
			
		||||
    elif rimport.processor == RecurrentImport.PROCESSOR.LEFOTOMAT:
 | 
			
		||||
        extractor = LeFotomatExtractor()
 | 
			
		||||
    elif rimport.processor == RecurrentImport.PROCESSOR.LAPUCEALOREILLE:
 | 
			
		||||
        extractor = LaPuceALOreilleExtractor()
 | 
			
		||||
    else:
 | 
			
		||||
        extractor = None
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -195,3 +195,72 @@ class LeFotomatExtractor(TwoStepsExtractor):
 | 
			
		||||
        url_human = event_url
 | 
			
		||||
 | 
			
		||||
        self.add_event_with_props(event_url, None, None, start_day, location, description, tags, recurrences=None, uuid=event_url, url_human=url_human, start_time=start_time, end_day=end_day, end_time=end_time, published=published, image=image)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# A class dedicated to get events from La puce à l'oreille
# URL: https://www.lapucealoreille63.fr/
class LaPuceALOreilleExtractor(TwoStepsExtractor):
    """Two-steps extractor for the venue "La Puce à l'Oreille".

    Step 1 (build_event_url_list) collects one URL per event from the
    program page; step 2 (add_event_from_content) parses each event page
    and registers the event with its properties.

    NOTE(review): all CSS selectors below target the site's generated (Wix)
    markup and will silently match nothing if the site layout changes.
    """

    # Fallback location name when the event page does not state one.
    nom_lieu = "La Puce à l'Oreille"

    def build_event_url_list(self, content):
        """Scan the program page and register each event URL (and, when
        available, its title) with the extractor.

        content: HTML of the program page, as accepted by BeautifulSoup.
        """
        soup = BeautifulSoup(content, "html.parser")

        # One container per event on the program page.
        events = soup.select("div.SPY_vo div[data-testid=mesh-container-content]")
        for e in events:
            e_url = e.find("a")
            if e_url:
                # add_event_url is expected to return a truthy value only
                # for URLs not seen before, so each event is titled once.
                if self.add_event_url(e_url["href"]):
                    title = e.select("div[data-testid=richTextElement] h1.font_0 span")
                    if title:
                        # Flatten the title to one line and collapse runs
                        # of spaces.
                        title = title[0].contents[0].get_text().replace("\n", " ")
                        title = re.sub(" +", " ", title)
                        self.add_event_title(e_url["href"], title)


    def add_event_from_content(self, event_content, event_url, url_human = None, default_values = None, published = False):
        """Parse one event page and register the event.

        event_content: HTML of the event page.
        event_url: canonical URL of the event (also used as uuid).
        url_human: ignored — overwritten with event_url below.
        published: forwarded to add_event_with_props.
        """
        soup = BeautifulSoup(event_content, "html.parser")

        # Not perfect, but this site is very badly structured: the first h2
        # of the page is assumed to hold the date.
        start_day = self.parse_french_date(soup.find("h2").get_text())

        spans = soup.select("div[data-testid=richTextElement] span")
        start_time = None
        end_time = None
        location = None

        # Scan the page's text elements for schedule and location markers.
        # NOTE(review): split(":")[-1] keeps only the text after the LAST
        # colon — assumes times are written like "20h30" with no ":" inside;
        # confirm against the live pages.
        for span in spans:
            txt = span.get_text()
            if txt.lstrip().startswith("DÉBUT"):
                start_time = self.parse_french_time(txt.split(":")[-1])
                end_time = None
            elif txt.lstrip().startswith("HORAIRES :"):
                # e.g. "HORAIRES : 20h30 - 23h00" — optional end time after
                # the dash.
                hs = txt.split(":")[-1].split("-")
                start_time = self.parse_french_time(hs[0])
                if len(hs) > 1:
                    end_time = self.parse_french_time(hs[1])
                else:
                    end_time = None
            elif txt.lstrip().startswith("LIEU :") and not location:
                location = txt.split(":")[-1].lstrip()

        # Default to the venue itself when no location is given.
        if not location:
            location = self.nom_lieu
        end_day = self.guess_end_day(start_day, start_time, end_time)

        url_human = event_url
        tags = []

        # Main image: the high-priority image of the page, if any.
        image = soup.select("wow-image img[fetchpriority=high]")
        if image:
            image = image[0]["src"]
        else:
            image = None

        descriptions = soup.select("div[data-testid=mesh-container-content] div[data-testid=inline-content] div[data-testid=mesh-container-content] div[data-testid=richTextElement]")
        if descriptions:
            # Keep the longest text block, assumed to be the description.
            descriptions = [d.get_text() for d in descriptions]
            description = max(descriptions, key=len)
        else:
            description = None

        self.add_event_with_props(event_url, None, "Concert", start_day, location, description, tags, recurrences=None, uuid=event_url, url_human=url_human, start_time=start_time, end_day=end_day, end_time=end_time, published=published, image=image)
 | 
			
		||||
 
 | 
			
		||||
@@ -22,6 +22,8 @@ class Extractor(ABC):
 | 
			
		||||
                return start_day
 | 
			
		||||
            else:
 | 
			
		||||
                return start_day + timedelta(days=1)
 | 
			
		||||
        else:
 | 
			
		||||
            return start_day
 | 
			
		||||
 | 
			
		||||
    def guess_month(self, text):
 | 
			
		||||
        mths = ["jan", "fe", "mar", "av", "mai", "juin", "juill", "ao", "sep", "oct", "nov", "dec"]
 | 
			
		||||
@@ -33,14 +35,14 @@ class Extractor(ABC):
 | 
			
		||||
 | 
			
		||||
    def parse_french_date(self, text):
 | 
			
		||||
        # format NomJour Numero Mois Année
 | 
			
		||||
        m = re.search('[a-zA-Z:.]+[ ]*([0-9]+)[ ]*([a-zA-Z:.]+)[ ]*([0-9]+)', text)
 | 
			
		||||
        m = re.search('[a-zA-ZéÉûÛ:.]+[ ]*([0-9]+)[er]*[ ]*([a-zA-ZéÉûÛ:.]+)[ ]*([0-9]+)', text)
 | 
			
		||||
        if m:
 | 
			
		||||
            day = m.group(1)
 | 
			
		||||
            month = self.guess_month(m.group(2))
 | 
			
		||||
            year = m.group(3)
 | 
			
		||||
        else:
 | 
			
		||||
            # format Numero Mois Annee
 | 
			
		||||
            m = re.search('([0-9]+)[ ]*([a-zA-Z:.]+)[ ]*([0-9]+)', text)
 | 
			
		||||
            m = re.search('([0-9]+)[er]*[ ]*([a-zA-ZéÉûÛ:.]+)[ ]*([0-9]+)', text)
 | 
			
		||||
            if m:
 | 
			
		||||
                day = m.group(1)
 | 
			
		||||
                month = self.guess_month(m.group(2))
 | 
			
		||||
@@ -71,14 +73,14 @@ class Extractor(ABC):
 | 
			
		||||
            s = m.group(3)
 | 
			
		||||
        else:
 | 
			
		||||
            # format heures minutes
 | 
			
		||||
            m = re.search('([0-9]+)[ h:.]+([0-9]+)', text)
 | 
			
		||||
            m = re.search('([0-9]+)[ hH:.]+([0-9]+)', text)
 | 
			
		||||
            if m:
 | 
			
		||||
                h = m.group(1)
 | 
			
		||||
                m = m.group(2)
 | 
			
		||||
                s = "0"
 | 
			
		||||
            else:
 | 
			
		||||
                # format heures
 | 
			
		||||
                m = re.search('([0-9]+)[ h:.]', text)
 | 
			
		||||
                m = re.search('([0-9]+)[ Hh:.]', text)
 | 
			
		||||
                if m:
 | 
			
		||||
                    h = m.group(1)
 | 
			
		||||
                    m = "0"
 | 
			
		||||
 
 | 
			
		||||
@@ -58,7 +58,11 @@ class TwoStepsExtractor(Extractor):
 | 
			
		||||
        return url
 | 
			
		||||
 | 
			
		||||
    def add_event_url(self, url):
        """Register *url* as a pending event URL, skipping duplicates.

        Returns True when the URL was not seen before and has been added to
        self.event_urls, False when it was already registered.

        Bug fixed: the previous code appended the URL *before* the
        membership test, so the test was always true and the method always
        returned False (and the else-branch append was unreachable).
        """
        if url in self.event_urls:
            return False
        self.event_urls.append(url)
        return True
 | 
			
		||||
 | 
			
		||||
    def add_event_start_day(self, url, start_day):
 | 
			
		||||
        if not url in self.event_properties:
 | 
			
		||||
 
 | 
			
		||||
@@ -0,0 +1,18 @@
 | 
			
		||||
# Generated by Django 4.2.7 on 2024-04-20 13:51
 | 
			
		||||
 | 
			
		||||
from django.db import migrations, models
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class Migration(migrations.Migration):
    """Extend RecurrentImport.processor choices with 'lapucealoreille'.

    Auto-generated by Django (4.2.7). It only alters the field's choices
    and metadata — presumably no database schema change, since choices are
    enforced at the application level (confirm with Django docs for the
    backend in use).

    NOTE(review): the display label 'la puce à loreille' is missing an
    apostrophe ("l'oreille"); it is kept as-is here because it must match
    the model definition — fix both together in a follow-up migration.
    """

    dependencies = [
        ('agenda_culturel', '0053_alter_recurrentimport_processor'),
    ]

    operations = [
        migrations.AlterField(
            model_name='recurrentimport',
            name='processor',
            field=models.CharField(choices=[('ical', 'ical'), ('icalnobusy', 'ical no busy'), ('icalnovc', 'ical no VC'), ('lacoope', 'lacoope.org'), ('lacomedie', 'la comédie'), ('lefotomat', 'le fotomat'), ('lapucealoreille', 'la puce à loreille')], default='ical', max_length=20, verbose_name='Processor'),
        ),
    ]
 | 
			
		||||
@@ -760,6 +760,7 @@ class RecurrentImport(models.Model):
 | 
			
		||||
        LACOOPE = "lacoope", _('lacoope.org')
 | 
			
		||||
        LACOMEDIE = "lacomedie", _('la comédie')
 | 
			
		||||
        LEFOTOMAT = "lefotomat", _('le fotomat')
 | 
			
		||||
        LAPUCEALOREILLE = "lapucealoreille", _('la puce à l''oreille')
 | 
			
		||||
 | 
			
		||||
    class DOWNLOADER(models.TextChoices):
 | 
			
		||||
        SIMPLE = "simple", _("simple")
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user