Ajout d'un extracteur pour Arachnée Concerts
This commit is contained in:
		
							
								
								
									
										40
									
								
								experimentations/get_arachnee_events.py
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										40
									
								
								experimentations/get_arachnee_events.py
									
									
									
									
									
										Executable file
									
								
							@@ -0,0 +1,40 @@
 | 
			
		||||
#!/usr/bin/python3
# coding: utf-8

"""Fetch events from Arachnée Concerts (Clermont-Fd) and dump them to JSON.

Standalone experimentation script: it adds the project root to sys.path,
scrapes the Arachnée agenda with a headless Chromium downloader, and
writes the extracted events to events-arachnee.json.
"""

import os
import json
import sys

# Directory where this file is located.
current = os.path.dirname(os.path.realpath(__file__))

# Parent directory (project root), added to sys.path so the project's
# import_tasks modules can be imported from this experimentation script.
parent = os.path.dirname(current)
sys.path.append(parent)

from src.agenda_culturel.import_tasks.downloader import *
from src.agenda_culturel.import_tasks.extractor import *
from src.agenda_culturel.import_tasks.importer import *
from src.agenda_culturel.import_tasks.custom_extractors import *


if __name__ == "__main__":

    u2e = URL2Events(ChromiumHeadlessDownloader(), arachnee.CExtractor())
    # NOTE(review): the scraped source showed "¤t_page=" — the literal
    # "&curren" of "&current_page=" was mangled by HTML-entity decoding;
    # restored here to the intended query parameter.
    url = "https://www.arachnee-concerts.com/wp-admin/admin-ajax.php?action=movies-filter&per_page=9999&date=NaN.NaN.NaN&theatres=Clermont-Fd&cat=&sorting=&list_all_events=&current_page="
    url_human = "https://www.arachnee-concerts.com/agenda-des-concerts/Clermont-Fd/"

    try:
        events = u2e.process(
            url,
            url_human,
            cache="cache-arachnee.html",
            default_values={},
            published=True,
        )

        exportfile = "events-arachnee.json"
        print("Saving events to file {}".format(exportfile))
        with open(exportfile, "w") as f:
            json.dump(events, f, indent=4, default=str)
    except Exception as e:
        # Best-effort script: report the failure instead of crashing.
        print("Exception: " + str(e))
 | 
			
		||||
@@ -145,6 +145,8 @@ def run_recurrent_import_internal(rimport, downloader, req_id):
 | 
			
		||||
        extractor = fbevents.CExtractor()
 | 
			
		||||
    elif rimport.processor == RecurrentImport.PROCESSOR.C3C:
 | 
			
		||||
        extractor = c3c.CExtractor()
 | 
			
		||||
    elif rimport.processor == RecurrentImport.PROCESSOR.ARACHNEE:
 | 
			
		||||
        extractor = arachnee.CExtractor()
 | 
			
		||||
    else:
 | 
			
		||||
        extractor = None
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										109
									
								
								src/agenda_culturel/import_tasks/custom_extractors/arachnee.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										109
									
								
								src/agenda_culturel/import_tasks/custom_extractors/arachnee.py
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,109 @@
 | 
			
		||||
from ..generic_extractors import *
 | 
			
		||||
from bs4 import BeautifulSoup
 | 
			
		||||
 | 
			
		||||
# A class dedicated to get events from Arachnée Concerts
# URL: https://www.arachnee-concerts.com/agenda-des-concerts/
class CExtractor(TwoStepsExtractorNoPause):
    """Two-steps extractor for Arachnée Concerts.

    First step collects event URLs (and their announced dates) from the
    agenda listing; second step extracts each event's details from its
    own page.
    """

    # Mapping from the website's category labels to a pair
    # (agenda category, extra tags). Unknown labels fall back to ("", []).
    CATEGORY_MAPPING = {
        "Grand Spectacle": ("Danse", []),
        "Théâtre": ("Théâtre", []),
        "Chanson française": ("Concert", []),
        "Musique du monde": ("Concert", []),
        "Pop / Rock": ("Concert", []),
        "Rap, RnB": ("Concert", []),
        "Raggae": ("Concert", []),  # sic: spelling as used by the website
        "Variété": ("Concert", []),
        "Comédie Musicale": ("Art du spectacle", []),
        "Humour / One Man Show": ("Art du spectacle", []),
        "Spectacle équestre": ("Art du spectacle", []),
        "Spectacle pour enfant": (None, ["jeune public"]),
    }

    def __init__(self):
        super().__init__()
        # Maps an event URL to the list of "date time" strings seen for
        # it on the agenda listing; used to filter the dates offered on
        # the event's own page.
        self.possible_dates = {}
        # Theater name taken from the "theatres" query parameter of the
        # source URL; None means "keep every theater".
        self.theater = None

    def extract(
        self,
        content,
        url,
        url_human=None,
        default_values=None,
        published=False,
        only_future=True,
        ignore_404=True
    ):
        """Extract events, remembering the theater filter from the URL."""
        match = re.match(r".*\&theatres=([^&]*)&.*", url)
        if match:
            self.theater = match[1]

        return super().extract(content, url, url_human, default_values, published, only_future, ignore_404)

    def build_event_url_list(self, content, infuture_days=180):
        """Collect event page URLs and their dates from the agenda listing.

        Only events at the selected theater happening within the next
        *infuture_days* days are kept.
        """
        soup = BeautifulSoup(content, "html.parser")

        containers = soup.select("ul.event_container>li")
        if containers:
            for c in containers:
                d = Extractor.parse_french_date(c.select_one(".date").text)
                l = c.select_one(".event_auditory").text
                if (self.theater is None or l.startswith(self.theater)) and d < datetime.date.today() + timedelta(days=infuture_days):
                    t = Extractor.parse_french_time(c.select_one(".time").text)
                    e_url = c.select_one(".info a")["href"]
                    # Record this date as a valid occurrence for the event.
                    self.possible_dates.setdefault(e_url, []).append(str(d) + " " + str(t))
                    self.add_event_url(e_url)

    def add_event_from_content(
        self,
        event_content,
        event_url,
        url_human=None,
        default_values=None,
        published=False,
    ):
        """Parse one event page and register one event per retained date."""
        soup = BeautifulSoup(event_content, "html.parser")
        # Title is the page title plus the artist subtitle when present.
        title = ", ".join([x.text for x in [soup.select_one(y) for y in [".page_title", ".artiste-subtitle"]] if x])

        image = soup.select_one(".entry-image .image_wrapper img")
        if image is not None:
            image = image["src"]

        descs = soup.select(".entry-content p")
        if descs:
            description = "\n".join([d.text for d in descs])
        else:
            description = None

        raw_category = soup.select_one(".event_category").text
        category, tags = self.CATEGORY_MAPPING.get(raw_category, ("", []))
        # Copy so the class-level mapping is never mutated downstream.
        tags = list(tags)

        dates = soup.select("#event_ticket_content>ul>li")
        for d in dates:
            dt = datetime.datetime.fromisoformat(d.select_one(".date")["content"])
            date = dt.date()
            time = dt.time()
            # Keep only the occurrences that were listed on the agenda page.
            if str(date) + " " + str(time) in self.possible_dates.get(event_url, []):
                location = d.select_one(".event_auditory").text

                self.add_event_with_props(
                    default_values,
                    event_url,
                    title,
                    category,
                    date,
                    location,
                    description,
                    tags,
                    recurrences=None,
                    uuids=[event_url + "?d=" + str(date) + "&t=" + str(time)],
                    url_human=url_human,
                    start_time=time,
                    end_day=None,
                    end_time=None,
                    published=published,
                    image=image,
                )
 | 
			
		||||
@@ -250,3 +250,23 @@ class TwoStepsExtractor(Extractor):
 | 
			
		||||
                )
 | 
			
		||||
 | 
			
		||||
        return self.get_structure()
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class TwoStepsExtractorNoPause(TwoStepsExtractor):
    """A TwoStepsExtractor that disables the downloader's pause between
    requests for the duration of the extraction, then restores it."""

    def extract(
        self,
        content,
        url,
        url_human=None,
        default_values=None,
        published=False,
        only_future=True,
        ignore_404=True
    ):
        # Remember the downloader's pause setting and restore it in a
        # finally block, so it is not left disabled if extraction raises.
        pause = self.downloader.pause
        self.downloader.pause = False
        try:
            return super().extract(
                content,
                url,
                url_human,
                default_values,
                published,
                only_future,
                ignore_404,
            )
        finally:
            self.downloader.pause = pause
 | 
			
		||||
@@ -0,0 +1,18 @@
 | 
			
		||||
# Generated by Django 4.2.9 on 2024-10-19 13:24

from django.db import migrations, models


class Migration(migrations.Migration):
    # Adds the new 'arachnee' ("Arachnée concert") choice to the
    # RecurrentImport.processor field; no schema change beyond the
    # updated choices list.

    dependencies = [
        ('agenda_culturel', '0089_alter_recurrentimport_defaultcategory'),
    ]

    operations = [
        migrations.AlterField(
            model_name='recurrentimport',
            name='processor',
            field=models.CharField(choices=[('ical', 'ical'), ('icalnobusy', 'ical no busy'), ('icalnovc', 'ical no VC'), ('lacoope', 'lacoope.org'), ('lacomedie', 'la comédie'), ('lefotomat', 'le fotomat'), ('lapucealoreille', "la puce à l'oreille"), ('Plugin wordpress MEC', 'Plugin wordpress MEC'), ('Facebook events', "Événements d'une page FB"), ('cour3coquins', 'la cour des 3 coquins'), ('arachnee', 'Arachnée concert')], default='ical', max_length=20, verbose_name='Processor'),
        ),
    ]
 | 
			
		||||
@@ -1307,6 +1307,7 @@ class RecurrentImport(models.Model):
 | 
			
		||||
        MECWORDPRESS = "Plugin wordpress MEC", _("Plugin wordpress MEC")
 | 
			
		||||
        FBEVENTS = "Facebook events", _("Événements d'une page FB")
 | 
			
		||||
        C3C = "cour3coquins", _("la cour des 3 coquins")
 | 
			
		||||
        ARACHNEE = "arachnee", _("Arachnée concert")
 | 
			
		||||
 | 
			
		||||
    class DOWNLOADER(models.TextChoices):
 | 
			
		||||
        SIMPLE = "simple", _("simple")
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user