parent
023a7fee8d
commit
2f146172da
44
experimentations/get_amisdutempsdescerises.py
Executable file
44
experimentations/get_amisdutempsdescerises.py
Executable file
@ -0,0 +1,44 @@
|
|||||||
|
#!/usr/bin/python3
# coding: utf-8

"""Standalone experimentation script: fetch upcoming events from
https://amisdutempsdescerises.org/ and dump them to a local JSON file."""

import os
import json
import sys

# Make the project root and its src/ directory importable, so the
# agenda_culturel import machinery can be loaded from this script.
_here = os.path.dirname(os.path.realpath(__file__))
_root = os.path.dirname(_here)
sys.path.append(_root)
sys.path.append(_root + "/src")

from src.agenda_culturel.import_tasks.downloader import *
from src.agenda_culturel.import_tasks.extractor import *
from src.agenda_culturel.import_tasks.importer import *
from src.agenda_culturel.import_tasks.custom_extractors import *


def _main():
    """Download the event feed, extract events, and save them as JSON."""
    u2e = URL2Events(SimpleDownloader(), amisdutempsdescerises.CExtractor())
    url = "https://amisdutempsdescerises.org/page.php"
    url_human = "https://amisdutempsdescerises.org/"

    try:
        events = u2e.process(
            url,
            url_human,
            cache="cache-amiscerices.xml",
            default_values={"category": "Rencontres & Débats"},
            published=True,
        )

        exportfile = "events-amiscerices.json"
        print("Saving events to file {}".format(exportfile))
        with open(exportfile, "w") as f:
            # default=str makes dates and other non-JSON types serializable
            json.dump(events, f, indent=4, default=str)
    except Exception as e:
        # Best-effort script: report the failure and exit normally.
        print("Exception: " + str(e))


if __name__ == "__main__":
    _main()
|
@ -158,6 +158,8 @@ def run_recurrent_import_internal(rimport, downloader, req_id):
|
|||||||
extractor = iguana_agenda.CExtractor()
|
extractor = iguana_agenda.CExtractor()
|
||||||
elif rimport.processor == RecurrentImport.PROCESSOR.MILLEFORMES:
|
elif rimport.processor == RecurrentImport.PROCESSOR.MILLEFORMES:
|
||||||
extractor = mille_formes.CExtractor()
|
extractor = mille_formes.CExtractor()
|
||||||
|
elif rimport.processor == RecurrentImport.PROCESSOR.AMISCERISES:
|
||||||
|
extractor = amisdutempsdescerises.CExtractor()
|
||||||
else:
|
else:
|
||||||
extractor = None
|
extractor = None
|
||||||
|
|
||||||
|
@ -0,0 +1,72 @@
|
|||||||
|
from ..extractor import *
|
||||||
|
import json
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from urllib.parse import urlparse, unquote
|
||||||
|
import pytz
|
||||||
|
import html
|
||||||
|
|
||||||
|
|
||||||
|
# A class dedicated to get events from les amis du temps des cerises
|
||||||
|
# Website https://amisdutempsdescerises.org/
|
||||||
|
# A class dedicated to getting events from "Les Amis du Temps des Cerises"
# Website: https://amisdutempsdescerises.org/
class CExtractor(Extractor):
    """Extractor for the JSON event feed of amisdutempsdescerises.org.

    The site answers a multipart/form-data POST (field ``p`` = ``futur``)
    with a JSON array of event objects; :meth:`extract` turns that array
    into the event structure expected by the import pipeline.
    """

    def __init__(self):
        super().__init__()
        # Hand-built multipart/form-data POST body requesting future
        # events ("p" = "futur"); the downloader sends it verbatim.
        self.data = b'------toto\r\nContent-Disposition: form-data; name="p"\r\n\r\nfutur\r\n------toto--\r\n'
        self.content_type = 'multipart/form-data; boundary=----toto'

    def extract(
        self, content, url, url_human=None, default_values=None, published=False
    ):
        """Parse *content* (a JSON array of event dicts) and register each event.

        :param content: raw JSON text returned by the site
        :param url: the URL the content was downloaded from
        :param url_human: unused here; a stable per-event URL is built instead
        :param default_values: default event fields forwarded to add_event
        :param published: whether the imported events are published
        :return: the structure built by ``self.get_structure()``
        """
        self.set_header(url)
        self.clear_events()

        root_url = "https://" + urlparse(url).netloc + "/"
        images_basename = root_url + "images/"
        from_timezone = pytz.utc
        to_timezone = pytz.timezone("Europe/Paris")

        for e in json.loads(content):
            tags = []
            # "ev_date" looks like "YYYY-MM-DD HH:MM:SS"; keep the date part.
            start_day = e["ev_date"].split(' ')[0]
            start_time = e["ev_time"]
            # Titles come HTML-escaped; decode them for display.
            title = html.unescape(e["ev_titre"])
            subtitle = e.get("ev_sstitre", '')
            if subtitle:
                title = title + ' - ' + html.unescape(subtitle)

            # Strip HTML markup, keeping only the text of the description.
            description = BeautifulSoup(e["ev_info"], "html.parser").text
            location = e.get("li_nom")
            if e.get("ev_canceled", '0') != '0':
                tags += ["annulé"]

            img_name = e.get("ev_img", '')
            image = images_basename + img_name if img_name else None

            # The site has no per-event page; build a stable pseudo-URL from
            # the event timestamp in milliseconds.
            # NOTE(review): naive feed timestamps are treated as UTC and then
            # converted to Europe/Paris — confirm the feed really serves UTC.
            naive_dt = datetime.strptime(e["ev_date"], "%Y-%m-%d %H:%M:%S")
            dt = to_timezone.normalize(from_timezone.localize(naive_dt))
            ts = int(datetime.timestamp(dt)) * 1000
            event_url = root_url + "#" + str(ts)

            self.add_event(
                default_values,
                title,
                None,
                start_day,
                location,
                description,
                tags,
                uuids=[event_url],
                recurrences=None,
                url_human=event_url,
                start_time=start_time,
                published=published,
                image=image,
            )

        return self.get_structure()
|
@ -17,13 +17,13 @@ class Downloader(ABC):
|
|||||||
def download(self, url, post=None):
|
def download(self, url, post=None):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def get_content(self, url, cache=None, referer=None, post=None):
|
def get_content(self, url, cache=None, referer=None, post=None, content_type=None, data=None):
|
||||||
if cache and os.path.exists(cache):
|
if cache and os.path.exists(cache):
|
||||||
print("Loading cache ({})".format(cache))
|
print("Loading cache ({})".format(cache))
|
||||||
with open(cache) as f:
|
with open(cache) as f:
|
||||||
content = "\n".join(f.readlines())
|
content = "\n".join(f.readlines())
|
||||||
else:
|
else:
|
||||||
content = self.download(url, referer=referer, post=post)
|
content = self.download(url, referer=referer, post=post, content_type=content_type, data=data)
|
||||||
|
|
||||||
if cache:
|
if cache:
|
||||||
print("Saving cache ({})".format(cache))
|
print("Saving cache ({})".format(cache))
|
||||||
@ -39,7 +39,7 @@ class SimpleDownloader(Downloader):
|
|||||||
def __init__(self):
|
def __init__(self):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
|
|
||||||
def download(self, url, referer=None, post=None):
|
def download(self, url, referer=None, post=None, content_type=None, data=None):
|
||||||
print("Downloading {} referer: {} post: {}".format(url, referer, post))
|
print("Downloading {} referer: {} post: {}".format(url, referer, post))
|
||||||
try:
|
try:
|
||||||
headers = {
|
headers = {
|
||||||
@ -47,7 +47,9 @@ class SimpleDownloader(Downloader):
|
|||||||
}
|
}
|
||||||
if referer is not None:
|
if referer is not None:
|
||||||
headers["Referer"] = referer
|
headers["Referer"] = referer
|
||||||
req = Request(url, headers=headers)
|
if content_type is not None:
|
||||||
|
headers["Content-Type"] = content_type
|
||||||
|
req = Request(url, headers=headers, data=data)
|
||||||
if post:
|
if post:
|
||||||
post_args = urlencode(post).encode("utf-8")
|
post_args = urlencode(post).encode("utf-8")
|
||||||
resource = urllib.request.urlopen(req, post_args)
|
resource = urllib.request.urlopen(req, post_args)
|
||||||
@ -109,9 +111,15 @@ class ChromiumHeadlessDownloader(Downloader):
|
|||||||
|
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def download(self, url, referer=None, post=None):
|
def download(self, url, referer=None, post=None, content_type=None, data=None):
|
||||||
if post:
|
if post:
|
||||||
raise Exception("POST method with Chromium headless not yet implemented")
|
raise Exception("POST method with Chromium headless not yet implemented")
|
||||||
|
if referer:
|
||||||
|
raise Exception("Referer parameter with Chromium headless not yet implemented")
|
||||||
|
if data:
|
||||||
|
raise Exception("Data content with Chromium headless not yet implemented")
|
||||||
|
if content_type:
|
||||||
|
raise Exception("Content-type parameter with Chromium headless not yet implemented")
|
||||||
print("Download {}".format(url))
|
print("Download {}".format(url))
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
@ -19,7 +19,11 @@ class Extractor(ABC):
|
|||||||
self.events = []
|
self.events = []
|
||||||
self.downloader = None
|
self.downloader = None
|
||||||
self.has_2nd_method = False
|
self.has_2nd_method = False
|
||||||
|
|
||||||
|
# parameters used by the downloader to get the content
|
||||||
self.referer = ""
|
self.referer = ""
|
||||||
|
self.data = None
|
||||||
|
self.content_type = None
|
||||||
|
|
||||||
def prepare_2nd_extract(self):
    """Hook invoked before a second extraction attempt; no-op by default."""
    pass
|
||||||
@ -169,7 +173,7 @@ class Extractor(ABC):
|
|||||||
|
|
||||||
@abstractmethod
def clean_url(url):
    """Return the canonical form of *url*.

    The base implementation is the identity; concrete extractors override
    it to canonicalize URLs for their own site.
    """
    return url
|
||||||
|
|
||||||
def is_known_url(url):
    """Tell whether this extractor recognizes *url*; the base class never does."""
    return False
|
||||||
|
@ -20,9 +20,13 @@ class URL2Events:
|
|||||||
first=True
|
first=True
|
||||||
):
|
):
|
||||||
referer = ""
|
referer = ""
|
||||||
|
data = None
|
||||||
|
content_type = None
|
||||||
if self.extractor:
|
if self.extractor:
|
||||||
referer = self.extractor.url_referer
|
referer = self.extractor.url_referer
|
||||||
content = self.downloader.get_content(url, cache, referer=referer)
|
data = self.extractor.data
|
||||||
|
content_type = self.extractor.content_type
|
||||||
|
content = self.downloader.get_content(url, cache, referer=referer, content_type=content_type, data=data)
|
||||||
|
|
||||||
if content is None:
|
if content is None:
|
||||||
return None
|
return None
|
||||||
|
@ -0,0 +1,18 @@
|
|||||||
|
# Generated by Django 4.2.9 on 2025-02-08 13:33
|
||||||
|
|
||||||
|
from django.db import migrations, models
|
||||||
|
|
||||||
|
|
||||||
|
class Migration(migrations.Migration):
    # Auto-generated schema migration (Django 4.2.9): extends the choice
    # list of RecurrentImport.processor with the new 'Amis cerises'
    # processor for https://amisdutempsdescerises.org/.

    dependencies = [
        ('agenda_culturel', '0145_revert_pause'),
    ]

    operations = [
        migrations.AlterField(
            model_name='recurrentimport',
            name='processor',
            field=models.CharField(choices=[('ical', 'ical'), ('icalnobusy', 'ical no busy'), ('icalnovc', 'ical no VC'), ('lacoope', 'lacoope.org'), ('lacomedie', 'la comédie'), ('lefotomat', 'le fotomat'), ('lapucealoreille', "la puce à l'oreille"), ('Plugin wordpress MEC', 'Plugin wordpress MEC'), ('Facebook events', "Événements d'une page FB"), ('Billetterie CF', 'Billetterie Clermont-Ferrand'), ('arachnee', 'Arachnée concert'), ('rio', 'Le Rio'), ('raymonde', 'La Raymonde'), ('apidae', 'Agenda apidae tourisme'), ('iguana', 'Agenda iguana (médiathèques)'), ('Mille formes', 'Mille formes'), ('Amis cerises', 'Les Amis du Temps des Cerises')], default='ical', max_length=20, verbose_name='Processor'),
        ),
    ]
|
@ -2106,6 +2106,7 @@ class RecurrentImport(models.Model):
|
|||||||
APIDAE = 'apidae', _('Agenda apidae tourisme')
|
APIDAE = 'apidae', _('Agenda apidae tourisme')
|
||||||
IGUANA = 'iguana', _('Agenda iguana (médiathèques)')
|
IGUANA = 'iguana', _('Agenda iguana (médiathèques)')
|
||||||
MILLEFORMES = 'Mille formes', _('Mille formes')
|
MILLEFORMES = 'Mille formes', _('Mille formes')
|
||||||
|
AMISCERISES = 'Amis cerises', _('Les Amis du Temps des Cerises')
|
||||||
|
|
||||||
class DOWNLOADER(models.TextChoices):
|
class DOWNLOADER(models.TextChoices):
|
||||||
SIMPLE = "simple", _("simple")
|
SIMPLE = "simple", _("simple")
|
||||||
|
Loading…
x
Reference in New Issue
Block a user