38
									
								
								experimentations/get_associations_cf.py
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										38
									
								
								experimentations/get_associations_cf.py
									
									
									
									
									
										Executable file
									
								
							| @@ -0,0 +1,38 @@ | ||||
| #!/usr/bin/python3 | ||||
| # coding: utf-8 | ||||
|  | ||||
| import json | ||||
| import os | ||||
| import sys | ||||
|  | ||||
| # Resolve the directory that contains this script | ||||
| # (symbolic links are followed by realpath). | ||||
| current = os.path.dirname(os.path.realpath(__file__)) | ||||
|  | ||||
| # Resolve the parent directory of the directory | ||||
| # that contains this script. | ||||
| parent = os.path.dirname(current) | ||||
|  | ||||
| # Append the parent directory and its "src" subdirectory to sys.path | ||||
| # so that the "src.agenda_culturel" imports below can be resolved. | ||||
| sys.path.append(parent) | ||||
| sys.path.append(parent + "/src") | ||||
|  | ||||
| from src.agenda_culturel.import_tasks.downloader import ( | ||||
|     ChromiumHeadlessDownloader, | ||||
| ) | ||||
| from src.agenda_culturel.import_tasks.custom_extractors.associations_cf import ( | ||||
|     CExtractor, | ||||
| ) | ||||
| from src.agenda_culturel.import_tasks.importer import URL2Events | ||||
|  | ||||
| if __name__ == "__main__": | ||||
|     u2e = URL2Events(ChromiumHeadlessDownloader(), CExtractor()) | ||||
|     url = "https://associations.clermont-ferrand.fr/evenement/week-end-multi-culturel" | ||||
|  | ||||
|     events = u2e.process(url, cache="asso_cf.html", published=True) | ||||
|  | ||||
|     exportfile = "event-asso_cf.json" | ||||
|     print("Saving events to file {}".format(exportfile)) | ||||
|     with open(exportfile, "w") as f: | ||||
|         json.dump(events, f, indent=4, default=str) | ||||
| @@ -0,0 +1,72 @@ | ||||
| import logging | ||||
| import re | ||||
| from urllib.parse import urlparse | ||||
|  | ||||
| from bs4 import BeautifulSoup | ||||
|  | ||||
| from ..extractor import Extractor | ||||
|  | ||||
| logger = logging.getLogger(__name__) | ||||
|  | ||||
|  | ||||
| class CExtractor(Extractor): | ||||
|     def __init__(self): | ||||
|         super().__init__() | ||||
|  | ||||
|     def extract( | ||||
|         self, | ||||
|         content, | ||||
|         url, | ||||
|         url_human=None, | ||||
|         default_values=None, | ||||
|         published=False, | ||||
|     ): | ||||
|         self.set_header(url) | ||||
|         u = urlparse(url) | ||||
|         if u.netloc == "associations.clermont-ferrand.fr": | ||||
|             soup = BeautifulSoup(content, "html.parser") | ||||
|  | ||||
|             title = soup.select_one("h1").text.strip() | ||||
|             img = soup.select_one("img.image-style-event") | ||||
|             image = None | ||||
|             image_alt = None | ||||
|             if img: | ||||
|                 image = img["src"] | ||||
|                 if not image.startswith("http"): | ||||
|                     image = "https://" + u.netloc + image | ||||
|                 image_alt = img["alt"] | ||||
|             description = soup.select_one("div.field--name-body").get_text( | ||||
|                 separator="\n" | ||||
|             ) | ||||
|             location = soup.select_one("div.c-location__holder .c-desc").text | ||||
|  | ||||
|             start_day = soup.select_one("div.o-date") | ||||
|             if start_day is not None: | ||||
|                 start_day = Extractor.parse_french_date( | ||||
|                     re.sub("[ ]*\n[ ]*", " ", start_day.get_text(separator=" ")) | ||||
|                 ) | ||||
|  | ||||
|             start_time = soup.select_one("div.c-hours__holder .c-desc") | ||||
|             if start_time is not None: | ||||
|                 start_time = Extractor.parse_french_time(start_time.text) | ||||
|  | ||||
|             category = None | ||||
|             tags = [] | ||||
|             uuids = [url] | ||||
|  | ||||
|             self.add_event( | ||||
|                 default_values, | ||||
|                 title, | ||||
|                 category, | ||||
|                 start_day, | ||||
|                 location, | ||||
|                 description, | ||||
|                 tags, | ||||
|                 uuids, | ||||
|                 url_human=url_human, | ||||
|                 start_time=start_time, | ||||
|                 image=image, | ||||
|                 image_alt=image_alt, | ||||
|             ) | ||||
|  | ||||
|         return self.get_structure() | ||||
| @@ -315,11 +315,13 @@ class Extractor(ABC): | ||||
|             CExtractor as GoogleCalendarLinkEventExtractor, | ||||
|         ) | ||||
|         from .generic_extractors.ical import ICALExtractor | ||||
|         from .custom_extractors.associations_cf import CExtractor as AssociationsCF | ||||
|  | ||||
|         if single_event: | ||||
|             return [ | ||||
|                 FacebookEventExtractor(), | ||||
|                 GoogleCalendarLinkEventExtractor(), | ||||
|                 AssociationsCF(), | ||||
|                 EventNotFoundExtractor(), | ||||
|             ] | ||||
|         else: | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Jean-Marie Favreau
					Jean-Marie Favreau