Integration of beautifulsoup and selenium to retrieve the content of a Facebook event

diff --git a/Makefile b/Makefile
--- a/Makefile
+++ b/Makefile
@@ -53,6 +53,9 @@ migrate:
 build-dev:
 	DOCKER_BUILDKIT=1 COMPOSE_DOCKER_CLI_BUILD=1 docker-compose -f docker-compose.yml up --build -d
 
+build-dev-log:
+	DOCKER_BUILDKIT=1 COMPOSE_DOCKER_CLI_BUILD=1 docker-compose -f docker-compose.yml up --build
+
 build-prod:
 	DOCKER_BUILDKIT=1 COMPOSE_DOCKER_CLI_BUILD=1 docker-compose -f docker-compose.prod.yml up --build -d
 
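
The new build-dev-log target is build-dev without the -d flag, so docker-compose stays attached to the terminal and streams container logs instead of detaching; it is invoked with "make build-dev-log".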

diff --git a/Dockerfile b/Dockerfile
--- a/Dockerfile
+++ b/Dockerfile
@@ -5,7 +5,7 @@ WORKDIR /usr/src/app
 
 RUN --mount=type=cache,target=/var/cache/apt \
 	apt-get update && \
-    apt-get install --no-install-recommends -y build-essential libpq-dev gettext \
+    apt-get install --no-install-recommends -y build-essential libpq-dev gettext chromium-driver \
     && rm -rf /var/lib/apt/lists/*
 
 
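
Assuming a Debian-based image (the apt-get calls suggest one), the chromium-driver package pulls in Chromium and installs the driver binary at /usr/bin/chromedriver, which is the exact path hard-coded in the Service(...) call of the new extractors module below.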

diff --git a/src/agenda_culturel/celery.py b/src/agenda_culturel/celery.py
--- a/src/agenda_culturel/celery.py
+++ b/src/agenda_culturel/celery.py
@@ -4,6 +4,7 @@ from celery import Celery
 from celery.schedules import crontab
 from celery.utils.log import get_task_logger
 
+from .extractors import ExtractorAllURLs
 
 # Set the default Django settings module for the 'celery' program.
 APP_ENV = os.getenv("APP_ENV", "dev")
@@ -29,6 +30,7 @@ def create_event_from_submission(self, url):
     logger.info(f"{url=}")
     try:
         logger.info("About to create event from submission")
+        events = ExtractorAllURLs.extract(url)
         # TODO
     except BadHeaderError:
         logger.info("BadHeaderError")
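
For now the task only captures the result in events and leaves it unused, pending the TODO. The extraction path can be exercised by hand, e.g. from a Django shell. A minimal sketch, assuming the package is importable as agenda_culturel and using an invented event URL:

    from agenda_culturel.extractors import ExtractorAllURLs

    # Hypothetical Facebook event URL, for illustration only.
    url = "https://www.facebook.com/events/1234567890/"

    # Returns the placeholder string "TODO" when a Facebook event is
    # recognized in the downloaded page, and None otherwise.
    result = ExtractorAllURLs.extract(url)
    print(result)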

diff --git a/src/agenda_culturel/extractors.py b/src/agenda_culturel/extractors.py
new file mode 100644
--- /dev/null
+++ b/src/agenda_culturel/extractors.py
@@ -0,0 +1,102 @@
+from abc import ABC, abstractmethod
+#from .models import Event
+
+from selenium import webdriver
+from selenium.webdriver.chrome.service import Service
+from selenium.webdriver.chrome.options import Options
+
+from bs4 import BeautifulSoup
+
+import json
+
+
+from celery.utils.log import get_task_logger
+
+logger = get_task_logger(__name__)
+
+
+class Extractor(ABC):
+
+    @abstractmethod
+    def extract(url):
+        pass
+
+    def download(url):
+        try:
+            options = Options()
+            options.add_argument("--headless=new")
+            options.add_argument("--disable-dev-shm-usage")
+            options.add_argument("--no-sandbox")
+            service = Service("/usr/bin/chromedriver")
+
+            driver = webdriver.Chrome(service=service, options=options)
+            driver.get(url)
+            doc = driver.page_source
+            driver.quit()
+            return doc
+        except Exception as e:
+            logger.error(e)
+            return None
+
+class ExtractorFacebook(Extractor):
+
+    class FacebookEvent:
+
+        name = "event"
+        keys = ["start_time_formatted", "start_timestamp", "is_past", "name", "price_info", "cover_media_renderer", "event_creator", "id", "day_time_sentence", "event_place", "comet_neighboring_siblings"]
+
+        def __init__(self, event):
+            self.data = event
+
+        def __str__(self):
+            return self.data["name"]
+
+        def find_event_in_array(array):
+            if isinstance(array, dict):
+                if len(ExtractorFacebook.FacebookEvent.keys) == len([k for k in ExtractorFacebook.FacebookEvent.keys if k in array]):
+                    return ExtractorFacebook.FacebookEvent(array)
+                else:
+                    for k in array:
+                        v = ExtractorFacebook.FacebookEvent.find_event_in_array(array[k])
+                        if v is not None:
+                            return v
+            elif isinstance(array, list):
+                for e in array:
+                    v = ExtractorFacebook.FacebookEvent.find_event_in_array(e)
+                    if v is not None:
+                        return v
+            return None
+
+    def extract(url):
+        txt = Extractor.download(url)
+        if txt is None:
+            logger.error("Cannot download " + url)
+            return None
+        else:
+            soup = BeautifulSoup(txt, "html.parser")
+            for json_script in soup.find_all("script", type="application/json"):
+                json_txt = json_script.get_text()
+                json_struct = json.loads(json_txt)
+                fevent = ExtractorFacebook.FacebookEvent.find_event_in_array(json_struct)
+                if fevent is not None:
+                    logger.info(str(fevent))
+                    result = "TODO"
+                    return result
+
+        return None
+
+
+class ExtractorAllURLs:
+
+
+    def extract(url):
+        logger.info("Run extraction")
+
+        result = ExtractorFacebook.extract(url)
+
+        if result is None:
+            logger.info("Not a Facebook link")
+            # add other extractors here
+            pass
+
+        return result
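
find_event_in_array treats a dict as the event payload only when it contains every key listed in FacebookEvent.keys, and otherwise recurses through nested dicts and lists. A minimal sketch of that matching rule on an invented structure (not real Facebook data):

    from agenda_culturel.extractors import ExtractorFacebook

    # Invented wrapper structure standing in for the JSON found in the
    # page's <script type="application/json"> tags: the event dict is
    # buried under arbitrary nesting but carries all required keys.
    payload = {
        "wrapper": [
            {"node": {k: "x" for k in ExtractorFacebook.FacebookEvent.keys}}
        ]
    }

    fevent = ExtractorFacebook.FacebookEvent.find_event_in_array(payload)
    print(fevent)  # __str__ returns the "name" entry: "x"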

diff --git a/requirements.txt b/requirements.txt
--- a/requirements.txt
+++ b/requirements.txt
@@ -20,3 +20,5 @@ vine==5.0.0
 wcwidth==0.2.6
 redis==4.5.5
 whitenoise==6.4.0
+selenium==4.14.0
+BeautifulSoup4==4.12.2
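
Note that the PyPI distribution is beautifulsoup4 (pinned here as BeautifulSoup4, which pip resolves case-insensitively), while the import name used in extractors.py is bs4.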