diff --git a/src/agenda_culturel/extractors.py b/src/agenda_culturel/extractors.py
index 4a02380..b4438a5 100644
--- a/src/agenda_culturel/extractors.py
+++ b/src/agenda_culturel/extractors.py
@@ -26,10 +26,20 @@ logger = get_task_logger(__name__)
 
 class Extractor:
 
+    name = None
+
+    @abstractmethod
+    def is_known_url(url):
+        pass
+
     @abstractmethod
     def extract(url):
         pass
 
+    @abstractmethod
+    def clean_url(url):
+        pass
+
     def download(url):
         try:
             options = Options()
@@ -69,6 +79,8 @@ class Extractor:
 
 class ExtractorFacebook(Extractor):
 
+    name = "Facebook"
+
     class SimpleFacebookEvent:
 
         def __init__(self, data):
@@ -194,8 +206,7 @@ class ExtractorFacebook(Extractor):
 
             image = self.get_element("image")
             local_image = None if image is None else Extractor.download_media(image)
-            u = urlparse(url)
-            unique_url = u.scheme + "://" + u.netloc + u.path
+
 
             return Event(title=self.get_element("name"),
                          status=Event.STATUS.DRAFT,
@@ -208,7 +219,20 @@ class ExtractorFacebook(Extractor):
                          local_image=local_image,
                          image=self.get_element("image"),
                          image_alt=self.get_element("image_alt"),
-                         reference_urls=[unique_url])
+                         reference_urls=[url])
+
+
+    def clean_url(url):
+
+        if ExtractorFacebook.is_known_url(url):
+            u = urlparse(url)
+            return u.scheme + "://" + u.netloc + u.path
+        else:
+            return url
+
+    def is_known_url(url):
+        u = urlparse(url)
+        return u.netloc in ["facebook.com", "www.facebook.com", "m.facebook.com"]
 
 
     def process_page(txt, url):
@@ -230,6 +254,13 @@ class ExtractorFacebook(Extractor):
 
 
 class ExtractorAllURLs:
+    extractors = [ExtractorFacebook]
+
+    def clean_url(url):
+        result = url
+        for e in ExtractorAllURLs.extractors:
+            result = e.clean_url(result)
+        return result
 
     def extract(url):
         logger.info("Run extraction")
@@ -239,13 +270,12 @@ class ExtractorAllURLs:
             logger.info("Cannot download url")
             return None
 
-        result = ExtractorFacebook.process_page(txt, url)
+        for e in ExtractorAllURLs.extractors:
+            result = e.process_page(txt, url)
 
-        if result is not None:
-            return result
-        else: 
-            logger.info("Not a Facebook link")
-
-        # TODO: add here other extrators
+            if result is not None:
+                return result
+            else:
+                logger.info("Not a " + e.name + " link")
 
         return None