A bit of refactoring for the extractors

commit 794bed6b74
parent 4999b47833
@@ -26,10 +26,20 @@ logger = get_task_logger(__name__)
 
 class Extractor:
 
+    name = None
+
+    @abstractmethod
+    def is_known_url(url):
+        pass
+
     @abstractmethod
     def extract(url):
         pass
 
+    @abstractmethod
+    def clean_url(url):
+        pass
+
     def download(url):
         try:
             options = Options()
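
For orientation, here is a minimal sketch of what a concrete extractor looks like against this new interface. It is illustration only, not part of the commit: the class name ExtractorExample and the example.org domain are made up, and the sketch assumes the Extractor base class and its imports from this module. As elsewhere in the file, the methods take no self and are called directly on the class.

from urllib.parse import urlparse

# Hypothetical extractor, for illustration only.
class ExtractorExample(Extractor):

    name = "Example"

    def is_known_url(url):
        # Recognise URLs on a made-up example.org domain.
        return urlparse(url).netloc in ["example.org", "www.example.org"]

    def clean_url(url):
        # Keep only scheme://host/path, dropping query string and fragment.
        u = urlparse(url)
        return u.scheme + "://" + u.netloc + u.path

    def extract(url):
        # A real extractor would parse the downloaded page and build an Event.
        pass
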
@@ -69,6 +79,8 @@ class Extractor:
 
 class ExtractorFacebook(Extractor):
 
+    name = "Facebook"
+
     class SimpleFacebookEvent:
 
         def __init__(self, data):
@@ -194,8 +206,7 @@ class ExtractorFacebook(Extractor):
         image = self.get_element("image")
         local_image = None if image is None else Extractor.download_media(image)
 
-        u = urlparse(url)
-        unique_url = u.scheme + "://" + u.netloc + u.path
+
 
         return Event(title=self.get_element("name"),
                      status=Event.STATUS.DRAFT,
@@ -208,7 +219,20 @@ class ExtractorFacebook(Extractor):
                      local_image=local_image,
                      image=self.get_element("image"),
                      image_alt=self.get_element("image_alt"),
-                     reference_urls=[unique_url])
+                     reference_urls=[url])
 
+    def clean_url(url):
+
+        if ExtractorFacebook.is_known_url(url):
+            u = urlparse(url)
+            return u.scheme + "://" + u.netloc + u.path
+        else:
+            return url
+
+    def is_known_url(url):
+        u = urlparse(url)
+        return u.netloc in ["facebook.com", "www.facebook.com", "m.facebook.com"]
+
+
 
     def process_page(txt, url):
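
As an aside (not part of the diff), this is the effect the new clean_url/is_known_url pair is aiming for; the URLs below are made up:

# Facebook URL: the query string is stripped, keeping scheme://host/path.
ExtractorFacebook.clean_url("https://www.facebook.com/events/123456789/?ref=newsfeed")
# -> "https://www.facebook.com/events/123456789/"

# Non-Facebook URL: returned unchanged.
ExtractorFacebook.clean_url("https://example.org/agenda?page=2")
# -> "https://example.org/agenda?page=2"
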
@@ -230,6 +254,13 @@ class ExtractorFacebook(Extractor):
 
 class ExtractorAllURLs:
 
+    extractors = [ExtractorFacebook]
+
+    def clean_url(url):
+        result = url
+        for e in ExtractorAllURLs.extractors:
+            result = e.clean_url(result)
+        return result
 
     def extract(url):
         logger.info("Run extraction")
@@ -239,13 +270,12 @@ class ExtractorAllURLs:
             logger.info("Cannot download url")
             return None
 
-        result = ExtractorFacebook.process_page(txt, url)
+        for e in ExtractorAllURLs.extractors:
+            result = e.process_page(txt, url)
 
         if result is not None:
             return result
         else:
-            logger.info("Not a Facebook link")
+            logger.info("Not a " + e.name + " link")
 
-        # TODO: add here other extrators
-
         return None
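
Taken together, the refactoring turns the removed "add here other extractors" TODO into a registration list: a new extractor only has to subclass Extractor and be added to ExtractorAllURLs.extractors, and both clean_url and extract will pick it up. A sketch, reusing the hypothetical ExtractorExample from above:

# Hypothetical registration; only ExtractorFacebook is registered in this commit.
ExtractorAllURLs.extractors.append(ExtractorExample)

# Both entry points now loop over every registered extractor.
clean = ExtractorAllURLs.clean_url("https://www.facebook.com/events/123456789/?ref=newsfeed")
event = ExtractorAllURLs.extract(clean)   # an Event, or None if nothing could be extracted
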