# -*- coding: utf-8 -*-
import logging
from urllib.parse import urlparse

from scrapy import Request
from scrapy.spiders import Spider

logger = logging.getLogger(__name__)


class Fedator(Spider):
    """Crawl each instance's relay followers collection and emit src/dst follow edges."""

    name = 'fedator'
    custom_settings = {
        "USER_AGENT": 'Mozilla/5.0 (X11; Linux x86_64; rv:100.0) Gecko/20100101 Firefox/100.0',
        "ROBOTSTXT_OBEY": False,
        "REFERER_ENABLED": False,
        "COOKIES_ENABLED": False,
        "TELNETCONSOLE_ENABLED": False,
        "HTTPCACHE_ENABLED": True,
        # Ask for ActivityPub JSON instead of HTML.
        "DEFAULT_REQUEST_HEADERS": {
            'Accept': 'application/activity+json',
        },
    }

    MAX_DEPTH = 10
    visited = set()

    def start_requests(self):
        # Seed the crawl with the hosts listed in instances.txt, one host per line.
        with open("instances.txt") as f:
            for line in f:
                host = line.strip()
                if not host:
                    continue
                self.visited.add(host)
                yield Request(f'https://{host}/@relay/followers?page=1',
                              meta={"dst": host, "depth": 0, "page": 1})

    def parse(self, response):
        # Each item is a relay actor URL, e.g. "https://mobilizon.sans-nuage.fr/relay".
        followers = response.json()["orderedItems"]
        if len(followers) > 0:
            dst = response.request.meta["dst"]
            depth = response.request.meta["depth"]
            # Fetch the next page of this instance's followers collection,
            # keeping the current depth so MAX_DEPTH still applies.
            page = response.request.meta["page"] + 1
            yield Request(f'https://{dst}/@relay/followers?page={page}',
                          meta={"dst": dst, "depth": depth, "page": page})
            for follower in followers:
                host = urlparse(follower).netloc
                # Emit one edge: `host` follows `dst`.
                yield {"src": host, "dst": dst}
                if host not in self.visited and depth < self.MAX_DEPTH:
                    self.visited.add(host)
                    yield Request(f'https://{host}/@relay/followers?page=1',
                                  meta={"dst": host, "depth": depth + 1, "page": 1})
                else:
                    logger.debug(f"already visited or max depth reached ({depth + 1})")
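

# Usage sketch: the spider can be run standalone with Scrapy's runspider command.
# The file name fedator.py and the output file edges.json are assumptions for the
# example; instances.txt must exist in the working directory, as read above.
#
#   scrapy runspider fedator.py -o edges.json
#
# Each yielded {"src": ..., "dst": ...} item is written to the output feed, and
# HTTPCACHE_ENABLED keeps responses cached locally so repeated runs avoid re-fetching.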