import logging

from scrapy.http import Request, Response
from scrapy.spiders import Spider

logger = logging.getLogger(__name__)


class Fedator(Spider):
    """Crawl the fediverse relay-follower graph.

    Starting from the hosts listed in ``instances.txt``, fetch each host's
    ``/@relay/followers`` collection (ActivityPub JSON), emit one
    ``{"src": follower_host, "dst": host}`` edge item per follower, and
    recurse into newly discovered hosts up to ``MAX_DEPTH`` hops from a seed.
    """

    name = "fedator"
    custom_settings = {
        "USER_AGENT": "Mozilla/5.0 (X11; Linux x86_64; rv:100.0) Gecko/20100101 Firefox/100.0",
        "ROBOTSTXT_OBEY": False,
        "REFERER_ENABLED": False,
        "COOKIES_ENABLED": False,
        "TELNETCONSOLE_ENABLED": False,
        "HTTPCACHE_ENABLED": True,
        "DEFAULT_REQUEST_HEADERS": {
            # Ask servers for ActivityPub JSON rather than HTML.
            "Accept": "application/activity+json",
        },
    }
    # Maximum recursion depth from any seed host.
    MAX_DEPTH = 10
    # Hosts already scheduled for crawling, to avoid revisiting.
    # NOTE: class-level mutable set — shared across instances; acceptable for
    # a single spider run, but not safe if several Fedator instances coexist.
    visited = set()

    def start_requests(self):
        """Seed the crawl from ``instances.txt`` (one hostname per line).

        Yields a request for page 1 of each seed host's relay-follower
        collection, with crawl bookkeeping (``dst``, ``depth``, ``page``)
        carried in request meta.
        """
        # Context manager so the seed file is closed promptly
        # (the original leaked the file handle).
        with open("instances.txt") as seeds:
            for line in seeds:
                # .strip() instead of line[:-1]: robust to a missing trailing
                # newline on the last line and to stray whitespace.
                host = line.strip()
                if not host:
                    continue  # tolerate blank lines
                self.visited.add(host)
                yield Request(
                    f"https://{host}/@relay/followers?page=1",
                    meta={"dst": host, "depth": 0, "page": 1},
                )

    def parse(self, response: Response):
        """Parse one page of a host's relay-follower collection.

        Yields one ``{"src": ..., "dst": ...}`` edge item per follower, a
        request for the next page of the same collection, and a depth-limited
        page-1 request for each newly discovered follower host.
        """
        # Follower entries look like "https://mobilizon.sans-nuage.fr/relay";
        # the host is the third "/"-separated component.
        followers = response.json()["orderedItems"]
        if not followers:
            return
        meta = response.request.meta
        dst = meta["dst"]
        page = meta["page"]
        depth = meta["depth"]
        # BUG FIX: the original re-requested the *current* page
        # (f"...page={page}" while storing page+1 in meta). That URL is
        # identical to the response just received, so Scrapy's duplicate
        # filter dropped it and pagination never advanced past page 1.
        next_page = page + 1
        yield Request(
            f"https://{dst}/@relay/followers?page={next_page}",
            meta={"dst": dst, "depth": depth, "page": next_page},
        )
        for follower in followers:
            host = follower.split("/")[2]
            yield {"src": host, "dst": dst}
            if host not in self.visited and depth < self.MAX_DEPTH:
                self.visited.add(host)
                yield Request(
                    f"https://{host}/@relay/followers?page=1",
                    meta={"dst": host, "depth": depth + 1, "page": 1},
                )
            else:
                # Lazy %-formatting: only built if DEBUG logging is enabled.
                logger.debug("already visited or maxdepth (%d)", depth)