import logging

from scrapy.http import Request, Response
from scrapy.spiders import Spider

logger = logging.getLogger(__name__)


class Fedator(Spider):
    """Crawl the fediverse by walking each instance's @relay actor,
    paging through its followers/following collections and yielding
    one {"rel", "src", "dst"} edge per relationship."""

    name = "fedator"
    custom_settings = {
        "USER_AGENT": "Mozilla/5.0 (X11; Linux x86_64; rv:100.0) Gecko/20100101 Firefox/100.0",
        "ROBOTSTXT_OBEY": False,
        "REFERER_ENABLED": False,
        "COOKIES_ENABLED": False,
        "TELNETCONSOLE_ENABLED": False,
        "HTTPCACHE_ENABLED": True,
        # Ask for ActivityPub JSON rather than HTML.
        "DEFAULT_REQUEST_HEADERS": {
            "Accept": "application/activity+json",
        },
        "DOWNLOAD_TIMEOUT": 10,
    }
    MAX_DEPTH = 10
    visited: set[str] = set()

    def start_requests(self):
        # Seed the crawl with one hostname per line from instances.txt.
        with open("instances.txt") as f:
            for line in f:
                host = line.strip()
                if not host:
                    continue
                self.visited.add(host)
                yield Request(
                    f"https://{host}/@relay/followers?page=1",
                    meta={"rel": "followers", "host": host, "depth": 0, "page": 1},
                )
                yield Request(
                    f"https://{host}/@relay/following?page=1",
                    meta={"rel": "following", "host": host, "depth": 0, "page": 1},
                )

    def parse(self, response: Response):
        # An empty or missing orderedItems array means we have paged
        # past the end of this collection.
        items = response.json().get("orderedItems", [])
        if not items:
            return
        rel = response.meta["rel"]
        host = response.meta["host"]
        page = response.meta["page"]
        depth = response.meta["depth"]
        # Fetch the next page of this collection.
        yield Request(
            f"https://{host}/@relay/{rel}?page={page + 1}",
            meta={"rel": rel, "host": host, "depth": depth, "page": page + 1},
        )
        # Emit one edge per item and recurse into unseen hosts.
        for item in items:
            # Items are actor URLs such as https://example.com/@relay;
            # the hostname is the third "/"-separated component.
            child = item.split("/")[2]
            d = {"rel": rel}
            if rel == "followers":
                yield d | {"src": child, "dst": host}
            else:
                yield d | {"src": host, "dst": child}
            if child not in self.visited and depth < self.MAX_DEPTH:
                self.visited.add(child)
                yield Request(
                    f"https://{child}/@relay/followers?page=1",
                    meta={
                        "rel": "followers",
                        "host": child,
                        "depth": depth + 1,
                        "page": 1,
                    },
                )
                yield Request(
                    f"https://{child}/@relay/following?page=1",
                    meta={
                        "rel": "following",
                        "host": child,
                        "depth": depth + 1,
                        "page": 1,
                    },
                )
            else:
                logger.debug(f"already visited or max depth ({depth}): {child}")
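

if __name__ == "__main__":
    # A minimal sketch for running the spider standalone, assuming this
    # file is executed directly rather than via the `scrapy` CLI. The
    # output filename edges.jsonl is an arbitrary choice; CrawlerProcess
    # and the FEEDS setting are standard Scrapy APIs.
    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess(
        settings={
            # Write each yielded edge dict as one JSON object per line.
            "FEEDS": {"edges.jsonl": {"format": "jsonlines"}},
        }
    )
    process.crawl(Fedator)
    process.start()  # blocks until the crawl finishes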