fedigraph/fedcrawler_followers.py

58 lines
1.9 KiB
Python

import logging
from scrapy.spiders import Spider
from scrapy.http import Request, Response
logger = logging.getLogger(__name__)
class Fedator(Spider):
    """Crawl the relay-follower graph between federated instances.

    Seed hosts are read from ``instances.txt`` (one host per line). For each
    host the spider requests ``https://<host>/@relay/followers?page=N`` with
    an ``application/activity+json`` Accept header, yields one
    ``{"src": follower_host, "dst": host}`` item per entry in the response's
    ``orderedItems``, and schedules a crawl of every follower host not seen
    before, as long as the current depth is below ``MAX_DEPTH``.
    """

    name = "fedator"
    custom_settings = {
        "USER_AGENT": "Mozilla/5.0 (X11; Linux x86_64; rv:100.0) Gecko/20100101 Firefox/100.0",
        "ROBOTSTXT_OBEY": False,
        "REFERER_ENABLED": False,
        "COOKIES_ENABLED": False,
        "TELNETCONSOLE_ENABLED": False,
        "HTTPCACHE_ENABLED": True,
        "DEFAULT_REQUEST_HEADERS": {
            "Accept": "application/activity+json",
        },
    }
    # New hosts are only scheduled while the current depth is below this.
    MAX_DEPTH = 10
    # Hosts whose first followers page has already been scheduled.
    # NOTE: class-level attribute, so it is shared by every Fedator instance.
    visited = set()

    def start_requests(self):
        """Yield the first followers-page request for every seed host.

        Reads ``instances.txt`` from the current working directory; blank
        lines are ignored and surrounding whitespace is stripped.
        """
        # `with` guarantees the file is closed even if the generator is
        # abandoned early; iterating the handle avoids loading every line.
        with open("instances.txt", encoding="utf-8") as instances:
            for line in instances:
                # strip() instead of line[:-1]: the last line may have no
                # trailing newline, in which case [:-1] would eat a real char.
                host = line.strip()
                if not host:
                    continue
                self.visited.add(host)
                yield Request(
                    f"https://{host}/@relay/followers?page=1",
                    meta={"dst": host, "depth": 0, "page": 1},
                )

    def parse(self, response: Response):
        """Handle one followers page of the host stored in the request meta.

        Yields, in order: a request for the next page of the same host, then
        for each follower an edge item ``{"src": ..., "dst": ...}`` followed
        (for hosts not visited yet and while ``depth < MAX_DEPTH``) by a
        request for that follower's own first followers page.

        An empty or missing ``orderedItems`` ends pagination for the host.
        """
        followers = response.json().get(
            "orderedItems"
        ) or []  # e.g. "https://mobilizon.sans-nuage.fr/relay"
        if not followers:
            return

        meta = response.request.meta
        dst = meta["dst"]
        depth = meta["depth"]
        # Request the page AFTER the one just parsed. Re-requesting the same
        # page number yields a URL identical to the current one, which the
        # scheduler's duplicate filter drops, so pagination would stop here.
        next_page = meta["page"] + 1
        yield Request(
            f"https://{dst}/@relay/followers?page={next_page}",
            meta={"dst": dst, "depth": depth, "page": next_page},
        )

        for follower in followers:
            # Expect an absolute URL ("scheme://host/..."); the host is the
            # third "/"-separated component. Skip anything else rather than
            # letting one bad entry abort the rest of the page.
            parts = follower.split("/") if isinstance(follower, str) else []
            if len(parts) < 3 or not parts[2]:
                logger.debug("skipping malformed follower entry: %r", follower)
                continue
            host = parts[2]
            yield {"src": host, "dst": dst}
            if host not in self.visited and depth < self.MAX_DEPTH:
                self.visited.add(host)
                yield Request(
                    f"https://{host}/@relay/followers?page=1",
                    meta={"dst": host, "depth": depth + 1, "page": 1},
                )
            else:
                logger.debug("already visited or maxdepth (%s)", depth)