54 lines
1.5 KiB
Python
54 lines
1.5 KiB
Python
# -*- coding: utf-8 -*-
|
|
import logging
|
|
|
|
from urllib.parse import urlparse
|
|
|
|
from scrapy.spiders import Spider
|
|
from scrapy import Request
|
|
|
|
|
|
# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
class Fedator(Spider):
    """Spider that walks ``/@relay/followers`` collections from host to host.

    Seed hosts are read from ``instances.txt`` (one host per line).  For each
    follower URL found in a host's paginated followers collection the spider
    emits a ``{"src": follower_host, "dst": crawled_host}`` item and, while the
    depth limit allows it, schedules the follower's own host for crawling.
    """

    name = 'fedator'

    custom_settings = {
        "USER_AGENT": 'Mozilla/5.0 (X11; Linux x86_64; rv:100.0) Gecko/20100101 Firefox/100.0',
        "ROBOTSTXT_OBEY": False,
        "REFERER_ENABLED": False,
        "COOKIES_ENABLED": False,
        "TELNETCONSOLE_ENABLED": False,
        "HTTPCACHE_ENABLED": True,
        "DEFAULT_REQUEST_HEADERS": {
            # Ask the server for the ActivityStreams JSON representation.
            'Accept': 'application/activity+json',
        },
    }

    # Followers found on a page crawled at this depth are still emitted as
    # items, but their hosts are no longer scheduled for crawling.
    MAX_DEPTH = 10

    # Hosts that have already been scheduled.  Defined on the class, so it is
    # shared by every instance of the spider.
    visited = set()

    def _followers_request(self, host, depth, page):
        """Build the request for one page of *host*'s relay followers."""
        return Request(
            f'https://{host}/@relay/followers?page={page}',
            meta={"dst": host, "depth": depth, "page": page},
        )

    def start_requests(self):
        """Yield the first followers page of every host in ``instances.txt``.

        Blank lines are skipped and surrounding whitespace is stripped, so a
        missing trailing newline no longer truncates the last host name.
        """
        # Read everything up front so the file is closed before the (lazy)
        # generator starts handing requests to the scheduler.
        with open("instances.txt", encoding="utf-8") as instances:
            hosts = [line.strip() for line in instances]

        for host in hosts:
            if not host:
                continue
            self.visited.add(host)
            yield self._followers_request(host, 0, 1)

    def parse(self, response):
        """Handle one followers page.

        Yields, in order: the request for the next page of the same host,
        then for each follower an edge item ``{"src": ..., "dst": ...}``
        optionally followed by a request for that follower's first page.
        An empty (or unusable) page ends pagination for the host.
        """
        try:
            payload = response.json()
        except ValueError:
            # Body was not valid JSON; there is nothing to extract from it.
            logger.debug("non-JSON followers response from %s", response.url)
            return

        # Entries look like "https://mobilizon.sans-nuage.fr/relay".
        followers = payload.get("orderedItems") if isinstance(payload, dict) else None
        if not followers:
            return

        meta = response.request.meta
        dst = meta["dst"]
        depth = meta["depth"]

        # Keep the current depth when paginating: resetting it to 0 would let
        # later pages of a deep host bypass MAX_DEPTH.
        yield self._followers_request(dst, depth, meta["page"] + 1)

        for follower in followers:
            host = urlparse(follower).netloc if isinstance(follower, str) else ""
            if not host:
                # Not a URL string with a host component; no edge to record.
                continue

            yield {"src": host, "dst": dst}

            if host not in self.visited and depth < self.MAX_DEPTH:
                self.visited.add(host)
                yield self._followers_request(host, depth + 1, 1)
            else:
                logger.debug("already visited or maxdepth (%d)", depth + 1)
|