83 lines
2.7 KiB
Python
83 lines
2.7 KiB
Python
import logging
|
|
from scrapy.spiders import Spider
|
|
from scrapy.http import Request, Response
|
|
|
|
# Module-level logger named after this module; the spider below writes its
# debug messages through it.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class Fedator(Spider):
    """Crawl ``@relay`` follower/following collections, host by host.

    Seed hosts are read from ``instances.txt`` (one host per line).  For each
    host the spider requests ``https://{host}/@relay/followers`` and
    ``https://{host}/@relay/following`` page by page, asking for
    ``application/activity+json``, and reads the ``orderedItems`` list of each
    response (presumably ActivityPub ordered collections -- confirm against
    the target servers).

    Every item is turned into an edge dict ``{"rel", "src", "dst"}`` and every
    newly discovered host is crawled in turn, up to ``MAX_DEPTH`` hops away
    from a seed.
    """

    name = "fedator"

    custom_settings = {
        "USER_AGENT": "Mozilla/5.0 (X11; Linux x86_64; rv:100.0) Gecko/20100101 Firefox/100.0",
        "ROBOTSTXT_OBEY": False,
        "REFERER_ENABLED": False,
        "COOKIES_ENABLED": False,
        "TELNETCONSOLE_ENABLED": False,
        "HTTPCACHE_ENABLED": True,
        "DEFAULT_REQUEST_HEADERS": {
            "Accept": "application/activity+json",
        },
        "DOWNLOAD_TIMEOUT": 10,
    }

    # Hosts found at this depth still produce edges, but are not crawled.
    MAX_DEPTH = 10

    # Hosts already scheduled for crawling.  Defined on the class, so the set
    # is shared by every instance of the spider.
    visited: set[str] = set()

    @staticmethod
    def _relay_request(host, rel, depth, page=1):
        """Build the request for one page of ``host``'s ``rel`` collection.

        ``rel`` is ``"followers"`` or ``"following"``.  The same four values
        are stored in ``meta`` so that ``parse`` can continue from them.
        """
        return Request(
            f"https://{host}/@relay/{rel}?page={page}",
            meta={"rel": rel, "host": host, "depth": depth, "page": page},
        )

    def start_requests(self):
        """Yield the first followers/following requests for every seed host.

        Raises:
            OSError: if ``instances.txt`` cannot be opened.
        """
        # Read everything up front so the file is closed before the first
        # request is handed out (the generator may be consumed lazily).
        with open("instances.txt", encoding="utf-8") as seeds:
            hosts = [line.strip() for line in seeds]

        for host in hosts:
            if not host:
                # Blank line in the seed file.
                continue
            self.visited.add(host)
            yield self._relay_request(host, "followers", 0)
            yield self._relay_request(host, "following", 0)

    def parse(self, response: Response):
        """Handle one collection page.

        Yields, in order: the request for the next page of the same
        collection, then for each item an edge dict ``{"rel", "src", "dst"}``
        followed (for hosts not seen before and within ``MAX_DEPTH``) by the
        two first-page requests for that host.

        A page with no ``orderedItems`` ends the pagination of a collection.
        """
        try:
            payload = response.json()
        except ValueError:
            # Body is not valid JSON (e.g. an HTML error page).
            logger.warning("non-JSON response from %s", response.url)
            return

        items = payload.get("orderedItems") if isinstance(payload, dict) else None
        if not items:
            # Missing key, null or empty list.
            return

        meta = response.request.meta
        rel = meta["rel"]
        host = meta["host"]
        page = meta["page"]
        depth = meta["depth"]

        # fetch next page
        yield self._relay_request(host, rel, depth, page + 1)

        # fetch children
        for item in items:
            # Items are expected to look like "https://<host>/...", so the
            # host is the third "/"-separated component.
            parts = item.split("/") if isinstance(item, str) else []
            if len(parts) < 3 or not parts[2]:
                logger.debug("skipping malformed item %r", item)
                continue
            child = parts[2]

            # An edge always points from the follower to the followed host.
            if rel == "followers":
                yield {"rel": rel, "src": child, "dst": host}
            else:
                yield {"rel": rel, "src": host, "dst": child}

            if child not in self.visited and depth < self.MAX_DEPTH:
                self.visited.add(child)
                yield self._relay_request(child, "followers", depth + 1)
                yield self._relay_request(child, "following", depth + 1)
            else:
                logger.debug("already visited or maxdepth (%s)", depth)