feat: also capture "following" relationships

This commit is contained in:
setop 2024-12-20 22:06:17 +01:00
parent 0365ad42ef
commit d94f65092a
4 changed files with 68 additions and 27 deletions

View File

@ -5,4 +5,5 @@ scrapy runspider -o out.csv:csv fedcrawler_followers.py
dos2unix out.csv
uniq0 out.csv | sponge out.csv
bash <(awk -f pp.awk template.dot) < out.csv >| out.dot
awk -f to_dot.awk out.csv >| out.dot
neato -Tsvg -o out.svg out.dot

View File

@ -17,11 +17,12 @@ class Fedator(Spider):
"DEFAULT_REQUEST_HEADERS": {
"Accept": "application/activity+json",
},
"DOWNLOAD_TIMEOUT": 10,
}
MAX_DEPTH = 10
visited = set()
visited: set[str] = set()
def start_requests(self):
for line in open("instances.txt").readlines():
@ -29,29 +30,53 @@ class Fedator(Spider):
self.visited.add(host)
yield Request(
f"https://{host}/@relay/followers?page=1",
meta={"dst": host, "depth": 0, "page": 1},
meta={"rel": "followers", "host": host, "depth": 0, "page": 1},
)
yield Request(
f"https://{host}/@relay/following?page=1",
meta={"rel": "following", "host": host, "depth": 0, "page": 1},
)
def parse(self, response: Response):
    """Parse one page of a relay's followers collection.

    Yields one ``{"src", "dst"}`` item per follower edge (src follows dst),
    schedules the next page of the same collection, and schedules a crawl
    of each not-yet-visited follower host up to ``MAX_DEPTH``.
    """
    # Guard with .get(): pages without "orderedItems" would otherwise
    # raise (KeyError / TypeError) and abort the callback.
    followers = response.json().get("orderedItems")
    if not followers:
        return
    dst = response.request.meta["dst"]
    page = response.request.meta["page"]
    depth = response.request.meta["depth"]
    # BUG FIX: the URL previously reused the *current* page number while
    # meta carried page+1, so Scrapy's duplicate filter dropped the request
    # and pagination never advanced past page 1.
    yield Request(
        f"https://{dst}/@relay/followers?page={page + 1}",
        meta={"dst": dst, "depth": depth, "page": page + 1},
    )
    for follower in followers:
        # follower is an actor URL; element [2] of the split is the hostname
        host = follower.split("/")[2]
        yield {"src": host, "dst": dst}
        if host not in self.visited and depth < self.MAX_DEPTH:
            self.visited.add(host)
            yield Request(
                f"https://{host}/@relay/followers?page=1",
                meta={"dst": host, "depth": depth + 1, "page": 1},
            )
        else:
            logger.debug(f"already visited or maxdepth ({depth})")
def parse(self, response: Response):
    """Parse one page of a relay's followers or following collection.

    Yields one ``{"rel", "src", "dst"}`` item per edge (src follows dst),
    schedules the next page of the same collection, and schedules both
    collections of each not-yet-visited host up to ``MAX_DEPTH``.
    """
    items = response.json().get("orderedItems")
    # BUG FIX: .get() returns None when "orderedItems" is absent, and
    # len(None) raised TypeError; truthiness covers both None and [].
    if not items:
        return
    meta = response.request.meta
    rel = meta["rel"]
    host = meta["host"]
    page = meta["page"]
    depth = meta["depth"]
    # fetch next page of the same collection
    yield Request(
        f"https://{host}/@relay/{rel}?page={page + 1}",
        meta={"rel": rel, "host": host, "depth": depth, "page": page + 1},
    )
    # fetch children
    for item in items:
        # item is an actor URL; element [2] of the split is the hostname
        child = item.split("/")[2]
        # normalize edge direction to "src follows dst":
        # followers of host -> (child, host); following of host -> (host, child)
        if rel == "followers":
            yield {"rel": rel, "src": child, "dst": host}
        else:
            yield {"rel": rel, "src": host, "dst": child}
        if child not in self.visited and depth < self.MAX_DEPTH:
            self.visited.add(child)
            # crawl both directions from the newly discovered host
            for new_rel in ("followers", "following"):
                yield Request(
                    f"https://{child}/@relay/{new_rel}?page=1",
                    meta={
                        "rel": new_rel,
                        "host": child,
                        "depth": depth + 1,
                        "page": 1,
                    },
                )
        else:
            logger.debug(f"already visited or maxdepth ({depth})")

View File

@ -1,5 +1,5 @@
digraph fediverse {
graph [overlap=false]
graph [overlap=false outputorder="edgesfirst"]
#!
IFS=","
while read src dst
@ -12,4 +12,4 @@ do
fi
done
#!
}
}

15
to_dot.awk Normal file
View File

@ -0,0 +1,15 @@
# Convert the crawler's CSV (rel,src,dst) into a Graphviz digraph.
BEGIN {
    # Header: disable node overlap, draw edges beneath filled nodes.
    print "digraph fediverse {"
    print "graph [overlap=false outputorder=\"edgesfirst\"]"
    print "node [style=\"filled\"]"
    FS = ","
}
# Emit an edge only for "followers" rows — NOTE(review): presumably the
# "following" rows describe the same edges seen from the other side; confirm.
$1 == "followers" {
    print "\"" $2 "\" -> \"" $3 "\";"
}
END {
    print "}"
}