feat: add capture following

setop 2024-12-20 22:06:17 +01:00
parent 0365ad42ef
commit d94f65092a
4 changed files with 68 additions and 27 deletions

@@ -5,4 +5,5 @@ scrapy runspider -o out.csv:csv fedcrawler_followers.py
 dos2unix out.csv
 uniq0 out.csv | sponge out.csv
 bash <(awk -f pp.awk template.dot) < out.csv >| out.dot
+awk -f to_dot.awk out.csv >| out.dot
 neato -Tsvg -o out.svg out.dot
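
For reference, the only uncommon tools in this pipeline are uniq0 and sponge (the latter from moreutils). A minimal Python stand-in for the dedupe step, assuming uniq0 simply drops duplicate lines while preserving their order, which sort | uniq would not:

# uniq0.py - hypothetical stand-in, assuming uniq0 deduplicates lines
# without reordering them
import sys

seen = set()
for line in sys.stdin:
    if line not in seen:  # keep only the first occurrence
        seen.add(line)
        sys.stdout.write(line)

Under that assumption, python3 uniq0.py < out.csv | sponge out.csv would do the same job.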

fedcrawler_followers.py

@@ -17,11 +17,12 @@ class Fedator(Spider):
         "DEFAULT_REQUEST_HEADERS": {
             "Accept": "application/activity+json",
         },
+        "DOWNLOAD_TIMEOUT": 10,
     }
     MAX_DEPTH = 10
-    visited = set()
+    visited: set[str] = set()
 
     def start_requests(self):
         for line in open("instances.txt").readlines():
@@ -29,29 +30,53 @@ class Fedator(Spider):
             self.visited.add(host)
             yield Request(
                 f"https://{host}/@relay/followers?page=1",
-                meta={"dst": host, "depth": 0, "page": 1},
+                meta={"rel": "followers", "host": host, "depth": 0, "page": 1},
+            )
+            yield Request(
+                f"https://{host}/@relay/following?page=1",
+                meta={"rel": "following", "host": host, "depth": 0, "page": 1},
             )
 
-    def parse(self, response:Response):
-        followers = response.json()[
-            "orderedItems"
-        ]  # "https://mobilizon.sans-nuage.fr/relay"
-        if len(followers) > 0:
-            dst = response.request.meta["dst"]
-            page = response.request.meta["page"]
-            depth = response.request.meta["depth"]
-            yield Request(
-                f"https://{dst}/@relay/followers?page={page}",
-                meta={"dst": dst, "depth": depth, "page": page + 1},
-            )
-            for follower in followers:
-                host = follower.split("/")[2]
-                yield {"src": host, "dst": dst}
-                if host not in self.visited and depth < self.MAX_DEPTH:
-                    self.visited.add(host)
-                    yield Request(
-                        f"https://{host}/@relay/followers?page=1",
-                        meta={"dst": host, "depth": depth + 1, "page": 1},
-                    )
-                else:
-                    logger.debug(f"already visited or maxdepth ({depth})")
+    def parse(self, response: Response):
+        items = response.json().get("orderedItems", [])
+        if len(items) < 1:
+            return
+        rel = response.request.meta["rel"]
+        host = response.request.meta["host"]
+        page = response.request.meta["page"]
+        depth = response.request.meta["depth"]
+        # fetch next page
+        yield Request(
+            f"https://{host}/@relay/{rel}?page={page + 1}",
+            meta={"rel": rel, "host": host, "depth": depth, "page": page + 1},
+        )
+        # fetch children
+        for item in items:
+            child = item.split("/")[2]
+            d = {"rel": rel}
+            if rel == "followers":
+                yield d | {"src": child, "dst": host}
+            else:
+                yield d | {"src": host, "dst": child}
+            if child not in self.visited and depth < self.MAX_DEPTH:
+                self.visited.add(child)
+                yield Request(
+                    f"https://{child}/@relay/followers?page=1",
+                    meta={
+                        "rel": "followers",
+                        "host": child,
+                        "depth": depth + 1,
+                        "page": 1,
+                    },
+                )
+                yield Request(
+                    f"https://{child}/@relay/following?page=1",
+                    meta={
+                        "rel": "following",
+                        "host": child,
+                        "depth": depth + 1,
+                        "page": 1,
+                    },
+                )
+            else:
+                logger.debug(f"already visited or maxdepth ({depth})")
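
The direction of each emitted edge flips with the relation: a followers item yields an edge from the discovered instance to the crawled host (the child follows the host), while a following item yields the reverse. A standalone sketch of that convention, with made-up hostnames; note the dict-merge operator | requires Python 3.9+:

# Sketch of the edge-direction convention used in parse();
# hostnames and actor URLs below are hypothetical examples.
def edge(rel: str, host: str, actor_url: str) -> dict:
    child = actor_url.split("/")[2]  # netloc of the actor URL
    d = {"rel": rel}
    if rel == "followers":
        return d | {"src": child, "dst": host}  # child follows host
    return d | {"src": host, "dst": child}      # host follows child

assert edge("followers", "a.example", "https://b.example/@relay") == {
    "rel": "followers", "src": "b.example", "dst": "a.example"
}
assert edge("following", "a.example", "https://b.example/@relay") == {
    "rel": "following", "src": "a.example", "dst": "b.example"
}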

template.dot

@@ -1,5 +1,5 @@
 digraph fediverse {
-graph [overlap=false]
+graph [overlap=false outputorder="edgesfirst"]
 #!
 IFS=","
 while read src dst
@@ -12,4 +12,4 @@ do
 fi
 done
 #!
 }
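
pp.awk itself is not part of this commit, and the old bash <(awk -f pp.awk template.dot) pipeline still works against this template. Judging by the #! markers, pp.awk presumably turns the template into a bash script in which the #!-delimited regions run as shell and every other line is echoed verbatim. A hypothetical Python sketch of that preprocessing, under exactly those assumptions:

# pp.py - hypothetical stand-in for pp.awk; assumes lines between #!
# markers are embedded shell and all other lines are literal dot text
import shlex
import sys

in_shell = False
for line in sys.stdin.read().splitlines():
    if line.strip() == "#!":
        in_shell = not in_shell           # toggle between dot text and shell
    elif in_shell:
        print(line)                       # shell region: pass through as-is
    else:
        print("echo", shlex.quote(line))  # dot text: wrap in echo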

to_dot.awk (new file)

@@ -0,0 +1,15 @@
+BEGIN {
+    print "digraph fediverse {"
+    print "graph [overlap=false outputorder=\"edgesfirst\"]"
+    print "node [style=\"filled\"]"
+    FS = ","
+}
+
+# rel src dst
+$1 == "followers" {
+    print "\"" $2 "\"", "->", "\"" $3 "\";"
+}
+
+END {
+    print "}"
+}
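
to_dot.awk keeps only followers rows; following rows are presumably dropped because, once both endpoints have been crawled, the same edge also appears as a followers row from the other side. A rough Python equivalent, assuming out.csv carries rel,src,dst columns as yielded by the spider (a CSV header row, if Scrapy writes one, fails the followers test and is skipped as well):

# to_dot.py - rough Python equivalent of to_dot.awk;
# assumes rel,src,dst columns in the input CSV
import csv
import sys

print("digraph fediverse {")
print('graph [overlap=false outputorder="edgesfirst"]')
print('node [style="filled"]')
for row in csv.reader(sys.stdin):
    if row and row[0] == "followers":
        rel, src, dst = row
        print(f'"{src}" -> "{dst}";')
print("}")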