From d94f65092a3bedc8c198dfbe68243ffe43d548bc Mon Sep 17 00:00:00 2001
From: setop
Date: Fri, 20 Dec 2024 22:06:17 +0100
Subject: [PATCH] feat: add capture following

---
 cmd.txt                 |  1 +
 fedcrawler_followers.py | 75 +++++++++++++++++++++++++++--------------
 template.dot            |  4 +--
 to_dot.awk              | 15 +++++++++
 4 files changed, 68 insertions(+), 27 deletions(-)
 create mode 100644 to_dot.awk

diff --git a/cmd.txt b/cmd.txt
index b25bf3e..a1af341 100644
--- a/cmd.txt
+++ b/cmd.txt
@@ -5,4 +5,5 @@ scrapy runspider -o out.csv:csv fedcrawler_followers.py
 dos2unix out.csv
 uniq0 out.csv | sponge out.csv
 bash <(awk -f pp.awk template.dot) < out.csv >| out.dot
+awk -f to_dot.awk out.csv >| out.dot
 neato -Tsvg -o out.svg out.dot
diff --git a/fedcrawler_followers.py b/fedcrawler_followers.py
index 16cf287..cb6004b 100644
--- a/fedcrawler_followers.py
+++ b/fedcrawler_followers.py
@@ -17,11 +17,12 @@ class Fedator(Spider):
         "DEFAULT_REQUEST_HEADERS": {
             "Accept": "application/activity+json",
         },
+        "DOWNLOAD_TIMEOUT": 10,
     }

     MAX_DEPTH = 10

-    visited = set()
+    visited: set[str] = set()

     def start_requests(self):
         for line in open("instances.txt").readlines():
@@ -29,29 +30,53 @@ class Fedator(Spider):
             self.visited.add(host)
             yield Request(
                 f"https://{host}/@relay/followers?page=1",
-                meta={"dst": host, "depth": 0, "page": 1},
+                meta={"rel": "followers", "host": host, "depth": 0, "page": 1},
+            )
+            yield Request(
+                f"https://{host}/@relay/following?page=1",
+                meta={"rel": "following", "host": host, "depth": 0, "page": 1},
             )

-    def parse(self, response:Response):
-        followers = response.json()[
-            "orderedItems"
-        ]  # "https://mobilizon.sans-nuage.fr/relay"
-        if len(followers) > 0:
-            dst = response.request.meta["dst"]
-            page = response.request.meta["page"]
-            depth = response.request.meta["depth"]
-            yield Request(
-                f"https://{dst}/@relay/followers?page={page}",
-                meta={"dst": dst, "depth": depth, "page": page + 1},
-            )
-            for follower in followers:
-                host = follower.split("/")[2]
-                yield {"src": host, "dst": dst}
-                if host not in self.visited and depth < self.MAX_DEPTH:
-                    self.visited.add(host)
-                    yield Request(
-                        f"https://{host}/@relay/followers?page=1",
-                        meta={"dst": host, "depth": depth + 1, "page": 1},
-                    )
-                else:
-                    logger.debug(f"already visited or maxdepth ({depth})")
+    def parse(self, response: Response):
+        items = response.json().get("orderedItems")
+        if len(items) < 1:
+            return
+        rel = response.request.meta["rel"]
+        host = response.request.meta["host"]
+        page = response.request.meta["page"]
+        depth = response.request.meta["depth"]
+        # fetch next page
+        yield Request(
+            f"https://{host}/@relay/{rel}?page={page+1}",
+            meta={"rel": rel, "host": host, "depth": depth, "page": page + 1},
+        )
+        # fetch children
+        for item in items:
+            child = item.split("/")[2]
+            d = {"rel": rel}
+            if rel == "followers":
+                yield d | {"src": child, "dst": host}
+            else:
+                yield d | {"src": host, "dst": child}
+            if child not in self.visited and depth < self.MAX_DEPTH:
+                self.visited.add(child)
+                yield Request(
+                    f"https://{child}/@relay/followers?page=1",
+                    meta={
+                        "rel": "followers",
+                        "host": child,
+                        "depth": depth + 1,
+                        "page": 1,
+                    },
+                )
+                yield Request(
+                    f"https://{child}/@relay/following?page=1",
+                    meta={
+                        "rel": "following",
+                        "host": child,
+                        "depth": depth + 1,
+                        "page": 1,
+                    },
+                )
+            else:
+                logger.debug(f"already visited or maxdepth ({depth})")
diff --git a/template.dot b/template.dot
index 8f1e514..ef8b5da 100644
--- a/template.dot
+++ b/template.dot
@@ -1,5 +1,5 @@
 digraph fediverse {
-graph [overlap=false]
+graph [overlap=false outputorder="edgesfirst"]
 #!
 IFS=","
 while read src dst
@@ -12,4 +12,4 @@ do
   fi
 done
 #!
-}
\ No newline at end of file
+}
diff --git a/to_dot.awk b/to_dot.awk
new file mode 100644
index 0000000..6879044
--- /dev/null
+++ b/to_dot.awk
@@ -0,0 +1,15 @@
+BEGIN {
+    print "digraph fediverse {"
+    print "graph [overlap=false outputorder=\"edgesfirst\"]"
+    print "node [style=\"filled\"]"
+    FS=","
+}
+
+# rel src dst
+$1 == "followers" {
+    print "\"" $2 "\"", "->", "\""$3 "\";"
+}
+
+END {
+    print "}"
+}
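For reference, a minimal sketch of what the new to_dot.awk step consumes and
produces, assuming the spider's CSV columns come out in rel,src,dst order (the
order the "# rel src dst" comment expects); the hostnames below are invented:

  $ cat out.csv
  followers,a.example.org,b.example.org
  following,b.example.org,c.example.org
  $ awk -f to_dot.awk out.csv
  digraph fediverse {
  graph [overlap=false outputorder="edgesfirst"]
  node [style="filled"]
  "a.example.org" -> "b.example.org";
  }

Only "followers" rows become edges; "following" rows are captured in out.csv
but skipped by the current awk pattern.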