From 234773b8f0e3a5f4eb005b7ff1c6479fd9dcb77d Mon Sep 17 00:00:00 2001
From: setop
Date: Fri, 25 Nov 2022 18:10:45 +0100
Subject: [PATCH] build graph from mobilizon instances and followers

---
 .tool-versions          |  1 +
 cmd.txt                 | 17 ++++++++
 fedcrawler_followers.py | 86 +++++++++++++++++++++++++++++++++++++++++
 template.dot            | 13 ++++++
 4 files changed, 117 insertions(+)
 create mode 100644 .tool-versions
 create mode 100644 cmd.txt
 create mode 100644 fedcrawler_followers.py
 create mode 100644 template.dot

diff --git a/.tool-versions b/.tool-versions
new file mode 100644
index 0000000..c6e5191
--- /dev/null
+++ b/.tool-versions
@@ -0,0 +1 @@
+python 3.9.9
diff --git a/cmd.txt b/cmd.txt
new file mode 100644
index 0000000..8a78972
--- /dev/null
+++ b/cmd.txt
@@ -0,0 +1,17 @@
+. .venv/bin/activate
+
+pip install scrapy
+
+wget https://framagit.org/-/snippets/6539/raw/main/pp.awk
+
+curl -fsS 'https://instances.joinmobilizon.org/api/v1/instances?start=0&count=1000' | jq -r '.data[].host' > instances.txt
+
+scrapy runspider -o out.csv:csv fedcrawler_followers.py
+
+dos2unix out.csv
+
+uniq0 out.csv >| out.u.csv
+
+bash <(awk -f pp.awk template.dot) < out.u.csv >| out.dot
+
+neato -Tsvg -o out.svg out.dot
diff --git a/fedcrawler_followers.py b/fedcrawler_followers.py
new file mode 100644
index 0000000..c14a4d6
--- /dev/null
+++ b/fedcrawler_followers.py
@@ -0,0 +1,86 @@
+# -*- coding: utf-8 -*-
+"""Crawl the relay followers of Mobilizon instances to map their federation.
+
+Every scraped item is one graph edge: ``src`` follows the relay of ``dst``.
+"""
+import logging
+
+from urllib.parse import urlparse
+
+from scrapy.spiders import Spider
+from scrapy import Request
+
+
+logger = logging.getLogger(__name__)
+
+
+class Fedator(Spider):
+    """Spider walking ``/@relay/followers`` pages from host to host.
+
+    Seed hosts are read from ``instances.txt`` (one host per line); hosts
+    found among the followers are crawled too, up to ``MAX_DEPTH`` hops
+    away from a seed.
+    """
+
+    name = 'fedator'
+    custom_settings = {
+        "USER_AGENT" : 'Mozilla/5.0 (X11; Linux x86_64; rv:100.0) Gecko/20100101 Firefox/100.0',
+        "ROBOTSTXT_OBEY" : False,
+        "REFERER_ENABLED" : False,
+        "COOKIES_ENABLED" : False,
+        "TELNETCONSOLE_ENABLED" : False,
+        "HTTPCACHE_ENABLED" : True,
+        "DEFAULT_REQUEST_HEADERS" : {
+            'Accept': 'application/activity+json',
+        },
+    }
+
+    # Maximum number of hops followed away from a seed instance.
+    MAX_DEPTH = 10
+
+    # Hosts already scheduled for crawling (shared at class level).
+    visited = set()
+
+    def _followers_request(self, host, depth, page=1):
+        """Build the request for one followers page of ``host``'s relay."""
+        return Request(
+            f'https://{host}/@relay/followers?page={page}',
+            meta={"dst": host, "depth": depth, "page": page},
+        )
+
+    def start_requests(self):
+        """Yield the first followers page of every host in ``instances.txt``."""
+        # The context manager closes the file, and strip() keeps the last
+        # host intact when the file has no trailing newline.
+        with open("instances.txt") as instances:
+            for line in instances:
+                host = line.strip()
+                if not host:
+                    continue
+                self.visited.add(host)
+                yield self._followers_request(host, depth=0)
+
+    def parse(self, response):
+        """Yield one edge per follower, the next page and new hosts to crawl."""
+        # Items look like "https://mobilizon.sans-nuage.fr/relay".
+        followers = response.json().get("orderedItems") or []
+        if not followers:
+            # An empty page marks the end of the pagination.
+            return
+        dst = response.request.meta["dst"]
+        depth = response.request.meta["depth"]
+        page = response.request.meta["page"] + 1
+        # Keep the current depth while paginating so that MAX_DEPTH still
+        # applies to hosts discovered on later pages.
+        yield self._followers_request(dst, depth, page)
+        for follower in followers:
+            host = urlparse(follower).netloc
+            if not host:
+                logger.debug("skipping malformed follower %r", follower)
+                continue
+            yield {"src": host, "dst" : dst }
+            if host not in self.visited and depth < self.MAX_DEPTH:
+                self.visited.add(host)
+                yield self._followers_request(host, depth + 1)
+            else:
+                logger.debug("already visited or max depth reached (%d)", depth + 1)
diff --git a/template.dot b/template.dot
new file mode 100644
index 0000000..8c406c2
--- /dev/null
+++ b/template.dot
@@ -0,0 +1,13 @@
+digraph fediverse {
+graph [overlap=false]
+#!
+IFS=","
+while read src dst
+do
+  if [ ! $src == "src" ];
+  then
+    echo "\"${src}\" -> \"${dst}\""
+  fi
+done
+#!
+}
\ No newline at end of file