From 0365ad42efab3df35be5c3d3d89a8362fcbe57f3 Mon Sep 17 00:00:00 2001
From: setop
Date: Fri, 20 Dec 2024 18:54:43 +0100
Subject: [PATCH] chore: format and minor fix

---
 cmd.txt                 | 15 ++-----
 fedcrawler_followers.py | 92 +++++++++++++++++++++--------------------
 2 files changed, 51 insertions(+), 56 deletions(-)

diff --git a/cmd.txt b/cmd.txt
index 8a78972..b25bf3e 100644
--- a/cmd.txt
+++ b/cmd.txt
@@ -1,17 +1,8 @@
 . .venv/bin/activate
-
-pip install scrapy
-
 wget https://framagit.org/-/snippets/6539/raw/main/pp.awk
-
-curl -fsS 'https://instances.joinmobilizon.org/api/v1/instances?start=0&count=1000' | jq -r '.data[].host' instances.json > instances.txt
-
+curl -fsS 'https://instances.joinmobilizon.org/api/v1/instances?start=0&count=1000' | tee instances.json | jq -r '.data[].host' > instances.txt
 scrapy runspider -o out.csv:csv fedcrawler_followers.py
-
 dos2unix out.csv
-
-uniq0 out.csv >| out.u.csv
-
-bash <(awk -f pp.awk template.dot) < out.u.csv >| out.dot
-
+uniq0 out.csv | sponge out.csv
+bash <(awk -f pp.awk template.dot) < out.csv >| out.dot
 neato -Tsvg -o out.svg out.dot
diff --git a/fedcrawler_followers.py b/fedcrawler_followers.py
index c14a4d6..16cf287 100644
--- a/fedcrawler_followers.py
+++ b/fedcrawler_followers.py
@@ -1,53 +1,57 @@
-# -*- coding: utf-8 -*-
 import logging
-
-from urllib.parse import urlparse
-
 from scrapy.spiders import Spider
-from scrapy import Request
-
+from scrapy.http import Request, Response
 
 logger = logging.getLogger(__name__)
-
 
 
 class Fedator(Spider):
-    name = 'fedator'
-    custom_settings = {
-        "USER_AGENT" : 'Mozilla/5.0 (X11; Linux x86_64; rv:100.0) Gecko/20100101 Firefox/100.0',
-        "ROBOTSTXT_OBEY" : False,
-        "REFERER_ENABLED" : False,
-        "COOKIES_ENABLED" : False,
-        "TELNETCONSOLE_ENABLED" : False,
-        "HTTPCACHE_ENABLED" : True,
-        "DEFAULT_REQUEST_HEADERS" : {
-            'Accept': 'application/activity+json',
-        },
-    }
-
-    MAX_DEPTH = 10
+    name = "fedator"
+    custom_settings = {
+        "USER_AGENT": "Mozilla/5.0 (X11; Linux x86_64; rv:100.0) Gecko/20100101 Firefox/100.0",
+        "ROBOTSTXT_OBEY": False,
+        "REFERER_ENABLED": False,
+        "COOKIES_ENABLED": False,
+        "TELNETCONSOLE_ENABLED": False,
+        "HTTPCACHE_ENABLED": True,
+        "DEFAULT_REQUEST_HEADERS": {
+            "Accept": "application/activity+json",
+        },
+    }
 
-    visited = set()
-
-    def start_requests(self):
-        for line in open("instances.txt").readlines():
-            host = line[:-1]
-            self.visited.add(host)
-            yield Request(f'https://{host}/@relay/followers?page=1', meta= {"dst":host, "depth" : 0, "page":1 })
-
+    MAX_DEPTH = 10
 
-    def parse(self, response):
-        followers = response.json()["orderedItems"] # "https://mobilizon.sans-nuage.fr/relay"
-        if len(followers)>0:
-            dst = response.request.meta["dst"]
-            page = response.request.meta["page"] + 1
-            yield Request(f'https://{dst}/@relay/followers?page={page}', meta= {"dst":dst, "depth" : 0, "page": page})
-            depth = response.request.meta["depth"]
-            for follower in followers:
-                host = follower.split("/")[2]
-                yield {"src": host, "dst" : dst }
-                if host not in self.visited and depth < self.MAX_DEPTH:
-                    self.visited.add(host)
-                    yield Request(f'https://{host}/@relay/followers?page=1', meta= {"dst":host, "depth" : depth+1, "page":1 })
-                else:
-                    logger.debug(f"already visited of maxdepth ({depth+1})")
+    visited = set()
+
+    def start_requests(self):
+        for line in open("instances.txt").readlines():
+            host = line[:-1]
+            self.visited.add(host)
+            yield Request(
+                f"https://{host}/@relay/followers?page=1",
+                meta={"dst": host, "depth": 0, "page": 1},
+            )
+
+    def parse(self, response: Response):
+        followers = response.json()[
+            "orderedItems"
+        ]  # "https://mobilizon.sans-nuage.fr/relay"
+        if len(followers) > 0:
+            dst = response.request.meta["dst"]
+            page = response.request.meta["page"]
+            depth = response.request.meta["depth"]
+            yield Request(
+                f"https://{dst}/@relay/followers?page={page + 1}",
+                meta={"dst": dst, "depth": depth, "page": page + 1},
+            )
+            for follower in followers:
+                host = follower.split("/")[2]
+                yield {"src": host, "dst": dst}
+                if host not in self.visited and depth < self.MAX_DEPTH:
+                    self.visited.add(host)
+                    yield Request(
+                        f"https://{host}/@relay/followers?page=1",
+                        meta={"dst": host, "depth": depth + 1, "page": 1},
+                    )
+                else:
+                    logger.debug(f"already visited or maxdepth ({depth})")