chore: format and minor fix

setop 2024-12-20 18:54:43 +01:00
parent bb6dc019ef
commit 0365ad42ef
2 changed files with 51 additions and 56 deletions

cmd.txt

@@ -1,17 +1,8 @@
 . .venv/bin/activate
-pip install scrapy
 wget https://framagit.org/-/snippets/6539/raw/main/pp.awk
-curl -fsS 'https://instances.joinmobilizon.org/api/v1/instances?start=0&count=1000' | jq -r '.data[].host' instances.json > instances.txt
+curl -fsS 'https://instances.joinmobilizon.org/api/v1/instances?start=0&count=1000' | tee instances.json | jq -r '.data[].host' > instances.txt
 scrapy runspider -o out.csv:csv fedcrawler_followers.py
 dos2unix out.csv
-uniq0 out.csv >| out.u.csv
-bash <(awk -f pp.awk template.dot) < out.u.csv >| out.dot
+uniq0 out.csv | sponge out.csv
+bash <(awk -f pp.awk template.dot) < out.csv >| out.dot
 neato -Tsvg -o out.svg out.dot
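The curl line is the "minor fix": jq ignores stdin once it is given a filename argument, so the old command read instances.json (which nothing had written yet) instead of the fresh download. The fixed command tees the response into instances.json while jq extracts the hosts from stdin; likewise, piping uniq0 through sponge drops the out.u.csv intermediate. A minimal Python sketch of what the fixed curl | tee | jq line produces — illustration only; the URL and the .data[].host filter come from cmd.txt above, and the response shape is inferred from that filter:

import json
from urllib.request import urlopen

URL = "https://instances.joinmobilizon.org/api/v1/instances?start=0&count=1000"

raw = urlopen(URL).read()
with open("instances.json", "wb") as f:  # what `tee instances.json` keeps
    f.write(raw)

# jq -r '.data[].host' equivalent: one host per line into instances.txt
hosts = [instance["host"] for instance in json.loads(raw)["data"]]
with open("instances.txt", "w") as f:
    f.writelines(host + "\n" for host in hosts)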

fedcrawler_followers.py

@@ -1,53 +1,57 @@
-# -*- coding: utf-8 -*-
 import logging
 from urllib.parse import urlparse
 from scrapy.spiders import Spider
-from scrapy import Request
+from scrapy.http import Request, Response
 
 logger = logging.getLogger(__name__)
 
 
 class Fedator(Spider):
-    name = 'fedator'
+    name = "fedator"
     custom_settings = {
-        "USER_AGENT" : 'Mozilla/5.0 (X11; Linux x86_64; rv:100.0) Gecko/20100101 Firefox/100.0',
-        "ROBOTSTXT_OBEY" : False,
-        "REFERER_ENABLED" : False,
-        "COOKIES_ENABLED" : False,
-        "TELNETCONSOLE_ENABLED" : False,
-        "HTTPCACHE_ENABLED" : True,
-        "DEFAULT_REQUEST_HEADERS" : {
-            'Accept': 'application/activity+json',
+        "USER_AGENT": "Mozilla/5.0 (X11; Linux x86_64; rv:100.0) Gecko/20100101 Firefox/100.0",
+        "ROBOTSTXT_OBEY": False,
+        "REFERER_ENABLED": False,
+        "COOKIES_ENABLED": False,
+        "TELNETCONSOLE_ENABLED": False,
+        "HTTPCACHE_ENABLED": True,
+        "DEFAULT_REQUEST_HEADERS": {
+            "Accept": "application/activity+json",
         },
     }
     MAX_DEPTH = 10
     visited = set()
 
     def start_requests(self):
         for line in open("instances.txt").readlines():
             host = line[:-1]
             self.visited.add(host)
-            yield Request(f'https://{host}/@relay/followers?page=1', meta= {"dst":host, "depth" : 0, "page":1 })
+            yield Request(
+                f"https://{host}/@relay/followers?page=1",
+                meta={"dst": host, "depth": 0, "page": 1},
+            )
 
-    def parse(self, response):
-        followers = response.json()["orderedItems"] # "https://mobilizon.sans-nuage.fr/relay"
-        if len(followers)>0:
+    def parse(self, response: Response):
+        followers = response.json()[
+            "orderedItems"
+        ]  # "https://mobilizon.sans-nuage.fr/relay"
+        if len(followers) > 0:
             dst = response.request.meta["dst"]
-            page = response.request.meta["page"] + 1
-            yield Request(f'https://{dst}/@relay/followers?page={page}', meta= {"dst":dst, "depth" : 0, "page": page})
+            page = response.request.meta["page"]
             depth = response.request.meta["depth"]
+            yield Request(
+                f"https://{dst}/@relay/followers?page={page}",
+                meta={"dst": dst, "depth": depth, "page": page + 1},
+            )
             for follower in followers:
                 host = follower.split("/")[2]
-                yield {"src": host, "dst" : dst }
+                yield {"src": host, "dst": dst}
                 if host not in self.visited and depth < self.MAX_DEPTH:
                     self.visited.add(host)
-                    yield Request(f'https://{host}/@relay/followers?page=1', meta= {"dst":host, "depth" : depth+1, "page":1 })
+                    yield Request(
+                        f"https://{host}/@relay/followers?page=1",
+                        meta={"dst": host, "depth": depth + 1, "page": 1},
+                    )
                 else:
-                    logger.debug(f"already visited or maxdepth ({depth})")
+                    logger.debug(f"already visited or maxdepth ({depth})")