feat: add capture following
This commit is contained in:
parent
0365ad42ef
commit
d94f65092a
1
cmd.txt
1
cmd.txt
@ -5,4 +5,5 @@ scrapy runspider -o out.csv:csv fedcrawler_followers.py
|
||||
dos2unix out.csv
|
||||
uniq0 out.csv | sponge out.csv
|
||||
bash <(awk -f pp.awk template.dot) < out.csv >| out.dot
|
||||
awk -f to_dot.awk out.csv >| out.dot
|
||||
neato -Tsvg -o out.svg out.dot
|
||||
|
@ -17,11 +17,12 @@ class Fedator(Spider):
|
||||
"DEFAULT_REQUEST_HEADERS": {
|
||||
"Accept": "application/activity+json",
|
||||
},
|
||||
"DOWNLOAD_TIMEOUT": 10,
|
||||
}
|
||||
|
||||
MAX_DEPTH = 10
|
||||
|
||||
visited = set()
|
||||
visited: set[str] = set()
|
||||
|
||||
def start_requests(self):
|
||||
for line in open("instances.txt").readlines():
|
||||
@ -29,29 +30,53 @@ class Fedator(Spider):
|
||||
self.visited.add(host)
|
||||
yield Request(
|
||||
f"https://{host}/@relay/followers?page=1",
|
||||
meta={"dst": host, "depth": 0, "page": 1},
|
||||
meta={"rel": "followers", "host": host, "depth": 0, "page": 1},
|
||||
)
|
||||
yield Request(
|
||||
f"https://{host}/@relay/following?page=1",
|
||||
meta={"rel": "following", "host": host, "depth": 0, "page": 1},
|
||||
)
|
||||
|
||||
def parse(self, response: Response):
    """Parse one page of a relay's followers collection.

    Yields one edge item ``{"src": follower_host, "dst": dst}`` per
    follower, a request for the next page of the same collection, and a
    depth-limited crawl request for each not-yet-visited follower host.
    """
    # "orderedItems" holds actor URLs such as
    # "https://mobilizon.sans-nuage.fr/relay"; default to [] so a
    # response without the key is skipped instead of raising KeyError.
    followers = response.json().get("orderedItems", [])
    if len(followers) > 0:
        dst = response.request.meta["dst"]
        page = response.request.meta["page"]
        depth = response.request.meta["depth"]
        # FIX: request the *next* page. The original used page={page},
        # i.e. the URL just fetched — Scrapy's duplicate filter drops
        # it, so pagination never advanced past page 1.
        yield Request(
            f"https://{dst}/@relay/followers?page={page + 1}",
            meta={"dst": dst, "depth": depth, "page": page + 1},
        )
        for follower in followers:
            # hostname component of the actor URL ("https://host/...")
            host = follower.split("/")[2]
            yield {"src": host, "dst": dst}
            if host not in self.visited and depth < self.MAX_DEPTH:
                self.visited.add(host)
                yield Request(
                    f"https://{host}/@relay/followers?page=1",
                    meta={"dst": host, "depth": depth + 1, "page": 1},
                )
            else:
                logger.debug(f"already visited or maxdepth ({depth})")
|
||||
def parse(self, response: Response):
    """Parse one page of a relay's followers/following collection.

    Yields one edge item per entry (direction depends on ``rel``), a
    request for the next page of the same collection, and, for every
    unvisited child host below ``MAX_DEPTH``, crawl requests for that
    host's own followers and following collections.
    """
    # FIX: default to [] — a response without "orderedItems" made
    # .get() return None and len(None) raised TypeError.
    items = response.json().get("orderedItems", [])
    if len(items) < 1:
        return
    rel = response.request.meta["rel"]    # "followers" or "following"
    host = response.request.meta["host"]  # host whose collection this is
    page = response.request.meta["page"]
    depth = response.request.meta["depth"]
    # fetch next page of the same collection
    yield Request(
        f"https://{host}/@relay/{rel}?page={page+1}",
        meta={"rel": rel, "host": host, "depth": depth, "page": page + 1},
    )
    # fetch children
    for item in items:
        # hostname component of the actor URL ("https://host/...")
        child = item.split("/")[2]
        d = {"rel": rel}
        # edge direction is follower -> followed, so it flips with rel
        if rel == "followers":
            yield d | {"src": child, "dst": host}
        else:
            yield d | {"src": host, "dst": child}
        if child not in self.visited and depth < self.MAX_DEPTH:
            self.visited.add(child)
            yield Request(
                f"https://{child}/@relay/followers?page=1",
                meta={
                    "rel": "followers",
                    "host": child,
                    "depth": depth + 1,
                    "page": 1,
                },
            )
            yield Request(
                f"https://{child}/@relay/following?page=1",
                meta={
                    "rel": "following",
                    "host": child,
                    "depth": depth + 1,
                    "page": 1,
                },
            )
        else:
            logger.debug(f"already visited or maxdepth ({depth})")
|
||||
|
@ -1,5 +1,5 @@
|
||||
digraph fediverse {
|
||||
graph [overlap=false]
|
||||
graph [overlap=false outputorder="edgesfirst"]
|
||||
#!
|
||||
IFS=","
|
||||
while read src dst
|
||||
@ -12,4 +12,4 @@ do
|
||||
fi
|
||||
done
|
||||
#!
|
||||
}
|
||||
}
|
||||
|
15
to_dot.awk
Normal file
15
to_dot.awk
Normal file
@ -0,0 +1,15 @@
|
||||
# Turn the crawler's CSV output (rel,src,dst) into a Graphviz digraph.
BEGIN {
    print "digraph fediverse {"
    print "graph [overlap=false outputorder=\"edgesfirst\"]"
    print "node [style=\"filled\"]"
    FS = ","
}

# rel src dst — only "followers" rows are drawn; presumably the
# "following" rows describe the same edges in reverse (TODO confirm).
$1 == "followers" {
    printf "\"%s\" -> \"%s\";\n", $2, $3
}

END {
    print "}"
}
|
Loading…
x
Reference in New Issue
Block a user