feat: add capture following
This commit is contained in:
parent
0365ad42ef
commit
d94f65092a
1
cmd.txt
1
cmd.txt
@ -5,4 +5,5 @@ scrapy runspider -o out.csv:csv fedcrawler_followers.py
|
|||||||
dos2unix out.csv
|
dos2unix out.csv
|
||||||
uniq0 out.csv | sponge out.csv
|
uniq0 out.csv | sponge out.csv
|
||||||
bash <(awk -f pp.awk template.dot) < out.csv >| out.dot
|
bash <(awk -f pp.awk template.dot) < out.csv >| out.dot
|
||||||
|
awk -f to_dot.awk out.csv >| out.dot
|
||||||
neato -Tsvg -o out.svg out.dot
|
neato -Tsvg -o out.svg out.dot
|
||||||
|
@ -17,11 +17,12 @@ class Fedator(Spider):
|
|||||||
"DEFAULT_REQUEST_HEADERS": {
|
"DEFAULT_REQUEST_HEADERS": {
|
||||||
"Accept": "application/activity+json",
|
"Accept": "application/activity+json",
|
||||||
},
|
},
|
||||||
|
"DOWNLOAD_TIMEOUT": 10,
|
||||||
}
|
}
|
||||||
|
|
||||||
MAX_DEPTH = 10
|
MAX_DEPTH = 10
|
||||||
|
|
||||||
visited = set()
|
visited: set[str] = set()
|
||||||
|
|
||||||
def start_requests(self):
|
def start_requests(self):
|
||||||
for line in open("instances.txt").readlines():
|
for line in open("instances.txt").readlines():
|
||||||
@ -29,29 +30,53 @@ class Fedator(Spider):
|
|||||||
self.visited.add(host)
|
self.visited.add(host)
|
||||||
yield Request(
|
yield Request(
|
||||||
f"https://{host}/@relay/followers?page=1",
|
f"https://{host}/@relay/followers?page=1",
|
||||||
meta={"dst": host, "depth": 0, "page": 1},
|
meta={"rel": "followers", "host": host, "depth": 0, "page": 1},
|
||||||
|
)
|
||||||
|
yield Request(
|
||||||
|
f"https://{host}/@relay/following?page=1",
|
||||||
|
meta={"rel": "following", "host": host, "depth": 0, "page": 1},
|
||||||
)
|
)
|
||||||
|
|
||||||
def parse(self, response:Response):
|
def parse(self, response: Response):
|
||||||
followers = response.json()[
|
items = response.json().get("orderedItems")
|
||||||
"orderedItems"
|
if len(items) < 1:
|
||||||
] # "https://mobilizon.sans-nuage.fr/relay"
|
return
|
||||||
if len(followers) > 0:
|
rel = response.request.meta["rel"]
|
||||||
dst = response.request.meta["dst"]
|
host = response.request.meta["host"]
|
||||||
page = response.request.meta["page"]
|
page = response.request.meta["page"]
|
||||||
depth = response.request.meta["depth"]
|
depth = response.request.meta["depth"]
|
||||||
yield Request(
|
# fetch next page
|
||||||
f"https://{dst}/@relay/followers?page={page}",
|
yield Request(
|
||||||
meta={"dst": dst, "depth": depth, "page": page + 1},
|
f"https://{host}/@relay/{rel}?page={page+1}",
|
||||||
)
|
meta={"rel": rel, "host": host, "depth": depth, "page": page + 1},
|
||||||
for follower in followers:
|
)
|
||||||
host = follower.split("/")[2]
|
# fetch children
|
||||||
yield {"src": host, "dst": dst}
|
for item in items:
|
||||||
if host not in self.visited and depth < self.MAX_DEPTH:
|
child = item.split("/")[2]
|
||||||
self.visited.add(host)
|
d = {"rel": rel}
|
||||||
yield Request(
|
if rel == "followers":
|
||||||
f"https://{host}/@relay/followers?page=1",
|
yield d | {"src": child, "dst": host}
|
||||||
meta={"dst": host, "depth": depth + 1, "page": 1},
|
else:
|
||||||
)
|
yield d | {"src": host, "dst": child}
|
||||||
else:
|
if child not in self.visited and depth < self.MAX_DEPTH:
|
||||||
logger.debug(f"already visited or maxdepth ({depth})")
|
self.visited.add(child)
|
||||||
|
yield Request(
|
||||||
|
f"https://{child}/@relay/followers?page=1",
|
||||||
|
meta={
|
||||||
|
"rel": "followers",
|
||||||
|
"host": child,
|
||||||
|
"depth": depth + 1,
|
||||||
|
"page": 1,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
yield Request(
|
||||||
|
f"https://{child}/@relay/following?page=1",
|
||||||
|
meta={
|
||||||
|
"rel": "following",
|
||||||
|
"host": child,
|
||||||
|
"depth": depth + 1,
|
||||||
|
"page": 1,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
logger.debug(f"already visited or maxdepth ({depth})")
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
digraph fediverse {
|
digraph fediverse {
|
||||||
graph [overlap=false]
|
graph [overlap=false outputorder="edgesfirst"]
|
||||||
#!
|
#!
|
||||||
IFS=","
|
IFS=","
|
||||||
while read src dst
|
while read src dst
|
||||||
@ -12,4 +12,4 @@ do
|
|||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
#!
|
#!
|
||||||
}
|
}
|
||||||
|
15
to_dot.awk
Normal file
15
to_dot.awk
Normal file
@ -0,0 +1,15 @@
|
|||||||
|
BEGIN {
|
||||||
|
print "digraph fediverse {"
|
||||||
|
print "graph [overlap=false outputorder=\"edgesfirst\"]"
|
||||||
|
print "node [style=\"filled\"]"
|
||||||
|
FS=","
|
||||||
|
}
|
||||||
|
|
||||||
|
# rel src dst
|
||||||
|
$1 == "followers" {
|
||||||
|
print "\"" $2 "\"", "->", "\""$3 "\";"
|
||||||
|
}
|
||||||
|
|
||||||
|
END {
|
||||||
|
print "}"
|
||||||
|
}
|
Loading…
x
Reference in New Issue
Block a user