From 9e411cee546f25127ba907229831f7636635fc9e Mon Sep 17 00:00:00 2001 From: setop Date: Sat, 5 Jul 2025 01:34:59 +0200 Subject: [PATCH] chore: package --- .dockerignore | 2 + .gitignore | 176 +++++++++++++++++++ DESIGN.md | 27 ++- Dockerfile | 24 +++ cmd.txt | 7 + requirements.txt | 2 + s6/crond/run | 9 + s6/sqlpage/run | 4 + schema.sql | 6 + scripts/runjob.sh | 5 + src/instances.py | 93 ++++++++++ src/model.py | 38 ++++ src/query.gql | 23 +++ add_instance.sql => webroot/add_instance.sql | 0 index.sql => webroot/index.sql | 3 + stats.sql => webroot/stats.sql | 9 + 16 files changed, 424 insertions(+), 4 deletions(-) create mode 100644 .dockerignore create mode 100644 Dockerfile create mode 100644 requirements.txt create mode 100755 s6/crond/run create mode 100755 s6/sqlpage/run create mode 100755 scripts/runjob.sh create mode 100644 src/instances.py create mode 100644 src/model.py create mode 100644 src/query.gql rename add_instance.sql => webroot/add_instance.sql (100%) rename index.sql => webroot/index.sql (75%) rename stats.sql => webroot/stats.sql (86%) diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..0c2697a --- /dev/null +++ b/.dockerignore @@ -0,0 +1,2 @@ +.env +**/__pycache__ diff --git a/.gitignore b/.gitignore index 03bd412..08d8305 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,177 @@ *.env +# Created by https://www.toptal.com/developers/gitignore/api/python +# Edit at https://www.toptal.com/developers/gitignore?templates=python + +### Python ### +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ + +### Python Patch ### +# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration +poetry.toml + +# ruff +.ruff_cache/ + +# LSP config files +pyrightconfig.json + +# End of https://www.toptal.com/developers/gitignore/api/python diff --git a/DESIGN.md b/DESIGN.md index 3f42784..85efadb 100644 --- a/DESIGN.md +++ b/DESIGN.md @@ -12,20 +12,26 @@ # MVP - [x] chore: data model + - [ ] model : add "s" to failure + - [ ] model : add "s" to following + - [ ] add "blacklisted" - [x] data migration scripts - [ ] homepage - [x] form to register an instance - [ ] check for duplicates - [ ] grab first stats : version, users, groups, events - [ ] if fail, set failure to 1 - - [ ] confirmation page + - [x] confirmation page + - [ ] find a way to populate location - [x] instances list, no pagination - [ ] abuse link -- [ ] cron - - [ ] select instances where failure < max_failure +- [ ] scraper + - [x] select instances where failure < max_failure - [ ] for each entry, fetch stats - - [ ] if fail, set failure to failure+1 + - [ ] if 
fail, set failure to min(1, failure+1) - [ ] if success, set failure = 0 + - [x] insert new stats + - [x] update instances info - [ ] stats page - [x] big numbers - [x] total Instances @@ -41,6 +47,19 @@ - [ ] versions pie chart - [ ] languages pie chart (user weighted ?) - [ ] location pie chart +- [ ] admin panel + - [ ] authentication (shared secret, oauth ?) + - [ ] list failed domain + - [ ] rescan an instance + - [ ] allow to blacklist a domain +- [ ] package + - [ ] docker image with : s6 or tini + crond + python(pyinstaller) + sqlpage +- [ ] deploy on `beta-instances.mobilizon.org` + - [ ] DNS + - [ ] HC ping + + + # impl diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..05dbee5 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,24 @@ +FROM python:3.13-alpine +# curl is required by the healthcheck ping that s6/crond/run writes into the crontab +RUN apk add --no-cache s6 bash curl wget unzip + +COPY requirements.txt . + +RUN python -m pip install --no-cache-dir -r requirements.txt + +COPY ./s6 /etc/s6 + +WORKDIR /app + +COPY scripts scripts +COPY src src +COPY sqlpage sqlpage +COPY webroot webroot + +RUN wget -q https://github.com/sqlpage/SQLPage/releases/download/v0.35.2/sqlpage-aws-lambda.zip \ + && unzip sqlpage-aws-lambda.zip bootstrap \ + && rm sqlpage-aws-lambda.zip + +ENTRYPOINT [ "/usr/bin/s6-svscan", "/etc/s6" ] + +EXPOSE 8080 diff --git a/cmd.txt b/cmd.txt index 77f7c0b..61aceea 100644 --- a/cmd.txt +++ b/cmd.txt @@ -1 +1,8 @@ . .venv/bin/activate + +export VERSION=0.1.0 + +docker build -t mobilizon-instances:${VERSION} . 
+ +docker run -d --name mbzinstances --env-file .envfile -v $(realpath sqlpage):/data mobilizon-instances:${VERSION} + diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..0ad6b15 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,2 @@ +Scrapy +pydantic diff --git a/s6/crond/run b/s6/crond/run new file mode 100755 index 0000000..1996025 --- /dev/null +++ b/s6/crond/run @@ -0,0 +1,9 @@ +#!/bin/bash -eu + +# specific to alpine, won't work with debian + +# place crontab /!\ UTC time +printf "ping=\"curl -fsS -m 10 --retry 5 -o /dev/null ${HC_PING}\"\n${SCHEDULE} /app/scripts/runjob.sh && \${ping} || \${ping}/fail \n" | crontab - + +# start cron +crond -f diff --git a/s6/sqlpage/run b/s6/sqlpage/run new file mode 100755 index 0000000..d608e63 --- /dev/null +++ b/s6/sqlpage/run @@ -0,0 +1,4 @@ +#!/bin/sh -eu + +/app/bootstrap -c /app/sqlpage/sqlpage.json + diff --git a/schema.sql b/schema.sql index a6cee17..5a7d187 100644 --- a/schema.sql +++ b/schema.sql @@ -1,4 +1,10 @@ +-- set some pragma +PRAGMA journal_mode = WAL; +PRAGMA busy_timeout = 15000; -- for interactive, 15s for background tasks +PRAGMA synchronous = NORMAL; +PRAGMA cache_size = 1000000000; -- means infinite PRAGMA foreign_keys = true; +PRAGMA temp_store = memory; CREATE TABLE instances ( -- PK will be rowid diff --git a/scripts/runjob.sh b/scripts/runjob.sh new file mode 100755 index 0000000..95ebddc --- /dev/null +++ b/scripts/runjob.sh @@ -0,0 +1,5 @@ +#!/bin/sh -eu + +CMDD=$(dirname $(realpath $0)) +cd $(dirname "${CMDD}") +python -m scrapy runspider src/instances.py diff --git a/src/instances.py b/src/instances.py new file mode 100644 index 0000000..21e8e58 --- /dev/null +++ b/src/instances.py @@ -0,0 +1,93 @@ +import sys +import os +import logging +import time +import sqlite3 +from scrapy import Spider, Request +from scrapy.http import Response +from model import Model + +logger = logging.getLogger(__name__) + + +def yield_from_db(): + with 
sqlite3.connect(f'file:{os.environ.get("DATADIR")}/sqlpage.db?mode=ro', uri=True) as db: + for row in db.execute('select rowid, domain from instances where failure < 5'): + yield row + +def yield_tests(): + #yield (105, 'keskonfai.fr',) + yield (7, 'mobilizon.fr',) + +class Fedator(Spider): + name = "fedator" + custom_settings = { + "USER_AGENT": "Mozilla/5.0 (X11; Linux x86_64; rv:100.0) Gecko/20100101 Firefox/100.0", + "ROBOTSTXT_OBEY": False, + "REFERER_ENABLED": False, + "COOKIES_ENABLED": False, + "TELNETCONSOLE_ENABLED": False, + "HTTPCACHE_ENABLED": False, + "DEFAULT_REQUEST_HEADERS": { + "Accept": "application/json", + "Content-Type": "application/json", + }, + "DOWNLOAD_TIMEOUT": 10, + } + + + async def start(self): # entry point: yields one GraphQL POST request per (rowid, domain) row + from json import dumps + with open('src/query.gql', 'r') as gql_file: # context manager closes the file handle + gql_query = gql_file.read() + body = dumps({"query": gql_query}) # the same JSON body is reused for every request + # NOTE(review): rows still come from the hard-coded yield_tests(), not yield_from_db() - confirm + for row in yield_tests(): + domain = row[1] + yield Request( + url = f"https://{domain}/api", + method = 'POST', + body = body, + meta = { "record": row, }, + ) + + def parse(self, response: Response): + res:Model = Model.model_validate_json(response.body) + meta = response.request.meta + instance_id = meta.get('record')[0] + with sqlite3.connect(f'file:{os.environ.get("DATADIR")}/sqlpage.db', timeout=15.0, isolation_level='IMMEDIATE') as db: + # insert new stats + stats = res.data.statistics + db.execute('''insert into stats(insertedAt, instance_id, users, local_groups, total_groups, local_events, total_events, local_comments, total_comments, following, followers) values (?,?,?,?,?,?,?,?,?,?,?) 
+ ''', ( + int(time.time()) # insertedAt + , instance_id + , stats.number_of_users # users + , stats.number_of_local_groups + , stats.number_of_groups + , stats.number_of_local_events + , stats.number_of_events + , stats.number_of_local_comments + , stats.number_of_comments + , stats.number_of_instance_followings + , stats.number_of_instance_followers + + )) + # update info + config = res.data.config + db.execute('''update instances set name=?,slogan=?,description=?,languages=?,open=?,version=? + -- ,location=? + ,failure=?,updatedAt=? where rowid=? + ''', ( + config.name + , config.slogan + , config.description + , ",".join(config.languages) + , config.registrations_open + , config.version + #, config.location + , 0 # failure + , int(time.time()) # updatedAt + , instance_id + )) + diff --git a/src/model.py b/src/model.py new file mode 100644 index 0000000..44e1745 --- /dev/null +++ b/src/model.py @@ -0,0 +1,38 @@ +from __future__ import annotations + +from typing import Any, List, Optional + +from pydantic import BaseModel, Field + + +class Config(BaseModel): + country_code: Any = Field(..., alias='countryCode') + description: str + languages: List[str] + long_description: str = Field(..., alias='longDescription') + name: str + registrations_open: bool = Field(..., alias='registrationsOpen') + slogan: str + version: str + + +class Statistics(BaseModel): + number_of_comments: int = Field(..., alias='numberOfComments') + number_of_events: int = Field(..., alias='numberOfEvents') + number_of_groups: int = Field(..., alias='numberOfGroups') + number_of_instance_followers: int = Field(..., alias='numberOfInstanceFollowers') + number_of_instance_followings: int = Field(..., alias='numberOfInstanceFollowings') + number_of_local_comments: int = Field(..., alias='numberOfLocalComments') + number_of_local_events: int = Field(..., alias='numberOfLocalEvents') + number_of_local_groups: int = Field(..., alias='numberOfLocalGroups') + number_of_users: int = Field(..., 
alias='numberOfUsers') + + +class Data(BaseModel): + config: Config + statistics: Statistics + + +class Model(BaseModel): + data: Data + diff --git a/src/query.gql b/src/query.gql new file mode 100644 index 0000000..12859d3 --- /dev/null +++ b/src/query.gql @@ -0,0 +1,23 @@ +query About { + statistics { + numberOfUsers + numberOfLocalGroups + numberOfGroups + numberOfLocalEvents + numberOfEvents + numberOfLocalComments + numberOfComments + numberOfInstanceFollowings + numberOfInstanceFollowers + } + config { + name + version + registrationsOpen + slogan + description + longDescription + countryCode + languages + } +} diff --git a/add_instance.sql b/webroot/add_instance.sql similarity index 100% rename from add_instance.sql rename to webroot/add_instance.sql diff --git a/index.sql b/webroot/index.sql similarity index 75% rename from index.sql rename to webroot/index.sql index 598665a..559e6e7 100644 --- a/index.sql +++ b/webroot/index.sql @@ -2,6 +2,8 @@ select 'shell' as component , 'Mobilizon Instances' as title , 'social' as icon + , JSON('{"title":"Statistics","link":"/stats", "icon":"chart-dots"}') as menu_item + , JSON('{"link":"mailto:contact@kaihuri.org?subject=report%20an%20instance","title":"Report an instance","icon":"forms"}') as menu_item , '' as footer ; @@ -34,6 +36,7 @@ select select domain as Url , name + , version , slogan , description from instances diff --git a/stats.sql b/webroot/stats.sql similarity index 86% rename from stats.sql rename to webroot/stats.sql index 6b2da92..764eab9 100644 --- a/stats.sql +++ b/webroot/stats.sql @@ -1,3 +1,12 @@ +select + 'shell' as component + , 'Mobilizon statistics' as title + , 'chart-dots' as icon + , JSON('{"title":"Instances list","link":"/", "icon":"social"}') as menu_item + , JSON('{"link":"mailto:contact@kaihuri.org?subject=report%20an%20instance","title":"Report an instance","icon":"forms"}') as menu_item + , '' as footer +; + select 'big_number' as component , 4 as columns