Add InfluxDB/Grafana monitoring

This commit is contained in:
Ske 2018-07-16 20:53:41 +02:00
parent 9c0811afe8
commit 0b8488606b
11 changed files with 668 additions and 23 deletions

View File

@ -1 +1 @@
from . import commands, db, proxy from . import commands, db, proxy, stats

View File

@ -1,6 +1,8 @@
from datetime import datetime
import logging import logging
import json import json
import os import os
import time
import discord import discord
@ -38,7 +40,7 @@ async def on_message(message):
# Split into args. shlex sucks so we don't bother with quotes # Split into args. shlex sucks so we don't bother with quotes
args = message.content.split(" ") args = message.content.split(" ")
from pluralkit import proxy, utils from pluralkit import proxy, utils, stats
command_items = utils.command_map.items() command_items = utils.command_map.items()
command_items = sorted(command_items, key=lambda x: len(x[0]), reverse=True) command_items = sorted(command_items, key=lambda x: len(x[0]), reverse=True)
@ -54,7 +56,14 @@ async def on_message(message):
args = [] args = []
async with client.pool.acquire() as conn: async with client.pool.acquire() as conn:
time_before = time.perf_counter()
await func(conn, message, args) await func(conn, message, args)
time_after = time.perf_counter()
# Report command time stats
execution_time = time_after - time_before
response_time = (datetime.now() - message.timestamp).total_seconds()
await stats.report_command(command, execution_time, response_time)
return return
# Try doing proxy parsing # Try doing proxy parsing
@ -82,7 +91,7 @@ async def on_socket_raw_receive(msg):
pass pass
async def run(): async def run():
from pluralkit import db from pluralkit import db, stats
try: try:
logger.info("Connecting to database...") logger.info("Connecting to database...")
pool = await db.connect() pool = await db.connect()
@ -92,6 +101,7 @@ async def run():
await db.create_tables(conn) await db.create_tables(conn)
logger.info("Connecting to InfluxDB...") logger.info("Connecting to InfluxDB...")
await stats.connect()
client.pool = pool client.pool = pool
logger.info("Connecting to Discord...") logger.info("Connecting to Discord...")

View File

@ -3,9 +3,9 @@ import time
import asyncpg import asyncpg
import asyncpg.exceptions import asyncpg.exceptions
from pluralkit import stats
from pluralkit.bot import logger from pluralkit.bot import logger
async def connect(): async def connect():
while True: while True:
try: try:
@ -17,11 +17,17 @@ async def connect():
def db_wrap(func):
    """Decorate an async database query with timing, stats and error logging.

    The wrapped coroutine is awaited with the arguments it was called with.
    On success its result is returned, the elapsed time is written to the
    debug log, and a successful query is reported to the stats module.

    If the query raises ``asyncpg.exceptions.PostgresError`` the failure is
    reported to the stats module and logged with its traceback, and the
    wrapper returns ``None`` instead of propagating the error. Any other
    exception type propagates to the caller unchanged.
    """
    # Local import: keeps the decorator self-contained without touching the
    # file-level import block.
    import functools

    # functools.wraps keeps the query's own __name__/__doc__ on the wrapper,
    # so introspection and tracebacks do not show every query as "inner".
    @functools.wraps(func)
    async def inner(*args, **kwargs):
        before = time.perf_counter()
        try:
            res = await func(*args, **kwargs)
            after = time.perf_counter()

            logger.debug(" - DB call {} took {:.2f} ms".format(func.__name__, (after - before) * 1000))
            await stats.report_db_query(func.__name__, after - before, True)

            return res
        except asyncpg.exceptions.PostgresError:
            # Best-effort by design: record and log the failure, then hand
            # the caller None rather than re-raising.
            await stats.report_db_query(func.__name__, time.perf_counter() - before, False)
            logger.exception("Error from database query {}".format(func.__name__))
            return None
    return inner
@db_wrap @db_wrap
@ -223,6 +229,14 @@ async def update_server(conn, server_id: str, logging_channel_id: str):
logger.debug("Updating server settings (id={}, log_channel={})".format(server_id, logging_channel_id)) logger.debug("Updating server settings (id={}, log_channel={})".format(server_id, logging_channel_id))
await conn.execute("insert into servers (id, log_channel) values ($1, $2) on conflict (id) do update set log_channel = $2", int(server_id), logging_channel_id) await conn.execute("insert into servers (id, log_channel) values ($1, $2) on conflict (id) do update set log_channel = $2", int(server_id), logging_channel_id)
@db_wrap
async def member_count(conn):
    """Return the result of ``select count(*)`` over the ``members`` table."""
    total = await conn.fetchval("select count(*) from members")
    return total
@db_wrap
async def system_count(conn):
    """Return the result of ``select count(*)`` over the ``systems`` table."""
    total = await conn.fetchval("select count(*) from systems")
    return total
async def create_tables(conn): async def create_tables(conn):
await conn.execute("""create table if not exists systems ( await conn.execute("""create table if not exists systems (
id serial primary key, id serial primary key,

View File

@ -6,7 +6,7 @@ import time
import aiohttp import aiohttp
import discord import discord
from pluralkit import db from pluralkit import db, stats
from pluralkit.bot import client, logger from pluralkit.bot import client, logger
def make_log_embed(hook_message, member, channel_name): def make_log_embed(hook_message, member, channel_name):
@ -98,20 +98,28 @@ async def send_hook_message(member, hook_id, hook_token, text=None, image_url=No
fd.add_field("file", image_resp.content, content_type=image_resp.content_type, filename=image_resp.url.name) fd.add_field("file", image_resp.content, content_type=image_resp.content_type, filename=image_resp.url.name)
# Send the actual webhook request, and wait for a response # Send the actual webhook request, and wait for a response
time_before = time.perf_counter()
try:
async with session.post("https://discordapp.com/api/v6/webhooks/{}/{}?wait=true".format(hook_id, hook_token), async with session.post("https://discordapp.com/api/v6/webhooks/{}/{}?wait=true".format(hook_id, hook_token),
data=fd, data=fd,
headers=req_headers) as resp: headers=req_headers) as resp:
if resp.status == 200: if resp.status == 200:
resp_data = await resp.json() resp_data = await resp.json()
# Make a fake message object for passing on - this is slightly broken but works for most things # Make a fake message object for passing on - this is slightly broken but works for most things
msg = discord.Message(reactions=[], **resp_data) msg = discord.Message(reactions=[], **resp_data)
# Make sure it's added to the client's message cache - otherwise events r # Report to stats
#client.messages.append(msg) await stats.report_webhook(time.perf_counter() - time_before, True)
return msg return msg
else: else:
await stats.report_webhook(time.perf_counter() - time_before, False)
# Fake a Discord exception, also because #yolo # Fake a Discord exception, also because #yolo
raise discord.HTTPException(resp, await resp.text()) raise discord.HTTPException(resp, await resp.text())
except aiohttp.ClientResponseError:
await stats.report_webhook(time.perf_counter() - time_before, False)
logger.exception("Error while sending webhook message")
async def proxy_message(conn, member, trigger_message, inner): async def proxy_message(conn, member, trigger_message, inner):

29
bot/pluralkit/stats.py Normal file
View File

@ -0,0 +1,29 @@
from aioinflux import InfluxDBClient
from pluralkit.bot import logger
client = None
async def connect():
    """Create the module-level InfluxDB client and issue a create-database call.

    Rebinds the global ``client`` to a new ``InfluxDBClient`` pointed at the
    ``influx`` host, then asks the server to create the ``pluralkit``
    database that the client writes to.
    """
    global client

    influx = InfluxDBClient(host="influx", db="pluralkit")
    client = influx
    await influx.create_database(db="pluralkit")
async def report_db_query(query_name, time, success):
    """Write one ``database_query`` point through the module-level client.

    query_name -- stored as the ``query`` tag.
    time       -- stored as the ``response_time`` field.
    success    -- converted with ``int()`` and stored as the ``success`` field.
    """
    tags = {"query": query_name}
    fields = {"response_time": time, "success": int(success)}
    point = {
        "measurement": "database_query",
        "tags": tags,
        "fields": fields,
    }
    await client.write(point)
async def report_command(command_name, execution_time, response_time):
    """Write one ``command`` point through the module-level client.

    command_name   -- stored as the ``command`` tag.
    execution_time -- stored as the ``execution_time`` field.
    response_time  -- stored as the ``response_time`` field.
    """
    tags = {"command": command_name}
    fields = {"execution_time": execution_time, "response_time": response_time}
    point = {
        "measurement": "command",
        "tags": tags,
        "fields": fields,
    }
    await client.write(point)
async def report_webhook(time, success):
    """Write one untagged ``webhook`` point through the module-level client.

    time    -- stored as the ``response_time`` field.
    success -- converted with ``int()`` and stored as the ``success`` field.
    """
    fields = {"response_time": time, "success": int(success)}
    point = {
        "measurement": "webhook",
        "fields": fields,
    }
    await client.write(point)

View File

@ -4,6 +4,7 @@ services:
build: bot build: bot
depends_on: depends_on:
- db - db
- influx
environment: environment:
- CLIENT_ID - CLIENT_ID
- TOKEN - TOKEN
@ -12,5 +13,19 @@ services:
volumes: volumes:
- "db_data:/var/lib/postgresql/data" - "db_data:/var/lib/postgresql/data"
restart: always restart: always
influx:
image: influxdb:alpine
volumes:
- "influx_data:/var/lib/influxdb:Z"
restart: always
grafana:
build: grafana
depends_on:
- influx
ports:
- "3000:3000"
restart: always
volumes: volumes:
db_data: db_data:
influx_data:

3
grafana/Dockerfile Normal file
View File

@ -0,0 +1,3 @@
FROM grafana/grafana
COPY . /etc/grafana

View File

@ -0,0 +1,537 @@
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": "-- Grafana --",
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"limit": 100,
"name": "Annotations & Alerts",
"showIn": 0,
"type": "dashboard"
}
]
},
"editable": false,
"gnetId": null,
"graphTooltip": 0,
"id": 1,
"links": [],
"panels": [
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "InfluxDB",
"fill": 1,
"gridPos": {
"h": 9,
"w": 12,
"x": 0,
"y": 0
},
"id": 6,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [
{
"alias": "Webhook success rate",
"yaxis": 2
}
],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"alias": "Webhook response time",
"groupBy": [
{
"params": [
"$__interval"
],
"type": "time"
},
{
"params": [
"linear"
],
"type": "fill"
}
],
"measurement": "webhook",
"orderByTime": "ASC",
"policy": "default",
"refId": "A",
"resultFormat": "time_series",
"select": [
[
{
"params": [
"response_time"
],
"type": "field"
},
{
"params": [],
"type": "mean"
}
]
],
"tags": []
},
{
"alias": "Webhook success rate",
"groupBy": [
{
"params": [
"$__interval"
],
"type": "time"
},
{
"params": [
"linear"
],
"type": "fill"
}
],
"measurement": "webhook",
"orderByTime": "ASC",
"policy": "default",
"refId": "B",
"resultFormat": "time_series",
"select": [
[
{
"params": [
"success"
],
"type": "field"
},
{
"params": [],
"type": "mean"
}
]
],
"tags": []
}
],
"thresholds": [],
"timeFrom": null,
"timeShift": null,
"title": "Webhook executions",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "s",
"label": null,
"logBase": 1,
"max": null,
"min": "0",
"show": true
},
{
"format": "percentunit",
"label": null,
"logBase": 1,
"max": "1",
"min": "0",
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "InfluxDB",
"fill": 1,
"gridPos": {
"h": 9,
"w": 12,
"x": 12,
"y": 0
},
"id": 2,
"legend": {
"alignAsTable": false,
"avg": false,
"current": false,
"hideEmpty": false,
"hideZero": false,
"max": false,
"min": false,
"rightSide": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"repeat": null,
"seriesOverrides": [
{
"alias": "Database success rate (%)",
"yaxis": 2
}
],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"alias": "Database response time",
"groupBy": [
{
"params": [
"$__interval"
],
"type": "time"
},
{
"params": [
"linear"
],
"type": "fill"
}
],
"measurement": "database_query",
"orderByTime": "ASC",
"policy": "default",
"refId": "A",
"resultFormat": "time_series",
"select": [
[
{
"params": [
"response_time"
],
"type": "field"
},
{
"params": [],
"type": "mean"
}
]
],
"tags": []
},
{
"alias": "Database success rate (%)",
"groupBy": [
{
"params": [
"$__interval"
],
"type": "time"
},
{
"params": [
"linear"
],
"type": "fill"
}
],
"measurement": "database_query",
"orderByTime": "ASC",
"policy": "default",
"query": "SELECT mean(\"success\") FROM \"database_query\" WHERE $timeFilter GROUP BY time($__interval) fill(linear)",
"rawQuery": false,
"refId": "B",
"resultFormat": "time_series",
"select": [
[
{
"params": [
"success"
],
"type": "field"
},
{
"params": [],
"type": "mean"
}
]
],
"tags": []
}
],
"thresholds": [],
"timeFrom": null,
"timeShift": null,
"title": "Database Queries",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"transparent": false,
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "s",
"label": "",
"logBase": 1,
"max": null,
"min": "0",
"show": true
},
{
"decimals": null,
"format": "percentunit",
"label": null,
"logBase": 1,
"max": "1",
"min": "0",
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "InfluxDB",
"fill": 1,
"gridPos": {
"h": 9,
"w": 12,
"x": 0,
"y": 9
},
"id": 4,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"repeat": null,
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"alias": "Command execution time",
"groupBy": [
{
"params": [
"$__interval"
],
"type": "time"
},
{
"params": [
"linear"
],
"type": "fill"
}
],
"measurement": "command",
"orderByTime": "ASC",
"policy": "default",
"refId": "A",
"resultFormat": "time_series",
"select": [
[
{
"params": [
"execution_time"
],
"type": "field"
},
{
"params": [],
"type": "mean"
}
]
],
"tags": []
},
{
"alias": "Command response time",
"groupBy": [
{
"params": [
"$__interval"
],
"type": "time"
},
{
"params": [
"linear"
],
"type": "fill"
}
],
"measurement": "command",
"orderByTime": "ASC",
"policy": "default",
"refId": "B",
"resultFormat": "time_series",
"select": [
[
{
"params": [
"response_time"
],
"type": "field"
},
{
"params": [],
"type": "mean"
}
]
],
"tags": []
}
],
"thresholds": [],
"timeFrom": null,
"timeShift": null,
"title": "Commands",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "s",
"label": null,
"logBase": 1,
"max": null,
"min": "0",
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
}
],
"refresh": false,
"schemaVersion": 16,
"style": "dark",
"tags": [],
"templating": {
"list": []
},
"time": {
"from": "now-30m",
"to": "now"
},
"timepicker": {
"refresh_intervals": [
"5s",
"10s",
"30s",
"1m",
"5m",
"15m",
"30m",
"1h",
"2h",
"1d"
],
"time_options": [
"5m",
"15m",
"1h",
"6h",
"12h",
"24h",
"2d",
"7d",
"30d"
]
},
"timezone": "",
"title": "PluralKit Stats",
"uid": "pk",
"version": 1
}

12
grafana/grafana.ini Normal file
View File

@ -0,0 +1,12 @@
instance_name = pluralkit
[security]
allow_sign_up = false
allow_org_create = false
[auth]
disable_login_form = true
[auth.anonymous]
enabled = true
org_role = Viewer

View File

@ -0,0 +1,8 @@
apiVersion: 1
providers:
- name: "pluralkit"
orgId: 1
folder: ''
type: file
options:
path: /etc/grafana/dashboards

View File

@ -0,0 +1,9 @@
apiVersion: 1
datasources:
- name: InfluxDB
type: influxdb
database: pluralkit
access: proxy
url: http://influx:8086
editable: false