前言

探针采用的是 cppla 的 ServerStatus 项目,配置起来还是挺简单的。

安装 ServerStatus 服务端

首先,执行以下命令以下载原始的配置文件:

wget --no-check-certificate -qO ~/serverstatus-config.json https://raw.githubusercontent.com/cppla/ServerStatus/master/server/config.json && mkdir ~/serverstatus-monthtraffic
mkdir server_status && mkdir client_status && cd server_status && touch docker-compose.yml

docker-compose.yml 的内容修改为如下:

version: '3'

services:
  serverstatus:
    image: cppla/serverstatus:latest
    container_name: serverstatus
    restart: always
    volumes:
      - ~/serverstatus-config.json:/ServerStatus/server/config.json
      - ~/serverstatus-monthtraffic:/usr/share/nginx/html/json
    ports:
      - "10081:80"
      - "35601:35601"

并修改 ~/serverstatus-config.json 内容:

{
    "servers": [
        {
            "username": "s01",
            "name": "绿云 日本",
            "type": "GreenCloud",
            "host": "host1",
            "location": "🇯🇵",
            "password": "passwordlin",
            "monthstart": 1
        },
        {
            "username": "s02",
            "name": "BV 新加坡",
            "type": "ByteVirt",
            "host": "host1",
            "location": "🇸🇬",
            "password": "passwordlin",
            "monthstart": 1
        },
        {
            "username": "s03",
            "name": "甲骨文零号 AMD 圣保罗",
            "type": "Oracle",
            "host": "host1",
            "location": "🇧🇷",
            "password": "passwordlin",
            "monthstart": 1
        },
        {
            "username": "s04",
            "name": "甲骨文一号 AMD 圣保罗",
            "type": "Oracle",
            "host": "host1",
            "location": "🇧🇷",
            "password": "passwordlin",
            "monthstart": 1
        },
        {
            "username": "s05",
            "name": "甲骨文 ARM 圣保罗",
            "type": "Oracle",
            "host": "host1",
            "location": "🇧🇷",
            "password": "passwordlin",
            "monthstart": 1
        },
        {
            "username": "s06",
            "name": "阿里轻量云 香港",
            "type": "Alibaba",
            "host": "host1",
            "location": "🇭🇰",
            "password": "passwordlin",
            "monthstart": 1
        },
        {
            "username": "s07",
            "name": "Netcup 德国",
            "type": "Netcup",
            "host": "host1",
            "location": "🇩🇪",
            "password": "passwordlin",
            "monthstart": 1
        },
        {
            "username": "s08",
            "name": "CCS 洛杉矶",
            "type": "ColorCrossing",
            "host": "host1",
            "location": "🇺🇸",
            "password": "passwordlin",
            "monthstart": 1
        }
    ],
    "watchdog": [
        {
            "name": "cpu high warning,exclude username s01",
            "rule": "cpu>90&load_1>5&username!='s01'",
            "interval": 600,
            "callback": "https://yourSMSurl"
        },
        {
            "name": "memory high warning, exclude less than 1GB vps",
            "rule": "(memory_used/memory_total)*100>90&memory_total>1048576",
            "interval": 300,
            "callback": "https://yourSMSurl"
        },
        {
            "name": "offline warning,exclude name node1",
            "rule": "online4=0&online6=0&name!='node1'",
            "interval": 600,
            "callback": "https://yourSMSurl"
        },
        {
            "name": "ddcc attack,limit type Oracle",
            "rule": "tcp_count>600&type='Oracle'",
            "interval": 300,
            "callback": "https://yourSMSurl"
        },
        {
            "name": "month traffic warning",
            "rule": "(network_out-last_network_out)/1024/1024/1024>999",
            "interval": 3600,
            "callback": "https://yourSMSurl"
        },
        {
            "name": "you can parse an expression combining any known field",
            "rule": "load_5>3",
            "interval": 900,
            "callback": "https://yourSMSurl"
        }
    ]
}

然后执行 docker-compose up -d

安装 ServerStatus 客户端

执行如下命令

mkdir ~/client_status && cd ~/client_status && touch client-linux.py

修改 client-linux.py 的内容为如下:

#!/usr/bin/env python3
# coding: utf-8
# Update by : https://github.com/cppla/ServerStatus, Update date: 20220530
# 版本:1.0.3, 支持Python版本:2.7 to 3.10
# 支持操作系统: Linux, OSX, FreeBSD, OpenBSD and NetBSD, both 32-bit and 64-bit architectures
# 说明: 默认情况下修改server和user就可以了。丢包率监测方向可以自定义,例如:CU = "www.facebook.com"。

# Client configuration. SERVER, PORT, USER, PASSWORD and INTERVAL can also be
# overridden from the command line (see the __main__ block).

# Address of the ServerStatus server the client connects to.
SERVER = "193.26.156.130"
# Sent to the server as "USER:PASSWORD" when it asks for authentication.
USER = "s13"


PASSWORD = "server_status_1136lin"
# TCP port used together with SERVER for the reporting connection.
PORT = 35601
# Hosts probed by the three _ping_thread workers; their results are stored
# under the marks "10010" (CU), "189" (CT) and "10086" (CM).
CU = "cu.tz.cloudcpp.com"
CT = "ct.tz.cloudcpp.com"
CM = "cm.tz.cloudcpp.com"
# TCP port the probe connections are opened against.
PROBEPORT = 80
# Address family used when resolving probe hostnames: "ipv4" selects AF_INET,
# any other value selects AF_INET6.
PROBE_PROTOCOL_PREFER = "ipv4"  # ipv4, ipv6
# Maximum number of probe results kept per target for the loss-rate window.
PING_PACKET_HISTORY_LEN = 100
# Sleep, in seconds, between samples in every collector loop.
INTERVAL = 1

import socket
import time
import timeit
import re
import os
import sys
import json
import errno
import subprocess
import threading

try:
    from queue import Queue  # python3
except ImportError:
    from Queue import Queue  # python2


def get_uptime():
    """Return the system uptime in whole seconds.

    Reads the first line of ``/proc/uptime`` and keeps only the text before
    the first "." of that line, converted to an int.
    """
    with open("/proc/uptime", "r") as handle:
        first_line = handle.readline()
    whole_seconds = first_line.partition(".")[0]
    return int(whole_seconds)


def get_memory():
    """Return memory and swap figures parsed from ``/proc/meminfo``.

    Only lines carrying the ``kB`` unit are parsed, so every value is in kB.

    Returns:
        A 4-tuple of ints ``(MemTotal, MemUsed, SwapTotal, SwapFree)`` where
        ``MemUsed`` is MemTotal minus MemFree, Buffers, Cached and
        SReclaimable.

    Raises:
        KeyError: if ``MemTotal`` or ``MemFree`` is missing from the file.
    """
    re_parser = re.compile(r"^(?P<key>\S*):\s*(?P<value>\d*)\s*kB")
    result = {}
    # Use a context manager so the file handle is closed deterministically.
    with open("/proc/meminfo") as f:
        for line in f:
            match = re_parser.match(line)
            if not match:
                continue
            key, value = match.group("key", "value")
            result[key] = int(value)

    mem_total = result["MemTotal"]
    # Buffers/Cached/SReclaimable and the swap fields are not exported by
    # every kernel or container runtime; treat absent ones as zero instead of
    # failing the whole report.
    mem_used = (
        mem_total
        - result["MemFree"]
        - result.get("Buffers", 0)
        - result.get("Cached", 0)
        - result.get("SReclaimable", 0)
    )
    swap_total = result.get("SwapTotal", 0)
    swap_free = result.get("SwapFree", 0)
    return mem_total, mem_used, swap_total, swap_free


def get_hdd():
    """Return ``(size, used)`` from the summary line printed by ``df``.

    ``df -Tlm --total`` is run restricted to a fixed list of filesystem
    types; the last output line is the grand total, whose third and fourth
    columns are returned as ints (size first, then used).

    Raises:
        subprocess.CalledProcessError: if ``df`` exits with a non-zero status.
    """
    fs_types = (
        "ext4",
        "ext3",
        "ext2",
        "reiserfs",
        "jfs",
        "ntfs",
        "fat32",
        "btrfs",
        "fuseblk",
        "zfs",
        "simfs",
        "xfs",
    )
    command = ["df", "-Tlm", "--total"]
    for fs_type in fs_types:
        command.extend(["-t", fs_type])

    report = subprocess.check_output(command).decode("Utf-8")
    summary = report.splitlines()[-1].split()
    return int(summary[2]), int(summary[3])


def get_time():
    """Return four CPU time counters from the first line of ``/proc/stat``.

    The line is split on single spaces and items 2..5 are converted to ints.
    NOTE(review): per proc(5) these should be the user, nice, system and idle
    counters of the aggregate "cpu" line -- confirm on the target kernel.
    """
    with open("/proc/stat", "r") as stat_file:
        fields = stat_file.readline().split(" ")
    return [int(field) for field in fields[2:6]]


def delta_time():
    """Return how much each ``get_time()`` counter grew over ``INTERVAL`` seconds."""
    before = get_time()
    time.sleep(INTERVAL)
    after = get_time()
    return [later - earlier for earlier, later in zip(before, after)]


def get_cpu():
    """Return a CPU usage percentage rounded to one decimal place.

    Usage is 100 minus the share of the last ``delta_time()`` counter in the
    sum of all counters. A zero sum is replaced by 1 to avoid dividing by
    zero.
    """
    deltas = delta_time()
    total = sum(deltas) or 1
    last_share = deltas[-1] * 100.00 / total
    return round(100 - last_share, 1)


def liuliang():
    """Return cumulative traffic totals read from ``/proc/net/dev``.

    For every interface line, the first numeric column is added to the
    inbound total and the ninth to the outbound total. Skipped interfaces:
    ``lo``, any name containing one of the virtual-device markers below, and
    any interface whose inbound or outbound counter is exactly zero.

    Returns:
        A tuple ``(NET_IN, NET_OUT)`` of ints.
    """
    # Raw string: the pattern contains \s and \d, which are invalid escape
    # sequences in a normal literal (SyntaxWarning on Python 3.12+).
    # Compiled once instead of once per line.
    pattern = re.compile(
        r"([^\s]+):[\s]{0,}(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)"
    )
    skipped_markers = ("tun", "docker", "veth", "br-", "vmbr", "vnet", "kube")

    net_in = 0
    net_out = 0
    with open("/proc/net/dev") as f:
        for line in f:
            found = pattern.findall(line)
            if not found:
                # Header lines and anything that is not an interface row.
                continue
            row = found[0]
            name, received, sent = row[0], row[1], row[9]
            if (
                name == "lo"
                or any(marker in name for marker in skipped_markers)
                or received == "0"
                or sent == "0"
            ):
                continue
            net_in += int(received)
            net_out += int(sent)
    return net_in, net_out


def tupd():
    """Count TCP sockets, UDP sockets, processes and threads.

    Each figure is the line count of a shell pipeline minus a fixed offset
    (presumably the header line, and for ``ps`` also the extra process the
    pipeline itself adds -- verify). Used for the attack warning on the
    server side.

    :return: tuple ``(tcp, udp, process, thread)`` of ints
    """
    pipelines = (
        ("ss -t|wc -l", 1),
        ("ss -u|wc -l", 1),
        ("ps -ef|wc -l", 2),
        ("ps -eLf|wc -l", 2),
    )
    counts = []
    for command, offset in pipelines:
        raw = subprocess.check_output(command, shell=True)
        # Drop the trailing newline emitted by wc before converting.
        counts.append(int(raw[:-1]) - offset)
    return tuple(counts)


def get_network(ip_version):
    """Report whether the host can reach the Internet over one IP version.

    Opens (and immediately closes) a TCP connection to port 80 of
    ``ipv4.google.com`` or ``ipv6.google.com`` with a 2 second timeout.

    :param ip_version: 4 or 6
    :return: True if the connection succeeded; False on any failure or when
        ``ip_version`` is neither 4 nor 6
    """
    if ip_version == 4:
        host = "ipv4.google.com"
    elif ip_version == 6:
        host = "ipv6.google.com"
    else:
        # Unsupported versions used to surface as a NameError swallowed by a
        # bare except; answer explicitly instead.
        return False
    try:
        socket.create_connection((host, 80), 2).close()
    except Exception:
        # Best-effort probe: any ordinary failure means "offline", but
        # KeyboardInterrupt/SystemExit must still propagate.
        return False
    return True


# Shared state written by the background threads started in
# get_realtime_data() and read by the reporting loop in __main__.

# Loss ratio per probe target, updated by _ping_thread. Keys are the "mark"
# values: "10010" for CU, "189" for CT and "10086" for CM.
lostRate = {"10010": 0.0, "189": 0.0, "10086": 0.0}
# Last measured connect time per probe target in milliseconds, same keys.
pingTime = {"10010": 0, "189": 0, "10086": 0}
# Maintained by _net_speed: "netrx"/"nettx" are the computed rates, "avgrx"/
# "avgtx" the previous counter totals, "clock" the previous sample time and
# "diff" the seconds elapsed between the last two samples.
netSpeed = {
    "netrx": 0.0,
    "nettx": 0.0,
    "clock": 0.0,
    "diff": 0.0,
    "avgrx": 0,
    "avgtx": 0,
}
# Bytes read/written across processes during the last _disk_io interval.
diskIO = {"read": 0, "write": 0}


def _ping_thread(host, mark, port):
    """Probe ``host:port`` forever and publish latency and loss for ``mark``.

    Each round opens a TCP connection with a 1 second timeout. A successful
    connect, or a refused one, counts as a reply and its duration in
    milliseconds is stored in ``pingTime[mark]``; any other socket error
    counts as a lost packet. Results are kept in a bounded history of
    ``PING_PACKET_HISTORY_LEN`` entries, and ``lostRate[mark]`` is refreshed
    once more than 30 results have been collected.

    :param host: hostname or IP address to probe
    :param mark: key under which results are stored in pingTime/lostRate
    :param port: TCP port to connect to
    :return: never returns
    """
    lost = 0
    history = Queue(maxsize=PING_PACKET_HISTORY_LEN)

    while True:
        # Resolve on every round so DNS changes are picked up.
        target = host
        if ":" not in host:
            # No colon: an IPv4 address or a hostname rather than a literal
            # IPv6 address.
            try:
                if PROBE_PROTOCOL_PREFER == "ipv4":
                    family = socket.AF_INET
                else:
                    family = socket.AF_INET6
                target = socket.getaddrinfo(host, None, family)[0][4][0]
            except Exception:
                # Keep the unresolved host and let the connect attempt decide.
                pass

        # Evict the oldest result once the window is full.
        if history.full() and history.get() == 0:
            lost -= 1

        started = timeit.default_timer()
        try:
            socket.create_connection((target, port), timeout=1).close()
        except socket.error as error:
            # A refused connection still proves the host answered.
            answered = error.errno == errno.ECONNREFUSED
        else:
            answered = True

        if answered:
            pingTime[mark] = int((timeit.default_timer() - started) * 1000)
            history.put(1)
        else:
            lost += 1
            history.put(0)

        if history.qsize() > 30:
            lostRate[mark] = float(lost) / history.qsize()

        time.sleep(INTERVAL)


def _net_speed():
    """Sample ``/proc/net/dev`` forever and publish transfer rates.

    Every ``INTERVAL`` seconds the first (received) and ninth (transmitted)
    counters of all non-excluded interfaces are summed. The difference to the
    previous sample divided by the elapsed time is stored in
    ``netSpeed["netrx"]`` / ``netSpeed["nettx"]``; the raw totals, sample
    time and elapsed time are kept in ``avgrx``/``avgtx``/``clock``/``diff``.

    :return: never returns
    """
    skipped_markers = ("tun", "docker", "veth", "br-", "vmbr", "vnet", "kube")
    while True:
        total_rx = 0
        total_tx = 0
        with open("/proc/net/dev", "r") as f:
            # The first two lines are column headers.
            for line in f.readlines()[2:]:
                name, _, counters = line.partition(":")
                name = name.strip()
                # Loopback is matched exactly (as liuliang() does); a
                # substring test would also drop real interfaces whose name
                # merely contains "lo", e.g. "wlo1".
                if name == "lo" or any(marker in name for marker in skipped_markers):
                    continue
                fields = counters.split()
                total_rx += int(fields[0])
                total_tx += int(fields[8])

        now_clock = time.time()
        elapsed = now_clock - netSpeed["clock"]
        netSpeed["diff"] = elapsed
        netSpeed["clock"] = now_clock
        # Skip the rate update if the clock did not advance, instead of
        # dying on a division by zero and freezing the published values.
        if elapsed > 0:
            netSpeed["netrx"] = int((total_rx - netSpeed["avgrx"]) / elapsed)
            netSpeed["nettx"] = int((total_tx - netSpeed["avgtx"]) / elapsed)
        netSpeed["avgrx"] = total_rx
        netSpeed["avgtx"] = total_tx
        time.sleep(INTERVAL)


def _read_pid_io(pid):
    """Return one process's I/O counters and name, or None if unreadable.

    Reads ``read_bytes`` and ``write_bytes`` from ``/proc/<pid>/io`` and the
    command name from ``/proc/<pid>/comm``.

    :param pid: process id as a string
    :return: dict with keys "read", "write" and "name", or None
    """
    entry = {}
    try:
        with open("/proc/{}/io".format(pid)) as f:
            for line in f:
                if "read_bytes" in line:
                    entry["read"] = int(line.split("read_bytes:")[-1].strip())
                elif "write_bytes" in line and "cancelled_write_bytes" not in line:
                    entry["write"] = int(line.split("write_bytes:")[-1].strip())
        with open("/proc/{}/comm".format(pid), "r") as f:
            entry["name"] = f.read().strip()
    except (IOError, OSError, ValueError):
        # E.g. the process has exited, access was denied, or a value could
        # not be parsed: leave this pid out of the snapshot.
        return None
    if "read" not in entry or "write" not in entry:
        return None
    return entry


def _snapshot_pid_io(pids):
    """Return ``{pid: entry}`` for every pid that ``_read_pid_io`` could read."""
    snapshot = {}
    for pid in pids:
        entry = _read_pid_io(pid)
        if entry is not None:
            snapshot[pid] = entry
    return snapshot


def _disk_io():
    """Measure per-interval disk reads and writes across all processes.

    good luck for opensource! by: cpp.la
    Disk IO: because of IOPS, SSDs, HDDs, RAID cards and array technologies
    such as ZFS behave differently, so the impact of IO on performance has to
    be judged against your own server. On a mechanical disk, for example,
    lots of random small-file reads and writes cause long waits even at very
    low throughput, while sequential IO writing at 100Mb/s can cause long
    waits as well.
    Disk read/write figures are inexact (4k, 8k), see
    https://stackoverflow.com/questions/34413926/psutil-vs-dd-monitoring-disk-i-o

    Two snapshots of every process's ``read_bytes``/``write_bytes`` are taken
    ``INTERVAL`` seconds apart. The differences are summed over processes
    that are readable in both snapshots, kept the same command name and are
    not named "bash" (reason for that exclusion is not visible here), and
    the sums are stored in ``diskIO["read"]`` / ``diskIO["write"]``.

    :return: never returns
    """
    while True:
        # The same pid list is used for both snapshots so only processes that
        # existed at the start of the interval are compared.
        pids = [entry for entry in os.listdir("/proc") if entry.isdigit()]
        before = _snapshot_pid_io(pids)

        time.sleep(INTERVAL)

        after = _snapshot_pid_io(pids)

        read_delta = 0
        write_delta = 0
        for pid, first in before.items():
            second = after.get(pid)
            if second is None:
                # Unreadable in the second pass.
                continue
            if first["name"] != second["name"] or first["name"] == "bash":
                continue
            read_delta += second["read"] - first["read"]
            write_delta += second["write"] - first["write"]
        diskIO["read"] = read_delta
        diskIO["write"] = write_delta


def get_realtime_data():
    """Start the background collectors as daemon threads.

    Three ``_ping_thread`` workers (CU, CT and CM, all on ``PROBEPORT``) are
    started first, followed by ``_net_speed`` and ``_disk_io``.

    :return: None
    """
    probe_targets = ((CU, "10010"), (CT, "189"), (CM, "10086"))
    workers = [
        threading.Thread(
            target=_ping_thread,
            kwargs={"host": host, "mark": mark, "port": PROBEPORT},
        )
        for host, mark in probe_targets
    ]
    workers.append(threading.Thread(target=_net_speed))
    workers.append(threading.Thread(target=_disk_io))

    for worker in workers:
        # Daemon threads do not keep the process alive on exit.
        worker.daemon = True
        worker.start()


def byte_str(object):
    """Convert between text and bytes in whichever direction applies.

    :param object: a ``str`` (encoded to UTF-8 bytes) or ``bytes`` (decoded
        to ``str``)
    :return: the converted value; for any other type the type is printed and
        None is returned
    """
    if isinstance(object, str):
        return object.encode(encoding="utf-8")
    if isinstance(object, bytes):
        return object.decode()
    print(type(object))
    return None


if __name__ == "__main__":
    # Command-line overrides in KEY=VALUE form, e.g. "SERVER=1.2.3.4 USER=s01".
    # The key has to match exactly (leading dashes are ignored) so that a
    # value which merely contains another key's name -- such as a password
    # containing "PORT" -- is not routed to the wrong setting. The script
    # path in argv[0] is not an option and is skipped.
    for argc in sys.argv[1:]:
        key, sep, value = argc.lstrip("-").partition("=")
        if not sep:
            continue
        if key == "SERVER":
            SERVER = value
        elif key == "PORT":
            PORT = int(value)
        elif key == "USER":
            USER = value
        elif key == "PASSWORD":
            PASSWORD = value
        elif key == "INTERVAL":
            INTERVAL = int(value)
    socket.setdefaulttimeout(30)
    get_realtime_data()
    # Outer loop: (re)connect and authenticate. Inner loop: report forever.
    while True:
        s = None
        try:
            print("Connecting...")
            s = socket.create_connection((SERVER, PORT))
            data = byte_str(s.recv(1024))
            if data.find("Authentication required") > -1:
                # sendall: a plain send() may transmit only part of the data.
                s.sendall(byte_str(USER + ":" + PASSWORD + "\n"))
                data = byte_str(s.recv(1024))
                if data.find("Authentication successful") < 0:
                    print(data)
                    raise socket.error
            else:
                print(data)
                raise socket.error

            print(data)
            if data.find("You are connecting via") < 0:
                data = byte_str(s.recv(1024))
                print(data)

            timer = 0
            check_ip = 0
            # The server states which IP version this connection uses; the
            # client then probes reachability over the other one.
            if data.find("IPv4") > -1:
                check_ip = 6
            elif data.find("IPv6") > -1:
                check_ip = 4
            else:
                print(data)
                raise socket.error

            while True:
                # get_cpu() sleeps for INTERVAL seconds, which paces the loop.
                CPU = get_cpu()
                NET_IN, NET_OUT = liuliang()
                Uptime = get_uptime()
                Load_1, Load_5, Load_15 = os.getloadavg()
                MemoryTotal, MemoryUsed, SwapTotal, SwapFree = get_memory()
                HDDTotal, HDDUsed = get_hdd()

                array = {}
                # "<= 0" rather than "== 0": when INTERVAL does not divide
                # 10 the countdown steps over zero and would never fire again.
                if timer <= 0:
                    array["online" + str(check_ip)] = get_network(check_ip)
                    timer = 10
                else:
                    timer -= 1 * INTERVAL

                array["uptime"] = Uptime
                array["load_1"] = Load_1
                array["load_5"] = Load_5
                array["load_15"] = Load_15
                array["memory_total"] = MemoryTotal
                array["memory_used"] = MemoryUsed
                array["swap_total"] = SwapTotal
                array["swap_used"] = SwapTotal - SwapFree
                array["hdd_total"] = HDDTotal
                array["hdd_used"] = HDDUsed
                array["cpu"] = CPU
                array["network_rx"] = netSpeed.get("netrx")
                array["network_tx"] = netSpeed.get("nettx")
                array["network_in"] = NET_IN
                array["network_out"] = NET_OUT
                # TODO: kept for compatibility with older versions; remove
                # ip_status in the next release.
                array["ip_status"] = True
                array["ping_10010"] = lostRate.get("10010") * 100
                array["ping_189"] = lostRate.get("189") * 100
                array["ping_10086"] = lostRate.get("10086") * 100
                array["time_10010"] = pingTime.get("10010")
                array["time_189"] = pingTime.get("189")
                array["time_10086"] = pingTime.get("10086")
                array["tcp"], array["udp"], array["process"], array["thread"] = tupd()
                array["io_read"] = diskIO.get("read")
                array["io_write"] = diskIO.get("write")

                s.sendall(byte_str("update " + json.dumps(array) + "\n"))
        except KeyboardInterrupt:
            raise
        except socket.error:
            print("Disconnected...")
        except Exception as e:
            print("Caught Exception:", e)
        finally:
            # Close the socket explicitly instead of relying on garbage
            # collection to release the descriptor before reconnecting.
            if s is not None:
                try:
                    s.close()
                except socket.error:
                    pass
        # Only reached after a failure: back off before reconnecting.
        time.sleep(3)

其中,对于在不同的服务器上安装 client,我们只需要修改开头的 SERVER、USER、PASSWORD 即可:将 SERVER 修改为安装了服务端的服务器的 IP,将 USER 修改为 serverstatus-config.json 中对应服务器的 username,将 PASSWORD 修改为 serverstatus-config.json 中对应服务器的 password。

然后执行 sudo nvim /etc/systemd/system/client_status.service,将内容修改为以下:

[Unit]
Description=Client Status

[Service]
ExecStart=/usr/bin/python3 /home/myuser/client_status/client-linux.py
StandardOutput=null
StandardError=null
Restart=always

[Install]
WantedBy=multi-user.target

注意 ExecStart 字段中的路径要与 client-linux.py 的实际路径相匹配,必须使用绝对路径,而不能使用 ~

修改 Caddyfile

添加以下内容:

status.example.eu.org {
    reverse_proxy example.eu.org:10081
}

这样就能通过 status.example.eu.org 来直接访问网页而不需要通过网址 + 端口来访问了。