
Je suppose que jabber n'était déjà plus utilisé sur ovh depuis un moment,
car il avait planté début mars suite au commit
8b08a06192
sans que personne ne s'en rende compte.
On rajoute ejabberd dans les services à monitorer.
358 lines
10 KiB
Python
358 lines
10 KiB
Python
# -*- coding: utf-8; mode: python -*-
|
|
|
|
import re
|
|
|
|
# NOTE(review): presumably brings in the admip()/pubip() helpers used further
# down in this template -- confirm against the "ip" include.
include("ip")

# Owner, group and mode recorded in `info`.
# NOTE(review): presumably applied to the generated file -- confirm.
info["owner"] = "root"
info["group"] = "root"
info["perms"] = 0644

header()
|
|
import sys
|
|
def service(group=None, **kw):
    """Print the monit ``check process`` stanza for one service.

    The stanza is printed only when ``group`` is None or ``has(group)`` is
    true; otherwise nothing is printed.

    Optional keyword arguments:
        name: monit service name (defaults to ``group``).
        init: init script name under /etc/init.d (defaults to ``name``).
        pidf: pid file base name, used to build ``/var/run/<pidf>.pid``
            (defaults to ``name``).
        pidp: full path of the pid file; takes precedence over ``pidf``.
        start_cmd: start command (defaults to ``/etc/init.d/<init> start``).
        stop_cmd: stop command (defaults to ``/etc/init.d/<init> stop``).
        extra: iterable of additional stanza lines; falsy entries are
            skipped, so callers may pass ``condition and "line"``.
        max_restart: restart count used in the timeout rule (defaults to 5).
    """
    # Guard clause: a named group that this host does not have prints nothing.
    if group is not None and not has(group):
        return

    name = kw.get('name', group)
    init = kw.get('init', name)
    pidp = kw.get('pidp', "/var/run/%s.pid" % kw.get('pidf', name))
    start_cmd = kw.get('start_cmd', '/etc/init.d/%s start' % init)
    stop_cmd = kw.get('stop_cmd', '/etc/init.d/%s stop' % init)
    max_restart = kw.get('max_restart', 5)

    # Explicit mapping instead of locals(): only these names are formatted.
    fields = {
        'name': name,
        'pidp': pidp,
        'start_cmd': start_cmd,
        'stop_cmd': stop_cmd,
    }
    # Single-argument parenthesised print behaves the same as a statement
    # (Python 2) and as a function call (Python 3).
    print ('''# %(name)s
check process %(name)s with pidfile %(pidp)s
start program = "%(start_cmd)s"
stop program = "%(stop_cmd)s"''' % fields)

    for line in kw.get('extra', []):
        if line:
            print ("  %s" % line)
    print (" if %d restarts within 5 cycles then timeout" % max_restart)
    # Blank line separating this stanza from the next one.
    print ("")
|
|
|
|
# Apache checks. Entries written as `has(...) and "..."` evaluate to a falsy
# value on hosts without that group.
apache_checks = [
    has("http-server") and "if failed host localhost port 80 protocol http timeout 30 seconds then restart",
    has("https-server") and "if failed host localhost port 443 type tcpssl protocol http timeout 30 seconds then restart",
    has("intranet-server") and "if failed host intranet.crans.org port 443 type tcpssl protocol http timeout 30 seconds then restart",
    "if cpu is greater than 60% for 2 cycles then alert",
    "if cpu > 80% for 5 cycles then restart",
    "if totalmem > 500.0 MB for 5 cycles then restart",
    "if children > 250 then restart",
    "if loadavg(5min) greater than 10 for 8 cycles then restart",
]
service("apache", name="apache2", extra=apache_checks, max_restart=3)

service('nginx')

# ejabberd keeps its pid file in its own sub-directory of /var/run.
service('ejabberd', pidf='ejabberd/ejabberd')
|
|
|
|
# Helper daemons declared for nginx hosts.
# NOTE(review): the original indentation was lost; both inner tests are
# assumed to be nested under has('nginx') -- confirm.
if has('nginx'):
    if has('php'):
        service(name='php5-fpm')
    if has('cgi'):
        service(name='fcgiwrap', pidp='/var/run/fcgiwrap.pids')
|
|
|
|
if has('gunicorn'):
    # The gunicorn-debian probe lists the active sites found in
    # /etc/gunicorn.d/; entries matching re_ignore are not monitored.
    re_ignore = re.compile(r'(^_|\.(dpkg-(old|dist|new|tmp)|example)$|\.pyc|\.comc$)')
    for site in metadata.Probes["gunicorn-debian"].splitlines():
        site = site.strip()
        if not re_ignore.search(site):
            # One monit service per site, driven through the shared init script.
            service(
                None,
                name="gunicorn_%s" % site,
                start_cmd="/etc/init.d/gunicorn start %s" % site,
                stop_cmd="/etc/init.d/gunicorn stop %s" % site,
                pidf="gunicorn/%s" % site,
            )
|
|
|
|
# sms_queuing has no group of its own: it is declared on asterisk hosts only.
if has('asterisk-server'):
    service(name="sms_queuing")

# The group is 'asterisk-server' but the init script and pid file use 'asterisk'.
service(
    'asterisk-server',
    pidp="/var/run/asterisk/asterisk.pid",
    start_cmd="/etc/init.d/asterisk start",
    stop_cmd="/etc/init.d/asterisk stop",
)

service("at", init="atd", pidf="atd")

service("cherrypy", name="intranet")

service("backuppc-server", name="backuppc", pidf="backuppc/BackupPC")

service("isc-dhcp-server", pidf="dhcpd")
|
|
|
|
if has('arpwatch'):
    # NOTE(review): watched_vlans is presumably defined by this include -- confirm.
    include("arpwatch")
    # TODO: model this as a single service to avoid silly races at restart
    # time ...
    for vlan in watched_vlans:
        # VLAN 1 is watched on the bare interface, the others on eth0.<vlan>.
        iface = 'eth0'
        if vlan != 1:
            iface += '.%d' % vlan
        service(None, name='arpwatch-%s' % iface, init='arpwatch')
|
|
|
|
# Services whose pid file lives in a sub-directory of /var/run or is not
# named after the service.
service("bind", init="bind9", pidf="named/named")

service("cups", pidf="cups/cupsd")

service("openntpd", name="openntpd", pidf="openntpd/ntpd")

service("cron", pidf="crond")

service("dhcp-detect")
|
|
|
|
# Services declared for hosts in the "komaz" group.
# NOTE(review): the original indentation was lost. The nesting below (the
# "comptage-upload" test inside the "komaz" test, the two netacct services
# inside "comptage-upload", mac_ip directly under "komaz") is a
# reconstruction -- confirm against the repository.
if has("komaz"):
    service(None,
            name="filtrage_firewall")
    if has("comptage-upload"):
        service(None,
                name="netacct-crans-ens",
                pidf="netacct-crans-ens",
                init="netacct-crans-ens")
        service(None,
                name="netacct-crans-sixxs2",
                pidf="netacct-crans-sixxs2",
                init="netacct-crans-sixxs2")
    service(None,
            name="mac_ip",
            pidf="mac_ip",
            init="mac_ip")
|
|
|
|
service("aiccu")

service("freeradius", pidf="freeradius/freeradius")

service("digicode", name="digicode_server", pidf="digicode")

service("inn", pidf="news/innd", init="inn2")

service("mailman", pidf="mailman/mailman")

service("monit-ovh")

service("mysql", pidf="mysqld/mysqld")

service("munin-node", pidf="munin/munin-node")

# Name-service daemons: also restarted when their unix socket stops answering.
service(
    "nslcd",
    pidf="nslcd/nslcd",
    extra=["if failed unixsocket /var/run/nslcd/socket then restart"],
)

service(
    "nscd",
    pidf="nscd/nscd",
    extra=["if failed unixsocket /var/run/nscd/socket then restart"],
)

service("ntp", pidf="ntpd")

# The OpenVPN tunnels share the "openvpn" init script and differ by pid file.
service("openvpn-ovh", pidf="openvpn.ovh", init="openvpn")

service("openvpn-komaz", pidf="openvpn.komaz", init="openvpn")

service(
    "openvpn-freebox",
    pidf="openvpn.freebox",
    init="openvpn",
    extra=["depends on openvpn-komaz"],
)

# The postmaster pid file path contains the PostgreSQL version, which is
# selected from the 'squeeze' group.
pg_version = '8.4' if has('squeeze') else '9.1'
service(
    "pgsql-server",
    name="postgresql",
    init="postgresql",
    pidp="/var/lib/postgresql/%s/main/postmaster.pid" % pg_version,
    extra=["if failed port 5432 timeout 30 seconds then restart"],
)

service(
    "postfix",
    pidp="/var/spool/postfix/pid/master.pid",
    extra=["if failed port 25 protocol smtp timeout 30 seconds then restart"],
)

service(
    "privoxy",
    extra=["if failed host localhost port 8118 timeout 30 seconds then restart"],
)

service(
    "proftpd",
    extra=["if failed port 21 protocol ftp timeout 30 seconds then restart"],
)

service("rsync")

service(
    "slapd",
    pidp="/var/run/slapd/slapd.pid",
    extra=["if failed host localhost port 389 protocol ldap3 timeout 30 seconds then restart"],
)

service("spamassassin", name="spamd", init="spamassassin")

service("sqlgrey")

service(
    "ssh",
    pidf="sshd",
    extra=[
        "if failed port 22 protocol ssh timeout 30 seconds then restart",
        "if children > 200 then restart",
    ],
)

# rsyslog is declared for both log clients and log servers; the dependency
# names a "file/var/log/syslog" check.
if has('rsyslog-client') or has('rsyslog-server'):
    service(
        None,
        name="rsyslog",
        pidf='rsyslogd',
        extra=["depend on file/var/log/syslog"],
    )

service("ups-monitor", name="upsmon", pidf="nut/upsmon")

service("ups-server", name="upsd", pidf="nut/upsd")

# Address probed for vsftpd: admip() on 'non-vlan-adherent' hosts, pubip()
# everywhere else.
vsftpd_ip = admip() if has('non-vlan-adherent') else pubip()
service(
    "vsftpd",
    pidf="vsftpd/vsftpd",
    extra=["if failed host %s port 21 protocol ftp timeout 30 seconds then restart" % vsftpd_ip],
)

service(
    "vsftpd-federez",
    extra=["if failed host 138.231.136.129 port 21 protocol ftp timeout 30 seconds then restart"],
)
|
|
|
|
# Number of DVB cards on this host, as reported by the "cartesdvb" probe.
dernierecarte = int(metadata.Probes["cartesdvb"])

# Indices of the cards that must not be monitored. A missing or unparsable
# probe means "no card disabled"; only those two failures are swallowed.
# split() without argument tolerates repeated or trailing whitespace.
try:
    cartesdesactivees = [
        int(numero)
        for numero in metadata.Probes["cartesdvbdesactivees_local"].split()
    ]
except (KeyError, ValueError):
    cartesdesactivees = []

if dernierecarte:
    print ("# Il y a %d carte(s) DVB sur ce serveur dont %d cartes desactivee(s)\n" % (dernierecarte, len(cartesdesactivees)))
    # One mumudvb instance per enabled adapter, started directly through
    # start-stop-daemon with the per-card configuration file.
    for i in range(dernierecarte):
        if i not in cartesdesactivees:
            service("mumudvb",
                    name="mumudvb%d" % i,
                    init="mumudvb",
                    pidf="mumudvb/mumudvb_adapter%d_tuner0" % i,
                    start_cmd="""/sbin/start-stop-daemon --start --oknodo --pidfile /var/run/mumudvb/mumudvb_adapter%d_tuner0.pid --chuid _mumudvb --exec /usr/bin/mumudvb -- -c /etc/sat/carte%d.conf""" % (i, i),
                    stop_cmd="""/sbin/start-stop-daemon --stop --pidfile /var/run/mumudvb/mumudvb_adapter%d_tuner0.pid""" % i,
                    )
|
|
|
|
# NOTE(review): lines starting with '@' look like raw output lines of the
# template engine rather than Python statements -- confirm.
# File check on /var/log/syslog: alert once its timestamp is older than
# 15 minutes.
@check file file/var/log/syslog with path /var/log/syslog
@ if timestamp > 15 minutes then alert
@

service('igmpproxy')

# Only on host 'zamok': alert when the print_status error file stays
# non-empty for 3 cycles.
# NOTE(review): the original indentation was lost; the '@' lines are assumed
# to belong to the body of this `if` -- confirm.
if hostname == 'zamok':
    @# print_status
    @check file file/usr/scripts/var/print_status/error.txt with path /usr/scripts/var/print_status/error.txt
    @ if size > 0 for 3 cycles then alert
    @
|
|
|
|
# The disks of canard are not monitored: stop here on that host.
if hostname in ('canard',):
    done()

# Map the first column of each blkid probe line to its second column; the
# fstab loop uses this map to translate the filesystem field.
disques = {}
if has('blkid'):
    for entry in metadata.Probes["blkid"].splitlines():
        cle, peripherique = entry.strip().split()
        disques[cle] = peripherique
|
|
|
|
# Disk-usage alert thresholds (percent) for specific (host, mount point)
# pairs; None disables the space check for that partition.
alert_levels = {
    ('babar', '/backup'): 90,
    ('sable', '/var/spool/squid1'): None,
    ('sable', '/var/spool/squid2'): None,
    ('sable', '/var/log/squid'): 92,
    ('charybde', '/pubftp'): 80,
    ('news', '/var'): 90,
}

# Expected permission of the device node: '660', prefixed with '1' (giving
# '1660') on hosts that are not in the 'squeeze' group.
perm = '660'
if not has('squeeze'):
    perm = '1' + perm

for line in metadata.Probes["fstab_local"].splitlines():
    # Strip surrounding whitespace.
    line = line.strip()

    # Skip blank lines and comments.
    if not line or line[0] == "#":
        continue

    # Split the line into fields. '+' (not '*') so the pattern can never match
    # the empty string. Only the first four fields are used; the dump and pass
    # fields are optional in fstab, so their absence must not abort generation.
    fields = re.split(r'[ \t]+', line)
    fs, mntpoint, fstype, options = fields[:4]
    fs = disques.get(fs, fs)
    options = options.split(",")

    # Skip partitions that are not mounted at boot.
    if "noauto" in options:
        continue

    # Skip bind mounts.
    if "bind" in options:
        continue

    # Skip filesystem types that are not worth monitoring.
    if fstype in ['swap', 'sw', 'proc', 'tmpfs', 'sysfs', 'nfs', 'devpts']:
        continue

    # General configuration lines.
    comment("partition %s" % mntpoint)

    # LVM volumes are checked through their mount point, everything else
    # through the device itself.
    # NOTE(review): the original indentation was lost; the permission and uid
    # tests are assumed to belong to the device branch only (a mode of 660
    # matches a device node, not a mount point directory) -- confirm.
    if fs.startswith('/dev/mapper'):
        print ('check filesystem fs%s with path %s' % (mntpoint, mntpoint))
    else:
        print ('check device fs%s with path %s' % (mntpoint, fs))
        print (' if failed permission %s then alert' % perm)
        print (' if failed uid root then alert')

    # Disk space: explicit per-host threshold first, then 90% for a few
    # mount points and for hosts in the 'domu' group, 80% otherwise.
    key = (hostname, mntpoint)
    if key in alert_levels:
        alert_level = alert_levels[key]
    elif mntpoint in ('/usr', '/var/lib/mailman', '/localhome', '/home') or has('domu'):
        alert_level = 90
    else:
        alert_level = 80

    if alert_level:
        print (' if space usage > %d%% for 3 cycles then alert' % alert_level)

    # Inode usage, for every filesystem type except reiserfs.
    if fstype != 'reiserfs':
        print (' if inode usage > 80% then alert')

    print (' mode passive')
    print ("")
|
|
|