crans_bcfg2/Python/etc/monit/services
Daniel STAN 172c13c13c [ejabberd] retrait ovh et monitoring
Je suppose que jabber n'était déjà plus utilisé sur ovh depuis un moment,
car il avait planté début mars suite au commit
8b08a06192
sans que personne ne s'en rende compte.
On rajoute ejabberd dans les services à monitorer.
2013-06-26 14:33:05 +02:00

358 lines
10 KiB
Python

# -*- coding: utf-8; mode: python -*-
import re
# Load the shared "ip" template fragment.
# NOTE(review): presumably this is what provides admip()/pubip(), which are
# called further down but not defined in this file -- confirm.
include("ip")
# Metadata entries for the generated file.
# NOTE(review): presumably owner, group and mode of the file written on the
# client -- confirm against the template engine.  0644 is a Python 2 octal
# literal.
info["owner"] = "root"
info["group"] = "root"
info["perms"] = 0644
# NOTE(review): header() is defined outside this file; presumably it emits the
# standard "generated file" banner -- confirm.
header()
import sys
def service(group=None, **kw):
    """Print a monit ``check process`` stanza for one service.

    Nothing is printed when ``group`` is given and ``has(group)`` is false.
    With ``group=None`` the stanza is always printed, so the caller is
    responsible for any gating.

    Recognised keyword arguments:
        name: monit service name (defaults to ``group``).
        init: init script name under /etc/init.d (defaults to ``name``).
        pidf: pid file base name, expanded to /var/run/<pidf>.pid
            (defaults to ``name``).
        pidp: full pid file path; takes precedence over ``pidf``.
        start_cmd: start command (defaults to "/etc/init.d/<init> start").
        stop_cmd: stop command (defaults to "/etc/init.d/<init> stop").
        extra: iterable of additional rule lines; falsy entries are
            skipped, which lets callers write ``has(x) and "rule"``.
        max_restart: restart count used in the closing timeout rule
            (defaults to 5).
    """
    # Guard clause: a named group that this host does not belong to.
    if group is not None and not has(group):
        return

    name = kw.get('name', group)
    init = kw.get('init', name)
    pidp = kw.get('pidp', "/var/run/%s.pid" % kw.get('pidf', name))
    start_cmd = kw.get('start_cmd', '/etc/init.d/%s start' % init)
    stop_cmd = kw.get('stop_cmd', '/etc/init.d/%s stop' % init)
    max_restart = kw.get('max_restart', 5)

    # Explicit mapping instead of locals(): only these four values are used.
    fields = {
        'name': name,
        'pidp': pidp,
        'start_cmd': start_cmd,
        'stop_cmd': stop_cmd,
    }
    # Single-argument print(...) produces the same output whether print is
    # the Python 2 statement or a function.
    print('\n'.join((
        '# %(name)s',
        'check process %(name)s with pidfile %(pidp)s',
        'start program = "%(start_cmd)s"',
        'stop program = "%(stop_cmd)s"',
    )) % fields)

    for line in kw.get('extra', []):
        # Falsy entries come from ``has(x) and "..."`` on hosts without x.
        if line:
            print("  %s" % (line,))
    print(" if %d restarts within 5 cycles then timeout" % max_restart)
    # Blank line separating this stanza from the next one.
    print("")
# Apache: listener probes depend on which web groups the host belongs to.
# A ``has(...) and "rule"`` entry is falsy when the group is absent, and
# service() skips falsy entries of ``extra``.
_apache_probes = [
    has("http-server") and "if failed host localhost port 80 protocol http timeout 30 seconds then restart",
    has("https-server") and "if failed host localhost port 443 type tcpssl protocol http timeout 30 seconds then restart",
    has("intranet-server") and "if failed host intranet.crans.org port 443 type tcpssl protocol http timeout 30 seconds then restart",
]
# Resource limits applied to every apache instance.
_apache_limits = [
    "if cpu is greater than 60% for 2 cycles then alert",
    "if cpu > 80% for 5 cycles then restart",
    "if totalmem > 500.0 MB for 5 cycles then restart",
    "if children > 250 then restart",
    "if loadavg(5min) greater than 10 for 8 cycles then restart",
]
service("apache",
        name="apache2",
        extra=_apache_probes + _apache_limits,
        max_restart=3)

service('nginx')

# ejabberd keeps its pid file in a sub-directory of /var/run.
service('ejabberd', pidf='ejabberd/ejabberd')
# Helper daemons used together with nginx (PHP FastCGI manager, CGI wrapper).
# These calls pass no group, so only the surrounding `if has(...)` tests gate
# them.
# NOTE(review): indentation was lost in this copy of the file, so whether the
# 'cgi' test (and the gunicorn section below) is nested under has('nginx')
# cannot be told from here -- confirm against the repository version.
if has('nginx'):
if has('php'):
service(name='php5-fpm')
if has('cgi'):
service(name='fcgiwrap', pidp='/var/run/fcgiwrap.pids')
if has('gunicorn'):
# One monit service per line of the "gunicorn-debian" probe.
# The probe is used to list the sites that are active in /etc/gunicorn.d/
# (translated from the original comment).
# Names are skipped when they start with "_", end in .dpkg-old/.dpkg-dist/
# .dpkg-new/.dpkg-tmp/.example, contain ".pyc", or end in ".comc".
re_ignore = re.compile(r'(^_|\.(dpkg-(old|dist|new|tmp)|example)$|\.pyc|\.comc$)')
for site in metadata.Probes["gunicorn-debian"].splitlines():
site = site.strip()
if re_ignore.search(site):
continue
# The shared gunicorn init script takes the site name as an extra argument.
service(None,
name="gunicorn_%s" % site,
start_cmd = "/etc/init.d/gunicorn start %s" % site,
stop_cmd = "/etc/init.d/gunicorn stop %s" % site,
pidf = "gunicorn/%s" % site,
)
# sms_queuing is declared without a group, so it is gated explicitly here.
if has('asterisk-server'):
    service(name="sms_queuing")

# (group, service() keyword overrides), emitted in this order.
for _group, _opts in (
    # The group is 'asterisk-server' but the init script and pid file are
    # named after the daemon itself.
    ('asterisk-server', dict(pidp="/var/run/asterisk/asterisk.pid",
                             start_cmd="/etc/init.d/asterisk start",
                             stop_cmd="/etc/init.d/asterisk stop")),
    ("at", dict(init="atd", pidf="atd")),
    ("cherrypy", dict(name="intranet")),
    ("backuppc-server", dict(name="backuppc", pidf="backuppc/BackupPC")),
    ("isc-dhcp-server", dict(pidf="dhcpd")),
):
    service(_group, **_opts)
if has('arpwatch'):
    # NOTE(review): watched_vlans is not defined in this file; presumably the
    # "arpwatch" include provides it -- confirm.
    include("arpwatch")
    # TODO: model this as a single service to avoid silly races when the
    # instances are restarted ...
    for vlan in watched_vlans:
        # VLAN 1 is watched on the bare interface, every other VLAN on the
        # tagged sub-interface eth0.<vlan>.
        iface = 'eth0'
        if vlan != 1:  # was the obsolete `<>` spelling
            iface += '.%d' % vlan
        # All instances share the single "arpwatch" init script.
        service(None, name='arpwatch-%s' % iface,
                init='arpwatch',
                )
# (group, service() keyword overrides), emitted in this order.  An empty
# mapping means every default of service() applies.
for _group, _opts in (
    ("bind", dict(init="bind9", pidf="named/named")),
    ("cups", dict(pidf="cups/cupsd")),
    ("openntpd", dict(name="openntpd", pidf="openntpd/ntpd")),
    ("cron", dict(pidf="crond")),
    ("dhcp-detect", {}),
):
    service(_group, **_opts)
# Services declared with group=None: service() prints them unconditionally, so
# the `if has(...)` tests below are the only thing gating them.
# NOTE(review): indentation was lost in this copy of the file, so which calls
# belong to the "komaz" test and which to the "comptage-upload" test (in
# particular the final "mac_ip" one) cannot be told from here -- confirm
# against the repository version.
if has("komaz"):
service(None,
name="filtrage_firewall")
if has("comptage-upload"):
service(None,
name="netacct-crans-ens",
pidf="netacct-crans-ens",
init="netacct-crans-ens")
service(None,
name="netacct-crans-sixxs2",
pidf="netacct-crans-sixxs2",
init="netacct-crans-sixxs2")
service(None,
name="mac_ip",
pidf="mac_ip",
init="mac_ip")
# (group, service() keyword overrides), emitted in this order.  Most entries
# only need the location of a pid file that is not /var/run/<name>.pid.
for _group, _opts in (
    ("aiccu", {}),
    ("freeradius", dict(pidf="freeradius/freeradius")),
    ("digicode", dict(name="digicode_server", pidf="digicode")),
    ("inn", dict(pidf="news/innd", init="inn2")),
    ("mailman", dict(pidf="mailman/mailman")),
    ("monit-ovh", {}),
    ("mysql", dict(pidf="mysqld/mysqld")),
    ("munin-node", dict(pidf="munin/munin-node")),
    # The two name-service daemons are also probed through their sockets.
    ("nslcd", dict(pidf="nslcd/nslcd",
                   extra=["if failed unixsocket /var/run/nslcd/socket then restart"])),
    ("nscd", dict(pidf="nscd/nscd",
                  extra=["if failed unixsocket /var/run/nscd/socket then restart"])),
    ("ntp", dict(pidf="ntpd")),
):
    service(_group, **_opts)
# One monit service per OpenVPN tunnel.  The group is "openvpn-<tunnel>", the
# pid file /var/run/openvpn.<tunnel>.pid, and all tunnels share the "openvpn"
# init script.  The freebox tunnel additionally declares a dependency on the
# komaz one.
for _tunnel, _more in (
    ("ovh", {}),
    ("komaz", {}),
    ("freebox", dict(extra=["depends on openvpn-komaz"])),
):
    service("openvpn-%s" % _tunnel,
            pidf="openvpn.%s" % _tunnel,
            init="openvpn",
            **_more)
# PostgreSQL writes its pid file inside the versioned cluster directory:
# 8.4 on hosts in the 'squeeze' group, 9.1 everywhere else.
if has('squeeze'):
    pg_version = '8.4'
else:
    pg_version = '9.1'
_pg_pidfile = "/var/lib/postgresql/%s/main/postmaster.pid" % pg_version
service("pgsql-server",
        name="postgresql",
        init="postgresql",
        pidp=_pg_pidfile,
        extra=["if failed port 5432 timeout 30 seconds then restart"])
# (group, service() keyword overrides), emitted in this order.  The ``extra``
# rules make monit restart a daemon whose port stops answering.
for _group, _opts in (
    ("postfix", dict(pidp="/var/spool/postfix/pid/master.pid",
                     extra=["if failed port 25 protocol smtp timeout 30 seconds then restart"])),
    ("privoxy", dict(extra=["if failed host localhost port 8118 timeout 30 seconds then restart"])),
    ("proftpd", dict(extra=["if failed port 21 protocol ftp timeout 30 seconds then restart"])),
    ("rsync", {}),
    ("slapd", dict(pidp="/var/run/slapd/slapd.pid",
                   extra=["if failed host localhost port 389 protocol ldap3 timeout 30 seconds then restart"])),
    ("spamassassin", dict(name="spamd", init="spamassassin")),
    ("sqlgrey", {}),
    ("ssh", dict(pidf="sshd",
                 extra=["if failed port 22 protocol ssh timeout 30 seconds then restart",
                        "if children > 200 then restart"])),
):
    service(_group, **_opts)
# rsyslog is monitored on both syslog clients and servers.  The stanza
# references the "file/var/log/syslog" check, which this template declares
# in a separate literal stanza.
if any(has(_role) for _role in ('rsyslog-client', 'rsyslog-server')):
    service(None,
            name="rsyslog",
            pidf='rsyslogd',
            extra=["depend on file/var/log/syslog"])

# NUT daemons: group "ups-<role>", with service name and pid file named after
# the daemon.
for _role, _daemon in (("monitor", "upsmon"), ("server", "upsd")):
    service("ups-%s" % _role,
            name=_daemon,
            pidf="nut/%s" % _daemon)
# Address probed for the local FTP server: admip() for hosts in the
# 'non-vlan-adherent' group, pubip() otherwise.
vsftpd_ip = admip() if has('non-vlan-adherent') else pubip()

# Both FTP instances use the same probe rule; only the address differs.
_ftp_probe = "if failed host %s port 21 protocol ftp timeout 30 seconds then restart"
service("vsftpd",
        pidf="vsftpd/vsftpd",
        extra=[_ftp_probe % vsftpd_ip])
service("vsftpd-federez",
        extra=[_ftp_probe % "138.231.136.129"])
# Number of DVB adapters, as reported by the "cartesdvb" probe.
dernierecarte = int(metadata.Probes["cartesdvb"])

# Adapter indexes that must not be monitored.  Reading this probe is
# best-effort: any failure means "no adapter is disabled".  The handler was a
# bare ``except:``; ``Exception`` keeps the best-effort behaviour while letting
# KeyboardInterrupt/SystemExit propagate.
try:
    cartesdesactivees = [
        int(_card)
        for _card in metadata.Probes["cartesdvbdesactivees_local"].split(' ')
    ]
except Exception:
    cartesdesactivees = []

if dernierecarte:
    print("# Il y a %d carte(s) DVB sur ce serveur dont %d cartes desactivee(s)\n" % (dernierecarte, len(cartesdesactivees)))
    # One mumudvb instance per enabled adapter.  Each is started directly
    # through start-stop-daemon with its own configuration and pid file.
    for i in range(dernierecarte):
        if i not in cartesdesactivees:
            service("mumudvb",
                    name="mumudvb%d" % i,
                    init="mumudvb",
                    pidf="mumudvb/mumudvb_adapter%d_tuner0" % i,
                    start_cmd="""/sbin/start-stop-daemon --start --oknodo --pidfile /var/run/mumudvb/mumudvb_adapter%d_tuner0.pid --chuid _mumudvb --exec /usr/bin/mumudvb -- -c /etc/sat/carte%d.conf""" % (i, i),
                    stop_cmd="""/sbin/start-stop-daemon --stop --pidfile /var/run/mumudvb/mumudvb_adapter%d_tuner0.pid""" % i
                    )
# Literal monit stanza: alert when /var/log/syslog has not been modified for
# more than 15 minutes.  The check is named "file/var/log/syslog".
# NOTE(review): lines starting with "@" are not Python; they appear to be
# emitted verbatim by the template engine -- confirm.
@check file file/var/log/syslog with path /var/log/syslog
@ if timestamp > 15 minutes then alert
@
service('igmpproxy')
# On the host named 'zamok' only: alert when the print_status error file stays
# non-empty for 3 cycles.
# NOTE(review): indentation was lost in this copy of the file; the "@" lines
# below are presumably the body of this `if` -- confirm against the
# repository version.
if hostname == 'zamok':
@# print_status
@check file file/usr/scripts/var/print_status/error.txt with path /usr/scripts/var/print_status/error.txt
@ if size > 0 for 3 cycles then alert
@
# The disks of 'canard' are not monitored.
# NOTE(review): done() is defined outside this file; presumably it ends
# template processing at this point -- confirm.
if hostname in ('canard',):
    done()

# Map the first column of each "blkid" probe line to its second column.
# The map is consulted when fstab entries are resolved; it stays empty on
# hosts without the 'blkid' group.
disques = {}
if has('blkid'):
    disques.update(
        _entry.split()
        for _entry in metadata.Probes["blkid"].splitlines()
    )
# Per-(host, mount point) overrides of the space-usage alert threshold, in
# percent.  None disables the space check for that mount point.  Built once
# instead of on every loop iteration.
_space_alert_overrides = {
    ('babar', '/backup'): 90,
    ('sable', '/var/spool/squid1'): None,
    ('sable', '/var/spool/squid2'): None,
    ('sable', '/var/log/squid'): 92,
    ('charybde', '/pubftp'): 80,
    ('news', '/var'): 90,
}
# Mount points that default to the 90% threshold instead of 80%.
_relaxed_mntpoints = ('/usr', '/var/lib/mailman', '/localhome', '/home')
# Filesystem types that are never monitored.
_ignored_fstypes = ('swap', 'sw', 'proc', 'tmpfs', 'sysfs', 'nfs', 'devpts')

# One monit check per relevant line of the "fstab_local" probe.
for line in metadata.Probes["fstab_local"].splitlines():
    line = line.strip()
    # Skip blank lines and comments.
    if not line or line[0] == "#":
        continue

    # Split the six fstab fields on runs of spaces/tabs.  The pattern uses
    # '+' rather than '*': a pattern that can match the empty string splits
    # between every character on Python >= 3.7.  The result on Python 2 is
    # unchanged.  dump and pass_ are unused but keep the six-field check.
    fs, mntpoint, fstype, options, dump, pass_ = re.split(r'[ \t]+', line)
    # Replace the fstab source by the device recorded for it, if any.
    fs = disques.get(fs, fs)
    options = options.split(",")

    # Skip partitions not mounted at boot, and bind mounts.
    if "noauto" in options or "bind" in options:
        continue
    # Skip uninteresting filesystem types.
    if fstype in _ignored_fstypes:
        continue

    comment("partition %s" % mntpoint)

    # Expected permission: 660, prefixed with 1 unless the host is in the
    # 'squeeze' group.
    perm = '660'
    if not has('squeeze'):
        perm = '1' + perm

    # LVM volumes are checked through their mount point, other devices
    # through the device itself.
    if fs.startswith('/dev/mapper'):
        print('check filesystem fs%s with path %s' % (mntpoint, mntpoint))
    else:
        print('check device fs%s with path %s' % (mntpoint, fs))
    print(' if failed permission %s then alert' % perm)
    print(' if failed uid root then alert')

    # Disk space: -1 marks "no override", None/0 disables the check.
    alert_level = _space_alert_overrides.get((hostname, mntpoint), -1)
    if alert_level == -1:
        if mntpoint in _relaxed_mntpoints or has('domu'):
            alert_level = 90
        else:
            alert_level = 80
    if alert_level:
        print(' if space usage > %d%% for 3 cycles then alert' % alert_level)

    # Inode usage is checked on everything except reiserfs.
    if fstype != 'reiserfs':
        print(' if inode usage > 80% then alert')
    print(' mode passive')
    # Blank line separating this stanza from the next one.
    print('')