# -*- coding: utf-8; mode: python -*- import re include("ip") info["owner"] = "root" info["group"] = "root" info["perms"] = 0644 header() import sys def service(group=None, **kw): """Permet de définir un service monit. Les arguments supplémentaires peuvent être: name (par défault le nom du groupe) init (le nom du script de démarrage dans /etc/init.d, (par défaut name)) pidp (le nom du fichier de pid (par défaut /var/run/.pid)) extra (des lignes supplémentaires)""" if group == None or has(group): name = kw.get('name', group) init = kw.get('init', name) pidp = kw.get('pidp', "/var/run/%s.pid" % kw.get('pidf', name)) start_cmd = kw.get('start_cmd', '/etc/init.d/%s start' % init) stop_cmd = kw.get('stop_cmd', '/etc/init.d/%s stop' % init) print ('''# %(name)s check process %(name)s with pidfile %(pidp)s start program = "%(start_cmd)s" stop program = "%(stop_cmd)s"''' % locals()) max_restart = kw.get('max_restart', 5) for line in kw.get('extra', []): if line: print " ", line print " if %d restarts within 5 cycles then timeout" % max_restart print service("apache", name="apache2", extra=[ has("http-server") and "if failed host localhost port 80 protocol http timeout 30 seconds then restart", has("https-server") and "if failed host localhost port 443 type tcpssl protocol http timeout 30 seconds then restart", has("intranet-server") and "if failed host intranet.crans.org port 443 type tcpssl protocol http timeout 30 seconds then restart", "if cpu is greater than 60% for 2 cycles then alert", "if cpu > 80% for 5 cycles then restart", "if totalmem > 500.0 MB for 5 cycles then restart", "if children > 250 then restart", "if loadavg(5min) greater than 10 for 8 cycles then restart", ], max_restart=3) service('nginx') service('sogo', pidf="sogo/sogo", extra=['if failed host localhost port 20000 protocol http timeout 20 seconds for 5 cycles then restart']) service('ejabberd', pidf='ejabberd/ejabberd') if has('nginx'): if has('php'): service(name='php5-fpm', extra=["if failed host localhost port 80 protocol http and request '/php_ping' timeout 20 seconds for 5 cycles then restart"]) if has('cgi'): service(name='fcgiwrap', pidp='/var/run/fcgiwrap.pids') if has('gunicorn'): # on utilise le prob gunicorn-debian, qui liste les sites actifs sur # /etc/gunicorn.d/ re_ignore = re.compile(r'(^_|\.(dpkg-(old|dist|new|tmp)|example)$|\.pyc|\.comc$)') for site in metadata.Probes["gunicorn-debian"].splitlines(): site = site.strip() if re_ignore.search(site): continue service(None, name="gunicorn_%s" % site, start_cmd = "/etc/init.d/gunicorn start %s" % site, stop_cmd = "/etc/init.d/gunicorn stop %s" % site, pidf = "gunicorn/%s" % site, ) if has('asterisk-server'): service(name="sms_queuing") service('asterisk-server', pidp="/var/run/asterisk/asterisk.pid", start_cmd = "/etc/init.d/asterisk start", stop_cmd = "/etc/init.d/asterisk stop", ) service("at", init="atd", pidf="atd") service("cherrypy", name="intranet") service("backuppc-server", name="backuppc", pidf="backuppc/BackupPC") service("isc-dhcp-server", pidf="dhcpd") if has('arpwatch'): include("arpwatch") # TODO modeliser ceci par un seul service pour éviter des races débiles # au moment des restart ... for vlan in watched_vlans: iface = 'eth0' if vlan <> 1: iface += '.%d' % vlan service(None, name='arpwatch-%s' % iface, init='arpwatch', ) service(name="arpwatch_sendmail", group=None) service("bind", init="bind9", pidf="named/named") service("cups", pidf="cups/cupsd") service("openntpd", name="openntpd", pidf="openntpd/ntpd") service("cron", pidf="crond") service("dhcp-detect") if has("komaz"): service(None, name="filtrage_firewall") if has("comptage-upload"): service(None, name="netacct-crans-ens", pidf="netacct-crans-ens", init="netacct-crans-ens") service(None, name="netacct-crans-sixxs2", pidf="netacct-crans-sixxs2", init="netacct-crans-sixxs2") service(None, name="mac_ip", pidf="mac_ip", init="mac_ip") service("aiccu") service("freeradius", pidf="freeradius/freeradius") service("digicode", name="digicode_server", pidf="digicode") service("inn", pidf="news/innd", init="inn2") service("mailman", pidf="mailman/mailman") service("monit-ovh") service("mysql", pidf="mysqld/mysqld") service("munin-node", pidf="munin/munin-node") service("nslcd", pidf="nslcd/nslcd", extra=["if failed unixsocket /var/run/nslcd/socket then restart"]) service("nscd", pidf="nscd/nscd", extra=["if failed unixsocket /var/run/nscd/socket then restart"]) service("ntp", pidf="ntpd") service("openvpn-ovh", pidf="openvpn.ovh", init="openvpn") service("openvpn-komaz", pidf="openvpn.komaz", init="openvpn") service("openvpn-freebox", pidf="openvpn.freebox", init="openvpn", extra=["depends on openvpn-komaz"]) pg_version = '8.4' if has('squeeze') else '9.1' service("pgsql-server", name="postgresql", init="postgresql", pidp="/var/lib/postgresql/%s/main/postmaster.pid" % pg_version, extra=["if failed port 5432 timeout 30 seconds then restart"]) service("ident-daemon", name="ident2") service("postfix", pidp="/var/spool/postfix/pid/master.pid", extra=["if failed port 25 protocol smtp timeout 30 seconds then restart"]) service("privoxy", extra=["if failed host localhost port 8118 timeout 30 seconds then restart"]) service("proftpd", extra=["if failed port 21 protocol ftp timeout 30 seconds then restart"]) service("rsync") service("slapd", pidp="/var/run/slapd/slapd.pid", extra=["if failed host localhost port 389 protocol ldap3 timeout 30 seconds then restart"]) service("spamassassin", name="spamd", init="spamassassin") service("sqlgrey") service("ssh", pidf="sshd", extra=["if failed port 22 protocol ssh timeout 30 seconds then restart", "if children > 200 then restart"]) if has('rsyslog-client') or has('rsyslog-server'): service(None, name="rsyslog", pidf='rsyslogd', extra=["depend on file/var/log/syslog"]) service("ups-monitor", name="upsmon", pidf="nut/upsmon") service("ups-server", name="upsd", pidf="nut/upsd") if has('non-vlan-adherent'): vsftpd_ip = admip() else: vsftpd_ip = pubip() service("vsftpd", pidf="vsftpd/vsftpd", extra=["if failed host %s port 21 protocol ftp timeout 30 seconds then restart" % vsftpd_ip]) service("vsftpd-federez", extra=["if failed host 138.231.136.129 port 21 protocol ftp timeout 30 seconds then restart"]) dernierecarte=int(metadata.Probes["cartesdvb"]) try: cartesdesactivees=map(lambda x : int(x),metadata.Probes["cartesdvbdesactivees_local"].split(' ')) except: cartesdesactivees=[] if dernierecarte: print "# Il y a %d carte(s) DVB sur ce serveur dont %d cartes desactivee(s)\n" % (dernierecarte,len(cartesdesactivees)) for i in range(0,dernierecarte): if not i in cartesdesactivees: service("mumudvb", name = "mumudvb%d" % i, init = "mumudvb", pidf = "mumudvb/mumudvb_adapter%d_tuner0" % i, start_cmd = """/sbin/start-stop-daemon --start --oknodo --pidfile /var/run/mumudvb/mumudvb_adapter%d_tuner0.pid --chuid _mumudvb --exec /usr/bin/mumudvb -- -c /etc/sat/carte%d.conf""" % (i,i), stop_cmd = """/sbin/start-stop-daemon --stop --pidfile /var/run/mumudvb/mumudvb_adapter%d_tuner0.pid""" % i ) @check file file/var/log/syslog with path /var/log/syslog @ if timestamp > 15 minutes then alert @ service('igmpproxy') if hostname == 'zamok': @# print_status @check file file/usr/scripts/var/print_status/error.txt with path /usr/scripts/var/print_status/error.txt @ if size > 0 for 3 cycles then alert @ # on ne monitore pas les disques de canard if hostname in ['canard'] : done() disques = {} if has('blkid'): for line in metadata.Probes["blkid"].splitlines(): label, disque = line.strip().split() disques[label] = disque for line in metadata.Probes["fstab_local"].splitlines(): # on supprime les espaces line = line.strip() # on saute les lignes inintérassantes if not line : continue if line[0] == "#" : continue # on découpe la ligne [fs, mntpoint, type, options, dump, pass_] = re.split('[ \t]*',line) fs = disques.get(fs, fs) options = options.split(",") # on saute si c'est une partition non montée au démarrage if "noauto" in options: continue # on saute si c'est une partition bind if "bind" in options: continue # on saute les système pas intéressants if type in ['swap', 'sw', 'proc', 'tmpfs', 'sysfs', 'nfs', 'devpts']: continue # on ajoute les lignes de configuration générale comment("partition %s" % mntpoint) # Permission attendue perm = '660' if not has('squeeze'): perm = '1' + perm # on vérifie le filesystem directement pour les volumes lvm if fs.startswith('/dev/mapper'): print 'check filesystem fs%s with path %s' % (mntpoint, mntpoint) else: print 'check device fs%s with path %s' % (mntpoint, fs) print ' if failed permission %s then alert' % perm print ' if failed uid root then alert' # place sur les disques alert_level = { ('babar','/backup') : 90, ('sable','/var/spool/squid1') : None, ('sable','/var/spool/squid2') : None, ('sable','/var/log/squid') : 92, ('charybde','/pubftp') : 80, ('news','/var') : 90 }.get((hostname, mntpoint), -1) if alert_level == -1: if mntpoint in ('/usr', '/var/lib/mailman', '/localhome' , '/home') or has('domu'): alert_level = 90 else: alert_level = 80 if alert_level: print ' if space usage > %d%% for 3 cycles then alert' % alert_level # inodes pour les disques if type != 'reiserfs' : print ' if inode usage > 80% then alert' print ' mode passive' print