#!/usr/local/bin/tclsh8.6

#  
# Network graph building daemon
#  
# When a modification is done on an equipment (either by a human or by
# a program when a user is using web interface to change characteristics
# of an interface), this modification is detected by:
# - detectconfmod script which reads syslog output
# - radius daemon
# They both write an entry in topo.modeq table of the database.
#
# The topographd daemon exploit this topo.modeq table. Its algorithm
# is as follows.
#
#	Infinite loop
#	    once by night, fetches all equipment configuration files
#		(full rancid), analyzes every configuration file and
#		rebuild the graph.
#
#	    when an equipment specification is modified in topo.eq
#		table, generates the router.db rancid file), fetches
#		all equipments (full rancid), analyzes all configuration
#		files and rebuild the graph.
#
#	    when a vlan specification is modified in topo.vlan table,
#		generates the vlan.eq file, and rebuild the graph.
#
#	    when an equipment is modified (topo.modeq table), fetches
#		its configuration, analyzes it, and rebuild the graph.
#	    	When graph is rebuilt, mark the entry in topo.modeq as
#		"processed".
#
# History
#  2010/02/16 : pda/jean : design
#  2010/12/18 : pda      : rework installation
#  2012/01/18 : pda      : ranciddb -> ranciddir
#  2012/03/27 : pda/jean : daemonization
#

set conf(extractcoll)	{extractcoll -a -s -w -i -p}
set conf(start-rancid)	{start-rancid %1$s}
set conf(anaconf)	{anaconf}

source /usr/local/lib/netmagis/libnetmagis.tcl

##############################################################################
# Graph updating
##############################################################################

#
# Detect separator in file using the following heuristic:
# the expected format of the file is:
#
# <alpha-numeric characters or dot>+<Separator>
# Separator is the first non-alphanumeric and non-dot character
# (ie. a hostname or fully qualified domain name)
# found after the first field in the file.
#
# Input:
#   - filename : file name to open
# Output: 
#   - return value : character used as separator (":", ";" etc.),
#   - msg: error message if 
#
# History 
#   2015/09/08 : jean/boggia : design
#
#

proc detect-separator {filename _msg} {
    upvar $_msg msg

    set separator ""
    set msg ""
    if { [catch {open $filename "r"} fd] } then {
        set msg "detect-separator: cannot open $filename: $fd"
    } else {
        if {[gets $fd line] >= 0} then {
            if {![regexp {^[.a-zA-Z0-9-]+([^.a-zA-Z0-9])} $line dummy separator]} then {
                set msg "detect-separator: separator not found in $filename"
            }
        } else {
            set msg "detect-separator: could not read first line of $filename"
        }
    }
    return $separator
}

# Generate a new router.db rancid file
#
# Input:
#   - none
# Output: 
#   - return value : empty string or error message
#
# History 
#   2010/12/13 : pda/jean : design
#   2012/04/25 : pda      : create file if it doesn't exist (even if no modif)
#

proc update-routerdb {} {
    global ctxt

    set sql "SELECT * FROM topo.modeq
		    WHERE eq = '_routerdb' AND processed = 0"
    set found 0
    if {! [toposqlselect $sql tab { set found 1 }]} then {
	return "Cannot read equipment modification from database"
    }

    if {$found || ! [file exists $ctxt(routerdb)]} then {
	set sql "SELECT e.eq, t.type, e.up
			FROM topo.eq e, topo.eqtype t
			WHERE e.idtype = t.idtype"
	set leq {}
	if {! [toposqlselect $sql t { lappend leq [list $t(eq) $t(type) $t(up)] }]} then {
	    return "Cannot read equipment list from database"
	}

	set msg ""
	set sep [detect-separator $ctxt(routerdb) msg]
        if {$msg ne ""} then {
	    return $msg
        }
	set new "$ctxt(routerdb).new"
	if {[catch {set fd [open $new "w"]} msg]} then {
	    return "Cannot create $new ($msg)"
	}
	foreach e $leq {
	    lassign $e eq type up
	    if {$up} then { set up "up" } else { set up "down" }
	    puts $fd "$eq$sep$type$sep$up"
	}
	if {[catch {close $fd} msg]} then {
	    return "Cannot close $new ($msg)"
	}
	if {[catch {file rename -force $new $ctxt(routerdb)} msg]} then {
	    return "Cannot move $new to $ctxt(routerdb) ($msg)"
	}
	set sql "UPDATE topo.modeq SET processed = 1 WHERE eq = '_routerdb'"
	if {! [toposqlexec $sql]} then {
	    return "Cannot update equipment modification for _routerdb"
	}
    }
    return ""
}

#
# Guess if a full rancid run is needed
#
# Input:
#   - routerdbmod : in return, result from detect-filemod
# Output: 
#   - return value :
#	-1 : error
#	0 : no full rancid needed
#	1 : full rancid needed
#   - parameter routerdbmod : detect-filemod result
#
# History 
#   2010/10/15 : pda/jean : design
#

proc full-rancid-needed {_routerdbmod} {
    global ctxt
    upvar $_routerdbmod routerdbmod

    set msg [update-routerdb]
    if {$msg ne ""} then {
	keep-state-mail "router.db" $msg
	return -1
    }

    set sql "SELECT topo.lastrun.date IS NULL
		    OR (
			(date_trunc('day',topo.lastrun.date)
			    <> date_trunc('day',now())
			AND extract(hour from now())>=$ctxt(fullrancidmin)
			AND extract(hour from now())<=$ctxt(fullrancidmax))
		    )
	       AS result
	       FROM topo.lastrun"

    # if selects succeeds, returns the result of SQL query,
    # while translating it to 1 (true) or 0 (false)
    set r2 1
    set r [toposqlselect $sql tab { set r2 [expr $tab(result) ? 1 : 0]}]
    if {$r} then {
	set r $r2

	# detect if router.db has been modified
	set routerdbmod {}
	set fmod [detect-filemod $ctxt(routerdb)]
	if {[llength $fmod] > 0} then {
	    lassign $fmod code path date
	    switch $code {
		err {
		    set msg $date
		    set r -1
		}
		add {
		    set msg "File router.db added"
		    set r 1
		}
		mod {
		    set msg "Resuming normal operation"
		    set r 1
		}
		del {
		    set msg "File router.db deleted"
		    set r -1
		}
	    }
	    keep-state-mail "router.db" $msg
	    if {$r == 1} then {
		set routerdbmod $fmod
	    }
	} else {
	    keep-state-mail "router.db" "Resuming normal operation"
	}
    }

    return $r
}

#
# Update topo graph from equipment configuration files
#
# Input:
#   - full : 1 if a full rancid is needed
#   - routerdbmod : result from detect-filemod, or empty
#   - leq : list of modified equipments (may be with a fake "_vlan" equipement)
#   - leqvirt : modified virtual equipments, whose date must be reset in
#	database. See detect-dirmod for the format of this list.
# Output:
#   - return value : 1 if ok, 0 if error
#
# History 
#   2010/10/15 : pda/jean : design
#   2010/10/20 : pda/jean : coding
#   2010/11/12 : pda/jean : add leqvirt
#

proc update-graph {full routerdbmod leq leqvirt} {
    global conf

    #
    # Reset equipments marked as modified in the topo.modeq table
    #

    if {! [toposqllock]} then {
	return 0
    }

    if {[llength $leq] == 0} then {
	set sql "UPDATE topo.modeq SET processed = 1"
    } else {
	set inlist [join $leq "', '"]
	set sql "UPDATE topo.modeq SET processed = 1 WHERE eq IN ('$inlist')"
    }
    if {! [toposqlexec $sql]} then {
	return 0
    }

    #
    # Run rancid and send a mail if needed
    #

    if {$full} then {
	set callrancid 1
	set leqrancid {}
    } else {
	# It is not a full rancid run.
	# Remove dummy equipment "_vlan". If, after this removal, there
	# is no equipment, distinguish from the "full-rancid" case.
        set pos [lsearch -exact $leq "_vlan"]
        if {$pos != -1} then {
            set leqrancid [lreplace $leq $pos $pos]
        } else {
            set leqrancid $leq
        }
	if {[llength $leqrancid] == 0} then {
	    set callrancid 0
	} else {
	    set callrancid 1
	}
    }

    if {$callrancid} then {
	if {! [rancid $leqrancid]} then {
	    toposqlunlock "abort"
	    return 0
	}
    }

    #
    # Update modification time of router.db if needed
    #

    if {[llength $routerdbmod] > 0} then {
	if {! [sync-filemonitor [list $routerdbmod]]} then {
	    toposqlunlock "abort"
	    return 0
	}
    }

    # If it is not a "full anaconf" run, add virtual equipments

    if {$full} then {
	set leqanaconf {}
    } else {
	set leqanaconf $leq
	foreach meq $leqvirt {
	    topo-verbositer "processing $meq" 9
	    lassign $meq code path date
	    if {$code eq "add" || $code eq "mod"} then {
		if {[regexp {([^/]+)\.eq$} $path bidon eq]} then {
		    topo-verbositer "adding virtual $eq to leqanaconf" 9
		    lappend leqanaconf $eq
		}
	    }
	}
    }

    #
    # Update graph and send a mail if needed
    #

    if {! [anaconf $leqanaconf]} then {
	toposqlunlock "abort"
	return 0
    }

    #
    # Update modification time of virtual equipments
    #

    if {! [sync-filemonitor $leqvirt]} then {
	toposqlunlock "abort"
	return 0
    }

    #
    # Update sensor list
    #

    if {! [sensors]} then {
	toposqlunlock "abort"
	return 0
    }

    #
    # Update date of last full rancid/anaconf
    #

    if {[llength $leq] == 0} then {
	set sql "DELETE FROM topo.lastrun ;
		    INSERT INTO topo.lastrun (date) VALUES (NOW ())"
	if {! [toposqlexec $sql]} then {
	    return 0
	}
    }
    toposqlunlock "commit"

    return 1
}

#
# Call rancid
#
# Input:
#   - leq (optional) : modified equipment list
# Output: 
#   - return value : 1 if ok, 0 if error
#
# History 
#   2010/10/20 : pda/jean : design
#

proc rancid {{leq {}}} {
    global conf
    global ctxt

    if {[llength $leq] == 0} then {
	set-status "Ranciding all equipements"
    } else {
	set-status "Ranciding $leq"
    }

    #
    # Call rancid
    #

    set cmd [format "$ctxt(topobindir)/$conf(start-rancid)" $leq]
    topo-verbositer "rancid : cmd=<$cmd>" 2

    if {[catch {exec sh -c $cmd} msg]} then {
	# erreur
	set msg "Error while running '$cmd'\n$msg"
	set r 0
    } else {
	# No error: msg contains rancid output
	if {$msg eq ""} then {
	    set msg "Resuming normal operation"
	}
	set r 1
    }

    #
    # Send a mail if needed
    #

    if {[llength $leq] == 0} then {
	set ev "fullrancid"
    } else {
	set ev "rancid"
    }

    keep-state-mail $ev $msg

    return $r
}

#
# Call anaconf to build the graph
#
# Input:
#   - leq (optional) : modified equipment list
# Output: 
#   - return value : 1 if ok, 0 if error
#
# History 
#   2010/10/20 : pda/jean : design
#

proc anaconf {{leq {}}} {
    global conf
    global ctxt

    if {[llength $leq] == 0} then {
	set-status "Building graph for all equipements"
    } else {
	set-status "Building graph for $leq"
    }

    set text ""

    set cmd "$ctxt(topobindir)/$conf(anaconf)"

    set r 1
    foreach eq $leq {
	append cmd " $eq"
    }

    topo-verbositer "anaconf : cmd=<$cmd>" 2
    if {[catch {exec sh -c $cmd} msg]} then {
	set msg "Error in $cmd\n$msg"
	set r 0
    } else {
	# no error
    }
    set text $msg

    #
    # Send a mail if needed
    #

    keep-state-mail "anaconf" $msg

    return $r
}

#
# Read sensor list and update it in database
#
# Input: none
# Output: 
#   - return value : 1 if ok, 0 if error
#
# History 
#   2010/11/09 : pda/jean : design
#

proc sensors {} {
    global conf
    global ctxt

    set-status "Updating sensor list"

    #
    # Read existing sensors in database
    #

    set sql "SELECT * FROM topo.sensor"
    set r [toposqlselect $sql tab {
				set id $tab(id)
				set told($id) [list $tab(type) $tab(eq) \
					$tab(comm) $tab(iface) $tab(param)]
			    } ]
    if {! $r} then {
	keep-state-mail "sensors" "Cannot read sensor list from database"
	return 0
    }

    #
    # Red new sensor list from the graph
    #

    if {! [read-coll tnew msg]} then {
	keep-state-mail "sensors" "Cannot read sensor list from graph\n$msg"
	return 0
    }
    if {$msg ne ""} then {
	keep-state-mail "sensors" "Inconsistent sensors:\n$msg\nSensor list not updated."
	return 1
    }

    #
    # Difference analysis
    #

    set lunmod {}
    set sql {}

    foreach id [array names tnew] {
	lassign $tnew($id) type eq comm iface param
	set qtype [::pgsql::quote $type]
	set qid [::pgsql::quote $id]
	set qeq [::pgsql::quote $eq]
	set qcomm [::pgsql::quote $comm]
	set qiface [::pgsql::quote $iface]
	set qparam [::pgsql::quote $param]

	if {[info exists told($id)]} then {
	    #
	    # Update common sensors
	    #

	    if {$tnew($id) eq $told($id)} then {
		#
		# Same : just update lastseen date
		#
		lappend lunmod "'$qid'"
	    } else {
		#
		# Not the same: update all fields
		#
		lappend sql "UPDATE topo.sensor
				    SET type = '$qtype',
					eq = '$qeq',
					comm = '$qcomm',
					iface = '$qiface',
					param = '$qparam',
					lastmod = DEFAULT,
					lastseen = DEFAULT
				    WHERE id = '$qid'"
	    }

	    unset told($id)
	} else {
	    #
	    # New sensor
	    #
	    lappend sql \
		"INSERT INTO topo.sensor (id, type, eq, comm, iface, param)
		    VALUES ('$qid','$qtype','$qeq','$qcomm','$qiface','$qparam')"
	}
    }

    #
    # Update date of sensors seen, but not modified
    #

    if {[llength $lunmod] > 0} then {
	set l [join $lunmod ","]
	lappend sql "UPDATE topo.sensor SET lastseen = DEFAULT WHERE id IN ($l)"
    }

    #
    # Remove old sensors, after some delay
    #

    lappend sql "DELETE FROM topo.sensor
		    WHERE lastseen + interval '$ctxt(sensorexpire) days' < now()"

    #
    # Send the huuuuuge SQL command
    # 

    if {[llength $sql] > 0} then {
	set sql [join $sql ";"]
	if {! [toposqlexec $sql]} then {
	    keep-state-mail "sensors" "Cannot write sensors in database"
	    return 0
	}
    }

    #
    # Send a mail if needed
    #

    keep-state-mail "sensors" "Resuming normal operation"

    return 1
}

#
# Read lines from "extractcoll" and get sensor list
#
# Input:
#   - _tab : in return, information extracted
#   - _msg : in return, empty string or error/warning message
# Output:
#   - return value : 1 if ok, 0 if error
#   - parameter _tab: array, indexed by sensor names, containing a list
#	{<type> <eq> <community> [<iface> [<param>]]}
#
# Note :
#   Format of input file is:
#	trafic      <id coll> <eq> <community> <phys iface> <vlan|->
#	nbassocwifi <id coll> <eq> <community> <phys iface> <ssid>
#	nbauthwifi  <id coll> <eq> <community> <phys iface> <ssid>
#	port        <id coll> <eq> <community> <eqtype> <vlan id> {<iflist...>}
#	ipmac       <id coll> <eq> <community> <eqtype>
#
# History
#   2008/07/28 : pda/boggia : design
#   2008/07/30 : pda        : adapt to new input format
#   2010/11/09 : pda/jean   : topographd integration
#   2010/12/19 : pda        : use call-topo
#   2010/12/21 : pda        : any message is not automatically an error
#   2011/12/08 : jean       : portmac and ipmac collector integration
#

proc read-coll {_tab _msg} {
    global conf
    upvar $_tab tab
    upvar $_msg msg

    set cmd $conf(extractcoll)
    if {! [call-topo $cmd msg]} then {
	return 0
    }

    set lwarn {}
    foreach line [split $msg "\n"] {
	set l [split $line]
	switch [lindex $l 0] {
	    trafic {
		lassign $l kw id eq comm iface vlan
		if {$vlan ne "-"} then {
		    set iface "$iface.$vlan"
		}
		set sensor [list $kw $eq $comm $iface {}]
	    }
	    nbassocwifi -
	    nbauthwifi {
		lassign $l kw id eq comm iface ssid
		set sensor [list $kw $eq $comm $iface $ssid]
	    }
	    portmac {
		lassign $l kw id eq comm eqtype ifacelist vlanid
		set sensor [list "portmac.$eqtype" $eq $comm $ifacelist $vlanid]
	    }
	    ipmac {
		lassign $l kw id eq comm eqtype 
		set sensor [list "ipmac" $eq $comm {} {}]
	    }
	    default {
		lappend lwarn "Unknown sensor type ($l)"
		set sensor ""
	    }
	}

	if {$sensor ne ""} then {
	    if {[info exists tab($id)]} then {
		# same format for all sensor types, until now
		lassign $tab($id) okw oeq ocomm oiface
		lappend lwarn "Sensor '$id' seen more than once ($eq/$iface and $oeq/$oiface)"
	    }
	    set tab($id) $sensor
	}
    }

    set msg [join $lwarn "\n"]
    return 1
}

##############################################################################
# Detection of equipment modified
##############################################################################

#
# Detect modifications on equipments
#
# Output:
#   - return value : list of modified equipments, or empty list
# 
# History 
#   2010/10/21 : pda/jean : design
#   2011/01/05 : jean : known equipements are pulled from rancid
#

proc detect-mod {} {

    set l {}
    set sql "SELECT DISTINCT(eq) AS eq FROM topo.modeq WHERE processed = 0"
    if {! [toposqlselect $sql tab { lappend l $tab(eq) }]} then {
	return {}
    }
    
    set sql "SELECT eq FROM topo.eq WHERE up=1"
    if {! [toposqlselect $sql tab {set rancideq($tab(eq)) 1}]} then {
	return {}
    }   

    #
    # Check that equipment is managed by rancid
    # Note: according to equipment types, syslogd versions, and equipment
    # local configuration, names may be short names (not fqdn). In those
    # cases, we suppose that equipment is not managed (it is surely
    # an error in the detection script)
    #

    set leq {}
    set lunk {}
    foreach eq $l {
	if {[info exists rancideq($eq)]} then {
	    lappend leq $eq
	} elseif {$eq eq "_vlan"} then {
	    lappend leq $eq
	} elseif {$eq ne "_routerdb"} then {
	    lappend lunk $eq
	}
    }

    if {[llength $lunk] == 0} then {
	keep-state-mail "detectunknw" "Resuming normal operation"
    } else {
	keep-state-mail "detectunknw" \
			"Change detected on unknown equipments ($lunk)"
    }

    return $leq
}

##############################################################################
# Main program
##############################################################################

# The -z option is reserved for internal use
set usage {usage: %1$s [-h][-f][-v <n>]
    -h         : display this text
    -f         : run in foreground
    -v <n>     : verbose level (0 = none, 1 = minimum, 99 = max)
}

proc usage {argv0} {
    global usage

    puts stderr [format $usage $argv0]
}

#
# Main program
#

proc main {argv0 argv} {
    global conf
    global ctxt

    set ctxt(dbfd1) ""
    set ctxt(dbfd2) ""
    set verbose 0
    set foreground 0
    set daemonized 0

    #
    # Get configuration values from local file
    #

    set-log [get-local-conf "logger"]

    set ctxt(routerdb) [get-local-conf "ranciddir"]
    append ctxt(routerdb) "/router.db"

    set eqvirtdir [get-local-conf "eqvirtdir"]
    set ctxt(topobindir) [get-local-conf "topobindir"]

    #
    # Get configuration values from database
    #

    config ::dnsconfig
    lazy-connect

    set delay [dnsconfig get "topographddelay"]
    set delay [expr $delay*1000]

    set ctxt(maxstatus) [dnsconfig get "topomaxstatus"]

    set ctxt(sensorexpire) [dnsconfig get "sensorexpire"]
    set ctxt(modeqexpire) [dnsconfig get "modeqexpire"]

    set ctxt(fullrancidmin) [dnsconfig get "fullrancidmin"]
    set ctxt(fullrancidmax) [dnsconfig get "fullrancidmax"]

    #
    # Argument analysis
    #

    while {[llength $argv] > 0} {
	switch -glob -- [lindex $argv 0] {
	    -h {
		usage $argv0
		return 0
	    }
	    -f {
		set foreground 1
		set argv [lreplace $argv 0 0]
	    }
	    -z {
	    	# This option is not meant to be used by a human
		# It implies that the program is being rerun in order to be
		# daemonized
		set daemonized 1
		set argv [lreplace $argv 0 0]
	    }
	    -v {
		set verbose [lindex $argv 1]
		set argv [lreplace $argv 0 1]

	    }
	    -* {
		usage $argv0
		return 1
	    }
	    default {
		break
	    }
	}
    }

    if {[llength $argv] != 0} then {
	usage $argv0
	return 1
    }

    if {! $foreground && ! $daemonized} then {
    	set argstr {}
	if {$verbose > 0} then {
	    lappend argstr -v $verbose
	}
	lappend argstr "-z"
	run-as-daemon $argv0 [join $argstr " "]
    }

    reset-status
    set-status "Starting topographd"

    #
    # Default values
    #

    topo-set-verbose $verbose

    if {$verbose > 0} then {
	set-trace {toposqlselect toposqlexec toposqllock toposqlunlock
		    keep-state-mail
		    update-routerdb full-rancid-needed update-graph
		    rancid anaconf sensors read-coll read-eq-type
		    detect-mod
		    detect-filemod detect-dirmod sync-filemonitor}

    }

    #
    # Daemon main loop
    #

    set first 1

    while {true} {
	#
	# Except first time, wait for the delay
	#

	topo-verbositer "delay : first=$first delay=$delay" 10
	if {! $first} then {
	    after $delay
	}
	set first 0

	#
	# Detect if a full rancid is needed (i.e. if no full rancid
	# has been done since 2 o'clock this morning, for example).
	#
    
	switch [full-rancid-needed routerdbmod] {
	    -1 {
		# error
		continue
	    }
	    0 {
		# not needed
	    }
	    1 {
		# graph must be updated
		# check modification times of virtual equipments
		# in order to update them after configuration read.
		set leqvirt [detect-dirmod $eqvirtdir err]
		if {$err ne ""} then {
		    keep-state-mail "eqvirt" $err
		    continue
		}

		if {! [update-graph 1 $routerdbmod {} $leqvirt]} then {
		    continue
		}
	    }
	}

	#
	# Search for modified equipments and rebuild the graph
	#

	# virtual equipments
	set leqvirt [detect-dirmod $eqvirtdir err]
	if {$err ne ""} then {
	    keep-state-mail "eqvirt" $err
	    continue
	}

	set leq [detect-mod]
	if {[llength $leq] > 0 || [llength $leqvirt] > 0} then {
	    update-graph 0 {} $leq $leqvirt
	}
    }
}

exit [main $argv0 $argv]
