#!/usr/local/bin/php
<?php
/**
 * ThunderSnarf
 *
 * @author Ricardo H.F. Sette P. <ricardohfsp@gmail.com>
 * @link http://www.freebsdbrasil.com.br/
 * @copyright Copyright &copy; 2010 - FreeBSD Brasil LTDA
 * @license licença GNU/GPL v2.0 http://www.gnu.org/licenses/gpl-2.0.html
 */
?>
<?php

/*
 * Script-wide settings.
 */

// All date() output produced by this script uses Sao Paulo local time.
date_default_timezone_set('America/Sao_Paulo');

// Debug mode: when true the script prints per-table / per-batch details.
define('DEBUG', false);

// Number of insertfile() calls grouped into a single transaction.
// Tune this value to match your hardware and PostgreSQL configuration.
$limit_transactin = 1000;

/**
 * Reduce a URL to its "base" (registrable-looking) domain.
 *
 * The host is obtained through get_url_domain(). Hosts with one or two
 * labels, and four-label hosts whose first and last labels are numeric
 * (treated as IPv4 addresses), are returned as-is. For longer hosts the last
 * two labels are returned, or the last three when the host ends in
 * "<generic tld>.<country tld>" (e.g. example.com.br) and the third label
 * from the right is not "www".
 *
 * TLD and "www" matching is case-insensitive, but the returned labels keep
 * the case they had in the URL. A single trailing FQDN dot
 * ("www.example.com.") is ignored.
 *
 * @param string $url absolute URL, scheme included
 * @return string base domain, or '' when no host can be extracted
 */
function get_base_domain($url)
{
        // generic tlds
        $G_TLD = array(
        'biz','com','edu','gov','info','int','mil','name','net','org',
        'aero','asia','cat','coop','jobs','mobi','museum','pro','tel','travel',
        'arpa','root',
        'berlin','bzh','cym','gal','geo','kid','kids','lat','mail','nyc','post','sco','web','xxx',
        'nato',
        'example','invalid','localhost','test',
        'bitnet','csnet','ip','local','onion','uucp',
        'co' // note: not technically, but used in things like co.uk
        );

        // country tlds (source: http://en.wikipedia.org/wiki/Country_code_top-level_domain)
        $C_TLD = array(
        // active
        'ac','ad','ae','af','ag','ai','al','am','an','ao','aq','ar','as','at','au','aw','ax','az',
        'ba','bb','bd','be','bf','bg','bh','bi','bj','bm','bn','bo','br','bs','bt','bw','by','bz',
        'ca','cc','cd','cf','cg','ch','ci','ck','cl','cm','cn','co','cr','cu','cv','cx','cy','cz',
        'de','dj','dk','dm','do','dz','ec','ee','eg','er','es','et','eu','fi','fj','fk','fm','fo',
        'fr','ga','gd','ge','gf','gg','gh','gi','gl','gm','gn','gp','gq','gr','gs','gt','gu','gw',
        'gy','hk','hm','hn','hr','ht','hu','id','ie','il','im','in','io','iq','ir','is','it','je',
        'jm','jo','jp','ke','kg','kh','ki','km','kn','kr','kw','ky','kz','la','lb','lc','li','lk',
        'lr','ls','lt','lu','lv','ly','ma','mc','md','mg','mh','mk','ml','mm','mn','mo','mp','mq',
        'mr','ms','mt','mu','mv','mw','mx','my','mz','na','nc','ne','nf','ng','ni','nl','no','np',
        'nr','nu','nz','om','pa','pe','pf','pg','ph','pk','pl','pn','pr','ps','pt','pw','py','qa',
        're','ro','ru','rw','sa','sb','sc','sd','se','sg','sh','si','sk','sl','sm','sn','sr','st',
        'sv','sy','sz','tc','td','tf','tg','th','tj','tk','tl','tm','tn','to','tr','tt','tv','tw',
        'tz','ua','ug','uk','us','uy','uz','va','vc','ve','vg','vi','vn','vu','wf','ws','ye','yu',
        'za','zm','zw',
        // inactive
        'eh','kp','me','rs','um','bv','gb','pm','sj','so','yt','su','tp','bu','cs','dd','zr'
        );

        // get domain
        $full_domain = get_url_domain($url);
        if ( !$full_domain )
        {
                return '';
        }

        // "www.example.com." is the same host as "www.example.com"; without
        // this the empty label after the dot would be mistaken for the tld
        $full_domain = rtrim($full_domain, '.');
        if ( $full_domain === '' )
        {
                return '';
        }

        // break up domain, reverse: D0 is the rightmost label
        $DOMAIN = array_reverse(explode('.', $full_domain));
        $labels = count($DOMAIN);

        // first check for ip address
        if ( $labels == 4 && is_numeric($DOMAIN[0]) && is_numeric($DOMAIN[3]) )
        {
                return $full_domain;
        }

        // if only 2 domain parts, that must be our domain
        if ( $labels <= 2 )
        {
                return $full_domain;
        }

        /*
        finally, with 3+ domain parts: D0 is the tld.
        if D0 is a country tld and D1 a generic tld we have something like
        com.br, so the domain is D2.D1.D0 unless D2 is just "www";
        in every other case the domain is D1.D0.
        Host names are case-insensitive, so compare lower-cased copies.
        */
        $is_country_tld   = in_array(strtolower($DOMAIN[0]), $C_TLD, true);
        $is_generic_second = in_array(strtolower($DOMAIN[1]), $G_TLD, true);

        if ( $is_country_tld && $is_generic_second && strtolower($DOMAIN[2]) != 'www' )
        {
                return $DOMAIN[2] . '.' . $DOMAIN[1] . '.' . $DOMAIN[0];
        }

        return $DOMAIN[1] . '.' . $DOMAIN[0];
}


// get domain from url
/**
 * Extract the host component of a URL.
 *
 * @param string $url URL to inspect
 * @return string the host as reported by parse_url(), or '' when the URL
 *                cannot be parsed or carries no (non-empty) host
 */
function get_url_domain($url)
{
        $parts = parse_url($url);

        // parse_url() gives false on badly malformed input and omits 'host'
        // for relative URLs; both cases yield an empty domain
        if ( !empty($parts) && !empty($parts['host']) )
        {
                return $parts['host'];
        }

        return '';
}

/*
 * Funcao para limpar as bases de dados
 * */
/**
 * Purge old rows from the thundersnarf tables.
 *
 * Deletes, from thundersnarf_domain, thundersnarf_ip, thundersnarf_domainip,
 * thundersnarf_extension and thundersnarf_file, every row whose
 * last_requested is older than $dias days before the current date. In
 * thundersnarf_file, rows with size=0 and an empty file name are removed as
 * well, regardless of age.
 *
 * @param resource $db   open PostgreSQL connection (passed by reference)
 * @param int      $dias number of days of history to keep; must be > 0
 * @return int|array 0 when $dias is not a positive integer (nothing is
 *                   executed); otherwise array(elapsed seconds as a string
 *                   with a decimal comma, rows deleted from domain, ip,
 *                   domainip, extension, file). A table whose DELETE failed
 *                   is reported as 0 rows after the error has been printed.
 */
function clean_db(&$db, $dias){
	$dias = (int)$dias;
	// A negative value would move the cutoff into the future and wipe every
	// row, so anything that is not strictly positive is rejected.
	if($dias <= 0){
		return 0;
	}
	$time_start = microtime(1);

	// $dias is an integer at this point, so it is safe to embed in the SQL text
	$cutoff = "extract(epoch FROM current_date - integer '".$dias."')";

	// table => extra condition appended to the age filter
	$tables = array(
		'thundersnarf_domain' => "",
		'thundersnarf_ip' => "",
		'thundersnarf_domainip' => "",
		'thundersnarf_extension' => "",
		'thundersnarf_file' => " OR (size=0 AND file='')"
	);

	$removed = array();
	foreach($tables as $table => $extra){
		$sql = "DELETE FROM ".$table." WHERE last_requested < ".$cutoff.$extra;
		$result = pg_query($db, $sql);
		if($result === false){
			// keep going with the remaining tables, as before, but never
			// hand a failed result to pg_affected_rows()
			print('Erro: ' . pg_last_error($db)."\n".$sql);
			$removed[] = 0;
		}else{
			$removed[] = pg_affected_rows($result);
		}
	}

	$elapsed = round(microtime(1) - $time_start, 4);
	$time = str_replace(".", ",", (string)$elapsed);
	return array_merge(array($time), $removed);
}

// Script start time; $time_parcial is reset after every committed batch and
// is used to time each batch individually.
$time_start = $time_parcial = microtime(1);

// This script only makes sense on the command line.
if (!isset($_SERVER['argv'])) {
	echo "ERRO --> Este script deve ser executado na linha de comando.";
	exit(1);
}
$args = $_SERVER['argv'];

include_once("/usr/local/www/thundersnarf/protected/config/thundersnarf.conf.php");

$dbhost = $thundersnarf_conf["DBHOST"];
$dbdb = $thundersnarf_conf["DBNAME"];
$dbuser = $thundersnarf_conf["DBUSERNAME"];
$dbpass = $thundersnarf_conf["DBPASSWORD"];

$db = pg_connect("host=$dbhost dbname=$dbdb user=$dbuser password=$dbpass");
if ($db === false) {
	// There is no connection to ask for an error message, so report the
	// warning pg_connect() itself raised.
	$erro = error_get_last();
	echo 'Could not connect: ' . (isset($erro['message']) ? $erro['message'] : '') . "\n";
	exit(1);
}

// "-d <days>" means: ONLY clean the database, keeping the last <days> days.
$somente_limpeza = isset($args[1], $args[2]) && $args[1] == "-d" && (int)$args[2] > 0;
if ($somente_limpeza) {
	$resposta = clean_db($db, (int)$args[2]);
	// clean_db() answers 0 instead of an array when it refuses the day count
	if (!is_array($resposta)) {
		echo "Os dias deve ser maior que zero.\n";
		exit(1);
	}
	if (DEBUG) {
		echo $resposta[1]." registro excluidos na tabela thundersnarf_domain\n";
		echo $resposta[2]." registro excluidos na tabela thundersnarf_ip\n";
		echo $resposta[3]." registro excluidos na tabela thundersnarf_domainip\n";
		echo $resposta[4]." registro excluidos na tabela thundersnarf_extension\n";
		echo $resposta[5]." registro excluidos na tabela thundersnarf_file\n";
	}
	$total_excluidos = $resposta[1]+$resposta[2]+$resposta[3]+$resposta[4]+$resposta[5];
	echo "Total registro excluidos: ".$total_excluidos." registro(s) - ".$resposta[0]." segundos para limpar a base de dados #thundersnarf ".date('r')." \n";
	exit(0);
}

// No argument at all, or "-d" without a valid positive day count: show usage.
if (!isset($args[1]) || $args[1] == "-d") {
	echo "ERRO --> Voce deve informar o parametro path_access_log_valido ou -d xdias\n\n\t\t" . $args[0] . " <path_access_log_valido>\n\t\t" . $args[0] . " -d <dias>\n\n";
	exit(1);
}

// Path of the file holding the access-log lines to import.
$path_access_log_valido = $args[1];
// Note on the "-d <days>" cleanup mode handled above: it keeps only the last
// <days> days in the domain, ip, domainip, extension and file tables. For the
// file table the cleanup also removes rows whose file name is blank and whose
// size is zero.

if (DEBUG)
	echo "Conexao no host $dbhost com a database $dbdb e usuario $dbuser Ok!\n";

if (!file_exists($path_access_log_valido)) {
	echo "O arquivo $path_access_log_valido nao existe.";
	exit(1);
}

$f = fopen($path_access_log_valido, 'r');
if ($f === false) {
	// Without this check an unreadable file makes feof()/fgets() operate on
	// false, which loops forever or raises a TypeError depending on the PHP
	// version.
	echo "Nao foi possivel abrir o arquivo $path_access_log_valido para leitura.\n";
	exit(1);
}

/**
 * Tell whether a split access-log line has the 11-field layout the importer
 * expects: timestamp, pid, ip, response, file size, transferred bytes,
 * request type, url, domain, file, static flag.
 *
 * Note that empty() is used for the text fields, so a field holding the
 * string "0" counts as missing.
 *
 * @param array $campos fields of one log line, indexed from 0
 * @return bool
 */
function thundersnarf_is_valid_record($campos)
{
        return count($campos) == 11
                && is_numeric($campos[0])   // timestamp
                && is_numeric($campos[1])   // pid
                && !empty($campos[2])       // ip
                && !empty($campos[3])       // response
                && is_numeric($campos[4])   // filesize
                && is_numeric($campos[5])   // transfered
                && !empty($campos[6])       // type of request
                && !empty($campos[7])       // host request (url)
                && !empty($campos[8])       // domain
                && !empty($campos[9])       // file
                && is_numeric($campos[10]); // static flag
}

/**
 * Render a value as a quoted SQL string literal for the given connection.
 *
 * The value is converted with utf8_encode() (i.e. it is treated as
 * ISO-8859-1) and then escaped with pg_escape_string(), which follows the
 * server's quoting rules; addslashes() does not and produces broken literals
 * when the value contains a single quote.
 *
 * @param resource $db    open PostgreSQL connection
 * @param string   $valor raw value
 * @return string  literal including the surrounding single quotes
 */
function thundersnarf_sql_text($db, $valor)
{
        return "'" . pg_escape_string($db, utf8_encode($valor)) . "'";
}

// Number of insertfile() calls per transaction. The setting is defined near
// the top of the file under the (misspelled) name $limit_transactin; fall back
// to 1000 when it is missing or not positive so the modulo below is always
// well defined.
$batch_size = (isset($limit_transactin) && (int)$limit_transactin > 0) ? (int)$limit_transactin : 1000;

$count = $count_linha = 0;
// Counters: lines imported, and lines discarded because of field count
// ("count"), field validation ("type") or static-flag/domain rules ("3").
$linha_array = array("incluida"=>0, "descartada"=>array("count"=>0,"type"=>0,"3"=>0));

$sql = "START TRANSACTION; ";
while (($linha_bruta = fgets($f)) !== false) {
        $count_linha++;
        $linha = explode(" ", chop($linha_bruta));

        // 12-field layout: same as the 11-field one with an extra, non-empty
        // address at index 2. Drop it when the remainder validates; otherwise
        // leave the line alone so it is discarded by the field count below.
        if (count($linha) == 12 && !empty($linha[2])) {
                $sem_ip_extra = $linha;
                unset($sem_ip_extra[2]);
                $sem_ip_extra = array_values($sem_ip_extra);
                if (thundersnarf_is_valid_record($sem_ip_extra)) {
                        $linha = $sem_ip_extra;
                }
        }

        if (count($linha) != 11) {
                $linha_array["descartada"]["count"]++;
                continue;
        }
        if (!thundersnarf_is_valid_record($linha)) {
                $linha_array["descartada"]["type"]++;
                continue;
        }

        // get_base_domain() needs a scheme to be able to find the host
        if (substr($linha[7],0,4) != "http")
                $dominio = get_base_domain("http://".$linha[7]);
        else
                $dominio = get_base_domain($linha[7]);

        $flag = $linha[10];
        $flag_ok = ($flag == "0" || $flag == "1" || $flag == "2");
        if (!$flag_ok || ($linha[8] == "()" && $dominio == "")) {
                $linha_array["descartada"]["3"]++;
                continue;
        }

        $hit_etag = (strpos($linha[3], "HIT_ETAG") !== false);

        // For flags 1 and 2 that are not ETag hits, the transferred byte
        // count is used as the file size.
        if (($flag == "1" || $flag == "2") && !$hit_etag) {
                $linha[4] = $linha[5];
        }
        $linha_array["incluida"]++;

        $msghit = (strpos($linha[3], "HIT") !== false) ? "true" : "false";
        // flag 1 means dynamic content, anything else is static
        $linha[10] = ($flag == "1") ? "false" : "true";
        // on an ETag hit the transferred size is taken from the file size
        if ($hit_etag)
                $linha[5] = $linha[4];

        $linha[8] = $dominio;
        $linha[9] = str_replace(array("(", ")"), "", $linha[9]);

        // extension: up to 3 characters after the last dot, upper-cased
        $pos_ponto = strrpos($linha[9], ".");
        if ($pos_ponto !== false) {
                $extensao = strtoupper(substr(trim($linha[9]), $pos_ponto + 1, 3));
        } else {
                $extensao = "";
        }

        $linha[7] = htmlentities($linha[7], ENT_QUOTES);
        if (strlen($linha[7]) > 3999) $linha[7] = substr($linha[7],0,3995)."...";
        if (strlen($linha[9]) > 299) $linha[9] = substr($linha[9],0,295)."...";

        $sql .= "SELECT insertfile(" .
                        $linha[0] . ", " .                                  // timestamp
                        thundersnarf_sql_text($db, $linha[2]) . ", " .     // ip
                        thundersnarf_sql_text($db, $linha[3]) . ", " .     // response
                        $linha[4] . ", " .                                  // filesize
                        $linha[5] . ", " .                                  // transfered
                        thundersnarf_sql_text($db, $linha[6]) . ", " .     // type of request
                        thundersnarf_sql_text($db, $linha[7]) . ", " .     // url
                        thundersnarf_sql_text($db, $linha[8]) . ", " .     // domain
                        thundersnarf_sql_text($db, $linha[9]) . ", " .     // file
                        $linha[10] . ", " .                                 // static
                        $msghit . ", " .                                    // hit
                        thundersnarf_sql_text($db, $extensao) .            // extension
                        "); ";
        $count++;

        // flush a full batch
        if ($count % $batch_size == 0) {
                pg_query($db, $sql." COMMIT;") or print('Erro: ' . pg_last_error($db)."\n".$sql);
                $sql = "START TRANSACTION; ";
                $time_end = microtime(1);
                $time = str_replace(".", ",", $time_end - $time_parcial);
                $time_parcial = microtime(1);
                if(DEBUG)
                	echo "linha $count_linha | total de funcao insertfile chamada $count ($time segundos para inserir $batch_size)...\n";
        }
}
// flush whatever is left in the last, partial batch
pg_query($db, $sql." COMMIT;") or print('Erro: ' . pg_last_error($db)."\n".$sql);
fclose($f);
$time_end = microtime(1);
echo $count." chamdas SQL(sendo ".$batch_size." por transacao) - ".(int)($time_end - $time_start)." segundos executando script #thundersnarf ".date('r')."\n";
if(DEBUG)
	print_r($linha_array);
?>