#! /bin/sh  
#
# license: Standard BSD2CLAUSE (BSD 2-clause Simplified License),
# Please read from the web.
#
# I would like to acknowledge the guidance and technical insight given
# by Polytropon from Magdeburg, Germany in writing this awk script.
#
           
# This script sanitizes the raw host data fetched from public providers
# of such information.
          
  # Positional arguments:
  #   $1  raw hosts file to read
  #   $2  file that receives the sanitized result
  host_in="$1"
  host_out="$2"

  # Start from an empty output file.  A quoted redirection is used instead
  # of truncate(1): it is plain POSIX sh and survives blanks or glob
  # characters in the path.  Nothing useful can follow if the output file
  # cannot be created, so stop right here in that case.
  : > "$host_out" || exit 1
                
  # tr -d '\r'      means delete carriage return where you find it.
  #                 mdl.hosts, mvps.hosts, hpp.hosts, 
  #                 Hosts.mis, Hosts.pub, Hosts.rsk, Hosts.sex, Hosts.trc 

  # tr '[:upper:]' '[:lower:]'  means convert all upper case to lower case
  #                 sm.hosts, sv.hosts

  # sed 's/^#/# /g' means replace a "#" at the start of a line with "# "
  #                 fdm.hosts, hp.hosts, hpp.hosts, mvps.hosts, sb.hosts, 
  #                 swc.hosts, wsp1.hosts, wsp2.hosts, wsp3.hosts,

  # tr ':' ' '      means replace every : with a space
  #                 swc.hosts, wsp1.hosts  (splits the port off entries like fqdn:433)

  # sed 's/\^M//g'  means delete the carriage return literal that was
  #                 added by editing the file. Example, if needed.

  # grep -v "@"     means delete all records with "@", i.e. email addresses
  #                 Hosts.rsk has 9 email addresses 

  # What files are dropping what items
  # aw.hosts  - localhost
  # ch.hosts  - localhost
  # hp.hosts  - localhost 66.7.213.144  95.140.125.124
  # mdl.hosts - localhost
  # mvps.hosts - localhost #[server
  # sb.hosts  - local localhost localhost.localdomain broadcasthost
  #             0.0.0.0 sbc
  # swc.hosts - local localhost localhost.localdomain broadcasthost
  #             130.211.230.53
  # sm.hosts  - malware
  # sv.hosts  - malvertising
  # Hosts.mis - localhost yahoo.com
  # Hosts.pub - localhost
  # Hosts.rsk - localhost microsoft.com
  # Hosts.sex - localhost markmail.org  chiark.greenend.org.uk 9.
  # Hosts.trc - localhost
     
  # sanitize_hosts IN OUT
  #
  # Read the raw hosts file IN, normalise it and write the surviving host
  # names, one per line, to OUT (which is overwritten).
  #   IN   path of the raw hosts file to read
  #   OUT  path of the file that receives the result
  # Returns 2, after printing a usage line on stderr, when either argument
  # is empty; otherwise the status of the pipeline is returned.
  sanitize_hosts() {
    if [ -z "$1" ] || [ -z "$2" ]; then
      printf 'usage: %s host_in host_out\n' "${0##*/}" >&2
      return 2
    fi

    # Stages, in order: strip carriage returns, fold to lower case, turn
    # every ":" into a blank, drop records containing "@", put a blank
    # after a "#" in column one.  Both paths are quoted so that blanks in
    # a file name do not split the argument.
    tr -d '\r' < "$1" |
      tr '[:upper:]' '[:lower:]' |
      tr ':' ' ' |
      grep -v '@' |
      sed 's/^#/# /g' |
      awk '
        NF == 0     { next }   # drop empty and whitespace-only lines
        length > 90 { next }   # drop lines with too long host names

        # Drop comments; matching on the leading "#" also catches
        # indented comments, which the sed stage does not touch.
        # The remaining words are junk first fields seen in some lists.
        { if ($1 ~ /^#/ || $1 == "malvertising" || $1 == "malware" ||
              $1 == "microsoft.com" || $1 == "yahoo.com" || $1 == "9." ||
              $1 == "markmail.org") next
        }

        # Drop records whose second field is a local name or one of the
        # known unwanted entries.
        { if ($2 == "localhost" || $2 == "localhost.localdomain" ||
              $2 == "local" || $2 == "broadcasthost" || $2 == "#[server" ||
              $2 == "130.211.230.53" || $2 == "sbc" || $2 == "0.0.0.0" ||
              $2 == "66.7.213.144" || $2 == "95.140.125.124" ||
              $2 == "yahoo.com" || $2 == "microsoft.com") next
        }

        # "<sink address> <host>" records yield the host; any other
        # record is taken to carry the host name in its first field.
        { ip = $1; host = $2 }
        ip != "127.0.0.1" && ip != "0.0.0.0" { print ip; next }
        host != ""  { print host }   # a sink address alone names no host
      ' > "$2"
  }

  sanitize_hosts "$host_in" "$host_out"


