#!/usr/local/bin/perl
# Read WWW log files and get the InterNIC information for the
# IP addresses in them, using the DB file produced by dns-terror
# to to map the IP addresses into domains first.  Saves the results
# to a second DB file.
#
# Options:
# -a		Query for all entries in the input DB file.
#		If not given, reads logs from stdin or files on command line.
# -c conn	Make conn simultaneous connections to the InterNIC, default 5.
# -d		Print debugging info.
# -e exclude	Regex to match IP addresses on the local machine to not use,
#		e.g., unrouted backnets or down interfaces.
# -i dbfile	Name the input DB file, default ip2host.db.
# -o dbfile	Name the output DB file, default dominfo.db.
# -m marksize	Print a notice every marksize entries.
# -v		Verbose: print progress messages.
#
# If you make more than some small number of whois queries in
# a given period of time (perhaps a minute) from a given IP address, 
# the InterNIC congestion control algorithm refuses connections
# from that address for a minute or two.
# Since they have a tight-fisted monopoly on the database, we
# work around their uncooperativeness by configuring a bunch of
# virtual IP addresses on the local machine, and cycling through
# them as the source address for the queries.  It's easier than writing
# a distributed program to make the queries.
#
# Now that there are multiple registrars to query, the process has become
# even more complicated, and this program has not been updated to handle it.
#
# Written by Josh Osborne <stripes@pix.net>
# and David MacKenzie <djm@djmnet.org>
# Please send comments and bug reports to <fastresolve-bugs@djmnet.org>

##############################################################################
#   Copyright 1999 UUNET, an MCI WorldCom company.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2, or (at your option)
# any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
# 02111-1307, USA.
##############################################################################

BEGIN { $prefix="/usr/local"; }
use lib "${prefix}/share/fastresolve";
use Getopt::Std;
use BerkeleyDB;
use Fcntl;
use IO::Socket;
use IO::Select;
use AddrCycle;

use strict 'vars';
use vars qw($opt_a $opt_c $opt_d $opt_e $opt_i $opt_o $opt_m $opt_v
	    $WhoisHost $WhoisPort $ReadTimeout $ErrorTimeout
	    @QueryDomain %QueryDomain %IP2Domain %Domain2Info
	    $NicBusy %Answers %Sock2Domain);

# Constants.
$WhoisHost = "whois.networksolutions.com";
$WhoisPort = "whois";
$ReadTimeout = 15;
$ErrorTimeout = 0.01;

main();
exit(0);

sub main
{
    my($domfile, $infofile, $maxconn);

    getopts('ac:de:i:o:m:v')
	or die "Usage: $0 [-adv] [-c conn] [-e exclude] [-i dbfile] [-o dbfile] [-m mark-size] [logfile...]\n";

    $SIG{PIPE} = 'IGNORE';
    select STDOUT;
    $| = 1;			# Instant gratification if output is logged.

    $maxconn = $opt_c || 5;

    $domfile = $opt_i || "ip2host.db";
    $infofile = $opt_o || "dominfo.db";
    tie_hashes($domfile, $infofile);

    if ($opt_a) {
	read_ipdb($domfile);
    } else {
	gzip_kludge();
	read_logs();
    }
    untie %IP2Domain;
    @QueryDomain = keys(%QueryDomain);
    untie %QueryDomain;

    query_nic($maxconn);
}

# $domfile is the input DB file.
# $infofile is the output DB file.
sub tie_hashes
{
    my($domfile, $infofile) = @_;

    tie(%IP2Domain, "BerkeleyDB::Btree",
	-Filename => $domfile, -Flags => DB_RDONLY)
	|| die "Can't tie to $domfile: $!\n";

    tie(%Domain2Info, "BerkeleyDB::Btree", -Filename => $infofile,
	-Flags => DB_CREATE|DB_TRUNCATE, -Mode => 0640)
	|| die "Can't tie to $infofile: $!\n";

    tie(%QueryDomain, "BerkeleyDB::Btree", -Flags => DB_CREATE|DB_TRUNCATE)
	|| die "Can't tie to memory!\n";
}

# Add $domain to %QueryDomain
# if it is suitable and we don't already have info about it.
sub add_domain
{
    my($domain) = @_;

    return if $domain =~ m/^[\d.]+$/;

    return unless queryable_subdomains($domain);

    # Remove the hostname, leaving the domain.
    $domain = remove_element($domain);

    return if defined $Domain2Info{$domain}; # Already know it.

    $QueryDomain{$domain} = 1;
}

# Add to %QueryDomain all entries in the input DB file.
sub read_ipdb
{
    my($domfile) = @_;
    my($ipaddr, $domain, $entry);

    warn "Reading $domfile\n";
    $entry = 1;
    while (($ipaddr, $domain) = each %IP2Domain) {
	if ($opt_v) {
	    warn "On entry $entry of $domfile\n"
		unless ($entry & 0xfff);
	    $entry++;
	}

	$domain = (unpack("IA*", $domain))[1];
	add_domain($domain) if $domain;
    }
    warn "Done looking at the DB file.\n";
}

# Magic for open(), via <>.
sub gzip_kludge
{
    foreach (@ARGV) {
	s/(.*\.(Z|z|gz))$/gzip -cd $1|/;
    }
}

# Add to %QueryDomain the entries in the input DB file
# that are found in the log files on stdin.
sub read_logs
{
    my($ipaddr, $domain);
    my($oldARGV) = '';

    while (<>) {
	if ($opt_v) {
	    warn "On line $. of $ARGV\n"
		unless ($. & 0xfff);
	}
	if ($ARGV ne $oldARGV) {
	    warn "Reading $ARGV\n";
	    $oldARGV = $ARGV;
	}
	
	m/^(\S+)/;
	$ipaddr = $1;
	# or:
	# $ipaddr = (split(/\s+/, $_, 2))[0];

	# Skip dotted quads that we can't resolve,
	# but pass through non-dotted-quads
	# (assume they're already resolved host names).
	if (exists($IP2Domain{$ipaddr})) {
	    $domain = (unpack("IA*", $IP2Domain{$ipaddr}))[1];
	    add_domain($domain) if $domain;
	} elsif ($ipaddr !~ m/^[\d.]+$/) {
	    add_domain($ipaddr);
	}
    }
    warn "Done looking at the log files.\n";
}

# False if not a domain that's handled by the InterNIC.
# True if $domain has at least two elements of domain below the host name.
# E.g., true for foo.bar.edu and foo.bar.baz.edu, but false for foo.edu.
sub queryable_subdomains
{
    my($domain) = @_;

    # Not done for int|mil|gov, because the InterNIC doesn't handle those.
    return $domain =~ m/\.[^.]+\.(edu|com|org|net)$/i;
}

# Strip off the first element of $domain.
# E.g., for foo.bar.baz.edu, return bar.baz.edu.
sub remove_element
{
    my($domain) = @_;

    $domain =~ s/^[^.]+\.//;
    return $domain;
}

# Return the number of elements in the domain.
# E.g., for foo.bar.edu return 3.
sub elements
{
    my($domain) = @_;

    return ($domain =~ tr/././) + 1;
}

# Make whois queries for the domains in @QueryDomain,
# up to $maxconn simultaneously.
# Put the results in %Domain2Info.
sub query_nic
{
    my($maxconn) = @_;
    my($ac) = new AddrCycle($opt_e);
    my($sel) = new IO::Select();
    # If we have to, we can resort to an exponential or linear backoff.
    my($wait) = 45;
    my($domain, $avail);

    warn "Will ask about: @QueryDomain\n" if $opt_d;

    while (@QueryDomain > 0 or $sel->count() > 0) {
	# min($maxconn, @QueryDomain)
	$avail = $maxconn < @QueryDomain ? $maxconn : @QueryDomain;
	# Keep the query pipeline full.
	# read_answers() didn't necessarily drain all of the pending queries.
	$avail -= $sel->count();
	warn "Making $avail more queries\n" if $opt_d;
	while ($avail-- > 0 and @QueryDomain > 0) {
	    $domain = shift @QueryDomain;
	    if (!exists($Answers{$domain}) and !exists($Domain2Info{$domain})) {
		$sel->add(request($ac->next_addr(), $domain));
	    } else {
		warn "$domain was already seen\n" if $opt_v;
	    }
	}

	$NicBusy = 0;
	if ($sel->count() > 0) {
	    read_answers($sel);
	}
	
	if ($NicBusy) {
	    warn "The InterNIC appears to be blowing us off, waiting $wait seconds\n";
	    warn "before trying again (" , scalar(@QueryDomain), " left)...\n";

	    sleep $wait;
	}
    }
}

{
    my($lastmark) = -1;

    # Submit a whois request for $domain, using $laddr as the source address
    # (in unpacked, ASCII form).
    # Return the socket handle for the connection.
    sub request
    {
	my($laddr, $domain) = @_;
	my($sock);
	# If we have to, we can resort to an exponential or linear backoff.
	my($wait) = 45;

	while (!($sock = IO::Socket::INET->new(PeerAddr => $WhoisHost,
					       PeerPort => $WhoisPort,
					       LocalAddr => $laddr,
					       Proto    => 'tcp'))) {
	    warn "Can't make socket: $!\n";
	    sleep $wait;
	}
	$sock->autoflush(1);
	$sock->sockopt(SO_KEEPALIVE, 1);

	$sock->syswrite("$domain\r\n", length($domain) + 2);

	$Sock2Domain{$sock} = $domain;
	$Answers{$domain} = '';

	if ($opt_v) {
	    warn "asking for $domain, ", scalar(@QueryDomain), " remaining\n";
	} elsif ($opt_m
		 and scalar(@QueryDomain) % $opt_m == 0
		 and scalar(@QueryDomain) != $lastmark) {
	    $lastmark = scalar(@QueryDomain);
	    warn "$lastmark remaining\n";
	}

	return $sock;
    }
}

# Process any pending input from the whois server connections.
sub read_answers
{
    my($sel) = @_;
    my($sock, $domain, $answer, $nread);

    foreach $sock ($sel->has_error($ErrorTimeout)) {
	$domain = $Sock2Domain{$sock};
	if ($sock->error()) {
	    warn "socket error\n";
	    read_error($sel, $sock, $domain);
	} else {
	    # Just EOF.
	    process_answer($sel, $sock, $domain);
	}
    }

    return if $sel->count() < 1;

    foreach $sock ($sel->can_read($ReadTimeout)) {
	$domain = $Sock2Domain{$sock};

	# Read "as much as possible" (for a whois response).
	eval {
	    local $SIG{ALRM} = sub { die "alarm\n" }; # NB: \n required
	    alarm $ReadTimeout;
	    $nread = $sock->sysread($Answers{$domain}, 1024 * 50,
				    length($Answers{$domain}));
	    alarm 0;
	};

	if ($@ or $nread < 0) {
	    if ($@ eq "alarm\n") {
		warn "read timed out: $!\n";
	    } elsif ($@) {
		warn "read error: $@\n";
	    } else {
		warn "stream error: $!\n";
	    }
	    read_error($sel, $sock, $domain);
	}

	# Try to detect the end of the answer in this iteration,
	# instead of holding onto the socket for another round.
	elsif ($nread == 0
	    or $Answers{$domain} =~ m/Domain servers in listed order/
	    or $Answers{$domain} =~ m/To single out one record/
	    or $Answers{$domain} =~ m/No match/i) {
	    process_answer($sel, $sock, $domain);
	}
    }
}

# There was an error during the query for
# $domain from socket handle $sock.
# Remove temporary hash entries and remove the query from select list $sel,
# and queue up $domain for another try.
sub read_error
{
    my($sel, $sock, $domain) = @_;

    push(@QueryDomain, $domain); # Don't forget this domain.
    $sel->remove($sock);
    $sock->close();
    delete $Sock2Domain{$sock};
    delete $Answers{$domain};
}

# Try to find an answer in %Answer for
# $domain from socket handle $sock.
# Remove temporary hash entries and remove the query from select list $sel.
sub process_answer
{
    my($sel, $sock, $domain) = @_;
    my($answer, $owner);

    $sel->remove($sock);
    $sock->close();
    delete $Sock2Domain{$sock};

    $answer = $Answers{$domain};
    delete $Answers{$domain};
    warn $domain, " answer is:\n", $answer, "\n" if $opt_d;

    if (!$answer) {
	warn "no answer for $domain, retrying\n";
	push(@QueryDomain, $domain); # Don't forget this domain.
    } elsif ($answer =~ m/^No match/mi) {
	$Domain2Info{$domain} = ""; # Negative caching.

	if (elements($domain) > 2) {
	    $domain = remove_element($domain);
	    if (!exists($Answers{$domain}) and !exists($Domain2Info{$domain})) {
		warn "asking for parent domain $domain\n" if $opt_v;
		# Do the parent domain next, to make a visual connection.
		unshift(@QueryDomain, $domain);
	    }
	} elsif ($opt_v) {
	    warn "no info for $domain\n";
	}
    } elsif ($answer =~ m/Your query limit has been exceeded/) {
	warn "InterNIC refused query\n";
	push(@QueryDomain, $domain); # Don't forget this domain.
	$NicBusy = 1;
    } else {
	$owner = undef;
	# Here we strip out everything but the owner, which is all we are
	# interested in.  It makes the DB file smaller, and keeps
	# Network Solutions from worrying that we're copying their database.
	$answer =~ s/^.*?this policy\.\n//s; # Skip the legal notice.
	if (($answer =~ s/^(.*?)\s*\(.*\)/$owner ||= &findowner($1)/meg) && $owner) {
	    $Domain2Info{$domain} = $owner;
	}
    }
}

# Do some fixing up on a domain owner from whois.
sub findowner
{
    my($data) = @_;

    return undef if $data eq "[No name]";
    $data =~ s/\"/\'/g;
    return $data;
}
