#!/usr/local/bin/perl
# ----------------------------------------------------------------------
#  wfb2sql - Converts the CIA World Factbook into SQL statements
#
#  $Id: wfb2sql,v 1.4 2004/02/06 19:42:08 ruut_ Exp $
#
#  Run 'wfb2sql --man' to view the man page.
#
#  You can run this file through either pod2man or pod2html to produce pretty
#  documentation in manual or html file format (these utilities are part of 
#  the Perl 5 distribution).
#
#  Written by Jan Schreiber <mail@jschreiber.com>
#  COPYRIGHT (C) 2003 Arbeitsbereich Datenbanken und Informationssysteme,
#	University of Tuebingen, Germany
#
# ----------------------------------------------------------------------
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#   the Free Software Foundation; either version 2 of the License, or
#  (at your option) any later version.
# ----------------------------------------------------------------------

require 5.004;
use strict 'vars';
use Getopt::Long;
use Config::General;
use Pod::Usage;

# Some globally available constants
my $VERSION_HIGH = 0;
my $VERSION_LOW = 6;

my $INDEX_FILE = 'index.html';
my $PROGRAM_NAME = "wfb2sql version $VERSION_HIGH.$VERSION_LOW\n".
	"COPYRIGHT (C) 2003 by Arbeitsbereich Datenbanken und Informationssysteme,\n".
	"University of Tuebingen, Germany\n\n".
	"This program comes with ABSOLUTELY NO WARRANTY.\n".
	"This is free software, and you are welcome to redistribute it\n".
	"under certain conditions. See the file COPYING for details.\n\n";

my (%code2name, %name2code, %country_data, @DEPENDENT_COUNTRIES, $TOTAL_BORDER, 
 %country_border, %ORGANIZATION_LIST);
my $tmp;
my $SCHEMA_PREFIX;
my $CURRENT_COUNTRY;

my %FIELDS = (
	'age_structure' => 'Age structure',
	'area' => 'Area',
	'background' => 'Background',
	'birth_rate' => 'Birth rate',
	'capital' => 'Capital',
	'coastline' => 'Coastline',
	'death_rate' => 'Death rate',
	'dependent_countries' => 'Dependent areas',
	'internet_users' => 'Internet users',
	'isps' => 'Internet Service Providers (ISPs)',
	'labor_force' => 'Labor force',
	'land_boundaries' => 'Land boundaries',
	'organizations' => 'International organization participation',
	'phone_mainlines' => 'Telephones - main lines in use',
	'phone_mobiles' => 'Telephones - mobile cellular',
	'population' => 'Population',
	'population_growth' => 'Population growth rate',
	'religions' => 'Religions'
);

# Options and their default values
my $BOUNDARIES = 1;
my $COMMUNICATIONS = 1;
my $DATABASE = '';
my $DATABASE_NAME = '';
my $DATA_ONLY = 0;
my $DEPENDENT_COUNTRIES = 1;
my $DIRECTORY = '.';
my $FLAGS = 1;
my $HELP = 0;
my $MAN = 0;
my $MAPS = 1;
my $ORGANIZATIONS = 1;
my $SCHEMA = 1;
my $SCHEMA_NAME = 'wfb';
my $SCHEMA_ONLY = 0;
my $SMALL_FLAGS = 0;
my $TEXT_FIELDS = 1;
my $TMP_DIR = '.';
my $VERBOSE = 0;


# Try to find the configuration from a config file
my $config_file = '';

# Find the home directory in a reasonably portable way
my $home = $ENV{'HOME'} || $ENV{'LOGDIR'} || (getpwuid($<))[7];

if (-e "wfb2sqlrc") {
	$config_file = "wfb2sqlrc";
} elsif (-e "$home/.wfb2sqlrc") {
	$config_file = "$home/.wfb2sqlrc";
}

if ($config_file ne '') {
	my %config = ParseConfig(
		-ConfigFile => $config_file,
		-AutoTrue => true
	);
	$BOUNDARIES = $config{'boundaries'}
		if exists$config{'boundaries'};
	$COMMUNICATIONS = $config{'communications'}
		if exists $config{'communications'};
	$DATABASE = $config{'database'} 
		if exists $config{'database'};
	$DATABASE_NAME = $config{'database-name'}
		if exists $config{'database-name'};
	$DATA_ONLY = $config{'data-only'}
		if exists $config{'data-only'};
	$DEPENDENT_COUNTRIES = $config{'dependent-countries'}
		if exists $config{'dependent-countries'};
	$DIRECTORY = $config{'directory'} 
		if exists $config{'directory'};
	$FLAGS = $config{'flags'}
		if exists $config{'flags'};
	$MAPS = $config{'maps'}
		if exists $config{'maps'};
	$ORGANIZATIONS = $config{'organizations'}
		if exists $config{'organizations'};
	$SCHEMA = $config{'schema'}
		if exists $config{'schema'};
	$SCHEMA_NAME = $config{'schema-name'}
		if exists $config{'schema-name'};
	$SCHEMA_ONLY = $config{'schema-only'}
		if exists $config{'schema-only'};
	$SMALL_FLAGS = $config{'small_flags'}
		if exists $config{'small_flags'};
	$TEXT_FIELDS = $config{'text-fields'}
		if exists $config{'text-fields'};
	$TMP_DIR = $config{'tmp-dir'}
		if exists $config{'tmp-dir'};
	$VERBOSE = $config{'verbose'}
		if exists $config{'verbose'};
}

# Process the options
GetOptions (
	'boundaries!' => \$BOUNDARIES,
	'communications!' => \$COMMUNICATIONS,
	'database=s' => \$DATABASE,
	'database-name=s' => \$DATABASE_NAME,
	'data-only' => \$DATA_ONLY,
	'dependent-countries!' => \$DEPENDENT_COUNTRIES,
	'directory=s' => \$DIRECTORY,
	'flags!' => \$FLAGS,
	'help|?' => \$HELP,
	'man' => \$MAN,
	'maps!' => \$MAPS,
	'organizations!' => \$ORGANIZATIONS,
	'tmp-dir=s' => \$TMP_DIR,
	'schema!' => \$SCHEMA,
	'schema-name=s' => \$SCHEMA_NAME,
	'schema-only' => \$SCHEMA_ONLY,
	'small-flags' => \$SMALL_FLAGS,
	'text-fields' => \$TEXT_FIELDS,
	'verbose!' => \$VERBOSE
) or pod2usage(
	-msg => $PROGRAM_NAME,
	-exitval => 1,
	-verbose => 1,
	-output => \*STDERR); 

pod2usage( 
	-msg => $PROGRAM_NAME,
	-exitval => 1,
	-verbose => 1,
	-output => \*STDERR) if $HELP;
pod2usage( 
	-msg => $PROGRAM_NAME,
	-exitval => 1,
	-verbose => 2 ) if $MAN;
pod2usage( 
	-msg => "You have to specify a database type.\n".$PROGRAM_NAME,
	-exitval => 1,
	-verbose => 1,
	-output => \*STDERR) if ($DATABASE ne 'db2' and $DATABASE ne 'postgresql' and $DATABASE ne 'mysql');

my $SEPARATOR = '/';

# check if directory really is a world factbook directory
if (!-e "$DIRECTORY$SEPARATOR$INDEX_FILE" and
    !-e "$DIRECTORY${SEPARATOR}print/af.html") {
	print<<ERROR;
Fatal error: '$DIRECTORY$SEPARATOR$INDEX_FILE' does not seem to be a valid factbook root directory. 

See 'wfb2sql --man' for details.
ERROR
exit;
}


if ($SCHEMA && $DATABASE ne 'mysql') {
	$SCHEMA_PREFIX = "$SCHEMA_NAME.";
} else {
	$SCHEMA_PREFIX = '';
}


if (!$DATA_ONLY) {
	&print_schema();
}

if (!$SCHEMA_ONLY) {
	&process_codes();

	&open_spool_files() if $DATABASE eq 'db2';

	print(STDERR "Pass 1: Country data...\n") if $VERBOSE;
	while (<$DIRECTORY${SEPARATOR}print/??.html>) {

		# Skip the 'world' page
		next if ($_ eq "$DIRECTORY${SEPARATOR}print/xx.html");
		%country_data = ();
		&process_country($_);
		&process_age_structure($country_data{$FIELDS{'age_structure'}});
		&process_country_area($country_data{$FIELDS{'area'}});
		if ($BOUNDARIES) {
			&process_borders($country_data{$FIELDS{'land_boundaries'}});
		}
		print_country();
	}
	
	print(STDERR "Pass 2: Dependent data...\n") if $VERBOSE;
	while (<$DIRECTORY${SEPARATOR}print/??.html>) {
		$TOTAL_BORDER = '';
		%country_border = ();
		@DEPENDENT_COUNTRIES = ();
		%ORGANIZATION_LIST = ();

		# Skip the 'world' page
		next if ($_ eq "$DIRECTORY${SEPARATOR}print/xx.html");
		%country_data = ();
		&process_country($_);
		if ($ORGANIZATIONS) {
			&process_organizations($country_data{$FIELDS{'organizations'}});
			&print_organizations();
		}
		if ($DEPENDENT_COUNTRIES) {
			&process_dependent_countries($country_data{$FIELDS{'dependent_countries'}});
			&print_dependent_countries();
		}
		if ($BOUNDARIES) {
			&process_borders($country_data{$FIELDS{'land_boundaries'}});
			&print_borders();
		}
		&print_flag() if ($FLAGS);
		&print_map() if ($MAPS);
	}
	&close_spool_files() if $DATABASE eq 'db2';
}


#
# Builds two hashes to convert names to codes and backwards.
#
sub process_codes {
	my ($name, $code);
	print STDERR "Processing country codes...\n" if $VERBOSE;

	open(INDEX_FILE, "$DIRECTORY$SEPARATOR$INDEX_FILE") 
		|| die("Can't open file $DIRECTORY$SEPARATOR$INDEX_FILE");
	while (<INDEX_FILE>) {
		if (/geos\/(..)\.html"\s+>(.*)</) {
			$name = &normalize($2);
			$code = uc($1);
			$code2name{$code} = $name;
			$name2code{$name} = $code;
		}
	}
	close(INDEX_FILE);
}


#
# Opens all required spool files
#
sub open_spool_files {
	open(COUNTRIES,
		">$TMP_DIR${SEPARATOR}countries.del") ||
	die("Can't open file '$TMP_DIR${SEPARATOR}countries.del'");

	if ($BOUNDARIES) {
		open(BOUNDARIES,
			">$TMP_DIR${SEPARATOR}boundaries.del") ||
		die("Can't open file '$TMP_DIR${SEPARATOR}boundaries.del'");
	}
	if ($DEPENDENT_COUNTRIES) {
		open(BELONGS_TO, 
			">$TMP_DIR${SEPARATOR}belongs_to.del") ||
		die("Can't open file '$TMP_DIR${SEPARATOR}belongs_to.del'");
	}
	if ($MAPS) {
		open(MAPS, 
			">$TMP_DIR${SEPARATOR}maps.del") ||
		die("Can't open file '$TMP_DIR${SEPARATOR}maps.del'");
	}
	if ($FLAGS) {
		open(FLAGS, 
			">$TMP_DIR${SEPARATOR}flags.del") ||
		die("Can't open file '$TMP_DIR${SEPARATOR}flags.del'");
	}
	if ($ORGANIZATIONS) {
		open(ORGANIZATIONS, 
			">$TMP_DIR${SEPARATOR}organizations.del") ||
		die("Can't open file '$TMP_DIR${SEPARATOR}organizations.del'");
	}
}


#
# Closes all created spool files
#
sub close_spool_files {
	close(BOUNDARIES) if ($BOUNDARIES);
	close(BELONGS_TO) if ($DEPENDENT_COUNTRIES);
	close(MAPS) if ($MAPS);
	close(FLAGS) if ($FLAGS);
	close(COUNTRIES);
	close(ORGANIZATIONS) if ($ORGANIZATIONS);
}
	

#
# Extracts information about a country
#
sub process_country {
	local ($_) = shift;
	my ($tmp, $td, $data, $rec);
	open(COUNTRY_FILE, $_) || die("Can't open file '$_'");

	$rec = 0;
	while(<COUNTRY_FILE>) {
		if ($rec == 1) {
			if (/^(.*)(<\/td>|<\/table>)/ and $td == 1) {
				$data .= &trim($1);
				$country_data{$tmp} = $data;
				$rec = 0;
			} elsif (/<td[^>]*>(.*)$/) {
				$data = &trim($1);
				$td = 1;
			} else {
				if (&trim($_) ne '') {
					$data .= &trim($_);
				}
			}	
			next;
		} 
		if (/<title>.*--(.*)<\/title>/) {
			my $name = &trim($1);
			print STDERR "Processing ".&normalize($name)."...\n" if $VERBOSE;
			$CURRENT_COUNTRY = $name2code{&normalize($name)};
		} elsif (/<div align="right">(.*):<\/div>/) {
			$tmp = $1;
			$td = 0;
			$rec = 1;
		}
	}
}


#
# creates a hash of all dependent countries
#
sub process_dependent_countries {
	my (@countries, $country);
	local($_) = shift;
#	s/^([^<]*)<.*/$1/;
	s/<.*//;
	@countries = split /,/;

	@DEPENDENT_COUNTRIES = ();
	
	foreach $country (@countries) {
		push @DEPENDENT_COUNTRIES, $name2code{&normalize($country)}; 
	}
}


#
# prints SQL statements for a country
#
sub print_country {
	my ($fields, $values, $count);
	if ($TOTAL_BORDER eq '') {
		$TOTAL_BORDER = 0;
	}
	if ($country_data{'total_area'} ne '') {
		$fields .= ", total_area";
		$values .= ", ".&quote($country_data{'total_area'});
	} elsif ($DATABASE eq 'db2') {
		$values .= ", ";
	}
	if ($country_data{'land_area'} ne '') {
		$fields .= ", land_area";
		$values .= ", ".&quote($country_data{'land_area'});
	} elsif ($DATABASE eq 'db2') {
		$values .= ", ";
	}
	if ($country_data{'water_area'} ne '') {
		$fields .= ", water_area";
		$values .= ", ".&quote($country_data{'water_area'});
	} elsif ($DATABASE eq 'db2') {
		$values .= ", ";
	}
	$tmp = $country_data{$FIELDS{'coastline'}};
	if ($tmp =~ /([0-9,\.]+) km/) {
		$fields .= ", coastline";
		$values .= ", ".&str2num($1);
	} elsif ($DATABASE eq 'db2') {
		$values .= ", ";
	}
	if ($BOUNDARIES) {
		$fields .= ", total_border";
		$values .= ", $TOTAL_BORDER";
	}

	$tmp = $country_data{$FIELDS{'population'}};
	if ($tmp =~ /^([0-9,\.]+)\s*(million)?/) {
		$count = &str2num($1);
		$count = $count*1000000 if ($2 eq 'million');
		$fields .= ", population";
		$values .= ", ".&str2num($count);
	} elsif ($tmp =~ /uninhabited/) {
		$fields .= ", population";
		$values .= ", 0";
	} elsif ($tmp =~ /no indigenous inhabitants/) {
		$fields .= ", population";
		$values .= ", 0";
	} elsif ($DATABASE eq 'db2') {
		$values .=", ";
	}
	
	if ($country_data{'age_young'}) {
		$fields .= ", p_young";
		$values .= ", ".$country_data{'age_young'};
	} elsif ($DATABASE eq 'db2') {
		$values .= ", ";
	}

	if ($country_data{'age_adult'}) {
		$fields .= ", p_adult";
		$values .= ", ".$country_data{'age_adult'};
	} elsif ($DATABASE eq 'db2') {
		$values .= ", ";
	}

	if ($country_data{'age_old'}) {
		$fields .= ", p_old";
		$values .= ", ".$country_data{'age_old'};
	} elsif ($DATABASE eq 'db2') {
		$values .= ", ";
	}

	$tmp = $country_data{$FIELDS{'population_growth'}};
	if ($tmp =~ /(\-?[0-9,\.]+)\s?%/) {
		$fields .= ", p_growth";
		$values .= ", ".&str2num($1);
	} elsif ($DATABASE eq 'db2') {
		$values .= ", ";
	}


	$tmp = $country_data{$FIELDS{'labor_force'}};
	if ($tmp =~ /^([0-9,\.]+)\s*(million)?/) {
		$count = &str2num($1);
		$count = $count*1000000 if ($2 eq 'million');
		$fields .= ", labor_force";
		$values .= ", ".&str2num($count);
	} elsif ($DATABASE eq 'db2') {
		$values .= ", ";
	}


	if ($COMMUNICATIONS) {
		$tmp = $country_data{$FIELDS{'phone_mainlines'}};
		if ($tmp =~ /^([0-9,\.]+)\s*(million)?/) {
			$count = &str2num($1);
			$count = $count*1000000 if ($2 eq 'million');
			$fields .= ", phone_mainlines";
			$values .= ", ".&str2num($count);
		} elsif ($DATABASE eq 'db2') {
			$values .= ", ";
		}

		$tmp = $country_data{$FIELDS{'phone_mobiles'}};
		if ($tmp =~ /^([0-9,\.]+)\s*(million)?/) {
			$count = &str2num($1);
			$count = $count*1000000 if ($2 eq 'million');
			$fields .= ", phone_mobiles";
			$values .= ", ".&str2num($count);
		} elsif ($DATABASE eq 'db2') {
			$values .= ", ";
		}

		$tmp = $country_data{$FIELDS{'internet_users'}};
		if ($tmp =~ /^([0-9,\.]+)\s*(million)?/) {
			$count = &str2num($1);
			$count = $count*1000000 if ($2 eq 'million');
			$fields .= ", internet_users";
			$values .= ", ".&str2num($count);
		}  elsif ($DATABASE eq 'db2') {
			$values .= ", ";
		}

		$tmp = $country_data{$FIELDS{'isps'}};
		if ($tmp =~ /^([0-9,\.]+)\s*(million)?/) {
			$count = &str2num($1);
			$count = $count*1000000 if ($2 eq 'million');
			$fields .= ", isps";
			$values .= ", ".&str2num($count);
		}  elsif ($DATABASE eq 'db2') {
			$values .= ", ";
		}
	}
	$tmp = $country_data{$FIELDS{'birth_rate'}};
	if ($tmp =~ /([0-9\.]+) births/) {
		$fields .= ", birth_rate";
		$values .= ", ".&str2num($1);
	}  elsif ($DATABASE eq 'db2') {
		$values .= ", ";
	}

	$tmp = $country_data{$FIELDS{'death_rate'}};
	if ($tmp =~ /([0-9\.]+) deaths/) {
		$fields .= ", death_rate";
		$values .= ", ".&str2num($1);
	} elsif ($DATABASE eq 'db2') {
		$values .= ", ";
	}


	my $capital = &quote(&trim($country_data{$FIELDS{'capital'}}));
	# cut everything behind a ';' or '('
	$capital =~ s/\(.*//g;
	$capital =~ s/;.*//g;

	# treat some special cases
	if ($CURRENT_COUNTRY eq 'bn') {
		$capital = 'Porto-novo';
	}

	if ($CURRENT_COUNTRY ne 'nr' and $capital ne '' and $capital ne 'none') {
		$fields .= ", capital";
		if ($DATABASE eq 'db2') {
			$values .= ", \"$capital\"";
		} else {	
			$values .= ", '$capital'";
		}
	} elsif ($DATABASE eq 'db2') {
		$values .= ", ";
	}

	if ($TEXT_FIELDS) {
		if ($country_data{$FIELDS{'religions'}}) {
			my $religions;
			$religions = $country_data{$FIELDS{'religions'}};
			# comment out the following three lines,
			# if you want to store the notes as well
			# WARNING! This will store some html tags as well
			if ($religions =~ /(.*)<br>/) {
				$religions = $1;
			}
			$fields .= ", religions";
			$values .= ", '".&quote($religions)."'";
		} elsif ($DATABASE eq 'db2') {
			$values .= ", ";
		}
	}

	if ($DATABASE eq 'db2') {
		print(COUNTRIES "\"$CURRENT_COUNTRY\",\"".&quote($code2name{$CURRENT_COUNTRY})."\"$values\n");
	} else {
		print("INSERT INTO ${SCHEMA_PREFIX}countries (id, name$fields) VALUES ('$CURRENT_COUNTRY', '".&quote($code2name{$CURRENT_COUNTRY})."' $values);\n");
	}
}

#
# create an array with all dependent arrays
#
sub print_dependent_countries {
	my $country;

	foreach $country (@DEPENDENT_COUNTRIES) {
		if ($DATABASE eq 'db2') {
			print(BELONGS_TO "\"$CURRENT_COUNTRY\",\"$country\"\n");
		} else {
			print("INSERT INTO ${SCHEMA_PREFIX}belongs_to (id1, id2) VALUES ('$CURRENT_COUNTRY', '$country');\n");
		}
	}	
}


#
# print the flag data
#
sub print_flag {
	my $FLAG = 'lg';
	$FLAG = '' if $SMALL_FLAGS;
	
	# check if the flag exists. some "countries" like Antarctica
	# don't have a flag
	if (!-e "$DIRECTORY${SEPARATOR}flags/".lc($CURRENT_COUNTRY)."-${FLAG}flag.gif") {
		return;
	}
		
	if ($DATABASE eq 'db2') {
		print(FLAGS "\"$CURRENT_COUNTRY\",\"$DIRECTORY${SEPARATOR}flags/".lc($CURRENT_COUNTRY)."-${FLAG}flag.gif\"\n");
	} elsif ($DATABASE eq 'postgresql') {
		print("INSERT INTO ${SCHEMA_PREFIX}flags (id, flag) VALUES ('$CURRENT_COUNTRY', lo_import('$DIRECTORY${SEPARATOR}flags/".lc($CURRENT_COUNTRY)."-${FLAG}flag.gif'));\n");
	} elsif ($DATABASE eq 'mysql') {
		print("INSERT INTO ${SCHEMA_PREFIX}flags (id, flag) VALUES ('$CURRENT_COUNTRY', LOAD_FILE('$DIRECTORY${SEPARATOR}flags/".lc($CURRENT_COUNTRY)."-${FLAG}flag.gif'));\n");
	}
}


#
# print the map data
#
sub print_map {
	if ($DATABASE eq 'db2') {
		print(MAPS "\"$CURRENT_COUNTRY\",\"$DIRECTORY${SEPARATOR}maps/".lc($CURRENT_COUNTRY)."-map.gif\"\n");
	} elsif ($DATABASE eq 'postgresql') {
		print("INSERT INTO ${SCHEMA_PREFIX}maps (id, map) VALUES ('$CURRENT_COUNTRY', lo_import('$DIRECTORY${SEPARATOR}maps/".lc($CURRENT_COUNTRY)."-map.gif'));\n");
	} elsif ($DATABASE eq 'mysql') {
		print("INSERT INTO ${SCHEMA_PREFIX}maps (id, map) VALUES ('$CURRENT_COUNTRY', LOAD_FILE('$DIRECTORY${SEPARATOR}maps/".lc($CURRENT_COUNTRY)."-map.gif'));\n");
	}
}



#
# create a list with all border countries 
#
sub process_borders {
	my (@countries, $country, $length);

	local($_) = shift;
	if (/<i>total:<\/i> ([0-9,\.]+) km/) {
		$TOTAL_BORDER = &str2num($1);
	} elsif (/^\s*0 km\s*$\s*/) {
		$TOTAL_BORDER = 0;
	}
	if (/<i>border countries:<\/i>([^<]*)/) {
		@countries = split /, /, $1;
		foreach (@countries) {
			if (/\s*(\D*)\s*([0-9,\.]+) km.*/) {
				$tmp = $1;
				$length = $2;
				$country_border{$name2code{&normalize($tmp)}} += 
					&str2num(&trim($length));
			}
		}
	}
}


#
# print INSERT statements for the boundaries relation
#
sub print_borders() {
	my $country;

	foreach $country (keys %country_border) {
		if ($DATABASE eq 'db2') {
			print(BOUNDARIES "\"$CURRENT_COUNTRY\",\"$country\",$country_border{$country}\n");
		} else {
			print("INSERT INTO ${SCHEMA_PREFIX}boundaries (id1, id2, b_length) VALUES ('$CURRENT_COUNTRY', '$country', $country_border{$country});\n");
		}
	}	
}



#
# Processes the age structure and creates some entries in %coutry_data
#
sub process_age_structure {
	local ($_) = @_;
	foreach (split /<br>/) {
		if (/<i>0-14 years:<\/i> ([0-9\.]*)%/) {
			$country_data{'age_young'} = $1;
		} 
		elsif (/<i>15-64 years:<\/i> ([0-9\.]*)%/) {
			$country_data{'age_adult'} = $1;
		} 
		elsif (/<i>65 years and over:<\/i> ([0-9\.]*)%/) {
			$country_data{'age_old'} = $1;
		} 
	}
}


#
# Processes the area field and creates some entries in %country_data
#
sub process_country_area {
	local ($_) = @_;
	my ($tmp, $count);
	foreach (split /<br>/) {
		if (/<i>total:<\/i> ([0-9\.,]+)\s*(million)? sq km/) {
			$count = &str2num($1);
			$count = $count*1000000 if ($2 eq 'million');
			$country_data{'total_area'} = &str2num($count);
		}
		elsif (/<i>water:<\/i> ([0-9\.,]+)\s*(million)? sq km/) {
			$count = &str2num($1);
			$count = $count*1000000 if ($2 eq 'million');
			$country_data{'water_area'} = &str2num($count);
		}
		elsif (/<i>land:<\/i> ([0-9\.,]+)\s*(million)? sq km/) {
			$count = &str2num($1);
			$count = $count*1000000 if ($2 eq 'million');
			$country_data{'land_area'} = &str2num($count);
		}
	}
}


#
# Extracts organization participation of a country
#
sub process_organizations {
	local ($_) = @_;
	my ($org);
	foreach (split /,/) {
		if (/none/) {
		} elsif (/\s*(.*)\s+\((.*)\)/) {
			$ORGANIZATION_LIST{&trim($1)} = &trim($2);
		} elsif (/\s*(.*)/) {
			$org = $1;
			if ($org =~ /G- /) {
				$org =~ s/\s//g;
			}
			if ($org =~ /(.*)<br>/) {
				$org = $1;
			}
			$ORGANIZATION_LIST{&trim($org)} = '';
		}
	}
}


sub print_organizations {
	foreach (keys %ORGANIZATION_LIST) {
		if ($ORGANIZATION_LIST{$_}) {
			if ($DATABASE eq 'db2') {
				print( ORGANIZATIONS "\"$CURRENT_COUNTRY\", \"$_\", \"$ORGANIZATION_LIST{$_}\"\n" );
			} else {
				print("INSERT INTO ${SCHEMA_PREFIX}organizations (id, name, role) VALUES ('$CURRENT_COUNTRY', '$_', '$ORGANIZATION_LIST{$_}');\n");
			}
		} else {
			if ($DATABASE eq 'db2') {
				print( ORGANIZATIONS "\"$CURRENT_COUNTRY\", \"$_\", NULL\n" );
			} else {
				print("INSERT INTO ${SCHEMA_PREFIX}organizations (id, name) VALUES ('$CURRENT_COUNTRY', '$_');\n");
			}
		}
	}
}


#
# removes all whitespaces from the end and beginning of a string
#
sub trim {
	my ($tmp) = @_;
	$tmp =~ s/^\s*(.*?)\s*$/$1/;
	return $tmp;
}


#
# converts a string to a number (removes the ,)
#
sub str2num {
	my ($tmp) = @_;
	$tmp =~ s/,//g;
	return $tmp;
}


sub quote {
	my ($tmp) = @_;
	if ($DATABASE ne 'mysql') {
		$tmp =~ s/'/''/g;
		$tmp =~ s/"/""/g;
	} else {
		$tmp =~ s/"/\\"/g;
		$tmp =~ s/'/\\'/g;
	}
	return $tmp;
}
#
# 'normalizes' a name, that is (xxx) is removed and everything behind a ,
# will be moved to the beginning of the string. 
# treats some special cases.
#
sub normalize {
	my ($tmp) = @_;
	$tmp =~ s/\(.*\)//;
	$tmp =~ s/(.*),(.*)/$2 $1/;
	$tmp =~ s/\s+/ /g;
	$tmp = &trim($tmp);
	# for some names, synonyms are used
	if ($tmp eq 'UAE') {
		$tmp = 'United Arab Emirates';
	}
	if ($tmp eq 'US') {
		$tmp = 'United States';
	}
	if ($tmp eq 'UK') {
		$tmp = 'United Kingdom';
	}
	if ($tmp =~ 'Azerbaijan') {
		$tmp = 'Azerbaijan';
	}
	if ($tmp =~ 'US Naval Base at Guantanamo Bay') {
		$tmp = 'United States';
	}
	return $tmp;
}


#
# Prints the schema data.
#
sub print_schema {
	print("-- CIA World Factbook\n");
	print("-- SQL Schema automatically created by wfb2sql version $VERSION_HIGH.$VERSION_LOW\n");

	if ($DATABASE eq 'db2' and $DATABASE_NAME ne '') {
		print("CONNECT TO $DATABASE_NAME;\n");
	}

	if ($DATABASE ne 'mysql' and $SCHEMA) {
		print("CREATE SCHEMA $SCHEMA_NAME;\n");
	}

	if ($DATABASE eq 'db2' and $SCHEMA) {
		print("COMMIT;\n\n");
	}

	print("CREATE TABLE ${SCHEMA_PREFIX}countries\n(\n");
  	print("  id CHAR(2) NOT NULL,\n");
	print("  name VARCHAR(100) NOT NULL,\n");
	print("  total_area DECIMAL(12,2)");
	if ($DATABASE ne 'mysql') {
		print("\n  CHECK (total_area >= 0)");
	}
	print(",\n");
	print("  land_area  DECIMAL(10,2)");
	if ($DATABASE ne 'mysql') {
		print("\n  CHECK (land_area >= 0)");
	}
	print(",\n");
	print("  water_area  DECIMAL(10,2)");
	if ($DATABASE ne 'mysql') {
		print("\n  CHECK (water_area >= 0)");
	}
	print(",\n");
	print("  coastline DECIMAL(12,0)");
	if ($DATABASE ne 'mysql') {
		print("\n  CHECK (coastline >= 0)");
	}
	print(",\n");
	if ($BOUNDARIES) {
		print("  total_border DECIMAL(10,2)");
		if ($DATABASE ne 'mysql') {
			print("\n  CHECK (total_border >= 0)");
		}
		print(",\n");
	}
	print("  population DECIMAL(13,2)");
	if ($DATABASE ne 'mysql') {
		print("\n  CHECK (population >= 0)");
	}
	print(",\n");
	print("  p_young DECIMAL(5,2)");
	if ($DATABASE ne 'mysql') {
		print("\n  CHECK (p_young BETWEEN 0 AND 100)");
	}
	print(",\n");
	print("  p_adult DECIMAL(5,2)");
	if ($DATABASE ne 'mysql') {
		print("\n  CHECK (p_adult BETWEEN 0 AND 100)");
	}
	print(",\n");
	print("  p_old DECIMAL(5,2)");
	if ($DATABASE ne 'mysql') {
		print("\n  CHECK (p_old BETWEEN 0 AND 100)");
	}
	print(",\n");
	print("  p_growth DECIMAL(5,2),\n");
	print("  labor_force DECIMAL(12,2)");
	if ($DATABASE ne 'mysql') {
		print("\n  CHECK (labor_force > 0)");
	}
	print(",\n");

	if ($COMMUNICATIONS) {
		print("  phone_mobiles INTEGER,\n");
		print("  phone_mainlines INTEGER,\n");
		print("  internet_users INTEGER,\n");
		print("  isps INTEGER,\n");
	}
	print("  birth_rate DECIMAL(7,2),\n");
	print("  death_rate DECIMAL(6,2),\n");
	print("  capital  VARCHAR(100),\n");

	# right now, there is only a text field describing the
	# religions in the country
	if ($TEXT_FIELDS) {
		print("  religions TEXT,\n");
	}
	
	if ($DATABASE ne 'mysql') {
		print("  CONSTRAINT check_area\n");
		print("  CHECK (land_area <= total_area),\n");
	}
	print("  PRIMARY KEY (id)\n");
	print(");\n\n");

	if ($ORGANIZATIONS) {
		print("CREATE TABLE ${SCHEMA_PREFIX}organizations\n(\n");
  		print("  id CHAR(2) NOT NULL,\n");
		print("  name VARCHAR(100) NOT NULL,\n");
		print("  role VARCHAR(100),\n");
		if ($DATABASE ne 'mysql') {
			print("  FOREIGN KEY\n");
			print("  (id) REFERENCES ${SCHEMA_PREFIX}countries,\n");
		}
		print("  PRIMARY KEY (id, name)\n);\n\n");
	}
	

	if ($BOUNDARIES) {
		print("CREATE TABLE ${SCHEMA_PREFIX}boundaries\n(\n");
  		print("  id1 CHAR(2) NOT NULL,\n");
		print("  id2 CHAR(2) NOT NULL,\n");
		print("  b_length DECIMAL(12,2)");
		if ($DATABASE ne 'mysql') {
			print("\n  CHECK (b_length >= 0)");
		}
		print(",\n");
		if ($DATABASE ne 'mysql') {
			print("  FOREIGN KEY\n");
			print("  (id1) REFERENCES ${SCHEMA_PREFIX}countries,\n");
			print("  FOREIGN KEY\n");
			print("  (id2) REFERENCES ${SCHEMA_PREFIX}countries,\n");
		}
		print("  PRIMARY KEY (id1, id2)\n);\n\n");
	}
	
	
	if ($DEPENDENT_COUNTRIES) {
		print("CREATE TABLE ${SCHEMA_PREFIX}belongs_to\n(\n");
	  	print("  id1 CHAR(2) NOT NULL,\n");
		print("  id2 CHAR(2) NOT NULL,\n");
		if ($DATABASE ne 'mysql') {
			print("  FOREIGN KEY\n");
			print("    (id1) REFERENCES ${SCHEMA_PREFIX}countries,\n");
			print("  FOREIGN KEY\n");
			print("    (id2) REFERENCES ${SCHEMA_PREFIX}countries,\n");
		}
		print("  PRIMARY KEY (id1, id2)\n);\n\n");
	}
	
	if ($MAPS) {
		print("CREATE TABLE ${SCHEMA_PREFIX}maps\n(\n");
		print("  id CHAR(2) NOT NULL,\n");
	  	print("  map ");
		if ($DATABASE eq 'postgresql') {
			print('oid');
		} else {
			print('BLOB');
		}
		print("(300 K)") if $DATABASE eq 'db2';
		print(",\n");
		if ($DATABASE ne 'mysql') {
			print("  FOREIGN KEY\n");
			print("  (id) REFERENCES ${SCHEMA_PREFIX}countries,\n");
		}
		print("  PRIMARY KEY (id)\n);\n\n");
	} # if ($maps)
	
	if ($FLAGS) {
		print("CREATE TABLE ${SCHEMA_PREFIX}flags\n(\n");
		print("  id CHAR(2) NOT NULL,\n");
		print("  flag ");
		if ($DATABASE eq 'postgresql') {
			print('oid');
		} else {
			print('BLOB');
		}
		print("(150 K)") if $DATABASE eq 'db2';
		print(",\n");
		if ($DATABASE ne 'mysql') {
			print("  FOREIGN KEY\n");
			print("    (id) REFERENCES ${SCHEMA_PREFIX}countries,\n");
		}
		print("  PRIMARY KEY (id)\n);\n\n");
	}
	print("COMMIT;\n\n") if $DATABASE eq 'db2';
	
	if ($DATABASE eq 'db2') {
		print("IMPORT FROM ${TMP_DIR}${SEPARATOR}countries.del OF DEL INSERT INTO ${SCHEMA_PREFIX}countries;\n");
		print("IMPORT FROM ${TMP_DIR}${SEPARATOR}belongs_to.del OF DEL INSERT INTO ${SCHEMA_PREFIX}belongs_to;\n") if $DEPENDENT_COUNTRIES;
		print("IMPORT FROM ${TMP_DIR}${SEPARATOR}boundaries.del OF DEL INSERT INTO ${SCHEMA_PREFIX}boundaries;\n") if $BOUNDARIES;
		print("IMPORT FROM ${TMP_DIR}${SEPARATOR}flags.del OF DEL MODIFIED BY lobsinfile INSERT INTO ${SCHEMA_PREFIX}flags;\n") if $FLAGS;
		print("IMPORT FROM ${TMP_DIR}${SEPARATOR}maps.del OF DEL MODIFIED BY lobsinfile INSERT INTO ${SCHEMA_PREFIX}maps;\n") if $MAPS;
		print("IMPORT FROM ${TMP_DIR}${SEPARATOR}organizations.del OF DEL INSERT INTO ${SCHEMA_PREFIX}organizations;\n") if $ORGANIZATIONS;
		print("COMMIT;\n\n");
	}
}
__END__

=head1 NAME

wfb2sql - Converts the CIA World Factbook into SQL statements

=head1 SYNOPSIS

B<wfb2sql> [options] --database=[db2|postgresql|mysql]

=head1 DESCRIPTION

B<wfb2sql> is a I<Perl> script that converts the CIA World Factbook HTML pages into SQL statements.

It was tested with the HTML version of the World Factbook 2003 and the World Factbook 2002 and probably has to be modified to extract data from further versions.

By default, all data that can be interpreted, is included in the SQL statements. If you want to restrict the data to only some information, use the B<--no>... command line options as described below. The data is written to STDOUT, so you can directly pass it to a database client.

In the current version, three database management systems are supported: I<IBM DB/2>, I<PostgreSQL> and I<mySQL>. The I<mysql> schema data produced by B<wfb2sql> uses a very basic SQL language, which, for example, doesn't include foreign keys. At least this schema should work with other DBMSs as well. If you plan to adapt B<wfb2sql> to other SQL dialects, focus on BLOB handling first. Not all databases support import of files in INSERT statements.

=head2 Notes for IBM DB/2

The I<db2> data had to be split into several files, since DB/2 doesn't support file inclusion in INSERT statements without a multimedia extender. Because of that, B<wfb2sql> outputs the schema data to STDOUT and creates extra files with spool data. By default, it tries to create the extra files in the current directory, but you may specify an alternative directory using the B<--tmp-dir> option. Please note, that you have to delete the extra files manually after script execution. To insert the world factbook data into a database, create a new database and run

wfb2sql [your options] | db2 -t

Use the I<--database-name> parameter to tell the script the name of your database. It will automatically add a CONNECT TO <database> statement to the produced sql code.

=head2 Notes for MySQL

The flag and map images are loaded into the database via the LOAD_FILE() function. The use of this function requires the FILE privilege. So, please make sure, that the user has sufficient rights to execute this function. Otherwise, MySQL will B<not> report an error and all the blob fields will just contain NULL. To insert the world factbook data into a database, create a database (e.g. factbook) and run

wfb2sql [your options] | mysql <databasename>

The script needs some time, so you might want to turn on the B<--verbose> option.

=head2 Notes for PostgreSQL

Please note that you must have Postgres superuser privilege to use server-side lo_import(), which is needed for maps and flags. To insert the world factbook data into a database, create a new database (with createdb) and run

wfb2sql [your options] | psql <databasename>

=head2 Configuration file

B<wfb2sql> supports the use of a configuration file called I<wfb2sqlrc>. The script first tries to find the file I<wfb2sqlrc> in the current directory. If it is not found, it tries to read the file I<.wfb2sqlrc> in the users home directory. If both files exist, the latter will be ignored. The file format is similar to standard UN*X configuration files. Comments begin with a B<#> and all behind the B<#> will be ignored. Options are specified by

B<optionname> = I<value>

and whitespaces are silently ignored. If a configuration file is load, these settings will override the defaults. Additional command line arguments will override the configuration file settings. Note that the options in the configuration file don't have leading --. A B<--no...> option might look like this in the configuration file (using the option B<--noboundaries> in this example):

B<boundaries = no>

You can as an alternative use B<boundaries = yes> to explicitly turn on the option B<boundaries>.

=head1 OPTIONS

=over 4

=item B<--database=dialect>

Selects an SQL dialect for the SQL output. Currently supported values are I<db2>, I<mysql> and I<postgresql>.

=item B<--database-name=name>

Adds a 'CONNECT TO name' to the schema definition. This is needed if you want to pipe the output directly into db2. For databases other than db2 this parameter is ignored.

=item B<--data-only>

Output only SQL data and emit schema information. Use this function if you already have created the tables.

=item B<--directory=dir>

Specifies the root directory of the factbook installation. The root directory containts the file I<index.html> and the I<print/>-directory.

=item B<--noboundaries>

Include no boundary information. Otherwise, the borders to surrounding countries are stored in an extra table called I<boundaries>, the total boundaries are stored in the I<country> entity.

=item B<--nocommunications>

Emit data on communication. Otherwise the following fields will be added:

=over 4

=item I<phone_mainlines>

Number of main lines in use.

=item I<phone_mobile>

Number of cellphones.

=item I<internet_users>

Number of internet users.

=item I<isps>

Number of Internet service providers.

=back

=item B<--nodependent-countries>

Emit dependent country information.

=item B<--noflags>

Emit flag data and schema information.

=item B<--nomaps>

Emit map data and schema information.

=item B<--noorganizations>

Emit data about organizations memberships.

=item B<--noschema>

Do not add a schema name (default: I<wfb>) when printing the SQL CREATE TABLE commands.

=item B<--notext-fields>

Do not include textual fields like background information.

=item B<--schema-name=name>

Specifies an alternate schema name. This applies only to databases which support schemata (DB/2 and PostgreSQL Version 7.3 or higher). If you want to disable the schema names, use the B<--noschema> option. The default schema name is I<wfb>.


=item B<--schema-only>

Prints only SQL schema data. Use this function if you just want to create the SQL tables first.

=item B<--small-flags>

Include only small flags. By default, the large flags will be included in the database. Use the B<--noflags> option to emit all kind of flags.

=item B<--temp-dir=dir>

The I<db2> creation method requires the use of temporary files. By default they will be placed in the current directory, but you may specify a different directory using this option.

=item B<-v>, B<--verbose>

Prints some more or less useful information to STDERR.

=back

=head1 REQUIRES

Perl 5.004, Getopt::Long, Pod::Usage, Options::General

Getopt::Long is available at http://search.cpan.org/author/JV/Getopt-Long-2.33/

Pod::Usage is available at http://search.cpan.org/author/MAREKR/PodParser-1.23/

Options::General is available at http://search.cpan.org/author/TLINDEN/Config-General-2.21/

=head1 AUTHOR

Jan Schreiber E<lt>I<mail@jschreiber.com>E<gt>

=head1 COPYRIGHT

COPYRIGHT (C) 2003 Arbeitsbereich Datenbanken und Informationssysteme,
Eberhard-Karls-Universitaet Tuebingen, Germany.

=head1 BUGS

"Cyprus" does not contain correct values for some entries since all
values consist of two values: on for the "Greek Cypriot area" and one
for the "Turkish Cypriot area". Maybe this data should be handled manually.

Please send bug reports to the author.

=head1 SEE ALSO

CIA World Factbook	http://www.cia.gov/cia/publications/factbook/

CIA World Factbook 2002	http://www.cia.gov/cia/download2002.htm

=cut
