#!/usr/local/bin/perl -w
######################################################################
#
# $Id: ftimes-dbm-make,v 1.7 2014/07/18 06:40:45 mavrik Exp $
#
######################################################################
#
# Copyright 2008-2014 The FTimes Project, All Rights Reserved.
#
######################################################################
#
# Purpose: Create or update a Berkeley database.
#
######################################################################

use strict;
use DB_File;
use File::Basename;
use Getopt::Std;

BEGIN
{
  ####################################################################
  #
  # The Properties hash is essentially private. Those parts of the
  # program that wish to access or modify the data in this hash need
  # to call GetProperties() to obtain a reference.
  #
  ####################################################################

  my (%hProperties);

  sub GetProperties
  {
    return \%hProperties;
  }
}

######################################################################
#
# Main Routine
#
######################################################################

  ####################################################################
  #
  # Punch in and go to work.
  #
  ####################################################################

  my ($phProperties);

  $phProperties = GetProperties();

  $$phProperties{'Program'} = basename(__FILE__);

  ####################################################################
  #
  # Get Options.
  #
  ####################################################################

  my (%hOptions);

  if (!getopts('d:F:o:', \%hOptions))
  {
    Usage($$phProperties{'Program'});
  }

  ####################################################################
  #
  # A database, '-d', is required.
  #
  ####################################################################

  if (!exists($hOptions{'d'}) || !defined($hOptions{'d'}) || length($hOptions{'d'}) < 1)
  {
    Usage($$phProperties{'Program'});
  }
  $$phProperties{'DbFile'} = $hOptions{'d'};

  ####################################################################
  #
  # The input field separator, '-F', is optional.
  #
  ####################################################################

  $$phProperties{'FieldSeparator'} = (exists($hOptions{'F'})) ? $hOptions{'F'} : "|";

  if ($$phProperties{'FieldSeparator'} !~ /^(\\t|[ ,:;=|])$/)
  {
    print STDERR "$$phProperties{'Program'}: Error='Invalid input field separator ($$phProperties{'FieldSeparator'}).'\n";
    exit(2);
  }

  $$phProperties{'FieldSeparator'} =~ s/^\|$/\\|/g; # Escape the pipe separator.

  ####################################################################
  #
  # The option list, '-o', is optional.
  #
  ####################################################################

  $$phProperties{'AlwaysInsert'} = 0;
  $$phProperties{'BeQuiet'} = 0;
  $$phProperties{'ForceNew'} = 0;

  $$phProperties{'Options'} = (exists($hOptions{'o'})) ? $hOptions{'o'} : undef;

  if (exists($hOptions{'o'}) && defined($hOptions{'o'}))
  {
    foreach my $sActualOption (split(/,/, $$phProperties{'Options'}))
    {
      foreach my $sTargetOption ('AlwaysInsert', 'BeQuiet', 'ForceNew')
      {
        if ($sActualOption =~ /^$sTargetOption$/i)
        {
          $$phProperties{$sTargetOption} = 1;
        }
      }
    }
  }

  ####################################################################
  #
  # If there isn't at least one argument left, it's an error.
  #
  ####################################################################

  if (scalar(@ARGV) < 1)
  {
    Usage($$phProperties{'Program'});
  }

  ####################################################################
  #
  # Tie OnDiskList to the db.
  #
  ####################################################################

  my (%hOnDiskList, $sTieFlags);

  $sTieFlags = ($$phProperties{'ForceNew'}) ? O_RDWR|O_CREAT|O_TRUNC : O_RDWR|O_CREAT;

  if (!tie(%hOnDiskList, "DB_File", $$phProperties{'DbFile'}, $sTieFlags, 0644, $DB_BTREE))
  {
    print STDERR "$$phProperties{'Program'}: File='$$phProperties{'DbFile'}' Error='$!'\n";
    exit(2);
  }

  ####################################################################
  #
  # Finalize variables.
  #
  ####################################################################

  my ($sKIndex, $sVIndex, $sRecordRegex);

  $sRecordRegex = qq(([^$$phProperties{'FieldSeparator'}]+)[$$phProperties{'FieldSeparator'}](.+));
  $sKIndex = 0;
  $sVIndex = 1;

  ####################################################################
  #
  # Iterate over input files.
  #
  ####################################################################

  my ($sAccepted, $sInserted, $sRejected, $sUpdated) = (0, 0, 0, 0);

  foreach my $sHdFile (@ARGV)
  {
    if (!open(FH, "< $sHdFile"))
    {
      if (!$$phProperties{'BeQuiet'})
      {
        print STDERR "$$phProperties{'Program'}: File='$sHdFile' Error='$!'\n";
      }
      next;
    }
    if ($$phProperties{'AlwaysInsert'})
    {
      while (my $sRecord = <FH>)
      {
        if (my @aFields = $sRecord =~ /^$sRecordRegex$/o)
        {
          $hOnDiskList{$aFields[$sKIndex]} = $aFields[$sVIndex];
          $sAccepted++;
          $sInserted++;
        }
        else
        {
          if (!$$phProperties{'BeQuiet'})
          {
            $sRecord =~ s/[\r\n]+$//;
            print STDERR "$$phProperties{'Program'}: File='$sHdFile' Record='$sRecord' Error='Record did not parse properly.'\n";
          }
          $sRejected++;
        }
      }
    }
    else
    {
      while (my $sRecord = <FH>)
      {
        if (my @aFields = $sRecord =~ /^$sRecordRegex$/o)
        {
          if (exists($hOnDiskList{$aFields[$sKIndex]}))
          {
            if ($hOnDiskList{$aFields[$sKIndex]} ne $aFields[$sVIndex])
            {
              $hOnDiskList{$aFields[$sKIndex]} = $aFields[$sVIndex];
              $sUpdated++;
            }
          }
          else
          {
            $sInserted++;
            $hOnDiskList{$aFields[$sKIndex]} = $aFields[$sVIndex];
          }
          $sAccepted++;
        }
        else
        {
          if (!$$phProperties{'BeQuiet'})
          {
            $sRecord =~ s/[\r\n]+$//;
            print STDERR "$$phProperties{'Program'}: File='$sHdFile' Record='$sRecord' Error='Record did not parse properly.'\n";
          }
          $sRejected++;
        }
      }
    }
    close(FH);
  }

  ####################################################################
  #
  # Print activity report.
  #
  ####################################################################

  my (@aCounts);

  push(@aCounts, "Accepted='$sAccepted'");
  push(@aCounts, "Rejected='$sRejected'");
  push(@aCounts, "Inserted='$sInserted'");
  push(@aCounts, ($$phProperties{'AlwaysInsert'}) ? "Updated='NA'" : "Updated='$sUpdated'");
  print join(' ', @aCounts), "\n";

  ####################################################################
  #
  # Clean up and go home.
  #
  ####################################################################

  untie(%hOnDiskList);

  1;


######################################################################
#
# Usage
#
######################################################################

sub Usage
{
  my ($sProgram) = @_;
  print STDERR "\n";
  print STDERR "Usage: $sProgram [-F separator] [-o option[,option[,...]]] -d db file [file ...]\n";
  print STDERR "\n";
  exit(1);
}


=pod

=head1 NAME

ftimes-dbm-make - Create or update a Berkeley database

=head1 SYNOPSIS

B<ftimes-dbm-make> B<[-F separator]> B<[-o option[,option[,...]]]> B<-d db> B<file [file ...]>

=head1 DESCRIPTION

This utility reads one or more input files (see ftimes-dbm-reap(1))
and creates or updates the specified database.  Databases are
implemented as BTrees and constructed using Perl's DB_File module.
Enumerating these databases yields the following format:

    key|value

The primary rule of engagement is that imported key/value pairs trump
existing pairs in the database.  This is true unless the pairs are
identical.  In that case the existing pairs are not modified.  If the
B<AlwaysInsert> option is set, imported pairs always trump existing
pairs.  Input files are processed in command-line order.  Typically,
files that are sorted yield much faster load times.

=head1 OPTIONS

=over 4

=item B<-d db>

Specifies the name of the database to create or update.

=item B<-F separator>

Specifies the input field separator.  Valid separators include the
following characters: tab '\t', space ' ', comma ',', colon ':',
semi-colon ';', equal sign '=', and pipe '|'.  The default separator
is a pipe.  Note that parse errors are likely to occur if the
specified separator appears in any of the field values.

=item B<-o option,[option[,...]]>

Specifies the list of options to apply.  Currently the following
options are supported:

=over 4

=item AlwaysInsert

If a key already exists, its value is overwritten.  This option
improves performance when the database is new or the B<ForceNew>
option has been set.  Enabling this option, however, disables update
tracking.

=item BeQuiet

Don't report errors (i.e., be quiet) while processing files.

=item ForceNew

Force the specified database to be truncated on open.

=back

=back

=head1 CAVEATS

Databases created from the same input may yield different, but
equivalent, DBM files.  Generally, DBM files are not portable across
different platforms and operating systems.  Therefore, the recommended
method for exchanging or verifying a database is to dump it to a
delimited file (see ftimes-dbm-dump(1)) and operate on that instead.

Take care to avoid mixing databaes that are based on different field
types (i.e., keys).  This can be easy to do if you're not careful.

=head1 AUTHOR

Klayton Monroe

=head1 SEE ALSO

ftimes-dbm-bash(1), ftimes-dbm-dump(1), ftimes-dbm-reap(1), ftimes-dbm-weed(1)

=head1 LICENSE

All documentation and code are distributed under same terms and
conditions as FTimes.

=cut
