#!/usr/bin/perl
#****************************************************************************
#****************************************************************************
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2 of the License, or (at your option)
# any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
# for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc., 59
# Temple Place - Suite 330, Boston, MA  02111-1307, USA.
#
#****************************************************************************
#****************************************************************************
#
#                     awffull_history_regen.pl
#
#
# Copyright (C) 2005 by Stephen McInerney spm@stedee.id.au
#
# $Id: awffull_history_regen.pl 400 2007-05-19 04:39:16Z steve $
#
# DESCRIPTION
# --------------
# Given a directory, this script will parse all old webalizer html (per month)
# files and spit out a complete history file (via STDOUT).
# This new history file will contain all years/months from all the
# webalizer html files.
#
# Designed for pre-processing an old webalizer install prior to an
# upgrade to AWFFull.
#
#****************************************************************************
#****************************************************************************
#  Modification History
# 11-Sep-2005 steve     Initial Creation
# 17-Sep-2005 steve     major tidy and functionalise
#****************************************************************************
#****************************************************************************
#
###  *** Sample text to parse for
#
# <TR><TH COLSPAN=3 ALIGN=center BGCOLOR="#C0C0C0">Monthly Statistics for July 2005</TH></TR>
# <TR><TH HEIGHT=4></TH></TR>
# <TR><TD WIDTH=380><FONT SIZE="-1">Total Hits</FONT></TD>
# <TD ALIGN=right COLSPAN=2><FONT SIZE="-1"><B>12217843</B></FONT></TD></TR>
# <TR><TD WIDTH=380><FONT SIZE="-1">Total Files</FONT></TD>
# <TD ALIGN=right COLSPAN=2><FONT SIZE="-1"><B>5384438</B></FONT></TD></TR>
# <TR><TD WIDTH=380><FONT SIZE="-1">Total Pages</FONT></TD>
# <TD ALIGN=right COLSPAN=2><FONT SIZE="-1"><B>1031846</B></FONT></TD></TR>
# <TR><TD WIDTH=380><FONT SIZE="-1">Total Visits</FONT></TD>
# <TD ALIGN=right COLSPAN=2><FONT SIZE="-1"><B>226836</B></FONT></TD></TR>
# <TR><TD WIDTH=380><FONT SIZE="-1">Total KBytes</FONT></TD>
# <TD ALIGN=right COLSPAN=2><FONT SIZE="-1"><B>39965939</B></FONT></TD></TR>
# <TR><TH HEIGHT=4></TH></TR>
# <TR><TD WIDTH=380><FONT SIZE="-1">Total Unique Sites</FONT></TD>
# <TD ALIGN=right COLSPAN=2><FONT SIZE="-1"><B>120135</B></FONT></TD></TR>
# <TR><TD WIDTH=380><FONT SIZE="-1">Total Unique URLs</FONT></TD>
#****************************************************************************

use strict;               # die on all bad programming
use Getopt::Long 2.33;    # Command Line Option Processing
use Pod::Usage;           # For inline documentation

###########################
## Global Variables
###########################
my $DATE = '/bin/date';    # Default path of the GNU date binary
my $exit_status = 0;       # Exit code handed back to the shell; 0 means success

## Options (each may be overridden from the command line)
my $opt_UsageDir    = '.';      # Directory searched for webalizer usage files
my $opt_DateCommand = $DATE;    # Path of the GNU date binary actually used

###########################
###########################
##         MAIN
###########################
###########################

# Parse the options, then build the history; the sub's return value doubles
# as the exit code of the script.
ProcessCommandLine();
$exit_status = RegenerateHistory();

# An exit code of 2 means no usable usage file was seen: say so on STDERR,
# keeping STDOUT clean for the history data.
print STDERR "Failed to find any Webalizer usage_YYYYMM.html files.\n"
    if ($exit_status == 2);

exit($exit_status);

##########################################################################
##########################################################################
####                          END OF MAIN
##########################################################################
##########################################################################


####             SUBROUTINES

##########################################################################
##########################################################################
## ProcessCommandLine
##       Parse the Commandline Arguments
##########################################################################
sub ProcessCommandLine {
    ## Parse @ARGV into the file-level option variables $opt_UsageDir and
    ## $opt_DateCommand.
    ##
    ## Takes no arguments and returns nothing useful. Exits via pod2usage()
    ## on bad options, --help or --man, and exits with status 1 when the
    ## configured date command is not an executable file.

    my $opt_Help;    # --help / -? : show the brief help screen
    my $opt_Man;     # --man       : show the full manual page

    Getopt::Long::Configure("gnu_getopt");    # Configure to use GNU style Options

    GetOptions("dir|d:s" => \$opt_UsageDir,
               "help|\?" => \$opt_Help,
               "man"     => \$opt_Man,
               "date:s"  => \$opt_DateCommand,
              )
        or pod2usage(-verbose => 0);

    if ($opt_Help) { pod2usage(-verbose => 1); }
    if ($opt_Man)  { pod2usage(-verbose => 2); }

    # The ":s" specs make the values optional, so "--dir=" or "--date=" store
    # an empty string. Treat that the same as the option being absent.
    if (!defined $opt_UsageDir || $opt_UsageDir eq "") {
        $opt_UsageDir = ".";
    }
    if (!defined $opt_DateCommand || $opt_DateCommand eq "") {
        $opt_DateCommand = $DATE;
    }

    if (!-x $opt_DateCommand) {
        # STDOUT carries the generated history file; diagnostics go to STDERR.
        printf(STDERR "Invalid Date command: %s\n", $opt_DateCommand);
        exit(1);
    }
} ## end sub ProcessCommandLine


##########################################################################
##########################################################################
## RegenerateHistory
##      Do the hard work - process the data, generate the output
##########################################################################
sub RegenerateHistory {
    ## Scan $opt_UsageDir for usage_YYYYMM.html files, rip the monthly totals
    ## out of each one and print one history line per year/month to STDOUT,
    ## ordered by year and then month.
    ##
    ## Returns 0 when at least one file yielded a usable "Monthly Statistics
    ## for <Month> <Year>" header, otherwise 2.
    ## Dies when the directory or one of the usage files cannot be opened.

    my %History;        # $History{year}{month 1..12}{FIELD} => value from the HTML
    my $nofiles = 2;    # Return the value 2 if no usable files are found

    opendir(my $usage_dir, $opt_UsageDir)
        or die "Cannot open directory $opt_UsageDir: $!";

    # Test for definedness: a directory entry called "0" is a false value.
    while (defined(my $usagefile = readdir($usage_dir))) {
        next if ($usagefile !~ /^usage_[0-9]{6}\.html$/);
        if (_ParseUsageFile("$opt_UsageDir/$usagefile", \%History)) {
            $nofiles = 0;
        }
    }
    closedir($usage_dir);

    foreach my $key_year (sort numerically (keys %History)) {
        foreach my $key_month (sort numerically (keys %{$History{$key_year}})) {
            my $stats = $History{$key_year}{$key_month};

            # Column order: month year hits files sites kbytes 1 days pages visits.
            # The literal 1 is kept exactly as the original output had it
            # (presumably the first day covered by the record).
            printf("%d %d %d %d %d %d 1 %d %d %d\n",
                   $key_month,        $key_year,
                   $stats->{HITS},    $stats->{FILES},
                   $stats->{SITES},   $stats->{KBYTES},
                   _DaysInMonth($key_month, $key_year),
                   $stats->{PAGES},   $stats->{VISITS}
                  );
        }    ## foreach $key_month
    }    ## foreach $key_year

    return ($nofiles);
} ## end sub RegenerateHistory

## _ParseUsageFile
##      Read one webalizer usage file and store its monthly totals in the
##      history hash. Arguments: path of the file, reference to the history
##      hash. Returns 1 if a usable month/year header was found, else 0.
##      All parser state is local to the call, so nothing leaks from one
##      file into the next.
sub _ParseUsageFile {
    my ($path, $history) = @_;

    # Month name as printed in the header => month number 1..12
    my %MoY;
    @MoY{qw(January February March April May June July
            August September October November December)} = (1 .. 12);

    # Label found on one line => hash key for the value on the NEXT line
    my %FieldFor = ('Hits'         => 'HITS',
                    'Files'        => 'FILES',
                    'Pages'        => 'PAGES',
                    'Visits'       => 'VISITS',
                    'KBytes'       => 'KBYTES',
                    'Unique Sites' => 'SITES',
                   );

    my $year;       # Year from the header; undefined until the header is seen
    my $month;      # Month number (1..12) from the header
    my $pending;    # Field whose value is expected on the line being read

    open(my $usage_fh, '<', $path) or die "Cannot open file $path: $!";

FILELINE:
    while (my $line = <$usage_fh>) {
        if ($line =~ />Monthly Statistics for /) {
            my ($month_name, $year_text) =
                $line =~ />Monthly Statistics for ([^\s<]+) ([0-9]+)/;

            # The year later becomes part of an external command's argument
            # and the month indexes a table, so refuse anything unexpected
            # instead of silently filing the data under the wrong month.
            if (!defined $month_name || !exists $MoY{$month_name}) {
                printf(STDERR "Skipping %s: cannot parse month/year from the 'Monthly Statistics for' header.\n",
                       $path);
                last FILELINE;
            }
            ($month, $year) = ($MoY{$month_name}, $year_text);
            next FILELINE;
        }

        # Nothing before the header is of interest
        next FILELINE if (!defined $year);

        # Exit this file, end of useful info
        last FILELINE if ($line =~ />Total Unique URLs</);

        if (defined $pending) {
            # The value line splits on the tag delimiters so that the number
            # between <B> and </B> is element 12.
            $history->{$year}{$month}{$pending} = (split /(<|>)/, $line)[12];
            undef $pending;
        }
        elsif ($line =~ />Total (Hits|Files|Pages|Visits|KBytes|Unique Sites)</) {
            $pending = $FieldFor{$1};
        }
    }    ## while ($line)
    close($usage_fh);

    return (defined $year ? 1 : 0);
} ## end sub _ParseUsageFile

## _DaysInMonth
##      Return the number of days in month (1..12) of the given year.
##      February is checked by asking the configured date command to format
##      "29 feb <year>"; it is taken to be 29 days long only when the command
##      answers with month 02 (GNU date is expected to reject the date in
##      non-leap years).
sub _DaysInMonth {
    my ($month, $year) = @_;

    my @DinM = (31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31);
    my $days = $DinM[$month - 1];
    return ($days) if ($month != 2);

    # List-form pipe open: the command is run directly, without a shell, so
    # neither the command path nor the year can be reinterpreted.
    my $testmonth;
    if (open(my $date_fh, '-|', $opt_DateCommand, '+%m', "--date=29 feb $year")) {
        $testmonth = <$date_fh>;
        close($date_fh);    # non-zero exit for an invalid date is expected
    }
    if (defined $testmonth && $testmonth == 2) {
        $days = 29;
    }
    return ($days);
} ## end sub _DaysInMonth


##########################################################################
##########################################################################
## numerically
##      Do a numerical sort
##########################################################################
sub numerically {
    # Comparator for sort(): orders the package variables $a and $b by
    # numeric value, smallest first.
    return $a <=> $b;
}


##########################################################################
##########################################################################
##########################################################################

__END__

=pod

=head1 NAME

awffull_history_regen.pl - Generate a history file from old Webalizer usage files

=head1 SYNOPSIS

awffull_history_regen.pl [options]

NB! Must have the GNU Date command!

=head1 OPTIONS

=over 8

=item B<--help>

Print a brief help message and exit.

=item B<--man>

Print the manual page and exit.

=item B<--dir directory>

The directory to use, looking for old webalizer usage_YYYYMM.html files. If
not present will use the current directory.

=item B<--date gnu-date-location>

This program requires the GNU date command; use this option if it is in a non-standard place.

=back

=head1 DESCRIPTION

Generate a history file from old Webalizer usage files.

The resulting history file is sent only to STDOUT. 

=cut

