#!/usr/bin/perl -w
#############################################################################
# GroupSTAT: Generates statistics from a local news spool                   #
# Copyright (C) H. Alex LaHurreau <alexdw@locl.net>, 1999-2000.             #
#############################################################################
# This program is based on StatNews, and so parts of this program are       #
# Copyright (C) Davide G. M. Salvetti <salve@debian.org>, 1998.             #
#############################################################################
# This program is free software; you can redistribute it and/or modify      #
# it under the terms of the GNU General Public License as published by      #
# the Free Software Foundation; either version 2 of the License, or         #
# (at your option) any later version.                                       #
#                                                                           #
# This program is distributed in the hope that it will be useful,           #
# but WITHOUT ANY WARRANTY; without even the implied warranty of            #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the             #
# GNU General Public License for more details.                              #
#                                                                           #
# You should have received a copy of the GNU General Public License         # 
# along with this program; if not, write to the Free Software               #
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA #
#                                                                           #
# Alternatively, if you are using Debian GNU/Linux, you should be able to   #
# find a copy of the GNU General Public License in:                         #
#     /usr/share/common-licenses/GPL                                        #
# OR: /usr/doc/copyright/GPL                                                #
#############################################################################


########################
### `use' DIRECTIVES ###
########################
use strict;
use locale;		# Use some locale-specific customizations
use POSIX qw(strftime);	# Hey Perl hackers!  Use strftime()
use News::Scan;		# Our wonderful OO backend!
use Mail::Address;	# Quick and easy mail address parsing
use Getopt::Long;
use File::Path;		# Yay!  I can create arbitrary paths!	
use File::Spec;		# Sometimes you just need cross platform compatability
use File::Basename;	# Ditto above
# Just in case someone decides to use MIME headers :-)
#use MIME::Words qw(:all);


###################################################
### READ COMMAND-LINE AND ENVIRONMENTAL OPTIONS ###
###################################################
my %opt;
if ($_ = $ENV{GROUPSTAT}) {@_ = split; unshift(@ARGV, @_);}
GetOptions(\%opt, 
		# Options which effect the stats themselves
		'days|d=i', 'num-major|N=i', 'num-minor|n=i',
		# Options which effect the formatting of the stats
		'width|w=i', 'headstrip|H!',
		# Options which effect input
		'spool|s=s', 'dotted|D!', 'input-dir|i=s',
		# Options which effect output
		'output-base|o=s', 'output-dir|O=s', 'output-file|f=s',
		# Debugging
		'verbose|v!', 'help|usage|h|?',
	  ) or die "Error parsing options, try --help";
if($opt{'help'}) { help(); exit 0; }


##############################
### INITIALIZING VARIABLES ###
##############################
use vars qw(	$VERSION $BACKEND
		$DAYS $RANKLARGE $RANKSMALL
		$WIDTH $HEADSTRIP
		$HOME $SPOOL_BASE $DOTTED $SPOOL
		$OUTPUT_BASE $OUTPUT_DIR $OUTPUT_FILE
		$VERBOSE
           );
$VERSION	= '0.12.1';
$BACKEND	= 'News::Scan';
$DAYS		= defined($opt{'days'})        ? $opt{'days'}        : 7;
$RANKLARGE	= defined($opt{'num-major'})   ? $opt{'num-major'}   : 25;
$RANKSMALL	= defined($opt{'num-minor'})   ? $opt{'num-minor'}   : 10;
$WIDTH		= defined($opt{'width'})       ? $opt{'width'}       : 72;
$HEADSTRIP	= defined($opt{'headstrip'})   ? $opt{'headstrip'}   : 1;
$SPOOL_BASE	= defined($opt{'spool'})       ? $opt{'spool'}
			: '/var/spool/news';	# FHS-compliant location
$DOTTED		= defined($opt{'dotted'})      ? $opt{'dotted'}      : 1;
$SPOOL		= $opt{'input-dir'};	# No default
$HOME		= defined($ENV{'HOME'})        ? $ENV{'HOME'}
			: File::Spec->rootdir;
$OUTPUT_BASE	= defined($opt{'output-base'}) ? $opt{'output-base'}
			: File::Spec->catdir($HOME, 'News', 'STATS');
$OUTPUT_DIR	= $opt{'output-dir'};	# No default
$OUTPUT_FILE	= $opt{'output-file'};	# No default
$VERBOSE	= $opt{'verbose'};	# Default == off
my @groups	= @ARGV	? @ARGV : 'rec.arts.drwho';
$| = 1;		# Use line-buffered output


#################################
### WARNING ABOUT --input-dir ###
#################################
if (($#groups > 0) && $SPOOL) {
	die "Can only scan one group when --input-dir is in effect";
}


###################################
### PREPARE TO USE $OUTPUT_FILE ###
###################################
if ($OUTPUT_FILE) { open_outfile($OUTPUT_FILE) }


##############################
### PRINT ON-SCREEN BANNER ###
##############################
print STDERR <<"END BANNER";
///GroupSTAT: Advanced Newsgroup Statistics version $VERSION
///Generates statistics from a local news spool

Copyleft (C) 2000 H. Alex LaHurreau, but is based on StatNews, which is
Copyleft (C) 1998 Davide G. M. Salvetti
This is free software; see the source for copying conditions.  There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
END BANNER


###########################################################
###                  LOOP OVER @groups                  ###
### NOTE: MOST OF THE REST OF THIS CODE IS IN THIS LOOP ###
###########################################################
foreach my $group (@groups) {

##############################################
### DETERMINE INPUT AND OUTPUT DIRECTORIES ###
###   (the $output_dir will go to waste    ###
###     if we are using $OUTPUT_FILE)      ###
##############################################
my $group_dir	= $group;
if ($DOTTED)	{ $group_dir =~ s#\.#/#g }
my $output_dir	= $OUTPUT_DIR || File::Spec->catdir($OUTPUT_BASE, $group_dir);
my $spool_dir	= $SPOOL      || File::Spec->catdir($SPOOL_BASE,  $group_dir);

unless($OUTPUT_FILE) {
	my $short_date	= strftime "%Y-%m-%d", gmtime($^T);
	my $output_name	= $OUTPUT_DIR ? "$group-$short_date" : "$short_date-$VERSION";
	my $output_file	= File::Spec->catfile($output_dir, $output_name);
	open_outfile($output_file);
}


###################
### SET UP SCAN ###
###################
print STDERR "\nScanning $DAYS days of ${group}...";
my $scan = new News::Scan     ( Group => $group,
				From  => 'spool',
				Spool => $spool_dir,
				Period => $DAYS,
			      );
$scan->scan or die "\nNews::Scan error: " . $scan->error;
print STDERR "done.\n";


######################
### GRAB SCAN INFO ###
######################
my($current, $from, $to, $articles, $chars, $body_chars, $orig_chars);
$current    = strftime "%a, %d %b %Y %H:%M:%S GMT", gmtime($^T);
$from       = strftime "%a, %d %b %Y %H:%M:%S GMT", gmtime($scan->earliest);
$to         = strftime "%a, %d %b %Y %H:%M:%S GMT", gmtime($scan->latest);
$articles   = $scan->articles;
$chars      = $scan->volume;
$body_chars = $scan->body_volume;
$orig_chars = $scan->orig_volume;


#########################
### PROCESS SCAN INFO ###
#########################
my($group_ocr, $sig_percent, $msg_per_day, $char_per_day, $kb_total, $kb_per_day);
$group_ocr    = 100*($orig_chars/$body_chars);
$sig_percent  = 100*($scan->signatures)/$articles;
$msg_per_day  = $articles/$DAYS;
$char_per_day = $chars/$DAYS;
$kb_total     = $chars/1024;
$kb_per_day   = $char_per_day/1024;


##########################
### MAJOR DATA OBJECTS ###
##########################
my($people, $subjects, $cross_groups, $task);
$people		= $scan->posters;
$subjects	= $scan->threads;
$cross_groups	= $scan->crossposts;


##########################
### PRINT STATS HEADER ###
##########################
my $old_fh = select OUTFILE;	# $old_fh should be STDOUT, but you never know
print "\n" if $OUTPUT_FILE;
print center(<<"END HEADER"), "\n\n";
GroupSTAT: Advanced Newsgroup Statistics Program
Version $VERSION (using $BACKEND)

Copyleft (C) 2000 H. Alex LaHurreau
Based on StatNews, Copyleft (C) 1998 Davide G. M. Salvetti
END HEADER

print <<"END EASY";
Newsgroup.................: $group
Stats Were Taken..........: $current
Stats Begin...............: $from
Stats End.................: $to
Days......................: $DAYS
Total No. of Articles.....: $articles
Total No. of Characters...: $chars
END EASY
printf("Total Volume..............: %d\n", $kb_total);
printf("Messages Per Day..........: %.1f\n", $msg_per_day);
printf("Characters Per Day........: %.1f\n", $char_per_day);
printf("Average Daily Volume......: %d kB\n", $kb_per_day);
printf("Total Posters This Week...: %d\n", (scalar keys %{$people}));
printf("Messages with Sigs........: %.2f%%\n", $sig_percent);
printf("Original Content Rating...: %.2f%%\n", $group_ocr);


######################
### RETRIEVE STATS ###
######################
my $i;	# counter variable used in many places

$task = "Top $RANKLARGE Prolific Posters";
print STDERR "Getting $task...";
$i = 0; line();
print underline("$task: Posts / Posts per Day / Percent Share");
foreach my $addr (sort { ($people->{$b}->articles) <=> ($people->{$a}->articles)
                          || $people->{$a}->attrib cmp $people->{$b}->attrib }
		  keys %{$people}) {
	my $name  = headstrip($people->{$addr}->attrib);
	my $posts = $people->{$addr}->articles;
	printf "%3d. %s: %3d %4.1f %4.1f%%\n", ++$i, dotline($WIDTH - 22, $name),
		$posts, $posts/$DAYS, 100*$posts/$articles;
	last if ($i >= $RANKLARGE);
}
print STDERR "done.\n";

$task = "Top $RANKLARGE Bandwidth-Slurping Posters";
print STDERR "Getting $task...";
$i = 0; line();
print underline("$task: kBytes / kBytes per Day / Percent Share");
foreach my $addr (sort { ($people->{$b}->volume) <=> ($people->{$a}->volume)
                          || $people->{$a}->attrib cmp $people->{$b}->attrib }
		  keys %{$people}) {
	my $name   = headstrip($people->{$addr}->attrib);
	my $bytes  = $people->{$addr}->volume;
	my $kbytes = $bytes/1024;
	printf "%3d. %s: %3d %4.1f %4.1f%%\n", ++$i, dotline($WIDTH - 22, $name),
		$kbytes, $kbytes/$DAYS, 100*$bytes/$chars;
	last if ($i >= $RANKLARGE);
}
print STDERR "done.\n";

$task = "Top $RANKLARGE Popular Threads";
print STDERR "Getting $task...";
$i = 0; line();
print underline("$task: Posts / Posts per Day / Percent Share");
foreach my $subj 
  (sort { ($subjects->{$b}->articles) <=> ($subjects->{$a}->articles)
	  || $a cmp $b }
   keys %{$subjects}) {
	my $posts = $subjects->{$subj}->articles;
	printf "%3d. %s: %3d %4.1f %4.1f%%\n", ++$i, dotline($WIDTH - 22, $subj),
		$posts, $posts/$DAYS, 100*$posts/$articles;
	last if ($i >= $RANKLARGE);
}
print STDERR "done.\n";

$task = "Top $RANKLARGE Bandwidth-Slurping Threads";
print STDERR "Getting $task...";
$i = 0; line();
print underline("$task: kBytes / kBytes per Day / Percent Share");
foreach my $subj 
  (sort { ($subjects->{$b}->volume) <=> ($subjects->{$a}->volume)
	  || $a cmp $b }
   keys %{$subjects}) {
	my $bytes  = $subjects->{$subj}->volume;
	my $kbytes = $bytes/1024;
	printf "%3d. %s: %3d %4.1f %4.1f%%\n", ++$i, dotline($WIDTH - 22, $subj),
		$kbytes, $kbytes/$DAYS, 100*$bytes/$chars;
	last if ($i >= $RANKLARGE);
}
print STDERR "done.\n";

# Fill up %me_too for the OCR stats
my %me_too;
foreach my $addr (keys %{$people}) {
	next unless ($people->{$addr}->articles >= 5);
	$me_too{$addr} = 
		$people->{$addr}->orig_volume / $people->{$addr}->body_volume;
}

$task = "Top $RANKSMALL Original Content Ratings";
print STDERR "Getting $task...";
$i = 0; line();
print underline("$task: (Original Bytes) / (All Bytes)");
foreach my $addr (sort { ($me_too{$b}) <=> ($me_too{$a})
                          || $people->{$a}->attrib cmp $people->{$b}->attrib }
		  keys %me_too) {
	my $name = headstrip($people->{$addr}->attrib);
	printf "%3d. %s: %5.1f%%\n", ++$i, dotline($WIDTH - 14, $name), 100*$me_too{$addr};
	last if ($i >= $RANKSMALL);
}
print STDERR "done.\n";

$task = "Bottom $RANKSMALL Original Content Ratings";
print STDERR "Getting $task...";
$i = 0; line();
print underline("$task: (Original Bytes) / (All Bytes)");
foreach my $addr (sort { ($me_too{$a}) <=> ($me_too{$b})
                          || $people->{$a}->attrib cmp $people->{$b}->attrib }
		  keys %me_too) {
	my $name = headstrip($people->{$addr}->attrib);
	printf "%3d. %s: %5.1f%%\n", ++$i, dotline($WIDTH - 14, $name), 100*$me_too{$addr};
	last if ($i >= $RANKSMALL);
}
print STDERR "done.\n";

$task = "Top $RANKSMALL Crossposting Groups";
print STDERR "Getting $task...";
$i = 0; line();
print underline("$task: Posts in Group");
foreach my $group (sort { ($cross_groups->{$b}) <=> ($cross_groups->{$a})
                           || $a cmp $b }
		   keys %{$cross_groups}) {
	my $posts = $cross_groups->{$group};
	printf "%3d. %s: %3d\n", ++$i, dotline($WIDTH - 11, $group), $posts;
	last if ($i >= $RANKSMALL);
}
print STDERR "done.\n";

line(); print overline("End of stats for $group");
print "\n" if $OUTPUT_FILE;
close OUTFILE unless $OUTPUT_FILE;
print STDERR "Statistics complete for ${group}!\n";
select $old_fh;	# in case I want to do something with it

###########################################################
###                END LOOP OVER @groups                ###
###########################################################
}
print STDERR "\nAll statistics complete!\n";
close OUTFILE if $OUTPUT_FILE;
exit 0;

###################
### SUBROUTINES ###
###################
# center: Center the argument string and return it.
sub center {
	my @lines = split("\n", join('', @_));
	my @formatted; my $i=0;
	foreach my $line (@lines) {
		$formatted[$i++] = ' ' x (($WIDTH - length($line))/2) . $line;
	}
	return join("\n", @formatted);
}

# dotline: Take the string and pad it right with dots.
sub dotline {
    my ($len, $line) = @_;
    my $fmt = sprintf("%%.%ds%%s", $len);
    return sprintf($fmt, $line, '.' x ($len - length($line)));
}

# underline: Underline the argument string and return it.
sub underline {
    my $line = shift;
    return sprintf("%s\n%s\n", $line, '=' x length($line));
}

# overline: Overline the argument string and return it. (based on underline :-)
sub overline {
    my $line = shift;
    return sprintf("%s\n%s\n", '=' x length($line), $line);
}

# line: print a newline to currently selected filehandle
sub line { print "\n" }

# headstrip: returns just the name part of the From: field
sub headstrip {
	if($HEADSTRIP) {
		my $from = shift;
		my @addrs = Mail::Address->parse($from);
		if (($#addrs > 0) && $VERBOSE) {
			# There should only be one address
			# listed in the From: header for news
			warn "Too many addresses";
		}
		my $addr = $addrs[0];
		unless(ref $addr) {
			warn "Didn't get Mail::Address object" if $VERBOSE;
			return $from;
		}
		return ($addr->name) || ($addr->user);
	} else { return shift }
}

# open_outfile: opens the specified file as OUTFILE (w/ paranoia)
sub open_outfile {
	my $output_file	= shift;
	my $output_dir	= dirname($output_file);
	stat $output_dir;
	if ((-e _) && (-d _) &&!(-w _)) { die "Can't write to $output_dir" }
	if ((-e _) &&!(-d _))           { die "$output_dir already exists, but isn't a directory" }
	if (!(-e _))                    { mkpath($output_dir, $VERBOSE, 0777) }
	open OUTFILE, ">$output_file"
		or die "Can't open ${output_file}\n\tfor output: $!";
}

# help: prints out usage information (search for GetOptions)
sub help {
	print <<"END USAGE INFO";
Usage: $0 [options] <newsgroup1> [<newsgroup2> ...]
  --usage, --help, -?	Print this message
  --verbose (*)		Turn on a few helpful debugging messages (d: off)
  --days		Number of days in the past to scan (d: 7)
  --num-major		Number of items shown in major categories (d: 25)
  --num-minor		Number of items shown in minor categories (d: 10)
  --width		Width of output lines in columns (d: 72)
  --headstrip (*)	Turn on `From:' header processing (d: on)
  --spool		Directory where news server spool is located
			(d: /var/spool/news)
  --dotted (*)		Replace `.' with `/' in newsgroup names (d: on)
  --input-dir		Absolute directory where news articles are located
  --output-base		Top-level directory to store STATS files in
			(d: \$HOME/News/STATS)
  --output-dir		Absolute directory to store STATS files in
  --output-file		Specific single file to store STATS in

Defaults are shown in parentheses like this: (d: default)
Options which are shown with a (*) may be disabled like this:
  --noheadstrip
END USAGE INFO
}

__END__

=pod

=head1 NAME

groupstat - Generates statistics from a local news spool

=head1 SYNOPSIS

B<groupstat> [I<options>] I<newsgroup1> [I<newsgroup2> I<...>]

=head1 DESCRIPTION

B<GroupSTAT> is an advanced newsgroup statistics program.  With
the help of the I<News::Scan> module, B<GroupSTAT> scans the
articles in the newsgroup(s) of your choice and prints various
statistics from them.  You could look at these statistics for
your own amusement, or you post them to a newsgroup.

=head1 OPTIONS

=head2 Debugging/Help Options

=over 8

=item B<--usage, --help, -?>

Prints a short usage message, then exits.

=item B<-v, --verbose>

Enables a few helpful debugging messages.  Might also be useful
for submitting bug reports.  This facility is turned off by
default.

=item B<--noverbose>

Turns off debugging messages.  (The opposite of B<--verbose>.)

=back

=head2 Scan Options

=over 8

=item B<-d, --days>

Sets the number of days in the past B<GroupSTAT> should scan.
The default is one week.  See L<"NOTES">.

=item B<-N, --num-major>

Sets the number of items printed in the "major" categories.
Both of the 'Posters' and 'Threads' categories are considered
major.  The default is twenty-five.

=item B<-n, --num-minor>

Sets the number of items printed in the "minor" categories.
Both of the 'Original Content Ratings' categories and the
crossposting category are considered minor.  The default is
ten.

=back

=head2 Formatting Options

=over 8

=item B<-w, --width>

Sets the width of the printed lines in columns.  The default
of 72 should be acceptable for most purposes.

=item B<-H, --headstrip>

This option enables the use of the C<headstrip> routine.  This
routine attempts to remove the email address from poster names
before printing out results.  If you do not want this, or if
your I<Mail::Address> module is buggy, see the next option.
The default is to turn on C<headstrip>.

=item B<--noheadstrip>

Disables the C<headstrip> routine.  See the previous option
for details.

=back

=head2 Input Options

=over 8

=item B<-s, --spool>

Sets the base directory of your local news spool.  This
specifies the root of your spool, B<not> the exact directory
where articles are stored.  If you wish to specify I<exactly>
where the articles were stored, use the B<--input-dir>
option.  The default spool directory is F</var/spool/news>.

=item B<-i, --input-dir>

Sets the exact directory where articles are stored.  These
articles must be text files each containing exactly one
article, and they must have name consisting only of numbers.
F<getnews> does this automatically for you.  If you are
running a local news server with a spool, use the
B<--spool> option instead.  There is no default, setting
this option overrides the B<--spool> default.

=item B<-d, --dotted>

Replace '.' with '/' in newsgroup names.  This only applies
when using the --spool or --output-base options.  The default
is to do the replacement.

=item B<--nodotted>

Turn off the replacement done by the B<--dotted> option.

=back

=head2 Output Options

=over 8

=item B<-o, --output-base>

Sets the base directory to store stats files in.  These
files are automatically stored by group, and then by
date.  If any directories are missing, they will be
created (if possible).  This does B<not> specify the
exact directory to store stats files in, use the
B<--output-dir> option for that.  If you want to store
stats in a specific file, use the B<--output-file>
option.  The default is to store stats under 
F<~/News/STATS>.

=item B<-O, --output-dir>

Sets the exact directory to store stats files in.
The stats for each group are stored in files with
names containing the newsgroup name and the date.
If any directories are missing, they will be created
(if possible).  There is no default, setting the
B<--output-dir> option overrides the B<--output-base>
default.

=item B<-f, --output-file>

Specifies a single file to store stats in.  If more
than one newsgroup was scanned, this file will
contain multiple scan results.  If any directories
are missing in the path to B<--output-file>, they
will be created (if possible).  Setting
B<--output-file=-> will send results to standard
output (which can be piped to a pager like C<more>
or C<less>).  There is no default, setting the
B<--output-file> option overrides the B<--output-base>
default.

=back

=head1 ENVIRONMENT

B<GroupSTAT> reads the contents of the environmental
variable C<GROUPSTAT> on startup.  If you want to set
default options for this program, put them here.  They
will be read as if they were normal command-line options.

B<GroupSTAT> also makes use of the C<HOME> variable to
determine where to place output files in the event
that none of the B<--output> options were specified.

=head1 EXAMPLES

Scan 'rec.arts.drwho' from seven days ago until now,
getting articles from C</var/spool/news>, putting the
output files into C<~/News/STATS>:

	groupstat rec.arts.drwho

Scan three days of 'alt.sysadmin.recovery', getting articles
from C</tmp/asr> and putting output into C</tmp/asr-stats.txt>:
	groupstat -i=/tmp/asr -f=/tmp/asr-stats.txt alt.sysadmin.recovery

Scan the top 50's for 'news.admin.net-abuse.usenet' with
default options:

	groupstat -N50 -n50 news.admin.net-abuse.usenet

=head1 NOTES

In order to use the B<--days> option, you must have that
many days of articles in your "spool".  If you don't,
B<GroupSTAT> will not correct your error and the stats
will be skewed.  This means that if you want to use the
default, you must have I<at least> a week's worth of
articles on spool.

If you are constantly resetting the defaults, you may
want to define the C<GROUPSTAT> environmental variable.

In F<~/.bashrc> (Unix platforms):

	export GROUPSTAT=options

In F<AUTOEXEC.BAT> (M$-DoS type systems):

	SET GROUPSTAT=options

=head1 WARNINGS

If you post statistics about a newsgroup to that newsgroup
on a regular basis, you may end up starting a series of
everlasting flamewars.  Be warned!

=head1 BUGS

Lots and lots.  :-)  See the F<BUGS> file that came with
B<GroupSTAT>.

=head1 AUTHOR

B<GroupSTAT> was written by H. Alex LaHurreau
E<lt>alexdw@locl.netE<gt>.  

=head1 HISTORY

B<GroupSTAT> is partially based on B<StatNews>, which was
written by Davide G. M. Salvetti E<lt>salve@debian.orgE<gt>.

=head1 COPYRIGHT

Copyright  1999-2000 H. Alex LaHurreau.
This is free software; see the source for copying  
conditions.  There is NO warranty; not even for 
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

=head1 SEE ALSO

You may find L<News::Scan> of interest.  See the
author's homepage (http://www.locl.net/homes/alexdw/)
for more wonderful things.  :-)

=cut

