#!/usr/bin/perl

eval 'exec /usr/bin/perl  -S $0 ${1+"$@"}'
    if 0; # not running under some shell

use strict;
use warnings FATAL => 'all';

# This is mysql-duplicate-key-checker, a program to analyze MySQL tables for
# duplicated or redundant indexes and foreign key constraints.  It is part of
# MySQL Toolkit (http://mysqltoolkit.sourceforge.net).
# 
# This program is copyright (c) 2007 Baron Schwartz.
# Feedback and improvements are welcome.
#
# THIS PROGRAM IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED
# WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
# MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
#
# This program is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation, version 2; OR the Perl Artistic License.  On UNIX and similar
# systems, you can issue `man perlgpl' or `man perlartistic' to read these
# licenses.
#
# You should have received a copy of the GNU General Public License along with
# this program; if not, write to the Free Software Foundation, Inc., 59 Temple
# Place, Suite 330, Boston, MA  02111-1307  USA.

# ###########################################################################
# This is a combination of modules and programs in one -- a runnable module.
# http://www.perl.com/pub/a/2006/07/13/lightning-articles.html?page=last
# Or, look it up in the Camel book on pages 642 and 643 in the 3rd edition.
# ###########################################################################
package IndexChecker;

use List::Util qw(min);

sub new {
   bless {}, shift;
}

sub find_engine {
   my ( $self, $ddl, $opts ) = @_;
   my ( $engine ) = $ddl =~ m/\) (?:ENGINE|TYPE)=(\w+)/;
   return $engine || undef;
}

# The general format of a key is
# [FOREIGN|UNIQUE|PRIMARY|FULLTEXT|SPATIAL] KEY `name` [USING BTREE|HASH] (`cols`).
sub find_keys {
   my ( $self, $ddl, $opts ) = @_;

   # Find and filter the indexes.
   my @indexes = 
      grep { $_ !~ m/FOREIGN/ }
      $ddl =~ m/((?:\w+ )?KEY .+\))/mg;

   # Make allowances for HASH bugs in SHOW CREATE TABLE.  A non-MEMORY table
   # will report its index as USING HASH even when this is not supported.  The
   # true type should be BTREE.  See http://bugs.mysql.com/bug.php?id=22632
   my $engine = $self->find_engine($ddl);
   if ( $engine !~ m/MEMORY|HEAP/ ) {
      @indexes = map { $_ =~ s/USING HASH/USING BTREE/; $_; } @indexes;
   }

   my @keys = map {
      my ( $struct, $cols ) = $_ =~ m/(?:USING (\w+))? \((.+)\)/;
      my ( $special ) = $_ =~ m/(FULLTEXT|SPATIAL)/;
      $struct = $struct || $special || 'BTREE';
      my ( $name ) = $_ =~ m/KEY `(.*?)` \(/;

      # MySQL pre-4.1 supports only HASH indexes.
      if ( $opts->{version} lt '004001000' && $engine =~ m/HEAP|MEMORY/i ) {
         $struct = 'HASH';
      }

      {
         struct   => $struct,
         cols     => $cols,
         name     => $name || 'PRIMARY',
      }
   } @indexes;
   return \@keys;
}

sub find_fks {
   my ( $self, $ddl, $opts ) = @_;

   my @fks = $ddl =~ m/CONSTRAINT .* FOREIGN KEY .* REFERENCES [^\)]*\)/mg;

   my @result = map {
      my ( $name ) = $_ =~ m/CONSTRAINT `(.*?)`/;
      my ( $fkcols ) = $_ =~ m/\(([^\)]+)\)/;
      my ( $cols )   = $_ =~ m/REFERENCES.*?\(([^\)]+)\)/;
      my ( $parent ) = $_ =~ m/REFERENCES (\S+) /;
      if ( $parent !~ m/\./ ) {
         $parent = "`$opts->{database}`.$parent";
      }
      {  name   => $name,
         parent => $parent,
         cols   => $cols,
         fkcols => $fkcols,
      };
   } @fks;
   return \@result;
}

sub find_duplicate_keys {
   my ( $self, $keys, $opts ) = @_;
   my @keys = @$keys;
   my %seen; # Avoid outputting a key more than once.
   my @result;

   foreach my $i ( 0..$#keys - 1 ) {
      foreach my $j ( $i+1..$#keys ) {
         my $i_cols = $keys[$i]->{cols};
         my $j_cols = $keys[$j]->{cols};
         my $len    = min(length($i_cols), length($j_cols));
         if ( $opts->{ignore_order} ) {
            $i_cols = join(',', sort(split(/`/, $i_cols)));
            $j_cols = join(',', sort(split(/`/, $j_cols)));
         }
         if ( (($keys[$i]->{struct} eq $keys[$j]->{struct}) || $opts->{ignore_type})
            && substr($i_cols, 0, $len) eq substr($j_cols, 0, $len))
         {
            push @result, $keys[$i] unless $seen{$i}++;
            push @result, $keys[$j] unless $seen{$j}++;
         }
      }
   }

   # If the key ends with a prefix of the primary key, it's a duplicate.
   if ( $opts->{clustered} && $opts->{engine} =~ m/^(?:InnoDB|solidDB)$/ ) {
      my $i = 0;
      my $found = 0;
      while ( $i < @keys ) {
         if ( $keys[$i]->{name} eq 'PRIMARY' ) {
            $found = 1;
            last;
         }
         $i++;
      }
      if ( $found ) {
         my $pkcols = $keys[$i]->{cols};
         KEY:
         foreach my $j ( 0..$#keys ) {
            next KEY if $i == $j;
            my $suffix = $keys[$j]->{cols};
            SUFFIX:
            while ( $suffix =~ s/`[^`]+`,// ) {
               my $len = min(length($pkcols), length($suffix));
               if ( (($keys[$i]->{struct} eq $keys[$j]->{struct}) || $opts->{ignore_type})
                  && substr($suffix, 0, $len) eq substr($pkcols, 0, $len))
               {
                  push @result, $keys[$i] unless $seen{$i}++;
                  push @result, $keys[$j] unless $seen{$j}++;
                  last SUFFIX;
               }
            }
         }
      }
   }

   return \@result;
}

sub find_duplicate_fks {
   my ( $self, $fks, $opts ) = @_;
   my @fks = @$fks;
   my %seen; # Avoid outputting a fk more than once.
   my @result;
   foreach my $i ( 0..$#fks - 1 ) {
      foreach my $j ( $i+1..$#fks ) {
         # A foreign key is a duplicate no matter what order the columns are in, so
         # re-order them alphabetically so they can be compared.
         my $i_cols = join(', ', map { "`$_`" } sort($fks[$i]->{cols} =~ m/`([^`]+)`/g));
         my $j_cols = join(', ', map { "`$_`" } sort($fks[$j]->{cols} =~ m/`([^`]+)`/g));
         my $i_fkcols = join(', ', map { "`$_`" } sort($fks[$i]->{fkcols} =~ m/`([^`]+)`/g));
         my $j_fkcols = join(', ', map { "`$_`" } sort($fks[$j]->{fkcols} =~ m/`([^`]+)`/g));
         if ( $fks[$i]->{parent} eq $fks[$j]->{parent}
               && $i_cols eq $j_cols
               && $i_fkcols eq $j_fkcols
         ) {
            push @result, $fks[$i] unless $seen{$i}++;
            push @result, $fks[$j] unless $seen{$j}++;
         }
      }
   }
   return \@result;
}

# ###########################################################################
# And now for the "program".
# ###########################################################################
package main;

use DBI;
use English qw(-no_match_vars);
use Getopt::Long;
use List::Util qw(max);
use Term::ReadKey;

our $VERSION = '1.1.0';
our $DISTRIB = '848';
our $SVN_REV = sprintf("%d", q$Revision: 831 $ =~ m/(\d+)/g);

$OUTPUT_AUTOFLUSH = 1;

if ( !caller ) {

   # ############################################################################
   # Get configuration information.
   # ############################################################################

   my @opt_spec = (
      { s => 'allatonce|s',    d => 'Print only once, instead of one DB at a time' },
      { s => 'allstruct|a',    d => 'Compare indexes with different structs (BTREE, HASH, etc)' },
      { s => 'askpass',        d => 'Prompt for password for connections' },
      { s => 'clustered|c',    d => 'PK columns appended to secondary key is duplicate' },
      { s => 'databases|d=s',  d => 'Only do this comma-separated list of databases' },
      { s => 'defaults-file|F=s', d => 'Only read default options from the given file' },
      { s => 'function|f=s',   d => 'Do f=foreign keys, k=keys.  Default is do both.' },
      { s => 'help',           d => 'Show this help message' },
      { s => 'host|h=s',       d => 'Connect to host' },
      { s => 'ignoredb|g=s',   d => 'Ignore this comma-separated list of databases' },
      { s => 'ignoreorder',    d => 'Ignore index order so KEY(a,b) duplicates KEY(b,a)' },
      { s => 'ignoretbl|n=s',  d => 'Ignore this comma-separated list of tables' },
      { s => 'password|p=s',   d => 'Password to use when connecting' },
      { s => 'port|P=i',       d => 'Port number to use for connection' },
      { s => 'socket|S=s',     d => 'Socket file to use for connection' },
      { s => 'tab|b',          d => 'Output separated with tabs' },
      { s => 'tables|t=s',     d => 'Only do this comma-separated list of tables' },
      { s => 'user|u=s',       d => 'User for login if not current user' },
      { s => 'verbose|v',      d => 'Output everything, not just dupes' },
      { s => 'version',        d => 'Output version information and exit' },
   );

   # This is the container for the command-line options' values to be stored in
   # after processing.  Initial values are defaults.
   my %opts = (
      d => '',
      g => '',
      t => '',
      n => '',
      f => 'fk',
      v => 0,
      b => 0,
      a => 0,
      s => 0,
   );
   # Post-process...
   my %opt_seen;
   foreach my $spec ( @opt_spec ) {
      my ( $long, $short ) = $spec->{s} =~ m/^([\w-]+)(?:\|([^!+=]*))?/;
      $spec->{k} = $short || $long;
      $spec->{l} = $long;
      $spec->{t} = $short;
      $spec->{n} = $spec->{s} =~ m/!/;
      $opts{$spec->{k}} = undef unless defined $opts{$spec->{k}};
      die "Duplicate option $spec->{k}" if $opt_seen{$spec->{k}}++;
   }

   Getopt::Long::Configure('no_ignore_case', 'bundling');
   GetOptions( map { $_->{s} => \$opts{$_->{k}} } @opt_spec) or $opts{help} = 1;

   if ( $opts{version} ) {
      print "mysql-duplicate-key-checker  Ver $VERSION Distrib $DISTRIB Changeset $SVN_REV\n";
      exit(0);
   }

   # Make comma-separated lists into hashes.
   if ( $opts{d} ) {
      $opts{d} = { map { $_ => 1 } split(/,\s*/, $opts{d}) };
   }
   $opts{g} = { map { $_ => 1 } split(/,\s*/, $opts{g}) };
   if ( $opts{t} ) {
      $opts{t} = { map { $_ => 1 } split(/,\s*/, $opts{t}) };
   }
   $opts{n} = { map { $_ => 1 } split(/,\s*/, $opts{n}) };
   if ( $opts{e} ) {
      $opts{e} = { map { lc($_) => 1 } split(/,\s*/, $opts{e}) };
   }

   if ( $opts{help} ) {
      print "Usage: mysql-duplicate-key-checker <options>\n\n";
      my $maxw = max(map { length($_->{l}) + ($_->{n} ? 4 : 0)} @opt_spec);
      foreach my $spec ( sort { $a->{l} cmp $b->{l} } @opt_spec ) {
         my $long  = $spec->{n} ? "[no]$spec->{l}" : $spec->{l};
         my $short = $spec->{t} ? "-$spec->{t}" : '';
         printf("  --%-${maxw}s %-4s %s\n", $long, $short, $spec->{d});
      }
      ( my $usage = <<"      USAGE" ) =~ s/^         //gm;

         mysql-duplicate-key-checker examines MySQL tables for duplicate or redundant
         indexes and foreign keys.  Connection options are read from MySQL option files.
         For more details, please read the documentation:

            perldoc mysql-duplicate-key-checker

      USAGE
      exit(0);
   }

   # ############################################################################
   # Get ready to do the main work.
   # ############################################################################
   my %conn = (
      F => 'mysql_read_default_file',
      h => 'host',
      P => 'port',
      S => 'mysql_socket'
   );

   # Connect to the database
   if ( !defined $opts{p} && $opts{askpass} ) {
      print "Enter password: ";
      ReadMode('noecho');
      chomp($opts{p} = <STDIN>);
      ReadMode('normal');
      print "\n";
   }

   my $dsn = 'DBI:mysql:' . ( $opts{D} || '' ) . ';'
      . join(';', map  { "$conn{$_}=$opts{$_}" } grep { defined $opts{$_} } qw(F h P S))
      . ';mysql_read_default_group=mysql';
   my $dbh = DBI->connect($dsn, @opts{qw(u p)}, { AutoCommit => 1, RaiseError => 1, PrintError => 0 } );

   my @databases = @{$dbh->selectcol_arrayref('SHOW DATABASES')};
   my @whole_batch;
   my $exit_code;
   my $version = sprintf('%03d%03d%03d', $dbh->{mysql_serverinfo} =~ m/(\d+)/g);

   my $ic = new IndexChecker;
   my $ic_opts = {
      ignore_type  => $opts{a},
      ignore_order => $opts{ignoreorder},
      clustered    => $opts{c},
   };

   DATABASE:
   foreach my $database ( @databases ) {

      # Ignore databases as instructed.  Also ignore INFORMATION_SCHEMA and skip
      # databases caused by lost+found directories created in the root of ext3
      # filesystems; they are not really databases.
      next DATABASE if
         ( $opts{d} && !exists($opts{d}->{$database}) )
         || $database =~ m/^(information_schema|lost\+found)$/mi
         || exists $opts{g}->{$database};

      my @tables = @{$dbh->selectcol_arrayref('SHOW TABLES FROM `' . $database .  '`')};
      next DATABASE unless @tables;

      my %info_for;

      TABLE:
      foreach my $table ( @tables ) {
         next TABLE if
            ( $opts{t} && !exists($opts{t}->{$table}) )
            || exists $opts{n}->{$table};

         $exit_code = 1;
         my $ddl;
         eval {
            $ddl = ($dbh->selectrow_array("SHOW CREATE TABLE `$database`.`$table`"))[1];
            $exit_code = 0;
         };
         next TABLE if !$ddl;
         my $engine = $ic->find_engine($ddl) || next TABLE;

         my $keys = $opts{f} =~ m/k/ ? $ic->find_keys($ddl, {version => $version }) : [];
         my $fks  = $opts{f} =~ m/f/ ? $ic->find_fks($ddl, {database => $database}) : [];

         if ( @$keys || @$fks ) {
            $info_for{$table} = {
               database => $database,
               table    => $table,
               engine   => $engine,
               keys     => $keys,
               fks      => $fks,
            };
         }
      }

      my @to_print;
      foreach my $table ( sort keys %info_for ) {
         my $hash   = $info_for{$table};
         my $engine = $hash->{engine};

         # Prepare indexes
         if ( $opts{f} =~ m/k/ ) {
            if ( $opts{v} ) { # Print all
               push @to_print, map { make_hash($hash, 'KEY', $_) }
                  @{$hash->{keys}};
            }
            else { # Find duplicate/redundant by prefix matching.
               push @to_print, map { make_hash($hash, 'KEY', $_) }
                  @{$ic->find_duplicate_keys($hash->{keys}, { engine => $engine, %$ic_opts})};
            }
         }

         # Prepare foreign keys
         if ( $opts{f} =~ m/f/ ) {
            if ( $opts{v} ) { # Print all
               push @to_print, map { make_hash($hash, 'FK', $_) }
                  @{$hash->{fks}};
            }
            else { # Otherwise output duplicates.
               push @to_print, map { make_hash($hash, 'FK', $_) }
                  @{$ic->find_duplicate_fks($hash->{fks})};
            }
         }
      }

      next DATABASE unless @to_print;

      if ( $opts{s} ) {
         push @whole_batch, @to_print;
      }
      else {
         print_batch(@to_print);
      }
   }

   if ( @whole_batch && $opts{s} ) {
      print_batch(@whole_batch);
   }

   exit($exit_code);

   # ############################################################################
   # Subroutines
   # ############################################################################

   sub make_hash {
      my ( $info, $type, $hash ) = @_;
      return {
         database => $info->{database},
         table    => $info->{table},
         engine   => $info->{engine},
         name     => $hash->{name},
         type     => $type,
         struct   => $hash->{struct} || 'NULL',
         parent   => $hash->{parent} || 'NULL',
         cols     => $hash->{cols},
      };
   }

   sub print_batch {
      my ( @batch ) = @_;

      my $hdr;
      if ( $opts{b} ) {
         $hdr = ( "%s\t" x 8 ) . "\n";
      }
      else {
         my $max_idx  = max(6, map { length($_->{name}) } @batch);
         my $max_tbl  = max(5, map { length($_->{table}) } @batch);
         my $max_db   = max(8, map { length($_->{database}) } @batch);
         my $max_par  = max(6, map { length($_->{parent} || '') } @batch);
         $hdr         = "%-${max_db}s %-${max_tbl}s %-6s %-${max_idx}s %-4s %-8s %-${max_par}s %s\n";
      }

      printf($hdr, qw(DATABASE TABLE ENGINE OBJECT TYPE STRUCT PARENT COLUMNS));
      foreach my $thing ( @batch ) {
         printf($hdr, @{$thing}{qw(database table engine name type struct parent cols)});
      }

   }

}

1; # Because this is a module as well as a script.

# ############################################################################
# Documentation
# ############################################################################

=pod

=head1 NAME

mysql-duplicate-key-checker - Find possible duplicate indexes and foreign keys on
MySQL tables.

=head1 DESCRIPTION

This program examines the output of SHOW CREATE TABLE on MySQL tables, and if
it finds indexes that cover the same columns as another index in the same
order, or cover an exact leftmost prefix of another index, it prints out
the suspicious indexes.  By default, indexes must be of the same type, so a
BTREE index is not a duplicate of a FULLTEXT index, even if they have the same
colums.  You can override this.

It also looks for duplicate foreign keys.  A duplicate foreign key covers the
same columns as another in the same table, and references the same parent
table.

This tool is part of MySQL Toolkit L<http://mysqltoolkit.sourceforge.net>.

=head1 OPTIONS

=over

=item --allatonce

Prints everything it finds in one chunk.  The default is to print a database at
a time.

=item --allstruct

Compare indexes with different structures.  By default this is disabled, because
a BTREE index that covers the same columns as a FULLTEXT index is not really a
duplicate, for example.

=item --askpass

Prompt for password for connections.

=item --clustered

Detects when a suffix of a secondary key is a leftmost prefix of the primary
key, and treats it as a duplicate key.  Only detects this condition on storage
engines whose primary keys are clustered (currently InnoDB and solidDB).

Clustered storage engines append the primary key columns to the leaf nodes of
all secondary keys anyway, so you might consider it redundant to have them
appear in the internal nodes as well.  Of course, you may also want them in the
internal nodes, because just having them at the leaf nodes won't help for some
queries.  It does help for covering index queries, however.

Here's an example of a key that is considered redundant with this option:

  PRIMARY KEY  (`a`)
  KEY `b` (`b`,`a`)

=item --databases

A comma-separated list of databases to examine.

=item --defaults-file

Only read default options from the given file.

=item --function

What to check: 'f' is foreign keys, 'k' is indexes.  The default is to check
both.

=item --help

Displays a help message.

=item --host

Connect to host.

=item --ignoredb

A comma-separated list of databases to ignore.

=item --ignoretype

Ignore column ordering, so an index on columns (a,b) is considered a duplicate
of an index on columns (b,a).

=item --ignoretbl

A comma-separated list of tables to ignore.

=item --password

Password to use when connecting.

=item --port

Port number to use for connection.

=item --socket

Socket file to use for connection.

=item --tab

Print output separated with tabs, instead of whitespace-aligned.  See
L<"OUTPUT"> for details.

=item --tables

A comma-separated list of tables to check.

=item --user

User for login if not current user.

=item --verbose

Output all keys and/or foreign keys found, not just redundant ones.

=item --version

Output version information and exit.

=back

=head1 OUTPUT

Output is to STDOUT, one line per server and table, with header lines for each
database.  I tried to make the output easy to process with awk.  For this reason
columns are always present.  If there's no value, the script prints 'NULL'.
Output is sorted by database and table.

The columns in the output are as follows.

=over

=item DATABASE

The database the table is in.

=item TABLE

The table name.

=item ENGINE

The table's storage engine.

=item OBJECT

The index or constraint's name, e.g. `tbl_ibfk_3` (the default InnoDB name for
the third foreign key on a table named tbl).

=item TYPE

'KEY' for indexes, 'FK' for foreign keys.

=item STRUCT

The type of index: BTREE, FULLTEXT, HASH etc.  By default MySQL's indexes are
BTREE in most cases.  This does not apply to foreign keys.

=item PARENT

The parent table to which the foreign key constraint refers.  This does not
apply to indexes.

=item COLUMNS

The columns included in the index or foreign key constraint.  For indexes,
this column list is output verbatim, as shown in SHOW CREATE TABLE.  For
foreign keys, the columns are ordered so string comparison can find
duplicates, since column order in a foreign key is immaterial.

=back

=head1 SYSTEM REQUIREMENTS

You need the following Perl modules: DBI and DBD::mysql.

=head1 BUGS

Please use the Sourceforge bug tracker, forums, and mailing lists to request
support or report bugs: L<http://sourceforge.net/projects/mysqltoolkit/>.

=head1 LICENSE

This program is copyright (c) 2007 Baron Schwartz.
Feedback and improvements are welcome.

THIS PROGRAM IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED
WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation, version 2; OR the Perl Artistic License.  On UNIX and similar
systems, you can issue `man perlgpl' or `man perlartistic' to read these
licenses.

You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc., 59 Temple
Place, Suite 330, Boston, MA  02111-1307  USA.

=head1 AUTHOR

Baron Schwartz.

=head1 VERSION

This manual page documents Ver 1.1.0 Distrib 848 $Revision: 831 $.

=cut
