#!/usr/bin/perl

eval 'exec /usr/bin/perl  -S $0 ${1+"$@"}'
    if 0; # not running under some shell

# This is mysql-slave-restart, a program to watch replication and try to
# restart the slave on errors.
#
# This program is copyright (c) 2007 Baron Schwartz.  Feedback and
# improvements are welcome.
#
# THIS PROGRAM IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED
# WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
#
# This program is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation, version 2; OR the Perl Artistic License.  On UNIX and similar
# systems, you can issue `man perlgpl' or `man perlartistic' to read these
# licenses.
#
# You should have received a copy of the GNU General Public License along with
# this program; if not, write to the Free Software Foundation, Inc., 59 Temple
# Place, Suite 330, Boston, MA  02111-1307  USA.

use strict;
use warnings FATAL => 'all';

use DBI;
use English qw(-no_match_vars);
use Getopt::Long;
use List::Util qw(min max);
use Time::HiRes qw(sleep);
use Term::ReadKey qw(ReadMode);

# Release identifiers; all three are printed by the --version option.
our $VERSION = '1.0.0';
our $DISTRIB = '848';
# Pulls the digits out of the "Revision" keyword string.  NOTE(review): the
# string looks like a version-control keyword that is rewritten on commit --
# confirm before editing it by hand.
our $SVN_REV = sprintf("%d", q$Revision: 785 $ =~ m/(\d+)/g);

# ############################################################################
# Get configuration information.
# ############################################################################

# Define cmdline args.
# Define cmdline args.  Each entry holds a Getopt::Long specification (s) and
# the one-line description (d) that --help prints.
my @opt_spec = (
   { s => 'askpass',           d => 'Prompt for a password for the connection' },
   { s => 'database|D=s',      d => 'Database to use' },
   { s => 'defaults-file|F=s', d => 'Only read default options from the given file' },
   { s => 'error-numbers|e=s', d => 'Only restart this comma-separated list of errors' },
   { s => 'error-text|E=s',    d => 'Only restart errors that match this pattern' },
   { s => 'error-length|L=i',  d => 'Max length of error message to print' },
   { s => 'help',              d => 'Show this help message' },
   { s => 'host|h=s',          d => 'Connect to host' },
   { s => 'maxsleep|M=f',      d => 'Maximum sleep time (default 64 sec)'},
   { s => 'minsleep|m=f',      d => 'Minimum sleep time (default 1/64 sec)'},
   { s => 'password|p=s',      d => 'Password to use when connecting' },
   { s => 'port|P=i',          d => 'Port number to use for connection' },
   { s => 'skipcount|k=i',     d => 'Number of statements to skip (default 1)' },
   { s => 'sleep|s=f',         d => 'Initial sleep time (default 1 sec)' },
   { s => 'socket|S=s',        d => 'Socket file to use for connection' },
   { s => 'time|t=s',          d => 'Time to run before exiting (suffix: s/m/h/d)' },
   { s => 'untilmaster=s',     d => 'Run until this master log file and position' },
   { s => 'untilrelay=s',      d => 'Run until this relay log file and position' },
   { s => 'user|u=s',          d => 'User for login if not current user' },
   { s => 'verbose|v+',        d => 'Verbosity (specify multiple times for more detail)' },
   { s => 'version',           d => 'Output version information and exit' },
);

# This is the container for the command-line options' values to be stored in
# after processing.  Initial values are defaults.  An option is stored under
# its short name when it has one, otherwise under its long name.
my %opts = (
   k => 1,      # --skipcount
   s => 1,      # --sleep
   m => 1/64,   # --minsleep
   M => 64,     # --maxsleep
   v => 0,      # --verbose
);

# Post-process the specs, adding derived fields to each one:
#   k  key used in %opts (short name if there is one, else the long name)
#   l  long option name
#   t  short option name, if any
#   n  true if the spec contains '!' (a negatable option)
my %opt_seen;
foreach my $spec ( @opt_spec ) {
   my ( $long, $short ) = $spec->{s} =~ m/^([\w-]+)(?:\|([^!+=]*))?/;
   $spec->{k} = $short || $long;
   $spec->{l} = $long;
   $spec->{t} = $short;
   $spec->{n} = $spec->{s} =~ m/!/;
   $opts{$spec->{k}} = undef unless defined $opts{$spec->{k}};
   die "Duplicate option $spec->{k}" if $opt_seen{$spec->{k}}++;
}

Getopt::Long::Configure('no_ignore_case', 'bundling');

# Set when the command line cannot be used.  The usage text is still printed,
# but the program then exits with a non-zero status so that calling scripts can
# tell a bad invocation apart from an explicit --help.
my $usage_error = 0;
GetOptions( map { $_->{s} => \$opts{$_->{k}} } @opt_spec) or $usage_error = 1;

# --version wins over everything else, including unparseable options.
if ( $opts{version} ) {
   print "mysql-slave-restart  Ver $VERSION Distrib $DISTRIB Changeset $SVN_REV\n";
   exit(0);
}

# Validate option values, unless usage is going to be printed anyway.  Every
# check runs, so the user sees all of the problems at once.
if ( !$opts{help} && !$usage_error ) {
   if ( $opts{t} ) {
      if ( $opts{t} !~ m/^\d+[smhd]?$/ ) {
         warn "Invalid --time argument\n";
         $usage_error = 1;
      }
      elsif ( $opts{t} =~ m/(\d+)([smhd])$/ ) {
         # Convert a suffixed value to seconds.  A bare number is already
         # seconds and is left alone.
         $opts{t} = $2 eq 's' ? $1            # Seconds
                  : $2 eq 'm' ? $1 * 60       # Minutes
                  : $2 eq 'h' ? $1 * 3600     # Hours
                  :             $1 * 86400;   # Days
      }
   }
   if ( $opts{untilmaster} ) {
      if ( $opts{untilmaster} !~ m/^[.\w-]+,\d+$/ ) {
         warn "Invalid --untilmaster argument, must be file,pos\n";
         $usage_error = 1;
      }
   }
   if ( $opts{untilrelay} ) {
      if ( $opts{untilrelay} !~ m/^[.\w-]+,\d+$/ ) {
         warn "Invalid --untilrelay argument, must be file,pos\n";
         $usage_error = 1;
      }
   }

   # A negative --sleep or --maxsleep eventually reaches Time::HiRes::sleep(),
   # which dies on negative arguments; reject such values up front instead.
   my %long_for = ( s => 'sleep', M => 'maxsleep' );
   foreach my $key ( sort keys %long_for ) {
      if ( $opts{$key} < 0 ) {
         warn "Invalid --$long_for{$key} argument, must not be negative\n";
         $usage_error = 1;
      }
   }
}

# Prepare the list of error numbers: turn "1062,1053" into a lookup hash whose
# keys are the numbers.
if ( $opts{e} ) {
   $opts{e} = { map { $_ => 1 } $opts{e} =~ m/(\d+)/g };
}

# Print usage for --help and for unusable command lines.  Only the latter exits
# with a non-zero status.
if ( $opts{help} || $usage_error ) {
   print "Usage: mysql-slave-restart <options>\n\n";
   # Width of the widest long option name, allowing for a "[no]" prefix.
   my $maxw = max(map { length($_->{l}) + ($_->{n} ? 4 : 0)} @opt_spec);
   foreach my $spec ( sort { $a->{l} cmp $b->{l} } @opt_spec ) {
      my $long  = $spec->{n} ? "[no]$spec->{l}" : $spec->{l};
      my $short = $spec->{t} ? "-$spec->{t}" : '';
      printf("  --%-${maxw}s %-4s %s\n", $long, $short, $spec->{d});
   }
   print <<USAGE;

mysql-slave-restart watches a MySQL replication slave for errors, and tries to
restart replication if it stops.  For more details, please read the
documentation:

   perldoc mysql-slave-restart

USAGE
   exit($usage_error ? 1 : 0);
}

# ############################################################################
# Connect and go to work.
# ############################################################################

# Connect to the database
# Maps the %opts keys of the connection options to the attribute names that are
# placed in the DSN string.
my %conn = (
   F => 'mysql_read_default_file',
   h => 'host',
   P => 'port',
   S => 'mysql_socket'
);

# Build the DSN from the database name (if any) plus whichever of the
# connection options were given on the command line.
my $dsn = 'DBI:mysql:' . ( $opts{D} || '' ) . ';'
   . join(';', map  { "$conn{$_}=$opts{$_}" } grep { defined $opts{$_} } qw(F h P S))
   . ';mysql_read_default_group=mysql';

# With --askpass, read the password from STDIN without echoing it; the value
# replaces anything given with --password.
if ( $opts{askpass} ) {
   print "Enter password: ";
   ReadMode('noecho');
   my $pass = <STDIN>;
   # Restore the terminal before anything below can die, so a failure never
   # leaves the user's terminal with echo turned off.
   ReadMode('normal');
   print "\n";
   # <STDIN> returns undef at end of file (e.g. STDIN closed or redirected from
   # an empty source); report that plainly rather than tripping a fatal
   # uninitialized-value warning in chomp.
   die "Could not read a password from STDIN\n" unless defined $pass;
   chomp $pass;
   $opts{p} = $pass;
}

# RaiseError makes every later DBI failure die, so no call below checks return
# values explicitly.
my $dbh = DBI->connect($dsn, @opts{qw(u p)}, { AutoCommit => 1, RaiseError => 1, PrintError => 0 } );

# VERY IMPORTANT: ask for lowercase hash keys from fetchrow_hashref and
# friends.  Different MySQL versions use different lettercase for the column
# names, and the rest of the program looks them up in lowercase.
$dbh->{FetchHashKeyName} = 'NAME_lc';

# Servers at version 4.0.5 or later are sent START SLAVE; older ones are sent
# SLAVE START.
my $start_sql;
if ( version_ge($dbh, '4.0.5') ) {
   $start_sql = 'START SLAVE';
}
else {
   $start_sql = 'SLAVE START';
}

# Add an UNTIL clause for --untilmaster or, failing that, --untilrelay.  Only
# the first one that was given is used.
UNTIL_OPT:
foreach my $which ( qw(master relay) ) {
   my $coords = $opts{"until$which"};
   next UNTIL_OPT unless $coords;
   my ( $log_file, $log_pos ) = split(',', $coords);
   my $prefix = uc $which;
   $start_sql .= " UNTIL ${prefix}_LOG_FILE = '$log_file', ${prefix}_LOG_POS = $log_pos";
   last UNTIL_OPT;
}

# Prepare the three statements the polling loop runs, in the same order as they
# are listed here.
my ( $fetch_stat, $set_skip, $start ) = map { scalar $dbh->prepare($_) } (
   'SHOW SLAVE STATUS',
   "SET GLOBAL SQL_SLAVE_SKIP_COUNTER = $opts{k}",
   $start_sql,
);

# Loop state: the --time deadline, the current sleep interval, and the relay
# log coordinates of the most recent restart attempt.
my $exit_time = ( $opts{t} || 0 ) + time();
my $sleep = $opts{s};
my ($last_log, $last_pos);

# Main polling loop.  Runs forever unless --time was given, in which case it
# stops once $exit_time has passed.  The deadline is only tested here at the top
# of the loop, so a sleep already in progress is not cut short.  The loop also
# ends via die() when the slave has reached the --untilmaster/--untilrelay
# coordinates, or when an error is seen that --error-numbers or --error-text
# does not allow.
while ( ( !$opts{t} || time() < $exit_time ) ) {
   # Stays true unless a restart is attempted during this pass; it drives the
   # sleep adjustment at the bottom of the loop.
   my $was_running = 1;
   $fetch_stat->execute();
   # Only the first row of SHOW SLAVE STATUS is examined.
   my $stat = $fetch_stat->fetchall_arrayref({})->[0];
   die "No SLAVE STATUS output found\n" unless $stat;

   # Do nothing if the slave is still sitting at the relay log coordinates of
   # the last restart attempt (see the comment where $last_log is assigned).
   if ( !$last_log
      || $last_log ne $stat->{relay_log_file}
      || $last_pos != $stat->{relay_log_pos}
   ) {
      # Normalize missing/empty columns so the comparisons and the printf
      # below never see undef.
      $stat->{slave_sql_running} ||= 'No';
      $stat->{last_error}        ||= '';
      $stat->{last_errno}        ||= 0;

      # Reaching the requested stop coordinates ends the program via die().
      if ( $opts{untilmaster} && pos_ge($stat, 'master') ) {
         die "Slave has advanced past $opts{untilmaster} on master.\n";
      }
      elsif ( $opts{untilrelay} && pos_ge($stat, 'relay') ) {
         die "Slave has advanced past $opts{untilrelay} in relay logs.\n";
      }

      if ( $stat->{slave_sql_running} eq 'No' ) {
         # Print the time, error, etc
         if ( $opts{v} ) {
            my $err = '';
            # The error text is only shown at verbosity 2 and above, with
            # whitespace collapsed and optionally truncated to --error-length.
            if ( $opts{v} > 1 ) {
               ($err = $stat->{last_error} ) =~ s/\s+/ /g;
               if ( $opts{L} ) {
                  $err = substr($err, 0, $opts{L});
               }
            }
            printf("%s %s %11d %d %s\n",
               ts(time),
               $stat->{relay_log_file},
               $stat->{relay_log_pos},
               $stat->{last_errno} || 0,
               $err
            );
         }

         # NOTE(review): the branch below is entered whenever the SQL thread is
         # not running, even if last_errno is 0 (for example a slave that was
         # stopped deliberately).  Without --error-numbers that case still sets
         # the skip counter and starts the slave -- confirm this is intended.
         if ( $opts{e} && !exists($opts{e}->{$stat->{last_errno}}) ) {
            die "Error $stat->{last_errno} is not in --error-numbers.\n";
         }
         elsif ( $opts{E} && $stat->{last_error} && $stat->{last_error} !~ m/$opts{E}/ ) {
            die "Error does not match --error-text.\n";
         }
         else {
            # Skip --skipcount statements, then start the slave again.
            $set_skip->execute();
            $start->execute();
            $was_running = 0;
            # Only set this on events I tried to restart.  Otherwise there
            # could be a race condition: I see it, I record it, but it hasn't
            # caused an error yet; so I won't try to restart it when it does.
            # (The point of this is to avoid trying to restart the same event
            # twice in case another race condition happens -- I restart it,
            # then check the server and it hasn't yet cleared the error
            # message and restarted the SQL thread).
            $last_log = $stat->{relay_log_file};
            $last_pos = $stat->{relay_log_pos};
         }
      }
   }

   # Adjust sleep time: double it (up to --maxsleep) when no restart was
   # needed, halve it (down to --minsleep) when one was.
   if ( $was_running ) {
      $sleep = min($opts{M}, $sleep * 2);
   }
   else {
      $sleep = max($opts{m}, $sleep / 2);
   }

   # Errors are very likely to follow each other in quick succession.  NOTE:
   # this policy has a side effect with respect to $sleep.  Suppose $sleep is
   # 512 and MySQL Slave Restart finds an error; now $sleep is 256, but MySQL
   # Slave Restart only sleeps 1 (the initial value of --sleep).  Suppose there
   # is no error when it wakes up after 1 second, because 1 was too short.  Now
   # it doubles $sleep, back to 512.  $sleep has the same value it did before
   # the error was ever found.
   #
   # The value printed at verbosity 3 is $sleep itself; right after a restart
   # the actual sleep below may be shorter (the smaller of $sleep and --sleep).
   print "sleeping $sleep\n" if $opts{v} > 2;
   sleep($was_running ? $sleep : min($sleep, $opts{s}));
}

# ############################################################################
# Subroutines.
# ############################################################################

# Determines if the $stat's log coordinates are greater than or equal to the
# desired coordinates. $which is 'master' or 'relay'
# Reports whether the slave's current log coordinates have reached or passed
# the coordinates requested with --untilmaster or --untilrelay.
#
#   $stat   hashref of one SHOW SLAVE STATUS row (lowercase keys)
#   $which  'master' or 'relay'; selects both the status columns and the
#           "until$which" option to compare against
#
# Each coordinate pair is rendered as "file/zero-padded-position" and the two
# strings are compared with 'ge'.  Returns true when current >= requested.
sub pos_ge {
   my ( $stat, $which ) = @_;

   # Status columns holding the (file, position) pair for this kind of log.
   my @cols = $which eq 'master'
      ? qw(relay_master_log_file exec_master_log_pos)
      : qw(relay_log_file relay_log_pos);

   # Renders a (file, position) pair as one string that sorts in log order.
   my $as_key = sub { return sprintf('%s/%020d', @_) };

   my $current = $as_key->( @{$stat}{@cols} );
   my $target  = $as_key->( split(',', $opts{"until$which"}) );

   return $current ge $target;
}

# Compares versions like 5.0.27 and 4.1.15-standard-log
# Compares versions like 5.0.27 and 4.1.15-standard-log.
#
#   $dbh     database handle (anything whose {mysql_serverinfo} entry holds the
#            server's version string)
#   $target  version string to compare against, e.g. '4.0.5'
#
# Returns true if the server's version is greater than or equal to $target.
#
# Only the first three numbers of each string are used, and missing ones count
# as zero.  Version strings can carry more digits than that (for instance
# "5.1.30-1ubuntu2") or fewer ("5.1"); passing every number to a sprintf with
# three conversions raises a "Redundant argument" or "Missing argument" warning,
# which is fatal in this program.
sub version_ge {
   my ( $dbh, $target ) = @_;
   my ( $server, $wanted ) = map {
      # First three numbers in the string, padded with zeros if fewer exist.
      my @parts = ( m/(\d+)/g, 0, 0, 0 )[ 0 .. 2 ];
      sprintf('%03d%03d%03d', @parts);
   } ( $dbh->{mysql_serverinfo}, $target );
   return $server ge $wanted;
}

# Formats an epoch time as a local-time timestamp of the form
# YYYY-MM-DDTHH:MM:SS.
#
#   $time  seconds since the epoch
sub ts {
   my ( $time ) = @_;
   # Pick year, month, day, hour, minute, second out of localtime's list, in
   # the order the format string wants them.
   my @fields = ( localtime($time) )[ 5, 4, 3, 2, 1, 0 ];
   $fields[0] += 1900;   # localtime counts years from 1900
   $fields[1] += 1;      # localtime counts months from 0
   return sprintf('%d-%02d-%02dT%02d:%02d:%02d', @fields);
}

# ############################################################################
# Documentation.
# ############################################################################

=pod

=head1 NAME

mysql-slave-restart - Watch and restart MySQL replication after errors.

=head1 SYNOPSIS

 mysql-slave-restart --verbose

=head1 DESCRIPTION

MySQL Slave Restart watches a MySQL replication slave and tries to skip
statements that cause errors.  It polls the slave intelligently with an
exponentially varying sleep time.  You can specify errors to skip and run the
slave until a certain binlog position.

=head1 OPTIONS

=over

=item --askpass

Prompt for a password for the connection.

=item --database

Database to use.

=item --defaults-file

Only read default options from the given file.

=item --error-numbers

Makes MySQL Slave Restart only try to restart if the error number is in this
comma-separated list of errors.  If it sees an error not in the list, it will
exit.

The error number is in the last_errno column of SHOW SLAVE STATUS.

=item --error-text

A Perl regular expression against which the error text, if any, is matched.  If
the error text exists and matches, MySQL Slave Restart will try to restart the
slave.  If it exists but doesn't match, MySQL Slave Restart will exit.

The error text is in the last_error column of SHOW SLAVE STATUS.

=item --error-length

When L<"--verbose"> is set high enough to print the error, this option will
truncate the error text to the specified length.  This can be useful to prevent
wrapping on the terminal.

=item --help

Show a brief help message and exit.

=item --host

Connect to host.

=item --maxsleep

The maximum time MySQL Slave Restart will sleep before polling the slave again.
See L<"SLEEP">.

=item --minsleep

The minimum time MySQL Slave Restart will sleep before polling the slave again.
See L<"SLEEP">.

=item --password

Password to use when connecting.

=item --port

Port number to use for connection.

=item --skipcount

The number of statements to skip when restarting the slave.

=item --sleep

The initial sleep time between checking the slave.  See L<"SLEEP">.

=item --socket

Socket file to use for connection.

=item --time

Causes MySQL Slave Restart to stop after the specified time has elapsed.  The
argument can have a suffix of s, m, h, or d, indicating seconds, minutes, hours,
or days.  The number is interpreted as seconds if there is no suffix.

=item --untilmaster

Start the slave, and retry if it fails, until it reaches the given replication
coordinates.  The coordinates are the logfile and position on the master, given
by relay_master_log_file, exec_master_log_pos.  The argument must be in the
format "file,pos".  Separate the filename and position with a single comma and
no space.

This will also cause an UNTIL clause to be given to START SLAVE.

After reaching this point, the slave should be stopped and MySQL Slave Restart
will exit.

=item --untilrelay

Like L<"--untilmaster">, but in the slave's relay logs instead.  The coordinates
are given by relay_log_file, relay_log_pos.

=item --user

User for login if not current user.

=item --verbose

Verbosity; specify multiple times for more verbosity.  Default is no output.
Verbosity 1 outputs a timestamp, relay_log_file, relay_log_pos, and last_errno.
Verbosity 2 adds last_error.  See also L<"--error-length">.  Verbosity 3
prints the current sleep time each time MySQL Slave Restart sleeps.

=item --version

Output version information and exit.

=back

=head1 SYSTEM REQUIREMENTS

You need Perl, DBI, DBD::mysql, and some core packages that ought to be
installed in any reasonably new version of Perl.

=head1 OUTPUT

If you specify --verbose, MySQL Slave Restart prints a line every time it sees
the slave has an error.  See L<"--verbose"> for details.

=head1 SLEEP

MySQL Slave Restart sleeps intelligently between polling the slave.  The current
sleep time varies.

=over

=item *

The initial sleep time is given by L<"--sleep">.

=item *

If it checks and finds an error, it halves the previous sleep time.

=item *

If it finds no error, it doubles the previous sleep time.

=item *

The sleep time is bounded below by L<"--minsleep"> and above by L<"--maxsleep">.

=item *

Immediately after finding an error, MySQL Slave Restart assumes another error is
very likely to happen next, so it sleeps the current sleep time or the initial
sleep time, whichever is less.

=back

=head1 COMPATIBILITY

MySQL Slave Restart should work on many versions of MySQL.  Lettercase of many
output columns from SHOW SLAVE STATUS has changed over time, so it treats them
all as lowercase.

=head1 BUGS

If you find bugs or need features, please use the bug tracker, forums, and
mailing lists at http://sourceforge.net/projects/mysqltoolkit.

=head1 COPYRIGHT, LICENSE AND WARRANTY

This program is copyright (c) 2007 Baron Schwartz.  Feedback and improvements
are welcome.

THIS PROGRAM IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED
WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation, version 2; OR the Perl Artistic License.  On UNIX and similar
systems, you can issue `man perlgpl' or `man perlartistic' to read these
licenses.

You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc., 59 Temple
Place, Suite 330, Boston, MA  02111-1307  USA.

=head1 AUTHOR

Baron Schwartz

=head1 VERSION

This manual page documents Ver 1.0.0 Distrib 848 $Revision: 785 $.

=cut
