Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 51 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -84,3 +84,54 @@ can set:
For a demonstration about how to setup a cluster, see
[http://clusterlabs.github.io/PAF/documentation.html](http://clusterlabs.github.io/PAF/documentation.html).

SELECT
pid,
status,
receive_start_lsn,
flushed_lsn,
latest_end_lsn,
pg_wal_lsn_diff(latest_end_lsn, flushed_lsn) AS byte_lag,
last_msg_send_time,
last_msg_receipt_time,
now() - last_msg_receipt_time AS last_msg_delay,
pg_last_wal_replay_lsn() AS last_replay_lsn,
pg_last_xact_replay_timestamp() AS last_replay_ts,
now() - pg_last_xact_replay_timestamp() AS replay_delay,
CASE
WHEN status = 'streaming'
AND pg_last_wal_replay_lsn() IS NOT NULL
AND pg_last_xact_replay_timestamp() IS NOT NULL
AND (now() - last_msg_receipt_time) < interval '30 seconds'
AND pg_wal_lsn_diff(latest_end_lsn, flushed_lsn) < 16*1024*1024 -- 16 MB threshold
AND (now() - pg_last_xact_replay_timestamp()) < interval '30 seconds'
THEN 'HEALTHY'
ELSE 'UNHEALTHY'
END AS standby_health
FROM pg_stat_wal_receiver;

```
SELECT
pid,
status,
receive_start_lsn,
flushed_lsn,
latest_end_lsn,
pg_wal_lsn_diff(latest_end_lsn, flushed_lsn) AS byte_lag,
last_msg_send_time,
last_msg_receipt_time,
now() - last_msg_receipt_time AS last_msg_delay,
pg_last_wal_replay_lsn() AS last_replay_lsn,
pg_last_xact_replay_timestamp() AS last_replay_ts,
now() - pg_last_xact_replay_timestamp() AS replay_delay,
CASE
WHEN status = 'streaming'
AND pg_last_wal_replay_lsn() IS NOT NULL
AND pg_last_xact_replay_timestamp() IS NOT NULL
AND (now() - last_msg_receipt_time) < interval '30 seconds'
AND pg_wal_lsn_diff(latest_end_lsn, flushed_lsn) < 16*1024*1024 -- 16 MB threshold
AND (now() - pg_last_xact_replay_timestamp()) < interval '30 seconds'
THEN 'HEALTHY'
ELSE 'UNHEALTHY'
END AS standby_health
FROM pg_stat_wal_receiver;
```
53 changes: 53 additions & 0 deletions script/pgsqlms
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,41 @@ sub _get_last_received_lsn {
return undef;
}

# Get the last write-ahead log location that has been replayed during recovery
# on a standby.
# if the first argument is true, returns the value as decimal (the byte
# distance between the replayed location and '0/0')
# if the first argument is false, returns the value as LSN
# Returns undef if the query failed or returned no (or a false) value; an
# error is logged in both cases.
sub _get_last_replay_lsn {
    my ( $dec ) = @_;
    my $pg_last_wal_replay_lsn = 'pg_last_wal_replay_lsn()';
    my $pg_wal_lsn_diff        = 'pg_wal_lsn_diff';
    my $query;
    my $rc;
    my @rs;

    # Before PostgreSQL 10, these functions were named after "xlog" and
    # "location" instead of "wal" and "lsn". Both names must be overridden,
    # otherwise the query would call functions unknown to older releases.
    if ( $PGVERNUM < $PGVER_10 ) {
        $pg_last_wal_replay_lsn = 'pg_last_xlog_replay_location()';
        $pg_wal_lsn_diff        = 'pg_xlog_location_diff';
    }

    if ( $dec ) {
        # Diffing against '0/0' turns the LSN into a plain decimal number.
        $query = "SELECT $pg_wal_lsn_diff( $pg_last_wal_replay_lsn, '0/0' )";
    }
    else {
        $query = "SELECT $pg_last_wal_replay_lsn";
    }

    $rc = _query( $query, \@rs );

    # A zero return code with a true first field is the only success case.
    return $rs[0][0] if $rc == 0 and $rs[0][0];

    ocf_log( 'err', 'Could not query last replayed LSN (%s)', $rc ) if $rc != 0;
    ocf_log( 'err', 'No values for last replayed LSN' )
        if $rc == 0 and not $rs[0][0];

    return undef;
}

# Get the master score for each connected standby
# Returns directly the result set of the query or exit with an error.
# Exits with OCF_ERR_GENERIC if the query failed
Expand Down Expand Up @@ -2060,6 +2095,24 @@ sub pgsql_notify_pre_promote {
_query( q{ CHECKPOINT }, {} );
%cdata = _get_controldata();
$node_lsn = _get_last_received_lsn( 'in decimal' );
# Check whether the current LSN is accurate.
# In most cases, pg_last_wal_receive_lsn() accurately reports the last received LSN location.
# However, when Postgres has just been started as a standby, the value returned by this function is the
# starting point of the last WAL segment in the local pg_wal folder.
# Once streaming replication from the primary has started, this value is updated correctly.
my $WAL_SEGMENT_START = '000000';
# Convert LSN to hexadecimal
my $node_lsn_hexadecimal = sprintf("%X", $node_lsn);
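# If the last three bytes (i.e. the last six hexadecimal digits) of the LSN are zeros,
# it means that the LSN is the starting point of the last WAL segment in the local pg_wal folder.
# NOTE(review): six zero hex digits is a 16MB boundary, so this presumably assumes 16MB WAL segments - confirm.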
if ($node_lsn_hexadecimal =~ /$WAL_SEGMENT_START$/) {
ocf_log( 'info', 'the LSN "%s" is not accurate', $node_lsn_hexadecimal);
my $node_last_replayed_lsn = _get_last_replay_lsn( 'in decimal' );
if ($node_last_replayed_lsn > $node_lsn) {
ocf_log( 'info', 'Using the last replayed LSN "%s" as the value for LSN location.', $node_last_replayed_lsn);
$node_lsn = $node_last_replayed_lsn;
}
}

unless ( defined $node_lsn ) {
ocf_log( 'warning', 'Unknown current node LSN' );
Expand Down