From 3fe861e15d84291a78eff0a5ac0f8d3031767790 Mon Sep 17 00:00:00 2001
From: Tasos Vogiatzoglou
Date: Tue, 15 Sep 2015 13:04:33 +0300
Subject: [PATCH] High availability section

---
 _config.yml                           |   2 +
 _high_availability/001-Options.md     |  59 +++++++
 .../002-Replication-Configuration.md  |  55 ++++++
 .../003-Cluster-Configuration.md      | 158 ++++++++++++++++++
 _includes/nav.html                    |   8 +
 5 files changed, 282 insertions(+)
 create mode 100644 _high_availability/001-Options.md
 create mode 100644 _high_availability/002-Replication-Configuration.md
 create mode 100644 _high_availability/003-Cluster-Configuration.md

diff --git a/_config.yml b/_config.yml
index 0d6fd51..c2709b1 100644
--- a/_config.yml
+++ b/_config.yml
@@ -25,6 +25,8 @@ collections:
     output: true
   sexy:
     output: true
+  high_availability:
+    output: true
   links:
     output: false
   ads:

diff --git a/_high_availability/001-Options.md b/_high_availability/001-Options.md
new file mode 100644
index 0000000..6be1020
--- /dev/null
+++ b/_high_availability/001-Options.md
@@ -0,0 +1,59 @@
---
layout: page
title: "Options"
date: 2015-01-27 22:02:36
categories:
permalink: /ha/options.html
---

Making PostgreSQL highly available
==================================

When thinking about high availability and PostgreSQL, there are two separate concerns to address. The first is
data replication: how we copy the data to all available nodes. The second is failover: how we detect and manage
the failure of a node.

This guide deals with the standard PostgreSQL distribution, so the scenario it covers is a single master with
multiple slaves. The master serves requests and, upon failure, a new master is chosen from the available nodes.

Replication
-----------

There are a couple of different options when it comes to replication, each with its own tradeoffs.

The obvious approach is to transfer, somehow, the changes that PostgreSQL makes to the underlying data files and
have the slaves wait on them until they become active. In that setup only one node can serve requests at a time;
when a failover occurs, the new active node mounts the data files, recovers whatever needs to be recovered and
starts serving requests. The main advantage of this solution is that the primary (the node serving requests) pays
no write-performance penalty; the disadvantage is that a few nodes sit inactive, waiting to take over.

The second solution is to take advantage of PostgreSQL's built-in binary (streaming) replication. With it, the
master continuously ships its changes to the standby servers, which replay them so that all slaves stay in the
same state. The advantage is that the standby servers can be used to offload read activity (there is still only
one writable node); the disadvantage is a small write-performance cost, since the master has to be notified that
the changes were actually applied on the slaves.

This guide covers the second solution. For read-oriented applications it gives an almost linear scale-out of read
performance, because read operations (queries) can be spread across a number of standby servers.

Prerequisites
=============

Network
-------

We assume that all PostgreSQL nodes are on the same subnet (e.g. 10.8.4.0/24). There are a few ways to achieve that:

1. Physically place them on the same network.
2. If that is not feasible (e.g. a cloud provider, or leased machines), use a VPN to establish a private network
   between the servers, over which all replication and cluster communication will happen.

Operating system: an Ubuntu derivative, 14.04 or later.
PostgreSQL version: 9.3 or later.

We assume that the databases will be accessed by clients through a 10.8.3.0/24 network: the master will be reachable
at 10.8.3.1 and the slave at 10.8.3.2 (these are the virtual IPs the cluster will manage later in this guide).
We also assume that the database nodes have hostnames of the form db[number].dbcluster.
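Both corosync and the pgsql resource agent used later refer to the nodes by these hostnames, so the names must
resolve on every node. A minimal /etc/hosts sketch under these assumptions: db1.dbcluster is the master at 10.8.4.1
(the address used for the base backup later), while 10.8.4.2 for db2.dbcluster is picked purely for illustration.

    # /etc/hosts on every database node
    10.8.4.1    db1.dbcluster    # master (address reused in the replication setup)
    10.8.4.2    db2.dbcluster    # standby (illustrative address; use your node's real one)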
diff --git a/_high_availability/002-Replication-Configuration.md b/_high_availability/002-Replication-Configuration.md
new file mode 100644
index 0000000..530663c
--- /dev/null
+++ b/_high_availability/002-Replication-Configuration.md
@@ -0,0 +1,55 @@
---
layout: page
title: "Replication configuration"
date: 2015-01-27 22:02:36
categories:
permalink: /ha/replication.html
---

The master configuration
========================

The master server is responsible for distributing its changes to the standby servers. To achieve that, we have to
change the following settings in postgresql.conf:

    port = 5432
    listen_addresses = '*'   # bind to everything, so the same postgresql.conf can be reused on all nodes
    wal_level = hot_standby
    max_wal_senders = 5      # total number of WAL senders that can be used concurrently by standbys or streaming base backups
    wal_keep_segments = 8    # how many WAL segments to keep; tied to how fast the standbys consume the logs, increase if you have slow standbys
    archive_mode = on        # we keep archives in case we need them for slaves that have fallen behind
    archive_command = 'cp %p /db/data/pg_archive/%f'   # store archives where the cluster's restore_command (see the cluster configuration) expects them
    hot_standby = on         # ignored by the master server

It is much easier to keep a single postgresql.conf and share it between all your nodes.

Apart from that, we have to allow the standby servers to connect to the master and request the logs. We need to add
a line to pg_hba.conf that permits the slaves on the cluster subnet to connect for replication:

    hostssl    replication    all    10.8.4.0/24    md5

Finally, we need to create a user that is allowed to connect to the server and start replication:

    psql# CREATE USER replicator REPLICATION LOGIN ENCRYPTED PASSWORD 'password';

The slave configuration
=======================

Setting up a standby to consume the logs is easy. We just need a base backup of the main database, plus all the WAL
generated in the meantime. The command that does it in one go is:

    $ sudo -u postgres pg_basebackup -h 10.8.4.1 -p 5432 -U replicator -D /db/data -X stream -R -W

where 10.8.4.1 is the IP address of the master we are backing up, 5432 is the master's port, /db/data is the
directory where the data is to be stored, and replicator is the user we defined in the previous step. -W forces a
password prompt; the password cannot be passed on the command line, so use a .pgpass file if this has to run
unattended.

Thanks to the -R flag, the same command also generates a recovery.conf file, which tells the PostgreSQL instance
that it is a standby server and where it should connect to receive the WAL stream.

At this point we can edit recovery.conf to specify a trigger file. A trigger file is a file whose presence instructs
the standby to assume master duties. We don't need it, as we will do the failover via the cluster software, but for
reference the setting is:

    trigger_file = '/path/to/the/trigger/file'

Keep in mind that the trigger file MUST NOT exist up front. You create it only when you want to promote the standby
to master, e.g. via `touch /path/to/the/trigger/file`.
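Before moving on to the cluster, it is worth confirming that streaming replication actually works. One way to check
(not part of the setup itself, just standard PostgreSQL catalog queries):

    -- on the master: one row per connected standby, state should be 'streaming'
    psql# SELECT client_addr, state, sent_location, replay_location FROM pg_stat_replication;

    -- on the standby: returns true for as long as the node is in recovery (i.e. acting as a standby)
    psql# SELECT pg_is_in_recovery();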
diff --git a/_high_availability/003-Cluster-Configuration.md b/_high_availability/003-Cluster-Configuration.md
new file mode 100644
index 0000000..f72ab3f
--- /dev/null
+++ b/_high_availability/003-Cluster-Configuration.md
@@ -0,0 +1,158 @@
---
layout: page
title: "Cluster configuration"
date: 2015-01-27 22:02:36
categories:
permalink: /ha/cluster.html
---

The cluster
-----------

Now that we have a correctly replicating database, we need a mechanism that manages the actual failover and
promotion of nodes. For this we will create a cluster using pacemaker and corosync.

Packages
========

The following steps must be run on all nodes of our DB cluster.

Let's start by installing the appropriate packages:

    $ sudo apt-get install corosync pacemaker pacemaker-cli-utils cluster-glue

Then we need to update the pgsql RA (resource agent), as the one shipped in the distribution packages is a bit old:

    $ sudo wget https://raw.githubusercontent.com/ClusterLabs/resource-agents/master/heartbeat/pgsql -O /usr/lib/ocf/resource.d/heartbeat/pgsql

Corosync configuration
======================

After we have finished with the installations, it's time to configure corosync. The following configuration should
be applied on every database node as /etc/corosync/corosync.conf:

    totem {
        version: 2
        secauth: off
        cluster_name: dbcluster
        transport: udpu
    }

    nodelist {
        node {
            ring0_addr: db1.dbcluster
            nodeid: 1
        }
        node {
            ring0_addr: db2.dbcluster
            nodeid: 2
        }
    }

    logging {
        fileline: off
        to_logfile: yes
        to_syslog: no
        debug: off
        logfile: /var/log/corosync.log
        timestamp: on
        logger_subsys {
            subsys: AMF
            debug: off
        }
    }

    quorum {
        provider: corosync_votequorum
        expected_votes: 2
        two_node: 1
    }

After restarting corosync and pacemaker on all nodes, we are ready to configure pacemaker.
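Before doing so, a quick sanity check that corosync has actually formed the cluster saves debugging later. A minimal
sketch using the tools installed above (exact output varies between versions):

    $ sudo corosync-cfgtool -s    # ring status; the ring should be reported active with no faults
    $ sudo crm_mon -1             # one-shot cluster status; both db nodes should show as Online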
Pacemaker configuration is done through the crm shell, so we execute the following:

Pacemaker/resources configuration
=================================

    crm configure property no-quorum-policy="ignore"
    crm configure property stonith-enabled="false"    # we don't need STONITH for now

    crm configure rsc_defaults resource-stickiness="INFINITY"
    crm configure rsc_defaults migration-threshold=1

    # The virtual IP of the MASTER node
    crm configure primitive vip-master ocf:heartbeat:IPaddr2 params ip="10.8.3.1" cidr_netmask="24" \
        op start timeout="60s" interval="0s" on-fail="restart" \
        op monitor timeout="60s" interval="10s" on-fail="restart" \
        op stop timeout="60s" interval="0s" on-fail="block"

    # The virtual IP of the SLAVE node
    crm configure primitive vip-slave ocf:heartbeat:IPaddr2 params ip="10.8.3.2" cidr_netmask="24" \
        meta resource-stickiness="1" \
        op start timeout="60s" interval="0s" on-fail="restart" \
        op monitor timeout="60s" interval="10s" on-fail="restart" \
        op stop timeout="60s" interval="0s" on-fail="block"

    crm configure primitive pingCheck ocf:pacemaker:ping \
        params name="default_ping_set" host_list="10.8.3.1" multiplier="100" \
        op start timeout="60s" interval="0s" on-fail="restart" \
        op monitor timeout="60s" interval="10s" on-fail="restart" \
        op stop timeout="60s" interval="0s" on-fail="ignore"

    crm configure clone clnPingCheck pingCheck

    crm configure primitive pgsql ocf:heartbeat:pgsql \
        params pgport="5432" \
        pgctl="/usr/lib/postgresql/9.3/bin/pg_ctl" \
        psql="/usr/lib/postgresql/9.3/bin/psql" \
        pgdata="/db/data/" \
        node_list="db1.dbcluster db2.dbcluster" \
        restore_command="cp /db/data/pg_archive/%f %p" \
        primary_conninfo_opt="keepalives_idle=60 keepalives_interval=5 keepalives_count=5" \
        master_ip="10.8.3.1" \
        stop_escalate="0" \
        rep_mode="async" \
        start_opt="-p 5432" \
        op start timeout="60s" interval="0s" on-fail="restart" \
        op monitor timeout="60s" interval="4s" on-fail="restart" \
        op monitor timeout="60s" interval="3s" on-fail="restart" role="Master" \
        op promote timeout="60s" interval="0s" on-fail="restart" \
        op demote timeout="60s" interval="0s" on-fail="stop" \
        op stop timeout="60s" interval="0s" on-fail="block" \
        op notify timeout="60s" interval="0s"

    crm configure ms msPostgresql pgsql \
        meta master-max="1" master-node-max="1" clone-max="2" clone-node-max="1" notify="true"

    crm configure colocation rsc_colocation-1 inf: msPostgresql clnPingCheck
    crm configure colocation rsc_colocation-2 inf: vip-master msPostgresql:Master

    # We want the slave VIP to move to the master if the slave fails. This is optional,
    # but it helps if read traffic is served by the slave node.
    # crm configure colocation rsc_colocation-3 inf: vip-slave msPostgresql:Slave

    crm configure order rsc_order-1 0: clnPingCheck msPostgresql
    crm configure order rsc_order-2 0: msPostgresql:promote vip-master:start symmetrical=false

    # Again optional, but needed if we serve read traffic from the slave
    # crm configure order rsc_order-3 0: msPostgresql:demote vip-slave:start symmetrical=false

    crm configure location rsc_location-1 vip-slave \
        rule 200: pgsql-status eq "HS:sync" \
        rule 200: pgsql-status eq "HS:async" \
        rule 100: pgsql-status eq "PRI"

    crm configure location rsc_location-2 msPostgresql \
        rule -inf: not_defined default_ping_set or default_ping_set lt 100
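Once everything is loaded, a rough way to watch the cluster do its job (a sketch, not part of the configuration
above, assuming db1.dbcluster currently holds the master role): crm_mon with node attributes shows the pgsql-status
value (PRI / HS:sync / HS:async) that the location rules above rely on, and putting the master node in standby is a
simple way to trigger a promotion.

    # one-shot status including node attributes (pgsql-status: PRI on the master, HS:* on the standby)
    $ sudo crm_mon -A -1

    # rough failover test: take the current master out of service and watch the standby get promoted
    $ sudo crm node standby db1.dbcluster
    $ sudo crm_mon -A -1

    # bring the node back afterwards; with the pgsql RA the old master typically needs a fresh
    # base backup (and removal of the lock file the agent may leave behind) before it rejoins as a standby
    $ sudo crm node online db1.dbcluster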

diff --git a/_includes/nav.html b/_includes/nav.html
index 8d9d478..abdd6c3 100644
--- a/_includes/nav.html
+++ b/_includes/nav.html
@@ -47,6 +47,14 @@
(nav.html hunk, markup not recoverable from the rendered patch: it adds a "High availability" group to the sidebar
navigation, below the existing "Postgres the cool parts" group and above the site.ads loop, iterating over
site.high_availability and listing each document by its title in the same style as the existing groups.)