Ce guide décrit comment ajouter un serveur PostgreSQL slave (réplique) à un serveur master existant dans un environnement OpenShift.
Architecture cible :
┌─────────────────────────┐
│ Service Master (RW) │ ← Applications (lecture/écriture)
│ postgresql-master │
└────────────┬────────────┘
│
┌────────▼────────┐
│ Pod Master │
│ postgresql-0 │
└────────┬────────┘
│ Streaming Replication
│
┌────────▼────────┐
│ Pod Slave │
│ postgresql-1 │
└────────┬────────┘
│
┌────────────▼────────────┐
│ Service Slave (RO) │ ← Applications (lecture seule)
│ postgresql-slave │
└─────────────────────────┘
Principes clés :
Prérequis : un accès au cluster (CLI oc ou console web) avec le rôle edit ou admin dans le namespace cible.
Identifier les ressources PostgreSQL existantes :
oc project <nom-du-projet>
oc get all | grep postgres
oc get pvc | grep postgres
oc get configmap | grep postgres
oc get secret | grep postgres
Vérifier la version PostgreSQL du master :
POD_NAME=$(oc get pods -l name=postgresql -o jsonpath='{.items[0].metadata.name}')
oc exec $POD_NAME -- psql --version
⚠️ CRITIQUE : Effectuer un backup complet avant toute modification
# Sauvegarder les ressources OpenShift
oc get dc/postgresql -o yaml > backup-postgresql-dc.yaml
oc get svc/postgresql -o yaml > backup-postgresql-svc.yaml
oc get pvc/postgresql -o yaml > backup-postgresql-pvc.yaml
oc get configmap -o yaml > backup-configmaps.yaml
oc get secret -o yaml > backup-secrets.yaml
# Backup de la base de données
oc exec $POD_NAME -- pg_dumpall -U postgres > backup-database.sql
Resource : postgresql-replication-secret.yaml
apiVersion: v1
kind: Secret
metadata:
name: postgresql-replication
type: Opaque
stringData:
replication-user: replicator
replication-password: <GÉNÉRER_MOT_DE_PASSE_FORT>
Appliquer :
oc apply -f postgresql-replication-secret.yaml
Resource : postgresql-master-config.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: postgresql-master-config
data:
master.conf: |
# Configuration Master PostgreSQL
listen_addresses = '*'
max_connections = 100
shared_buffers = 256MB
# ===== Configuration WAL pour Réplication =====
wal_level = replica # Active la réplication
max_wal_senders = 10 # Nombre max de connexions de réplication
max_replication_slots = 10 # Nombre max de slots
wal_keep_size = 1GB # Rétention des WAL (ajuster selon le besoin)
hot_standby = on # Permet les lectures sur le slave
# ===== Archivage WAL (optionnel mais recommandé) =====
archive_mode = on
archive_command = 'test ! -f /var/lib/postgresql/archive/%f && cp %p /var/lib/postgresql/archive/%f'
# ===== Logging =====
log_destination = 'stderr'
logging_collector = on
Points d'attention :
10.128.0.0/14 est le réseau de pods par défaut d'OpenShift ; à ajuster selon votre configuration réseau. NOTE : la ConfigMap doit également contenir une clé pg_hba.conf (avec une entrée du type « host replication replicator 10.128.0.0/14 scram-sha-256 »), puisqu'elle est montée plus loin via subPath: pg_hba.conf — à vérifier/compléter avant application.
Appliquer :
oc apply -f postgresql-master-config.yaml
Resource : postgresql-master-init.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: postgresql-master-init
data:
init-master.sh: |
#!/bin/bash
set -e
echo "Initialisation du master pour la réplication..."
# Vérifier si déjà initialisé
if psql -U postgres -d postgres -tAc "SELECT 1 FROM pg_roles WHERE rolname='replicator'" | grep -q 1; then
echo "Utilisateur de réplication déjà existant"
else
# Créer l'utilisateur de réplication
psql -U postgres <<-EOSQL
CREATE USER replicator WITH REPLICATION ENCRYPTED PASSWORD '${REPLICATION_PASSWORD}';
SELECT pg_create_physical_replication_slot('replication_slot_slave');
EOSQL
echo "Utilisateur de réplication créé avec succès"
fi
Ce script :
- crée l'utilisateur replicator avec le privilège REPLICATION ;
- crée le slot de réplication physique replication_slot_slave.
Appliquer :
oc apply -f postgresql-master-init.yaml
⚠️ ATTENTION : Redémarrage du pod master requis (interruption de service ~2-5 minutes)
Récupérer le DeploymentConfig actuel :
oc get dc/postgresql -o yaml > postgresql-master-updated.yaml
Modifications à appliquer dans spec.template.spec :
spec:
template:
spec:
volumes:
# ... volumes existants ...
- name: master-config
configMap:
name: postgresql-master-config
- name: master-init
configMap:
name: postgresql-master-init
defaultMode: 0755
- name: archive
emptyDir: {}
containers:
- name: postgresql
volumeMounts:
# ... volumeMounts existants ...
- name: master-config
mountPath: /etc/postgresql/master.conf
subPath: master.conf
- name: master-config
mountPath: /etc/postgresql/pg_hba.conf
subPath: pg_hba.conf
- name: master-init
mountPath: /docker-entrypoint-initdb.d/init-master.sh
subPath: init-master.sh
- name: archive
mountPath: /var/lib/postgresql/archive
env:
# ... env existantes ...
- name: REPLICATION_PASSWORD
valueFrom:
secretKeyRef:
name: postgresql-replication
key: replication-password
- name: POSTGRESQL_ADMIN_PASSWORD
valueFrom:
secretKeyRef:
name: postgresql # Adapter au nom du secret existant
key: database-password
args:
- "-c"
- "config_file=/etc/postgresql/master.conf"
- "-c"
- "hba_file=/etc/postgresql/pg_hba.conf"
Application des modifications :
# Validation syntaxe
oc apply -f postgresql-master-updated.yaml --dry-run=client
# Application (⚠️ redémarrage du pod)
oc apply -f postgresql-master-updated.yaml
# Surveillance
oc get pods -w
Attendre que le pod soit prêt :
oc wait --for=condition=ready pod -l name=postgresql --timeout=300s
POD_NAME=$(oc get pods -l name=postgresql -o jsonpath='{.items[0].metadata.name}')
Vérifications obligatoires :
oc exec $POD_NAME -- psql -U postgres -c "SELECT rolname, rolreplication FROM pg_roles WHERE rolname='replicator';"
Résultat attendu : replicator | t
oc exec $POD_NAME -- psql -U postgres -c "SELECT * FROM pg_replication_slots;"
Résultat attendu : replication_slot_slave | physical | f
oc exec $POD_NAME -- psql -U postgres -c "SHOW wal_level; SHOW max_wal_senders; SHOW wal_keep_size;"
Résultat attendu : replica, 10, 1GB
Resource : postgresql-slave-pvc.yaml
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: postgresql-slave
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 20Gi # Ajuster à la taille du master ou plus
storageClassName: standard # Ajuster selon votre storage class
Points d'attention :
Appliquer :
oc apply -f postgresql-slave-pvc.yaml
Resource : postgresql-slave-config.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: postgresql-slave-config
data:
slave.conf: |
# Configuration Slave PostgreSQL
listen_addresses = '*'
max_connections = 100
shared_buffers = 256MB
# ===== Configuration Réplication =====
wal_level = replica
hot_standby = on # CRITIQUE : permet les lectures
max_wal_senders = 10
max_replication_slots = 10
# ===== Logging =====
log_destination = 'stderr'
logging_collector = on
log_directory = 'log'
log_filename = 'postgresql-%Y-%m-%d_%H%M%S.log'
log_statement = 'mod'
log_connections = on
log_disconnections = on
setup-slave.sh: |
#!/bin/bash
set -e
echo "Configuration du slave PostgreSQL..."
# NOTE(review) : le contenu original du script a été perdu à l'extraction ;
# reconstruction proposée à partir des options pg_basebackup décrites ci-dessous — à valider.
# Ne rien faire si le répertoire de données est déjà initialisé
if [ -s "${PGDATA}/PG_VERSION" ]; then
  echo "Données déjà présentes, basebackup ignoré"
  exit 0
fi
# Copie initiale depuis le master (mot de passe fourni via PGPASSWORD
# pour éviter l'invite interactive de l'option -W en init container)
export PGPASSWORD="${REPLICATION_PASSWORD}"
pg_basebackup -h "${MASTER_SERVICE}" -D "${PGDATA}" -U replicator -v -P -R -X stream -S replication_slot_slave
echo "Basebackup terminé : le slave démarrera en mode standby (standby.signal créé par -R)"
Détails pg_basebackup :
- -h : hôte du master
- -D : répertoire de destination
- -U : utilisateur de réplication
- -v -P : verbose et progression
- -W : demander le mot de passe (préférer la variable PGPASSWORD en contexte non interactif)
- -R : créer automatiquement la configuration de réplication (standby.signal + primary_conninfo)
- -X stream : inclure les WAL nécessaires
- -S : utiliser le slot de réplication
Appliquer :
oc apply -f postgresql-slave-config.yaml
Resource : postgresql-slave-deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
name: postgresql-slave
labels:
app: postgresql
role: slave
spec:
replicas: 1
selector:
matchLabels:
app: postgresql
role: slave
template:
metadata:
labels:
app: postgresql
role: slave
spec:
# ===== Init Container : Configuration initiale du slave =====
initContainers:
- name: setup-slave
image: postgres:15 # ⚠️ MÊME VERSION QUE LE MASTER
command: ["/bin/bash", "/scripts/setup-slave.sh"]
env:
- name: PGDATA
value: /var/lib/postgresql/data
- name: MASTER_SERVICE
value: postgresql
Points d'attention :
Appliquer :
oc apply -f postgresql-slave-deployment.yaml
Resource : postgresql-slave-service.yaml
apiVersion: v1
kind: Service
metadata:
name: postgresql-slave
labels:
app: postgresql
role: slave
spec:
ports:
- name: postgresql
port: 5432
targetPort: 5432
selector:
app: postgresql
role: slave
type: ClusterIP
Appliquer :
oc apply -f postgresql-slave-service.yaml
# Surveiller les pods
oc get pods -l role=slave -w
# Logs de l'init container
SLAVE_POD=$(oc get pods -l role=slave -o jsonpath='{.items[0].metadata.name}')
oc logs $SLAVE_POD -c setup-slave -f
# Logs du conteneur principal
oc logs $SLAVE_POD -c postgresql -f
Durée estimée : 5-15 minutes selon la taille de la base de données.
MASTER_POD=$(oc get pods -l name=postgresql -o jsonpath='{.items[0].metadata.name}')
oc exec $MASTER_POD -- psql -U postgres -c "
SELECT
pid,
usename,
application_name,
client_addr,
state,
sync_state,
sent_lsn,
write_lsn,
flush_lsn,
replay_lsn
FROM pg_stat_replication;
"
Résultat attendu :
pid | usename | application_name | client_addr | state | sync_state
-----+------------+------------------+-------------+-----------+------------
123 | replicator | walreceiver | 10.128.x.x | streaming | async
Vérification des colonnes :
- state = streaming : réplication active
- sync_state = async : mode asynchrone (normal)
- sent_lsn = replay_lsn : pas de lag
Vérifier les slots de réplication :
oc exec $MASTER_POD -- psql -U postgres -c "
SELECT
slot_name,
slot_type,
active,
restart_lsn
FROM pg_replication_slots;
"
Résultat attendu :
slot_name | slot_type | active | restart_lsn
----------------------+-----------+--------+-------------
replication_slot_slave | physical | t | 0/3000000
Vérifications :
- active = t : le slot est utilisé
- restart_lsn : position de départ de la rétention WAL
Mesurer le lag de réplication :
oc exec $MASTER_POD -- psql -U postgres -c "
SELECT
client_addr,
state,
(pg_wal_lsn_diff(pg_current_wal_lsn(), replay_lsn) / 1024 / 1024)::INT AS lag_mb,
(EXTRACT(EPOCH FROM (now() - replay_time)))::INT AS lag_seconds
FROM pg_stat_replication;
"
Valeurs acceptables :
- lag_mb < 10 : excellent
- lag_seconds < 5 : excellent
Créer des données sur le master :
oc exec $MASTER_POD -- psql -U postgres <<EOF
CREATE TABLE IF NOT EXISTS test_replication (
id SERIAL PRIMARY KEY,
message TEXT,
created_at TIMESTAMP DEFAULT NOW()
);
INSERT INTO test_replication (message)
VALUES ('Test 1'), ('Test 2'), ('Test 3');
EOF
Vérifier sur le slave :
SLAVE_POD=$(oc get pods -l role=slave -o jsonpath='{.items[0].metadata.name}')
oc exec $SLAVE_POD -- psql -U postgres -c "SELECT * FROM test_replication;"
✅ Attendu : Les 3 lignes apparaissent sur le slave.
oc exec $SLAVE_POD -- psql -U postgres -c "
INSERT INTO test_replication (message) VALUES ('Should fail');
"
✅ Attendu : Erreur cannot execute INSERT in a read-only transaction
Générer une charge :
oc exec $MASTER_POD -- psql -U postgres <<EOF
INSERT INTO test_replication (message)
SELECT 'Test ' || generate_series(1, 10000);
EOF
Vérifier immédiatement sur le slave :
oc exec $SLAVE_POD -- psql -U postgres -c "SELECT COUNT(*) FROM test_replication;"
✅ Attendu : Les 10000 lignes sont répliquées en < 5 secondes.
Créer un pod de test :
oc run postgresql-test --image=postgres:15 --rm -it --restart=Never -- bash
Depuis le pod de test :
# Connexion master (RW)
psql -h postgresql -U postgres -d postgres
# Connexion slave (RO)
psql -h postgresql-slave -U postgres -d postgres
Resource : postgresql-monitoring-queries.yaml
apiVersion: v1
kind: ConfigMap
metadata:
name: postgresql-monitoring-queries
data:
replication-status.sql: |
SELECT
client_addr,
state,
sync_state,
(pg_wal_lsn_diff(pg_current_wal_lsn(), replay_lsn) / 1024 / 1024)::INT AS lag_mb,
(EXTRACT(EPOCH FROM (now() - replay_time)))::INT AS lag_seconds
FROM pg_stat_replication;
database-size.sql: |
SELECT
pg_database.datname,
pg_size_pretty(pg_database_size(pg_database.datname)) AS size
FROM pg_database
ORDER BY pg_database_size(pg_database.datname) DESC;
connection-count.sql: |
SELECT
count(*) as total_connections,
sum(case when state = 'active' then 1 else 0 end) as active_connections,
sum(case when state = 'idle' then 1 else 0 end) as idle_connections
FROM pg_stat_activity;
replication-slots.sql: |
SELECT
slot_name,
slot_type,
active,
pg_size_pretty(pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn)) AS retained_wal
FROM pg_replication_slots;
Utilisation :
oc exec $MASTER_POD -- psql -U postgres -f /path/to/replication-status.sql
Resource : postgresql-replication-alerts.yaml
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: postgresql-replication-alerts
spec:
groups:
- name: postgresql-replication
interval: 30s
rules:
- alert: PostgreSQLReplicationLagHigh
expr: pg_replication_lag_seconds > 30
for: 5m
labels:
severity: warning
annotations:
summary: "PostgreSQL replication lag is high"
description: "Replication lag is {{ $value }} seconds on {{ $labels.instance }}"
- alert: PostgreSQLReplicationBroken
expr: pg_replication_connected == 0
for: 1m
labels:
severity: critical
annotations:
summary: "PostgreSQL replication is broken"
Resource : postgresql-monitoring-cronjob.yaml
apiVersion: batch/v1
kind: CronJob
metadata:
name: postgresql-replication-check
spec:
schedule: "*/15 * * * *" # Toutes les 15 minutes
jobTemplate:
spec:
template:
spec:
serviceAccountName: postgresql-monitor
containers:
- name: check
image: postgres:15
command:
- /bin/bash
- -c
- |
MASTER_POD=$(kubectl get pods -l name=postgresql -o jsonpath='{.items[0].metadata.name}')
echo "Checking replication status..."
kubectl exec $MASTER_POD -- psql -U postgres -t -c "
SELECT
CASE
WHEN count(*) = 0 THEN 'ERROR: No replication connection'
WHEN max((pg_wal_lsn_diff(pg_current_wal_lsn(), replay_lsn) / 1024 / 1024)::INT) > 100 THEN 'WARNING: Lag > 100MB'
WHEN max(EXTRACT(EPOCH FROM (now() - replay_time))::INT) > 60 THEN 'WARNING: Lag > 60 seconds'
ELSE 'OK: Replication healthy'
END as status
FROM pg_stat_replication;
"
Pré-requis : Créer le ServiceAccount avec les permissions nécessaires.
Si le slave pose problème, il peut être supprimé sans impact sur le master :
# Supprimer les ressources
oc delete deployment postgresql-slave
oc delete service postgresql-slave
oc delete pvc postgresql-slave
# Nettoyer le slot de réplication sur le master
MASTER_POD=$(oc get pods -l name=postgresql -o jsonpath='{.items[0].metadata.name}')
oc exec $MASTER_POD -- psql -U postgres -c "
SELECT pg_drop_replication_slot('replication_slot_slave');
"
En cas de problème après modification du master :
# Restaurer la configuration d'origine
oc apply -f backup-postgresql-dc.yaml
# Surveiller le rollout
oc rollout status dc/postgresql
# Vérifier l'état
oc get pods -l name=postgresql
⚠️ Attention : Les ConfigMaps créées resteront présentes mais ne seront plus utilisées.
Symptômes :
- Pod slave en CrashLoopBackOff
Diagnostic :
oc logs $SLAVE_POD -c setup-slave
oc logs $SLAVE_POD -c postgresql
oc describe pod $SLAVE_POD
Solutions courantes :
oc exec $SLAVE_POD -- ping postgresql
oc get secret postgresql-replication -o yaml
oc exec $MASTER_POD -- psql -U postgres -c "SELECT * FROM pg_replication_slots;"
oc delete pod $SLAVE_POD
oc delete pvc postgresql-slave
oc apply -f postgresql-slave-pvc.yaml
Symptômes :
- lag_mb > 100 MB
- lag_seconds > 30 secondes
Diagnostic :
# Ressources du slave
oc describe pod $SLAVE_POD
oc top pod $SLAVE_POD
# Charge du master
oc top pod $MASTER_POD
Solutions :
oc edit deployment postgresql-slave
# Augmenter memory et cpu
oc exec $MASTER_POD -- ping $SLAVE_POD_IP
oc exec $SLAVE_POD -- iostat -x 1
Symptômes :
- retained_wal très élevé (les WAL s'accumulent sur le master)
Diagnostic :
oc exec $MASTER_POD -- psql -U postgres -c "
SELECT
slot_name,
active,
pg_size_pretty(pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn)) AS retained_wal
FROM pg_replication_slots;
"
Solutions :
Si le slave est actif mais lent : voir section lag
Si le slave est perdu (inactive) :
# Supprimer le slot
oc exec $MASTER_POD -- psql -U postgres -c "
SELECT pg_drop_replication_slot('replication_slot_slave');
"
# Recréer le slave depuis zéro
Checklist de validation :
- backup complet effectué (ressources OpenShift + pg_dumpall) ;
- utilisateur replicator créé ;
- slot replication_slot_slave créé ;
- réplication en état streaming.
Principe : Diriger les lectures seules vers le slave, les écritures vers le master.
Configuration des variables d'environnement :
# Pour une application déployée
oc set env deployment/mon-application \
DATABASE_URL_WRITE="postgresql://user:pass@postgresql:5432/dbname" \
DATABASE_URL_READ="postgresql://user:pass@postgresql-slave:5432/dbname"
Exemple de configuration applicative (Node.js / TypeORM) :
{
type: 'postgres',
replication: {
master: {
host: 'postgresql',
port: 5432,
username: 'user',
password: 'pass',
database: 'dbname'
},
slaves: [{
host: 'postgresql-slave',
port: 5432,
username: 'user',
password: 'pass',
database: 'dbname'
}]
}
}
Paramètres essentiels à retenir : wal_level = replica et hot_standby = on.
# Logs
oc logs <pod-name>
oc logs <pod-name> -c <container-name>
# Events
oc get events --sort-by='.lastTimestamp' | grep postgres
# Describe
oc describe pod <pod-name>
oc describe pvc <pvc-name>
# État des ressources
oc get all -l app=postgresql
oc top pod -l app=postgresql