Skip to content

Commit

Permalink
Merge pull request #247 from cubeek/osprh-654
Browse files Browse the repository at this point in the history
Fix OVN raft cluster recovery after destructive pods' deletion
  • Loading branch information
openshift-merge-bot[bot] authored Mar 28, 2024
2 parents effd8a0 + b74ef56 commit 1260d2e
Show file tree
Hide file tree
Showing 5 changed files with 102 additions and 66 deletions.
30 changes: 18 additions & 12 deletions pkg/ovndbcluster/statefulset.go
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,6 @@ func StatefulSet(
}

var preStopCmd []string
var postStartCmd []string
cmd := []string{"/usr/bin/dumb-init"}
args := []string{"--single-child", "--", "/bin/bash", "-c", ServiceCommand}
//
Expand All @@ -69,19 +68,11 @@ func StatefulSet(
}
readinessProbe.Exec = livenessProbe.Exec

postStartCmd = []string{
"/usr/local/bin/container-scripts/settings.sh",
}
preStopCmd = []string{
"/usr/local/bin/container-scripts/cleanup.sh",
}

lifecycle := &corev1.Lifecycle{
PostStart: &corev1.LifecycleHandler{
Exec: &corev1.ExecAction{
Command: postStartCmd,
},
},
PreStop: &corev1.LifecycleHandler{
Exec: &corev1.ExecAction{
Command: preStopCmd,
Expand Down Expand Up @@ -124,6 +115,19 @@ func StatefulSet(
volumeMounts = append(volumeMounts, svc.CreateVolumeMounts(serviceName)...)
}

// NOTE(ihar) ovndb pods leave the raft cluster on delete; it's important
// that they are not interrupted and have a good chance to propagate the
// leave message to the leader. In general case, this should happen near
// instantly. But if the leader pod is itself down / restarting, it may take
// it some time to recover and start processing messages from other members.
// The default value of 30 seconds is sometimes not enough. In local testing,
// 60 seconds seems enough, but we'll take a significantly more conservative
// approach here and set it to 5 minutes.
//
// If the leader is not back even after 5 minutes, we'll give up
// nevertheless, and manual cluster recovery will be needed.
terminationGracePeriodSeconds := int64(300)

statefulset := &appsv1.StatefulSet{
ObjectMeta: metav1.ObjectMeta{
Name: serviceName,
Expand All @@ -133,15 +137,17 @@ func StatefulSet(
Selector: &metav1.LabelSelector{
MatchLabels: labels,
},
ServiceName: serviceName,
Replicas: instance.Spec.Replicas,
ServiceName: serviceName,
PodManagementPolicy: appsv1.ParallelPodManagement,
Replicas: instance.Spec.Replicas,
Template: corev1.PodTemplateSpec{
ObjectMeta: metav1.ObjectMeta{
Annotations: annotations,
Labels: labels,
},
Spec: corev1.PodSpec{
ServiceAccountName: instance.RbacResourceName(),
TerminationGracePeriodSeconds: &terminationGracePeriodSeconds,
ServiceAccountName: instance.RbacResourceName(),
Containers: []corev1.Container{
{
Name: serviceName,
Expand Down
21 changes: 21 additions & 0 deletions templates/ovndbcluster/bin/cleanup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,27 @@ DB_TYPE="{{ .DB_TYPE }}"
if [[ "${DB_TYPE}" == "sb" ]]; then
DB_NAME="OVN_Southbound"
fi

# There is nothing special about -0 pod, except that it's always guaranteed to
# exist, assuming any replicas are ordered.
if [[ "$(hostname)" != "{{ .SERVICE_NAME }}-0" ]]; then
ovs-appctl -t /tmp/ovn${DB_TYPE}_db.ctl cluster/leave ${DB_NAME}

# wait for when the leader confirms we left the cluster
while true; do
# TODO: is there a better way to detect the cluster left state?..
STATUS=$(ovs-appctl -t /tmp/ovn${DB_TYPE}_db.ctl cluster/status ${DB_NAME} | grep Status: | awk -e '{print $2}')
if [ -z "$STATUS" -o "x$STATUS" = "xleft cluster" ]; then
break
fi
sleep 1
done
fi

# If replicas are 0 and *all* pods are removed, we still want to retain the
# database with its cid/sid for when the cluster is scaled back to > 0, so
# leaving the database file intact for -0 pod.
if [[ "$(hostname)" != "{{ .SERVICE_NAME }}-0" ]]; then
# now that we left, the database file is no longer valid
rm -f /etc/ovn/ovn${DB_TYPE}_db.db
fi
51 changes: 0 additions & 51 deletions templates/ovndbcluster/bin/settings.sh

This file was deleted.

53 changes: 51 additions & 2 deletions templates/ovndbcluster/bin/setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,11 @@
set -ex
DB_TYPE="{{ .DB_TYPE }}"
DB_PORT="{{ .DB_PORT }}"
{{- if .TLS }}
DB_SCHEME="pssl"
{{- else }}
DB_SCHEME="ptcp"
{{- end }}
RAFT_PORT="{{ .RAFT_PORT }}"
NAMESPACE="{{ .NAMESPACE }}"
OPTS=""
Expand All @@ -33,8 +38,11 @@ else
DB_ADDR="[::]"
fi

# The --cluster-remote-addr / --cluster-local-addr options will have effect
# only on bootstrap, when we assume the leadership role for the first pod.
# Later, cli arguments are still passed, but raft membership hints are already
# stored in the databases, and hence the arguments are of no effect.
if [[ "$(hostname)" != "{{ .SERVICE_NAME }}-0" ]]; then
rm -f /etc/ovn/ovn${DB_TYPE}_db.db
#ovsdb-tool join-cluster /etc/ovn/ovn${DB_TYPE}_db.db ${DB_NAME} tcp:$(hostname).{{ .SERVICE_NAME }}.${NAMESPACE}.svc.cluster.local:${RAFT_PORT} tcp:{{ .SERVICE_NAME }}-0.{{ .SERVICE_NAME }}.${NAMESPACE}.svc.cluster.local:${RAFT_PORT}
OPTS="--db-${DB_TYPE}-cluster-remote-addr={{ .SERVICE_NAME }}-0.{{ .SERVICE_NAME }}.${NAMESPACE}.svc.cluster.local --db-${DB_TYPE}-cluster-remote-port=${RAFT_PORT}"
fi
Expand Down Expand Up @@ -73,4 +81,45 @@ set "$@" --ovn-${DB_TYPE}-log=-vconsole:{{ .OVN_LOG_LEVEL }}
set "$@" --ovn-${DB_TYPE}-logfile=/dev/null

# don't log to file (we already log to console)
$@ ${OPTS} run_${DB_TYPE}_ovsdb -- -vfile:off
$@ ${OPTS} run_${DB_TYPE}_ovsdb -- -vfile:off &

# Once the database is running, we will attempt to configure db options
CTLCMD="ovn-${DB_TYPE}ctl --no-leader-only"

# Nothing special about the first pod, we just know that it always exists with
# replicas > 0 and use it for configuration. In theory, this could be executed
# in any other pod.
if [[ "$(hostname)" == "{{ .SERVICE_NAME }}-0" ]]; then
# The command will wait until the daemon is connected and the DB is available
# All following ctl invocation will use the local DB replica in the daemon
export OVN_${DB_TYPE^^}_DAEMON=$(${CTLCMD} --pidfile --detach)

{{- if .TLS }}
${CTLCMD} set-ssl {{.OVNDB_KEY_PATH}} {{.OVNDB_CERT_PATH}} {{.OVNDB_CACERT_PATH}}
${CTLCMD} set-connection ${DB_SCHEME}:${DB_PORT}:${DB_ADDR}
{{- end }}

# OVN does not support setting inactivity-probe through --remote cli arg so
# we have to set it after database is up.
#
# In theory, ovsdb.local-config(5) could be used to configure inactivity
# probe using a local ovsdb-server. But the future of this database is
# unclear, and it was largely abandoned by the community in mid-flight, so
# no tools exist to configure connections using this database. It may even
# be that this scheme will be abandoned in the future, because its features
# are covered by ovs text config file support added in latest ovs releases.
#
# TODO: Consider migrating inactivity probe setting to config files when
# we update to ovs 3.3. See --config-file in ovsdb-server(1) for more
# details.
while [ "$(${CTLCMD} get connection . inactivity_probe)" != "{{ .OVN_INACTIVITY_PROBE }}" ]; do
${CTLCMD} --inactivity-probe={{ .OVN_INACTIVITY_PROBE }} set-connection ${DB_SCHEME}:${DB_PORT}:${DB_ADDR}
done
${CTLCMD} list connection

# The daemon is no longer needed, kill it
kill $(cat $OVN_RUNDIR/ovn-${DB_TYPE}ctl.pid)
unset OVN_${DB_TYPE^^}_DAEMON
fi

wait
13 changes: 12 additions & 1 deletion tests/functional/ovndbcluster_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ import (
condition "github.com/openstack-k8s-operators/lib-common/modules/common/condition"
. "github.com/openstack-k8s-operators/lib-common/modules/common/test/helpers"
ovnv1 "github.com/openstack-k8s-operators/ovn-operator/api/v1beta1"
appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/types"
)
Expand Down Expand Up @@ -159,6 +160,16 @@ var _ = Describe("OVNDBCluster controller", func() {
Expect(OVNDBCluster.Spec.DBType).Should(Equal(ovnv1.NBDBType))
})

It("should have the StatefulSet with podManagementPolicy set to Parallel", func() {
statefulSetName := types.NamespacedName{
Namespace: namespace,
Name: "ovsdbserver-nb",
}
ss := th.GetStatefulSet(statefulSetName)

Expect(ss.Spec.PodManagementPolicy).Should(Equal(appsv1.ParallelPodManagement))
})

It("should have the Status fields initialized", func() {
OVNDBCluster := GetOVNDBCluster(OVNDBClusterName)
Expect(OVNDBCluster.Status.Hash).To(BeEmpty())
Expand Down Expand Up @@ -490,7 +501,7 @@ var _ = Describe("OVNDBCluster controller", func() {
return *th.GetConfigMap(scriptsCM)
}, timeout, interval).ShouldNot(BeNil())

Expect(th.GetConfigMap(scriptsCM).Data["settings.sh"]).Should(
Expect(th.GetConfigMap(scriptsCM).Data["setup.sh"]).Should(
ContainSubstring("DB_SCHEME=\"pssl\""))
Expect(th.GetConfigMap(scriptsCM).Data["setup.sh"]).Should(And(
ContainSubstring("-db-ssl-key="),
Expand Down

0 comments on commit 1260d2e

Please sign in to comment.