Skip to content

Commit

Permalink
Shortening datapath downtime for libreswan cable
Browse files Browse the repository at this point in the history
Currently, the libreswan cable datapath is down for about 2 seconds
when the gateway pod restarts.

When Pluto starts running [1], the IPsec datapath rules created by the
previous GW pod are cleared, and traffic is only recovered after
ConnectToEndpoint events are handled by the libreswan cable driver.

This PR reduces datapath downtime by:
A. Checking the Pluto control socket status at a faster rate
   (every 20 ms instead of every 1 s)
B. Starting Pluto right before the first ConnectToEndpoint event
   is handled by the libreswan cable driver, instead of in Init().

With this PR, the datapath downtime is reduced to ~0.5 seconds.

[1]
https://github.com/submariner-io/submariner/blob/devel/pkg/cable/libreswan/libreswan.go#L158

Signed-off-by: Yossi Boaron <yboaron@redhat.com>
  • Loading branch information
yboaron authored and skitt committed Jul 10, 2023
1 parent ce1c912 commit f7e001a
Showing 1 changed file with 13 additions and 7 deletions.
20 changes: 13 additions & 7 deletions pkg/cable/libreswan/libreswan.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ type libreswan struct {

debug bool
forceUDPEncapsulation bool
plutoStarted bool
}

type specification struct {
Expand Down Expand Up @@ -131,6 +132,7 @@ func NewLibreswan(localEndpoint *types.SubmarinerEndpoint, localCluster *types.S
localEndpoint: *localEndpoint,
connections: []subv1.Connection{},
forceUDPEncapsulation: ipSecSpec.ForceEncaps,
plutoStarted: false,
}, nil
}

Expand All @@ -152,11 +154,6 @@ func (i *libreswan) Init() error {

fmt.Fprintf(file, "%%any %%any : PSK \"%s\"\n", i.secretKey)

// Ensure Pluto is started
if err := i.runPluto(); err != nil {
return errors.Wrap(err, "error starting Pluto")
}

return nil
}

Expand Down Expand Up @@ -332,6 +329,15 @@ func whack(args ...string) error {
// ConnectToEndpoint establishes a connection to the given endpoint and returns a string
// representation of the IP address of the target endpoint.
func (i *libreswan) ConnectToEndpoint(endpointInfo *natdiscovery.NATEndpointInfo) (string, error) {
if !i.plutoStarted {
// Ensure Pluto is started
if err := i.runPluto(); err != nil {
klog.Fatalf("Error running Pluto: %s", err.Error())
}

i.plutoStarted = true
}

// We'll panic if endpointInfo is nil, this is intentional
endpoint := &endpointInfo.Endpoint

Expand Down Expand Up @@ -597,7 +603,7 @@ func (i *libreswan) runPluto() error {
}()

// Wait up to 5s for the control socket.
for i := 0; i < 5; i++ {
for i := 0; i < 250; i++ {
_, err := os.Stat("/run/pluto/pluto.ctl")
if err == nil {
break
Expand All @@ -608,7 +614,7 @@ func (i *libreswan) runPluto() error {
break
}

time.Sleep(1 * time.Second)
time.Sleep(20 * time.Millisecond)
}

if i.debug {
Expand Down

0 comments on commit f7e001a

Please sign in to comment.