package cluster

import (
	"compress/gzip"
	"crypto/tls"
	"encoding/binary"
	"errors"
	"fmt"
	"io"
	"net"
	"os"
	"sync"
	"time"

	"github.com/rqlite/rqlite/v9/auth"
	"github.com/rqlite/rqlite/v9/cluster/proto"
	command "github.com/rqlite/rqlite/v9/command/proto"
	"github.com/rqlite/rqlite/v9/internal/rtls"
	"github.com/rqlite/rqlite/v9/tcp"
	"github.com/rqlite/rqlite/v9/tcp/pool"
	pb "google.golang.org/protobuf/proto"
)

const (
	maxPoolCapacity       = 64
	defaultMaxRetries     = 0
	noRetries             = 0
	protoBufferLengthSize = 8
)

// CreateRaftDialer creates a dialer for connecting to other nodes' Raft service. If the cert and
// key arguments are not set, then the returned dialer will not use TLS. If they are set then
// the dialer will use TLS, reloading the certificate from disk as needed. The serverName argument
// is used to validate the server certificate. If Insecure is true, then the dialer will not
// validate the server certificate.
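//
// A minimal usage sketch (the file paths and server name here are hypothetical,
// for illustration only):
//
//	dialer, err := CreateRaftDialer("node.crt", "node.key", "ca.crt", "rqlite.example.com", false)
//	if err != nil {
//		log.Fatalf("failed to create Raft dialer: %s", err)
//	}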
func CreateRaftDialer(cert, key, caCert, serverName string, Insecure bool) (*tcp.Dialer, error) {
	var dialerTLSConfig *tls.Config
	var err error
	var cr *rtls.CertReloader
	if cert != "" || key != "" {
		cr, err = rtls.NewCertReloader(cert, key)
		if err != nil {
			return nil, fmt.Errorf("failed to create TLS config for Raft dialer: %s", err.Error())
		}
		dialerTLSConfig, err = rtls.CreateClientConfigWithFunc(cr.GetCertificate, caCert, serverName, Insecure)
		if err != nil {
			return nil, fmt.Errorf("failed to create TLS config for Raft dialer: %s", err.Error())
		}
	}
	return tcp.NewDialer(MuxRaftHeader, dialerTLSConfig), nil
}

// CredentialsFor returns a Credentials instance for the given username, or nil if
// the given CredentialsStore is nil, or the username is not found.
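//
// For example (credStore is an assumed, previously-loaded *auth.CredentialsStore):
//
//	creds := CredentialsFor(credStore, "bob") // nil if the store is nil or "bob" is unknown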
func CredentialsFor(credStr *auth.CredentialsStore, username string) *proto.Credentials {
	if credStr == nil {
		return nil
	}
	pw, ok := credStr.Password(username)
	if !ok {
		return nil
	}
	return &proto.Credentials{
		Username: username,
		Password: pw,
	}
}

// Client allows communicating with a remote node.
type Client struct {
	dialer  Dialer
	timeout time.Duration

	localMu       sync.RWMutex
	localNodeAddr string
	localServ     *Service
	localVersion  string

	poolMu sync.RWMutex
	pools  map[string]pool.Pool
}

// NewClient returns a client instance for talking to a remote node.
// Clients will retry certain commands if they fail, to allow for
// remote node restarts. Cluster management operations such as joining
// and removing nodes are not retried, to make it clear to the operator
// that the operation failed. In addition, higher-level code will
// usually retry these operations.
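//
// A minimal sketch of constructing a client (the dialer is assumed to come
// from CreateRaftDialer, and the 30-second timeout is an arbitrary choice):
//
//	client := NewClient(dialer, 30*time.Second)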
func NewClient(dl Dialer, t time.Duration) *Client {
	return &Client{
		dialer:  dl,
		timeout: t,
		pools:   make(map[string]pool.Pool),
	}
}

// SetLocal informs the client instance of the node address for the node
// using this client. Along with the Service instance it allows this
// client to serve requests for this node locally without the network hop.
func (c *Client) SetLocal(nodeAddr string, serv *Service) error {
	c.localMu.Lock()
	defer c.localMu.Unlock()
	c.localNodeAddr = nodeAddr
	c.localServ = serv
	return nil
}

// SetLocalVersion informs the client instance of the version of the software
// running on this node. This is used so the client can serve this information
// quickly.
func (c *Client) SetLocalVersion(version string) error {
	c.localMu.Lock()
	defer c.localMu.Unlock()
	c.localVersion = version
	return nil
}

// GetLocalVersion retrieves the version of the software running on
// this node.
func (c *Client) GetLocalVersion() string {
	c.localMu.RLock()
	defer c.localMu.RUnlock()
	return c.localVersion
}

// GetNodeMeta retrieves metadata for the node at nodeAddr.
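//
// A usage sketch (the address, retry count, and timeout are illustrative):
//
//	meta, err := client.GetNodeMeta("localhost:4002", 2, 5*time.Second)
//	if err == nil {
//		fmt.Println(meta.Url, meta.Version)
//	}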
func (c *Client) GetNodeMeta(nodeAddr string, retries int, timeout time.Duration) (*proto.NodeMeta, error) {
	c.localMu.RLock()
	defer c.localMu.RUnlock()
	if c.localNodeAddr == nodeAddr && c.localServ != nil {
		// Serve it locally!
		stats.Add(numGetNodeAPIRequestLocal, 1)
		return &proto.NodeMeta{
			Url:     c.localServ.GetNodeAPIURL(),
			Version: c.localVersion, // localMu is already held, so read directly.
		}, nil
	}

	command := &proto.Command{
		Type: proto.Command_COMMAND_TYPE_GET_NODE_META,
	}
	p, nr, err := c.retry(command, nodeAddr, timeout, retries)
	stats.Add(numGetNodeAPIRequestRetries, int64(nr))
	if err != nil {
		return nil, err
	}

	a := &proto.NodeMeta{}
	err = pb.Unmarshal(p, a)
	if err != nil {
		return nil, fmt.Errorf("protobuf unmarshal: %w", err)
	}
	if a.Version == "" {
		// Handle nodes running older code.
		a.Version = "unknown"
	}
	return a, nil
}

// GetCommitIndex retrieves the commit index for the node at nodeAddr
func (c *Client) GetCommitIndex(nodeAddr string, retries int, timeout time.Duration) (uint64, error) {
	command := &proto.Command{
		Type: proto.Command_COMMAND_TYPE_GET_NODE_META,
	}
	p, nr, err := c.retry(command, nodeAddr, timeout, retries)
	stats.Add(numGetNodeAPIRequestRetries, int64(nr))
	if err != nil {
		return 0, err
	}

	a := &proto.NodeMeta{}
	err = pb.Unmarshal(p, a)
	if err != nil {
		return 0, fmt.Errorf("protobuf unmarshal: %w", err)
	}
	return a.CommitIndex, nil
}

// Execute performs an Execute on a remote node. If creds is nil, then
// no credential information will be included in the Execute request to the
// remote node.
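//
// A sketch of a single-statement write (the SQL, address, and timeout are
// illustrative, and the nested Request/Statements/Sql fields are assumed from
// rqlite's command protobuf):
//
//	er := &command.ExecuteRequest{
//		Request: &command.Request{
//			Statements: []*command.Statement{
//				{Sql: "INSERT INTO foo(name) VALUES('fiona')"},
//			},
//		},
//	}
//	results, raftIndex, err := client.Execute(er, "localhost:4002", nil, 5*time.Second, 0)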
func (c *Client) Execute(er *command.ExecuteRequest, nodeAddr string, creds *proto.Credentials, timeout time.Duration, retries int) ([]*command.ExecuteQueryResponse, uint64, error) {
	command := &proto.Command{
		Type: proto.Command_COMMAND_TYPE_EXECUTE,
		Request: &proto.Command_ExecuteRequest{
			ExecuteRequest: er,
		},
		Credentials: creds,
	}
	p, nr, err := c.retry(command, nodeAddr, timeout, retries)
	stats.Add(numClientExecuteRetries, int64(nr))
	if err != nil {
		return nil, 0, err
	}

	a := &proto.CommandExecuteResponse{}
	err = pb.Unmarshal(p, a)
	if err != nil {
		return nil, 0, err
	}
	if a.Error != "" {
		return nil, 0, errors.New(a.Error)
	}
	return a.Response, a.RaftIndex, nil
}

// Query performs a Query on a remote node. If creds is nil, then
// no credential information will be included in the Query request to the
// remote node.
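//
// A sketch of a single-statement read, mirroring the Execute example above
// (values are illustrative):
//
//	qr := &command.QueryRequest{
//		Request: &command.Request{
//			Statements: []*command.Statement{
//				{Sql: "SELECT * FROM foo"},
//			},
//		},
//	}
//	rows, raftIndex, err := client.Query(qr, "localhost:4002", nil, 5*time.Second)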
func (c *Client) Query(qr *command.QueryRequest, nodeAddr string, creds *proto.Credentials, timeout time.Duration) ([]*command.QueryRows, uint64, error) {
	command := &proto.Command{
		Type: proto.Command_COMMAND_TYPE_QUERY,
		Request: &proto.Command_QueryRequest{
			QueryRequest: qr,
		},
		Credentials: creds,
	}
	p, nr, err := c.retry(command, nodeAddr, timeout, defaultMaxRetries)
	stats.Add(numClientQueryRetries, int64(nr))
	if err != nil {
		return nil, 0, err
	}

	a := &proto.CommandQueryResponse{}
	err = pb.Unmarshal(p, a)
	if err != nil {
		return nil, 0, err
	}
	if a.Error != "" {
		return nil, 0, errors.New(a.Error)
	}
	return a.Rows, a.RaftIndex, nil
}

// Request performs an ExecuteQuery on a remote node. If creds is nil, then
// no credential information will be included in the ExecuteQuery request to the
// remote node.
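//
// A sketch, following the same request-construction pattern as Execute and
// Query (eqr is an assumed *command.ExecuteQueryRequest; other values are
// illustrative):
//
//	resp, numRW, raftIndex, err := client.Request(eqr, "localhost:4002", nil, 5*time.Second, 0)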
func (c *Client) Request(r *command.ExecuteQueryRequest, nodeAddr string, creds *proto.Credentials, timeout time.Duration, retries int) ([]*command.ExecuteQueryResponse, uint64, uint64, error) {
	command := &proto.Command{
		Type: proto.Command_COMMAND_TYPE_REQUEST,
		Request: &proto.Command_ExecuteQueryRequest{
			ExecuteQueryRequest: r,
		},
		Credentials: creds,
	}
	p, nr, err := c.retry(command, nodeAddr, timeout, retries)
	stats.Add(numClientRequestRetries, int64(nr))
	if err != nil {
		return nil, 0, 0, err
	}

	a := &proto.CommandRequestResponse{}
	err = pb.Unmarshal(p, a)
	if err != nil {
		return nil, 0, 0, err
	}
	if a.Error != "" {
		return nil, 0, 0, errors.New(a.Error)
	}
	return a.Response, a.NumRW, a.RaftIndex, nil
}

// Backup retrieves a backup from a remote node and writes to the io.Writer.
// If creds is nil, then no credential information will be included in the
// Backup request to the remote node.
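//
// A sketch that streams a backup to a local file (the path and address are
// illustrative; only the Compress field of BackupRequest is shown):
//
//	f, err := os.Create("backup.db")
//	if err != nil {
//		log.Fatal(err)
//	}
//	defer f.Close()
//	br := &command.BackupRequest{Compress: false}
//	err = client.Backup(br, "localhost:4002", nil, 30*time.Second, f)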
func (c *Client) Backup(br *command.BackupRequest, nodeAddr string, creds *proto.Credentials, timeout time.Duration, w io.Writer) error {
	conn, err := c.dial(nodeAddr)
	if err != nil {
		return err
	}
	defer conn.Close()

	command := &proto.Command{
		Type: proto.Command_COMMAND_TYPE_BACKUP_STREAM,
		Request: &proto.Command_BackupRequest{
			BackupRequest: br,
		},
		Credentials: creds,
	}
	if err := writeCommand(conn, command, timeout); err != nil {
		handleConnError(conn)
		return err
	}
	p, err := readResponse(conn, timeout)
	if err != nil {
		handleConnError(conn)
		return err
	}

	a := &proto.CommandBackupResponse{}
	err = pb.Unmarshal(p, a)
	if err != nil {
		return err
	}
	if a.Error != "" {
		return errors.New(a.Error)
	}

	// The backup stream is unconditionally compressed, so depending on whether
	// the user requested compression, we may need to decompress the response.
	var rc io.ReadCloser
	rc = conn
	if !br.Compress {
		gzr, err := gzip.NewReader(conn)
		if err != nil {
			return err
		}
		gzr.Multistream(false)
		rc = gzr
		defer rc.Close()
	}
	_, err = io.Copy(w, rc)
	return err
}

// Load loads a SQLite file into the database. If creds is nil, then no
// credential information will be included in the Load request to the remote node.
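//
// A sketch (the address is illustrative, and Data is assumed to be the
// raw-bytes field of LoadRequest holding the SQLite file contents):
//
//	lr := &command.LoadRequest{Data: sqliteFileBytes}
//	err := client.Load(lr, "localhost:4002", nil, 30*time.Second, 0)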
func (c *Client) Load(lr *command.LoadRequest, nodeAddr string, creds *proto.Credentials, timeout time.Duration, retries int) error {
	command := &proto.Command{
		Type: proto.Command_COMMAND_TYPE_LOAD,
		Request: &proto.Command_LoadRequest{
			LoadRequest: lr,
		},
		Credentials: creds,
	}
	p, nr, err := c.retry(command, nodeAddr, timeout, retries)
	stats.Add(numClientLoadRetries, int64(nr))
	if err != nil {
		return err
	}

	a := &proto.CommandLoadResponse{}
	err = pb.Unmarshal(p, a)
	if err != nil {
		return err
	}
	if a.Error != "" {
		return errors.New(a.Error)
	}
	return nil
}

// RemoveNode removes a node from the cluster. If creds is nil, then no
// credential information will be included in the RemoveNode request to the
// remote node.
func (c *Client) RemoveNode(rn *command.RemoveNodeRequest, nodeAddr string, creds *proto.Credentials, timeout time.Duration) error {
	conn, err := c.dial(nodeAddr)
	if err != nil {
		return err
	}
	defer conn.Close()

	// Create the request.
	command := &proto.Command{
		Type: proto.Command_COMMAND_TYPE_REMOVE_NODE,
		Request: &proto.Command_RemoveNodeRequest{
			RemoveNodeRequest: rn,
		},
		Credentials: creds,
	}
	if err := writeCommand(conn, command, timeout); err != nil {
		handleConnError(conn)
		return err
	}
	p, err := readResponse(conn, timeout)
	if err != nil {
		handleConnError(conn)
		return err
	}

	a := &proto.CommandRemoveNodeResponse{}
	err = pb.Unmarshal(p, a)
	if err != nil {
		return err
	}
	if a.Error != "" {
		return errors.New(a.Error)
	}
	return nil
}

// Stepdown triggers leader stepdown on a remote node. If creds is nil, then no
// credential information will be included in the Stepdown request to the
// remote node.
func (c *Client) Stepdown(sr *command.StepdownRequest, nodeAddr string, creds *proto.Credentials, timeout time.Duration) error {
	conn, err := c.dial(nodeAddr)
	if err != nil {
		return err
	}
	defer conn.Close()

	// Create the request.
	command := &proto.Command{
		Type: proto.Command_COMMAND_TYPE_STEPDOWN,
		Request: &proto.Command_StepdownRequest{
			StepdownRequest: sr,
		},
		Credentials: creds,
	}
	if err := writeCommand(conn, command, timeout); err != nil {
		handleConnError(conn)
		return err
	}
	p, err := readResponse(conn, timeout)
	if err != nil {
		handleConnError(conn)
		return err
	}

	a := &proto.CommandStepdownResponse{}
	err = pb.Unmarshal(p, a)
	if err != nil {
		return err
	}
	if a.Error != "" {
		return errors.New(a.Error)
	}
	return nil
}

// Notify notifies a remote node that this node is ready to bootstrap.
// If creds is nil, then no credential information will be included in
// the Notify request to the remote node.
func (c *Client) Notify(nr *command.NotifyRequest, nodeAddr string, creds *proto.Credentials, timeout time.Duration) error {
	conn, err := c.dial(nodeAddr)
	if err != nil {
		return err
	}
	defer conn.Close()

	// Create the request.
	command := &proto.Command{
		Type: proto.Command_COMMAND_TYPE_NOTIFY,
		Request: &proto.Command_NotifyRequest{
			NotifyRequest: nr,
		},
		Credentials: creds,
	}
	if err := writeCommand(conn, command, timeout); err != nil {
		handleConnError(conn)
		return err
	}
	p, err := readResponse(conn, timeout)
	if err != nil {
		handleConnError(conn)
		return err
	}

	a := &proto.CommandNotifyResponse{}
	err = pb.Unmarshal(p, a)
	if err != nil {
		return err
	}
	if a.Error != "" {
		return errors.New(a.Error)
	}
	return nil
}

// Join joins this node to a cluster at the remote address nodeAddr.
// If creds is nil, then no credential information will be included in
// the Join request to the remote node.
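//
// Join transparently follows leader redirects: if the target node reports it
// is not the leader, the request is retried against the reported leader. A
// sketch (field values are illustrative, assuming the usual rqlite JoinRequest
// fields):
//
//	jr := &command.JoinRequest{Id: "node3", Address: "localhost:4004", Voter: true}
//	err := client.Join(jr, "localhost:4002", nil, 10*time.Second)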
func (c *Client) Join(jr *command.JoinRequest, nodeAddr string, creds *proto.Credentials, timeout time.Duration) error {
	for {
		conn, err := c.dial(nodeAddr)
		if err != nil {
			return err
		}
		defer conn.Close()

		// Create the request.
		command := &proto.Command{
			Type: proto.Command_COMMAND_TYPE_JOIN,
			Request: &proto.Command_JoinRequest{
				JoinRequest: jr,
			},
			Credentials: creds,
		}
		if err := writeCommand(conn, command, timeout); err != nil {
			handleConnError(conn)
			return err
		}
		p, err := readResponse(conn, timeout)
		if err != nil {
			handleConnError(conn)
			return err
		}

		a := &proto.CommandJoinResponse{}
		err = pb.Unmarshal(p, a)
		if err != nil {
			return err
		}
		if a.Error != "" {
			if a.Error == "not leader" {
				if a.Leader == "" {
					return errors.New("no leader")
				}
				nodeAddr = a.Leader
				continue
			}
			return errors.New(a.Error)
		}
		return nil
	}
}

// BroadcastHWM broadcasts a high-water mark update to all specified nodes in
// parallel, returning each node's response keyed by node address.
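//
// A sketch broadcasting high-water mark 1234 to two nodes (addresses and
// timings are illustrative):
//
//	resps, _ := client.BroadcastHWM(1234, 1, 5*time.Second, "node1:4002", "node2:4002")
//	for addr, resp := range resps {
//		fmt.Println(addr, resp.Error)
//	}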
func (c *Client) BroadcastHWM(hwm uint64, retries int, timeout time.Duration, nodeAddr ...string) (map[string]*proto.HighwaterMarkUpdateResponse, error) {
	if len(nodeAddr) == 0 {
		return map[string]*proto.HighwaterMarkUpdateResponse{}, nil
	}

	// Get local node address for the broadcast request.
	c.localMu.RLock()
	localAddr := c.localNodeAddr
	c.localMu.RUnlock()

	// Create the broadcast request.
	br := &proto.HighwaterMarkUpdateRequest{
		NodeId:        localAddr,
		HighwaterMark: hwm,
	}

	// Channel to collect results.
	type result struct {
		resp *proto.HighwaterMarkUpdateResponse
		addr string
		err  error
	}
	resultChan := make(chan result, len(nodeAddr))

	// Launch goroutines for parallel requests.
	for _, addr := range nodeAddr {
		go func(nodeAddress string) {
			// Create the command.
			command := &proto.Command{
				Type: proto.Command_COMMAND_TYPE_HIGHWATER_MARK_UPDATE,
				Request: &proto.Command_HighwaterMarkUpdateRequest{
					HighwaterMarkUpdateRequest: br,
				},
			}

			// Attempt with retries.
			var lastErr error
			for attempt := 0; attempt <= retries; attempt++ {
				conn, err := c.dial(nodeAddress)
				if err != nil {
					lastErr = err
					continue
				}

				// Write command. Mark the connection unusable before closing it,
				// so a failed connection is not returned to the pool.
				if err := writeCommand(conn, command, timeout); err != nil {
					handleConnError(conn)
					conn.Close()
					lastErr = err
					continue
				}

				// Read response.
				p, err := readResponse(conn, timeout)
				if err != nil {
					handleConnError(conn)
					conn.Close()
					lastErr = err
					continue
				}
				conn.Close()

				// Parse response.
				resp := &proto.HighwaterMarkUpdateResponse{}
				if err := pb.Unmarshal(p, resp); err != nil {
					lastErr = err
					continue
				}

				// Success.
				resultChan <- result{resp: resp, addr: nodeAddress, err: nil}
				return
			}
			// All retries failed.
			resultChan <- result{resp: nil, addr: nodeAddress, err: lastErr}
		}(addr)
	}

	// Collect results with timeout.
	responses := make(map[string]*proto.HighwaterMarkUpdateResponse)
	collected := 0
	timeoutChan := time.After(timeout)
	for collected < len(nodeAddr) {
		select {
		case res := <-resultChan:
			if res.err != nil {
				responses[res.addr] = &proto.HighwaterMarkUpdateResponse{Error: res.err.Error()}
			} else {
				responses[res.addr] = res.resp
			}
			collected++
		case <-timeoutChan:
			// Timeout reached; fill remaining responses with timeout errors.
			for _, addr := range nodeAddr {
				if _, exists := responses[addr]; !exists {
					responses[addr] = &proto.HighwaterMarkUpdateResponse{Error: "timeout"}
				}
			}
			collected = len(nodeAddr)
		}
	}
	return responses, nil
}

// Stats returns stats on the Client instance
func (c *Client) Stats() (map[string]any, error) {
	c.poolMu.RLock()
	defer c.poolMu.RUnlock()

	stats := map[string]any{
		"timeout":         c.timeout.String(),
		"local_node_addr": c.localNodeAddr,
	}

	if len(c.pools) == 0 {
		return stats, nil
	}
	poolStats := make(map[string]any, len(c.pools))
	for k, v := range c.pools {
		s, err := v.Stats()
		if err != nil {
			return nil, err
		}
		poolStats[k] = s
	}
	stats["conn_pool_stats"] = poolStats
	return stats, nil
}

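// dial returns a connection to the node at nodeAddr, drawing from (and, on
// first use of an address, creating) a per-node connection pool.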
func (c *Client) dial(nodeAddr string) (net.Conn, error) {
	var pl pool.Pool
	var ok bool

	c.poolMu.RLock()
	pl, ok = c.pools[nodeAddr]
	c.poolMu.RUnlock()

	// Do we need a new pool for the given address?
	if !ok {
		if err := func() error {
			c.poolMu.Lock()
			defer c.poolMu.Unlock()
			pl, ok = c.pools[nodeAddr]
			if ok {
				return nil // Pool was inserted just after we checked.
			}

			// New pool is needed for given address.
			factory := func() (net.Conn, error) { return c.dialer.Dial(nodeAddr, c.timeout) }
			p, err := pool.NewChannelPool(maxPoolCapacity, factory)
			if err != nil {
				return err
			}
			c.pools[nodeAddr] = p
			pl = p
			return nil
		}(); err != nil {
			return nil, err
		}
	}

	// Got pool, now get a connection.
	conn, err := pl.Get()
	if err != nil {
		return nil, fmt.Errorf("pool get: %w", err)
	}
	return conn, nil
}

// retry sends a command to a remote node, retrying on failure up to maxRetries
// times. Retrying churns through connections in the pool, which matters because
// the remote node may have restarted, leaving the pool's existing connections stale.
func (c *Client) retry(command *proto.Command, nodeAddr string, timeout time.Duration, maxRetries int) ([]byte, int, error) {
	var p []byte
	var errOuter error
	var nRetries int
	for {
		p, errOuter = func() ([]byte, error) {
			conn, errInner := c.dial(nodeAddr)
			if errInner != nil {
				return nil, errInner
			}
			defer conn.Close()

			if errInner = writeCommand(conn, command, timeout); errInner != nil {
				handleConnError(conn)
				return nil, errInner
			}

			b, errInner := readResponse(conn, timeout)
			if errInner != nil {
				handleConnError(conn)
				return nil, errInner
			}
			return b, nil
		}()
		if errOuter == nil {
			break
		}
		nRetries++
		stats.Add(numClientRetries, 1)
		if nRetries > maxRetries {
			return nil, nRetries, errOuter
		}
	}
	return p, nRetries, nil
}

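// writeCommand marshals c and writes it to conn using the cluster service's
// framing: an 8-byte little-endian length prefix followed by the marshaled
// protobuf. For example, a 300-byte (0x12C) Command goes on the wire as:
//
//	[2C 01 00 00 00 00 00 00][300 bytes of protobuf]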
func writeCommand(conn net.Conn, c *proto.Command, timeout time.Duration) error {
	p, err := pb.Marshal(c)
	if err != nil {
		return fmt.Errorf("command marshal: %w", err)
	}

	// Write length of Protobuf.
	if err := conn.SetDeadline(time.Now().Add(timeout)); err != nil {
		return err
	}
	b := make([]byte, protoBufferLengthSize)
	binary.LittleEndian.PutUint64(b[0:], uint64(len(p)))
	_, err = conn.Write(b)
	if err != nil {
		if errors.Is(err, os.ErrDeadlineExceeded) {
			stats.Add(numClientWriteTimeouts, 1)
		}
		return fmt.Errorf("write length: %w", err)
	}

	// Write actual protobuf.
	if err := conn.SetDeadline(time.Now().Add(timeout)); err != nil {
		return err
	}
	_, err = conn.Write(p)
	if err != nil {
		if errors.Is(err, os.ErrDeadlineExceeded) {
			stats.Add(numClientWriteTimeouts, 1)
		}
		return fmt.Errorf("write protobuf bytes: %w", err)
	}
	return nil
}

func readResponse(conn net.Conn, timeout time.Duration) (buf []byte, retErr error) {
	defer func() {
		// Connecting to an open port, but not a rqlite Raft API, may cause a panic
		// when the system tries to read the response. This is a workaround.
		if r := recover(); r != nil {
			retErr = fmt.Errorf("panic reading response from node: %v", r)
		}
	}()

	// Read length of incoming response.
	if err := conn.SetDeadline(time.Now().Add(timeout)); err != nil {
		return nil, err
	}
	b := make([]byte, protoBufferLengthSize)
	_, err := io.ReadFull(conn, b)
	if err != nil {
		if errors.Is(err, os.ErrDeadlineExceeded) {
			stats.Add(numClientReadTimeouts, 1)
		}
		return nil, fmt.Errorf("read protobuf length: %w", err)
	}
	sz := binary.LittleEndian.Uint64(b[0:])

	// Read in the actual response.
	p := make([]byte, sz)
	if err := conn.SetDeadline(time.Now().Add(timeout)); err != nil {
		return nil, err
	}
	_, err = io.ReadFull(conn, p)
	if err != nil {
		if errors.Is(err, os.ErrDeadlineExceeded) {
			stats.Add(numClientReadTimeouts, 1)
		}
		return nil, fmt.Errorf("read protobuf bytes: %w", err)
	}
	return p, nil
}

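// handleConnError marks a pooled connection as unusable, ensuring it is
// discarded on Close rather than returned to the pool.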
func handleConnError(conn net.Conn) {
	if pc, ok := conn.(*pool.Conn); ok {
		pc.MarkUnusable()
	}
}