https://github.com/cilium/cilium
Raw File
Tip revision: 9ba05044cd52d3ad38a15dcc55cc91ce79638f83 authored by Ian Vernon on 11 July 2019, 19:56:13 UTC
Prepare for v1.5.5
Tip revision: 9ba0504
common.h
/*
 *  Copyright (C) 2016-2018 Authors of Cilium
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */
#ifndef __LIB_COMMON_H_
#define __LIB_COMMON_H_

#include <bpf_features.h>
#include <bpf/api.h>
#include <linux/if_ether.h>
#include <linux/ipv6.h>
#include <linux/in.h>
#include <stdint.h>
#include <stdbool.h>

// FIXME: GH-3239 LRU logic is not handling timeouts gracefully enough
// #ifndef HAVE_LRU_MAP_TYPE
// #define NEEDS_TIMEOUT 1
// #endif
#define NEEDS_TIMEOUT 1

#ifndef EVENT_SOURCE
#define EVENT_SOURCE 0
#endif

#define PORT_UDP_VXLAN 4789
#define PORT_UDP_GENEVE 6081
#define PORT_UDP_VXLAN_LINUX 8472

#ifdef PREALLOCATE_MAPS
#define CONDITIONAL_PREALLOC 0
#else
#define CONDITIONAL_PREALLOC BPF_F_NO_PREALLOC
#endif

#define __inline__ __attribute__((always_inline))
#ifndef __always_inline
#define __always_inline inline __inline__
#endif

#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))

/* These are shared with test/bpf/check-complexity.sh, when modifying any of
 * the below, that script should also be updated. */
#define CILIUM_CALL_DROP_NOTIFY			1
#define CILIUM_CALL_ERROR_NOTIFY		2
#define CILIUM_CALL_SEND_ICMP6_ECHO_REPLY	3
#define CILIUM_CALL_HANDLE_ICMP6_NS		4
#define CILIUM_CALL_SEND_ICMP6_TIME_EXCEEDED	5
#define CILIUM_CALL_ARP				6
#define CILIUM_CALL_IPV4_FROM_LXC		7
#define CILIUM_CALL_NAT64			8
#define CILIUM_CALL_NAT46			9
#define CILIUM_CALL_IPV6_FROM_LXC		10
#define CILIUM_CALL_IPV4_TO_LXC			11
#define CILIUM_CALL_IPV6_TO_LXC			12
#define CILIUM_CALL_SIZE			13

typedef __u64 mac_t;

union v6addr {
        struct {
                __u32 p1;
                __u32 p2;
                __u32 p3;
                __u32 p4;
        };
        __u8 addr[16];
};

static inline bool validate_ethertype(struct __sk_buff *skb, __u16 *proto)
{
	void *data = (void *) (long) skb->data;
	void *data_end = (void *) (long) skb->data_end;

	if (data + ETH_HLEN > data_end)
		return false;

	struct ethhdr *eth = data;
	*proto = eth->h_proto;

	if (bpf_ntohs(*proto) < ETH_P_802_3_MIN)
		return false; // non-Ethernet II unsupported

	return true;
}

static inline bool __revalidate_data(struct __sk_buff *skb, void **data_,
				     void **data_end_, void **l3,
				     size_t l3_len)
{
	void *data = (void *) (long) skb->data;
	void *data_end = (void *) (long) skb->data_end;

	if (data + ETH_HLEN + l3_len > data_end)
		return false;

	*data_ = data;
	*data_end_ = data_end;
	*l3 = data + ETH_HLEN;
	return true;
}

/* revalidate_data() initializes the provided pointers from the skb.
 * Returns true if 'skb' is long enough for an IP header of the provided type,
 * false otherwise. */
#define revalidate_data(skb, data, data_end, ip)	\
	__revalidate_data(skb, data, data_end, (void **)ip, sizeof(**ip))

/* Macros for working with L3 cilium defined IPV6 addresses */
#define BPF_V6(dst, ...)	BPF_V6_1(dst, fetch_ipv6(__VA_ARGS__))
#define BPF_V6_1(dst, ...)	BPF_V6_4(dst, __VA_ARGS__)
#define BPF_V6_4(dst, a1, a2, a3, a4)		\
	({					\
		dst.p1 = a1;			\
		dst.p2 = a2;			\
		dst.p3 = a3;			\
		dst.p4 = a4;			\
	})

/* Macros for building proxy port/nexthdr maps */
#define EVAL0(...) __VA_ARGS__
#define EVAL1(...) EVAL0 (EVAL0 (EVAL0 (__VA_ARGS__)))
#define EVAL2(...) EVAL1 (EVAL1 (EVAL1 (__VA_ARGS__)))
#define EVAL(...)  EVAL2 (EVAL2 (EVAL2 (__VA_ARGS__)))

#define BPF_L4_MAP_OUT
#define BPF_L4_MAP_END(...)
#define BPF_L4_MAP_GET_END() 0, BPF_L4_MAP_END
#define BPF_L4_MAP_NEXT0(dst, port, hdr, index, map, next, ...) next BPF_L4_MAP_OUT
#define BPF_L4_MAP_NEXT1(dst, port, hdr, index, map, next) BPF_L4_MAP_NEXT0(dst, port, hdr, index, map, next, 0)
#define BPF_L4_MAP_NEXT(dst, port, hdr, index, map, next) BPF_L4_MAP_NEXT1 (dst, port, hdr, index, BPF_L4_MAP_GET_END map, next)

#define F(dst, port, hdr, index, map0, map1, map2)				\
	({									\
		dst = (dst > -1 ? dst : ((map0 && map0 == port) ?		\
			((map2 && map2 == hdr) ? map1 : DROP_POLICY_L4) :	\
			DROP_POLICY_L4));					\
	});

#define BPF_L4_MAP0(dst, port, hdr, index, map0, map1, map2, next, ...) \
	F(dst, port, hdr, index, map0, map1, map2) BPF_L4_MAP_NEXT(dst, port, hdr, index, next, BPF_L4_MAP1)(dst, port, hdr, next, __VA_ARGS__)
#define BPF_L4_MAP1(dst, port, hdr, index, map0, map1, map2, next, ...) \
	F(dst, port, hdr, index, map0, map1, map2) BPF_L4_MAP_NEXT(dst, port, hdr, index, next, BPF_L4_MAP0)(dst, port, hdr, next, __VA_ARGS__)

#define BPF_L4_MAP(dst, port, hdr, ...)				\
	({							\
		EVAL (BPF_L4_MAP1(dst, port, hdr, __VA_ARGS__))	\
	})

/* Examples to illustrate how to use BPF_L4_MAP and BPF_V6_16
 *
 * BPF_L4_MAP(my_map, 0, 80, 8080, 0, 1, 80, 8080, 0, (), 0)
 * BPF_V6_16(my_dst, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
 */

#define ENDPOINT_KEY_IPV4 1
#define ENDPOINT_KEY_IPV6 2

/* Structure representing an IPv4 or IPv6 address, being used for:
 *  - key as endpoints map
 *  - key for tunnel endpoint map
 *  - value for tunnel endpoint map
 */
struct endpoint_key {
	union {
		struct {
			__u32		ip4;
			__u32		pad1;
			__u32		pad2;
			__u32		pad3;
		};
		union v6addr	ip6;
	};
	__u8 family;
	__u8 key;
	__u16 pad5;
} __attribute__((packed));

#define ENDPOINT_F_HOST		1 /* Special endpoint representing local host */

/* Value of endpoint map */
struct endpoint_info {
	__u32		ifindex;
	__u16		unused; /* used to be sec_label, no longer used */
	__u16           lxc_id;
	__u32		flags;
	mac_t		mac;
	mac_t		node_mac;
	__u32		pad[4];
};

struct remote_endpoint_info {
	__u32		sec_label;
	__u32		tunnel_endpoint;
	__u8		key;
};

struct policy_key {
	__u32		sec_label;
	__u16		dport;
	__u8		protocol;
	__u8		egress:1,
			pad:7;
};

struct policy_entry {
	__be16		proxy_port;
	__u16		pad[3];
	__u64		packets;
	__u64		bytes;
};

struct metrics_key {
    __u8      reason;     //0: forwarded, >0 dropped
    __u8      dir:2,      //1: ingress 2: egress
              pad:6;
    __u16     reserved[3]; // reserved for future extension
};


struct metrics_value {
     __u64	count;
     __u64	bytes;
};


enum {
	CILIUM_NOTIFY_UNSPEC,
	CILIUM_NOTIFY_DROP,
	CILIUM_NOTIFY_DBG_MSG,
	CILIUM_NOTIFY_DBG_CAPTURE,
	CILIUM_NOTIFY_TRACE,
};

#define NOTIFY_COMMON_HDR \
	__u8		type; \
	__u8		subtype; \
	__u16		source; \
	__u32		hash;

#ifndef TRACE_PAYLOAD_LEN
#define TRACE_PAYLOAD_LEN 128ULL
#endif

#ifndef BPF_F_PSEUDO_HDR
# define BPF_F_PSEUDO_HDR                (1ULL << 4)
#endif

#define IS_ERR(x) (unlikely((x < 0) || (x == TC_ACT_SHOT)))

/* Cilium IPSec code to indicate packet needs to be handled
 * by IPSec stack. Maps to TC_ACT_OK.
 */
#define IPSEC_ENDPOINT TC_ACT_OK

/* Cilium error codes, must NOT overlap with TC return codes.
 * These also serve as drop reasons for metrics,
 * where reason > 0 corresponds to -(DROP_*)
 */
#define DROP_INVALID_SMAC	-130
#define DROP_INVALID_DMAC	-131
#define DROP_INVALID_SIP	-132
#define DROP_POLICY		-133
#define DROP_INVALID		-134
#define DROP_CT_INVALID_HDR	-135
#define DROP_CT_MISSING_ACK	-136
#define DROP_CT_UNKNOWN_PROTO	-137
#define DROP_CT_CANT_CREATE	-138 /* unused */
#define DROP_UNKNOWN_L3		-139
#define DROP_MISSED_TAIL_CALL	-140
#define DROP_WRITE_ERROR	-141
#define DROP_UNKNOWN_L4		-142
#define DROP_UNKNOWN_ICMP_CODE	-143
#define DROP_UNKNOWN_ICMP_TYPE	-144
#define DROP_UNKNOWN_ICMP6_CODE	-145
#define DROP_UNKNOWN_ICMP6_TYPE	-146
#define DROP_NO_TUNNEL_KEY	-147
#define DROP_NO_TUNNEL_OPT	-148 /* unused */
#define DROP_INVALID_GENEVE	-149 /* unused */
#define DROP_UNKNOWN_TARGET	-150
#define DROP_NON_LOCAL		-151
#define DROP_NO_LXC		-152
#define DROP_CSUM_L3		-153
#define DROP_CSUM_L4		-154
#define DROP_CT_CREATE_FAILED	-155
#define DROP_INVALID_EXTHDR	-156
#define DROP_FRAG_NOSUPPORT	-157
#define DROP_NO_SERVICE		-158
#define DROP_POLICY_L4		-159
#define DROP_NO_TUNNEL_ENDPOINT -160
#define DROP_PROXYMAP_CREATE_FAILED	-161
#define DROP_POLICY_CIDR		-162
#define DROP_UNKNOWN_CT			-163
#define DROP_HOST_UNREACHABLE		-164
#define DROP_NO_CONFIG		-165
#define DROP_UNSUPPORTED_L2		-166
#define DROP_NAT_NO_MAPPING	-167
#define DROP_NAT_UNSUPP_PROTO	-168
#define DROP_ENCAP_PROHIBITED	-170
#define DROP_INVALID_IDENTITY	-171
#define DROP_UNKNOWN_SENDER	-172

/* Cilium metrics reason for forwarding packet.
 * If reason > 0 then this is a drop reason and value corresponds to -(DROP_*)
 */
#define REASON_FORWARDED  0

/* Cilium metrics direction for dropping/forwarding packet */
#define METRIC_INGRESS  1
#define METRIC_EGRESS   2

/* Magic skb->mark identifies packets origination and encryption status.
 *
 * The upper 16 bits plus lower 8 bits (e.g. mask 0XFFFF00FF) contain the
 * packets security identity. The lower/upper halves are swapped to recover
 * the identity.
 *
 * The 4 bits at 0X0F00 provide
 *  - the magic marker values which indicate whether the packet is coming from
 *    an ingress or egress proxy, a local process and its current encryption
 *    status.
 *
 * The 4 bits at 0xF000 provide
 *  - the key index to use for encryption when multiple keys are in-flight.
 *    In the IPsec case this becomes the SPI on the wire.
 */
#define MARK_MAGIC_HOST_MASK		0x0F00
#define MARK_MAGIC_PROXY_INGRESS	0x0A00
#define MARK_MAGIC_PROXY_EGRESS		0x0B00
#define MARK_MAGIC_HOST			0x0C00
#define MARK_MAGIC_DECRYPT		0x0D00
#define MARK_MAGIC_ENCRYPT		0x0E00

#define MARK_MAGIC_KEY_ID		0xF000
#define MARK_MAGIC_KEY_MASK		0xFF00

/**
 * get_identity - returns source identity from the mark field
 */
static inline int __inline__ get_identity(struct __sk_buff *skb)
{
	return ((skb->mark & 0xFF) << 16) | skb->mark >> 16;
}

/**
 * set_identity - pushes 24 bit identity into skb mark value.
 */
static inline void __inline__ set_identity(struct __sk_buff *skb, __u32 identity)
{
	skb->mark = skb->mark & MARK_MAGIC_KEY_MASK;
	skb->mark |= ((identity & 0xFFFF) << 16) | ((identity & 0xFF0000) >> 16);
}

/* We cap key index at 4 bits because mark value is used to map skb to key */
#define MAX_KEY_INDEX 15

/* encrypt_key is the index into the encrypt map */
struct encrypt_key {
	__u32 ctx;
} __attribute__((packed));

/* encrypt_config is the current encryption context on the node */
struct encrypt_config {
	__u8 encrypt_key;
} __attribute__((packed));

/**
 * or_encrypt_key - mask and shift key into encryption format
 */
static inline __u32 __inline__ or_encrypt_key(__u8 key)
{
	return (((__u32)key & 0x0F) << 12) | MARK_MAGIC_ENCRYPT;
}

/**
 * set_encrypt_key - pushes 8 bit key and encryption marker into skb mark value.
 */
static inline void __inline__ set_encrypt_key(struct __sk_buff *skb, __u8 key)
{
	skb->mark = or_encrypt_key(key);
}

/*
 * skb->tc_index uses
 *
 * cilium_host @egress
 *   bpf_host -> bpf_lxc
 */
#define TC_INDEX_F_SKIP_PROXY		1

/* skb->cb[] usage: */
enum {
	CB_SRC_LABEL,
	CB_IFINDEX,
	CB_POLICY,
	CB_NAT46_STATE,
	CB_CT_STATE,
};

/* State values for NAT46 */
enum {
	NAT46_CLEAR,
	NAT64,
	NAT46,
};

#define TUPLE_F_OUT		0	/* Outgoing flow */
#define TUPLE_F_IN		1	/* Incoming flow */
#define TUPLE_F_RELATED		2	/* Flow represents related packets */
#define TUPLE_F_SERVICE		4	/* Flow represents service/slave map */

#define CT_EGRESS 0
#define CT_INGRESS 1
#define CT_SERVICE 2

enum {
	CT_NEW,
	CT_ESTABLISHED,
	CT_REPLY,
	CT_RELATED,
};

struct ipv6_ct_tuple {
	union v6addr	daddr;
	union v6addr	saddr;
	/* The order of dport+sport must not be changed */
	__be16		dport;
	__be16		sport;
	__u8		nexthdr;
	__u8		flags;
} __attribute__((packed));

struct ipv4_ct_tuple {
	__be32		daddr;
	__be32		saddr;
	/* The order of dport+sport must not be changed */
	__be16		dport;
	__be16		sport;
	__u8		nexthdr;
	__u8		flags;
} __attribute__((packed));

struct ct_entry {
	__u64 rx_packets;
	__u64 rx_bytes;
	__u64 tx_packets;
	__u64 tx_bytes;
	__u32 lifetime;
	__u16 rx_closing:1,
	      tx_closing:1,
	      nat46:1,
	      lb_loopback:1,
	      seen_non_syn:1,
	      reserve:11;
	__u16 rev_nat_index;
	__u16 slave;

	/* *x_flags_seen represents the OR of all TCP flags seen for the
	 * transmit/receive direction of this entry. */
	__u8  tx_flags_seen;
	__u8  rx_flags_seen;

	__u32 src_sec_id;

	/* last_*x_report is a timestamp of the last time a monitor
	 * notification was sent for the transmit/receive direction. */
	__u32 last_tx_report;
	__u32 last_rx_report;
};

struct lb6_key {
        union v6addr address;
        __be16 dport;		/* L4 port filter, if unset, all ports apply */
	__u16 slave;		/* Backend iterator, 0 indicates the master service */
} __attribute__((packed));

struct lb6_service {
	union v6addr target;
	__be16 port;
	__u16 count;
	__u16 rev_nat_index;
	__u16 weight;
} __attribute__((packed));

struct lb6_key_v2 {
	union v6addr address;	/* Service virtual IPv6 address */
	__be16 dport;		/* L4 port filter, if unset, all ports apply */
	__u16 slave;		/* Backend iterator, 0 indicates the master service */
	__u8 proto;		/* L4 protocol, currently not used (set to 0) */
	__u8 pad[3];
};

/* See lb4_service_v2 comments */
struct lb6_service_v2 {
	__u32 backend_id;
	__u16 count;
	__u16 rev_nat_index;
	__u16 weight;
	__u16 pad;
};

/* See lb4_backend comments */
struct lb6_backend {
	union v6addr address;
	__be16 port;
	__u8 proto;
	__u8 pad;
};

struct lb6_reverse_nat {
	union v6addr address;
	__be16 port;
} __attribute__((packed));

struct lb4_key {
	__be32 address;
        __be16 dport;		/* L4 port filter, if unset, all ports apply */
	__u16 slave;		/* Backend iterator, 0 indicates the master service */
} __attribute__((packed));

struct lb4_service {
	__be32 target;
	__be16 port;
	__u16 count;
	__u16 rev_nat_index;
	__u16 weight;
} __attribute__((packed));

struct lb4_key_v2 {
	__be32 address;		/* Service virtual IPv4 address */
	__be16 dport;		/* L4 port filter, if unset, all ports apply */
	__u16 slave;		/* Backend iterator, 0 indicates the master service */
	__u8 proto;		/* L4 protocol, currently not used (set to 0) */
	__u8 pad[3];
};

struct lb4_service_v2 {
	__u32 backend_id;	/* Backend ID in lb4_backends */
	/* For the master service, count denotes number of service endpoints,
	 * while for any service endpoint, count contains a slave slot number
	 * in a corresponding legacy service which points to the same backend
	 * (used for the backward compatibility)
	 */
	__u16 count;
	__u16 rev_nat_index;	/* Reverse NAT ID in lb4_reverse_nat */
	__u16 weight;		/* Currently not used */
	__u16 pad;
};

struct lb4_backend {
	__be32 address;		/* Service endpoint IPv4 address */
	__be16 port;		/* L4 port filter */
	__u8 proto;		/* L4 protocol, currently not used (set to 0) */
	__u8 pad;
};

struct lb4_reverse_nat {
	__be32 address;
	__be16 port;
} __attribute__((packed));

// LB_RR_MAX_SEQ generated by daemon in node_config.h
struct lb_sequence {
	__u16 count;
	__u16 idx[LB_RR_MAX_SEQ];
};

struct ct_state {
	__u16 rev_nat_index;
	__u16 loopback:1,
	      reserved:15;
	__be16 orig_dport;
	__be32 addr;
	__be32 svc_addr;
	__u32 src_sec_id;
	__u16 slave;		/* Slave slot number in a legacy service */
	__u16 backend_id;	/* Backend ID in lb4_backends */
};

/* Lifetime of a proxy redirection entry. All proxies should be using TCP
 * keepalive to force some traffic over the connection periodically to keep
 * these entries alive. Cross-reference with ProxyKeepAlivePeriod. */
#define PROXY_DEFAULT_LIFETIME 720

/* The proxy key is written from the perspective of the source of the
 * connection, so the "destination" port reperesents the local host port which
 * the proxy is listening on, while the "source" address/port represents the
 * non-proxy side of the connection. This applies for both ingress and egress
 * proxies.
 *
 * The value provides the original destination's address/port which was
 * replaced in the initiating connection's packet when the packet was
 * redirected to the proxy.
 */
struct proxy4_tbl_key {
	__be32 saddr;
	__be16 dport; /* dport must be in front of sport, loaded with 4 bytes read */
	__be16 sport;
	__u8 nexthdr;
	__u8 pad;
} __attribute__((packed));

struct proxy4_tbl_value {
	__be32 orig_daddr;
	__be16 orig_dport;
	__u16 pad;
	__u32 identity;
	__u32 lifetime;
} __attribute__((packed));

struct proxy6_tbl_key {
	union v6addr saddr;
	__be16 dport;
	__be16 sport;
	__u8 nexthdr;
	__u8 pad;
} __attribute__((packed));

struct proxy6_tbl_value {
	union v6addr orig_daddr;
	__be16 orig_dport;
	__u16 pad;
	__u32 identity;
	__u32 lifetime;
} __attribute__((packed));

/* ep_config corresponds to the EndpointConfig object in pkg/maps/configmap. */
struct ep_config {
	__u32 flags; /* enum ep_cfg_flag */
	__be32 ipv4Addr;
	union v6addr ipv6Addr;
	mac_t node_mac;
	__u16 lxc_id;
	__be16 lxc_id_nb;
	__u32 identity;
	__be32 identity_nb;
	__u32 pad;
} __attribute__((packed));

/**
 * relax_verifier is a dummy helper call to introduce a pruning checkpoing to
 * help relax the verifier to avoid reaching complexity limits on older
 * kernels.
 */
static inline void relax_verifier(void)
{
	int foo = 0;
	csum_diff(0, 0, &foo, 1, 0);
}

static inline int redirect_self(struct __sk_buff *skb)
{
	/* Looping back the packet into the originating netns. In
	 * case of veth, it's xmit'ing into the hosts' veth device
	 * such that we end up on ingress in the peer. For ipvlan
	 * slave it's redirect to ingress as we are attached on the
	 * slave in netns already.
	 */
#ifdef ENABLE_HOST_REDIRECT
	return redirect(skb->ifindex, 0);
#else
	return redirect(skb->ifindex, BPF_F_INGRESS);
#endif
}

static inline int redirect_peer(int ifindex, uint32_t flags)
{
	/* If our datapath has proper redirect support, we make use
	 * of it here, otherwise we terminate tc processing by letting
	 * stack handle forwarding e.g. in ipvlan case.
	 */
#ifdef ENABLE_HOST_REDIRECT
	return redirect(ifindex, flags);
#else
	return TC_ACT_OK;
#endif /* ENABLE_HOST_REDIRECT */
}

#endif
back to top