Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next

Alexei Starovoitov says:

====================
pull-request: bpf-next 2020-10-12

The main changes are:

1) BPF verifier improvements to track register allocation patterns, from Alexei and Yonghong.

2) libbpf relocation support for different size load/store, from Andrii.

3) bpf_redirect_peer() helper and support for inner map array with different max_entries, from Daniel.

4) BPF support for per-cpu variables, from Hao.

5) sockmap improvements, from John.
====================

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Committed by Jakub Kicinski, 2020-10-12 16:16:50 -07:00
commit ccdf7fae3a
73 changed files with 4334 additions and 767 deletions


@ -60,13 +60,13 @@ Q: Where can I find patches currently under discussion for BPF subsystem?
A: All patches that are Cc'ed to netdev are queued for review under netdev
patchwork project:
http://patchwork.ozlabs.org/project/netdev/list/
https://patchwork.kernel.org/project/netdevbpf/list/
Those patches which target BPF, are assigned to a 'bpf' delegate for
further processing from BPF maintainers. The current queue with
patches under review can be found at:
https://patchwork.ozlabs.org/project/netdev/list/?delegate=77147
https://patchwork.kernel.org/project/netdevbpf/list/?delegate=121173
Once the patches have been reviewed by the BPF community as a whole
and approved by the BPF maintainers, their status in patchwork will be


@ -3263,7 +3263,7 @@ M: Daniel Borkmann <daniel@iogearbox.net>
R: Martin KaFai Lau <kafai@fb.com>
R: Song Liu <songliubraving@fb.com>
R: Yonghong Song <yhs@fb.com>
R: Andrii Nakryiko <andriin@fb.com>
R: Andrii Nakryiko <andrii@kernel.org>
R: John Fastabend <john.fastabend@gmail.com>
R: KP Singh <kpsingh@chromium.org>
L: netdev@vger.kernel.org


@ -420,6 +420,14 @@ static int veth_select_rxq(struct net_device *dev)
return smp_processor_id() % dev->real_num_rx_queues;
}
static struct net_device *veth_peer_dev(struct net_device *dev)
{
struct veth_priv *priv = netdev_priv(dev);
/* Callers must be under RCU read side. */
return rcu_dereference(priv->peer);
}
static int veth_xdp_xmit(struct net_device *dev, int n,
struct xdp_frame **frames,
u32 flags, bool ndo_xmit)
@ -1224,6 +1232,7 @@ static const struct net_device_ops veth_netdev_ops = {
.ndo_set_rx_headroom = veth_set_rx_headroom,
.ndo_bpf = veth_xdp,
.ndo_xdp_xmit = veth_ndo_xdp_xmit,
.ndo_get_peer_dev = veth_peer_dev,
};
#define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \


@ -82,7 +82,7 @@ struct bpf_map_ops {
void *(*map_fd_get_ptr)(struct bpf_map *map, struct file *map_file,
int fd);
void (*map_fd_put_ptr)(void *ptr);
u32 (*map_gen_lookup)(struct bpf_map *map, struct bpf_insn *insn_buf);
int (*map_gen_lookup)(struct bpf_map *map, struct bpf_insn *insn_buf);
u32 (*map_fd_sys_lookup_elem)(void *ptr);
void (*map_seq_show_elem)(struct bpf_map *map, void *key,
struct seq_file *m);
@ -293,6 +293,7 @@ enum bpf_arg_type {
ARG_PTR_TO_ALLOC_MEM_OR_NULL, /* pointer to dynamically allocated memory or NULL */
ARG_CONST_ALLOC_SIZE_OR_ZERO, /* number of allocated bytes requested */
ARG_PTR_TO_BTF_ID_SOCK_COMMON, /* pointer to in-kernel sock_common or bpf-mirrored bpf_sock */
ARG_PTR_TO_PERCPU_BTF_ID, /* pointer to in-kernel percpu type */
__BPF_ARG_TYPE_MAX,
};
@ -307,6 +308,8 @@ enum bpf_return_type {
RET_PTR_TO_SOCK_COMMON_OR_NULL, /* returns a pointer to a sock_common or NULL */
RET_PTR_TO_ALLOC_MEM_OR_NULL, /* returns a pointer to dynamically allocated memory or NULL */
RET_PTR_TO_BTF_ID_OR_NULL, /* returns a pointer to a btf_id or NULL */
RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL, /* returns a pointer to a valid memory or a btf_id or NULL */
RET_PTR_TO_MEM_OR_BTF_ID, /* returns a pointer to a valid memory or a btf_id */
};
/* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs
@ -405,6 +408,7 @@ enum bpf_reg_type {
PTR_TO_RDONLY_BUF_OR_NULL, /* reg points to a readonly buffer or NULL */
PTR_TO_RDWR_BUF, /* reg points to a read/write buffer */
PTR_TO_RDWR_BUF_OR_NULL, /* reg points to a read/write buffer or NULL */
PTR_TO_PERCPU_BTF_ID, /* reg points to a percpu kernel variable */
};
/* The information passed from prog-specific *_is_valid_access
@ -1828,6 +1832,8 @@ extern const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto;
extern const struct bpf_func_proto bpf_skc_to_udp6_sock_proto;
extern const struct bpf_func_proto bpf_copy_from_user_proto;
extern const struct bpf_func_proto bpf_snprintf_btf_proto;
extern const struct bpf_func_proto bpf_per_cpu_ptr_proto;
extern const struct bpf_func_proto bpf_this_cpu_ptr_proto;
const struct bpf_func_proto *bpf_tracing_func_proto(
enum bpf_func_id func_id, const struct bpf_prog *prog);


@ -308,6 +308,13 @@ struct bpf_insn_aux_data {
u32 map_index; /* index into used_maps[] */
u32 map_off; /* offset from value base address */
};
struct {
enum bpf_reg_type reg_type; /* type of pseudo_btf_id */
union {
u32 btf_id; /* btf_id for struct typed var */
u32 mem_size; /* mem_size for non-struct typed var */
};
} btf_var;
};
u64 map_key_state; /* constant (32 bit) key tracking for maps */
int ctx_field_size; /* the ctx field size for load insn, maybe 0 */


@ -110,6 +110,11 @@ btf_resolve_size(const struct btf *btf, const struct btf_type *type,
i < btf_type_vlen(struct_type); \
i++, member++)
#define for_each_vsi(i, datasec_type, member) \
for (i = 0, member = btf_type_var_secinfo(datasec_type); \
i < btf_type_vlen(datasec_type); \
i++, member++)
static inline bool btf_type_is_ptr(const struct btf_type *t)
{
return BTF_INFO_KIND(t->info) == BTF_KIND_PTR;
@ -145,6 +150,21 @@ static inline bool btf_type_is_func_proto(const struct btf_type *t)
return BTF_INFO_KIND(t->info) == BTF_KIND_FUNC_PROTO;
}
static inline bool btf_type_is_var(const struct btf_type *t)
{
return BTF_INFO_KIND(t->info) == BTF_KIND_VAR;
}
/* union is only a special case of struct:
* all its offsetof(member) == 0
*/
static inline bool btf_type_is_struct(const struct btf_type *t)
{
u8 kind = BTF_INFO_KIND(t->info);
return kind == BTF_KIND_STRUCT || kind == BTF_KIND_UNION;
}
static inline u16 btf_type_vlen(const struct btf_type *t)
{
return BTF_INFO_VLEN(t->info);
@ -179,6 +199,12 @@ static inline const struct btf_member *btf_type_member(const struct btf_type *t)
return (const struct btf_member *)(t + 1);
}
static inline const struct btf_var_secinfo *btf_type_var_secinfo(
const struct btf_type *t)
{
return (const struct btf_var_secinfo *)(t + 1);
}
#ifdef CONFIG_BPF_SYSCALL
const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id);
const char *btf_name_by_offset(const struct btf *btf, u32 offset);


@ -1276,6 +1276,9 @@ struct netdev_net_notifier {
* int (*ndo_tunnel_ctl)(struct net_device *dev, struct ip_tunnel_parm *p,
* int cmd);
* Add, change, delete or get information on an IPv4 tunnel.
* struct net_device *(*ndo_get_peer_dev)(struct net_device *dev);
* If a device is paired with a peer device, return the peer instance.
* The caller must be under RCU read context.
*/
struct net_device_ops {
int (*ndo_init)(struct net_device *dev);
@ -1483,6 +1486,7 @@ struct net_device_ops {
struct devlink_port * (*ndo_get_devlink_port)(struct net_device *dev);
int (*ndo_tunnel_ctl)(struct net_device *dev,
struct ip_tunnel_parm *p, int cmd);
struct net_device * (*ndo_get_peer_dev)(struct net_device *dev);
};
/**


@ -308,6 +308,8 @@ struct sk_psock *sk_psock_init(struct sock *sk, int node);
int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock);
void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock);
void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock);
void sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock);
void sk_psock_stop_verdict(struct sock *sk, struct sk_psock *psock);
int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock,
struct sk_msg *msg);


@ -2228,34 +2228,6 @@ int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock,
#endif /* CONFIG_NET_SOCK_MSG */
#ifdef CONFIG_CGROUP_BPF
/* Copy the listen sk's HDR_OPT_CB flags to its child.
*
* During 3-Way-HandShake, the synack is usually sent from
* the listen sk with the HDR_OPT_CB flags set so that
* bpf-prog will be called to write the BPF hdr option.
*
* In fastopen, the child sk is used to send synack instead
* of the listen sk. Thus, inheriting the HDR_OPT_CB flags
* from the listen sk gives the bpf-prog a chance to write
* BPF hdr option in the synack pkt during fastopen.
*
* Both fastopen and non-fastopen child will inherit the
* HDR_OPT_CB flags to keep the bpf-prog having a consistent
* behavior when deciding to clear this cb flags (or not)
* during the PASSIVE_ESTABLISHED_CB.
*
* In the future, other cb flags could be inherited here also.
*/
static inline void bpf_skops_init_child(const struct sock *sk,
struct sock *child)
{
tcp_sk(child)->bpf_sock_ops_cb_flags =
tcp_sk(sk)->bpf_sock_ops_cb_flags &
(BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG |
BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG |
BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG);
}
static inline void bpf_skops_init_skb(struct bpf_sock_ops_kern *skops,
struct sk_buff *skb,
unsigned int end_offset)
@ -2264,11 +2236,6 @@ static inline void bpf_skops_init_skb(struct bpf_sock_ops_kern *skops,
skops->skb_data_end = skb->data + end_offset;
}
#else
static inline void bpf_skops_init_child(const struct sock *sk,
struct sock *child)
{
}
static inline void bpf_skops_init_skb(struct bpf_sock_ops_kern *skops,
struct sk_buff *skb,
unsigned int end_offset)


@ -356,18 +356,36 @@ enum bpf_link_type {
#define BPF_F_SLEEPABLE (1U << 4)
/* When BPF ldimm64's insn[0].src_reg != 0 then this can have
* two extensions:
* the following extensions:
*
* insn[0].src_reg: BPF_PSEUDO_MAP_FD BPF_PSEUDO_MAP_VALUE
* insn[0].imm: map fd map fd
* insn[1].imm: 0 offset into value
* insn[0].off: 0 0
* insn[1].off: 0 0
* ldimm64 rewrite: address of map address of map[0]+offset
* verifier type: CONST_PTR_TO_MAP PTR_TO_MAP_VALUE
* insn[0].src_reg: BPF_PSEUDO_MAP_FD
* insn[0].imm: map fd
* insn[1].imm: 0
* insn[0].off: 0
* insn[1].off: 0
* ldimm64 rewrite: address of map
* verifier type: CONST_PTR_TO_MAP
*/
#define BPF_PSEUDO_MAP_FD 1
/* insn[0].src_reg: BPF_PSEUDO_MAP_VALUE
* insn[0].imm: map fd
* insn[1].imm: offset into value
* insn[0].off: 0
* insn[1].off: 0
* ldimm64 rewrite: address of map[0]+offset
* verifier type: PTR_TO_MAP_VALUE
*/
#define BPF_PSEUDO_MAP_VALUE 2
/* insn[0].src_reg: BPF_PSEUDO_BTF_ID
* insn[0].imm: kernel btd id of VAR
* insn[1].imm: 0
* insn[0].off: 0
* insn[1].off: 0
* ldimm64 rewrite: address of the kernel variable
* verifier type: PTR_TO_BTF_ID or PTR_TO_MEM, depending on whether the var
* is struct/union.
*/
#define BPF_PSEUDO_BTF_ID 3
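For orientation, this is roughly the two-part ldimm64 a loader such as libbpf
emits for a ksym variable; a sketch only, with var_btf_id standing in for the
variable's BTF id in vmlinux:

	struct bpf_insn ksym_ld[2] = {
		{ .code    = BPF_LD | BPF_DW | BPF_IMM,
		  .dst_reg = BPF_REG_1,
		  .src_reg = BPF_PSEUDO_BTF_ID,
		  .imm     = var_btf_id },	/* kernel btf id of the VAR */
		{ .imm     = 0 },		/* reserved; the verifier rewrites
						 * both imm fields with the
						 * symbol's 64-bit address */
	};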
/* when bpf_call->src_reg == BPF_PSEUDO_CALL, bpf_call->imm == pc-relative
* offset to another bpf function
@ -417,6 +435,9 @@ enum {
/* Share perf_event among processes */
BPF_F_PRESERVE_ELEMS = (1U << 11),
/* Create a map that is suitable to be an inner map with dynamic max entries */
BPF_F_INNER_MAP = (1U << 12),
};
/* Flags for BPF_PROG_QUERY. */
@ -1680,7 +1701,7 @@ union bpf_attr {
* **TCP_CONGESTION**, **TCP_BPF_IW**,
* **TCP_BPF_SNDCWND_CLAMP**, **TCP_SAVE_SYN**,
* **TCP_KEEPIDLE**, **TCP_KEEPINTVL**, **TCP_KEEPCNT**,
* **TCP_SYNCNT**, **TCP_USER_TIMEOUT**.
* **TCP_SYNCNT**, **TCP_USER_TIMEOUT**, **TCP_NOTSENT_LOWAT**.
* * **IPPROTO_IP**, which supports *optname* **IP_TOS**.
* * **IPPROTO_IPV6**, which supports *optname* **IPV6_TCLASS**.
* Return
@ -2235,7 +2256,7 @@ union bpf_attr {
* Description
* This helper is used in programs implementing policies at the
* skb socket level. If the sk_buff *skb* is allowed to pass (i.e.
* if the verdeict eBPF program returns **SK_PASS**), redirect it
* if the verdict eBPF program returns **SK_PASS**), redirect it
* to the socket referenced by *map* (of type
* **BPF_MAP_TYPE_SOCKHASH**) using hash *key*. Both ingress and
* egress interfaces can be used for redirection. The
@ -3661,10 +3682,59 @@ union bpf_attr {
* Redirect the packet to another net device of index *ifindex*
* and fill in L2 addresses from neighboring subsystem. This helper
* is somewhat similar to **bpf_redirect**\ (), except that it
* fills in e.g. MAC addresses based on the L3 information from
* the packet. This helper is supported for IPv4 and IPv6 protocols.
* populates L2 addresses as well, meaning, internally, the helper
* performs a FIB lookup based on the skb's networking header to
* get the address of the next hop and then relies on the neighbor
* lookup for the L2 address of the nexthop.
*
* The *flags* argument is reserved and must be 0. The helper is
* currently only supported for tc BPF program types.
* currently only supported for tc BPF program types, and enabled
* for IPv4 and IPv6 protocols.
* Return
* The helper returns **TC_ACT_REDIRECT** on success or
* **TC_ACT_SHOT** on error.
*
* void *bpf_per_cpu_ptr(const void *percpu_ptr, u32 cpu)
* Description
* Take a pointer to a percpu ksym, *percpu_ptr*, and return a
* pointer to the percpu kernel variable on *cpu*. A ksym is an
extern variable decorated with '__ksym'. For a ksym, a global
variable (either static or global) of the same name is defined
in the kernel. The ksym is percpu if that global variable is percpu.
* The returned pointer points to the global percpu var on *cpu*.
*
bpf_per_cpu_ptr() has the same semantics as per_cpu_ptr() in the
kernel, except that bpf_per_cpu_ptr() may return NULL. This
happens if *cpu* is greater than or equal to nr_cpu_ids. The
caller of bpf_per_cpu_ptr() must check the returned value.
* Return
A pointer to the kernel percpu variable on *cpu*, or NULL if
*cpu* is invalid.
*
* void *bpf_this_cpu_ptr(const void *percpu_ptr)
* Description
* Take a pointer to a percpu ksym, *percpu_ptr*, and return a
* pointer to the percpu kernel variable on this cpu. See the
* description of 'ksym' in **bpf_per_cpu_ptr**\ ().
*
bpf_this_cpu_ptr() has the same semantics as this_cpu_ptr() in
the kernel. Unlike **bpf_per_cpu_ptr**\ (), it never returns
NULL.
* Return
A pointer to the kernel percpu variable on this cpu.
*
* long bpf_redirect_peer(u32 ifindex, u64 flags)
* Description
* Redirect the packet to another net device of index *ifindex*.
* This helper is somewhat similar to **bpf_redirect**\ (), except
* that the redirection happens to the *ifindex*' peer device and
* the netns switch takes place from ingress to ingress without
* going through the CPU's backlog queue.
*
* The *flags* argument is reserved and must be 0. The helper is
* currently only supported for tc BPF program types at the ingress
* hook and for veth device types. The peer device must reside in a
* different network namespace.
* Return
* The helper returns **TC_ACT_REDIRECT** on success or
* **TC_ACT_SHOT** on error.
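To make the contract above concrete, here is a minimal BPF C sketch of
consuming a per-cpu ksym (assumes CONFIG_DEBUG_INFO_BTF=y and a libbpf with
__ksym support; bpf_prog_active is a real kernel per-cpu int, but the program
itself is illustrative, not part of this series):

	#include "vmlinux.h"
	#include <bpf/bpf_helpers.h>

	extern const int bpf_prog_active __ksym;	/* per-cpu kernel variable */

	SEC("raw_tp/sys_enter")
	int dump_prog_active(const void *ctx)
	{
		__u32 cpu = bpf_get_smp_processor_id();
		const int *active;

		/* may be NULL when cpu >= nr_cpu_ids; the check is mandatory */
		active = bpf_per_cpu_ptr(&bpf_prog_active, cpu);
		if (active)
			bpf_printk("prog_active on cpu%u: %d", cpu, *active);

		/* never NULL; always resolves against the current cpu */
		active = bpf_this_cpu_ptr(&bpf_prog_active);
		bpf_printk("prog_active here: %d", *active);
		return 0;
	}

	char LICENSE[] SEC("license") = "GPL";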
@ -3823,6 +3893,9 @@ union bpf_attr {
FN(seq_printf_btf), \
FN(skb_cgroup_classid), \
FN(redirect_neigh), \
FN(bpf_per_cpu_ptr), \
FN(bpf_this_cpu_ptr), \
FN(redirect_peer), \
/* */
/* integer value in 'imm' field of BPF_CALL instruction selects which helper


@ -16,7 +16,7 @@
#define ARRAY_CREATE_FLAG_MASK \
(BPF_F_NUMA_NODE | BPF_F_MMAPABLE | BPF_F_ACCESS_MASK | \
BPF_F_PRESERVE_ELEMS)
BPF_F_PRESERVE_ELEMS | BPF_F_INNER_MAP)
static void bpf_array_free_percpu(struct bpf_array *array)
{
@ -62,7 +62,7 @@ int array_map_alloc_check(union bpf_attr *attr)
return -EINVAL;
if (attr->map_type != BPF_MAP_TYPE_ARRAY &&
attr->map_flags & BPF_F_MMAPABLE)
attr->map_flags & (BPF_F_MMAPABLE | BPF_F_INNER_MAP))
return -EINVAL;
if (attr->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY &&
@ -214,7 +214,7 @@ static int array_map_direct_value_meta(const struct bpf_map *map, u64 imm,
}
/* emit BPF instructions equivalent to C code of array_map_lookup_elem() */
static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
static int array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
struct bpf_insn *insn = insn_buf;
@ -223,6 +223,9 @@ static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
const int map_ptr = BPF_REG_1;
const int index = BPF_REG_2;
if (map->map_flags & BPF_F_INNER_MAP)
return -EOPNOTSUPP;
*insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value));
*insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0);
if (!map->bypass_spec_v1) {
@ -496,8 +499,10 @@ static int array_map_mmap(struct bpf_map *map, struct vm_area_struct *vma)
static bool array_map_meta_equal(const struct bpf_map *meta0,
const struct bpf_map *meta1)
{
return meta0->max_entries == meta1->max_entries &&
bpf_map_meta_equal(meta0, meta1);
if (!bpf_map_meta_equal(meta0, meta1))
return false;
return meta0->map_flags & BPF_F_INNER_MAP ? true :
meta0->max_entries == meta1->max_entries;
}
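The relaxed check is what lets one outer array accept inner maps whose
max_entries differ. A userspace sketch, assuming a libbpf new enough to
provide bpf_map_create() (error handling elided, names arbitrary):

	#include <bpf/bpf.h>

	int make_outer_map(void)
	{
		LIBBPF_OPTS(bpf_map_create_opts, inner_opts,
			    .map_flags = BPF_F_INNER_MAP);
		int small, big;

		/* two inner arrays differing only in max_entries; both may
		 * later be inserted into the same outer map
		 */
		small = bpf_map_create(BPF_MAP_TYPE_ARRAY, "small", 4, 4, 1, &inner_opts);
		big   = bpf_map_create(BPF_MAP_TYPE_ARRAY, "big", 4, 4, 1024, &inner_opts);

		/* the template fixes key/value size but no longer max_entries */
		LIBBPF_OPTS(bpf_map_create_opts, outer_opts, .inner_map_fd = small);
		return bpf_map_create(BPF_MAP_TYPE_ARRAY_OF_MAPS, "outer",
				      4, 4, 2, &outer_opts);
	}

The trade-off is visible in array_map_gen_lookup() above: BPF_F_INNER_MAP maps
give up the inlined lookup (-EOPNOTSUPP), since the inlined code would bake in
a constant max_entries.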
struct bpf_iter_seq_array_map_info {
@ -1251,7 +1256,7 @@ static void *array_of_map_lookup_elem(struct bpf_map *map, void *key)
return READ_ONCE(*inner_map);
}
static u32 array_of_map_gen_lookup(struct bpf_map *map,
static int array_of_map_gen_lookup(struct bpf_map *map,
struct bpf_insn *insn_buf)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);


@ -188,11 +188,6 @@
i < btf_type_vlen(struct_type); \
i++, member++)
#define for_each_vsi(i, struct_type, member) \
for (i = 0, member = btf_type_var_secinfo(struct_type); \
i < btf_type_vlen(struct_type); \
i++, member++)
#define for_each_vsi_from(i, from, struct_type, member) \
for (i = from, member = btf_type_var_secinfo(struct_type) + from; \
i < btf_type_vlen(struct_type); \
@ -440,16 +435,6 @@ static bool btf_type_nosize_or_null(const struct btf_type *t)
return !t || btf_type_nosize(t);
}
/* union is only a special case of struct:
* all its offsetof(member) == 0
*/
static bool btf_type_is_struct(const struct btf_type *t)
{
u8 kind = BTF_INFO_KIND(t->info);
return kind == BTF_KIND_STRUCT || kind == BTF_KIND_UNION;
}
static bool __btf_type_is_struct(const struct btf_type *t)
{
return BTF_INFO_KIND(t->info) == BTF_KIND_STRUCT;
@ -460,11 +445,6 @@ static bool btf_type_is_array(const struct btf_type *t)
return BTF_INFO_KIND(t->info) == BTF_KIND_ARRAY;
}
static bool btf_type_is_var(const struct btf_type *t)
{
return BTF_INFO_KIND(t->info) == BTF_KIND_VAR;
}
static bool btf_type_is_datasec(const struct btf_type *t)
{
return BTF_INFO_KIND(t->info) == BTF_KIND_DATASEC;
@ -613,11 +593,6 @@ static const struct btf_var *btf_type_var(const struct btf_type *t)
return (const struct btf_var *)(t + 1);
}
static const struct btf_var_secinfo *btf_type_var_secinfo(const struct btf_type *t)
{
return (const struct btf_var_secinfo *)(t + 1);
}
static const struct btf_kind_operations *btf_type_ops(const struct btf_type *t)
{
return kind_ops[BTF_INFO_KIND(t->info)];


@ -612,7 +612,7 @@ static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
* bpf_prog
* __htab_map_lookup_elem
*/
static u32 htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
static int htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
{
struct bpf_insn *insn = insn_buf;
const int ret = BPF_REG_0;
@ -651,7 +651,7 @@ static void *htab_lru_map_lookup_elem_sys(struct bpf_map *map, void *key)
return __htab_lru_map_lookup_elem(map, key, false);
}
static u32 htab_lru_map_gen_lookup(struct bpf_map *map,
static int htab_lru_map_gen_lookup(struct bpf_map *map,
struct bpf_insn *insn_buf)
{
struct bpf_insn *insn = insn_buf;
@ -2070,7 +2070,7 @@ static void *htab_of_map_lookup_elem(struct bpf_map *map, void *key)
return READ_ONCE(*inner_map);
}
static u32 htab_of_map_gen_lookup(struct bpf_map *map,
static int htab_of_map_gen_lookup(struct bpf_map *map,
struct bpf_insn *insn_buf)
{
struct bpf_insn *insn = insn_buf;


@ -623,6 +623,34 @@ const struct bpf_func_proto bpf_copy_from_user_proto = {
.arg3_type = ARG_ANYTHING,
};
BPF_CALL_2(bpf_per_cpu_ptr, const void *, ptr, u32, cpu)
{
if (cpu >= nr_cpu_ids)
return (unsigned long)NULL;
return (unsigned long)per_cpu_ptr((const void __percpu *)ptr, cpu);
}
const struct bpf_func_proto bpf_per_cpu_ptr_proto = {
.func = bpf_per_cpu_ptr,
.gpl_only = false,
.ret_type = RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL,
.arg1_type = ARG_PTR_TO_PERCPU_BTF_ID,
.arg2_type = ARG_ANYTHING,
};
BPF_CALL_1(bpf_this_cpu_ptr, const void *, percpu_ptr)
{
return (unsigned long)this_cpu_ptr((const void __percpu *)percpu_ptr);
}
const struct bpf_func_proto bpf_this_cpu_ptr_proto = {
.func = bpf_this_cpu_ptr,
.gpl_only = false,
.ret_type = RET_PTR_TO_MEM_OR_BTF_ID,
.arg1_type = ARG_PTR_TO_PERCPU_BTF_ID,
};
const struct bpf_func_proto bpf_get_current_task_proto __weak;
const struct bpf_func_proto bpf_probe_read_user_proto __weak;
const struct bpf_func_proto bpf_probe_read_user_str_proto __weak;
@ -689,6 +717,10 @@ bpf_base_func_proto(enum bpf_func_id func_id)
return &bpf_snprintf_btf_proto;
case BPF_FUNC_jiffies64:
return &bpf_jiffies64_proto;
case BPF_FUNC_bpf_per_cpu_ptr:
return &bpf_per_cpu_ptr_proto;
case BPF_FUNC_bpf_this_cpu_ptr:
return &bpf_this_cpu_ptr_proto;
default:
break;
}


@ -17,6 +17,8 @@ int pcpu_freelist_init(struct pcpu_freelist *s)
raw_spin_lock_init(&head->lock);
head->first = NULL;
}
raw_spin_lock_init(&s->extralist.lock);
s->extralist.first = NULL;
return 0;
}
@ -40,12 +42,50 @@ static inline void ___pcpu_freelist_push(struct pcpu_freelist_head *head,
raw_spin_unlock(&head->lock);
}
static inline bool pcpu_freelist_try_push_extra(struct pcpu_freelist *s,
struct pcpu_freelist_node *node)
{
if (!raw_spin_trylock(&s->extralist.lock))
return false;
pcpu_freelist_push_node(&s->extralist, node);
raw_spin_unlock(&s->extralist.lock);
return true;
}
static inline void ___pcpu_freelist_push_nmi(struct pcpu_freelist *s,
struct pcpu_freelist_node *node)
{
int cpu, orig_cpu;
orig_cpu = cpu = raw_smp_processor_id();
while (1) {
struct pcpu_freelist_head *head;
head = per_cpu_ptr(s->freelist, cpu);
if (raw_spin_trylock(&head->lock)) {
pcpu_freelist_push_node(head, node);
raw_spin_unlock(&head->lock);
return;
}
cpu = cpumask_next(cpu, cpu_possible_mask);
if (cpu >= nr_cpu_ids)
cpu = 0;
/* cannot lock any per cpu lock, try extralist */
if (cpu == orig_cpu &&
pcpu_freelist_try_push_extra(s, node))
return;
}
}
void __pcpu_freelist_push(struct pcpu_freelist *s,
struct pcpu_freelist_node *node)
{
struct pcpu_freelist_head *head = this_cpu_ptr(s->freelist);
___pcpu_freelist_push(head, node);
if (in_nmi())
___pcpu_freelist_push_nmi(s, node);
else
___pcpu_freelist_push(this_cpu_ptr(s->freelist), node);
}
void pcpu_freelist_push(struct pcpu_freelist *s,
@ -81,7 +121,7 @@ void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size,
}
}
struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s)
static struct pcpu_freelist_node *___pcpu_freelist_pop(struct pcpu_freelist *s)
{
struct pcpu_freelist_head *head;
struct pcpu_freelist_node *node;
@ -102,8 +142,59 @@ struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s)
if (cpu >= nr_cpu_ids)
cpu = 0;
if (cpu == orig_cpu)
return NULL;
break;
}
/* per cpu lists are all empty, try extralist */
raw_spin_lock(&s->extralist.lock);
node = s->extralist.first;
if (node)
s->extralist.first = node->next;
raw_spin_unlock(&s->extralist.lock);
return node;
}
static struct pcpu_freelist_node *
___pcpu_freelist_pop_nmi(struct pcpu_freelist *s)
{
struct pcpu_freelist_head *head;
struct pcpu_freelist_node *node;
int orig_cpu, cpu;
orig_cpu = cpu = raw_smp_processor_id();
while (1) {
head = per_cpu_ptr(s->freelist, cpu);
if (raw_spin_trylock(&head->lock)) {
node = head->first;
if (node) {
head->first = node->next;
raw_spin_unlock(&head->lock);
return node;
}
raw_spin_unlock(&head->lock);
}
cpu = cpumask_next(cpu, cpu_possible_mask);
if (cpu >= nr_cpu_ids)
cpu = 0;
if (cpu == orig_cpu)
break;
}
/* cannot pop from per cpu lists, try extralist */
if (!raw_spin_trylock(&s->extralist.lock))
return NULL;
node = s->extralist.first;
if (node)
s->extralist.first = node->next;
raw_spin_unlock(&s->extralist.lock);
return node;
}
struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s)
{
if (in_nmi())
return ___pcpu_freelist_pop_nmi(s);
return ___pcpu_freelist_pop(s);
}
struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s)


@ -13,6 +13,7 @@ struct pcpu_freelist_head {
struct pcpu_freelist {
struct pcpu_freelist_head __percpu *freelist;
struct pcpu_freelist_head extralist;
};
struct pcpu_freelist_node {


@ -4323,8 +4323,10 @@ static int bpf_prog_bind_map(union bpf_attr *attr)
used_maps_old = prog->aux->used_maps;
for (i = 0; i < prog->aux->used_map_cnt; i++)
if (used_maps_old[i] == map)
if (used_maps_old[i] == map) {
bpf_map_put(map);
goto out_unlock;
}
used_maps_new = kmalloc_array(prog->aux->used_map_cnt + 1,
sizeof(used_maps_new[0]),


@ -238,6 +238,8 @@ struct bpf_call_arg_meta {
u64 msize_max_value;
int ref_obj_id;
int func_id;
u32 btf_id;
u32 ret_btf_id;
};
struct btf *btf_vmlinux;
@ -517,6 +519,7 @@ static const char * const reg_type_str[] = {
[PTR_TO_XDP_SOCK] = "xdp_sock",
[PTR_TO_BTF_ID] = "ptr_",
[PTR_TO_BTF_ID_OR_NULL] = "ptr_or_null_",
[PTR_TO_PERCPU_BTF_ID] = "percpu_ptr_",
[PTR_TO_MEM] = "mem",
[PTR_TO_MEM_OR_NULL] = "mem_or_null",
[PTR_TO_RDONLY_BUF] = "rdonly_buf",
@ -583,7 +586,9 @@ static void print_verifier_state(struct bpf_verifier_env *env,
/* reg->off should be 0 for SCALAR_VALUE */
verbose(env, "%lld", reg->var_off.value + reg->off);
} else {
if (t == PTR_TO_BTF_ID || t == PTR_TO_BTF_ID_OR_NULL)
if (t == PTR_TO_BTF_ID ||
t == PTR_TO_BTF_ID_OR_NULL ||
t == PTR_TO_PERCPU_BTF_ID)
verbose(env, "%s", kernel_type_name(reg->btf_id));
verbose(env, "(id=%d", reg->id);
if (reg_type_may_be_refcounted_or_null(t))
@ -2204,6 +2209,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
case PTR_TO_RDONLY_BUF_OR_NULL:
case PTR_TO_RDWR_BUF:
case PTR_TO_RDWR_BUF_OR_NULL:
case PTR_TO_PERCPU_BTF_ID:
return true;
default:
return false;
@ -2221,6 +2227,20 @@ static bool register_is_const(struct bpf_reg_state *reg)
return reg->type == SCALAR_VALUE && tnum_is_const(reg->var_off);
}
static bool __is_scalar_unbounded(struct bpf_reg_state *reg)
{
return tnum_is_unknown(reg->var_off) &&
reg->smin_value == S64_MIN && reg->smax_value == S64_MAX &&
reg->umin_value == 0 && reg->umax_value == U64_MAX &&
reg->s32_min_value == S32_MIN && reg->s32_max_value == S32_MAX &&
reg->u32_min_value == 0 && reg->u32_max_value == U32_MAX;
}
static bool register_is_bounded(struct bpf_reg_state *reg)
{
return reg->type == SCALAR_VALUE && !__is_scalar_unbounded(reg);
}
static bool __is_pointer_value(bool allow_ptr_leaks,
const struct bpf_reg_state *reg)
{
@ -2272,7 +2292,7 @@ static int check_stack_write(struct bpf_verifier_env *env,
if (value_regno >= 0)
reg = &cur->regs[value_regno];
if (reg && size == BPF_REG_SIZE && register_is_const(reg) &&
if (reg && size == BPF_REG_SIZE && register_is_bounded(reg) &&
!register_is_null(reg) && env->bpf_capable) {
if (dst_reg != BPF_REG_FP) {
/* The backtracking logic can only recognize explicit
@ -2667,7 +2687,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
case BPF_PROG_TYPE_CGROUP_SKB:
if (t == BPF_WRITE)
return false;
/* fallthrough */
fallthrough;
/* Program types with direct read + write access go here! */
case BPF_PROG_TYPE_SCHED_CLS:
@ -3978,6 +3998,7 @@ static const struct bpf_reg_types sock_types = {
},
};
#ifdef CONFIG_NET
static const struct bpf_reg_types btf_id_sock_common_types = {
.types = {
PTR_TO_SOCK_COMMON,
@ -3988,6 +4009,7 @@ static const struct bpf_reg_types btf_id_sock_common_types = {
},
.btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON],
};
#endif
static const struct bpf_reg_types mem_types = {
.types = {
@ -4017,6 +4039,7 @@ static const struct bpf_reg_types alloc_mem_types = { .types = { PTR_TO_MEM } };
static const struct bpf_reg_types const_map_ptr_types = { .types = { CONST_PTR_TO_MAP } };
static const struct bpf_reg_types btf_ptr_types = { .types = { PTR_TO_BTF_ID } };
static const struct bpf_reg_types spin_lock_types = { .types = { PTR_TO_MAP_VALUE } };
static const struct bpf_reg_types percpu_btf_ptr_types = { .types = { PTR_TO_PERCPU_BTF_ID } };
static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = {
[ARG_PTR_TO_MAP_KEY] = &map_key_value_types,
@ -4030,7 +4053,9 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = {
[ARG_PTR_TO_CTX] = &context_types,
[ARG_PTR_TO_CTX_OR_NULL] = &context_types,
[ARG_PTR_TO_SOCK_COMMON] = &sock_types,
#ifdef CONFIG_NET
[ARG_PTR_TO_BTF_ID_SOCK_COMMON] = &btf_id_sock_common_types,
#endif
[ARG_PTR_TO_SOCKET] = &fullsock_types,
[ARG_PTR_TO_SOCKET_OR_NULL] = &fullsock_types,
[ARG_PTR_TO_BTF_ID] = &btf_ptr_types,
@ -4042,6 +4067,7 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = {
[ARG_PTR_TO_ALLOC_MEM_OR_NULL] = &alloc_mem_types,
[ARG_PTR_TO_INT] = &int_ptr_types,
[ARG_PTR_TO_LONG] = &int_ptr_types,
[ARG_PTR_TO_PERCPU_BTF_ID] = &percpu_btf_ptr_types,
};
static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
@ -4205,6 +4231,12 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
err = check_helper_mem_access(env, regno,
meta->map_ptr->value_size, false,
meta);
} else if (arg_type == ARG_PTR_TO_PERCPU_BTF_ID) {
if (!reg->btf_id) {
verbose(env, "Helper has invalid btf_id in R%d\n", regno);
return -EACCES;
}
meta->ret_btf_id = reg->btf_id;
} else if (arg_type == ARG_PTR_TO_SPIN_LOCK) {
if (meta->func_id == BPF_FUNC_spin_lock) {
if (process_spin_lock(env, regno, true))
@ -5114,6 +5146,35 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
regs[BPF_REG_0].type = PTR_TO_MEM_OR_NULL;
regs[BPF_REG_0].id = ++env->id_gen;
regs[BPF_REG_0].mem_size = meta.mem_size;
} else if (fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL ||
fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID) {
const struct btf_type *t;
mark_reg_known_zero(env, regs, BPF_REG_0);
t = btf_type_skip_modifiers(btf_vmlinux, meta.ret_btf_id, NULL);
if (!btf_type_is_struct(t)) {
u32 tsize;
const struct btf_type *ret;
const char *tname;
/* resolve the type size of ksym. */
ret = btf_resolve_size(btf_vmlinux, t, &tsize);
if (IS_ERR(ret)) {
tname = btf_name_by_offset(btf_vmlinux, t->name_off);
verbose(env, "unable to resolve the size of type '%s': %ld\n",
tname, PTR_ERR(ret));
return -EINVAL;
}
regs[BPF_REG_0].type =
fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID ?
PTR_TO_MEM : PTR_TO_MEM_OR_NULL;
regs[BPF_REG_0].mem_size = tsize;
} else {
regs[BPF_REG_0].type =
fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID ?
PTR_TO_BTF_ID : PTR_TO_BTF_ID_OR_NULL;
regs[BPF_REG_0].btf_id = meta.ret_btf_id;
}
} else if (fn->ret_type == RET_PTR_TO_BTF_ID_OR_NULL) {
int ret_btf_id;
@ -5432,7 +5493,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
/* smin_val represents the known value */
if (known && smin_val == 0 && opcode == BPF_ADD)
break;
/* fall-through */
fallthrough;
case PTR_TO_PACKET_END:
case PTR_TO_SOCKET:
case PTR_TO_SOCKET_OR_NULL:
@ -6389,6 +6450,11 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
src_reg = NULL;
if (dst_reg->type != SCALAR_VALUE)
ptr_reg = dst_reg;
else
/* Make sure ID is cleared otherwise dst_reg min/max could be
* incorrectly propagated into other registers by find_equal_scalars()
*/
dst_reg->id = 0;
if (BPF_SRC(insn->code) == BPF_X) {
src_reg = &regs[insn->src_reg];
if (src_reg->type != SCALAR_VALUE) {
@ -6522,6 +6588,12 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
/* case: R1 = R2
* copy register state to dest reg
*/
if (src_reg->type == SCALAR_VALUE && !src_reg->id)
/* Assign src and dst registers the same ID
* that will be used by find_equal_scalars()
* to propagate min/max range.
*/
src_reg->id = ++env->id_gen;
*dst_reg = *src_reg;
dst_reg->live |= REG_LIVE_WRITTEN;
dst_reg->subreg_def = DEF_NOT_SUBREG;
@ -6534,6 +6606,11 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
return -EACCES;
} else if (src_reg->type == SCALAR_VALUE) {
*dst_reg = *src_reg;
/* Make sure ID is cleared otherwise
* dst_reg min/max could be incorrectly
* propagated into src_reg by find_equal_scalars()
*/
dst_reg->id = 0;
dst_reg->live |= REG_LIVE_WRITTEN;
dst_reg->subreg_def = env->insn_idx + 1;
} else {
@ -7322,6 +7399,30 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn,
return true;
}
static void find_equal_scalars(struct bpf_verifier_state *vstate,
struct bpf_reg_state *known_reg)
{
struct bpf_func_state *state;
struct bpf_reg_state *reg;
int i, j;
for (i = 0; i <= vstate->curframe; i++) {
state = vstate->frame[i];
for (j = 0; j < MAX_BPF_REG; j++) {
reg = &state->regs[j];
if (reg->type == SCALAR_VALUE && reg->id == known_reg->id)
*reg = *known_reg;
}
bpf_for_each_spilled_reg(j, state, reg) {
if (!reg)
continue;
if (reg->type == SCALAR_VALUE && reg->id == known_reg->id)
*reg = *known_reg;
}
}
}
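In effect, once two scalar registers share an ID, a bounds check on either one
constrains both. A hedged BPF C sketch of the copy-then-check pattern this
makes verifiable (names and map layout are illustrative):

	#include "vmlinux.h"
	#include <bpf/bpf_helpers.h>

	struct data {
		__u32 slots[16];
	};

	struct {
		__uint(type, BPF_MAP_TYPE_ARRAY);
		__uint(max_entries, 1);
		__type(key, __u32);
		__type(value, struct data);
	} m SEC(".maps");

	SEC("tc")
	int use_copied_bound(struct __sk_buff *skb)
	{
		__u32 zero = 0, i = skb->len;	/* unbounded scalar */
		__u32 j = i;			/* copy: both regs share an ID */
		struct data *d;

		d = bpf_map_lookup_elem(&m, &zero);
		if (!d)
			return 0;
		if (j >= 16)			/* bound learned on the copy... */
			return 0;
		return d->slots[i];		/* ...now also applies to 'i' */
	}

	char LICENSE[] SEC("license") = "GPL";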
static int check_cond_jmp_op(struct bpf_verifier_env *env,
struct bpf_insn *insn, int *insn_idx)
{
@ -7450,6 +7551,11 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
reg_combine_min_max(&other_branch_regs[insn->src_reg],
&other_branch_regs[insn->dst_reg],
src_reg, dst_reg, opcode);
if (src_reg->id) {
find_equal_scalars(this_branch, src_reg);
find_equal_scalars(other_branch, &other_branch_regs[insn->src_reg]);
}
}
} else if (dst_reg->type == SCALAR_VALUE) {
reg_set_min_max(&other_branch_regs[insn->dst_reg],
@ -7457,6 +7563,11 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
opcode, is_jmp32);
}
if (dst_reg->type == SCALAR_VALUE && dst_reg->id) {
find_equal_scalars(this_branch, dst_reg);
find_equal_scalars(other_branch, &other_branch_regs[insn->dst_reg]);
}
/* detect if R == 0 where R is returned from bpf_map_lookup_elem().
* NOTE: these optimizations below are related with pointer comparison
* which will never be JMP32.
@ -7488,6 +7599,7 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
{
struct bpf_insn_aux_data *aux = cur_aux(env);
struct bpf_reg_state *regs = cur_regs(env);
struct bpf_reg_state *dst_reg;
struct bpf_map *map;
int err;
@ -7504,25 +7616,45 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
if (err)
return err;
dst_reg = &regs[insn->dst_reg];
if (insn->src_reg == 0) {
u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm;
regs[insn->dst_reg].type = SCALAR_VALUE;
dst_reg->type = SCALAR_VALUE;
__mark_reg_known(&regs[insn->dst_reg], imm);
return 0;
}
if (insn->src_reg == BPF_PSEUDO_BTF_ID) {
mark_reg_known_zero(env, regs, insn->dst_reg);
dst_reg->type = aux->btf_var.reg_type;
switch (dst_reg->type) {
case PTR_TO_MEM:
dst_reg->mem_size = aux->btf_var.mem_size;
break;
case PTR_TO_BTF_ID:
case PTR_TO_PERCPU_BTF_ID:
dst_reg->btf_id = aux->btf_var.btf_id;
break;
default:
verbose(env, "bpf verifier is misconfigured\n");
return -EFAULT;
}
return 0;
}
map = env->used_maps[aux->map_index];
mark_reg_known_zero(env, regs, insn->dst_reg);
regs[insn->dst_reg].map_ptr = map;
dst_reg->map_ptr = map;
if (insn->src_reg == BPF_PSEUDO_MAP_VALUE) {
regs[insn->dst_reg].type = PTR_TO_MAP_VALUE;
regs[insn->dst_reg].off = aux->map_off;
dst_reg->type = PTR_TO_MAP_VALUE;
dst_reg->off = aux->map_off;
if (map_value_has_spin_lock(map))
regs[insn->dst_reg].id = ++env->id_gen;
dst_reg->id = ++env->id_gen;
} else if (insn->src_reg == BPF_PSEUDO_MAP_FD) {
regs[insn->dst_reg].type = CONST_PTR_TO_MAP;
dst_reg->type = CONST_PTR_TO_MAP;
} else {
verbose(env, "bpf verifier is misconfigured\n");
return -EINVAL;
@ -9424,6 +9556,92 @@ static int do_check(struct bpf_verifier_env *env)
return 0;
}
/* replace pseudo btf_id with kernel symbol address */
static int check_pseudo_btf_id(struct bpf_verifier_env *env,
struct bpf_insn *insn,
struct bpf_insn_aux_data *aux)
{
u32 datasec_id, type, id = insn->imm;
const struct btf_var_secinfo *vsi;
const struct btf_type *datasec;
const struct btf_type *t;
const char *sym_name;
bool percpu = false;
u64 addr;
int i;
if (!btf_vmlinux) {
verbose(env, "kernel is missing BTF, make sure CONFIG_DEBUG_INFO_BTF=y is specified in Kconfig.\n");
return -EINVAL;
}
if (insn[1].imm != 0) {
verbose(env, "reserved field (insn[1].imm) is used in pseudo_btf_id ldimm64 insn.\n");
return -EINVAL;
}
t = btf_type_by_id(btf_vmlinux, id);
if (!t) {
verbose(env, "ldimm64 insn specifies invalid btf_id %d.\n", id);
return -ENOENT;
}
if (!btf_type_is_var(t)) {
verbose(env, "pseudo btf_id %d in ldimm64 isn't KIND_VAR.\n",
id);
return -EINVAL;
}
sym_name = btf_name_by_offset(btf_vmlinux, t->name_off);
addr = kallsyms_lookup_name(sym_name);
if (!addr) {
verbose(env, "ldimm64 failed to find the address for kernel symbol '%s'.\n",
sym_name);
return -ENOENT;
}
datasec_id = btf_find_by_name_kind(btf_vmlinux, ".data..percpu",
BTF_KIND_DATASEC);
if (datasec_id > 0) {
datasec = btf_type_by_id(btf_vmlinux, datasec_id);
for_each_vsi(i, datasec, vsi) {
if (vsi->type == id) {
percpu = true;
break;
}
}
}
insn[0].imm = (u32)addr;
insn[1].imm = addr >> 32;
type = t->type;
t = btf_type_skip_modifiers(btf_vmlinux, type, NULL);
if (percpu) {
aux->btf_var.reg_type = PTR_TO_PERCPU_BTF_ID;
aux->btf_var.btf_id = type;
} else if (!btf_type_is_struct(t)) {
const struct btf_type *ret;
const char *tname;
u32 tsize;
/* resolve the type size of ksym. */
ret = btf_resolve_size(btf_vmlinux, t, &tsize);
if (IS_ERR(ret)) {
tname = btf_name_by_offset(btf_vmlinux, t->name_off);
verbose(env, "ldimm64 unable to resolve the size of type '%s': %ld\n",
tname, PTR_ERR(ret));
return -EINVAL;
}
aux->btf_var.reg_type = PTR_TO_MEM;
aux->btf_var.mem_size = tsize;
} else {
aux->btf_var.reg_type = PTR_TO_BTF_ID;
aux->btf_var.btf_id = type;
}
return 0;
}
static int check_map_prealloc(struct bpf_map *map)
{
return (map->map_type != BPF_MAP_TYPE_HASH &&
@ -9534,10 +9752,14 @@ static bool bpf_map_is_cgroup_storage(struct bpf_map *map)
map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE);
}
/* look for pseudo eBPF instructions that access map FDs and
* replace them with actual map pointers
/* find and rewrite pseudo imm in ld_imm64 instructions:
*
* 1. if it accesses map FD, replace it with actual map pointer.
* 2. if it accesses btf_id of a VAR, replace it with pointer to the var.
*
* NOTE: btf_vmlinux is required for converting pseudo btf_id.
*/
static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env)
{
struct bpf_insn *insn = env->prog->insnsi;
int insn_cnt = env->prog->len;
@ -9578,6 +9800,14 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
/* valid generic load 64-bit imm */
goto next_insn;
if (insn[0].src_reg == BPF_PSEUDO_BTF_ID) {
aux = &env->insn_aux_data[i];
err = check_pseudo_btf_id(env, insn, aux);
if (err)
return err;
goto next_insn;
}
/* In final convert_pseudo_ld_imm64() step, this is
* converted into regular 64-bit imm load insn.
*/
@ -10819,7 +11049,9 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
if (insn->imm == BPF_FUNC_map_lookup_elem &&
ops->map_gen_lookup) {
cnt = ops->map_gen_lookup(map_ptr, insn_buf);
if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) {
if (cnt == -EOPNOTSUPP)
goto patch_map_ops_generic;
if (cnt <= 0 || cnt >= ARRAY_SIZE(insn_buf)) {
verbose(env, "bpf verifier is misconfigured\n");
return -EINVAL;
}
@ -10849,7 +11081,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
(int (*)(struct bpf_map *map, void *value))NULL));
BUILD_BUG_ON(!__same_type(ops->map_peek_elem,
(int (*)(struct bpf_map *map, void *value))NULL));
patch_map_ops_generic:
switch (insn->imm) {
case BPF_FUNC_map_lookup_elem:
insn->imm = BPF_CAST_CALL(ops->map_lookup_elem) -
@ -11633,10 +11865,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
if (is_priv)
env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ;
ret = replace_map_fd_with_map_ptr(env);
if (ret < 0)
goto skip_full_check;
if (bpf_prog_is_dev_bound(env->prog->aux)) {
ret = bpf_prog_offload_verifier_prep(env->prog);
if (ret)
@ -11662,6 +11890,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
if (ret)
goto skip_full_check;
ret = resolve_pseudo_ldimm64(env);
if (ret < 0)
goto skip_full_check;
ret = check_cfg(env);
if (ret < 0)
goto skip_full_check;


@ -1327,6 +1327,10 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return prog->aux->sleepable ? &bpf_copy_from_user_proto : NULL;
case BPF_FUNC_snprintf_btf:
return &bpf_snprintf_btf_proto;
case BPF_FUNC_bpf_per_cpu_ptr:
return &bpf_per_cpu_ptr_proto;
case BPF_FUNC_bpf_this_cpu_ptr:
return &bpf_this_cpu_ptr_proto;
default:
return NULL;
}
@ -1776,7 +1780,9 @@ const struct bpf_verifier_ops raw_tracepoint_verifier_ops = {
};
const struct bpf_prog_ops raw_tracepoint_prog_ops = {
#ifdef CONFIG_NET
.test_run = bpf_prog_test_run_raw_tp,
#endif
};
const struct bpf_verifier_ops tracing_verifier_ops = {


@ -4930,7 +4930,7 @@ EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
static inline struct sk_buff *
sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
struct net_device *orig_dev)
struct net_device *orig_dev, bool *another)
{
#ifdef CONFIG_NET_CLS_ACT
struct mini_Qdisc *miniq = rcu_dereference_bh(skb->dev->miniq_ingress);
@ -4974,7 +4974,11 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
* redirecting to another netdev
*/
__skb_push(skb, skb->mac_len);
skb_do_redirect(skb);
if (skb_do_redirect(skb) == -EAGAIN) {
__skb_pull(skb, skb->mac_len);
*another = true;
break;
}
return NULL;
case TC_ACT_CONSUMED:
return NULL;
@ -5163,7 +5167,12 @@ static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
skip_taps:
#ifdef CONFIG_NET_INGRESS
if (static_branch_unlikely(&ingress_needed_key)) {
skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev);
bool another = false;
skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev,
&another);
if (another)
goto another_round;
if (!skb)
goto out;


@ -76,6 +76,7 @@
#include <net/bpf_sk_storage.h>
#include <net/transp_v6.h>
#include <linux/btf_ids.h>
#include <net/tls.h>
static const struct bpf_func_proto *
bpf_sk_base_func_proto(enum bpf_func_id func_id);
@ -2379,8 +2380,9 @@ static int __bpf_redirect_neigh(struct sk_buff *skb, struct net_device *dev)
/* Internal, non-exposed redirect flags. */
enum {
BPF_F_NEIGH = (1ULL << 1),
#define BPF_F_REDIRECT_INTERNAL (BPF_F_NEIGH)
BPF_F_NEIGH = (1ULL << 1),
BPF_F_PEER = (1ULL << 2),
#define BPF_F_REDIRECT_INTERNAL (BPF_F_NEIGH | BPF_F_PEER)
};
BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags)
@ -2429,19 +2431,35 @@ EXPORT_PER_CPU_SYMBOL_GPL(bpf_redirect_info);
int skb_do_redirect(struct sk_buff *skb)
{
struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
struct net *net = dev_net(skb->dev);
struct net_device *dev;
u32 flags = ri->flags;
dev = dev_get_by_index_rcu(dev_net(skb->dev), ri->tgt_index);
dev = dev_get_by_index_rcu(net, ri->tgt_index);
ri->tgt_index = 0;
if (unlikely(!dev)) {
kfree_skb(skb);
return -EINVAL;
}
ri->flags = 0;
if (unlikely(!dev))
goto out_drop;
if (flags & BPF_F_PEER) {
const struct net_device_ops *ops = dev->netdev_ops;
if (unlikely(!ops->ndo_get_peer_dev ||
!skb_at_tc_ingress(skb)))
goto out_drop;
dev = ops->ndo_get_peer_dev(dev);
if (unlikely(!dev ||
!is_skb_forwardable(dev, skb) ||
net_eq(net, dev_net(dev))))
goto out_drop;
skb->dev = dev;
return -EAGAIN;
}
return flags & BPF_F_NEIGH ?
__bpf_redirect_neigh(skb, dev) :
__bpf_redirect(skb, dev, flags);
out_drop:
kfree_skb(skb);
return -EINVAL;
}
BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags)
@ -2465,6 +2483,27 @@ static const struct bpf_func_proto bpf_redirect_proto = {
.arg2_type = ARG_ANYTHING,
};
BPF_CALL_2(bpf_redirect_peer, u32, ifindex, u64, flags)
{
struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
if (unlikely(flags))
return TC_ACT_SHOT;
ri->flags = BPF_F_PEER;
ri->tgt_index = ifindex;
return TC_ACT_REDIRECT;
}
static const struct bpf_func_proto bpf_redirect_peer_proto = {
.func = bpf_redirect_peer,
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_ANYTHING,
.arg2_type = ARG_ANYTHING,
};
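A tc sketch of the intended use: attached at the ingress hook of the host-side
veth, traffic is switched straight into the peer's netns without a backlog
queue round trip. IFINDEX_HOST_VETH is a placeholder a real loader would
resolve at attach time:

	#include "vmlinux.h"
	#include <bpf/bpf_helpers.h>

	#define IFINDEX_HOST_VETH 5	/* hypothetical ifindex in this netns */

	SEC("tc")
	int host_ingress(struct __sk_buff *skb)
	{
		/* flags must be 0; returns TC_ACT_REDIRECT or TC_ACT_SHOT */
		return bpf_redirect_peer(IFINDEX_HOST_VETH, 0);
	}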
BPF_CALL_2(bpf_redirect_neigh, u32, ifindex, u64, flags)
{
struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
@ -3479,6 +3518,48 @@ static u32 __bpf_skb_max_len(const struct sk_buff *skb)
SKB_MAX_ALLOC;
}
BPF_CALL_4(sk_skb_adjust_room, struct sk_buff *, skb, s32, len_diff,
u32, mode, u64, flags)
{
u32 len_diff_abs = abs(len_diff);
bool shrink = len_diff < 0;
int ret = 0;
if (unlikely(flags || mode))
return -EINVAL;
if (unlikely(len_diff_abs > 0xfffU))
return -EFAULT;
if (!shrink) {
ret = skb_cow(skb, len_diff);
if (unlikely(ret < 0))
return ret;
__skb_push(skb, len_diff_abs);
memset(skb->data, 0, len_diff_abs);
} else {
if (unlikely(!pskb_may_pull(skb, len_diff_abs)))
return -ENOMEM;
__skb_pull(skb, len_diff_abs);
}
bpf_compute_data_end_sk_skb(skb);
if (tls_sw_has_ctx_rx(skb->sk)) {
struct strp_msg *rxm = strp_msg(skb);
rxm->full_len += len_diff;
}
return ret;
}
static const struct bpf_func_proto sk_skb_adjust_room_proto = {
.func = sk_skb_adjust_room,
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_ANYTHING,
.arg3_type = ARG_ANYTHING,
.arg4_type = ARG_ANYTHING,
};
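A short sk_skb sketch of the new proto in action, trimming a fixed-size
framing header before the verdict (the 4-byte header size is an arbitrary
example; in sk_skb programs the generic bpf_skb_adjust_room() helper id
resolves to this proto, per the sk_skb_func_proto hunk below):

	#include "vmlinux.h"
	#include <bpf/bpf_helpers.h>

	SEC("sk_skb/stream_verdict")
	int strip_frame_hdr(struct __sk_buff *skb)
	{
		/* negative len_diff shrinks from the head; mode/flags must be 0 */
		if (bpf_skb_adjust_room(skb, -4, 0, 0))
			return SK_DROP;
		return SK_PASS;
	}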
BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff,
u32, mode, u64, flags)
{
@ -4784,6 +4865,10 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname,
else
icsk->icsk_user_timeout = val;
break;
case TCP_NOTSENT_LOWAT:
tp->notsent_lowat = val;
sk->sk_write_space(sk);
break;
default:
ret = -EINVAL;
}
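With the new case in place, a sockops program can set the option like any
other; a minimal sketch (the threshold is arbitrary; the option constants are
defined locally since vmlinux.h does not carry them):

	#include "vmlinux.h"
	#include <bpf/bpf_helpers.h>

	#define SOL_TCP 6		/* IPPROTO_TCP */
	#define TCP_NOTSENT_LOWAT 25	/* include/uapi/linux/tcp.h */

	SEC("sockops")
	int set_notsent_lowat(struct bpf_sock_ops *skops)
	{
		int lowat = 128 * 1024;

		if (skops->op == BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB)
			bpf_setsockopt(skops, SOL_TCP, TCP_NOTSENT_LOWAT,
				       &lowat, sizeof(lowat));
		return 1;
	}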
@ -5149,7 +5234,6 @@ static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params,
memcpy(params->smac, dev->dev_addr, ETH_ALEN);
params->h_vlan_TCI = 0;
params->h_vlan_proto = 0;
params->ifindex = dev->ifindex;
return 0;
}
@ -5246,6 +5330,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
dev = nhc->nhc_dev;
params->rt_metric = res.fi->fib_priority;
params->ifindex = dev->ifindex;
/* xdp and cls_bpf programs are run in RCU-bh so
* rcu_read_lock_bh is not needed here
@ -5371,6 +5456,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
dev = res.nh->fib_nh_dev;
params->rt_metric = res.f6i->fib6_metric;
params->ifindex = dev->ifindex;
/* xdp and cls_bpf programs are run in RCU-bh so rcu_read_lock_bh is
* not needed here.
@ -6745,6 +6831,7 @@ bool bpf_helper_changes_pkt_data(void *func)
func == bpf_skb_change_tail ||
func == sk_skb_change_tail ||
func == bpf_skb_adjust_room ||
func == sk_skb_adjust_room ||
func == bpf_skb_pull_data ||
func == sk_skb_pull_data ||
func == bpf_clone_redirect ||
@ -7005,6 +7092,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_redirect_proto;
case BPF_FUNC_redirect_neigh:
return &bpf_redirect_neigh_proto;
case BPF_FUNC_redirect_peer:
return &bpf_redirect_peer_proto;
case BPF_FUNC_get_route_realm:
return &bpf_get_route_realm_proto;
case BPF_FUNC_get_hash_recalc:
@ -7218,6 +7307,8 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &sk_skb_change_tail_proto;
case BPF_FUNC_skb_change_head:
return &sk_skb_change_head_proto;
case BPF_FUNC_skb_adjust_room:
return &sk_skb_adjust_room_proto;
case BPF_FUNC_get_socket_cookie:
return &bpf_get_socket_cookie_proto;
case BPF_FUNC_get_socket_uid:

View File

@ -433,10 +433,12 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb)
static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb,
u32 off, u32 len, bool ingress)
{
if (ingress)
return sk_psock_skb_ingress(psock, skb);
else
if (!ingress) {
if (!sock_writeable(psock->sk))
return -EAGAIN;
return skb_send_sock_locked(psock->sk, skb, off, len);
}
return sk_psock_skb_ingress(psock, skb);
}
static void sk_psock_backlog(struct work_struct *work)
@ -625,6 +627,8 @@ void sk_psock_drop(struct sock *sk, struct sk_psock *psock)
rcu_assign_sk_user_data(sk, NULL);
if (psock->progs.skb_parser)
sk_psock_stop_strp(sk, psock);
else if (psock->progs.skb_verdict)
sk_psock_stop_verdict(sk, psock);
write_unlock_bh(&sk->sk_callback_lock);
sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED);
@ -682,19 +686,8 @@ EXPORT_SYMBOL_GPL(sk_psock_msg_verdict);
static int sk_psock_bpf_run(struct sk_psock *psock, struct bpf_prog *prog,
struct sk_buff *skb)
{
int ret;
skb->sk = psock->sk;
bpf_compute_data_end_sk_skb(skb);
ret = bpf_prog_run_pin_on_cpu(prog, skb);
/* strparser clones the skb before handing it to a upper layer,
* meaning skb_orphan has been called. We NULL sk on the way out
* to ensure we don't trigger a BUG_ON() in skb/sk operations
* later and because we are not charging the memory of this skb
* to any socket yet.
*/
skb->sk = NULL;
return ret;
return bpf_prog_run_pin_on_cpu(prog, skb);
}
static struct sk_psock *sk_psock_from_strp(struct strparser *strp)
@ -709,38 +702,35 @@ static void sk_psock_skb_redirect(struct sk_buff *skb)
{
struct sk_psock *psock_other;
struct sock *sk_other;
bool ingress;
sk_other = tcp_skb_bpf_redirect_fetch(skb);
/* This error indicates a buggy BPF program: it returned a redirect
* return code but didn't set a redirect interface.
*/
if (unlikely(!sk_other)) {
kfree_skb(skb);
return;
}
psock_other = sk_psock(sk_other);
/* This error indicates the socket is being torn down or had another
* error that caused the pipe to break. We can't send a packet on
* a socket that is in this state so we drop the skb.
*/
if (!psock_other || sock_flag(sk_other, SOCK_DEAD) ||
!sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) {
kfree_skb(skb);
return;
}
ingress = tcp_skb_bpf_ingress(skb);
if ((!ingress && sock_writeable(sk_other)) ||
(ingress &&
atomic_read(&sk_other->sk_rmem_alloc) <=
sk_other->sk_rcvbuf)) {
if (!ingress)
skb_set_owner_w(skb, sk_other);
skb_queue_tail(&psock_other->ingress_skb, skb);
schedule_work(&psock_other->work);
} else {
kfree_skb(skb);
}
skb_queue_tail(&psock_other->ingress_skb, skb);
schedule_work(&psock_other->work);
}
static void sk_psock_tls_verdict_apply(struct sk_buff *skb, int verdict)
static void sk_psock_tls_verdict_apply(struct sk_buff *skb, struct sock *sk, int verdict)
{
switch (verdict) {
case __SK_REDIRECT:
skb_set_owner_r(skb, sk);
sk_psock_skb_redirect(skb);
break;
case __SK_PASS:
@ -758,11 +748,17 @@ int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb)
rcu_read_lock();
prog = READ_ONCE(psock->progs.skb_verdict);
if (likely(prog)) {
/* We skip full set_owner_r here because if we do a SK_PASS
* or SK_DROP we can skip skb memory accounting and use the
* TLS context.
*/
skb->sk = psock->sk;
tcp_skb_bpf_redirect_clear(skb);
ret = sk_psock_bpf_run(psock, prog, skb);
ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb));
skb->sk = NULL;
}
sk_psock_tls_verdict_apply(skb, ret);
sk_psock_tls_verdict_apply(skb, psock->sk, ret);
rcu_read_unlock();
return ret;
}
@ -771,7 +767,9 @@ EXPORT_SYMBOL_GPL(sk_psock_tls_strp_read);
static void sk_psock_verdict_apply(struct sk_psock *psock,
struct sk_buff *skb, int verdict)
{
struct tcp_skb_cb *tcp;
struct sock *sk_other;
int err = -EIO;
switch (verdict) {
case __SK_PASS:
@ -780,16 +778,24 @@ static void sk_psock_verdict_apply(struct sk_psock *psock,
!sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) {
goto out_free;
}
if (atomic_read(&sk_other->sk_rmem_alloc) <=
sk_other->sk_rcvbuf) {
struct tcp_skb_cb *tcp = TCP_SKB_CB(skb);
tcp->bpf.flags |= BPF_F_INGRESS;
tcp = TCP_SKB_CB(skb);
tcp->bpf.flags |= BPF_F_INGRESS;
/* If the queue is empty then we can submit directly
* into the msg queue. If it is not empty we have to
* queue work, otherwise we may get OOO data. Errors
* from sk_psock_skb_ingress are handled by retrying
* later from the workqueue.
*/
if (skb_queue_empty(&psock->ingress_skb)) {
err = sk_psock_skb_ingress(psock, skb);
}
if (err < 0) {
skb_queue_tail(&psock->ingress_skb, skb);
schedule_work(&psock->work);
break;
}
goto out_free;
break;
case __SK_REDIRECT:
sk_psock_skb_redirect(skb);
break;
@ -814,9 +820,9 @@ static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb)
kfree_skb(skb);
goto out;
}
skb_set_owner_r(skb, sk);
prog = READ_ONCE(psock->progs.skb_verdict);
if (likely(prog)) {
skb_orphan(skb);
tcp_skb_bpf_redirect_clear(skb);
ret = sk_psock_bpf_run(psock, prog, skb);
ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb));
@ -839,8 +845,11 @@ static int sk_psock_strp_parse(struct strparser *strp, struct sk_buff *skb)
rcu_read_lock();
prog = READ_ONCE(psock->progs.skb_parser);
if (likely(prog))
if (likely(prog)) {
skb->sk = psock->sk;
ret = sk_psock_bpf_run(psock, prog, skb);
skb->sk = NULL;
}
rcu_read_unlock();
return ret;
}
@ -864,6 +873,57 @@ static void sk_psock_strp_data_ready(struct sock *sk)
rcu_read_unlock();
}
static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb,
unsigned int offset, size_t orig_len)
{
struct sock *sk = (struct sock *)desc->arg.data;
struct sk_psock *psock;
struct bpf_prog *prog;
int ret = __SK_DROP;
int len = skb->len;
/* clone here so sk_eat_skb() in tcp_read_sock does not drop our data */
skb = skb_clone(skb, GFP_ATOMIC);
if (!skb) {
desc->error = -ENOMEM;
return 0;
}
rcu_read_lock();
psock = sk_psock(sk);
if (unlikely(!psock)) {
len = 0;
kfree_skb(skb);
goto out;
}
skb_set_owner_r(skb, sk);
prog = READ_ONCE(psock->progs.skb_verdict);
if (likely(prog)) {
tcp_skb_bpf_redirect_clear(skb);
ret = sk_psock_bpf_run(psock, prog, skb);
ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb));
}
sk_psock_verdict_apply(psock, skb, ret);
out:
rcu_read_unlock();
return len;
}
static void sk_psock_verdict_data_ready(struct sock *sk)
{
struct socket *sock = sk->sk_socket;
read_descriptor_t desc;
if (unlikely(!sock || !sock->ops || !sock->ops->read_sock))
return;
desc.arg.data = sk;
desc.error = 0;
desc.count = 1;
sock->ops->read_sock(sk, &desc, sk_psock_verdict_recv);
}
static void sk_psock_write_space(struct sock *sk)
{
struct sk_psock *psock;
@ -893,6 +953,19 @@ int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock)
return strp_init(&psock->parser.strp, sk, &cb);
}
void sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock)
{
struct sk_psock_parser *parser = &psock->parser;
if (parser->enabled)
return;
parser->saved_data_ready = sk->sk_data_ready;
sk->sk_data_ready = sk_psock_verdict_data_ready;
sk->sk_write_space = sk_psock_write_space;
parser->enabled = true;
}
void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock)
{
struct sk_psock_parser *parser = &psock->parser;
@ -918,3 +991,15 @@ void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock)
strp_stop(&parser->strp);
parser->enabled = false;
}
void sk_psock_stop_verdict(struct sock *sk, struct sk_psock *psock)
{
struct sk_psock_parser *parser = &psock->parser;
if (!parser->enabled)
return;
sk->sk_data_ready = parser->saved_data_ready;
parser->saved_data_ready = NULL;
parser->enabled = false;
}
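From userspace, verdict-only mode means no parser program needs to be attached
anymore; a sketch with placeholder fds (error handling elided):

	#include <bpf/bpf.h>

	int enable_verdict_only(int sockmap_fd, int verdict_prog_fd)
	{
		/* note: no BPF_SK_SKB_STREAM_PARSER attachment required */
		return bpf_prog_attach(verdict_prog_fd, sockmap_fd,
				       BPF_SK_SKB_STREAM_VERDICT, 0);
	}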

View File

@ -148,8 +148,8 @@ static void sock_map_add_link(struct sk_psock *psock,
static void sock_map_del_link(struct sock *sk,
struct sk_psock *psock, void *link_raw)
{
bool strp_stop = false, verdict_stop = false;
struct sk_psock_link *link, *tmp;
bool strp_stop = false;
spin_lock_bh(&psock->link_lock);
list_for_each_entry_safe(link, tmp, &psock->link, list) {
@ -159,14 +159,19 @@ static void sock_map_del_link(struct sock *sk,
map);
if (psock->parser.enabled && stab->progs.skb_parser)
strp_stop = true;
if (psock->parser.enabled && stab->progs.skb_verdict)
verdict_stop = true;
list_del(&link->list);
sk_psock_free_link(link);
}
}
spin_unlock_bh(&psock->link_lock);
if (strp_stop) {
if (strp_stop || verdict_stop) {
write_lock_bh(&sk->sk_callback_lock);
sk_psock_stop_strp(sk, psock);
if (strp_stop)
sk_psock_stop_strp(sk, psock);
else
sk_psock_stop_verdict(sk, psock);
write_unlock_bh(&sk->sk_callback_lock);
}
}
@ -230,16 +235,16 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs,
{
struct bpf_prog *msg_parser, *skb_parser, *skb_verdict;
struct sk_psock *psock;
bool skb_progs;
int ret;
skb_verdict = READ_ONCE(progs->skb_verdict);
skb_parser = READ_ONCE(progs->skb_parser);
skb_progs = skb_parser && skb_verdict;
if (skb_progs) {
if (skb_verdict) {
skb_verdict = bpf_prog_inc_not_zero(skb_verdict);
if (IS_ERR(skb_verdict))
return PTR_ERR(skb_verdict);
}
if (skb_parser) {
skb_parser = bpf_prog_inc_not_zero(skb_parser);
if (IS_ERR(skb_parser)) {
bpf_prog_put(skb_verdict);
@ -264,7 +269,8 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs,
if (psock) {
if ((msg_parser && READ_ONCE(psock->progs.msg_parser)) ||
(skb_progs && READ_ONCE(psock->progs.skb_parser))) {
(skb_parser && READ_ONCE(psock->progs.skb_parser)) ||
(skb_verdict && READ_ONCE(psock->progs.skb_verdict))) {
sk_psock_put(sk, psock);
ret = -EBUSY;
goto out_progs;
@ -285,28 +291,31 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs,
goto out_drop;
write_lock_bh(&sk->sk_callback_lock);
if (skb_progs && !psock->parser.enabled) {
if (skb_parser && skb_verdict && !psock->parser.enabled) {
ret = sk_psock_init_strp(sk, psock);
if (ret) {
write_unlock_bh(&sk->sk_callback_lock);
goto out_drop;
}
if (ret)
goto out_unlock_drop;
psock_set_prog(&psock->progs.skb_verdict, skb_verdict);
psock_set_prog(&psock->progs.skb_parser, skb_parser);
sk_psock_start_strp(sk, psock);
} else if (!skb_parser && skb_verdict && !psock->parser.enabled) {
psock_set_prog(&psock->progs.skb_verdict, skb_verdict);
sk_psock_start_verdict(sk, psock);
}
write_unlock_bh(&sk->sk_callback_lock);
return 0;
out_unlock_drop:
write_unlock_bh(&sk->sk_callback_lock);
out_drop:
sk_psock_put(sk, psock);
out_progs:
if (msg_parser)
bpf_prog_put(msg_parser);
out:
if (skb_progs) {
if (skb_verdict)
bpf_prog_put(skb_verdict);
if (skb_parser)
bpf_prog_put(skb_parser);
}
return ret;
}
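
On the user-space side, a hedged sketch (prog_fd/map_fd are assumed to be set up by the caller) of exercising the new verdict-only branch in sock_map_link():

#include <linux/bpf.h>
#include <bpf/bpf.h>

/* Attach only the stream verdict program; with the change above,
 * sock_map_link() starts the verdict data_ready path instead of
 * requiring a stream parser plus strparser setup.
 */
int attach_verdict_only(int prog_fd, int map_fd)
{
	return bpf_prog_attach(prog_fd, map_fd, BPF_SK_SKB_STREAM_VERDICT, 0);
}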

View File

@ -548,7 +548,6 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
newtp->fastopen_req = NULL;
RCU_INIT_POINTER(newtp->fastopen_rsk, NULL);
bpf_skops_init_child(sk, newsk);
tcp_bpf_clone(sk, newsk);
__TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS);

View File

@ -3,9 +3,6 @@
#include <net/xsk_buff_pool.h>
#include <net/xdp_sock.h>
#include <net/xdp_sock_drv.h>
#include <linux/dma-direct.h>
#include <linux/dma-noncoherent.h>
#include <linux/swiotlb.h>
#include "xsk_queue.h"
#include "xdp_umem.h"

View File

@ -15,6 +15,10 @@
struct xdp_ring {
u32 producer ____cacheline_aligned_in_smp;
/* Keep the adjacent cacheline prefetcher from pulling in the consumer
* pointer when the producer pointer is touched, and vice versa.
*/
u32 pad ____cacheline_aligned_in_smp;
u32 consumer ____cacheline_aligned_in_smp;
u32 flags;
};

View File

@ -132,7 +132,7 @@ static int xsk_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
return 0;
}
static u32 xsk_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
static int xsk_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
{
const int ret = BPF_REG_0, mp = BPF_REG_1, index = BPF_REG_2;
struct bpf_insn *insn = insn_buf;

View File

@ -98,8 +98,8 @@ test_map_in_map-objs := test_map_in_map_user.o
per_socket_stats_example-objs := cookie_uid_helper_example.o
xdp_redirect-objs := xdp_redirect_user.o
xdp_redirect_map-objs := xdp_redirect_map_user.o
xdp_redirect_cpu-objs := bpf_load.o xdp_redirect_cpu_user.o
xdp_monitor-objs := bpf_load.o xdp_monitor_user.o
xdp_redirect_cpu-objs := xdp_redirect_cpu_user.o
xdp_monitor-objs := xdp_monitor_user.o
xdp_rxq_info-objs := xdp_rxq_info_user.o
syscall_tp-objs := syscall_tp_user.o
cpustat-objs := cpustat_user.o
@ -211,6 +211,8 @@ TPROGLDLIBS_xsk_fwd += -pthread
# make M=samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang
LLC ?= llc
CLANG ?= clang
OPT ?= opt
LLVM_DIS ?= llvm-dis
LLVM_OBJCOPY ?= llvm-objcopy
BTF_PAHOLE ?= pahole
@ -303,6 +305,11 @@ $(obj)/hbm_edt_kern.o: $(src)/hbm.h $(src)/hbm_kern.h
# asm/sysreg.h - inline assembly used by it is incompatible with llvm.
# But, there is no easy way to fix it, so just exclude it since it is
# useless for BPF samples.
# Below we use a long chain of commands, clang | opt | llvm-dis | llc,
# to generate the final object file. 'clang' compiles the source into IR
# for the native target, e.g., x64, arm64, etc. 'opt' does BPF CO-RE IR
# builtin processing (llvm12) and IR optimizations. 'llvm-dis' converts
# the 'opt' output back to IR, and finally 'llc' generates BPF byte code.
$(obj)/%.o: $(src)/%.c
@echo " CLANG-bpf " $@
$(Q)$(CLANG) $(NOSTDINC_FLAGS) $(LINUXINCLUDE) $(BPF_EXTRA_CFLAGS) \
@ -314,7 +321,9 @@ $(obj)/%.o: $(src)/%.c
-Wno-address-of-packed-member -Wno-tautological-compare \
-Wno-unknown-warning-option $(CLANG_ARCH_ARGS) \
-I$(srctree)/samples/bpf/ -include asm_goto_workaround.h \
-O2 -emit-llvm -c $< -o -| $(LLC) -march=bpf $(LLC_FLAGS) -filetype=obj -o $@
-O2 -emit-llvm -Xclang -disable-llvm-passes -c $< -o - | \
$(OPT) -O2 -mtriple=bpf-pc-linux | $(LLVM_DIS) | \
$(LLC) -march=bpf $(LLC_FLAGS) -filetype=obj -o $@
ifeq ($(DWARF2BTF),y)
$(BTF_PAHOLE) -J $@
endif

View File

@ -40,6 +40,7 @@
#include <errno.h>
#include <fcntl.h>
#include <linux/unistd.h>
#include <linux/compiler.h>
#include <linux/bpf.h>
#include <bpf/bpf.h>
@ -483,7 +484,7 @@ int main(int argc, char **argv)
"Option -%c requires an argument.\n\n",
optopt);
case 'h':
fallthrough;
__fallthrough;
default:
Usage();
return 0;

View File

@ -6,21 +6,21 @@
#include <uapi/linux/bpf.h>
#include <bpf/bpf_helpers.h>
struct bpf_map_def SEC("maps") redirect_err_cnt = {
.type = BPF_MAP_TYPE_PERCPU_ARRAY,
.key_size = sizeof(u32),
.value_size = sizeof(u64),
.max_entries = 2,
struct {
__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
__type(key, u32);
__type(value, u64);
__uint(max_entries, 2);
/* TODO: have entries for all possible errno's */
};
} redirect_err_cnt SEC(".maps");
#define XDP_UNKNOWN XDP_REDIRECT + 1
struct bpf_map_def SEC("maps") exception_cnt = {
.type = BPF_MAP_TYPE_PERCPU_ARRAY,
.key_size = sizeof(u32),
.value_size = sizeof(u64),
.max_entries = XDP_UNKNOWN + 1,
};
struct {
__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
__type(key, u32);
__type(value, u64);
__uint(max_entries, XDP_UNKNOWN + 1);
} exception_cnt SEC(".maps");
/* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_redirect/format
* Code in: kernel/include/trace/events/xdp.h
@ -129,19 +129,19 @@ struct datarec {
};
#define MAX_CPUS 64
struct bpf_map_def SEC("maps") cpumap_enqueue_cnt = {
.type = BPF_MAP_TYPE_PERCPU_ARRAY,
.key_size = sizeof(u32),
.value_size = sizeof(struct datarec),
.max_entries = MAX_CPUS,
};
struct {
__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
__type(key, u32);
__type(value, struct datarec);
__uint(max_entries, MAX_CPUS);
} cpumap_enqueue_cnt SEC(".maps");
struct bpf_map_def SEC("maps") cpumap_kthread_cnt = {
.type = BPF_MAP_TYPE_PERCPU_ARRAY,
.key_size = sizeof(u32),
.value_size = sizeof(struct datarec),
.max_entries = 1,
};
struct {
__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
__type(key, u32);
__type(value, struct datarec);
__uint(max_entries, 1);
} cpumap_kthread_cnt SEC(".maps");
/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_cpumap_enqueue/format
* Code in: kernel/include/trace/events/xdp.h
@ -210,12 +210,12 @@ int trace_xdp_cpumap_kthread(struct cpumap_kthread_ctx *ctx)
return 0;
}
struct bpf_map_def SEC("maps") devmap_xmit_cnt = {
.type = BPF_MAP_TYPE_PERCPU_ARRAY,
.key_size = sizeof(u32),
.value_size = sizeof(struct datarec),
.max_entries = 1,
};
struct {
__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
__type(key, u32);
__type(value, struct datarec);
__uint(max_entries, 1);
} devmap_xmit_cnt SEC(".maps");
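
As a usage sketch (not part of this patch), reading one of these BPF_MAP_TYPE_PERCPU_ARRAY maps from user space: the kernel copies out one value per possible CPU, so the lookup buffer must be sized accordingly.

#include <linux/types.h>
#include <bpf/bpf.h>
#include <bpf/libbpf.h>

static __u64 sum_percpu_u64(int map_fd, __u32 key)
{
	int i, n = libbpf_num_possible_cpus();
	__u64 sum = 0;

	if (n <= 0)
		return 0;

	__u64 values[n];

	/* One value per possible CPU is copied into values[]. */
	if (bpf_map_lookup_elem(map_fd, &key, values))
		return 0;
	for (i = 0; i < n; i++)
		sum += values[i];
	return sum;
}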
/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_devmap_xmit/format
* Code in: kernel/include/trace/events/xdp.h

View File

@ -26,12 +26,37 @@ static const char *__doc_err_only__=
#include <net/if.h>
#include <time.h>
#include <signal.h>
#include <bpf/bpf.h>
#include "bpf_load.h"
#include <bpf/libbpf.h>
#include "bpf_util.h"
enum map_type {
REDIRECT_ERR_CNT,
EXCEPTION_CNT,
CPUMAP_ENQUEUE_CNT,
CPUMAP_KTHREAD_CNT,
DEVMAP_XMIT_CNT,
};
static const char *const map_type_strings[] = {
[REDIRECT_ERR_CNT] = "redirect_err_cnt",
[EXCEPTION_CNT] = "exception_cnt",
[CPUMAP_ENQUEUE_CNT] = "cpumap_enqueue_cnt",
[CPUMAP_KTHREAD_CNT] = "cpumap_kthread_cnt",
[DEVMAP_XMIT_CNT] = "devmap_xmit_cnt",
};
#define NUM_MAP 5
#define NUM_TP 8
static int tp_cnt;
static int map_cnt;
static int verbose = 1;
static bool debug = false;
struct bpf_map *map_data[NUM_MAP] = {};
struct bpf_link *tp_links[NUM_TP] = {};
struct bpf_object *obj;
static const struct option long_options[] = {
{"help", no_argument, NULL, 'h' },
@ -41,6 +66,16 @@ static const struct option long_options[] = {
{0, 0, NULL, 0 }
};
static void int_exit(int sig)
{
/* Detach tracepoints */
while (tp_cnt)
bpf_link__destroy(tp_links[--tp_cnt]);
bpf_object__close(obj);
exit(0);
}
/* C standard specifies two constants, EXIT_SUCCESS(0) and EXIT_FAILURE(1) */
#define EXIT_FAIL_MEM 5
@ -483,23 +518,23 @@ static bool stats_collect(struct stats_record *rec)
* this can happen by someone running perf-record -e
*/
fd = map_data[0].fd; /* map0: redirect_err_cnt */
fd = bpf_map__fd(map_data[REDIRECT_ERR_CNT]);
for (i = 0; i < REDIR_RES_MAX; i++)
map_collect_record_u64(fd, i, &rec->xdp_redirect[i]);
fd = map_data[1].fd; /* map1: exception_cnt */
fd = bpf_map__fd(map_data[EXCEPTION_CNT]);
for (i = 0; i < XDP_ACTION_MAX; i++) {
map_collect_record_u64(fd, i, &rec->xdp_exception[i]);
}
fd = map_data[2].fd; /* map2: cpumap_enqueue_cnt */
fd = bpf_map__fd(map_data[CPUMAP_ENQUEUE_CNT]);
for (i = 0; i < MAX_CPUS; i++)
map_collect_record(fd, i, &rec->xdp_cpumap_enqueue[i]);
fd = map_data[3].fd; /* map3: cpumap_kthread_cnt */
fd = bpf_map__fd(map_data[CPUMAP_KTHREAD_CNT]);
map_collect_record(fd, 0, &rec->xdp_cpumap_kthread);
fd = map_data[4].fd; /* map4: devmap_xmit_cnt */
fd = bpf_map__fd(map_data[DEVMAP_XMIT_CNT]);
map_collect_record(fd, 0, &rec->xdp_devmap_xmit);
return true;
@ -598,8 +633,8 @@ static void stats_poll(int interval, bool err_only)
/* TODO Need more advanced stats on error types */
if (verbose) {
printf(" - Stats map0: %s\n", map_data[0].name);
printf(" - Stats map1: %s\n", map_data[1].name);
printf(" - Stats map0: %s\n", bpf_map__name(map_data[0]));
printf(" - Stats map1: %s\n", bpf_map__name(map_data[1]));
printf("\n");
}
fflush(stdout);
@ -618,44 +653,51 @@ static void stats_poll(int interval, bool err_only)
static void print_bpf_prog_info(void)
{
int i;
struct bpf_program *prog;
struct bpf_map *map;
int i = 0;
/* Prog info */
printf("Loaded BPF prog have %d bpf program(s)\n", prog_cnt);
for (i = 0; i < prog_cnt; i++) {
printf(" - prog_fd[%d] = fd(%d)\n", i, prog_fd[i]);
printf("Loaded BPF prog have %d bpf program(s)\n", tp_cnt);
bpf_object__for_each_program(prog, obj) {
printf(" - prog_fd[%d] = fd(%d)\n", i, bpf_program__fd(prog));
i++;
}
i = 0;
/* Maps info */
printf("Loaded BPF prog have %d map(s)\n", map_data_count);
for (i = 0; i < map_data_count; i++) {
char *name = map_data[i].name;
int fd = map_data[i].fd;
printf("Loaded BPF prog have %d map(s)\n", map_cnt);
bpf_object__for_each_map(map, obj) {
const char *name = bpf_map__name(map);
int fd = bpf_map__fd(map);
printf(" - map_data[%d] = fd(%d) name:%s\n", i, fd, name);
i++;
}
/* Event info */
printf("Searching for (max:%d) event file descriptor(s)\n", prog_cnt);
for (i = 0; i < prog_cnt; i++) {
if (event_fd[i] != -1)
printf(" - event_fd[%d] = fd(%d)\n", i, event_fd[i]);
printf("Searching for (max:%d) event file descriptor(s)\n", tp_cnt);
for (i = 0; i < tp_cnt; i++) {
int fd = bpf_link__fd(tp_links[i]);
if (fd != -1)
printf(" - event_fd[%d] = fd(%d)\n", i, fd);
}
}
int main(int argc, char **argv)
{
struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
struct bpf_program *prog;
int longindex = 0, opt;
int ret = EXIT_SUCCESS;
char bpf_obj_file[256];
int ret = EXIT_FAILURE;
enum map_type type;
char filename[256];
/* Default settings: */
bool errors_only = true;
int interval = 2;
snprintf(bpf_obj_file, sizeof(bpf_obj_file), "%s_kern.o", argv[0]);
/* Parse commands line args */
while ((opt = getopt_long(argc, argv, "hDSs:",
long_options, &longindex)) != -1) {
@ -672,40 +714,79 @@ int main(int argc, char **argv)
case 'h':
default:
usage(argv);
return EXIT_FAILURE;
return ret;
}
}
snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
if (setrlimit(RLIMIT_MEMLOCK, &r)) {
perror("setrlimit(RLIMIT_MEMLOCK)");
return EXIT_FAILURE;
return ret;
}
if (load_bpf_file(bpf_obj_file)) {
printf("ERROR - bpf_log_buf: %s", bpf_log_buf);
return EXIT_FAILURE;
/* Remove tracepoint program when program is interrupted or killed */
signal(SIGINT, int_exit);
signal(SIGTERM, int_exit);
obj = bpf_object__open_file(filename, NULL);
if (libbpf_get_error(obj)) {
printf("ERROR: opening BPF object file failed\n");
obj = NULL;
goto cleanup;
}
if (!prog_fd[0]) {
printf("ERROR - load_bpf_file: %s\n", strerror(errno));
return EXIT_FAILURE;
/* load BPF program */
if (bpf_object__load(obj)) {
printf("ERROR: loading BPF object file failed\n");
goto cleanup;
}
for (type = 0; type < NUM_MAP; type++) {
map_data[type] =
bpf_object__find_map_by_name(obj, map_type_strings[type]);
if (libbpf_get_error(map_data[type])) {
printf("ERROR: finding a map in obj file failed\n");
goto cleanup;
}
map_cnt++;
}
bpf_object__for_each_program(prog, obj) {
tp_links[tp_cnt] = bpf_program__attach(prog);
if (libbpf_get_error(tp_links[tp_cnt])) {
printf("ERROR: bpf_program__attach failed\n");
tp_links[tp_cnt] = NULL;
goto cleanup;
}
tp_cnt++;
}
if (debug) {
print_bpf_prog_info();
}
/* Unload/stop tracepoint event by closing fd's */
/* Unload/stop tracepoint event by closing bpf_link's */
if (errors_only) {
/* The prog_fd[i] and event_fd[i] depend on the
* order the functions was defined in _kern.c
/* The tp_links[i] depend on the order in which
* the functions were defined in _kern.c
*/
close(event_fd[2]); /* tracepoint/xdp/xdp_redirect */
close(prog_fd[2]); /* func: trace_xdp_redirect */
close(event_fd[3]); /* tracepoint/xdp/xdp_redirect_map */
close(prog_fd[3]); /* func: trace_xdp_redirect_map */
bpf_link__destroy(tp_links[2]); /* tracepoint/xdp/xdp_redirect */
tp_links[2] = NULL;
bpf_link__destroy(tp_links[3]); /* tracepoint/xdp/xdp_redirect_map */
tp_links[3] = NULL;
}
stats_poll(interval, errors_only);
ret = EXIT_SUCCESS;
cleanup:
/* Detach tracepoints */
while (tp_cnt)
bpf_link__destroy(tp_links[--tp_cnt]);
bpf_object__close(obj);
return ret;
}

View File

@ -37,18 +37,35 @@ static __u32 prog_id;
static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST;
static int n_cpus;
static int cpu_map_fd;
static int rx_cnt_map_fd;
static int redirect_err_cnt_map_fd;
static int cpumap_enqueue_cnt_map_fd;
static int cpumap_kthread_cnt_map_fd;
static int cpus_available_map_fd;
static int cpus_count_map_fd;
static int cpus_iterator_map_fd;
static int exception_cnt_map_fd;
enum map_type {
CPU_MAP,
RX_CNT,
REDIRECT_ERR_CNT,
CPUMAP_ENQUEUE_CNT,
CPUMAP_KTHREAD_CNT,
CPUS_AVAILABLE,
CPUS_COUNT,
CPUS_ITERATOR,
EXCEPTION_CNT,
};
static const char *const map_type_strings[] = {
[CPU_MAP] = "cpu_map",
[RX_CNT] = "rx_cnt",
[REDIRECT_ERR_CNT] = "redirect_err_cnt",
[CPUMAP_ENQUEUE_CNT] = "cpumap_enqueue_cnt",
[CPUMAP_KTHREAD_CNT] = "cpumap_kthread_cnt",
[CPUS_AVAILABLE] = "cpus_available",
[CPUS_COUNT] = "cpus_count",
[CPUS_ITERATOR] = "cpus_iterator",
[EXCEPTION_CNT] = "exception_cnt",
};
#define NUM_TP 5
struct bpf_link *tp_links[NUM_TP] = { 0 };
#define NUM_MAP 9
struct bpf_link *tp_links[NUM_TP] = {};
static int map_fds[NUM_MAP];
static int tp_cnt = 0;
/* Exit return codes */
@ -527,20 +544,20 @@ static void stats_collect(struct stats_record *rec)
{
int fd, i;
fd = rx_cnt_map_fd;
fd = map_fds[RX_CNT];
map_collect_percpu(fd, 0, &rec->rx_cnt);
fd = redirect_err_cnt_map_fd;
fd = map_fds[REDIRECT_ERR_CNT];
map_collect_percpu(fd, 1, &rec->redir_err);
fd = cpumap_enqueue_cnt_map_fd;
fd = map_fds[CPUMAP_ENQUEUE_CNT];
for (i = 0; i < n_cpus; i++)
map_collect_percpu(fd, i, &rec->enq[i]);
fd = cpumap_kthread_cnt_map_fd;
fd = map_fds[CPUMAP_KTHREAD_CNT];
map_collect_percpu(fd, 0, &rec->kthread);
fd = exception_cnt_map_fd;
fd = map_fds[EXCEPTION_CNT];
map_collect_percpu(fd, 0, &rec->exception);
}
@ -565,7 +582,7 @@ static int create_cpu_entry(__u32 cpu, struct bpf_cpumap_val *value,
/* Add a CPU entry to cpumap, as this allocates a cpu entry in
* the kernel for the cpu.
*/
ret = bpf_map_update_elem(cpu_map_fd, &cpu, value, 0);
ret = bpf_map_update_elem(map_fds[CPU_MAP], &cpu, value, 0);
if (ret) {
fprintf(stderr, "Create CPU entry failed (err:%d)\n", ret);
exit(EXIT_FAIL_BPF);
@ -574,21 +591,21 @@ static int create_cpu_entry(__u32 cpu, struct bpf_cpumap_val *value,
/* Inform bpf_progs that a new CPU is available to select
* from via some control maps.
*/
ret = bpf_map_update_elem(cpus_available_map_fd, &avail_idx, &cpu, 0);
ret = bpf_map_update_elem(map_fds[CPUS_AVAILABLE], &avail_idx, &cpu, 0);
if (ret) {
fprintf(stderr, "Add to avail CPUs failed\n");
exit(EXIT_FAIL_BPF);
}
/* When not replacing/updating existing entry, bump the count */
ret = bpf_map_lookup_elem(cpus_count_map_fd, &key, &curr_cpus_count);
ret = bpf_map_lookup_elem(map_fds[CPUS_COUNT], &key, &curr_cpus_count);
if (ret) {
fprintf(stderr, "Failed reading curr cpus_count\n");
exit(EXIT_FAIL_BPF);
}
if (new) {
curr_cpus_count++;
ret = bpf_map_update_elem(cpus_count_map_fd, &key,
ret = bpf_map_update_elem(map_fds[CPUS_COUNT], &key,
&curr_cpus_count, 0);
if (ret) {
fprintf(stderr, "Failed write curr cpus_count\n");
@ -612,7 +629,7 @@ static void mark_cpus_unavailable(void)
int ret, i;
for (i = 0; i < n_cpus; i++) {
ret = bpf_map_update_elem(cpus_available_map_fd, &i,
ret = bpf_map_update_elem(map_fds[CPUS_AVAILABLE], &i,
&invalid_cpu, 0);
if (ret) {
fprintf(stderr, "Failed marking CPU unavailable\n");
@ -665,68 +682,37 @@ static void stats_poll(int interval, bool use_separators, char *prog_name,
free_stats_record(prev);
}
static struct bpf_link * attach_tp(struct bpf_object *obj,
const char *tp_category,
const char* tp_name)
static int init_tracepoints(struct bpf_object *obj)
{
struct bpf_program *prog;
struct bpf_link *link;
char sec_name[PATH_MAX];
int len;
len = snprintf(sec_name, PATH_MAX, "tracepoint/%s/%s",
tp_category, tp_name);
if (len < 0)
exit(EXIT_FAIL);
bpf_object__for_each_program(prog, obj) {
if (bpf_program__is_tracepoint(prog) != true)
continue;
prog = bpf_object__find_program_by_title(obj, sec_name);
if (!prog) {
fprintf(stderr, "ERR: finding progsec: %s\n", sec_name);
exit(EXIT_FAIL_BPF);
tp_links[tp_cnt] = bpf_program__attach(prog);
if (libbpf_get_error(tp_links[tp_cnt])) {
tp_links[tp_cnt] = NULL;
return -EINVAL;
}
tp_cnt++;
}
link = bpf_program__attach_tracepoint(prog, tp_category, tp_name);
if (libbpf_get_error(link))
exit(EXIT_FAIL_BPF);
return link;
}
static void init_tracepoints(struct bpf_object *obj) {
tp_links[tp_cnt++] = attach_tp(obj, "xdp", "xdp_redirect_err");
tp_links[tp_cnt++] = attach_tp(obj, "xdp", "xdp_redirect_map_err");
tp_links[tp_cnt++] = attach_tp(obj, "xdp", "xdp_exception");
tp_links[tp_cnt++] = attach_tp(obj, "xdp", "xdp_cpumap_enqueue");
tp_links[tp_cnt++] = attach_tp(obj, "xdp", "xdp_cpumap_kthread");
return 0;
}
static int init_map_fds(struct bpf_object *obj)
{
/* Maps updated by tracepoints */
redirect_err_cnt_map_fd =
bpf_object__find_map_fd_by_name(obj, "redirect_err_cnt");
exception_cnt_map_fd =
bpf_object__find_map_fd_by_name(obj, "exception_cnt");
cpumap_enqueue_cnt_map_fd =
bpf_object__find_map_fd_by_name(obj, "cpumap_enqueue_cnt");
cpumap_kthread_cnt_map_fd =
bpf_object__find_map_fd_by_name(obj, "cpumap_kthread_cnt");
enum map_type type;
/* Maps used by XDP */
rx_cnt_map_fd = bpf_object__find_map_fd_by_name(obj, "rx_cnt");
cpu_map_fd = bpf_object__find_map_fd_by_name(obj, "cpu_map");
cpus_available_map_fd =
bpf_object__find_map_fd_by_name(obj, "cpus_available");
cpus_count_map_fd = bpf_object__find_map_fd_by_name(obj, "cpus_count");
cpus_iterator_map_fd =
bpf_object__find_map_fd_by_name(obj, "cpus_iterator");
for (type = 0; type < NUM_MAP; type++) {
map_fds[type] =
bpf_object__find_map_fd_by_name(obj,
map_type_strings[type]);
if (cpu_map_fd < 0 || rx_cnt_map_fd < 0 ||
redirect_err_cnt_map_fd < 0 || cpumap_enqueue_cnt_map_fd < 0 ||
cpumap_kthread_cnt_map_fd < 0 || cpus_available_map_fd < 0 ||
cpus_count_map_fd < 0 || cpus_iterator_map_fd < 0 ||
exception_cnt_map_fd < 0)
return -ENOENT;
if (map_fds[type] < 0)
return -ENOENT;
}
return 0;
}
@ -795,13 +781,13 @@ int main(int argc, char **argv)
bool stress_mode = false;
struct bpf_program *prog;
struct bpf_object *obj;
int err = EXIT_FAIL;
char filename[256];
int added_cpus = 0;
int longindex = 0;
int interval = 2;
int add_cpu = -1;
int opt, err;
int prog_fd;
int opt, prog_fd;
int *cpu, i;
__u32 qsize;
@ -824,24 +810,29 @@ int main(int argc, char **argv)
}
if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd))
return EXIT_FAIL;
return err;
if (prog_fd < 0) {
fprintf(stderr, "ERR: bpf_prog_load_xattr: %s\n",
strerror(errno));
return EXIT_FAIL;
return err;
}
init_tracepoints(obj);
if (init_tracepoints(obj) < 0) {
fprintf(stderr, "ERR: bpf_program__attach failed\n");
return err;
}
if (init_map_fds(obj) < 0) {
fprintf(stderr, "bpf_object__find_map_fd_by_name failed\n");
return EXIT_FAIL;
return err;
}
mark_cpus_unavailable();
cpu = malloc(n_cpus * sizeof(int));
if (!cpu) {
fprintf(stderr, "failed to allocate cpu array\n");
return EXIT_FAIL;
return err;
}
memset(cpu, 0, n_cpus * sizeof(int));
@ -960,14 +951,12 @@ int main(int argc, char **argv)
prog = bpf_object__find_program_by_title(obj, prog_name);
if (!prog) {
fprintf(stderr, "bpf_object__find_program_by_title failed\n");
err = EXIT_FAIL;
goto out;
}
prog_fd = bpf_program__fd(prog);
if (prog_fd < 0) {
fprintf(stderr, "bpf_program__fd failed\n");
err = EXIT_FAIL;
goto out;
}
@ -986,6 +975,8 @@ int main(int argc, char **argv)
stats_poll(interval, use_separators, prog_name, mprog_name,
&value, stress_mode);
err = EXIT_OK;
out:
free(cpu);
return err;

View File

@ -5,14 +5,12 @@
#include <bpf/bpf_helpers.h>
#define SAMPLE_SIZE 64ul
#define MAX_CPUS 128
struct bpf_map_def SEC("maps") my_map = {
.type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
.key_size = sizeof(int),
.value_size = sizeof(u32),
.max_entries = MAX_CPUS,
};
struct {
__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
__uint(key_size, sizeof(int));
__uint(value_size, sizeof(u32));
} my_map SEC(".maps");
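
(With max_entries omitted here, libbpf sizes a PERF_EVENT_ARRAY with zero max_entries to the number of available CPUs at load time, which is why the MAX_CPUS bound could be dropped.)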
SEC("xdp_sample")
int xdp_sample_prog(struct xdp_md *ctx)

View File

@ -18,7 +18,6 @@
#include "perf-sys.h"
#define MAX_CPUS 128
static int if_idx;
static char *if_name;
static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST;

View File

@ -11,6 +11,7 @@
#include <linux/if_xdp.h>
#include <linux/if_ether.h>
#include <linux/ip.h>
#include <linux/limits.h>
#include <linux/udp.h>
#include <arpa/inet.h>
#include <locale.h>
@ -79,6 +80,10 @@ static u16 opt_pkt_size = MIN_PKT_SIZE;
static u32 opt_pkt_fill_pattern = 0x12345678;
static bool opt_extra_stats;
static bool opt_quiet;
static bool opt_app_stats;
static const char *opt_irq_str = "";
static u32 irq_no;
static int irqs_at_init = -1;
static int opt_poll;
static int opt_interval = 1;
static u32 opt_xdp_bind_flags = XDP_USE_NEED_WAKEUP;
@ -91,18 +96,7 @@ static bool opt_need_wakeup = true;
static u32 opt_num_xsks = 1;
static u32 prog_id;
struct xsk_umem_info {
struct xsk_ring_prod fq;
struct xsk_ring_cons cq;
struct xsk_umem *umem;
void *buffer;
};
struct xsk_socket_info {
struct xsk_ring_cons rx;
struct xsk_ring_prod tx;
struct xsk_umem_info *umem;
struct xsk_socket *xsk;
struct xsk_ring_stats {
unsigned long rx_npkts;
unsigned long tx_npkts;
unsigned long rx_dropped_npkts;
@ -119,6 +113,41 @@ struct xsk_socket_info {
unsigned long prev_rx_full_npkts;
unsigned long prev_rx_fill_empty_npkts;
unsigned long prev_tx_empty_npkts;
};
struct xsk_driver_stats {
unsigned long intrs;
unsigned long prev_intrs;
};
struct xsk_app_stats {
unsigned long rx_empty_polls;
unsigned long fill_fail_polls;
unsigned long copy_tx_sendtos;
unsigned long tx_wakeup_sendtos;
unsigned long opt_polls;
unsigned long prev_rx_empty_polls;
unsigned long prev_fill_fail_polls;
unsigned long prev_copy_tx_sendtos;
unsigned long prev_tx_wakeup_sendtos;
unsigned long prev_opt_polls;
};
struct xsk_umem_info {
struct xsk_ring_prod fq;
struct xsk_ring_cons cq;
struct xsk_umem *umem;
void *buffer;
};
struct xsk_socket_info {
struct xsk_ring_cons rx;
struct xsk_ring_prod tx;
struct xsk_umem_info *umem;
struct xsk_socket *xsk;
struct xsk_ring_stats ring_stats;
struct xsk_app_stats app_stats;
struct xsk_driver_stats drv_stats;
u32 outstanding_tx;
};
@ -173,18 +202,151 @@ static int xsk_get_xdp_stats(int fd, struct xsk_socket_info *xsk)
return err;
if (optlen == sizeof(struct xdp_statistics)) {
xsk->rx_dropped_npkts = stats.rx_dropped;
xsk->rx_invalid_npkts = stats.rx_invalid_descs;
xsk->tx_invalid_npkts = stats.tx_invalid_descs;
xsk->rx_full_npkts = stats.rx_ring_full;
xsk->rx_fill_empty_npkts = stats.rx_fill_ring_empty_descs;
xsk->tx_empty_npkts = stats.tx_ring_empty_descs;
xsk->ring_stats.rx_dropped_npkts = stats.rx_dropped;
xsk->ring_stats.rx_invalid_npkts = stats.rx_invalid_descs;
xsk->ring_stats.tx_invalid_npkts = stats.tx_invalid_descs;
xsk->ring_stats.rx_full_npkts = stats.rx_ring_full;
xsk->ring_stats.rx_fill_empty_npkts = stats.rx_fill_ring_empty_descs;
xsk->ring_stats.tx_empty_npkts = stats.tx_ring_empty_descs;
return 0;
}
return -EINVAL;
}
static void dump_app_stats(long dt)
{
int i;
for (i = 0; i < num_socks && xsks[i]; i++) {
char *fmt = "%-18s %'-14.0f %'-14lu\n";
double rx_empty_polls_ps, fill_fail_polls_ps, copy_tx_sendtos_ps,
tx_wakeup_sendtos_ps, opt_polls_ps;
rx_empty_polls_ps = (xsks[i]->app_stats.rx_empty_polls -
xsks[i]->app_stats.prev_rx_empty_polls) * 1000000000. / dt;
fill_fail_polls_ps = (xsks[i]->app_stats.fill_fail_polls -
xsks[i]->app_stats.prev_fill_fail_polls) * 1000000000. / dt;
copy_tx_sendtos_ps = (xsks[i]->app_stats.copy_tx_sendtos -
xsks[i]->app_stats.prev_copy_tx_sendtos) * 1000000000. / dt;
tx_wakeup_sendtos_ps = (xsks[i]->app_stats.tx_wakeup_sendtos -
xsks[i]->app_stats.prev_tx_wakeup_sendtos)
* 1000000000. / dt;
opt_polls_ps = (xsks[i]->app_stats.opt_polls -
xsks[i]->app_stats.prev_opt_polls) * 1000000000. / dt;
printf("\n%-18s %-14s %-14s\n", "", "calls/s", "count");
printf(fmt, "rx empty polls", rx_empty_polls_ps, xsks[i]->app_stats.rx_empty_polls);
printf(fmt, "fill fail polls", fill_fail_polls_ps,
xsks[i]->app_stats.fill_fail_polls);
printf(fmt, "copy tx sendtos", copy_tx_sendtos_ps,
xsks[i]->app_stats.copy_tx_sendtos);
printf(fmt, "tx wakeup sendtos", tx_wakeup_sendtos_ps,
xsks[i]->app_stats.tx_wakeup_sendtos);
printf(fmt, "opt polls", opt_polls_ps, xsks[i]->app_stats.opt_polls);
xsks[i]->app_stats.prev_rx_empty_polls = xsks[i]->app_stats.rx_empty_polls;
xsks[i]->app_stats.prev_fill_fail_polls = xsks[i]->app_stats.fill_fail_polls;
xsks[i]->app_stats.prev_copy_tx_sendtos = xsks[i]->app_stats.copy_tx_sendtos;
xsks[i]->app_stats.prev_tx_wakeup_sendtos = xsks[i]->app_stats.tx_wakeup_sendtos;
xsks[i]->app_stats.prev_opt_polls = xsks[i]->app_stats.opt_polls;
}
}
static bool get_interrupt_number(void)
{
FILE *f_int_proc;
char line[4096];
bool found = false;
f_int_proc = fopen("/proc/interrupts", "r");
if (f_int_proc == NULL) {
printf("Failed to open /proc/interrupts.\n");
return found;
}
while (!feof(f_int_proc) && !found) {
/* Make sure to read a full line at a time */
if (fgets(line, sizeof(line), f_int_proc) == NULL ||
line[strlen(line) - 1] != '\n') {
printf("Error reading from interrupts file\n");
break;
}
/* Extract interrupt number from line */
if (strstr(line, opt_irq_str) != NULL) {
irq_no = atoi(line);
found = true;
break;
}
}
fclose(f_int_proc);
return found;
}
static int get_irqs(void)
{
char count_path[PATH_MAX];
int total_intrs = -1;
FILE *f_count_proc;
char line[4096];
snprintf(count_path, sizeof(count_path),
"/sys/kernel/irq/%i/per_cpu_count", irq_no);
f_count_proc = fopen(count_path, "r");
if (f_count_proc == NULL) {
printf("Failed to open %s\n", count_path);
return total_intrs;
}
if (fgets(line, sizeof(line), f_count_proc) == NULL ||
line[strlen(line) - 1] != '\n') {
printf("Error reading from %s\n", count_path);
} else {
static const char com[2] = ",";
char *token;
total_intrs = 0;
token = strtok(line, com);
while (token != NULL) {
/* sum up interrupts across all cores */
total_intrs += atoi(token);
token = strtok(NULL, com);
}
}
fclose(f_count_proc);
return total_intrs;
}
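
For reference, /sys/kernel/irq/<irq>/per_cpu_count is a single line of comma-separated per-CPU counters, which the strtok() loop above sums; an illustrative (made-up) line:

0,103942,0,88123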
static void dump_driver_stats(long dt)
{
int i;
for (i = 0; i < num_socks && xsks[i]; i++) {
char *fmt = "%-18s %'-14.0f %'-14lu\n";
double intrs_ps;
int n_ints = get_irqs();
if (n_ints < 0) {
printf("error getting intr info for intr %i\n", irq_no);
return;
}
xsks[i]->drv_stats.intrs = n_ints - irqs_at_init;
intrs_ps = (xsks[i]->drv_stats.intrs - xsks[i]->drv_stats.prev_intrs) *
1000000000. / dt;
printf("\n%-18s %-14s %-14s\n", "", "intrs/s", "count");
printf(fmt, "irqs", intrs_ps, xsks[i]->drv_stats.intrs);
xsks[i]->drv_stats.prev_intrs = xsks[i]->drv_stats.intrs;
}
}
static void dump_stats(void)
{
unsigned long now = get_nsecs();
@ -194,67 +356,83 @@ static void dump_stats(void)
prev_time = now;
for (i = 0; i < num_socks && xsks[i]; i++) {
char *fmt = "%-15s %'-11.0f %'-11lu\n";
char *fmt = "%-18s %'-14.0f %'-14lu\n";
double rx_pps, tx_pps, dropped_pps, rx_invalid_pps, full_pps, fill_empty_pps,
tx_invalid_pps, tx_empty_pps;
rx_pps = (xsks[i]->rx_npkts - xsks[i]->prev_rx_npkts) *
rx_pps = (xsks[i]->ring_stats.rx_npkts - xsks[i]->ring_stats.prev_rx_npkts) *
1000000000. / dt;
tx_pps = (xsks[i]->tx_npkts - xsks[i]->prev_tx_npkts) *
tx_pps = (xsks[i]->ring_stats.tx_npkts - xsks[i]->ring_stats.prev_tx_npkts) *
1000000000. / dt;
printf("\n sock%d@", i);
print_benchmark(false);
printf("\n");
printf("%-15s %-11s %-11s %-11.2f\n", "", "pps", "pkts",
printf("%-18s %-14s %-14s %-14.2f\n", "", "pps", "pkts",
dt / 1000000000.);
printf(fmt, "rx", rx_pps, xsks[i]->rx_npkts);
printf(fmt, "tx", tx_pps, xsks[i]->tx_npkts);
printf(fmt, "rx", rx_pps, xsks[i]->ring_stats.rx_npkts);
printf(fmt, "tx", tx_pps, xsks[i]->ring_stats.tx_npkts);
xsks[i]->prev_rx_npkts = xsks[i]->rx_npkts;
xsks[i]->prev_tx_npkts = xsks[i]->tx_npkts;
xsks[i]->ring_stats.prev_rx_npkts = xsks[i]->ring_stats.rx_npkts;
xsks[i]->ring_stats.prev_tx_npkts = xsks[i]->ring_stats.tx_npkts;
if (opt_extra_stats) {
if (!xsk_get_xdp_stats(xsk_socket__fd(xsks[i]->xsk), xsks[i])) {
dropped_pps = (xsks[i]->rx_dropped_npkts -
xsks[i]->prev_rx_dropped_npkts) * 1000000000. / dt;
rx_invalid_pps = (xsks[i]->rx_invalid_npkts -
xsks[i]->prev_rx_invalid_npkts) * 1000000000. / dt;
tx_invalid_pps = (xsks[i]->tx_invalid_npkts -
xsks[i]->prev_tx_invalid_npkts) * 1000000000. / dt;
full_pps = (xsks[i]->rx_full_npkts -
xsks[i]->prev_rx_full_npkts) * 1000000000. / dt;
fill_empty_pps = (xsks[i]->rx_fill_empty_npkts -
xsks[i]->prev_rx_fill_empty_npkts)
* 1000000000. / dt;
tx_empty_pps = (xsks[i]->tx_empty_npkts -
xsks[i]->prev_tx_empty_npkts) * 1000000000. / dt;
dropped_pps = (xsks[i]->ring_stats.rx_dropped_npkts -
xsks[i]->ring_stats.prev_rx_dropped_npkts) *
1000000000. / dt;
rx_invalid_pps = (xsks[i]->ring_stats.rx_invalid_npkts -
xsks[i]->ring_stats.prev_rx_invalid_npkts) *
1000000000. / dt;
tx_invalid_pps = (xsks[i]->ring_stats.tx_invalid_npkts -
xsks[i]->ring_stats.prev_tx_invalid_npkts) *
1000000000. / dt;
full_pps = (xsks[i]->ring_stats.rx_full_npkts -
xsks[i]->ring_stats.prev_rx_full_npkts) *
1000000000. / dt;
fill_empty_pps = (xsks[i]->ring_stats.rx_fill_empty_npkts -
xsks[i]->ring_stats.prev_rx_fill_empty_npkts) *
1000000000. / dt;
tx_empty_pps = (xsks[i]->ring_stats.tx_empty_npkts -
xsks[i]->ring_stats.prev_tx_empty_npkts) *
1000000000. / dt;
printf(fmt, "rx dropped", dropped_pps,
xsks[i]->rx_dropped_npkts);
xsks[i]->ring_stats.rx_dropped_npkts);
printf(fmt, "rx invalid", rx_invalid_pps,
xsks[i]->rx_invalid_npkts);
xsks[i]->ring_stats.rx_invalid_npkts);
printf(fmt, "tx invalid", tx_invalid_pps,
xsks[i]->tx_invalid_npkts);
xsks[i]->ring_stats.tx_invalid_npkts);
printf(fmt, "rx queue full", full_pps,
xsks[i]->rx_full_npkts);
xsks[i]->ring_stats.rx_full_npkts);
printf(fmt, "fill ring empty", fill_empty_pps,
xsks[i]->rx_fill_empty_npkts);
xsks[i]->ring_stats.rx_fill_empty_npkts);
printf(fmt, "tx ring empty", tx_empty_pps,
xsks[i]->tx_empty_npkts);
xsks[i]->ring_stats.tx_empty_npkts);
xsks[i]->prev_rx_dropped_npkts = xsks[i]->rx_dropped_npkts;
xsks[i]->prev_rx_invalid_npkts = xsks[i]->rx_invalid_npkts;
xsks[i]->prev_tx_invalid_npkts = xsks[i]->tx_invalid_npkts;
xsks[i]->prev_rx_full_npkts = xsks[i]->rx_full_npkts;
xsks[i]->prev_rx_fill_empty_npkts = xsks[i]->rx_fill_empty_npkts;
xsks[i]->prev_tx_empty_npkts = xsks[i]->tx_empty_npkts;
xsks[i]->ring_stats.prev_rx_dropped_npkts =
xsks[i]->ring_stats.rx_dropped_npkts;
xsks[i]->ring_stats.prev_rx_invalid_npkts =
xsks[i]->ring_stats.rx_invalid_npkts;
xsks[i]->ring_stats.prev_tx_invalid_npkts =
xsks[i]->ring_stats.tx_invalid_npkts;
xsks[i]->ring_stats.prev_rx_full_npkts =
xsks[i]->ring_stats.rx_full_npkts;
xsks[i]->ring_stats.prev_rx_fill_empty_npkts =
xsks[i]->ring_stats.rx_fill_empty_npkts;
xsks[i]->ring_stats.prev_tx_empty_npkts =
xsks[i]->ring_stats.tx_empty_npkts;
} else {
printf("%-15s\n", "Error retrieving extra stats");
}
}
}
if (opt_app_stats)
dump_app_stats(dt);
if (irq_no)
dump_driver_stats(dt);
}
static bool is_benchmark_done(void)
@ -693,6 +871,17 @@ static struct xsk_socket_info *xsk_configure_socket(struct xsk_umem_info *umem,
if (ret)
exit_with_error(-ret);
xsk->app_stats.rx_empty_polls = 0;
xsk->app_stats.fill_fail_polls = 0;
xsk->app_stats.copy_tx_sendtos = 0;
xsk->app_stats.tx_wakeup_sendtos = 0;
xsk->app_stats.opt_polls = 0;
xsk->app_stats.prev_rx_empty_polls = 0;
xsk->app_stats.prev_fill_fail_polls = 0;
xsk->app_stats.prev_copy_tx_sendtos = 0;
xsk->app_stats.prev_tx_wakeup_sendtos = 0;
xsk->app_stats.prev_opt_polls = 0;
return xsk;
}
@ -720,6 +909,8 @@ static struct option long_options[] = {
{"tx-pkt-pattern", required_argument, 0, 'P'},
{"extra-stats", no_argument, 0, 'x'},
{"quiet", no_argument, 0, 'Q'},
{"app-stats", no_argument, 0, 'a'},
{"irq-string", no_argument, 0, 'I'},
{0, 0, 0, 0}
};
@ -756,6 +947,8 @@ static void usage(const char *prog)
" -P, --tx-pkt-pattern=nPacket fill pattern. Default: 0x%x\n"
" -x, --extra-stats Display extra statistics.\n"
" -Q, --quiet Do not display any stats.\n"
" -a, --app-stats Display application (syscall) statistics.\n"
" -I, --irq-string Display driver interrupt statistics for interface associated with irq-string.\n"
"\n";
fprintf(stderr, str, prog, XSK_UMEM__DEFAULT_FRAME_SIZE,
opt_batch_size, MIN_PKT_SIZE, MIN_PKT_SIZE,
@ -771,7 +964,7 @@ static void parse_command_line(int argc, char **argv)
opterr = 0;
for (;;) {
c = getopt_long(argc, argv, "Frtli:q:pSNn:czf:muMd:b:C:s:P:xQ",
c = getopt_long(argc, argv, "Frtli:q:pSNn:czf:muMd:b:C:s:P:xQaI:",
long_options, &option_index);
if (c == -1)
break;
@ -857,6 +1050,19 @@ static void parse_command_line(int argc, char **argv)
break;
case 'Q':
opt_quiet = 1;
break;
case 'a':
opt_app_stats = 1;
break;
case 'I':
opt_irq_str = optarg;
if (get_interrupt_number())
irqs_at_init = get_irqs();
if (irqs_at_init < 0) {
fprintf(stderr, "ERROR: Failed to get irqs for %s\n", opt_irq_str);
usage(basename(argv[0]));
}
break;
default:
usage(basename(argv[0]));
@ -908,8 +1114,10 @@ static inline void complete_tx_l2fwd(struct xsk_socket_info *xsk,
* is driven by the NAPI loop. So as an optimization, we do not have to call
* sendto() all the time in zero-copy mode for l2fwd.
*/
if (opt_xdp_bind_flags & XDP_COPY)
if (opt_xdp_bind_flags & XDP_COPY) {
xsk->app_stats.copy_tx_sendtos++;
kick_tx(xsk);
}
ndescs = (xsk->outstanding_tx > opt_batch_size) ? opt_batch_size :
xsk->outstanding_tx;
@ -924,8 +1132,10 @@ static inline void complete_tx_l2fwd(struct xsk_socket_info *xsk,
while (ret != rcvd) {
if (ret < 0)
exit_with_error(-ret);
if (xsk_ring_prod__needs_wakeup(&umem->fq))
if (xsk_ring_prod__needs_wakeup(&umem->fq)) {
xsk->app_stats.fill_fail_polls++;
ret = poll(fds, num_socks, opt_timeout);
}
ret = xsk_ring_prod__reserve(&umem->fq, rcvd, &idx_fq);
}
@ -936,7 +1146,7 @@ static inline void complete_tx_l2fwd(struct xsk_socket_info *xsk,
xsk_ring_prod__submit(&xsk->umem->fq, rcvd);
xsk_ring_cons__release(&xsk->umem->cq, rcvd);
xsk->outstanding_tx -= rcvd;
xsk->tx_npkts += rcvd;
xsk->ring_stats.tx_npkts += rcvd;
}
}
@ -949,14 +1159,16 @@ static inline void complete_tx_only(struct xsk_socket_info *xsk,
if (!xsk->outstanding_tx)
return;
if (!opt_need_wakeup || xsk_ring_prod__needs_wakeup(&xsk->tx))
if (!opt_need_wakeup || xsk_ring_prod__needs_wakeup(&xsk->tx)) {
xsk->app_stats.tx_wakeup_sendtos++;
kick_tx(xsk);
}
rcvd = xsk_ring_cons__peek(&xsk->umem->cq, batch_size, &idx);
if (rcvd > 0) {
xsk_ring_cons__release(&xsk->umem->cq, rcvd);
xsk->outstanding_tx -= rcvd;
xsk->tx_npkts += rcvd;
xsk->ring_stats.tx_npkts += rcvd;
}
}
@ -968,8 +1180,10 @@ static void rx_drop(struct xsk_socket_info *xsk, struct pollfd *fds)
rcvd = xsk_ring_cons__peek(&xsk->rx, opt_batch_size, &idx_rx);
if (!rcvd) {
if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq))
if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) {
xsk->app_stats.rx_empty_polls++;
ret = poll(fds, num_socks, opt_timeout);
}
return;
}
@ -977,8 +1191,10 @@ static void rx_drop(struct xsk_socket_info *xsk, struct pollfd *fds)
while (ret != rcvd) {
if (ret < 0)
exit_with_error(-ret);
if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq))
if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) {
xsk->app_stats.fill_fail_polls++;
ret = poll(fds, num_socks, opt_timeout);
}
ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq);
}
@ -996,7 +1212,7 @@ static void rx_drop(struct xsk_socket_info *xsk, struct pollfd *fds)
xsk_ring_prod__submit(&xsk->umem->fq, rcvd);
xsk_ring_cons__release(&xsk->rx, rcvd);
xsk->rx_npkts += rcvd;
xsk->ring_stats.rx_npkts += rcvd;
}
static void rx_drop_all(void)
@ -1011,6 +1227,8 @@ static void rx_drop_all(void)
for (;;) {
if (opt_poll) {
for (i = 0; i < num_socks; i++)
xsks[i]->app_stats.opt_polls++;
ret = poll(fds, num_socks, opt_timeout);
if (ret <= 0)
continue;
@ -1091,6 +1309,8 @@ static void tx_only_all(void)
int batch_size = get_batch_size(pkt_cnt);
if (opt_poll) {
for (i = 0; i < num_socks; i++)
xsks[i]->app_stats.opt_polls++;
ret = poll(fds, num_socks, opt_timeout);
if (ret <= 0)
continue;
@ -1122,8 +1342,10 @@ static void l2fwd(struct xsk_socket_info *xsk, struct pollfd *fds)
rcvd = xsk_ring_cons__peek(&xsk->rx, opt_batch_size, &idx_rx);
if (!rcvd) {
if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq))
if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq)) {
xsk->app_stats.rx_empty_polls++;
ret = poll(fds, num_socks, opt_timeout);
}
return;
}
@ -1132,8 +1354,10 @@ static void l2fwd(struct xsk_socket_info *xsk, struct pollfd *fds)
if (ret < 0)
exit_with_error(-ret);
complete_tx_l2fwd(xsk, fds);
if (xsk_ring_prod__needs_wakeup(&xsk->tx))
if (xsk_ring_prod__needs_wakeup(&xsk->tx)) {
xsk->app_stats.tx_wakeup_sendtos++;
kick_tx(xsk);
}
ret = xsk_ring_prod__reserve(&xsk->tx, rcvd, &idx_tx);
}
@ -1155,7 +1379,7 @@ static void l2fwd(struct xsk_socket_info *xsk, struct pollfd *fds)
xsk_ring_prod__submit(&xsk->tx, rcvd);
xsk_ring_cons__release(&xsk->rx, rcvd);
xsk->rx_npkts += rcvd;
xsk->ring_stats.rx_npkts += rcvd;
xsk->outstanding_tx += rcvd;
}
@ -1171,6 +1395,8 @@ static void l2fwd_all(void)
for (;;) {
if (opt_poll) {
for (i = 0; i < num_socks; i++)
xsks[i]->app_stats.opt_polls++;
ret = poll(fds, num_socks, opt_timeout);
if (ret <= 0)
continue;

View File

@ -356,18 +356,36 @@ enum bpf_link_type {
#define BPF_F_SLEEPABLE (1U << 4)
/* When BPF ldimm64's insn[0].src_reg != 0 then this can have
* two extensions:
* the following extensions:
*
* insn[0].src_reg: BPF_PSEUDO_MAP_FD BPF_PSEUDO_MAP_VALUE
* insn[0].imm: map fd map fd
* insn[1].imm: 0 offset into value
* insn[0].off: 0 0
* insn[1].off: 0 0
* ldimm64 rewrite: address of map address of map[0]+offset
* verifier type: CONST_PTR_TO_MAP PTR_TO_MAP_VALUE
* insn[0].src_reg: BPF_PSEUDO_MAP_FD
* insn[0].imm: map fd
* insn[1].imm: 0
* insn[0].off: 0
* insn[1].off: 0
* ldimm64 rewrite: address of map
* verifier type: CONST_PTR_TO_MAP
*/
#define BPF_PSEUDO_MAP_FD 1
/* insn[0].src_reg: BPF_PSEUDO_MAP_VALUE
* insn[0].imm: map fd
* insn[1].imm: offset into value
* insn[0].off: 0
* insn[1].off: 0
* ldimm64 rewrite: address of map[0]+offset
* verifier type: PTR_TO_MAP_VALUE
*/
#define BPF_PSEUDO_MAP_VALUE 2
/* insn[0].src_reg: BPF_PSEUDO_BTF_ID
* insn[0].imm: kernel btf id of VAR
* insn[1].imm: 0
* insn[0].off: 0
* insn[1].off: 0
* ldimm64 rewrite: address of the kernel variable
* verifier type: PTR_TO_BTF_ID or PTR_TO_MEM, depending on whether the var
* is struct/union.
*/
#define BPF_PSEUDO_BTF_ID 3
/* when bpf_call->src_reg == BPF_PSEUDO_CALL, bpf_call->imm == pc-relative
* offset to another bpf function
@ -417,6 +435,9 @@ enum {
/* Share perf_event among processes */
BPF_F_PRESERVE_ELEMS = (1U << 11),
/* Create a map that is suitable to be an inner map with dynamic max entries */
BPF_F_INNER_MAP = (1U << 12),
};
/* Flags for BPF_PROG_QUERY. */
@ -1680,7 +1701,7 @@ union bpf_attr {
* **TCP_CONGESTION**, **TCP_BPF_IW**,
* **TCP_BPF_SNDCWND_CLAMP**, **TCP_SAVE_SYN**,
* **TCP_KEEPIDLE**, **TCP_KEEPINTVL**, **TCP_KEEPCNT**,
* **TCP_SYNCNT**, **TCP_USER_TIMEOUT**.
* **TCP_SYNCNT**, **TCP_USER_TIMEOUT**, **TCP_NOTSENT_LOWAT**.
* * **IPPROTO_IP**, which supports *optname* **IP_TOS**.
* * **IPPROTO_IPV6**, which supports *optname* **IPV6_TCLASS**.
* Return
@ -2235,7 +2256,7 @@ union bpf_attr {
* Description
* This helper is used in programs implementing policies at the
* skb socket level. If the sk_buff *skb* is allowed to pass (i.e.
* if the verdeict eBPF program returns **SK_PASS**), redirect it
* if the verdict eBPF program returns **SK_PASS**), redirect it
* to the socket referenced by *map* (of type
* **BPF_MAP_TYPE_SOCKHASH**) using hash *key*. Both ingress and
* egress interfaces can be used for redirection. The
@ -3661,10 +3682,59 @@ union bpf_attr {
* Redirect the packet to another net device of index *ifindex*
* and fill in L2 addresses from neighboring subsystem. This helper
* is somewhat similar to **bpf_redirect**\ (), except that it
* fills in e.g. MAC addresses based on the L3 information from
* the packet. This helper is supported for IPv4 and IPv6 protocols.
* populates L2 addresses as well, meaning, internally, the helper
* performs a FIB lookup based on the skb's networking header to
* get the address of the next hop and then relies on the neighbor
* lookup for the L2 address of the nexthop.
*
* The *flags* argument is reserved and must be 0. The helper is
* currently only supported for tc BPF program types.
* currently only supported for tc BPF program types, and enabled
* for IPv4 and IPv6 protocols.
* Return
* The helper returns **TC_ACT_REDIRECT** on success or
* **TC_ACT_SHOT** on error.
*
* void *bpf_per_cpu_ptr(const void *percpu_ptr, u32 cpu)
* Description
* Take a pointer to a percpu ksym, *percpu_ptr*, and return a
* pointer to the percpu kernel variable on *cpu*. A ksym is an
* extern variable decorated with '__ksym'. For a ksym, there is a
* global variable (either static or global) of the same name defined
* in the kernel. The ksym is percpu if that global variable is percpu.
* The returned pointer points to the global percpu var on *cpu*.
*
* bpf_per_cpu_ptr() has the same semantic as per_cpu_ptr() in the
* kernel, except that bpf_per_cpu_ptr() may return NULL. This
* happens if *cpu* is larger than nr_cpu_ids. The caller of
* bpf_per_cpu_ptr() must check the returned value.
* Return
* A pointer pointing to the kernel percpu variable on *cpu*, or
* NULL, if *cpu* is invalid.
*
* void *bpf_this_cpu_ptr(const void *percpu_ptr)
* Description
* Take a pointer to a percpu ksym, *percpu_ptr*, and return a
* pointer to the percpu kernel variable on this cpu. See the
* description of 'ksym' in **bpf_per_cpu_ptr**\ ().
*
* bpf_this_cpu_ptr() has the same semantic as this_cpu_ptr() in
* the kernel. Unlike **bpf_per_cpu_ptr**\ (), it never
* returns NULL.
* Return
* A pointer pointing to the kernel percpu variable on this cpu.
*
* long bpf_redirect_peer(u32 ifindex, u64 flags)
* Description
* Redirect the packet to another net device of index *ifindex*.
* This helper is somewhat similar to **bpf_redirect**\ (), except
* that the redirection happens to the *ifindex*' peer device and
* the netns switch takes place from ingress to ingress without
* going through the CPU's backlog queue.
*
* The *flags* argument is reserved and must be 0. The helper is
* currently only supported for tc BPF program types at the ingress
* hook and for veth device types. The peer device must reside in a
* different network namespace.
* Return
* The helper returns **TC_ACT_REDIRECT** on success or
* **TC_ACT_SHOT** on error.
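*
Sketches of how the additions above look from BPF C, assuming a libbpf with the corresponding helper definitions; bpf_prog_active is a real percpu int in the kernel, while PEER_IFINDEX is a placeholder:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

extern const int bpf_prog_active __ksym; /* typed percpu ksym */

SEC("raw_tp/sched_switch")
int read_percpu(void *ctx)
{
	const int *active;

	active = bpf_per_cpu_ptr(&bpf_prog_active, 0); /* may return NULL */
	if (active)
		bpf_printk("cpu0 bpf_prog_active: %d", *active);

	active = bpf_this_cpu_ptr(&bpf_prog_active); /* never NULL */
	bpf_printk("this cpu: %d", *active);
	return 0;
}

#define PEER_IFINDEX 2 /* placeholder: the veth peer's ifindex */

SEC("classifier")
int tc_ingress_redir(struct __sk_buff *skb)
{
	return bpf_redirect_peer(PEER_IFINDEX, 0);
}

char _license[] SEC("license") = "GPL";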
@ -3823,6 +3893,9 @@ union bpf_attr {
FN(seq_printf_btf), \
FN(skb_cgroup_classid), \
FN(redirect_neigh), \
FN(bpf_per_cpu_ptr), \
FN(bpf_this_cpu_ptr), \
FN(redirect_peer), \
/* */
/* integer value in 'imm' field of BPF_CALL instruction selects which helper

View File

@ -390,6 +390,12 @@ struct extern_desc {
} kcfg;
struct {
unsigned long long addr;
/* target btf_id of the corresponding kernel var. */
int vmlinux_btf_id;
/* local btf_id of the ksym extern's type. */
__u32 type_id;
} ksym;
};
};
@ -2522,12 +2528,23 @@ static int bpf_object__load_vmlinux_btf(struct bpf_object *obj)
{
bool need_vmlinux_btf = false;
struct bpf_program *prog;
int err;
int i, err;
/* CO-RE relocations need kernel BTF */
if (obj->btf_ext && obj->btf_ext->core_relo_info.len)
need_vmlinux_btf = true;
/* Support for typed ksyms needs kernel BTF */
for (i = 0; i < obj->nr_extern; i++) {
const struct extern_desc *ext;
ext = &obj->externs[i];
if (ext->type == EXT_KSYM && ext->ksym.type_id) {
need_vmlinux_btf = true;
break;
}
}
bpf_object__for_each_program(prog, obj) {
if (!prog->load)
continue;
@ -3156,16 +3173,10 @@ static int bpf_object__collect_externs(struct bpf_object *obj)
return -ENOTSUP;
}
} else if (strcmp(sec_name, KSYMS_SEC) == 0) {
const struct btf_type *vt;
ksym_sec = sec;
ext->type = EXT_KSYM;
vt = skip_mods_and_typedefs(obj->btf, t->type, NULL);
if (!btf_is_void(vt)) {
pr_warn("extern (ksym) '%s' is not typeless (void)\n", ext_name);
return -ENOTSUP;
}
skip_mods_and_typedefs(obj->btf, t->type,
&ext->ksym.type_id);
} else {
pr_warn("unrecognized extern section '%s'\n", sec_name);
return -ENOTSUP;
@ -4192,6 +4203,36 @@ static int bpf_object__create_map(struct bpf_object *obj, struct bpf_map *map)
return 0;
}
static int init_map_slots(struct bpf_map *map)
{
const struct bpf_map *targ_map;
unsigned int i;
int fd, err;
for (i = 0; i < map->init_slots_sz; i++) {
if (!map->init_slots[i])
continue;
targ_map = map->init_slots[i];
fd = bpf_map__fd(targ_map);
err = bpf_map_update_elem(map->fd, &i, &fd, 0);
if (err) {
err = -errno;
pr_warn("map '%s': failed to initialize slot [%d] to map '%s' fd=%d: %d\n",
map->name, i, targ_map->name,
fd, err);
return err;
}
pr_debug("map '%s': slot [%d] set to map '%s' fd=%d\n",
map->name, i, targ_map->name, fd);
}
zfree(&map->init_slots);
map->init_slots_sz = 0;
return 0;
}
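
For reference, a declarative map-in-map sketch of the sort that produces init_slots[] (names illustrative; assumes bpf_helpers.h for the __uint/__type/__array macros) — the inner maps listed in .values are what init_map_slots() wires up after the outer map is created:

struct inner_map {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, 1);
	__type(key, __u32);
	__type(value, __u32);
} inner_map1 SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
	__uint(max_entries, 1);
	__type(key, __u32);
	__array(values, struct inner_map);
} outer_map SEC(".maps") = {
	.values = { &inner_map1 },
};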
static int
bpf_object__create_maps(struct bpf_object *obj)
{
@ -4215,47 +4256,29 @@ bpf_object__create_maps(struct bpf_object *obj)
if (map->fd >= 0) {
pr_debug("map '%s': skipping creation (preset fd=%d)\n",
map->name, map->fd);
continue;
}
err = bpf_object__create_map(obj, map);
if (err)
goto err_out;
pr_debug("map '%s': created successfully, fd=%d\n", map->name,
map->fd);
if (bpf_map__is_internal(map)) {
err = bpf_object__populate_internal_map(obj, map);
if (err < 0) {
zclose(map->fd);
} else {
err = bpf_object__create_map(obj, map);
if (err)
goto err_out;
}
}
if (map->init_slots_sz) {
for (j = 0; j < map->init_slots_sz; j++) {
const struct bpf_map *targ_map;
int fd;
pr_debug("map '%s': created successfully, fd=%d\n",
map->name, map->fd);
if (!map->init_slots[j])
continue;
targ_map = map->init_slots[j];
fd = bpf_map__fd(targ_map);
err = bpf_map_update_elem(map->fd, &j, &fd, 0);
if (err) {
err = -errno;
pr_warn("map '%s': failed to initialize slot [%d] to map '%s' fd=%d: %d\n",
map->name, j, targ_map->name,
fd, err);
if (bpf_map__is_internal(map)) {
err = bpf_object__populate_internal_map(obj, map);
if (err < 0) {
zclose(map->fd);
goto err_out;
}
}
if (map->init_slots_sz) {
err = init_map_slots(map);
if (err < 0) {
zclose(map->fd);
goto err_out;
}
pr_debug("map '%s': slot [%d] set to map '%s' fd=%d\n",
map->name, j, targ_map->name, fd);
}
zfree(&map->init_slots);
map->init_slots_sz = 0;
}
if (map->pin_path && !map->pinned) {
@ -5017,16 +5040,19 @@ static int bpf_core_spec_match(struct bpf_core_spec *local_spec,
static int bpf_core_calc_field_relo(const struct bpf_program *prog,
const struct bpf_core_relo *relo,
const struct bpf_core_spec *spec,
__u32 *val, bool *validate)
__u32 *val, __u32 *field_sz, __u32 *type_id,
bool *validate)
{
const struct bpf_core_accessor *acc;
const struct btf_type *t;
__u32 byte_off, byte_sz, bit_off, bit_sz;
__u32 byte_off, byte_sz, bit_off, bit_sz, field_type_id;
const struct btf_member *m;
const struct btf_type *mt;
bool bitfield;
__s64 sz;
*field_sz = 0;
if (relo->kind == BPF_FIELD_EXISTS) {
*val = spec ? 1 : 0;
return 0;
@ -5042,6 +5068,12 @@ static int bpf_core_calc_field_relo(const struct bpf_program *prog,
if (!acc->name) {
if (relo->kind == BPF_FIELD_BYTE_OFFSET) {
*val = spec->bit_offset / 8;
/* remember field size for load/store mem size */
sz = btf__resolve_size(spec->btf, acc->type_id);
if (sz < 0)
return -EINVAL;
*field_sz = sz;
*type_id = acc->type_id;
} else if (relo->kind == BPF_FIELD_BYTE_SIZE) {
sz = btf__resolve_size(spec->btf, acc->type_id);
if (sz < 0)
@ -5058,7 +5090,7 @@ static int bpf_core_calc_field_relo(const struct bpf_program *prog,
}
m = btf_members(t) + acc->idx;
mt = skip_mods_and_typedefs(spec->btf, m->type, NULL);
mt = skip_mods_and_typedefs(spec->btf, m->type, &field_type_id);
bit_off = spec->bit_offset;
bit_sz = btf_member_bitfield_size(t, acc->idx);
@ -5078,7 +5110,7 @@ static int bpf_core_calc_field_relo(const struct bpf_program *prog,
byte_off = bit_off / 8 / byte_sz * byte_sz;
}
} else {
sz = btf__resolve_size(spec->btf, m->type);
sz = btf__resolve_size(spec->btf, field_type_id);
if (sz < 0)
return -EINVAL;
byte_sz = sz;
@ -5096,6 +5128,10 @@ static int bpf_core_calc_field_relo(const struct bpf_program *prog,
switch (relo->kind) {
case BPF_FIELD_BYTE_OFFSET:
*val = byte_off;
if (!bitfield) {
*field_sz = byte_sz;
*type_id = field_type_id;
}
break;
case BPF_FIELD_BYTE_SIZE:
*val = byte_sz;
@ -5196,6 +5232,19 @@ struct bpf_core_relo_res
bool poison;
/* some relocations can't be validated against orig_val */
bool validate;
/* for field byte offset relocations or the forms:
* *(T *)(rX + <off>) = rY
* rX = *(T *)(rY + <off>),
* we remember original and resolved field size to adjust direct
* memory loads of pointers and integers; this is necessary for 32-bit
* host kernel architectures, but also makes it possible to automatically
* relocate fields that were resized from, e.g., u32 to u64, etc.
*/
bool fail_memsz_adjust;
__u32 orig_sz;
__u32 orig_type_id;
__u32 new_sz;
__u32 new_type_id;
};
/* Calculate original and target relocation values, given local and target
@ -5217,10 +5266,56 @@ static int bpf_core_calc_relo(const struct bpf_program *prog,
res->new_val = 0;
res->poison = false;
res->validate = true;
res->fail_memsz_adjust = false;
res->orig_sz = res->new_sz = 0;
res->orig_type_id = res->new_type_id = 0;
if (core_relo_is_field_based(relo->kind)) {
err = bpf_core_calc_field_relo(prog, relo, local_spec, &res->orig_val, &res->validate);
err = err ?: bpf_core_calc_field_relo(prog, relo, targ_spec, &res->new_val, NULL);
err = bpf_core_calc_field_relo(prog, relo, local_spec,
&res->orig_val, &res->orig_sz,
&res->orig_type_id, &res->validate);
err = err ?: bpf_core_calc_field_relo(prog, relo, targ_spec,
&res->new_val, &res->new_sz,
&res->new_type_id, NULL);
if (err)
goto done;
/* Validate if it's safe to adjust load/store memory size.
* Adjustments are performed only if original and new memory
* sizes differ.
*/
res->fail_memsz_adjust = false;
if (res->orig_sz != res->new_sz) {
const struct btf_type *orig_t, *new_t;
orig_t = btf__type_by_id(local_spec->btf, res->orig_type_id);
new_t = btf__type_by_id(targ_spec->btf, res->new_type_id);
/* There are two use cases in which it's safe to
* adjust load/store's mem size:
* - reading a 32-bit kernel pointer, while on the BPF
* side pointers are always 64-bit; in this case
* it's safe to "downsize" the instruction size, as
* the pointer is treated as an unsigned integer
* with zero-extended upper 32 bits;
* - reading unsigned integers, again because
* zero-extension preserves the value correctly.
*
* In all other cases it's incorrect to attempt a
* load/store of the field because the read value
* would be wrong, so we poison the relocated instruction.
*/
if (btf_is_ptr(orig_t) && btf_is_ptr(new_t))
goto done;
if (btf_is_int(orig_t) && btf_is_int(new_t) &&
btf_int_encoding(orig_t) != BTF_INT_SIGNED &&
btf_int_encoding(new_t) != BTF_INT_SIGNED)
goto done;
/* mark as invalid mem size adjustment, but this will
* only be checked for LDX/STX/ST insns
*/
res->fail_memsz_adjust = true;
}
} else if (core_relo_is_type_based(relo->kind)) {
err = bpf_core_calc_type_relo(relo, local_spec, &res->orig_val);
err = err ?: bpf_core_calc_type_relo(relo, targ_spec, &res->new_val);
@ -5229,6 +5324,7 @@ static int bpf_core_calc_relo(const struct bpf_program *prog,
err = err ?: bpf_core_calc_enumval_relo(relo, targ_spec, &res->new_val);
}
done:
if (err == -EUCLEAN) {
/* EUCLEAN is used to signal instruction poisoning request */
res->poison = true;
@ -5268,6 +5364,28 @@ static bool is_ldimm64(struct bpf_insn *insn)
return insn->code == (BPF_LD | BPF_IMM | BPF_DW);
}
static int insn_bpf_size_to_bytes(struct bpf_insn *insn)
{
switch (BPF_SIZE(insn->code)) {
case BPF_DW: return 8;
case BPF_W: return 4;
case BPF_H: return 2;
case BPF_B: return 1;
default: return -1;
}
}
static int insn_bytes_to_bpf_size(__u32 sz)
{
switch (sz) {
case 8: return BPF_DW;
case 4: return BPF_W;
case 2: return BPF_H;
case 1: return BPF_B;
default: return -1;
}
}
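
To make the rewrite below concrete, a hedged illustration (patch_mem_size() is a hypothetical wrapper, not in the patch) of how a load is re-sized when, say, a field grew from u32 to u64 in the target kernel:

/* Turns e.g. BPF_LDX | BPF_MEM | BPF_W into BPF_LDX | BPF_MEM | BPF_DW
 * by recombining the original mode and class bits with the new size.
 * Callers must validate insn_bytes_to_bpf_size()'s result first, as the
 * real code does.
 */
static void patch_mem_size(struct bpf_insn *insn, __u32 new_sz)
{
	insn->code = BPF_MODE(insn->code) | insn_bytes_to_bpf_size(new_sz) |
		     BPF_CLASS(insn->code);
}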
/*
* Patch relocatable BPF instruction.
*
@ -5277,10 +5395,13 @@ static bool is_ldimm64(struct bpf_insn *insn)
* spec, and is checked before patching instruction. If actual insn->imm value
* is wrong, bail out with error.
*
* Currently three kinds of BPF instructions are supported:
* Currently supported classes of BPF instruction are:
* 1. rX = <imm> (assignment with immediate operand);
* 2. rX += <imm> (arithmetic operations with immediate operand);
* 3. rX = <imm64> (load with 64-bit immediate value).
* 3. rX = <imm64> (load with 64-bit immediate value);
* 4. rX = *(T *)(rY + <off>), where T is one of {u8, u16, u32, u64};
* 5. *(T *)(rX + <off>) = rY, where T is one of {u8, u16, u32, u64};
* 6. *(T *)(rX + <off>) = <imm>, where T is one of {u8, u16, u32, u64}.
*/
static int bpf_core_patch_insn(struct bpf_program *prog,
const struct bpf_core_relo *relo,
@ -5304,6 +5425,7 @@ static int bpf_core_patch_insn(struct bpf_program *prog,
class = BPF_CLASS(insn->code);
if (res->poison) {
poison:
/* poison second part of ldimm64 to avoid confusing error from
* verifier about "unknown opcode 00"
*/
@ -5346,10 +5468,39 @@ static int bpf_core_patch_insn(struct bpf_program *prog,
prog->name, relo_idx, insn_idx, new_val);
return -ERANGE;
}
if (res->fail_memsz_adjust) {
pr_warn("prog '%s': relo #%d: insn #%d (LDX/ST/STX) accesses field incorrectly. "
"Make sure you are accessing pointers, unsigned integers, or fields of matching type and size.\n",
prog->name, relo_idx, insn_idx);
goto poison;
}
orig_val = insn->off;
insn->off = new_val;
pr_debug("prog '%s': relo #%d: patched insn #%d (LDX/ST/STX) off %u -> %u\n",
prog->name, relo_idx, insn_idx, orig_val, new_val);
if (res->new_sz != res->orig_sz) {
int insn_bytes_sz, insn_bpf_sz;
insn_bytes_sz = insn_bpf_size_to_bytes(insn);
if (insn_bytes_sz != res->orig_sz) {
pr_warn("prog '%s': relo #%d: insn #%d (LDX/ST/STX) unexpected mem size: got %d, exp %u\n",
prog->name, relo_idx, insn_idx, insn_bytes_sz, res->orig_sz);
return -EINVAL;
}
insn_bpf_sz = insn_bytes_to_bpf_size(res->new_sz);
if (insn_bpf_sz < 0) {
pr_warn("prog '%s': relo #%d: insn #%d (LDX/ST/STX) invalid new mem size: %u\n",
prog->name, relo_idx, insn_idx, res->new_sz);
return -EINVAL;
}
insn->code = BPF_MODE(insn->code) | insn_bpf_sz | BPF_CLASS(insn->code);
pr_debug("prog '%s': relo #%d: patched insn #%d (LDX/ST/STX) mem_sz %u -> %u\n",
prog->name, relo_idx, insn_idx, res->orig_sz, res->new_sz);
}
break;
case BPF_LD: {
__u64 imm;
@ -5691,7 +5842,7 @@ bpf_object__relocate_core(struct bpf_object *obj, const char *targ_btf_path)
return 0;
if (targ_btf_path)
targ_btf = btf__parse_elf(targ_btf_path, NULL);
targ_btf = btf__parse(targ_btf_path, NULL);
else
targ_btf = obj->btf_vmlinux;
if (IS_ERR_OR_NULL(targ_btf)) {
@ -5742,6 +5893,11 @@ bpf_object__relocate_core(struct bpf_object *obj, const char *targ_btf_path)
err = -EINVAL;
goto out;
}
/* no need to apply CO-RE relocation if the program is
* not going to be loaded
*/
if (!prog->load)
continue;
err = bpf_core_apply_relo(prog, rec, i, obj->btf,
targ_btf, cand_cache);
@ -5800,8 +5956,13 @@ bpf_object__relocate_data(struct bpf_object *obj, struct bpf_program *prog)
insn[0].imm = obj->maps[obj->kconfig_map_idx].fd;
insn[1].imm = ext->kcfg.data_off;
} else /* EXT_KSYM */ {
insn[0].imm = (__u32)ext->ksym.addr;
insn[1].imm = ext->ksym.addr >> 32;
if (ext->ksym.type_id) { /* typed ksyms */
insn[0].src_reg = BPF_PSEUDO_BTF_ID;
insn[0].imm = ext->ksym.vmlinux_btf_id;
} else { /* typeless ksyms */
insn[0].imm = (__u32)ext->ksym.addr;
insn[1].imm = ext->ksym.addr >> 32;
}
}
relo->processed = true;
break;
@ -6933,10 +7094,72 @@ static int bpf_object__read_kallsyms_file(struct bpf_object *obj)
return err;
}
static int bpf_object__resolve_ksyms_btf_id(struct bpf_object *obj)
{
struct extern_desc *ext;
int i, id;
for (i = 0; i < obj->nr_extern; i++) {
const struct btf_type *targ_var, *targ_type;
__u32 targ_type_id, local_type_id;
const char *targ_var_name;
int ret;
ext = &obj->externs[i];
if (ext->type != EXT_KSYM || !ext->ksym.type_id)
continue;
id = btf__find_by_name_kind(obj->btf_vmlinux, ext->name,
BTF_KIND_VAR);
if (id <= 0) {
pr_warn("extern (ksym) '%s': failed to find BTF ID in vmlinux BTF.\n",
ext->name);
return -ESRCH;
}
/* find local type_id */
local_type_id = ext->ksym.type_id;
/* find target type_id */
targ_var = btf__type_by_id(obj->btf_vmlinux, id);
targ_var_name = btf__name_by_offset(obj->btf_vmlinux,
targ_var->name_off);
targ_type = skip_mods_and_typedefs(obj->btf_vmlinux,
targ_var->type,
&targ_type_id);
ret = bpf_core_types_are_compat(obj->btf, local_type_id,
obj->btf_vmlinux, targ_type_id);
if (ret <= 0) {
const struct btf_type *local_type;
const char *targ_name, *local_name;
local_type = btf__type_by_id(obj->btf, local_type_id);
local_name = btf__name_by_offset(obj->btf,
local_type->name_off);
targ_name = btf__name_by_offset(obj->btf_vmlinux,
targ_type->name_off);
pr_warn("extern (ksym) '%s': incompatible types, expected [%d] %s %s, but kernel has [%d] %s %s\n",
ext->name, local_type_id,
btf_kind_str(local_type), local_name, targ_type_id,
btf_kind_str(targ_type), targ_name);
return -EINVAL;
}
ext->is_set = true;
ext->ksym.vmlinux_btf_id = id;
pr_debug("extern (ksym) '%s': resolved to [%d] %s %s\n",
ext->name, id, btf_kind_str(targ_var), targ_var_name);
}
return 0;
}
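/* From the BPF program's side, the two extern flavors resolved above look
 * like this (sketch, names taken from the selftests):
 *
 *     extern const struct rq runqueues __ksym;   // typed, resolved via BTF
 *     extern const void bpf_link_fops __ksym;    // typeless, via kallsyms
 */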
static int bpf_object__resolve_externs(struct bpf_object *obj,
const char *extra_kconfig)
{
bool need_config = false, need_kallsyms = false;
bool need_vmlinux_btf = false;
struct extern_desc *ext;
void *kcfg_data = NULL;
int err, i;
@ -6967,7 +7190,10 @@ static int bpf_object__resolve_externs(struct bpf_object *obj,
strncmp(ext->name, "CONFIG_", 7) == 0) {
need_config = true;
} else if (ext->type == EXT_KSYM) {
need_kallsyms = true;
if (ext->ksym.type_id)
need_vmlinux_btf = true;
else
need_kallsyms = true;
} else {
pr_warn("unrecognized extern '%s'\n", ext->name);
return -EINVAL;
@ -6996,6 +7222,11 @@ static int bpf_object__resolve_externs(struct bpf_object *obj,
if (err)
return -EINVAL;
}
if (need_vmlinux_btf) {
err = bpf_object__resolve_ksyms_btf_id(obj);
if (err)
return -EINVAL;
}
for (i = 0; i < obj->nr_extern; i++) {
ext = &obj->externs[i];
@ -7028,10 +7259,10 @@ int bpf_object__load_xattr(struct bpf_object_load_attr *attr)
}
err = bpf_object__probe_loading(obj);
err = err ? : bpf_object__load_vmlinux_btf(obj);
err = err ? : bpf_object__resolve_externs(obj, obj->kconfig);
err = err ? : bpf_object__sanitize_and_load_btf(obj);
err = err ? : bpf_object__sanitize_maps(obj);
err = err ? : bpf_object__load_vmlinux_btf(obj);
err = err ? : bpf_object__init_kern_struct_ops_maps(obj);
err = err ? : bpf_object__create_maps(obj);
err = err ? : bpf_object__relocate(obj, attr->target_btf_path);
@ -10353,9 +10584,8 @@ int bpf_program__set_attach_target(struct bpf_program *prog,
btf_id = libbpf_find_prog_btf_id(attach_func_name,
attach_prog_fd);
else
btf_id = __find_vmlinux_btf_id(prog->obj->btf_vmlinux,
attach_func_name,
prog->expected_attach_type);
btf_id = libbpf_find_vmlinux_btf_id(attach_func_name,
prog->expected_attach_type);
if (btf_id < 0)
return btf_id;

View File

@ -705,7 +705,7 @@ int xsk_socket__create_shared(struct xsk_socket **xsk_ptr,
struct xsk_ctx *ctx;
int err, ifindex;
if (!umem || !xsk_ptr || !(rx || tx) || !fill || !comp)
if (!umem || !xsk_ptr || !(rx || tx))
return -EFAULT;
xsk = calloc(1, sizeof(*xsk));
@ -735,6 +735,11 @@ int xsk_socket__create_shared(struct xsk_socket **xsk_ptr,
ctx = xsk_get_ctx(umem, ifindex, queue_id);
if (!ctx) {
if (!fill || !comp) {
err = -EFAULT;
goto out_socket;
}
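/* fill/comp rings are now only mandatory when this socket is the first user
 * of the (ifindex, queue_id) pair and a fresh ctx must be created; sockets
 * sharing an existing umem ctx may pass NULL for both.
 */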
ctx = xsk_create_ctx(xsk, umem, ifindex, ifname, queue_id,
fill, comp);
if (!ctx) {

View File

@ -7,6 +7,44 @@ General instructions on running selftests can be found in
Additional information about selftest failures is
documented here.
profiler[23] test failures with clang/llvm <12.0.0
==================================================
With clang/llvm <12.0.0, the profiler[23] test may fail.
The symptom looks like:
.. code-block:: c
// r9 is a pointer to map_value
// r7 is a scalar
17: bf 96 00 00 00 00 00 00 r6 = r9
18: 0f 76 00 00 00 00 00 00 r6 += r7
math between map_value pointer and register with unbounded min value is not allowed
// the instructions below will not be seen in the verifier log
19: a5 07 01 00 01 01 00 00 if r7 < 257 goto +1
20: bf 96 00 00 00 00 00 00 r6 = r9
// r6 is used here
The verifier will reject such code with the above error.
At insn 18, r7 is indeed unbounded. The later insn 19 checks the bounds and
insn 20 undoes the map_value addition. It is currently impossible for the
verifier to understand such speculative pointer arithmetic.
Hence
https://reviews.llvm.org/D85570
addresses it on the compiler side. The fix was committed in llvm 12.
The corresponding C code is:
.. code-block:: c
for (int i = 0; i < MAX_CGROUPS_PATH_DEPTH; i++) {
filepart_length = bpf_probe_read_str(payload, ...);
if (filepart_length <= MAX_PATH) {
barrier_var(filepart_length); // workaround
payload += filepart_length;
}
}
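For reference, ``barrier_var()`` is roughly an empty inline-asm statement that
forces the compiler to treat the variable as redefined, blocking the
problematic reordering (sketch of the bpf_helpers.h macro):

.. code-block:: c

  #define barrier_var(var) asm volatile("" : "=r"(var) : "0"(var))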
bpf_iter test failures with clang/llvm 10.0.0
=============================================

View File

@ -195,13 +195,13 @@ static struct bpf_align_test tests[] = {
.prog_type = BPF_PROG_TYPE_SCHED_CLS,
.matches = {
{7, "R3_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"},
{8, "R4_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"},
{8, "R4_w=inv(id=1,umax_value=255,var_off=(0x0; 0xff))"},
{9, "R4_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"},
{10, "R4_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"},
{10, "R4_w=inv(id=1,umax_value=255,var_off=(0x0; 0xff))"},
{11, "R4_w=inv(id=0,umax_value=510,var_off=(0x0; 0x1fe))"},
{12, "R4_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"},
{12, "R4_w=inv(id=1,umax_value=255,var_off=(0x0; 0xff))"},
{13, "R4_w=inv(id=0,umax_value=1020,var_off=(0x0; 0x3fc))"},
{14, "R4_w=inv(id=0,umax_value=255,var_off=(0x0; 0xff))"},
{14, "R4_w=inv(id=1,umax_value=255,var_off=(0x0; 0xff))"},
{15, "R4_w=inv(id=0,umax_value=2040,var_off=(0x0; 0x7f8))"},
{16, "R4_w=inv(id=0,umax_value=4080,var_off=(0x0; 0xff0))"},
},
@ -518,7 +518,7 @@ static struct bpf_align_test tests[] = {
* the total offset is 4-byte aligned and meets the
* load's requirements.
*/
{20, "R5=pkt(id=1,off=0,r=4,umin_value=2,umax_value=1034,var_off=(0x2; 0x7fc)"},
{20, "R5=pkt(id=2,off=0,r=4,umin_value=2,umax_value=1034,var_off=(0x2; 0x7fc)"},
},
},
@ -561,18 +561,18 @@ static struct bpf_align_test tests[] = {
/* Adding 14 makes R6 be (4n+2) */
{11, "R6_w=inv(id=0,umin_value=14,umax_value=74,var_off=(0x2; 0x7c))"},
/* Subtracting from packet pointer overflows ubounds */
{13, "R5_w=pkt(id=1,off=0,r=8,umin_value=18446744073709551542,umax_value=18446744073709551602,var_off=(0xffffffffffffff82; 0x7c)"},
{13, "R5_w=pkt(id=2,off=0,r=8,umin_value=18446744073709551542,umax_value=18446744073709551602,var_off=(0xffffffffffffff82; 0x7c)"},
/* New unknown value in R7 is (4n), >= 76 */
{15, "R7_w=inv(id=0,umin_value=76,umax_value=1096,var_off=(0x0; 0x7fc))"},
/* Adding it to packet pointer gives nice bounds again */
{16, "R5_w=pkt(id=2,off=0,r=0,umin_value=2,umax_value=1082,var_off=(0x2; 0xfffffffc)"},
{16, "R5_w=pkt(id=3,off=0,r=0,umin_value=2,umax_value=1082,var_off=(0x2; 0xfffffffc)"},
/* At the time the word size load is performed from R5,
* its total fixed offset is NET_IP_ALIGN + reg->off (0)
* which is 2. Then the variable offset is (4n+2), so
* the total offset is 4-byte aligned and meets the
* load's requirements.
*/
{20, "R5=pkt(id=2,off=0,r=4,umin_value=2,umax_value=1082,var_off=(0x2; 0xfffffffc)"},
{20, "R5=pkt(id=3,off=0,r=4,umin_value=2,umax_value=1082,var_off=(0x2; 0xfffffffc)"},
},
},
};
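/* Reading the var_off=(value; mask) pairs above: bits set in the mask are
 * unknown, bits set in the value are known ones, everything else is known
 * zero. E.g. (0x2; 0x7c) means bit 1 is set and bits 2-6 are unknown, i.e.
 * the register has the form 4n+2 within its umin/umax bounds.
 */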

View File

@ -55,10 +55,10 @@ static int kern_sync_rcu(void)
static void test_lookup_update(void)
{
int err, key = 0, val, i;
int map1_fd, map2_fd, map3_fd, map4_fd, map5_fd, map1_id, map2_id;
int outer_arr_fd, outer_hash_fd, outer_arr_dyn_fd;
struct test_btf_map_in_map *skel;
int outer_arr_fd, outer_hash_fd;
int fd, map1_fd, map2_fd, map1_id, map2_id;
int err, key = 0, val, i, fd;
skel = test_btf_map_in_map__open_and_load();
if (CHECK(!skel, "skel_open", "failed to open&load skeleton\n"))
@ -70,32 +70,45 @@ static void test_lookup_update(void)
map1_fd = bpf_map__fd(skel->maps.inner_map1);
map2_fd = bpf_map__fd(skel->maps.inner_map2);
map3_fd = bpf_map__fd(skel->maps.inner_map3);
map4_fd = bpf_map__fd(skel->maps.inner_map4);
map5_fd = bpf_map__fd(skel->maps.inner_map5);
outer_arr_dyn_fd = bpf_map__fd(skel->maps.outer_arr_dyn);
outer_arr_fd = bpf_map__fd(skel->maps.outer_arr);
outer_hash_fd = bpf_map__fd(skel->maps.outer_hash);
/* inner1 = input, inner2 = input + 1 */
map1_fd = bpf_map__fd(skel->maps.inner_map1);
/* inner1 = input, inner2 = input + 1, inner3 = input + 2 */
bpf_map_update_elem(outer_arr_fd, &key, &map1_fd, 0);
map2_fd = bpf_map__fd(skel->maps.inner_map2);
bpf_map_update_elem(outer_hash_fd, &key, &map2_fd, 0);
bpf_map_update_elem(outer_arr_dyn_fd, &key, &map3_fd, 0);
skel->bss->input = 1;
usleep(1);
bpf_map_lookup_elem(map1_fd, &key, &val);
CHECK(val != 1, "inner1", "got %d != exp %d\n", val, 1);
bpf_map_lookup_elem(map2_fd, &key, &val);
CHECK(val != 2, "inner2", "got %d != exp %d\n", val, 2);
bpf_map_lookup_elem(map3_fd, &key, &val);
CHECK(val != 3, "inner3", "got %d != exp %d\n", val, 3);
/* inner1 = input + 1, inner2 = input */
/* inner2 = input, inner1 = input + 1, inner4 = input + 2 */
bpf_map_update_elem(outer_arr_fd, &key, &map2_fd, 0);
bpf_map_update_elem(outer_hash_fd, &key, &map1_fd, 0);
bpf_map_update_elem(outer_arr_dyn_fd, &key, &map4_fd, 0);
skel->bss->input = 3;
usleep(1);
bpf_map_lookup_elem(map1_fd, &key, &val);
CHECK(val != 4, "inner1", "got %d != exp %d\n", val, 4);
bpf_map_lookup_elem(map2_fd, &key, &val);
CHECK(val != 3, "inner2", "got %d != exp %d\n", val, 3);
bpf_map_lookup_elem(map4_fd, &key, &val);
CHECK(val != 5, "inner4", "got %d != exp %d\n", val, 5);
/* inner5 = input + 2 */
bpf_map_update_elem(outer_arr_dyn_fd, &key, &map5_fd, 0);
skel->bss->input = 5;
usleep(1);
bpf_map_lookup_elem(map5_fd, &key, &val);
CHECK(val != 7, "inner5", "got %d != exp %d\n", val, 7);
for (i = 0; i < 5; i++) {
val = i % 2 ? map1_fd : map2_fd;
@ -106,7 +119,13 @@ static void test_lookup_update(void)
}
err = bpf_map_update_elem(outer_arr_fd, &key, &val, 0);
if (CHECK_FAIL(err)) {
printf("failed to update hash_of_maps on iter #%d\n", i);
printf("failed to update array_of_maps on iter #%d\n", i);
goto cleanup;
}
val = i % 2 ? map4_fd : map5_fd;
err = bpf_map_update_elem(outer_arr_dyn_fd, &key, &val, 0);
if (CHECK_FAIL(err)) {
printf("failed to update array_of_maps (dyn) on iter #%d\n", i);
goto cleanup;
}
}
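/* The outer_arr_dyn rotations above exercise the new ability (added in this
 * series, presumably via the BPF_F_INNER_MAP flag) to place inner array maps
 * with differing max_entries -- inner_map3/4/5 -- into one outer array map.
 */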

View File

@ -0,0 +1,225 @@
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2020 Facebook */
#include <test_progs.h>
#include <bpf/btf.h>
/* Real layout and sizes according to the test's (32-bit) BTF
 * need to be defined before the skeleton is included. */
struct test_struct___real {
unsigned int ptr; /* can't use `void *`, it is always 8 bytes in the BPF target */
unsigned int val2;
unsigned long long val1;
unsigned short val3;
unsigned char val4;
unsigned char _pad;
};
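/* Layout check (host, natural alignment): ptr@0, val2@4, val1@8, val3@16,
 * val4@18, _pad@19 -- 20 bytes total, matching the 20-byte "test_struct"
 * emitted into the custom BTF below.
 */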
#include "test_core_autosize.skel.h"
static int duration = 0;
static struct {
unsigned long long ptr_samesized;
unsigned long long val1_samesized;
unsigned long long val2_samesized;
unsigned long long val3_samesized;
unsigned long long val4_samesized;
struct test_struct___real output_samesized;
unsigned long long ptr_downsized;
unsigned long long val1_downsized;
unsigned long long val2_downsized;
unsigned long long val3_downsized;
unsigned long long val4_downsized;
struct test_struct___real output_downsized;
unsigned long long ptr_probed;
unsigned long long val1_probed;
unsigned long long val2_probed;
unsigned long long val3_probed;
unsigned long long val4_probed;
unsigned long long ptr_signed;
unsigned long long val1_signed;
unsigned long long val2_signed;
unsigned long long val3_signed;
unsigned long long val4_signed;
struct test_struct___real output_signed;
} out;
void test_core_autosize(void)
{
char btf_file[] = "/tmp/core_autosize.btf.XXXXXX";
int err, fd = -1, zero = 0;
int char_id, short_id, int_id, long_long_id, void_ptr_id, id;
struct test_core_autosize* skel = NULL;
struct bpf_object_load_attr load_attr = {};
struct bpf_program *prog;
struct bpf_map *bss_map;
struct btf *btf = NULL;
size_t written;
const void *raw_data;
__u32 raw_sz;
FILE *f = NULL;
btf = btf__new_empty();
if (!ASSERT_OK_PTR(btf, "empty_btf"))
return;
/* Emit the following struct with 32-bit pointer size:
*
* struct test_struct {
* void *ptr;
* unsigned long val2;
* unsigned long long val1;
* unsigned short val3;
* unsigned char val4;
* char: 8;
* };
*
* This struct is going to be used as the "kernel BTF" for this test.
* It's equivalent memory-layout-wise to test_struct___real above.
*/
/* force 32-bit pointer size */
btf__set_pointer_size(btf, 4);
char_id = btf__add_int(btf, "unsigned char", 1, 0);
ASSERT_EQ(char_id, 1, "char_id");
short_id = btf__add_int(btf, "unsigned short", 2, 0);
ASSERT_EQ(short_id, 2, "short_id");
/* "long unsigned int" of 4 byte size tells BTF that sizeof(void *) == 4 */
int_id = btf__add_int(btf, "long unsigned int", 4, 0);
ASSERT_EQ(int_id, 3, "int_id");
long_long_id = btf__add_int(btf, "unsigned long long", 8, 0);
ASSERT_EQ(long_long_id, 4, "long_long_id");
void_ptr_id = btf__add_ptr(btf, 0);
ASSERT_EQ(void_ptr_id, 5, "void_ptr_id");
id = btf__add_struct(btf, "test_struct", 20 /* bytes */);
ASSERT_EQ(id, 6, "struct_id");
err = btf__add_field(btf, "ptr", void_ptr_id, 0, 0);
err = err ?: btf__add_field(btf, "val2", int_id, 32, 0);
err = err ?: btf__add_field(btf, "val1", long_long_id, 64, 0);
err = err ?: btf__add_field(btf, "val3", short_id, 128, 0);
err = err ?: btf__add_field(btf, "val4", char_id, 144, 0);
ASSERT_OK(err, "struct_fields");
fd = mkstemp(btf_file);
if (CHECK(fd < 0, "btf_tmp", "failed to create file: %d\n", fd))
goto cleanup;
f = fdopen(fd, "w");
if (!ASSERT_OK_PTR(f, "btf_fdopen"))
goto cleanup;
raw_data = btf__get_raw_data(btf, &raw_sz);
if (!ASSERT_OK_PTR(raw_data, "raw_data"))
goto cleanup;
written = fwrite(raw_data, 1, raw_sz, f);
if (CHECK(written != raw_sz, "btf_write", "written: %zu, errno: %d\n", written, errno))
goto cleanup;
fflush(f);
fclose(f);
f = NULL;
close(fd);
fd = -1;
/* open and load BPF program with custom BTF as the kernel BTF */
skel = test_core_autosize__open();
if (!ASSERT_OK_PTR(skel, "skel_open"))
return;
/* disable handle_signed() for now */
prog = bpf_object__find_program_by_name(skel->obj, "handle_signed");
if (!ASSERT_OK_PTR(prog, "prog_find"))
goto cleanup;
bpf_program__set_autoload(prog, false);
load_attr.obj = skel->obj;
load_attr.target_btf_path = btf_file;
err = bpf_object__load_xattr(&load_attr);
if (!ASSERT_OK(err, "prog_load"))
goto cleanup;
prog = bpf_object__find_program_by_name(skel->obj, "handle_samesize");
if (!ASSERT_OK_PTR(prog, "prog_find"))
goto cleanup;
skel->links.handle_samesize = bpf_program__attach(prog);
if (!ASSERT_OK_PTR(skel->links.handle_samesize, "prog_attach"))
goto cleanup;
prog = bpf_object__find_program_by_name(skel->obj, "handle_downsize");
if (!ASSERT_OK_PTR(prog, "prog_find"))
goto cleanup;
skel->links.handle_downsize = bpf_program__attach(prog);
if (!ASSERT_OK_PTR(skel->links.handle_downsize, "prog_attach"))
goto cleanup;
prog = bpf_object__find_program_by_name(skel->obj, "handle_probed");
if (!ASSERT_OK_PTR(prog, "prog_find"))
goto cleanup;
skel->links.handle_probed = bpf_program__attach(prog);
if (!ASSERT_OK_PTR(skel->links.handle_probed, "prog_attach"))
goto cleanup;
usleep(1);
bss_map = bpf_object__find_map_by_name(skel->obj, "test_cor.bss");
if (!ASSERT_OK_PTR(bss_map, "bss_map_find"))
goto cleanup;
err = bpf_map_lookup_elem(bpf_map__fd(bss_map), &zero, (void *)&out);
if (!ASSERT_OK(err, "bss_lookup"))
goto cleanup;
ASSERT_EQ(out.ptr_samesized, 0x01020304, "ptr_samesized");
ASSERT_EQ(out.val1_samesized, 0x1020304050607080, "val1_samesized");
ASSERT_EQ(out.val2_samesized, 0x0a0b0c0d, "val2_samesized");
ASSERT_EQ(out.val3_samesized, 0xfeed, "val3_samesized");
ASSERT_EQ(out.val4_samesized, 0xb9, "val4_samesized");
ASSERT_EQ(out.output_samesized.ptr, 0x01020304, "ptr_samesized");
ASSERT_EQ(out.output_samesized.val1, 0x1020304050607080, "val1_samesized");
ASSERT_EQ(out.output_samesized.val2, 0x0a0b0c0d, "val2_samesized");
ASSERT_EQ(out.output_samesized.val3, 0xfeed, "val3_samesized");
ASSERT_EQ(out.output_samesized.val4, 0xb9, "val4_samesized");
ASSERT_EQ(out.ptr_downsized, 0x01020304, "ptr_downsized");
ASSERT_EQ(out.val1_downsized, 0x1020304050607080, "val1_downsized");
ASSERT_EQ(out.val2_downsized, 0x0a0b0c0d, "val2_downsized");
ASSERT_EQ(out.val3_downsized, 0xfeed, "val3_downsized");
ASSERT_EQ(out.val4_downsized, 0xb9, "val4_downsized");
ASSERT_EQ(out.output_downsized.ptr, 0x01020304, "ptr_downsized");
ASSERT_EQ(out.output_downsized.val1, 0x1020304050607080, "val1_downsized");
ASSERT_EQ(out.output_downsized.val2, 0x0a0b0c0d, "val2_downsized");
ASSERT_EQ(out.output_downsized.val3, 0xfeed, "val3_downsized");
ASSERT_EQ(out.output_downsized.val4, 0xb9, "val4_downsized");
ASSERT_EQ(out.ptr_probed, 0x01020304, "ptr_probed");
ASSERT_EQ(out.val1_probed, 0x1020304050607080, "val1_probed");
ASSERT_EQ(out.val2_probed, 0x0a0b0c0d, "val2_probed");
ASSERT_EQ(out.val3_probed, 0xfeed, "val3_probed");
ASSERT_EQ(out.val4_probed, 0xb9, "val4_probed");
test_core_autosize__destroy(skel);
skel = NULL;
/* now re-load with handle_signed() enabled, it should fail loading */
skel = test_core_autosize__open();
if (!ASSERT_OK_PTR(skel, "skel_open"))
return;
load_attr.obj = skel->obj;
load_attr.target_btf_path = btf_file;
err = bpf_object__load_xattr(&load_attr);
if (!ASSERT_ERR(err, "bad_prog_load"))
goto cleanup;
cleanup:
if (f)
fclose(f);
if (fd >= 0)
close(fd);
remove(btf_file);
btf__free(btf);
test_core_autosize__destroy(skel);
}

View File

@ -7,40 +7,28 @@
static int duration;
static __u64 kallsyms_find(const char *sym)
{
char type, name[500];
__u64 addr, res = 0;
FILE *f;
f = fopen("/proc/kallsyms", "r");
if (CHECK(!f, "kallsyms_fopen", "failed to open: %d\n", errno))
return 0;
while (fscanf(f, "%llx %c %499s%*[^\n]\n", &addr, &type, name) > 0) {
if (strcmp(name, sym) == 0) {
res = addr;
goto out;
}
}
CHECK(false, "not_found", "symbol %s not found\n", sym);
out:
fclose(f);
return res;
}
void test_ksyms(void)
{
__u64 per_cpu_start_addr = kallsyms_find("__per_cpu_start");
__u64 link_fops_addr = kallsyms_find("bpf_link_fops");
const char *btf_path = "/sys/kernel/btf/vmlinux";
struct test_ksyms *skel;
struct test_ksyms__data *data;
__u64 link_fops_addr, per_cpu_start_addr;
struct stat st;
__u64 btf_size;
int err;
err = kallsyms_find("bpf_link_fops", &link_fops_addr);
if (CHECK(err == -EINVAL, "kallsyms_fopen", "failed to open: %d\n", errno))
return;
if (CHECK(err == -ENOENT, "ksym_find", "symbol 'bpf_link_fops' not found\n"))
return;
err = kallsyms_find("__per_cpu_start", &per_cpu_start_addr);
if (CHECK(err == -EINVAL, "kallsyms_fopen", "failed to open: %d\n", errno))
return;
if (CHECK(err == -ENOENT, "ksym_find", "symbol '__per_cpu_start' not found\n"))
return;
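/* kallsyms_find() is the shared helper added elsewhere in this series:
 * it returns 0 on success, -EINVAL if /proc/kallsyms cannot be opened and
 * -ENOENT if the symbol is missing -- hence the two CHECKs per symbol above.
 */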
if (CHECK(stat(btf_path, &st), "stat_btf", "err %d\n", errno))
return;
btf_size = st.st_size;

View File

@ -0,0 +1,88 @@
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2020 Google */
#include <test_progs.h>
#include <bpf/libbpf.h>
#include <bpf/btf.h>
#include "test_ksyms_btf.skel.h"
static int duration;
void test_ksyms_btf(void)
{
__u64 runqueues_addr, bpf_prog_active_addr;
__u32 this_rq_cpu;
int this_bpf_prog_active;
struct test_ksyms_btf *skel = NULL;
struct test_ksyms_btf__data *data;
struct btf *btf;
int percpu_datasec;
int err;
err = kallsyms_find("runqueues", &runqueues_addr);
if (CHECK(err == -EINVAL, "kallsyms_fopen", "failed to open: %d\n", errno))
return;
if (CHECK(err == -ENOENT, "ksym_find", "symbol 'runqueues' not found\n"))
return;
err = kallsyms_find("bpf_prog_active", &bpf_prog_active_addr);
if (CHECK(err == -EINVAL, "kallsyms_fopen", "failed to open: %d\n", errno))
return;
if (CHECK(err == -ENOENT, "ksym_find", "symbol 'bpf_prog_active' not found\n"))
return;
btf = libbpf_find_kernel_btf();
if (CHECK(IS_ERR(btf), "btf_exists", "failed to load kernel BTF: %ld\n",
PTR_ERR(btf)))
return;
percpu_datasec = btf__find_by_name_kind(btf, ".data..percpu",
BTF_KIND_DATASEC);
if (percpu_datasec < 0) {
printf("%s:SKIP:no PERCPU DATASEC in kernel btf\n",
__func__);
test__skip();
goto cleanup;
}
skel = test_ksyms_btf__open_and_load();
if (CHECK(!skel, "skel_open", "failed to open and load skeleton\n"))
goto cleanup;
err = test_ksyms_btf__attach(skel);
if (CHECK(err, "skel_attach", "skeleton attach failed: %d\n", err))
goto cleanup;
/* trigger tracepoint */
usleep(1);
data = skel->data;
CHECK(data->out__runqueues_addr != runqueues_addr, "runqueues_addr",
"got %llu, exp %llu\n",
(unsigned long long)data->out__runqueues_addr,
(unsigned long long)runqueues_addr);
CHECK(data->out__bpf_prog_active_addr != bpf_prog_active_addr, "bpf_prog_active_addr",
"got %llu, exp %llu\n",
(unsigned long long)data->out__bpf_prog_active_addr,
(unsigned long long)bpf_prog_active_addr);
CHECK(data->out__rq_cpu == -1, "rq_cpu",
"got %u, exp != -1\n", data->out__rq_cpu);
CHECK(data->out__bpf_prog_active < 0, "bpf_prog_active",
"got %d, exp >= 0\n", data->out__bpf_prog_active);
CHECK(data->out__cpu_0_rq_cpu != 0, "cpu_rq(0)->cpu",
"got %u, exp 0\n", data->out__cpu_0_rq_cpu);
this_rq_cpu = data->out__this_rq_cpu;
CHECK(this_rq_cpu != data->out__rq_cpu, "this_rq_cpu",
"got %u, exp %u\n", this_rq_cpu, data->out__rq_cpu);
this_bpf_prog_active = data->out__this_bpf_prog_active;
CHECK(this_bpf_prog_active != data->out__bpf_prog_active, "this_bpf_prog_active",
"got %d, exp %d\n", this_bpf_prog_active,
data->out__bpf_prog_active);
cleanup:
btf__free(btf);
test_ksyms_btf__destroy(skel);
}
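/* The BPF side (test_ksyms_btf.c) is, in sketch form:
 *
 *     extern const struct rq runqueues __ksym;
 *     extern const int bpf_prog_active __ksym;
 *
 *     out__runqueues_addr = (__u64)&runqueues;
 *     rq = (struct rq *)bpf_per_cpu_ptr(&runqueues, cpu);
 *     if (rq)
 *             out__rq_cpu = rq->cpu;
 *     rq = (struct rq *)bpf_this_cpu_ptr(&runqueues);
 *     out__this_rq_cpu = rq->cpu;
 */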

View File

@ -37,7 +37,7 @@ void test_pinning(void)
struct stat statbuf = {};
struct bpf_object *obj;
struct bpf_map *map;
int err;
int err, map_fd;
DECLARE_LIBBPF_OPTS(bpf_object_open_opts, opts,
.pin_root_path = custpath,
);
@ -213,6 +213,53 @@ void test_pinning(void)
if (CHECK(err, "stat custpinpath", "err %d errno %d\n", err, errno))
goto out;
/* remove the custom pin path to re-test it with reuse fd below */
err = unlink(custpinpath);
if (CHECK(err, "unlink custpinpath", "err %d errno %d\n", err, errno))
goto out;
err = rmdir(custpath);
if (CHECK(err, "rmdir custpindir", "err %d errno %d\n", err, errno))
goto out;
bpf_object__close(obj);
/* test pinning at custom path with reuse fd */
obj = bpf_object__open_file(file, NULL);
err = libbpf_get_error(obj);
if (CHECK(err, "default open", "err %d errno %d\n", err, errno)) {
obj = NULL;
goto out;
}
map_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, sizeof(__u32),
sizeof(__u64), 1, 0);
if (CHECK(map_fd < 0, "create pinmap manually", "fd %d\n", map_fd))
goto out;
map = bpf_object__find_map_by_name(obj, "pinmap");
if (CHECK(!map, "find map", "NULL map"))
goto close_map_fd;
err = bpf_map__reuse_fd(map, map_fd);
if (CHECK(err, "reuse pinmap fd", "err %d errno %d\n", err, errno))
goto close_map_fd;
err = bpf_map__set_pin_path(map, custpinpath);
if (CHECK(err, "set pin path", "err %d errno %d\n", err, errno))
goto close_map_fd;
err = bpf_object__load(obj);
if (CHECK(err, "custom load", "err %d errno %d\n", err, errno))
goto close_map_fd;
/* check that pinmap was pinned at the custom path */
err = stat(custpinpath, &statbuf);
if (CHECK(err, "stat custpinpath", "err %d errno %d\n", err, errno))
goto close_map_fd;
close_map_fd:
close(map_fd);
out:
unlink(pinpath);
unlink(nopinpath);

View File

@ -198,7 +198,7 @@ static void test_sockmap_copy(enum bpf_map_type map_type)
{
DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts);
int err, len, src_fd, iter_fd, duration = 0;
union bpf_iter_link_info linfo = {0};
union bpf_iter_link_info linfo = {};
__u32 i, num_sockets, num_elems;
struct bpf_iter_sockmap *skel;
__s64 *sock_fd = NULL;

View File

@ -264,9 +264,19 @@ static int check_error_linum(const struct sk_fds *sk_fds)
static void check_hdr_and_close_fds(struct sk_fds *sk_fds)
{
const __u32 expected_inherit_cb_flags =
BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG |
BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG |
BPF_SOCK_OPS_STATE_CB_FLAG;
if (sk_fds_shutdown(sk_fds))
goto check_linum;
if (CHECK(expected_inherit_cb_flags != skel->bss->inherit_cb_flags,
"Unexpected inherit_cb_flags", "0x%x != 0x%x\n",
skel->bss->inherit_cb_flags, expected_inherit_cb_flags))
goto check_linum;
if (check_hdr_stg(&exp_passive_hdr_stg, sk_fds->passive_fd,
"passive_hdr_stg"))
goto check_linum;
@ -321,6 +331,8 @@ static void reset_test(void)
memset(&skel->bss->active_estab_in, 0, optsize);
memset(&skel->bss->active_fin_in, 0, optsize);
skel->bss->inherit_cb_flags = 0;
skel->data->test_kind = TCPOPT_EXP;
skel->data->test_magic = 0xeB9F;

View File

@ -0,0 +1,72 @@
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2020 Facebook */
#include <test_progs.h>
#include "progs/profiler.h"
#include "profiler1.skel.h"
#include "profiler2.skel.h"
#include "profiler3.skel.h"
static int sanity_run(struct bpf_program *prog)
{
struct bpf_prog_test_run_attr test_attr = {};
__u64 args[] = {1, 2, 3};
__u32 duration = 0;
int err, prog_fd;
prog_fd = bpf_program__fd(prog);
test_attr.prog_fd = prog_fd;
test_attr.ctx_in = args;
test_attr.ctx_size_in = sizeof(args);
err = bpf_prog_test_run_xattr(&test_attr);
if (CHECK(err || test_attr.retval, "test_run",
"err %d errno %d retval %d duration %d\n",
err, errno, test_attr.retval, duration))
return -1;
return 0;
}
void test_test_profiler(void)
{
struct profiler1 *profiler1_skel = NULL;
struct profiler2 *profiler2_skel = NULL;
struct profiler3 *profiler3_skel = NULL;
__u32 duration = 0;
int err;
profiler1_skel = profiler1__open_and_load();
if (CHECK(!profiler1_skel, "profiler1_skel_load", "profiler1 skeleton failed\n"))
goto cleanup;
err = profiler1__attach(profiler1_skel);
if (CHECK(err, "profiler1_attach", "profiler1 attach failed: %d\n", err))
goto cleanup;
if (sanity_run(profiler1_skel->progs.raw_tracepoint__sched_process_exec))
goto cleanup;
profiler2_skel = profiler2__open_and_load();
if (CHECK(!profiler2_skel, "profiler2_skel_load", "profiler2 skeleton failed\n"))
goto cleanup;
err = profiler2__attach(profiler2_skel);
if (CHECK(err, "profiler2_attach", "profiler2 attach failed: %d\n", err))
goto cleanup;
if (sanity_run(profiler2_skel->progs.raw_tracepoint__sched_process_exec))
goto cleanup;
profiler3_skel = profiler3__open_and_load();
if (CHECK(!profiler3_skel, "profiler3_skel_load", "profiler3 skeleton failed\n"))
goto cleanup;
err = profiler3__attach(profiler3_skel);
if (CHECK(err, "profiler3_attach", "profiler3 attach failed: %d\n", err))
goto cleanup;
if (sanity_run(profiler3_skel->progs.raw_tracepoint__sched_process_exec))
goto cleanup;
cleanup:
profiler1__destroy(profiler1_skel);
profiler2__destroy(profiler2_skel);
profiler3__destroy(profiler3_skel);
}

View File

@ -25,7 +25,7 @@ void test_xdp_noinline(void)
__u8 flags;
} real_def = {.dst = MAGIC_VAL};
__u32 ch_key = 11, real_num = 3;
__u32 duration, retval, size;
__u32 duration = 0, retval, size;
int err, i;
__u64 bytes = 0, pkts = 0;
char buf[128];

View File

@ -23,6 +23,10 @@
#define TCP_CA_NAME_MAX 16
#endif
#ifndef TCP_NOTSENT_LOWAT
#define TCP_NOTSENT_LOWAT 25
#endif
#ifndef IFNAMSIZ
#define IFNAMSIZ 16
#endif
@ -128,6 +132,18 @@ static __inline int set_keepalive(struct bpf_sock_addr *ctx)
return 0;
}
static __inline int set_notsent_lowat(struct bpf_sock_addr *ctx)
{
int lowat = 65535;
if (ctx->type == SOCK_STREAM) {
if (bpf_setsockopt(ctx, SOL_TCP, TCP_NOTSENT_LOWAT, &lowat, sizeof(lowat)))
return 1;
}
return 0;
}
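/* TCP_NOTSENT_LOWAT caps the amount of unsent data queued on the socket;
 * setting it from a connect hook exercises the bpf_setsockopt() support for
 * this option (presumably added in this same series, hence the fallback
 * #define above).
 */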
SEC("cgroup/connect4")
int connect_v4_prog(struct bpf_sock_addr *ctx)
{
@ -148,6 +164,9 @@ int connect_v4_prog(struct bpf_sock_addr *ctx)
if (set_keepalive(ctx))
return 0;
if (set_notsent_lowat(ctx))
return 0;
if (ctx->type != SOCK_STREAM && ctx->type != SOCK_DGRAM)
return 0;
else if (ctx->type == SOCK_STREAM)

View File

@ -0,0 +1,177 @@
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2020 Facebook */
#pragma once
#define TASK_COMM_LEN 16
#define MAX_ANCESTORS 4
#define MAX_PATH 256
#define KILL_TARGET_LEN 64
#define CTL_MAXNAME 10
#define MAX_ARGS_LEN 4096
#define MAX_FILENAME_LEN 512
#define MAX_ENVIRON_LEN 8192
#define MAX_PATH_DEPTH 32
#define MAX_FILEPATH_LENGTH (MAX_PATH_DEPTH * MAX_PATH)
#define MAX_CGROUPS_PATH_DEPTH 8
#define MAX_METADATA_PAYLOAD_LEN TASK_COMM_LEN
#define MAX_CGROUP_PAYLOAD_LEN \
(MAX_PATH * 2 + (MAX_PATH * MAX_CGROUPS_PATH_DEPTH))
#define MAX_CAP_PAYLOAD_LEN (MAX_METADATA_PAYLOAD_LEN + MAX_CGROUP_PAYLOAD_LEN)
#define MAX_SYSCTL_PAYLOAD_LEN \
(MAX_METADATA_PAYLOAD_LEN + MAX_CGROUP_PAYLOAD_LEN + CTL_MAXNAME + MAX_PATH)
#define MAX_KILL_PAYLOAD_LEN \
(MAX_METADATA_PAYLOAD_LEN + MAX_CGROUP_PAYLOAD_LEN + TASK_COMM_LEN + \
KILL_TARGET_LEN)
#define MAX_EXEC_PAYLOAD_LEN \
(MAX_METADATA_PAYLOAD_LEN + MAX_CGROUP_PAYLOAD_LEN + MAX_FILENAME_LEN + \
MAX_ARGS_LEN + MAX_ENVIRON_LEN)
#define MAX_FILEMOD_PAYLOAD_LEN \
(MAX_METADATA_PAYLOAD_LEN + MAX_CGROUP_PAYLOAD_LEN + MAX_FILEPATH_LENGTH + \
MAX_FILEPATH_LENGTH)
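/* All *_PAYLOAD_LEN values bound the flexible payload[] buffers in the event
 * structs below: each event appends variable-length strings (comm, cgroup
 * paths, file paths, argv/env) back to back and records per-field lengths,
 * so a consumer can walk the payload without fixed offsets.
 */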
enum data_type {
INVALID_EVENT,
EXEC_EVENT,
FORK_EVENT,
KILL_EVENT,
SYSCTL_EVENT,
FILEMOD_EVENT,
MAX_DATA_TYPE_EVENT
};
enum filemod_type {
FMOD_OPEN,
FMOD_LINK,
FMOD_SYMLINK,
};
struct ancestors_data_t {
pid_t ancestor_pids[MAX_ANCESTORS];
uint32_t ancestor_exec_ids[MAX_ANCESTORS];
uint64_t ancestor_start_times[MAX_ANCESTORS];
uint32_t num_ancestors;
};
struct var_metadata_t {
enum data_type type;
pid_t pid;
uint32_t exec_id;
uid_t uid;
gid_t gid;
uint64_t start_time;
uint32_t cpu_id;
uint64_t bpf_stats_num_perf_events;
uint64_t bpf_stats_start_ktime_ns;
uint8_t comm_length;
};
struct cgroup_data_t {
ino_t cgroup_root_inode;
ino_t cgroup_proc_inode;
uint64_t cgroup_root_mtime;
uint64_t cgroup_proc_mtime;
uint16_t cgroup_root_length;
uint16_t cgroup_proc_length;
uint16_t cgroup_full_length;
int cgroup_full_path_root_pos;
};
struct var_sysctl_data_t {
struct var_metadata_t meta;
struct cgroup_data_t cgroup_data;
struct ancestors_data_t ancestors_info;
uint8_t sysctl_val_length;
uint16_t sysctl_path_length;
char payload[MAX_SYSCTL_PAYLOAD_LEN];
};
struct var_kill_data_t {
struct var_metadata_t meta;
struct cgroup_data_t cgroup_data;
struct ancestors_data_t ancestors_info;
pid_t kill_target_pid;
int kill_sig;
uint32_t kill_count;
uint64_t last_kill_time;
uint8_t kill_target_name_length;
uint8_t kill_target_cgroup_proc_length;
char payload[MAX_KILL_PAYLOAD_LEN];
size_t payload_length;
};
struct var_exec_data_t {
struct var_metadata_t meta;
struct cgroup_data_t cgroup_data;
pid_t parent_pid;
uint32_t parent_exec_id;
uid_t parent_uid;
uint64_t parent_start_time;
uint16_t bin_path_length;
uint16_t cmdline_length;
uint16_t environment_length;
char payload[MAX_EXEC_PAYLOAD_LEN];
};
struct var_fork_data_t {
struct var_metadata_t meta;
pid_t parent_pid;
uint32_t parent_exec_id;
uint64_t parent_start_time;
char payload[MAX_METADATA_PAYLOAD_LEN];
};
struct var_filemod_data_t {
struct var_metadata_t meta;
struct cgroup_data_t cgroup_data;
enum filemod_type fmod_type;
unsigned int dst_flags;
uint32_t src_device_id;
uint32_t dst_device_id;
ino_t src_inode;
ino_t dst_inode;
uint16_t src_filepath_length;
uint16_t dst_filepath_length;
char payload[MAX_FILEMOD_PAYLOAD_LEN];
};
struct profiler_config_struct {
bool fetch_cgroups_from_bpf;
ino_t cgroup_fs_inode;
ino_t cgroup_login_session_inode;
uint64_t kill_signals_mask;
ino_t inode_filter;
uint32_t stale_info_secs;
bool use_variable_buffers;
bool read_environ_from_exec;
bool enable_cgroup_v1_resolver;
};
struct bpf_func_stats_data {
uint64_t time_elapsed_ns;
uint64_t num_executions;
uint64_t num_perf_events;
};
struct bpf_func_stats_ctx {
uint64_t start_time_ns;
struct bpf_func_stats_data* bpf_func_stats_data_val;
};
enum bpf_function_id {
profiler_bpf_proc_sys_write,
profiler_bpf_sched_process_exec,
profiler_bpf_sched_process_exit,
profiler_bpf_sys_enter_kill,
profiler_bpf_do_filp_open_ret,
profiler_bpf_sched_process_fork,
profiler_bpf_vfs_link,
profiler_bpf_vfs_symlink,
profiler_bpf_max_function_id
};

View File

@ -0,0 +1,969 @@
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2020 Facebook */
#include <vmlinux.h>
#include <bpf/bpf_core_read.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include "profiler.h"
#ifndef NULL
#define NULL 0
#endif
#define O_WRONLY 00000001
#define O_RDWR 00000002
#define O_DIRECTORY 00200000
#define __O_TMPFILE 020000000
#define O_TMPFILE (__O_TMPFILE | O_DIRECTORY)
#define MAX_ERRNO 4095
#define S_IFMT 00170000
#define S_IFSOCK 0140000
#define S_IFLNK 0120000
#define S_IFREG 0100000
#define S_IFBLK 0060000
#define S_IFDIR 0040000
#define S_IFCHR 0020000
#define S_IFIFO 0010000
#define S_ISUID 0004000
#define S_ISGID 0002000
#define S_ISVTX 0001000
#define S_ISLNK(m) (((m)&S_IFMT) == S_IFLNK)
#define S_ISDIR(m) (((m)&S_IFMT) == S_IFDIR)
#define S_ISCHR(m) (((m)&S_IFMT) == S_IFCHR)
#define S_ISBLK(m) (((m)&S_IFMT) == S_IFBLK)
#define S_ISFIFO(m) (((m)&S_IFMT) == S_IFIFO)
#define S_ISSOCK(m) (((m)&S_IFMT) == S_IFSOCK)
#define IS_ERR_VALUE(x) ((unsigned long)(void*)(x) >= (unsigned long)-MAX_ERRNO)
#define KILL_DATA_ARRAY_SIZE 8
struct var_kill_data_arr_t {
struct var_kill_data_t array[KILL_DATA_ARRAY_SIZE];
};
union any_profiler_data_t {
struct var_exec_data_t var_exec;
struct var_kill_data_t var_kill;
struct var_sysctl_data_t var_sysctl;
struct var_filemod_data_t var_filemod;
struct var_fork_data_t var_fork;
struct var_kill_data_arr_t var_kill_data_arr;
};
volatile struct profiler_config_struct bpf_config = {};
#define FETCH_CGROUPS_FROM_BPF (bpf_config.fetch_cgroups_from_bpf)
#define CGROUP_FS_INODE (bpf_config.cgroup_fs_inode)
#define CGROUP_LOGIN_SESSION_INODE \
(bpf_config.cgroup_login_session_inode)
#define KILL_SIGNALS (bpf_config.kill_signals_mask)
#define STALE_INFO (bpf_config.stale_info_secs)
#define INODE_FILTER (bpf_config.inode_filter)
#define READ_ENVIRON_FROM_EXEC (bpf_config.read_environ_from_exec)
#define ENABLE_CGROUP_V1_RESOLVER (bpf_config.enable_cgroup_v1_resolver)
struct kernfs_iattrs___52 {
struct iattr ia_iattr;
};
struct kernfs_node___52 {
union /* kernfs_node_id */ {
struct {
u32 ino;
u32 generation;
};
u64 id;
} id;
};
struct {
__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
__uint(max_entries, 1);
__type(key, u32);
__type(value, union any_profiler_data_t);
} data_heap SEC(".maps");
struct {
__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
__uint(key_size, sizeof(int));
__uint(value_size, sizeof(int));
} events SEC(".maps");
struct {
__uint(type, BPF_MAP_TYPE_HASH);
__uint(max_entries, KILL_DATA_ARRAY_SIZE);
__type(key, u32);
__type(value, struct var_kill_data_arr_t);
} var_tpid_to_data SEC(".maps");
struct {
__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
__uint(max_entries, profiler_bpf_max_function_id);
__type(key, u32);
__type(value, struct bpf_func_stats_data);
} bpf_func_stats SEC(".maps");
struct {
__uint(type, BPF_MAP_TYPE_HASH);
__type(key, u32);
__type(value, bool);
__uint(max_entries, 16);
} allowed_devices SEC(".maps");
struct {
__uint(type, BPF_MAP_TYPE_HASH);
__type(key, u64);
__type(value, bool);
__uint(max_entries, 1024);
} allowed_file_inodes SEC(".maps");
struct {
__uint(type, BPF_MAP_TYPE_HASH);
__type(key, u64);
__type(value, bool);
__uint(max_entries, 1024);
} allowed_directory_inodes SEC(".maps");
struct {
__uint(type, BPF_MAP_TYPE_HASH);
__type(key, u32);
__type(value, bool);
__uint(max_entries, 16);
} disallowed_exec_inodes SEC(".maps");
#ifndef ARRAY_SIZE
#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0]))
#endif
static INLINE bool IS_ERR(const void* ptr)
{
return IS_ERR_VALUE((unsigned long)ptr);
}
static INLINE u32 get_userspace_pid()
{
return bpf_get_current_pid_tgid() >> 32;
}
static INLINE bool is_init_process(u32 tgid)
{
return tgid == 1 || tgid == 0;
}
static INLINE unsigned long
probe_read_lim(void* dst, void* src, unsigned long len, unsigned long max)
{
len = len < max ? len : max;
if (len > 1) {
if (bpf_probe_read(dst, len, src))
return 0;
} else if (len == 1) {
if (bpf_probe_read(dst, 1, src))
return 0;
}
return len;
}
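/* Clamps len to max before bpf_probe_read(); the len > 1 / len == 1 split is
 * presumably there to keep the length bounds obvious to the verifier. A
 * return of 0 signals a failed read and callers append nothing.
 */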
static INLINE int get_var_spid_index(struct var_kill_data_arr_t* arr_struct,
int spid)
{
#ifdef UNROLL
#pragma unroll
#endif
for (int i = 0; i < ARRAY_SIZE(arr_struct->array); i++)
if (arr_struct->array[i].meta.pid == spid)
return i;
return -1;
}
static INLINE void populate_ancestors(struct task_struct* task,
struct ancestors_data_t* ancestors_data)
{
struct task_struct* parent = task;
u32 num_ancestors, ppid;
ancestors_data->num_ancestors = 0;
#ifdef UNROLL
#pragma unroll
#endif
for (num_ancestors = 0; num_ancestors < MAX_ANCESTORS; num_ancestors++) {
parent = BPF_CORE_READ(parent, real_parent);
if (parent == NULL)
break;
ppid = BPF_CORE_READ(parent, tgid);
if (is_init_process(ppid))
break;
ancestors_data->ancestor_pids[num_ancestors] = ppid;
ancestors_data->ancestor_exec_ids[num_ancestors] =
BPF_CORE_READ(parent, self_exec_id);
ancestors_data->ancestor_start_times[num_ancestors] =
BPF_CORE_READ(parent, start_time);
ancestors_data->num_ancestors = num_ancestors;
}
}
static INLINE void* read_full_cgroup_path(struct kernfs_node* cgroup_node,
struct kernfs_node* cgroup_root_node,
void* payload,
int* root_pos)
{
void* payload_start = payload;
size_t filepart_length;
#ifdef UNROLL
#pragma unroll
#endif
for (int i = 0; i < MAX_CGROUPS_PATH_DEPTH; i++) {
filepart_length =
bpf_probe_read_str(payload, MAX_PATH, BPF_CORE_READ(cgroup_node, name));
if (!cgroup_node)
return payload;
if (cgroup_node == cgroup_root_node)
*root_pos = payload - payload_start;
if (filepart_length <= MAX_PATH) {
barrier_var(filepart_length);
payload += filepart_length;
}
cgroup_node = BPF_CORE_READ(cgroup_node, parent);
}
return payload;
}
static ino_t get_inode_from_kernfs(struct kernfs_node* node)
{
struct kernfs_node___52* node52 = (void*)node;
if (bpf_core_field_exists(node52->id.ino)) {
barrier_var(node52);
return BPF_CORE_READ(node52, id.ino);
} else {
barrier_var(node);
return (u64)BPF_CORE_READ(node, id);
}
}
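/* kernfs_node___52 mirrors the pre-5.5 layout where the inode number lived
 * inside a kernfs_node_id union; CO-RE ignores the ___52 suffix when
 * matching names, and bpf_core_field_exists() selects the right read at
 * load time, so one object works across kernel versions.
 */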
int pids_cgrp_id = 1;
static INLINE void* populate_cgroup_info(struct cgroup_data_t* cgroup_data,
struct task_struct* task,
void* payload)
{
struct kernfs_node* root_kernfs =
BPF_CORE_READ(task, nsproxy, cgroup_ns, root_cset, dfl_cgrp, kn);
struct kernfs_node* proc_kernfs = BPF_CORE_READ(task, cgroups, dfl_cgrp, kn);
if (ENABLE_CGROUP_V1_RESOLVER) {
#ifdef UNROLL
#pragma unroll
#endif
for (int i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
struct cgroup_subsys_state* subsys =
BPF_CORE_READ(task, cgroups, subsys[i]);
if (subsys != NULL) {
int subsys_id = BPF_CORE_READ(subsys, ss, id);
if (subsys_id == pids_cgrp_id) {
proc_kernfs = BPF_CORE_READ(subsys, cgroup, kn);
root_kernfs = BPF_CORE_READ(subsys, ss, root, kf_root, kn);
break;
}
}
}
}
cgroup_data->cgroup_root_inode = get_inode_from_kernfs(root_kernfs);
cgroup_data->cgroup_proc_inode = get_inode_from_kernfs(proc_kernfs);
if (bpf_core_field_exists(root_kernfs->iattr->ia_mtime)) {
cgroup_data->cgroup_root_mtime =
BPF_CORE_READ(root_kernfs, iattr, ia_mtime.tv_nsec);
cgroup_data->cgroup_proc_mtime =
BPF_CORE_READ(proc_kernfs, iattr, ia_mtime.tv_nsec);
} else {
struct kernfs_iattrs___52* root_iattr =
(struct kernfs_iattrs___52*)BPF_CORE_READ(root_kernfs, iattr);
cgroup_data->cgroup_root_mtime =
BPF_CORE_READ(root_iattr, ia_iattr.ia_mtime.tv_nsec);
struct kernfs_iattrs___52* proc_iattr =
(struct kernfs_iattrs___52*)BPF_CORE_READ(proc_kernfs, iattr);
cgroup_data->cgroup_proc_mtime =
BPF_CORE_READ(proc_iattr, ia_iattr.ia_mtime.tv_nsec);
}
cgroup_data->cgroup_root_length = 0;
cgroup_data->cgroup_proc_length = 0;
cgroup_data->cgroup_full_length = 0;
size_t cgroup_root_length =
bpf_probe_read_str(payload, MAX_PATH, BPF_CORE_READ(root_kernfs, name));
barrier_var(cgroup_root_length);
if (cgroup_root_length <= MAX_PATH) {
barrier_var(cgroup_root_length);
cgroup_data->cgroup_root_length = cgroup_root_length;
payload += cgroup_root_length;
}
size_t cgroup_proc_length =
bpf_probe_read_str(payload, MAX_PATH, BPF_CORE_READ(proc_kernfs, name));
barrier_var(cgroup_proc_length);
if (cgroup_proc_length <= MAX_PATH) {
barrier_var(cgroup_proc_length);
cgroup_data->cgroup_proc_length = cgroup_proc_length;
payload += cgroup_proc_length;
}
if (FETCH_CGROUPS_FROM_BPF) {
cgroup_data->cgroup_full_path_root_pos = -1;
void* payload_end_pos = read_full_cgroup_path(proc_kernfs, root_kernfs, payload,
&cgroup_data->cgroup_full_path_root_pos);
cgroup_data->cgroup_full_length = payload_end_pos - payload;
payload = payload_end_pos;
}
return (void*)payload;
}
static INLINE void* populate_var_metadata(struct var_metadata_t* metadata,
struct task_struct* task,
u32 pid, void* payload)
{
u64 uid_gid = bpf_get_current_uid_gid();
metadata->uid = (u32)uid_gid;
metadata->gid = uid_gid >> 32;
metadata->pid = pid;
metadata->exec_id = BPF_CORE_READ(task, self_exec_id);
metadata->start_time = BPF_CORE_READ(task, start_time);
metadata->comm_length = 0;
size_t comm_length = bpf_core_read_str(payload, TASK_COMM_LEN, &task->comm);
barrier_var(comm_length);
if (comm_length <= TASK_COMM_LEN) {
barrier_var(comm_length);
metadata->comm_length = comm_length;
payload += comm_length;
}
return (void*)payload;
}
static INLINE struct var_kill_data_t*
get_var_kill_data(struct pt_regs* ctx, int spid, int tpid, int sig)
{
int zero = 0;
struct var_kill_data_t* kill_data = bpf_map_lookup_elem(&data_heap, &zero);
if (kill_data == NULL)
return NULL;
struct task_struct* task = (struct task_struct*)bpf_get_current_task();
void* payload = populate_var_metadata(&kill_data->meta, task, spid, kill_data->payload);
payload = populate_cgroup_info(&kill_data->cgroup_data, task, payload);
size_t payload_length = payload - (void*)kill_data->payload;
kill_data->payload_length = payload_length;
populate_ancestors(task, &kill_data->ancestors_info);
kill_data->meta.type = KILL_EVENT;
kill_data->kill_target_pid = tpid;
kill_data->kill_sig = sig;
kill_data->kill_count = 1;
kill_data->last_kill_time = bpf_ktime_get_ns();
return kill_data;
}
static INLINE int trace_var_sys_kill(void* ctx, int tpid, int sig)
{
if ((KILL_SIGNALS & (1ULL << sig)) == 0)
return 0;
u32 spid = get_userspace_pid();
struct var_kill_data_arr_t* arr_struct = bpf_map_lookup_elem(&var_tpid_to_data, &tpid);
if (arr_struct == NULL) {
struct var_kill_data_t* kill_data = get_var_kill_data(ctx, spid, tpid, sig);
int zero = 0;
if (kill_data == NULL)
return 0;
arr_struct = bpf_map_lookup_elem(&data_heap, &zero);
if (arr_struct == NULL)
return 0;
bpf_probe_read(&arr_struct->array[0], sizeof(arr_struct->array[0]), kill_data);
} else {
int index = get_var_spid_index(arr_struct, spid);
if (index == -1) {
struct var_kill_data_t* kill_data =
get_var_kill_data(ctx, spid, tpid, sig);
if (kill_data == NULL)
return 0;
#ifdef UNROLL
#pragma unroll
#endif
for (int i = 0; i < ARRAY_SIZE(arr_struct->array); i++)
if (arr_struct->array[i].meta.pid == 0) {
bpf_probe_read(&arr_struct->array[i],
sizeof(arr_struct->array[i]), kill_data);
bpf_map_update_elem(&var_tpid_to_data, &tpid,
arr_struct, 0);
return 0;
}
return 0;
}
struct var_kill_data_t* kill_data = &arr_struct->array[index];
u64 delta_sec =
(bpf_ktime_get_ns() - kill_data->last_kill_time) / 1000000000;
if (delta_sec < STALE_INFO) {
kill_data->kill_count++;
kill_data->last_kill_time = bpf_ktime_get_ns();
bpf_probe_read(&arr_struct->array[index],
sizeof(arr_struct->array[index]),
kill_data);
} else {
struct var_kill_data_t* kill_data =
get_var_kill_data(ctx, spid, tpid, sig);
if (kill_data == NULL)
return 0;
bpf_probe_read(&arr_struct->array[index],
sizeof(arr_struct->array[index]),
kill_data);
}
}
bpf_map_update_elem(&var_tpid_to_data, &tpid, arr_struct, 0);
return 0;
}
static INLINE void bpf_stats_enter(struct bpf_func_stats_ctx* bpf_stat_ctx,
enum bpf_function_id func_id)
{
int func_id_key = func_id;
bpf_stat_ctx->start_time_ns = bpf_ktime_get_ns();
bpf_stat_ctx->bpf_func_stats_data_val =
bpf_map_lookup_elem(&bpf_func_stats, &func_id_key);
if (bpf_stat_ctx->bpf_func_stats_data_val)
bpf_stat_ctx->bpf_func_stats_data_val->num_executions++;
}
static INLINE void bpf_stats_exit(struct bpf_func_stats_ctx* bpf_stat_ctx)
{
if (bpf_stat_ctx->bpf_func_stats_data_val)
bpf_stat_ctx->bpf_func_stats_data_val->time_elapsed_ns +=
bpf_ktime_get_ns() - bpf_stat_ctx->start_time_ns;
}
static INLINE void
bpf_stats_pre_submit_var_perf_event(struct bpf_func_stats_ctx* bpf_stat_ctx,
struct var_metadata_t* meta)
{
if (bpf_stat_ctx->bpf_func_stats_data_val) {
bpf_stat_ctx->bpf_func_stats_data_val->num_perf_events++;
meta->bpf_stats_num_perf_events =
bpf_stat_ctx->bpf_func_stats_data_val->num_perf_events;
}
meta->bpf_stats_start_ktime_ns = bpf_stat_ctx->start_time_ns;
meta->cpu_id = bpf_get_smp_processor_id();
}
static INLINE size_t
read_absolute_file_path_from_dentry(struct dentry* filp_dentry, void* payload)
{
size_t length = 0;
size_t filepart_length;
struct dentry* parent_dentry;
#ifdef UNROLL
#pragma unroll
#endif
for (int i = 0; i < MAX_PATH_DEPTH; i++) {
filepart_length = bpf_probe_read_str(payload, MAX_PATH,
BPF_CORE_READ(filp_dentry, d_name.name));
barrier_var(filepart_length);
if (filepart_length > MAX_PATH)
break;
barrier_var(filepart_length);
payload += filepart_length;
length += filepart_length;
parent_dentry = BPF_CORE_READ(filp_dentry, d_parent);
if (filp_dentry == parent_dentry)
break;
filp_dentry = parent_dentry;
}
return length;
}
static INLINE bool
is_ancestor_in_allowed_inodes(struct dentry* filp_dentry)
{
struct dentry* parent_dentry;
#ifdef UNROLL
#pragma unroll
#endif
for (int i = 0; i < MAX_PATH_DEPTH; i++) {
u64 dir_ino = BPF_CORE_READ(filp_dentry, d_inode, i_ino);
bool* allowed_dir = bpf_map_lookup_elem(&allowed_directory_inodes, &dir_ino);
if (allowed_dir != NULL)
return true;
parent_dentry = BPF_CORE_READ(filp_dentry, d_parent);
if (filp_dentry == parent_dentry)
break;
filp_dentry = parent_dentry;
}
return false;
}
static INLINE bool is_dentry_allowed_for_filemod(struct dentry* file_dentry,
u32* device_id,
u64* file_ino)
{
u32 dev_id = BPF_CORE_READ(file_dentry, d_sb, s_dev);
*device_id = dev_id;
bool* allowed_device = bpf_map_lookup_elem(&allowed_devices, &dev_id);
if (allowed_device == NULL)
return false;
u64 ino = BPF_CORE_READ(file_dentry, d_inode, i_ino);
*file_ino = ino;
bool* allowed_file = bpf_map_lookup_elem(&allowed_file_inodes, &ino);
if (allowed_file == NULL)
if (!is_ancestor_in_allowed_inodes(BPF_CORE_READ(file_dentry, d_parent)))
return false;
return true;
}
SEC("kprobe/proc_sys_write")
ssize_t BPF_KPROBE(kprobe__proc_sys_write,
struct file* filp, const char* buf,
size_t count, loff_t* ppos)
{
struct bpf_func_stats_ctx stats_ctx;
bpf_stats_enter(&stats_ctx, profiler_bpf_proc_sys_write);
u32 pid = get_userspace_pid();
int zero = 0;
struct var_sysctl_data_t* sysctl_data =
bpf_map_lookup_elem(&data_heap, &zero);
if (!sysctl_data)
goto out;
struct task_struct* task = (struct task_struct*)bpf_get_current_task();
sysctl_data->meta.type = SYSCTL_EVENT;
void* payload = populate_var_metadata(&sysctl_data->meta, task, pid, sysctl_data->payload);
payload = populate_cgroup_info(&sysctl_data->cgroup_data, task, payload);
populate_ancestors(task, &sysctl_data->ancestors_info);
sysctl_data->sysctl_val_length = 0;
sysctl_data->sysctl_path_length = 0;
size_t sysctl_val_length = bpf_probe_read_str(payload, CTL_MAXNAME, buf);
barrier_var(sysctl_val_length);
if (sysctl_val_length <= CTL_MAXNAME) {
barrier_var(sysctl_val_length);
sysctl_data->sysctl_val_length = sysctl_val_length;
payload += sysctl_val_length;
}
size_t sysctl_path_length = bpf_probe_read_str(payload, MAX_PATH,
BPF_CORE_READ(filp, f_path.dentry, d_name.name));
barrier_var(sysctl_path_length);
if (sysctl_path_length <= MAX_PATH) {
barrier_var(sysctl_path_length);
sysctl_data->sysctl_path_length = sysctl_path_length;
payload += sysctl_path_length;
}
bpf_stats_pre_submit_var_perf_event(&stats_ctx, &sysctl_data->meta);
unsigned long data_len = payload - (void*)sysctl_data;
data_len = data_len > sizeof(struct var_sysctl_data_t)
? sizeof(struct var_sysctl_data_t)
: data_len;
bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, sysctl_data, data_len);
out:
bpf_stats_exit(&stats_ctx);
return 0;
}
SEC("tracepoint/syscalls/sys_enter_kill")
int tracepoint__syscalls__sys_enter_kill(struct trace_event_raw_sys_enter* ctx)
{
struct bpf_func_stats_ctx stats_ctx;
bpf_stats_enter(&stats_ctx, profiler_bpf_sys_enter_kill);
int pid = ctx->args[0];
int sig = ctx->args[1];
int ret = trace_var_sys_kill(ctx, pid, sig);
bpf_stats_exit(&stats_ctx);
return ret;
}
SEC("raw_tracepoint/sched_process_exit")
int raw_tracepoint__sched_process_exit(void* ctx)
{
int zero = 0;
struct bpf_func_stats_ctx stats_ctx;
bpf_stats_enter(&stats_ctx, profiler_bpf_sched_process_exit);
u32 tpid = get_userspace_pid();
struct var_kill_data_arr_t* arr_struct = bpf_map_lookup_elem(&var_tpid_to_data, &tpid);
struct var_kill_data_t* kill_data = bpf_map_lookup_elem(&data_heap, &zero);
if (arr_struct == NULL || kill_data == NULL)
goto out;
struct task_struct* task = (struct task_struct*)bpf_get_current_task();
struct kernfs_node* proc_kernfs = BPF_CORE_READ(task, cgroups, dfl_cgrp, kn);
#ifdef UNROLL
#pragma unroll
#endif
for (int i = 0; i < ARRAY_SIZE(arr_struct->array); i++) {
struct var_kill_data_t* past_kill_data = &arr_struct->array[i];
if (past_kill_data != NULL && past_kill_data->kill_target_pid == tpid) {
bpf_probe_read(kill_data, sizeof(*past_kill_data), past_kill_data);
void* payload = kill_data->payload;
size_t offset = kill_data->payload_length;
if (offset >= MAX_METADATA_PAYLOAD_LEN + MAX_CGROUP_PAYLOAD_LEN)
return 0;
payload += offset;
kill_data->kill_target_name_length = 0;
kill_data->kill_target_cgroup_proc_length = 0;
size_t comm_length = bpf_core_read_str(payload, TASK_COMM_LEN, &task->comm);
barrier_var(comm_length);
if (comm_length <= TASK_COMM_LEN) {
barrier_var(comm_length);
kill_data->kill_target_name_length = comm_length;
payload += comm_length;
}
size_t cgroup_proc_length = bpf_probe_read_str(payload, KILL_TARGET_LEN,
BPF_CORE_READ(proc_kernfs, name));
barrier_var(cgroup_proc_length);
if (cgroup_proc_length <= KILL_TARGET_LEN) {
barrier_var(cgroup_proc_length);
kill_data->kill_target_cgroup_proc_length = cgroup_proc_length;
payload += cgroup_proc_length;
}
bpf_stats_pre_submit_var_perf_event(&stats_ctx, &kill_data->meta);
unsigned long data_len = (void*)payload - (void*)kill_data;
data_len = data_len > sizeof(struct var_kill_data_t)
? sizeof(struct var_kill_data_t)
: data_len;
bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, kill_data, data_len);
}
}
bpf_map_delete_elem(&var_tpid_to_data, &tpid);
out:
bpf_stats_exit(&stats_ctx);
return 0;
}
SEC("raw_tracepoint/sched_process_exec")
int raw_tracepoint__sched_process_exec(struct bpf_raw_tracepoint_args* ctx)
{
struct bpf_func_stats_ctx stats_ctx;
bpf_stats_enter(&stats_ctx, profiler_bpf_sched_process_exec);
struct linux_binprm* bprm = (struct linux_binprm*)ctx->args[2];
u64 inode = BPF_CORE_READ(bprm, file, f_inode, i_ino);
bool* should_filter_binprm = bpf_map_lookup_elem(&disallowed_exec_inodes, &inode);
if (should_filter_binprm != NULL)
goto out;
int zero = 0;
struct var_exec_data_t* proc_exec_data = bpf_map_lookup_elem(&data_heap, &zero);
if (!proc_exec_data)
goto out;
if (INODE_FILTER && inode != INODE_FILTER)
return 0;
u32 pid = get_userspace_pid();
struct task_struct* task = (struct task_struct*)bpf_get_current_task();
proc_exec_data->meta.type = EXEC_EVENT;
proc_exec_data->bin_path_length = 0;
proc_exec_data->cmdline_length = 0;
proc_exec_data->environment_length = 0;
void* payload = populate_var_metadata(&proc_exec_data->meta, task, pid,
proc_exec_data->payload);
payload = populate_cgroup_info(&proc_exec_data->cgroup_data, task, payload);
struct task_struct* parent_task = BPF_CORE_READ(task, real_parent);
proc_exec_data->parent_pid = BPF_CORE_READ(parent_task, tgid);
proc_exec_data->parent_uid = BPF_CORE_READ(parent_task, real_cred, uid.val);
proc_exec_data->parent_exec_id = BPF_CORE_READ(parent_task, self_exec_id);
proc_exec_data->parent_start_time = BPF_CORE_READ(parent_task, start_time);
const char* filename = BPF_CORE_READ(bprm, filename);
size_t bin_path_length = bpf_probe_read_str(payload, MAX_FILENAME_LEN, filename);
barrier_var(bin_path_length);
if (bin_path_length <= MAX_FILENAME_LEN) {
barrier_var(bin_path_length);
proc_exec_data->bin_path_length = bin_path_length;
payload += bin_path_length;
}
void* arg_start = (void*)BPF_CORE_READ(task, mm, arg_start);
void* arg_end = (void*)BPF_CORE_READ(task, mm, arg_end);
unsigned int cmdline_length = probe_read_lim(payload, arg_start,
arg_end - arg_start, MAX_ARGS_LEN);
if (cmdline_length <= MAX_ARGS_LEN) {
barrier_var(cmdline_length);
proc_exec_data->cmdline_length = cmdline_length;
payload += cmdline_length;
}
if (READ_ENVIRON_FROM_EXEC) {
void* env_start = (void*)BPF_CORE_READ(task, mm, env_start);
void* env_end = (void*)BPF_CORE_READ(task, mm, env_end);
unsigned long env_len = probe_read_lim(payload, env_start,
env_end - env_start, MAX_ENVIRON_LEN);
if (env_len <= MAX_ENVIRON_LEN) {
proc_exec_data->environment_length = env_len;
payload += env_len;
}
}
bpf_stats_pre_submit_var_perf_event(&stats_ctx, &proc_exec_data->meta);
unsigned long data_len = payload - (void*)proc_exec_data;
data_len = data_len > sizeof(struct var_exec_data_t)
? sizeof(struct var_exec_data_t)
: data_len;
bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, proc_exec_data, data_len);
out:
bpf_stats_exit(&stats_ctx);
return 0;
}
SEC("kretprobe/do_filp_open")
int kprobe_ret__do_filp_open(struct pt_regs* ctx)
{
struct bpf_func_stats_ctx stats_ctx;
bpf_stats_enter(&stats_ctx, profiler_bpf_do_filp_open_ret);
struct file* filp = (struct file*)PT_REGS_RC_CORE(ctx);
if (filp == NULL || IS_ERR(filp))
goto out;
unsigned int flags = BPF_CORE_READ(filp, f_flags);
if ((flags & (O_RDWR | O_WRONLY)) == 0)
goto out;
if ((flags & O_TMPFILE) > 0)
goto out;
struct inode* file_inode = BPF_CORE_READ(filp, f_inode);
umode_t mode = BPF_CORE_READ(file_inode, i_mode);
if (S_ISDIR(mode) || S_ISCHR(mode) || S_ISBLK(mode) || S_ISFIFO(mode) ||
S_ISSOCK(mode))
goto out;
struct dentry* filp_dentry = BPF_CORE_READ(filp, f_path.dentry);
u32 device_id = 0;
u64 file_ino = 0;
if (!is_dentry_allowed_for_filemod(filp_dentry, &device_id, &file_ino))
goto out;
int zero = 0;
struct var_filemod_data_t* filemod_data = bpf_map_lookup_elem(&data_heap, &zero);
if (!filemod_data)
goto out;
u32 pid = get_userspace_pid();
struct task_struct* task = (struct task_struct*)bpf_get_current_task();
filemod_data->meta.type = FILEMOD_EVENT;
filemod_data->fmod_type = FMOD_OPEN;
filemod_data->dst_flags = flags;
filemod_data->src_inode = 0;
filemod_data->dst_inode = file_ino;
filemod_data->src_device_id = 0;
filemod_data->dst_device_id = device_id;
filemod_data->src_filepath_length = 0;
filemod_data->dst_filepath_length = 0;
void* payload = populate_var_metadata(&filemod_data->meta, task, pid,
filemod_data->payload);
payload = populate_cgroup_info(&filemod_data->cgroup_data, task, payload);
size_t len = read_absolute_file_path_from_dentry(filp_dentry, payload);
barrier_var(len);
if (len <= MAX_FILEPATH_LENGTH) {
barrier_var(len);
payload += len;
filemod_data->dst_filepath_length = len;
}
bpf_stats_pre_submit_var_perf_event(&stats_ctx, &filemod_data->meta);
unsigned long data_len = payload - (void*)filemod_data;
data_len = data_len > sizeof(*filemod_data) ? sizeof(*filemod_data) : data_len;
bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, filemod_data, data_len);
out:
bpf_stats_exit(&stats_ctx);
return 0;
}
SEC("kprobe/vfs_link")
int BPF_KPROBE(kprobe__vfs_link,
struct dentry* old_dentry, struct inode* dir,
struct dentry* new_dentry, struct inode** delegated_inode)
{
struct bpf_func_stats_ctx stats_ctx;
bpf_stats_enter(&stats_ctx, profiler_bpf_vfs_link);
u32 src_device_id = 0;
u64 src_file_ino = 0;
u32 dst_device_id = 0;
u64 dst_file_ino = 0;
if (!is_dentry_allowed_for_filemod(old_dentry, &src_device_id, &src_file_ino) &&
!is_dentry_allowed_for_filemod(new_dentry, &dst_device_id, &dst_file_ino))
goto out;
int zero = 0;
struct var_filemod_data_t* filemod_data = bpf_map_lookup_elem(&data_heap, &zero);
if (!filemod_data)
goto out;
u32 pid = get_userspace_pid();
struct task_struct* task = (struct task_struct*)bpf_get_current_task();
filemod_data->meta.type = FILEMOD_EVENT;
filemod_data->fmod_type = FMOD_LINK;
filemod_data->dst_flags = 0;
filemod_data->src_inode = src_file_ino;
filemod_data->dst_inode = dst_file_ino;
filemod_data->src_device_id = src_device_id;
filemod_data->dst_device_id = dst_device_id;
filemod_data->src_filepath_length = 0;
filemod_data->dst_filepath_length = 0;
void* payload = populate_var_metadata(&filemod_data->meta, task, pid,
filemod_data->payload);
payload = populate_cgroup_info(&filemod_data->cgroup_data, task, payload);
size_t len = read_absolute_file_path_from_dentry(old_dentry, payload);
barrier_var(len);
if (len <= MAX_FILEPATH_LENGTH) {
barrier_var(len);
payload += len;
filemod_data->src_filepath_length = len;
}
len = read_absolute_file_path_from_dentry(new_dentry, payload);
barrier_var(len);
if (len <= MAX_FILEPATH_LENGTH) {
barrier_var(len);
payload += len;
filemod_data->dst_filepath_length = len;
}
bpf_stats_pre_submit_var_perf_event(&stats_ctx, &filemod_data->meta);
unsigned long data_len = payload - (void*)filemod_data;
data_len = data_len > sizeof(*filemod_data) ? sizeof(*filemod_data) : data_len;
bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, filemod_data, data_len);
out:
bpf_stats_exit(&stats_ctx);
return 0;
}
SEC("kprobe/vfs_symlink")
int BPF_KPROBE(kprobe__vfs_symlink, struct inode* dir, struct dentry* dentry,
const char* oldname)
{
struct bpf_func_stats_ctx stats_ctx;
bpf_stats_enter(&stats_ctx, profiler_bpf_vfs_symlink);
u32 dst_device_id = 0;
u64 dst_file_ino = 0;
if (!is_dentry_allowed_for_filemod(dentry, &dst_device_id, &dst_file_ino))
goto out;
int zero = 0;
struct var_filemod_data_t* filemod_data = bpf_map_lookup_elem(&data_heap, &zero);
if (!filemod_data)
goto out;
u32 pid = get_userspace_pid();
struct task_struct* task = (struct task_struct*)bpf_get_current_task();
filemod_data->meta.type = FILEMOD_EVENT;
filemod_data->fmod_type = FMOD_SYMLINK;
filemod_data->dst_flags = 0;
filemod_data->src_inode = 0;
filemod_data->dst_inode = dst_file_ino;
filemod_data->src_device_id = 0;
filemod_data->dst_device_id = dst_device_id;
filemod_data->src_filepath_length = 0;
filemod_data->dst_filepath_length = 0;
void* payload = populate_var_metadata(&filemod_data->meta, task, pid,
filemod_data->payload);
payload = populate_cgroup_info(&filemod_data->cgroup_data, task, payload);
size_t len = bpf_probe_read_str(payload, MAX_FILEPATH_LENGTH, oldname);
barrier_var(len);
if (len <= MAX_FILEPATH_LENGTH) {
barrier_var(len);
payload += len;
filemod_data->src_filepath_length = len;
}
len = read_absolute_file_path_from_dentry(dentry, payload);
barrier_var(len);
if (len <= MAX_FILEPATH_LENGTH) {
barrier_var(len);
payload += len;
filemod_data->dst_filepath_length = len;
}
bpf_stats_pre_submit_var_perf_event(&stats_ctx, &filemod_data->meta);
unsigned long data_len = payload - (void*)filemod_data;
data_len = data_len > sizeof(*filemod_data) ? sizeof(*filemod_data) : data_len;
bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, filemod_data, data_len);
out:
bpf_stats_exit(&stats_ctx);
return 0;
}
SEC("raw_tracepoint/sched_process_fork")
int raw_tracepoint__sched_process_fork(struct bpf_raw_tracepoint_args* ctx)
{
struct bpf_func_stats_ctx stats_ctx;
bpf_stats_enter(&stats_ctx, profiler_bpf_sched_process_fork);
int zero = 0;
struct var_fork_data_t* fork_data = bpf_map_lookup_elem(&data_heap, &zero);
if (!fork_data)
goto out;
struct task_struct* parent = (struct task_struct*)ctx->args[0];
struct task_struct* child = (struct task_struct*)ctx->args[1];
fork_data->meta.type = FORK_EVENT;
void* payload = populate_var_metadata(&fork_data->meta, child,
BPF_CORE_READ(child, pid), fork_data->payload);
fork_data->parent_pid = BPF_CORE_READ(parent, pid);
fork_data->parent_exec_id = BPF_CORE_READ(parent, self_exec_id);
fork_data->parent_start_time = BPF_CORE_READ(parent, start_time);
bpf_stats_pre_submit_var_perf_event(&stats_ctx, &fork_data->meta);
unsigned long data_len = payload - (void*)fork_data;
data_len = data_len > sizeof(*fork_data) ? sizeof(*fork_data) : data_len;
bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, fork_data, data_len);
out:
bpf_stats_exit(&stats_ctx);
return 0;
}
char _license[] SEC("license") = "GPL";

View File

@ -0,0 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2020 Facebook */
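/* Variant 1: compiler barriers enabled, loops unrolled, helpers force-inlined. */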
#define barrier_var(var) asm volatile("" : "=r"(var) : "0"(var))
#define UNROLL
#define INLINE __always_inline
#include "profiler.inc.h"

View File

@ -0,0 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2020 Facebook */
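/* Variant 2: no compiler barriers, no loop unrolling, default inlining. */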
#define barrier_var(var) /**/
/* undef #define UNROLL */
#define INLINE /**/
#include "profiler.inc.h"

View File

@ -0,0 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2020 Facebook */
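/* Variant 3: no compiler barriers, loops unrolled, helpers kept out of line. */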
#define barrier_var(var) /**/
#define UNROLL
#define INLINE __noinline
#include "profiler.inc.h"

View File

@ -41,6 +41,43 @@ struct outer_arr {
.values = { (void *)&inner_map1, 0, (void *)&inner_map2 },
};
struct inner_map_sz3 {
__uint(type, BPF_MAP_TYPE_ARRAY);
__uint(map_flags, BPF_F_INNER_MAP);
__uint(max_entries, 3);
__type(key, int);
__type(value, int);
} inner_map3 SEC(".maps"),
inner_map4 SEC(".maps");
struct inner_map_sz4 {
__uint(type, BPF_MAP_TYPE_ARRAY);
__uint(map_flags, BPF_F_INNER_MAP);
__uint(max_entries, 5);
__type(key, int);
__type(value, int);
} inner_map5 SEC(".maps");
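/* BPF_F_INNER_MAP relaxes the inner-map compatibility check on
 * max_entries, so the arrays sized 3 and 5 above can be used
 * interchangeably in the same outer array-of-maps below.
 */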
struct outer_arr_dyn {
__uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
__uint(max_entries, 3);
__uint(key_size, sizeof(int));
__uint(value_size, sizeof(int));
__array(values, struct {
__uint(type, BPF_MAP_TYPE_ARRAY);
__uint(map_flags, BPF_F_INNER_MAP);
__uint(max_entries, 1);
__type(key, int);
__type(value, int);
});
} outer_arr_dyn SEC(".maps") = {
.values = {
[0] = (void *)&inner_map3,
[1] = (void *)&inner_map4,
[2] = (void *)&inner_map5,
},
};
struct outer_hash {
__uint(type, BPF_MAP_TYPE_HASH_OF_MAPS);
__uint(max_entries, 5);
@ -101,6 +138,12 @@ int handle__sys_enter(void *ctx)
val = input + 1;
bpf_map_update_elem(inner_map, &key, &val, 0);
inner_map = bpf_map_lookup_elem(&outer_arr_dyn, &key);
if (!inner_map)
return 1;
val = input + 2;
bpf_map_update_elem(inner_map, &key, &val, 0);
return 0;
}

View File

@ -0,0 +1,172 @@
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2020 Facebook */
#include <linux/bpf.h>
#include <stdint.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_core_read.h>
char _license[] SEC("license") = "GPL";
/* fields of exactly the same size */
struct test_struct___samesize {
void *ptr;
unsigned long long val1;
unsigned int val2;
unsigned short val3;
unsigned char val4;
} __attribute__((preserve_access_index));
/* unsigned fields that have to be downsized by libbpf */
struct test_struct___downsize {
void *ptr;
unsigned long val1;
unsigned long val2;
unsigned long val3;
unsigned long val4;
/* total sz: 40 */
} __attribute__((preserve_access_index));
/* fields with signed integers of wrong size, should be rejected */
struct test_struct___signed {
void *ptr;
long val1;
long val2;
long val3;
long val4;
} __attribute__((preserve_access_index));
/* real layout and sizes according to test's (32-bit) BTF */
struct test_struct___real {
unsigned int ptr; /* can't use `void *`, it is always 8 bytes on the BPF target */
unsigned int val2;
unsigned long long val1;
unsigned short val3;
unsigned char val4;
unsigned char _pad;
/* total sz: 20 */
};
struct test_struct___real input = {
.ptr = 0x01020304,
.val1 = 0x1020304050607080,
.val2 = 0x0a0b0c0d,
.val3 = 0xfeed,
.val4 = 0xb9,
._pad = 0xff, /* make sure no accidental zeros are present */
};
unsigned long long ptr_samesized = 0;
unsigned long long val1_samesized = 0;
unsigned long long val2_samesized = 0;
unsigned long long val3_samesized = 0;
unsigned long long val4_samesized = 0;
struct test_struct___real output_samesized = {};
unsigned long long ptr_downsized = 0;
unsigned long long val1_downsized = 0;
unsigned long long val2_downsized = 0;
unsigned long long val3_downsized = 0;
unsigned long long val4_downsized = 0;
struct test_struct___real output_downsized = {};
unsigned long long ptr_probed = 0;
unsigned long long val1_probed = 0;
unsigned long long val2_probed = 0;
unsigned long long val3_probed = 0;
unsigned long long val4_probed = 0;
unsigned long long ptr_signed = 0;
unsigned long long val1_signed = 0;
unsigned long long val2_signed = 0;
unsigned long long val3_signed = 0;
unsigned long long val4_signed = 0;
struct test_struct___real output_signed = {};
SEC("raw_tp/sys_exit")
int handle_samesize(void *ctx)
{
struct test_struct___samesize *in = (void *)&input;
struct test_struct___samesize *out = (void *)&output_samesized;
ptr_samesized = (unsigned long long)in->ptr;
val1_samesized = in->val1;
val2_samesized = in->val2;
val3_samesized = in->val3;
val4_samesized = in->val4;
out->ptr = in->ptr;
out->val1 = in->val1;
out->val2 = in->val2;
out->val3 = in->val3;
out->val4 = in->val4;
return 0;
}
SEC("raw_tp/sys_exit")
int handle_downsize(void *ctx)
{
struct test_struct___downsize *in = (void *)&input;
struct test_struct___downsize *out = (void *)&output_downsized;
ptr_downsized = (unsigned long long)in->ptr;
val1_downsized = in->val1;
val2_downsized = in->val2;
val3_downsized = in->val3;
val4_downsized = in->val4;
out->ptr = in->ptr;
out->val1 = in->val1;
out->val2 = in->val2;
out->val3 = in->val3;
out->val4 = in->val4;
return 0;
}
SEC("raw_tp/sys_enter")
int handle_probed(void *ctx)
{
struct test_struct___downsize *in = (void *)&input;
__u64 tmp;
tmp = 0;
bpf_core_read(&tmp, bpf_core_field_size(in->ptr), &in->ptr);
ptr_probed = tmp;
tmp = 0;
bpf_core_read(&tmp, bpf_core_field_size(in->val1), &in->val1);
val1_probed = tmp;
tmp = 0;
bpf_core_read(&tmp, bpf_core_field_size(in->val2), &in->val2);
val2_probed = tmp;
tmp = 0;
bpf_core_read(&tmp, bpf_core_field_size(in->val3), &in->val3);
val3_probed = tmp;
tmp = 0;
bpf_core_read(&tmp, bpf_core_field_size(in->val4), &in->val4);
val4_probed = tmp;
return 0;
}
SEC("raw_tp/sys_enter")
int handle_signed(void *ctx)
{
struct test_struct___signed *in = (void *)&input;
struct test_struct___signed *out = (void *)&output_signed;
val2_signed = in->val2;
val3_signed = in->val3;
val4_signed = in->val4;
out->val2 = in->val2;
out->val3 = in->val3;
out->val4 = in->val4;
return 0;
}
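/* Illustrative expectation, assuming the 32-bit layout above: val2 in
 * handle_downsize() is a 4-byte field at offset 4 of test_struct___real,
 * so libbpf has to narrow the 8-byte load emitted for 'unsigned long val2'
 * into a 4-byte load plus zero extension. handle_signed() reads signed
 * fields of a mismatched size, which cannot be adjusted safely, so that
 * program is expected to be rejected at load time.
 */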

View File

@ -0,0 +1,55 @@
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2020 Google */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
__u64 out__runqueues_addr = -1;
__u64 out__bpf_prog_active_addr = -1;
__u32 out__rq_cpu = -1; /* percpu struct fields */
int out__bpf_prog_active = -1; /* percpu int */
__u32 out__this_rq_cpu = -1;
int out__this_bpf_prog_active = -1;
__u32 out__cpu_0_rq_cpu = -1; /* cpu_rq(0)->cpu */
extern const struct rq runqueues __ksym; /* struct type global var. */
extern const int bpf_prog_active __ksym; /* int type global var. */
SEC("raw_tp/sys_enter")
int handler(const void *ctx)
{
struct rq *rq;
int *active;
__u32 cpu;
out__runqueues_addr = (__u64)&runqueues;
out__bpf_prog_active_addr = (__u64)&bpf_prog_active;
cpu = bpf_get_smp_processor_id();
/* test bpf_per_cpu_ptr() */
rq = (struct rq *)bpf_per_cpu_ptr(&runqueues, cpu);
if (rq)
out__rq_cpu = rq->cpu;
active = (int *)bpf_per_cpu_ptr(&bpf_prog_active, cpu);
if (active)
out__bpf_prog_active = *active;
rq = (struct rq *)bpf_per_cpu_ptr(&runqueues, 0);
if (rq) /* should always be valid, but we can't spare the check. */
out__cpu_0_rq_cpu = rq->cpu;
/* test bpf_this_cpu_ptr */
rq = (struct rq *)bpf_this_cpu_ptr(&runqueues);
out__this_rq_cpu = rq->cpu;
active = (int *)bpf_this_cpu_ptr(&bpf_prog_active);
out__this_bpf_prog_active = *active;
return 0;
}
char _license[] SEC("license") = "GPL";

View File

@ -304,10 +304,10 @@ int misc_estab(struct bpf_sock_ops *skops)
passive_lport_n = __bpf_htons(passive_lport_h);
bpf_setsockopt(skops, SOL_TCP, TCP_SAVE_SYN,
&true_val, sizeof(true_val));
set_hdr_cb_flags(skops);
set_hdr_cb_flags(skops, 0);
break;
case BPF_SOCK_OPS_TCP_CONNECT_CB:
set_hdr_cb_flags(skops);
set_hdr_cb_flags(skops, 0);
break;
case BPF_SOCK_OPS_PARSE_HDR_OPT_CB:
return handle_parse_hdr(skops);

View File

@ -131,39 +131,55 @@ int bpf_prog2(struct __sk_buff *skb)
}
SEC("sk_skb3")
int bpf_prog3(struct __sk_buff *skb)
static inline void bpf_write_pass(struct __sk_buff *skb, int offset)
{
const int one = 1;
int err, *f, ret = SK_PASS;
int err = bpf_skb_pull_data(skb, 6 + offset);
void *data_end;
char *c;
err = bpf_skb_pull_data(skb, 19);
if (err)
goto tls_out;
return;
c = (char *)(long)skb->data;
data_end = (void *)(long)skb->data_end;
if (c + 18 < data_end)
memcpy(&c[13], "PASS", 4);
if (c + 5 + offset < data_end)
memcpy(c + offset, "PASS", 4);
}
SEC("sk_skb3")
int bpf_prog3(struct __sk_buff *skb)
{
int err, *f, ret = SK_PASS;
const int one = 1;
f = bpf_map_lookup_elem(&sock_skb_opts, &one);
if (f && *f) {
__u64 flags = 0;
ret = 0;
flags = *f;
err = bpf_skb_adjust_room(skb, -13, 0, 0);
if (err)
return SK_DROP;
err = bpf_skb_adjust_room(skb, 4, 0, 0);
if (err)
return SK_DROP;
bpf_write_pass(skb, 0);
#ifdef SOCKMAP
return bpf_sk_redirect_map(skb, &tls_sock_map, ret, flags);
#else
return bpf_sk_redirect_hash(skb, &tls_sock_map, &ret, flags);
#endif
}
f = bpf_map_lookup_elem(&sock_skb_opts, &one);
if (f && *f)
ret = SK_DROP;
err = bpf_skb_adjust_room(skb, 4, 0, 0);
if (err)
return SK_DROP;
bpf_write_pass(skb, 13);
tls_out:
return ret;
}

View File

@ -13,17 +13,10 @@
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>
#ifndef barrier_data
# define barrier_data(ptr) asm volatile("": :"r"(ptr) :"memory")
#endif
#ifndef ctx_ptr
# define ctx_ptr(field) (void *)(long)(field)
#endif
#define dst_to_src_tmp 0xeeddddeeU
#define src_to_dst_tmp 0xeeffffeeU
#define ip4_src 0xac100164 /* 172.16.1.100 */
#define ip4_dst 0xac100264 /* 172.16.2.100 */
@ -39,6 +32,18 @@
a.s6_addr32[3] == b.s6_addr32[3])
#endif
enum {
dev_src,
dev_dst,
};
struct bpf_map_def SEC("maps") ifindex_map = {
.type = BPF_MAP_TYPE_ARRAY,
.key_size = sizeof(int),
.value_size = sizeof(int),
.max_entries = 2,
};
static __always_inline bool is_remote_ep_v4(struct __sk_buff *skb,
__be32 addr)
{
@ -73,7 +78,14 @@ static __always_inline bool is_remote_ep_v6(struct __sk_buff *skb,
return v6_equal(ip6h->daddr, addr);
}
SEC("chk_neigh") int tc_chk(struct __sk_buff *skb)
static __always_inline int get_dev_ifindex(int which)
{
int *ifindex = bpf_map_lookup_elem(&ifindex_map, &which);
return ifindex ? *ifindex : 0;
}
SEC("chk_egress") int tc_chk(struct __sk_buff *skb)
{
void *data_end = ctx_ptr(skb->data_end);
void *data = ctx_ptr(skb->data);
@ -87,7 +99,6 @@ SEC("chk_neigh") int tc_chk(struct __sk_buff *skb)
SEC("dst_ingress") int tc_dst(struct __sk_buff *skb)
{
int idx = dst_to_src_tmp;
__u8 zero[ETH_ALEN * 2];
bool redirect = false;
@ -103,19 +114,15 @@ SEC("dst_ingress") int tc_dst(struct __sk_buff *skb)
if (!redirect)
return TC_ACT_OK;
barrier_data(&idx);
idx = bpf_ntohl(idx);
__builtin_memset(&zero, 0, sizeof(zero));
if (bpf_skb_store_bytes(skb, 0, &zero, sizeof(zero), 0) < 0)
return TC_ACT_SHOT;
return bpf_redirect_neigh(idx, 0);
return bpf_redirect_neigh(get_dev_ifindex(dev_src), 0);
}
SEC("src_ingress") int tc_src(struct __sk_buff *skb)
{
int idx = src_to_dst_tmp;
__u8 zero[ETH_ALEN * 2];
bool redirect = false;
@ -131,14 +138,11 @@ SEC("src_ingress") int tc_src(struct __sk_buff *skb)
if (!redirect)
return TC_ACT_OK;
barrier_data(&idx);
idx = bpf_ntohl(idx);
__builtin_memset(&zero, 0, sizeof(zero));
if (bpf_skb_store_bytes(skb, 0, &zero, sizeof(zero), 0) < 0)
return TC_ACT_SHOT;
return bpf_redirect_neigh(idx, 0);
return bpf_redirect_neigh(get_dev_ifindex(dev_dst), 0);
}
char __license[] SEC("license") = "GPL";

View File

@ -0,0 +1,45 @@
// SPDX-License-Identifier: GPL-2.0
#include <stdint.h>
#include <stdbool.h>
#include <linux/bpf.h>
#include <linux/stddef.h>
#include <linux/pkt_cls.h>
#include <bpf/bpf_helpers.h>
enum {
dev_src,
dev_dst,
};
struct bpf_map_def SEC("maps") ifindex_map = {
.type = BPF_MAP_TYPE_ARRAY,
.key_size = sizeof(int),
.value_size = sizeof(int),
.max_entries = 2,
};
static __always_inline int get_dev_ifindex(int which)
{
int *ifindex = bpf_map_lookup_elem(&ifindex_map, &which);
return ifindex ? *ifindex : 0;
}
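/* bpf_redirect_peer() switches the skb into the peer device's netns
 * already at ingress, so nothing should reach this egress hook on the
 * fwd side; any packet seen here is unexpected and gets dropped.
 */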
SEC("chk_egress") int tc_chk(struct __sk_buff *skb)
{
return TC_ACT_SHOT;
}
SEC("dst_ingress") int tc_dst(struct __sk_buff *skb)
{
return bpf_redirect_peer(get_dev_ifindex(dev_src), 0);
}
SEC("src_ingress") int tc_src(struct __sk_buff *skb)
{
return bpf_redirect_peer(get_dev_ifindex(dev_dst), 0);
}
char __license[] SEC("license") = "GPL";

View File

@ -21,6 +21,7 @@
__u8 test_kind = TCPOPT_EXP;
__u16 test_magic = 0xeB9F;
__u32 inherit_cb_flags = 0;
struct bpf_test_option passive_synack_out = {};
struct bpf_test_option passive_fin_out = {};
@ -467,6 +468,8 @@ static int handle_passive_estab(struct bpf_sock_ops *skops)
struct tcphdr *th;
int err;
inherit_cb_flags = skops->bpf_sock_ops_cb_flags;
err = load_option(skops, &passive_estab_in, true);
if (err == -ENOENT) {
/* saved_syn is not found. It was in syncookie mode.
@ -600,10 +603,10 @@ int estab(struct bpf_sock_ops *skops)
case BPF_SOCK_OPS_TCP_LISTEN_CB:
bpf_setsockopt(skops, SOL_TCP, TCP_SAVE_SYN,
&true_val, sizeof(true_val));
set_hdr_cb_flags(skops);
set_hdr_cb_flags(skops, BPF_SOCK_OPS_STATE_CB_FLAG);
break;
case BPF_SOCK_OPS_TCP_CONNECT_CB:
set_hdr_cb_flags(skops);
set_hdr_cb_flags(skops, 0);
break;
case BPF_SOCK_OPS_PARSE_HDR_OPT_CB:
return handle_parse_hdr(skops);

View File

@ -86,6 +86,7 @@ int txmsg_ktls_skb_redir;
int ktls;
int peek_flag;
int skb_use_parser;
int txmsg_omit_skb_parser;
static const struct option long_options[] = {
{"help", no_argument, NULL, 'h' },
@ -111,6 +112,7 @@ static const struct option long_options[] = {
{"txmsg_redir_skb", no_argument, &txmsg_redir_skb, 1 },
{"ktls", no_argument, &ktls, 1 },
{"peek", no_argument, &peek_flag, 1 },
{"txmsg_omit_skb_parser", no_argument, &txmsg_omit_skb_parser, 1},
{"whitelist", required_argument, NULL, 'n' },
{"blacklist", required_argument, NULL, 'b' },
{0, 0, NULL, 0 }
@ -175,6 +177,7 @@ static void test_reset(void)
txmsg_apply = txmsg_cork = 0;
txmsg_ingress = txmsg_redir_skb = 0;
txmsg_ktls_skb = txmsg_ktls_skb_drop = txmsg_ktls_skb_redir = 0;
txmsg_omit_skb_parser = 0;
skb_use_parser = 0;
}
@ -518,28 +521,13 @@ static int msg_verify_data(struct msghdr *msg, int size, int chunk_sz)
if (i == 0 && txmsg_ktls_skb) {
if (msg->msg_iov[i].iov_len < 4)
return -EIO;
if (txmsg_ktls_skb_redir) {
if (memcmp(&d[13], "PASS", 4) != 0) {
fprintf(stderr,
"detected redirect ktls_skb data error with skb ingress update @iov[%i]:%i \"%02x %02x %02x %02x\" != \"PASS\"\n", i, 0, d[13], d[14], d[15], d[16]);
return -EIO;
}
d[13] = 0;
d[14] = 1;
d[15] = 2;
d[16] = 3;
j = 13;
} else if (txmsg_ktls_skb) {
if (memcmp(d, "PASS", 4) != 0) {
fprintf(stderr,
"detected ktls_skb data error with skb ingress update @iov[%i]:%i \"%02x %02x %02x %02x\" != \"PASS\"\n", i, 0, d[0], d[1], d[2], d[3]);
return -EIO;
}
d[0] = 0;
d[1] = 1;
d[2] = 2;
d[3] = 3;
if (memcmp(d, "PASS", 4) != 0) {
fprintf(stderr,
"detected skb data error with skb ingress update @iov[%i]:%i \"%02x %02x %02x %02x\" != \"PASS\"\n",
i, 0, d[0], d[1], d[2], d[3]);
return -EIO;
}
j = 4; /* advance index past PASS header */
}
for (; j < msg->msg_iov[i].iov_len && size; j++) {
@ -927,13 +915,15 @@ static int run_options(struct sockmap_options *options, int cg_fd, int test)
goto run;
/* Attach programs to sockmap */
err = bpf_prog_attach(prog_fd[0], map_fd[0],
BPF_SK_SKB_STREAM_PARSER, 0);
if (err) {
fprintf(stderr,
"ERROR: bpf_prog_attach (sockmap %i->%i): %d (%s)\n",
prog_fd[0], map_fd[0], err, strerror(errno));
return err;
if (!txmsg_omit_skb_parser) {
err = bpf_prog_attach(prog_fd[0], map_fd[0],
BPF_SK_SKB_STREAM_PARSER, 0);
if (err) {
fprintf(stderr,
"ERROR: bpf_prog_attach (sockmap %i->%i): %d (%s)\n",
prog_fd[0], map_fd[0], err, strerror(errno));
return err;
}
}
err = bpf_prog_attach(prog_fd[1], map_fd[0],
@ -946,13 +936,15 @@ static int run_options(struct sockmap_options *options, int cg_fd, int test)
/* Attach programs to TLS sockmap */
if (txmsg_ktls_skb) {
err = bpf_prog_attach(prog_fd[0], map_fd[8],
BPF_SK_SKB_STREAM_PARSER, 0);
if (err) {
fprintf(stderr,
"ERROR: bpf_prog_attach (TLS sockmap %i->%i): %d (%s)\n",
prog_fd[0], map_fd[8], err, strerror(errno));
return err;
if (!txmsg_omit_skb_parser) {
err = bpf_prog_attach(prog_fd[0], map_fd[8],
BPF_SK_SKB_STREAM_PARSER, 0);
if (err) {
fprintf(stderr,
"ERROR: bpf_prog_attach (TLS sockmap %i->%i): %d (%s)\n",
prog_fd[0], map_fd[8], err, strerror(errno));
return err;
}
}
err = bpf_prog_attach(prog_fd[2], map_fd[8],
@ -1480,12 +1472,29 @@ static void test_txmsg_skb(int cgrp, struct sockmap_options *opt)
txmsg_ktls_skb_drop = 0;
txmsg_ktls_skb_redir = 1;
test_exec(cgrp, opt);
txmsg_ktls_skb_redir = 0;
/* Tests that omit skb_parser */
txmsg_omit_skb_parser = 1;
ktls = 0;
txmsg_ktls_skb = 0;
test_exec(cgrp, opt);
txmsg_ktls_skb_drop = 1;
test_exec(cgrp, opt);
txmsg_ktls_skb_drop = 0;
txmsg_ktls_skb_redir = 1;
test_exec(cgrp, opt);
ktls = 1;
test_exec(cgrp, opt);
txmsg_omit_skb_parser = 0;
opt->data_test = data;
ktls = k;
}
/* Test cork with hung data. This tests poor usage patterns where
* cork can leave data on the ring if user program is buggy and
* doesn't flush them somehow. They do take some time however

View File

@ -1,168 +0,0 @@
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
#
# This test sets up 3 netns (src <-> fwd <-> dst). There is no direct veth link
# between src and dst. The fwd netns has veth links to both src and dst. The
# client is in src and the server in dst. The test installs a TC BPF program on
# each host-facing veth in fwd which calls into bpf_redirect_neigh() to perform
# the neigh addr population and redirect; it also installs a dropper prog on the
# egress side to drop skbs if neigh addrs were not populated.
if [[ $EUID -ne 0 ]]; then
echo "This script must be run as root"
echo "FAIL"
exit 1
fi
# check that nc, dd, ping, ping6 and timeout are present
command -v nc >/dev/null 2>&1 || \
{ echo >&2 "nc is not available"; exit 1; }
command -v dd >/dev/null 2>&1 || \
{ echo >&2 "dd is not available"; exit 1; }
command -v timeout >/dev/null 2>&1 || \
{ echo >&2 "timeout is not available"; exit 1; }
command -v ping >/dev/null 2>&1 || \
{ echo >&2 "ping is not available"; exit 1; }
command -v ping6 >/dev/null 2>&1 || \
{ echo >&2 "ping6 is not available"; exit 1; }
readonly GREEN='\033[0;92m'
readonly RED='\033[0;31m'
readonly NC='\033[0m' # No Color
readonly PING_ARG="-c 3 -w 10 -q"
readonly TIMEOUT=10
readonly NS_SRC="ns-src-$(mktemp -u XXXXXX)"
readonly NS_FWD="ns-fwd-$(mktemp -u XXXXXX)"
readonly NS_DST="ns-dst-$(mktemp -u XXXXXX)"
readonly IP4_SRC="172.16.1.100"
readonly IP4_DST="172.16.2.100"
readonly IP6_SRC="::1:dead:beef:cafe"
readonly IP6_DST="::2:dead:beef:cafe"
readonly IP4_SLL="169.254.0.1"
readonly IP4_DLL="169.254.0.2"
readonly IP4_NET="169.254.0.0"
cleanup()
{
ip netns del ${NS_SRC}
ip netns del ${NS_FWD}
ip netns del ${NS_DST}
}
trap cleanup EXIT
set -e
ip netns add "${NS_SRC}"
ip netns add "${NS_FWD}"
ip netns add "${NS_DST}"
ip link add veth_src type veth peer name veth_src_fwd
ip link add veth_dst type veth peer name veth_dst_fwd
ip link set veth_src netns ${NS_SRC}
ip link set veth_src_fwd netns ${NS_FWD}
ip link set veth_dst netns ${NS_DST}
ip link set veth_dst_fwd netns ${NS_FWD}
ip -netns ${NS_SRC} addr add ${IP4_SRC}/32 dev veth_src
ip -netns ${NS_DST} addr add ${IP4_DST}/32 dev veth_dst
# The fwd netns automatically gets a v6 LL address / routes, but also needs a
# v4 one in order to start ARP probing. The IP4_NET route is added to the
# endpoints so that the ARP processing will reply.
ip -netns ${NS_FWD} addr add ${IP4_SLL}/32 dev veth_src_fwd
ip -netns ${NS_FWD} addr add ${IP4_DLL}/32 dev veth_dst_fwd
ip -netns ${NS_SRC} addr add ${IP6_SRC}/128 dev veth_src nodad
ip -netns ${NS_DST} addr add ${IP6_DST}/128 dev veth_dst nodad
ip -netns ${NS_SRC} link set dev veth_src up
ip -netns ${NS_FWD} link set dev veth_src_fwd up
ip -netns ${NS_DST} link set dev veth_dst up
ip -netns ${NS_FWD} link set dev veth_dst_fwd up
ip -netns ${NS_SRC} route add ${IP4_DST}/32 dev veth_src scope global
ip -netns ${NS_SRC} route add ${IP4_NET}/16 dev veth_src scope global
ip -netns ${NS_FWD} route add ${IP4_SRC}/32 dev veth_src_fwd scope global
ip -netns ${NS_SRC} route add ${IP6_DST}/128 dev veth_src scope global
ip -netns ${NS_FWD} route add ${IP6_SRC}/128 dev veth_src_fwd scope global
ip -netns ${NS_DST} route add ${IP4_SRC}/32 dev veth_dst scope global
ip -netns ${NS_DST} route add ${IP4_NET}/16 dev veth_dst scope global
ip -netns ${NS_FWD} route add ${IP4_DST}/32 dev veth_dst_fwd scope global
ip -netns ${NS_DST} route add ${IP6_SRC}/128 dev veth_dst scope global
ip -netns ${NS_FWD} route add ${IP6_DST}/128 dev veth_dst_fwd scope global
fmac_src=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_src_fwd/address)
fmac_dst=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_dst_fwd/address)
ip -netns ${NS_SRC} neigh add ${IP4_DST} dev veth_src lladdr $fmac_src
ip -netns ${NS_DST} neigh add ${IP4_SRC} dev veth_dst lladdr $fmac_dst
ip -netns ${NS_SRC} neigh add ${IP6_DST} dev veth_src lladdr $fmac_src
ip -netns ${NS_DST} neigh add ${IP6_SRC} dev veth_dst lladdr $fmac_dst
veth_dst=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_dst_fwd/ifindex | awk '{printf "%08x\n", $1}')
veth_src=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_src_fwd/ifindex | awk '{printf "%08x\n", $1}')
xxd -p < test_tc_neigh.o | sed "s/eeddddee/$veth_src/g" | xxd -r -p > test_tc_neigh.x.o
xxd -p < test_tc_neigh.x.o | sed "s/eeffffee/$veth_dst/g" | xxd -r -p > test_tc_neigh.y.o
ip netns exec ${NS_FWD} tc qdisc add dev veth_src_fwd clsact
ip netns exec ${NS_FWD} tc filter add dev veth_src_fwd ingress bpf da obj test_tc_neigh.y.o sec src_ingress
ip netns exec ${NS_FWD} tc filter add dev veth_src_fwd egress bpf da obj test_tc_neigh.y.o sec chk_neigh
ip netns exec ${NS_FWD} tc qdisc add dev veth_dst_fwd clsact
ip netns exec ${NS_FWD} tc filter add dev veth_dst_fwd ingress bpf da obj test_tc_neigh.y.o sec dst_ingress
ip netns exec ${NS_FWD} tc filter add dev veth_dst_fwd egress bpf da obj test_tc_neigh.y.o sec chk_neigh
rm -f test_tc_neigh.x.o test_tc_neigh.y.o
ip netns exec ${NS_DST} bash -c "nc -4 -l -p 9004 &"
ip netns exec ${NS_DST} bash -c "nc -6 -l -p 9006 &"
set +e
TEST="TCPv4 connectivity test"
ip netns exec ${NS_SRC} bash -c "timeout ${TIMEOUT} dd if=/dev/zero bs=1000 count=100 > /dev/tcp/${IP4_DST}/9004"
if [ $? -ne 0 ]; then
echo -e "${TEST}: ${RED}FAIL${NC}"
exit 1
fi
echo -e "${TEST}: ${GREEN}PASS${NC}"
TEST="TCPv6 connectivity test"
ip netns exec ${NS_SRC} bash -c "timeout ${TIMEOUT} dd if=/dev/zero bs=1000 count=100 > /dev/tcp/${IP6_DST}/9006"
if [ $? -ne 0 ]; then
echo -e "${TEST}: ${RED}FAIL${NC}"
exit 1
fi
echo -e "${TEST}: ${GREEN}PASS${NC}"
TEST="ICMPv4 connectivity test"
ip netns exec ${NS_SRC} ping $PING_ARG ${IP4_DST}
if [ $? -ne 0 ]; then
echo -e "${TEST}: ${RED}FAIL${NC}"
exit 1
fi
echo -e "${TEST}: ${GREEN}PASS${NC}"
TEST="ICMPv6 connectivity test"
ip netns exec ${NS_SRC} ping6 $PING_ARG ${IP6_DST}
if [ $? -ne 0 ]; then
echo -e "${TEST}: ${RED}FAIL${NC}"
exit 1
fi
echo -e "${TEST}: ${GREEN}PASS${NC}"

View File

@ -0,0 +1,204 @@
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
#
# This test sets up 3 netns (src <-> fwd <-> dst). There is no direct veth link
# between src and dst. The fwd netns has veth links to both src and dst. The
# client is in src and the server in dst. The test installs a TC BPF program
# on each host-facing veth in fwd which calls into i) bpf_redirect_neigh() to
# populate the neigh addrs and redirect, or ii) bpf_redirect_peer() to switch
# namespaces from the ingress side; it also installs a checker prog on the
# egress side to drop unexpected traffic.
if [[ $EUID -ne 0 ]]; then
echo "This script must be run as root"
echo "FAIL"
exit 1
fi
# check that needed tools are present
command -v nc >/dev/null 2>&1 || \
{ echo >&2 "nc is not available"; exit 1; }
command -v dd >/dev/null 2>&1 || \
{ echo >&2 "dd is not available"; exit 1; }
command -v timeout >/dev/null 2>&1 || \
{ echo >&2 "timeout is not available"; exit 1; }
command -v ping >/dev/null 2>&1 || \
{ echo >&2 "ping is not available"; exit 1; }
command -v ping6 >/dev/null 2>&1 || \
{ echo >&2 "ping6 is not available"; exit 1; }
command -v perl >/dev/null 2>&1 || \
{ echo >&2 "perl is not available"; exit 1; }
command -v jq >/dev/null 2>&1 || \
{ echo >&2 "jq is not available"; exit 1; }
command -v bpftool >/dev/null 2>&1 || \
{ echo >&2 "bpftool is not available"; exit 1; }
readonly GREEN='\033[0;92m'
readonly RED='\033[0;31m'
readonly NC='\033[0m' # No Color
readonly PING_ARG="-c 3 -w 10 -q"
readonly TIMEOUT=10
readonly NS_SRC="ns-src-$(mktemp -u XXXXXX)"
readonly NS_FWD="ns-fwd-$(mktemp -u XXXXXX)"
readonly NS_DST="ns-dst-$(mktemp -u XXXXXX)"
readonly IP4_SRC="172.16.1.100"
readonly IP4_DST="172.16.2.100"
readonly IP6_SRC="::1:dead:beef:cafe"
readonly IP6_DST="::2:dead:beef:cafe"
readonly IP4_SLL="169.254.0.1"
readonly IP4_DLL="169.254.0.2"
readonly IP4_NET="169.254.0.0"
netns_cleanup()
{
ip netns del ${NS_SRC}
ip netns del ${NS_FWD}
ip netns del ${NS_DST}
}
netns_setup()
{
ip netns add "${NS_SRC}"
ip netns add "${NS_FWD}"
ip netns add "${NS_DST}"
ip link add veth_src type veth peer name veth_src_fwd
ip link add veth_dst type veth peer name veth_dst_fwd
ip link set veth_src netns ${NS_SRC}
ip link set veth_src_fwd netns ${NS_FWD}
ip link set veth_dst netns ${NS_DST}
ip link set veth_dst_fwd netns ${NS_FWD}
ip -netns ${NS_SRC} addr add ${IP4_SRC}/32 dev veth_src
ip -netns ${NS_DST} addr add ${IP4_DST}/32 dev veth_dst
# The fwd netns automatically gets a v6 LL address / routes, but also
# needs a v4 one in order to start ARP probing. The IP4_NET route is
# added to the endpoints so that the ARP processing will reply.
ip -netns ${NS_FWD} addr add ${IP4_SLL}/32 dev veth_src_fwd
ip -netns ${NS_FWD} addr add ${IP4_DLL}/32 dev veth_dst_fwd
ip -netns ${NS_SRC} addr add ${IP6_SRC}/128 dev veth_src nodad
ip -netns ${NS_DST} addr add ${IP6_DST}/128 dev veth_dst nodad
ip -netns ${NS_SRC} link set dev veth_src up
ip -netns ${NS_FWD} link set dev veth_src_fwd up
ip -netns ${NS_DST} link set dev veth_dst up
ip -netns ${NS_FWD} link set dev veth_dst_fwd up
ip -netns ${NS_SRC} route add ${IP4_DST}/32 dev veth_src scope global
ip -netns ${NS_SRC} route add ${IP4_NET}/16 dev veth_src scope global
ip -netns ${NS_FWD} route add ${IP4_SRC}/32 dev veth_src_fwd scope global
ip -netns ${NS_SRC} route add ${IP6_DST}/128 dev veth_src scope global
ip -netns ${NS_FWD} route add ${IP6_SRC}/128 dev veth_src_fwd scope global
ip -netns ${NS_DST} route add ${IP4_SRC}/32 dev veth_dst scope global
ip -netns ${NS_DST} route add ${IP4_NET}/16 dev veth_dst scope global
ip -netns ${NS_FWD} route add ${IP4_DST}/32 dev veth_dst_fwd scope global
ip -netns ${NS_DST} route add ${IP6_SRC}/128 dev veth_dst scope global
ip -netns ${NS_FWD} route add ${IP6_DST}/128 dev veth_dst_fwd scope global
fmac_src=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_src_fwd/address)
fmac_dst=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_dst_fwd/address)
ip -netns ${NS_SRC} neigh add ${IP4_DST} dev veth_src lladdr $fmac_src
ip -netns ${NS_DST} neigh add ${IP4_SRC} dev veth_dst lladdr $fmac_dst
ip -netns ${NS_SRC} neigh add ${IP6_DST} dev veth_src lladdr $fmac_src
ip -netns ${NS_DST} neigh add ${IP6_SRC} dev veth_dst lladdr $fmac_dst
}
netns_test_connectivity()
{
set +e
ip netns exec ${NS_DST} bash -c "nc -4 -l -p 9004 &"
ip netns exec ${NS_DST} bash -c "nc -6 -l -p 9006 &"
TEST="TCPv4 connectivity test"
ip netns exec ${NS_SRC} bash -c "timeout ${TIMEOUT} dd if=/dev/zero bs=1000 count=100 > /dev/tcp/${IP4_DST}/9004"
if [ $? -ne 0 ]; then
echo -e "${TEST}: ${RED}FAIL${NC}"
exit 1
fi
echo -e "${TEST}: ${GREEN}PASS${NC}"
TEST="TCPv6 connectivity test"
ip netns exec ${NS_SRC} bash -c "timeout ${TIMEOUT} dd if=/dev/zero bs=1000 count=100 > /dev/tcp/${IP6_DST}/9006"
if [ $? -ne 0 ]; then
echo -e "${TEST}: ${RED}FAIL${NC}"
exit 1
fi
echo -e "${TEST}: ${GREEN}PASS${NC}"
TEST="ICMPv4 connectivity test"
ip netns exec ${NS_SRC} ping $PING_ARG ${IP4_DST}
if [ $? -ne 0 ]; then
echo -e "${TEST}: ${RED}FAIL${NC}"
exit 1
fi
echo -e "${TEST}: ${GREEN}PASS${NC}"
TEST="ICMPv6 connectivity test"
ip netns exec ${NS_SRC} ping6 $PING_ARG ${IP6_DST}
if [ $? -ne 0 ]; then
echo -e "${TEST}: ${RED}FAIL${NC}"
exit 1
fi
echo -e "${TEST}: ${GREEN}PASS${NC}"
set -e
}
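# Render an integer as space-separated hex bytes (native byte order),
# the format 'bpftool map update ... key hex / value hex' expects.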
hex_mem_str()
{
perl -e 'print join(" ", unpack("(H2)8", pack("L", @ARGV)))' $1
}
netns_setup_bpf()
{
local obj=$1
ip netns exec ${NS_FWD} tc qdisc add dev veth_src_fwd clsact
ip netns exec ${NS_FWD} tc filter add dev veth_src_fwd ingress bpf da obj $obj sec src_ingress
ip netns exec ${NS_FWD} tc filter add dev veth_src_fwd egress bpf da obj $obj sec chk_egress
ip netns exec ${NS_FWD} tc qdisc add dev veth_dst_fwd clsact
ip netns exec ${NS_FWD} tc filter add dev veth_dst_fwd ingress bpf da obj $obj sec dst_ingress
ip netns exec ${NS_FWD} tc filter add dev veth_dst_fwd egress bpf da obj $obj sec chk_egress
veth_src=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_src_fwd/ifindex)
veth_dst=$(ip netns exec ${NS_FWD} cat /sys/class/net/veth_dst_fwd/ifindex)
progs=$(ip netns exec ${NS_FWD} bpftool net --json | jq -r '.[] | .tc | map(.id) | .[]')
for prog in $progs; do
map=$(bpftool prog show id $prog --json | jq -r '.map_ids | .? | .[]')
if [ ! -z "$map" ]; then
bpftool map update id $map key hex $(hex_mem_str 0) value hex $(hex_mem_str $veth_src)
bpftool map update id $map key hex $(hex_mem_str 1) value hex $(hex_mem_str $veth_dst)
fi
done
}
trap netns_cleanup EXIT
set -e
netns_setup
netns_setup_bpf test_tc_neigh.o
netns_test_connectivity
netns_cleanup
netns_setup
netns_setup_bpf test_tc_peer.o
netns_test_connectivity

View File

@ -110,12 +110,13 @@ static inline void clear_hdr_cb_flags(struct bpf_sock_ops *skops)
BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG));
}
static inline void set_hdr_cb_flags(struct bpf_sock_ops *skops)
static inline void set_hdr_cb_flags(struct bpf_sock_ops *skops, __u32 extra)
{
bpf_sock_ops_cb_flags_set(skops,
skops->bpf_sock_ops_cb_flags |
BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG |
BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG);
BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG |
extra);
}
static inline void
clear_parse_all_hdr_cb_flags(struct bpf_sock_ops *skops)

View File

@ -90,6 +90,33 @@ long ksym_get_addr(const char *name)
return 0;
}
/* open kallsyms and read symbol addresses on the fly. Without caching all symbols,
* this is faster than load + find.
*/
int kallsyms_find(const char *sym, unsigned long long *addr)
{
char type, name[500];
unsigned long long value;
int err = 0;
FILE *f;
f = fopen("/proc/kallsyms", "r");
if (!f)
return -EINVAL;
while (fscanf(f, "%llx %c %499s%*[^\n]\n", &value, &type, name) > 0) {
if (strcmp(name, sym) == 0) {
*addr = value;
goto out;
}
}
err = -ENOENT;
out:
fclose(f);
return err;
}
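/* Usage sketch (hypothetical caller, for illustration only):
 *
 *	unsigned long long addr;
 *
 *	if (kallsyms_find("bpf_prog_active", &addr))
 *		return;
 *	printf("bpf_prog_active is at 0x%llx\n", addr);
 */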
void read_trace_pipe(void)
{
int trace_fd;

View File

@ -12,6 +12,10 @@ struct ksym {
int load_kallsyms(void);
struct ksym *ksym_search(long key);
long ksym_get_addr(const char *name);
/* open kallsyms and find addresses on the fly, faster than load + search. */
int kallsyms_find(const char *sym, unsigned long long *addr);
void read_trace_pipe(void);
#endif

View File

@ -2,7 +2,7 @@
"empty prog",
.insns = {
},
.errstr = "unknown opcode 00",
.errstr = "last insn is not an exit or jmp",
.result = REJECT,
},
{

View File

@ -529,7 +529,7 @@
},
.prog_type = BPF_PROG_TYPE_SCHED_CLS,
.result = REJECT,
.errstr = "invalid access to packet, off=0 size=8, R5(id=1,off=0,r=0)",
.errstr = "invalid access to packet, off=0 size=8, R5(id=2,off=0,r=0)",
.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
},
{

View File

@ -50,14 +50,6 @@
.errstr = "invalid bpf_ld_imm64 insn",
.result = REJECT,
},
{
"test5 ld_imm64",
.insns = {
BPF_RAW_INSN(BPF_LD | BPF_IMM | BPF_DW, 0, 0, 0, 0),
},
.errstr = "invalid bpf_ld_imm64 insn",
.result = REJECT,
},
{
"test6 ld_imm64",
.insns = {

View File

@ -0,0 +1,243 @@
{
"regalloc basic",
.insns = {
BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
BPF_LD_MAP_FD(BPF_REG_1, 0),
BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 8),
BPF_MOV64_REG(BPF_REG_7, BPF_REG_0),
BPF_EMIT_CALL(BPF_FUNC_get_prandom_u32),
BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),
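/* r2 is a plain copy of r0, so both carry the same id; the verifier's
 * scalar-equivalence tracking lets the JSGT bound on r0 and the JSLT
 * bound on r2 below constrain both registers at once.
 */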
BPF_JMP_IMM(BPF_JSGT, BPF_REG_0, 20, 4),
BPF_JMP_IMM(BPF_JSLT, BPF_REG_2, 0, 3),
BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_0),
BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_2),
BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_7, 0),
BPF_EXIT_INSN(),
},
.fixup_map_hash_48b = { 4 },
.result = ACCEPT,
.prog_type = BPF_PROG_TYPE_TRACEPOINT,
},
{
"regalloc negative",
.insns = {
BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
BPF_LD_MAP_FD(BPF_REG_1, 0),
BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 8),
BPF_MOV64_REG(BPF_REG_7, BPF_REG_0),
BPF_EMIT_CALL(BPF_FUNC_get_prandom_u32),
BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),
BPF_JMP_IMM(BPF_JSGT, BPF_REG_0, 24, 4),
BPF_JMP_IMM(BPF_JSLT, BPF_REG_2, 0, 3),
BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_0),
BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_2),
BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_7, 0),
BPF_EXIT_INSN(),
},
.fixup_map_hash_48b = { 4 },
.result = REJECT,
.errstr = "invalid access to map value, value_size=48 off=48 size=1",
.prog_type = BPF_PROG_TYPE_TRACEPOINT,
},
{
"regalloc src_reg mark",
.insns = {
BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
BPF_LD_MAP_FD(BPF_REG_1, 0),
BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 9),
BPF_MOV64_REG(BPF_REG_7, BPF_REG_0),
BPF_EMIT_CALL(BPF_FUNC_get_prandom_u32),
BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),
BPF_JMP_IMM(BPF_JSGT, BPF_REG_0, 20, 5),
BPF_MOV64_IMM(BPF_REG_3, 0),
BPF_JMP_REG(BPF_JSGE, BPF_REG_3, BPF_REG_2, 3),
BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_0),
BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_2),
BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_7, 0),
BPF_EXIT_INSN(),
},
.fixup_map_hash_48b = { 4 },
.result = ACCEPT,
.prog_type = BPF_PROG_TYPE_TRACEPOINT,
},
{
"regalloc src_reg negative",
.insns = {
BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
BPF_LD_MAP_FD(BPF_REG_1, 0),
BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 9),
BPF_MOV64_REG(BPF_REG_7, BPF_REG_0),
BPF_EMIT_CALL(BPF_FUNC_get_prandom_u32),
BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),
BPF_JMP_IMM(BPF_JSGT, BPF_REG_0, 22, 5),
BPF_MOV64_IMM(BPF_REG_3, 0),
BPF_JMP_REG(BPF_JSGE, BPF_REG_3, BPF_REG_2, 3),
BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_0),
BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_2),
BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_7, 0),
BPF_EXIT_INSN(),
},
.fixup_map_hash_48b = { 4 },
.result = REJECT,
.errstr = "invalid access to map value, value_size=48 off=44 size=8",
.prog_type = BPF_PROG_TYPE_TRACEPOINT,
},
{
"regalloc and spill",
.insns = {
BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
BPF_LD_MAP_FD(BPF_REG_1, 0),
BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 11),
BPF_MOV64_REG(BPF_REG_7, BPF_REG_0),
BPF_EMIT_CALL(BPF_FUNC_get_prandom_u32),
BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),
BPF_JMP_IMM(BPF_JSGT, BPF_REG_0, 20, 7),
/* r0 has upper bound that should propagate into r2 */
BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -8), /* spill r2 */
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_MOV64_IMM(BPF_REG_2, 0), /* clear r0 and r2 */
BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_10, -8), /* fill r3 */
BPF_JMP_REG(BPF_JSGE, BPF_REG_0, BPF_REG_3, 2),
/* r3 has lower and upper bounds */
BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_3),
BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_7, 0),
BPF_EXIT_INSN(),
},
.fixup_map_hash_48b = { 4 },
.result = ACCEPT,
.prog_type = BPF_PROG_TYPE_TRACEPOINT,
},
{
"regalloc and spill negative",
.insns = {
BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
BPF_LD_MAP_FD(BPF_REG_1, 0),
BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 11),
BPF_MOV64_REG(BPF_REG_7, BPF_REG_0),
BPF_EMIT_CALL(BPF_FUNC_get_prandom_u32),
BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),
BPF_JMP_IMM(BPF_JSGT, BPF_REG_0, 48, 7),
/* r0 has upper bound that should propagate into r2 */
BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -8), /* spill r2 */
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_MOV64_IMM(BPF_REG_2, 0), /* clear r0 and r2 */
BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_10, -8), /* fill r3 */
BPF_JMP_REG(BPF_JSGE, BPF_REG_0, BPF_REG_3, 2),
/* r3 has lower and upper bounds */
BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_3),
BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_7, 0),
BPF_EXIT_INSN(),
},
.fixup_map_hash_48b = { 4 },
.result = REJECT,
.errstr = "invalid access to map value, value_size=48 off=48 size=8",
.prog_type = BPF_PROG_TYPE_TRACEPOINT,
},
{
"regalloc three regs",
.insns = {
BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
BPF_LD_MAP_FD(BPF_REG_1, 0),
BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 10),
BPF_MOV64_REG(BPF_REG_7, BPF_REG_0),
BPF_EMIT_CALL(BPF_FUNC_get_prandom_u32),
BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),
BPF_MOV64_REG(BPF_REG_4, BPF_REG_2),
BPF_JMP_IMM(BPF_JSGT, BPF_REG_0, 12, 5),
BPF_JMP_IMM(BPF_JSLT, BPF_REG_2, 0, 4),
BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_0),
BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_2),
BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_4),
BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_7, 0),
BPF_EXIT_INSN(),
},
.fixup_map_hash_48b = { 4 },
.result = ACCEPT,
.prog_type = BPF_PROG_TYPE_TRACEPOINT,
},
{
"regalloc after call",
.insns = {
BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
BPF_LD_MAP_FD(BPF_REG_1, 0),
BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 10),
BPF_MOV64_REG(BPF_REG_7, BPF_REG_0),
BPF_EMIT_CALL(BPF_FUNC_get_prandom_u32),
BPF_MOV64_REG(BPF_REG_8, BPF_REG_0),
BPF_MOV64_REG(BPF_REG_9, BPF_REG_0),
BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 6),
BPF_JMP_IMM(BPF_JSGT, BPF_REG_8, 20, 4),
BPF_JMP_IMM(BPF_JSLT, BPF_REG_9, 0, 3),
BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_8),
BPF_ALU64_REG(BPF_ADD, BPF_REG_7, BPF_REG_9),
BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_7, 0),
BPF_EXIT_INSN(),
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
},
.fixup_map_hash_48b = { 4 },
.result = ACCEPT,
.prog_type = BPF_PROG_TYPE_TRACEPOINT,
},
{
"regalloc in callee",
.insns = {
BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
BPF_LD_MAP_FD(BPF_REG_1, 0),
BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6),
BPF_MOV64_REG(BPF_REG_7, BPF_REG_0),
BPF_EMIT_CALL(BPF_FUNC_get_prandom_u32),
BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
BPF_MOV64_REG(BPF_REG_2, BPF_REG_0),
BPF_MOV64_REG(BPF_REG_3, BPF_REG_7),
BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 1, 0, 1),
BPF_EXIT_INSN(),
BPF_JMP_IMM(BPF_JSGT, BPF_REG_1, 20, 5),
BPF_JMP_IMM(BPF_JSLT, BPF_REG_2, 0, 4),
BPF_ALU64_REG(BPF_ADD, BPF_REG_3, BPF_REG_1),
BPF_ALU64_REG(BPF_ADD, BPF_REG_3, BPF_REG_2),
BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_3, 0),
BPF_EXIT_INSN(),
BPF_MOV64_IMM(BPF_REG_0, 0),
BPF_EXIT_INSN(),
},
.fixup_map_hash_48b = { 4 },
.result = ACCEPT,
.prog_type = BPF_PROG_TYPE_TRACEPOINT,
},