From 5c59801f7018acba11b12de59017a3fcdcf7421d Mon Sep 17 00:00:00 2001 From: Paul Kocialkowski Date: Tue, 22 Jan 2019 09:16:19 +0100 Subject: [PATCH 001/298] clk: sunxi-ng: v3s: Fix TCON reset de-assert bit According to the datasheet and the reference code from Allwinner, the bit used to de-assert the TCON reset is bit 4, not bit 3. Fix it in the V3s CCU driver. Signed-off-by: Paul Kocialkowski Signed-off-by: Maxime Ripard --- drivers/clk/sunxi-ng/ccu-sun8i-v3s.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/clk/sunxi-ng/ccu-sun8i-v3s.c b/drivers/clk/sunxi-ng/ccu-sun8i-v3s.c index 621b1cd996db..ac12f261f8ca 100644 --- a/drivers/clk/sunxi-ng/ccu-sun8i-v3s.c +++ b/drivers/clk/sunxi-ng/ccu-sun8i-v3s.c @@ -542,7 +542,7 @@ static struct ccu_reset_map sun8i_v3s_ccu_resets[] = { [RST_BUS_OHCI0] = { 0x2c0, BIT(29) }, [RST_BUS_VE] = { 0x2c4, BIT(0) }, - [RST_BUS_TCON0] = { 0x2c4, BIT(3) }, + [RST_BUS_TCON0] = { 0x2c4, BIT(4) }, [RST_BUS_CSI] = { 0x2c4, BIT(8) }, [RST_BUS_DE] = { 0x2c4, BIT(12) }, [RST_BUS_DBG] = { 0x2c4, BIT(31) }, From 6db2983cd8064808141ccefd75218f5b4345ffae Mon Sep 17 00:00:00 2001 From: Eugene Loh Date: Thu, 17 Jan 2019 14:46:00 -0800 Subject: [PATCH 002/298] kallsyms: Handle too long symbols in kallsyms.c When checking for symbols with excessively long names, account for null terminating character. Fixes: f3462aa952cf ("Kbuild: Handle longer symbols in kallsyms.c") Signed-off-by: Eugene Loh Acked-by: Ard Biesheuvel Signed-off-by: Masahiro Yamada --- scripts/kallsyms.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/kallsyms.c b/scripts/kallsyms.c index 77cebad0474e..f75e7bda4889 100644 --- a/scripts/kallsyms.c +++ b/scripts/kallsyms.c @@ -118,8 +118,8 @@ static int read_symbol(FILE *in, struct sym_entry *s) fprintf(stderr, "Read error or end of file.\n"); return -1; } - if (strlen(sym) > KSYM_NAME_LEN) { - fprintf(stderr, "Symbol %s too long for kallsyms (%zu vs %d).\n" + if (strlen(sym) >= KSYM_NAME_LEN) { + fprintf(stderr, "Symbol %s too long for kallsyms (%zu >= %d).\n" "Please increase KSYM_NAME_LEN both in kernel and kallsyms.c\n", sym, strlen(sym), KSYM_NAME_LEN); return -1; From ee0b27a3a4da0b0ed2318aa092f8856896e9450b Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Wed, 23 Jan 2019 00:59:11 +0000 Subject: [PATCH 003/298] clk: sunxi: A31: Fix wrong AHB gate number According to the manual the gate clock for MMC3 is at bit 11, and NAND1 is controlled by bit 12. Fix the gate bit definitions in the clock driver. Fixes: c6e6c96d8fa6 ("clk: sunxi-ng: Add A31/A31s clocks") Signed-off-by: Andre Przywara Signed-off-by: Maxime Ripard --- drivers/clk/sunxi-ng/ccu-sun6i-a31.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/clk/sunxi-ng/ccu-sun6i-a31.c b/drivers/clk/sunxi-ng/ccu-sun6i-a31.c index 3b97f60540ad..609970c0b666 100644 --- a/drivers/clk/sunxi-ng/ccu-sun6i-a31.c +++ b/drivers/clk/sunxi-ng/ccu-sun6i-a31.c @@ -264,9 +264,9 @@ static SUNXI_CCU_GATE(ahb1_mmc1_clk, "ahb1-mmc1", "ahb1", static SUNXI_CCU_GATE(ahb1_mmc2_clk, "ahb1-mmc2", "ahb1", 0x060, BIT(10), 0); static SUNXI_CCU_GATE(ahb1_mmc3_clk, "ahb1-mmc3", "ahb1", - 0x060, BIT(12), 0); + 0x060, BIT(11), 0); static SUNXI_CCU_GATE(ahb1_nand1_clk, "ahb1-nand1", "ahb1", - 0x060, BIT(13), 0); + 0x060, BIT(12), 0); static SUNXI_CCU_GATE(ahb1_nand0_clk, "ahb1-nand0", "ahb1", 0x060, BIT(13), 0); static SUNXI_CCU_GATE(ahb1_sdram_clk, "ahb1-sdram", "ahb1", From 09db51241118aeb06e1c8cd393b45879ce099b36 Mon Sep 17 00:00:00 2001 From: Martin Willi Date: Mon, 28 Jan 2019 09:35:35 +0100 Subject: [PATCH 004/298] esp: Skip TX bytes accounting when sending from a request socket On ESP output, sk_wmem_alloc is incremented for the added padding if a socket is associated to the skb. When replying with TCP SYNACKs over IPsec, the associated sk is a casted request socket, only. Increasing sk_wmem_alloc on a request socket results in a write at an arbitrary struct offset. In the best case, this produces the following WARNING: WARNING: CPU: 1 PID: 0 at lib/refcount.c:102 esp_output_head+0x2e4/0x308 [esp4] refcount_t: addition on 0; use-after-free. CPU: 1 PID: 0 Comm: swapper/1 Not tainted 5.0.0-rc3 #2 Hardware name: Marvell Armada 380/385 (Device Tree) [...] [] (esp_output_head [esp4]) from [] (esp_output+0xb8/0x180 [esp4]) [] (esp_output [esp4]) from [] (xfrm_output_resume+0x558/0x664) [] (xfrm_output_resume) from [] (xfrm4_output+0x44/0xc4) [] (xfrm4_output) from [] (tcp_v4_send_synack+0xa8/0xe8) [] (tcp_v4_send_synack) from [] (tcp_conn_request+0x7f4/0x948) [] (tcp_conn_request) from [] (tcp_rcv_state_process+0x2a0/0xe64) [] (tcp_rcv_state_process) from [] (tcp_v4_do_rcv+0xf0/0x1f4) [] (tcp_v4_do_rcv) from [] (tcp_v4_rcv+0xdb8/0xe20) [] (tcp_v4_rcv) from [] (ip_protocol_deliver_rcu+0x2c/0x2dc) [] (ip_protocol_deliver_rcu) from [] (ip_local_deliver_finish+0x48/0x54) [] (ip_local_deliver_finish) from [] (ip_local_deliver+0x54/0xec) [] (ip_local_deliver) from [] (ip_rcv+0x48/0xb8) [] (ip_rcv) from [] (__netif_receive_skb_one_core+0x50/0x6c) [...] The issue triggers only when not using TCP syncookies, as for syncookies no socket is associated. Fixes: cac2661c53f3 ("esp4: Avoid skb_cow_data whenever possible") Fixes: 03e2a30f6a27 ("esp6: Avoid skb_cow_data whenever possible") Signed-off-by: Martin Willi Signed-off-by: Steffen Klassert --- net/ipv4/esp4.c | 2 +- net/ipv6/esp6.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 5459f41fc26f..10e809b296ec 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -328,7 +328,7 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * skb->len += tailen; skb->data_len += tailen; skb->truesize += tailen; - if (sk) + if (sk && sk_fullsock(sk)) refcount_add(tailen, &sk->sk_wmem_alloc); goto out; diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 5afe9f83374d..239d4a65ad6e 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -296,7 +296,7 @@ int esp6_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info skb->len += tailen; skb->data_len += tailen; skb->truesize += tailen; - if (sk) + if (sk && sk_fullsock(sk)) refcount_add(tailen, &sk->sk_wmem_alloc); goto out; From 1b5ba350784242eb1f899bcffd95d2c7cff61e84 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Mon, 21 Jan 2019 14:42:42 +0100 Subject: [PATCH 005/298] ARM: 8824/1: fix a migrating irq bug when hotplug cpu Arm TC2 fails cpu hotplug stress test. This issue was tracked down to a missing copy of the new affinity cpumask for the vexpress-spc interrupt into struct irq_common_data.affinity when the interrupt is migrated in migrate_one_irq(). Fix it by replacing the arm specific hotplug cpu migration with the generic irq code. This is the counterpart implementation to commit 217d453d473c ("arm64: fix a migrating irq bug when hotplug cpu"). Tested with cpu hotplug stress test on Arm TC2 (multi_v7_defconfig plus CONFIG_ARM_BIG_LITTLE_CPUFREQ=y and CONFIG_ARM_VEXPRESS_SPC_CPUFREQ=y). The vexpress-spc interrupt (irq=22) on this board is affine to CPU0. Its affinity cpumask now changes correctly e.g. from 0 to 1-4 when CPU0 is hotplugged out. Suggested-by: Marc Zyngier Signed-off-by: Dietmar Eggemann Acked-by: Marc Zyngier Reviewed-by: Linus Walleij Signed-off-by: Russell King --- arch/arm/Kconfig | 1 + arch/arm/include/asm/irq.h | 1 - arch/arm/kernel/irq.c | 62 -------------------------------------- arch/arm/kernel/smp.c | 2 +- 4 files changed, 2 insertions(+), 64 deletions(-) diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 664e918e2624..26524b75970a 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -1400,6 +1400,7 @@ config NR_CPUS config HOTPLUG_CPU bool "Support for hot-pluggable CPUs" depends on SMP + select GENERIC_IRQ_MIGRATION help Say Y here to experiment with turning CPUs off and on. CPUs can be controlled through /sys/devices/system/cpu. diff --git a/arch/arm/include/asm/irq.h b/arch/arm/include/asm/irq.h index c883fcbe93b6..46d41140df27 100644 --- a/arch/arm/include/asm/irq.h +++ b/arch/arm/include/asm/irq.h @@ -25,7 +25,6 @@ #ifndef __ASSEMBLY__ struct irqaction; struct pt_regs; -extern void migrate_irqs(void); extern void asm_do_IRQ(unsigned int, struct pt_regs *); void handle_IRQ(unsigned int, struct pt_regs *); diff --git a/arch/arm/kernel/irq.c b/arch/arm/kernel/irq.c index 9908dacf9229..844861368cd5 100644 --- a/arch/arm/kernel/irq.c +++ b/arch/arm/kernel/irq.c @@ -31,7 +31,6 @@ #include #include #include -#include #include #include #include @@ -109,64 +108,3 @@ int __init arch_probe_nr_irqs(void) return nr_irqs; } #endif - -#ifdef CONFIG_HOTPLUG_CPU -static bool migrate_one_irq(struct irq_desc *desc) -{ - struct irq_data *d = irq_desc_get_irq_data(desc); - const struct cpumask *affinity = irq_data_get_affinity_mask(d); - struct irq_chip *c; - bool ret = false; - - /* - * If this is a per-CPU interrupt, or the affinity does not - * include this CPU, then we have nothing to do. - */ - if (irqd_is_per_cpu(d) || !cpumask_test_cpu(smp_processor_id(), affinity)) - return false; - - if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { - affinity = cpu_online_mask; - ret = true; - } - - c = irq_data_get_irq_chip(d); - if (!c->irq_set_affinity) - pr_debug("IRQ%u: unable to set affinity\n", d->irq); - else if (c->irq_set_affinity(d, affinity, false) == IRQ_SET_MASK_OK && ret) - cpumask_copy(irq_data_get_affinity_mask(d), affinity); - - return ret; -} - -/* - * The current CPU has been marked offline. Migrate IRQs off this CPU. - * If the affinity settings do not allow other CPUs, force them onto any - * available CPU. - * - * Note: we must iterate over all IRQs, whether they have an attached - * action structure or not, as we need to get chained interrupts too. - */ -void migrate_irqs(void) -{ - unsigned int i; - struct irq_desc *desc; - unsigned long flags; - - local_irq_save(flags); - - for_each_irq_desc(i, desc) { - bool affinity_broken; - - raw_spin_lock(&desc->lock); - affinity_broken = migrate_one_irq(desc); - raw_spin_unlock(&desc->lock); - - if (affinity_broken) - pr_warn_ratelimited("IRQ%u no longer affine to CPU%u\n", - i, smp_processor_id()); - } - - local_irq_restore(flags); -} -#endif /* CONFIG_HOTPLUG_CPU */ diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index 3bf82232b1be..1d6f5ea522f4 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -254,7 +254,7 @@ int __cpu_disable(void) /* * OK - migrate IRQs away from this CPU */ - migrate_irqs(); + irq_migrate_all_off_this_cpu(); /* * Flush user cache and TLB mappings, and then remove this CPU From 48396e80fb6526ea5ed267bd84f028bae56d2f9e Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 30 Jan 2019 14:05:55 -0800 Subject: [PATCH 006/298] RDMA/srp: Rework SCSI device reset handling Since .scsi_done() must only be called after scsi_queue_rq() has finished, make sure that the SRP initiator driver does not call .scsi_done() while scsi_queue_rq() is in progress. Although invoking sg_reset -d while I/O is in progress works fine with kernel v4.20 and before, that is not the case with kernel v5.0-rc1. This patch avoids that the following crash is triggered with kernel v5.0-rc1: BUG: unable to handle kernel NULL pointer dereference at 0000000000000138 CPU: 0 PID: 360 Comm: kworker/0:1H Tainted: G B 5.0.0-rc1-dbg+ #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014 Workqueue: kblockd blk_mq_run_work_fn RIP: 0010:blk_mq_dispatch_rq_list+0x116/0xb10 Call Trace: blk_mq_sched_dispatch_requests+0x2f7/0x300 __blk_mq_run_hw_queue+0xd6/0x180 blk_mq_run_work_fn+0x27/0x30 process_one_work+0x4f1/0xa20 worker_thread+0x67/0x5b0 kthread+0x1cf/0x1f0 ret_from_fork+0x24/0x30 Cc: Fixes: 94a9174c630c ("IB/srp: reduce lock coverage of command completion") Signed-off-by: Bart Van Assche Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/srp/ib_srp.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 31d91538bbf4..694324b37480 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -3032,7 +3032,6 @@ static int srp_reset_device(struct scsi_cmnd *scmnd) { struct srp_target_port *target = host_to_target(scmnd->device->host); struct srp_rdma_ch *ch; - int i, j; u8 status; shost_printk(KERN_ERR, target->scsi_host, "SRP reset_device called\n"); @@ -3044,15 +3043,6 @@ static int srp_reset_device(struct scsi_cmnd *scmnd) if (status) return FAILED; - for (i = 0; i < target->ch_count; i++) { - ch = &target->ch[i]; - for (j = 0; j < target->req_ring_size; ++j) { - struct srp_request *req = &ch->req_ring[j]; - - srp_finish_req(ch, req, scmnd->device, DID_RESET << 16); - } - } - return SUCCESS; } From f75a2804da391571563c4b6b29e7797787332673 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 31 Jan 2019 13:05:49 -0800 Subject: [PATCH 007/298] xfrm: destroy xfrm_state synchronously on net exit path xfrm_state_put() moves struct xfrm_state to the GC list and schedules the GC work to clean it up. On net exit call path, xfrm_state_flush() is called to clean up and xfrm_flush_gc() is called to wait for the GC work to complete before exit. However, this doesn't work because one of the ->destructor(), ipcomp_destroy(), schedules the same GC work again inside the GC work. It is hard to wait for such a nested async callback. This is also why syzbot still reports the following warning: WARNING: CPU: 1 PID: 33 at net/ipv6/xfrm6_tunnel.c:351 xfrm6_tunnel_net_exit+0x2cb/0x500 net/ipv6/xfrm6_tunnel.c:351 ... ops_exit_list.isra.0+0xb0/0x160 net/core/net_namespace.c:153 cleanup_net+0x51d/0xb10 net/core/net_namespace.c:551 process_one_work+0xd0c/0x1ce0 kernel/workqueue.c:2153 worker_thread+0x143/0x14a0 kernel/workqueue.c:2296 kthread+0x357/0x430 kernel/kthread.c:246 ret_from_fork+0x3a/0x50 arch/x86/entry/entry_64.S:352 In fact, it is perfectly fine to bypass GC and destroy xfrm_state synchronously on net exit call path, because it is in process context and doesn't need a work struct to do any blocking work. This patch introduces xfrm_state_put_sync() which simply bypasses GC, and lets its callers to decide whether to use this synchronous version. On net exit path, xfrm_state_fini() and xfrm6_tunnel_net_exit() use it. And, as ipcomp_destroy() itself is blocking, it can use xfrm_state_put_sync() directly too. Also rename xfrm_state_gc_destroy() to ___xfrm_state_destroy() to reflect this change. Fixes: b48c05ab5d32 ("xfrm: Fix warning in xfrm6_tunnel_net_exit.") Reported-and-tested-by: syzbot+e9aebef558e3ed673934@syzkaller.appspotmail.com Cc: Steffen Klassert Signed-off-by: Cong Wang Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 12 +++++++++--- net/ipv6/xfrm6_tunnel.c | 2 +- net/key/af_key.c | 2 +- net/xfrm/xfrm_state.c | 30 +++++++++++++++++++----------- net/xfrm/xfrm_user.c | 2 +- 5 files changed, 31 insertions(+), 17 deletions(-) diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 7298a53b9702..85386becbaea 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -853,7 +853,7 @@ static inline void xfrm_pols_put(struct xfrm_policy **pols, int npols) xfrm_pol_put(pols[i]); } -void __xfrm_state_destroy(struct xfrm_state *); +void __xfrm_state_destroy(struct xfrm_state *, bool); static inline void __xfrm_state_put(struct xfrm_state *x) { @@ -863,7 +863,13 @@ static inline void __xfrm_state_put(struct xfrm_state *x) static inline void xfrm_state_put(struct xfrm_state *x) { if (refcount_dec_and_test(&x->refcnt)) - __xfrm_state_destroy(x); + __xfrm_state_destroy(x, false); +} + +static inline void xfrm_state_put_sync(struct xfrm_state *x) +{ + if (refcount_dec_and_test(&x->refcnt)) + __xfrm_state_destroy(x, true); } static inline void xfrm_state_hold(struct xfrm_state *x) @@ -1590,7 +1596,7 @@ struct xfrmk_spdinfo { struct xfrm_state *xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq); int xfrm_state_delete(struct xfrm_state *x); -int xfrm_state_flush(struct net *net, u8 proto, bool task_valid); +int xfrm_state_flush(struct net *net, u8 proto, bool task_valid, bool sync); int xfrm_dev_state_flush(struct net *net, struct net_device *dev, bool task_valid); void xfrm_sad_getinfo(struct net *net, struct xfrmk_sadinfo *si); void xfrm_spd_getinfo(struct net *net, struct xfrmk_spdinfo *si); diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c index f5b4febeaa25..bc65db782bfb 100644 --- a/net/ipv6/xfrm6_tunnel.c +++ b/net/ipv6/xfrm6_tunnel.c @@ -344,8 +344,8 @@ static void __net_exit xfrm6_tunnel_net_exit(struct net *net) struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net); unsigned int i; - xfrm_state_flush(net, IPSEC_PROTO_ANY, false); xfrm_flush_gc(); + xfrm_state_flush(net, IPSEC_PROTO_ANY, false, true); for (i = 0; i < XFRM6_TUNNEL_SPI_BYADDR_HSIZE; i++) WARN_ON_ONCE(!hlist_empty(&xfrm6_tn->spi_byaddr[i])); diff --git a/net/key/af_key.c b/net/key/af_key.c index 655c787f9d54..637030f43b67 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -1783,7 +1783,7 @@ static int pfkey_flush(struct sock *sk, struct sk_buff *skb, const struct sadb_m if (proto == 0) return -EINVAL; - err = xfrm_state_flush(net, proto, true); + err = xfrm_state_flush(net, proto, true, false); err2 = unicast_flush_resp(sk, hdr); if (err || err2) { if (err == -ESRCH) /* empty table - go quietly */ diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 23c92891758a..1bb971f46fc6 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -432,7 +432,7 @@ void xfrm_state_free(struct xfrm_state *x) } EXPORT_SYMBOL(xfrm_state_free); -static void xfrm_state_gc_destroy(struct xfrm_state *x) +static void ___xfrm_state_destroy(struct xfrm_state *x) { tasklet_hrtimer_cancel(&x->mtimer); del_timer_sync(&x->rtimer); @@ -474,7 +474,7 @@ static void xfrm_state_gc_task(struct work_struct *work) synchronize_rcu(); hlist_for_each_entry_safe(x, tmp, &gc_list, gclist) - xfrm_state_gc_destroy(x); + ___xfrm_state_destroy(x); } static enum hrtimer_restart xfrm_timer_handler(struct hrtimer *me) @@ -598,14 +598,19 @@ struct xfrm_state *xfrm_state_alloc(struct net *net) } EXPORT_SYMBOL(xfrm_state_alloc); -void __xfrm_state_destroy(struct xfrm_state *x) +void __xfrm_state_destroy(struct xfrm_state *x, bool sync) { WARN_ON(x->km.state != XFRM_STATE_DEAD); - spin_lock_bh(&xfrm_state_gc_lock); - hlist_add_head(&x->gclist, &xfrm_state_gc_list); - spin_unlock_bh(&xfrm_state_gc_lock); - schedule_work(&xfrm_state_gc_work); + if (sync) { + synchronize_rcu(); + ___xfrm_state_destroy(x); + } else { + spin_lock_bh(&xfrm_state_gc_lock); + hlist_add_head(&x->gclist, &xfrm_state_gc_list); + spin_unlock_bh(&xfrm_state_gc_lock); + schedule_work(&xfrm_state_gc_work); + } } EXPORT_SYMBOL(__xfrm_state_destroy); @@ -708,7 +713,7 @@ xfrm_dev_state_flush_secctx_check(struct net *net, struct net_device *dev, bool } #endif -int xfrm_state_flush(struct net *net, u8 proto, bool task_valid) +int xfrm_state_flush(struct net *net, u8 proto, bool task_valid, bool sync) { int i, err = 0, cnt = 0; @@ -730,7 +735,10 @@ int xfrm_state_flush(struct net *net, u8 proto, bool task_valid) err = xfrm_state_delete(x); xfrm_audit_state_delete(x, err ? 0 : 1, task_valid); - xfrm_state_put(x); + if (sync) + xfrm_state_put_sync(x); + else + xfrm_state_put(x); if (!err) cnt++; @@ -2215,7 +2223,7 @@ void xfrm_state_delete_tunnel(struct xfrm_state *x) if (atomic_read(&t->tunnel_users) == 2) xfrm_state_delete(t); atomic_dec(&t->tunnel_users); - xfrm_state_put(t); + xfrm_state_put_sync(t); x->tunnel = NULL; } } @@ -2375,8 +2383,8 @@ void xfrm_state_fini(struct net *net) unsigned int sz; flush_work(&net->xfrm.state_hash_work); - xfrm_state_flush(net, IPSEC_PROTO_ANY, false); flush_work(&xfrm_state_gc_work); + xfrm_state_flush(net, IPSEC_PROTO_ANY, false, true); WARN_ON(!list_empty(&net->xfrm.state_all)); diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index c6d26afcf89d..a131f9ff979e 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -1932,7 +1932,7 @@ static int xfrm_flush_sa(struct sk_buff *skb, struct nlmsghdr *nlh, struct xfrm_usersa_flush *p = nlmsg_data(nlh); int err; - err = xfrm_state_flush(net, p->proto, true); + err = xfrm_state_flush(net, p->proto, true, false); if (err) { if (err == -ESRCH) /* empty table */ return 0; From d04ca383860bef90a0dab4eb397907f7f05e839e Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Wed, 6 Feb 2019 15:34:16 +0100 Subject: [PATCH 008/298] mt76x0u: fix suspend/resume We need to reset MCU and do other initializations on resume otherwise MT7610U device will fail to initialize, what cause system hung due to USB requests timeouts. Patch fixes 4.19 -> 4.20 regression. Cc: stable@vger.kernel.org # 4.20+ Signed-off-by: Stanislaw Gruszka Acked-by: Lorenzo Bianconi Signed-off-by: Kalle Valo --- .../net/wireless/mediatek/mt76/mt76x0/usb.c | 48 ++++++++++++------- 1 file changed, 30 insertions(+), 18 deletions(-) diff --git a/drivers/net/wireless/mediatek/mt76/mt76x0/usb.c b/drivers/net/wireless/mediatek/mt76/mt76x0/usb.c index 0e6b43bb4678..a5ea3ba495a4 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76x0/usb.c +++ b/drivers/net/wireless/mediatek/mt76/mt76x0/usb.c @@ -158,6 +158,32 @@ static const struct ieee80211_ops mt76x0u_ops = { .get_txpower = mt76x02_get_txpower, }; +static int mt76x0u_init_hardware(struct mt76x02_dev *dev) +{ + int err; + + mt76x0_chip_onoff(dev, true, true); + + if (!mt76x02_wait_for_mac(&dev->mt76)) + return -ETIMEDOUT; + + err = mt76x0u_mcu_init(dev); + if (err < 0) + return err; + + mt76x0_init_usb_dma(dev); + err = mt76x0_init_hardware(dev); + if (err < 0) + return err; + + mt76_rmw(dev, MT_US_CYC_CFG, MT_US_CYC_CNT, 0x1e); + mt76_wr(dev, MT_TXOP_CTRL_CFG, + FIELD_PREP(MT_TXOP_TRUN_EN, 0x3f) | + FIELD_PREP(MT_TXOP_EXT_CCA_DLY, 0x58)); + + return 0; +} + static int mt76x0u_register_device(struct mt76x02_dev *dev) { struct ieee80211_hw *hw = dev->mt76.hw; @@ -171,26 +197,10 @@ static int mt76x0u_register_device(struct mt76x02_dev *dev) if (err < 0) goto out_err; - mt76x0_chip_onoff(dev, true, true); - if (!mt76x02_wait_for_mac(&dev->mt76)) { - err = -ETIMEDOUT; - goto out_err; - } - - err = mt76x0u_mcu_init(dev); + err = mt76x0u_init_hardware(dev); if (err < 0) goto out_err; - mt76x0_init_usb_dma(dev); - err = mt76x0_init_hardware(dev); - if (err < 0) - goto out_err; - - mt76_rmw(dev, MT_US_CYC_CFG, MT_US_CYC_CNT, 0x1e); - mt76_wr(dev, MT_TXOP_CTRL_CFG, - FIELD_PREP(MT_TXOP_TRUN_EN, 0x3f) | - FIELD_PREP(MT_TXOP_EXT_CCA_DLY, 0x58)); - err = mt76x0_register_device(dev); if (err < 0) goto out_err; @@ -301,6 +311,8 @@ static int __maybe_unused mt76x0_suspend(struct usb_interface *usb_intf, mt76u_stop_queues(&dev->mt76); mt76x0u_mac_stop(dev); + clear_bit(MT76_STATE_MCU_RUNNING, &dev->mt76.state); + mt76x0_chip_onoff(dev, false, false); usb_kill_urb(usb->mcu.res.urb); return 0; @@ -328,7 +340,7 @@ static int __maybe_unused mt76x0_resume(struct usb_interface *usb_intf) tasklet_enable(&usb->rx_tasklet); tasklet_enable(&usb->tx_tasklet); - ret = mt76x0_init_hardware(dev); + ret = mt76x0u_init_hardware(dev); if (ret) goto err; From fa84667b98fd1a191b2465d66b440bda6714b3bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20van=20Dorst?= Date: Wed, 30 Jan 2019 17:10:49 +0100 Subject: [PATCH 009/298] gpio: MT7621: use a per instance irq_chip structure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This fixes the kernel complains: gpio gpiochip1: (1e000600.gpio-bank1): detected irqchip that is shared with multiple gpiochips: please fix the driver. gpio gpiochip2: (1e000600.gpio-bank2): detected irqchip that is shared with multiple gpiochips: please fix the driver. Fixes: 4ba9c3afda41 ("gpio: mt7621: Add a driver for MT7621") Cc: stable@vger.kernel.org Signed-off-by: René van Dorst Cc: linux-gpio@vger.kernel.org Cc: linux-mediatek@lists.infradead.org Tested-by: Greg Ungerer Signed-off-by: Linus Walleij --- drivers/gpio/gpio-mt7621.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/gpio/gpio-mt7621.c b/drivers/gpio/gpio-mt7621.c index 00e954f22bc9..74401e0adb29 100644 --- a/drivers/gpio/gpio-mt7621.c +++ b/drivers/gpio/gpio-mt7621.c @@ -30,6 +30,7 @@ #define GPIO_REG_EDGE 0xA0 struct mtk_gc { + struct irq_chip irq_chip; struct gpio_chip chip; spinlock_t lock; int bank; @@ -189,13 +190,6 @@ mediatek_gpio_irq_type(struct irq_data *d, unsigned int type) return 0; } -static struct irq_chip mediatek_gpio_irq_chip = { - .irq_unmask = mediatek_gpio_irq_unmask, - .irq_mask = mediatek_gpio_irq_mask, - .irq_mask_ack = mediatek_gpio_irq_mask, - .irq_set_type = mediatek_gpio_irq_type, -}; - static int mediatek_gpio_xlate(struct gpio_chip *chip, const struct of_phandle_args *spec, u32 *flags) @@ -254,6 +248,13 @@ mediatek_gpio_bank_probe(struct device *dev, return ret; } + rg->irq_chip.name = dev_name(dev); + rg->irq_chip.parent_device = dev; + rg->irq_chip.irq_unmask = mediatek_gpio_irq_unmask; + rg->irq_chip.irq_mask = mediatek_gpio_irq_mask; + rg->irq_chip.irq_mask_ack = mediatek_gpio_irq_mask; + rg->irq_chip.irq_set_type = mediatek_gpio_irq_type; + if (mtk->gpio_irq) { /* * Manually request the irq here instead of passing @@ -270,14 +271,14 @@ mediatek_gpio_bank_probe(struct device *dev, return ret; } - ret = gpiochip_irqchip_add(&rg->chip, &mediatek_gpio_irq_chip, + ret = gpiochip_irqchip_add(&rg->chip, &rg->irq_chip, 0, handle_simple_irq, IRQ_TYPE_NONE); if (ret) { dev_err(dev, "failed to add gpiochip_irqchip\n"); return ret; } - gpiochip_set_chained_irqchip(&rg->chip, &mediatek_gpio_irq_chip, + gpiochip_set_chained_irqchip(&rg->chip, &rg->irq_chip, mtk->gpio_irq, NULL); } @@ -310,7 +311,6 @@ mediatek_gpio_probe(struct platform_device *pdev) mtk->gpio_irq = irq_of_parse_and_map(np, 0); mtk->dev = dev; platform_set_drvdata(pdev, mtk); - mediatek_gpio_irq_chip.name = dev_name(dev); for (i = 0; i < MTK_BANK_CNT; i++) { ret = mediatek_gpio_bank_probe(dev, np, i); From d623876646be119439999a229a2c3ce30fd197fb Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 8 Feb 2019 22:25:54 -0800 Subject: [PATCH 010/298] bpf: Fix narrow load on a bpf_sock returned from sk_lookup() By adding this test to test_verifier: { "reference tracking: access sk->src_ip4 (narrow load)", .insns = { BPF_SK_LOOKUP, BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3), BPF_LDX_MEM(BPF_H, BPF_REG_2, BPF_REG_0, offsetof(struct bpf_sock, src_ip4) + 2), BPF_MOV64_REG(BPF_REG_1, BPF_REG_6), BPF_EMIT_CALL(BPF_FUNC_sk_release), BPF_EXIT_INSN(), }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, .result = ACCEPT, }, The above test loads 2 bytes from sk->src_ip4 where sk is obtained by bpf_sk_lookup_tcp(). It hits an internal verifier error from convert_ctx_accesses(): [root@arch-fb-vm1 bpf]# ./test_verifier 665 665 Failed to load prog 'Invalid argument'! 0: (b7) r2 = 0 1: (63) *(u32 *)(r10 -8) = r2 2: (7b) *(u64 *)(r10 -16) = r2 3: (7b) *(u64 *)(r10 -24) = r2 4: (7b) *(u64 *)(r10 -32) = r2 5: (7b) *(u64 *)(r10 -40) = r2 6: (7b) *(u64 *)(r10 -48) = r2 7: (bf) r2 = r10 8: (07) r2 += -48 9: (b7) r3 = 36 10: (b7) r4 = 0 11: (b7) r5 = 0 12: (85) call bpf_sk_lookup_tcp#84 13: (bf) r6 = r0 14: (15) if r0 == 0x0 goto pc+3 R0=sock(id=1,off=0,imm=0) R6=sock(id=1,off=0,imm=0) R10=fp0,call_-1 fp-8=????0000 fp-16=0000mmmm fp-24=mmmmmmmm fp-32=mmmmmmmm fp-40=mmmmmmmm fp-48=mmmmmmmm refs=1 15: (69) r2 = *(u16 *)(r0 +26) 16: (bf) r1 = r6 17: (85) call bpf_sk_release#86 18: (95) exit from 14 to 18: safe processed 20 insns (limit 131072), stack depth 48 bpf verifier is misconfigured Summary: 0 PASSED, 0 SKIPPED, 1 FAILED The bpf_sock_is_valid_access() is expecting src_ip4 can be narrowly loaded (meaning load any 1 or 2 bytes of the src_ip4) by marking info->ctx_field_size. However, this marked ctx_field_size is not used. This patch fixes it. Due to the recent refactoring in test_verifier, this new test will be added to the bpf-next branch (together with the bpf_tcp_sock patchset) to avoid merge conflict. Fixes: c64b7983288e ("bpf: Add PTR_TO_SOCKET verifier type") Cc: Joe Stringer Signed-off-by: Martin KaFai Lau Acked-by: Joe Stringer Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 56674a7c3778..8f295b790297 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1617,12 +1617,13 @@ static int check_flow_keys_access(struct bpf_verifier_env *env, int off, return 0; } -static int check_sock_access(struct bpf_verifier_env *env, u32 regno, int off, - int size, enum bpf_access_type t) +static int check_sock_access(struct bpf_verifier_env *env, int insn_idx, + u32 regno, int off, int size, + enum bpf_access_type t) { struct bpf_reg_state *regs = cur_regs(env); struct bpf_reg_state *reg = ®s[regno]; - struct bpf_insn_access_aux info; + struct bpf_insn_access_aux info = {}; if (reg->smin_value < 0) { verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", @@ -1636,6 +1637,8 @@ static int check_sock_access(struct bpf_verifier_env *env, u32 regno, int off, return -EACCES; } + env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size; + return 0; } @@ -2032,7 +2035,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn verbose(env, "cannot write into socket\n"); return -EACCES; } - err = check_sock_access(env, regno, off, size, t); + err = check_sock_access(env, insn_idx, regno, off, size, t); if (!err && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); } else { From b90efd2258749e04e1b3f71ef0d716f2ac2337e0 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 7 Feb 2019 14:54:16 -0500 Subject: [PATCH 011/298] bpf: only adjust gso_size on bytestream protocols bpf_skb_change_proto and bpf_skb_adjust_room change skb header length. For GSO packets they adjust gso_size to maintain the same MTU. The gso size can only be safely adjusted on bytestream protocols. Commit d02f51cbcf12 ("bpf: fix bpf_skb_adjust_net/bpf_skb_proto_xlat to deal with gso sctp skbs") excluded SKB_GSO_SCTP. Since then type SKB_GSO_UDP_L4 has been added, whose contents are one gso_size unit per datagram. Also exclude these. Move from a blacklist to a whitelist check to future proof against additional such new GSO types, e.g., for fraglist based GRO. Fixes: bec1f6f69736 ("udp: generate gso with UDP_SEGMENT") Signed-off-by: Willem de Bruijn Signed-off-by: Alexei Starovoitov --- include/linux/skbuff.h | 6 ++++++ net/core/filter.c | 12 ++++-------- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 95d25b010a25..5a7a8b93a5ab 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -4212,6 +4212,12 @@ static inline bool skb_is_gso_sctp(const struct sk_buff *skb) return skb_shinfo(skb)->gso_type & SKB_GSO_SCTP; } +static inline bool skb_is_gso_tcp(const struct sk_buff *skb) +{ + return skb_is_gso(skb) && + skb_shinfo(skb)->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6); +} + static inline void skb_gso_reset(struct sk_buff *skb) { skb_shinfo(skb)->gso_size = 0; diff --git a/net/core/filter.c b/net/core/filter.c index 7a54dc11ac2d..f7d0004fc160 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2789,8 +2789,7 @@ static int bpf_skb_proto_4_to_6(struct sk_buff *skb) u32 off = skb_mac_header_len(skb); int ret; - /* SCTP uses GSO_BY_FRAGS, thus cannot adjust it. */ - if (skb_is_gso(skb) && unlikely(skb_is_gso_sctp(skb))) + if (!skb_is_gso_tcp(skb)) return -ENOTSUPP; ret = skb_cow(skb, len_diff); @@ -2831,8 +2830,7 @@ static int bpf_skb_proto_6_to_4(struct sk_buff *skb) u32 off = skb_mac_header_len(skb); int ret; - /* SCTP uses GSO_BY_FRAGS, thus cannot adjust it. */ - if (skb_is_gso(skb) && unlikely(skb_is_gso_sctp(skb))) + if (!skb_is_gso_tcp(skb)) return -ENOTSUPP; ret = skb_unclone(skb, GFP_ATOMIC); @@ -2957,8 +2955,7 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 len_diff) u32 off = skb_mac_header_len(skb) + bpf_skb_net_base_len(skb); int ret; - /* SCTP uses GSO_BY_FRAGS, thus cannot adjust it. */ - if (skb_is_gso(skb) && unlikely(skb_is_gso_sctp(skb))) + if (!skb_is_gso_tcp(skb)) return -ENOTSUPP; ret = skb_cow(skb, len_diff); @@ -2987,8 +2984,7 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 len_diff) u32 off = skb_mac_header_len(skb) + bpf_skb_net_base_len(skb); int ret; - /* SCTP uses GSO_BY_FRAGS, thus cannot adjust it. */ - if (skb_is_gso(skb) && unlikely(skb_is_gso_sctp(skb))) + if (!skb_is_gso_tcp(skb)) return -ENOTSUPP; ret = skb_unclone(skb, GFP_ATOMIC); From e6762c8bcf982821935a2b1cb33cf8335d0eefae Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Fri, 8 Feb 2019 14:13:50 +0100 Subject: [PATCH 012/298] xsk: add missing smp_rmb() in xsk_mmap All the setup code in AF_XDP is protected by a mutex with the exception of the mmap code that cannot use it. To make sure that a process banging on the mmap call at the same time as another process is setting up the socket, smp_wmb() calls were added in the umem registration code and the queue creation code, so that the published structures that xsk_mmap needs would be consistent. However, the corresponding smp_rmb() calls were not added to the xsk_mmap code. This patch adds these calls. Fixes: 37b076933a8e3 ("xsk: add missing write- and data-dependency barrier") Fixes: c0c77d8fb787c ("xsk: add user memory registration support sockopt") Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov --- net/xdp/xsk.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index a03268454a27..45f3b528dc09 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -669,6 +669,8 @@ static int xsk_mmap(struct file *file, struct socket *sock, if (!umem) return -EINVAL; + /* Matches the smp_wmb() in XDP_UMEM_REG */ + smp_rmb(); if (offset == XDP_UMEM_PGOFF_FILL_RING) q = READ_ONCE(umem->fq); else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING) @@ -678,6 +680,8 @@ static int xsk_mmap(struct file *file, struct socket *sock, if (!q) return -EINVAL; + /* Matches the smp_wmb() in xsk_init_queue */ + smp_rmb(); qpg = virt_to_head_page(q->ring); if (size > (PAGE_SIZE << compound_order(qpg))) return -EINVAL; From a5a08c35d382a5a8da397260c3febb8dff4bdeef Mon Sep 17 00:00:00 2001 From: Bjorn Andersson Date: Thu, 31 Jan 2019 09:29:53 -0800 Subject: [PATCH 013/298] pinctrl: qcom: qcs404: Correct SDC tile The SDC controls live in the south tile, not the north one. Correct this so that we program the right registers. Cc: stable@vger.kernel.org Fixes: 22eb8301dbc1 ("pinctrl: qcom: Add qcs404 pinctrl driver") Signed-off-by: Bjorn Andersson Reviewed-by: Vinod Koul Signed-off-by: Linus Walleij --- drivers/pinctrl/qcom/pinctrl-qcs404.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pinctrl/qcom/pinctrl-qcs404.c b/drivers/pinctrl/qcom/pinctrl-qcs404.c index 7aae52a09ff0..4ffd56ff809e 100644 --- a/drivers/pinctrl/qcom/pinctrl-qcs404.c +++ b/drivers/pinctrl/qcom/pinctrl-qcs404.c @@ -79,7 +79,7 @@ enum { .intr_cfg_reg = 0, \ .intr_status_reg = 0, \ .intr_target_reg = 0, \ - .tile = NORTH, \ + .tile = SOUTH, \ .mux_bit = -1, \ .pull_bit = pull, \ .drv_bit = drv, \ From b10bd9a256aec504c14a7c9b6fccb6301ecf290a Mon Sep 17 00:00:00 2001 From: Pierre Morel Date: Mon, 11 Feb 2019 10:20:49 +0100 Subject: [PATCH 014/298] s390: vsie: Use effective CRYCBD.31 to check CRYCBD validity When facility.76 MSAX3 is present for the guest we must issue a validity interception if the CRYCBD is not valid. The bit CRYCBD.31 is an effective field and tested at each guest level and has for effect to mask the facility.76 It follows that if CRYCBD.31 is clear and AP is not in use we do not have to test the CRYCBD validatity even if facility.76 is present in the host. Fixes: 6ee74098201b ("KVM: s390: vsie: allow CRYCB FORMAT-0") Cc: stable@vger.kernel.org Signed-off-by: Pierre Morel Reported-by: Claudio Imbrenda Acked-by: David Hildenbrand Acked-by: Cornelia Huck Reviewed-by: Christian Borntraeger Message-Id: <1549876849-32680-1-git-send-email-pmorel@linux.ibm.com> Signed-off-by: Christian Borntraeger --- arch/s390/kvm/vsie.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index a153257bf7d9..d62fa148558b 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -297,7 +297,7 @@ static int shadow_crycb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) scb_s->crycbd = 0; apie_h = vcpu->arch.sie_block->eca & ECA_APIE; - if (!apie_h && !key_msk) + if (!apie_h && (!key_msk || fmt_o == CRYCB_FORMAT0)) return 0; if (!crycb_addr) From 3defaf2f15b2bfd86c6664181ac009e91985f8ac Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Sun, 10 Feb 2019 12:52:35 -0800 Subject: [PATCH 015/298] bpf: fix lockdep false positive in stackmap Lockdep warns about false positive: [ 11.211460] ------------[ cut here ]------------ [ 11.211936] DEBUG_LOCKS_WARN_ON(depth <= 0) [ 11.211985] WARNING: CPU: 0 PID: 141 at ../kernel/locking/lockdep.c:3592 lock_release+0x1ad/0x280 [ 11.213134] Modules linked in: [ 11.214954] RIP: 0010:lock_release+0x1ad/0x280 [ 11.223508] Call Trace: [ 11.223705] [ 11.223874] ? __local_bh_enable+0x7a/0x80 [ 11.224199] up_read+0x1c/0xa0 [ 11.224446] do_up_read+0x12/0x20 [ 11.224713] irq_work_run_list+0x43/0x70 [ 11.225030] irq_work_run+0x26/0x50 [ 11.225310] smp_irq_work_interrupt+0x57/0x1f0 [ 11.225662] irq_work_interrupt+0xf/0x20 since rw_semaphore is released in a different task vs task that locked the sema. It is expected behavior. Fix the warning with up_read_non_owner() and rwsem_release() annotation. Fixes: bae77c5eb5b2 ("bpf: enable stackmap with build_id in nmi context") Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/stackmap.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index d43b14535827..950ab2f28922 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -44,7 +44,7 @@ static void do_up_read(struct irq_work *entry) struct stack_map_irq_work *work; work = container_of(entry, struct stack_map_irq_work, irq_work); - up_read(work->sem); + up_read_non_owner(work->sem); work->sem = NULL; } @@ -338,6 +338,12 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, } else { work->sem = ¤t->mm->mmap_sem; irq_work_queue(&work->irq_work); + /* + * The irq_work will release the mmap_sem with + * up_read_non_owner(). The rwsem_release() is called + * here to release the lock from lockdep's perspective. + */ + rwsem_release(¤t->mm->mmap_sem.dep_map, 1, _RET_IP_); } } From e451eb51068496054d071af10b3530af4002a4f4 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Mon, 11 Feb 2019 08:15:29 -0800 Subject: [PATCH 016/298] xsk: share the mmap_sem for page pinning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Holding mmap_sem exclusively for a gup() is an overkill. Lets share the lock and replace the gup call for gup_longterm(), as it is better suited for the lifetime of the pinning. Fixes: c0c77d8fb787 ("xsk: add user memory registration support sockopt") Signed-off-by: Davidlohr Bueso Cc: David S. Miller Cc: Bjorn Topel Cc: Magnus Karlsson CC: netdev@vger.kernel.org Acked-by: Björn Töpel Signed-off-by: Daniel Borkmann --- net/xdp/xdp_umem.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index d4de871e7d4d..597866e7c441 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -259,10 +259,10 @@ static int xdp_umem_pin_pages(struct xdp_umem *umem) if (!umem->pgs) return -ENOMEM; - down_write(¤t->mm->mmap_sem); - npgs = get_user_pages(umem->address, umem->npgs, - gup_flags, &umem->pgs[0], NULL); - up_write(¤t->mm->mmap_sem); + down_read(¤t->mm->mmap_sem); + npgs = get_user_pages_longterm(umem->address, umem->npgs, + gup_flags, &umem->pgs[0], NULL); + up_read(¤t->mm->mmap_sem); if (npgs != umem->npgs) { if (npgs >= 0) { From 94ee12b507db8b5876e31c9d6c9d84f556a4b49f Mon Sep 17 00:00:00 2001 From: Michael Clark Date: Mon, 11 Feb 2019 17:38:29 +1300 Subject: [PATCH 017/298] MIPS: fix truncation in __cmpxchg_small for short values __cmpxchg_small erroneously uses u8 for load comparison which can be either char or short. This patch changes the local variable to u32 which is sufficiently sized, as the loaded value is already masked and shifted appropriately. Using an integer size avoids any unnecessary canonicalization from use of non native widths. This patch is part of a series that adapts the MIPS small word atomics code for xchg and cmpxchg on short and char to RISC-V. Cc: RISC-V Patches Cc: Linux RISC-V Cc: Linux MIPS Signed-off-by: Michael Clark [paul.burton@mips.com: - Fix varialble typo per Jonas Gorski. - Consolidate load variable with other declarations.] Signed-off-by: Paul Burton Fixes: 3ba7f44d2b19 ("MIPS: cmpxchg: Implement 1 byte & 2 byte cmpxchg()") Cc: stable@vger.kernel.org # v4.13+ --- arch/mips/kernel/cmpxchg.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/mips/kernel/cmpxchg.c b/arch/mips/kernel/cmpxchg.c index 0b9535bc2c53..6b2a4a902a98 100644 --- a/arch/mips/kernel/cmpxchg.c +++ b/arch/mips/kernel/cmpxchg.c @@ -54,10 +54,9 @@ unsigned long __xchg_small(volatile void *ptr, unsigned long val, unsigned int s unsigned long __cmpxchg_small(volatile void *ptr, unsigned long old, unsigned long new, unsigned int size) { - u32 mask, old32, new32, load32; + u32 mask, old32, new32, load32, load; volatile u32 *ptr32; unsigned int shift; - u8 load; /* Check that ptr is naturally aligned */ WARN_ON((unsigned long)ptr & (size - 1)); From fc2d5cfdcfe2ab76b263d91429caa22451123085 Mon Sep 17 00:00:00 2001 From: Sean Tranchetti Date: Thu, 7 Feb 2019 13:33:21 -0700 Subject: [PATCH 018/298] af_key: unconditionally clone on broadcast Attempting to avoid cloning the skb when broadcasting by inflating the refcount with sock_hold/sock_put while under RCU lock is dangerous and violates RCU principles. It leads to subtle race conditions when attempting to free the SKB, as we may reference sockets that have already been freed by the stack. Unable to handle kernel paging request at virtual address 6b6b6b6b6b6c4b [006b6b6b6b6b6c4b] address between user and kernel address ranges Internal error: Oops: 96000004 [#1] PREEMPT SMP task: fffffff78f65b380 task.stack: ffffff8049a88000 pc : sock_rfree+0x38/0x6c lr : skb_release_head_state+0x6c/0xcc Process repro (pid: 7117, stack limit = 0xffffff8049a88000) Call trace: sock_rfree+0x38/0x6c skb_release_head_state+0x6c/0xcc skb_release_all+0x1c/0x38 __kfree_skb+0x1c/0x30 kfree_skb+0xd0/0xf4 pfkey_broadcast+0x14c/0x18c pfkey_sendmsg+0x1d8/0x408 sock_sendmsg+0x44/0x60 ___sys_sendmsg+0x1d0/0x2a8 __sys_sendmsg+0x64/0xb4 SyS_sendmsg+0x34/0x4c el0_svc_naked+0x34/0x38 Kernel panic - not syncing: Fatal exception Suggested-by: Eric Dumazet Signed-off-by: Sean Tranchetti Signed-off-by: Steffen Klassert --- net/key/af_key.c | 40 +++++++++++++++------------------------- 1 file changed, 15 insertions(+), 25 deletions(-) diff --git a/net/key/af_key.c b/net/key/af_key.c index 637030f43b67..5651c29cb5bd 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -196,30 +196,22 @@ static int pfkey_release(struct socket *sock) return 0; } -static int pfkey_broadcast_one(struct sk_buff *skb, struct sk_buff **skb2, - gfp_t allocation, struct sock *sk) +static int pfkey_broadcast_one(struct sk_buff *skb, gfp_t allocation, + struct sock *sk) { int err = -ENOBUFS; - sock_hold(sk); - if (*skb2 == NULL) { - if (refcount_read(&skb->users) != 1) { - *skb2 = skb_clone(skb, allocation); - } else { - *skb2 = skb; - refcount_inc(&skb->users); - } + if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) + return err; + + skb = skb_clone(skb, allocation); + + if (skb) { + skb_set_owner_r(skb, sk); + skb_queue_tail(&sk->sk_receive_queue, skb); + sk->sk_data_ready(sk); + err = 0; } - if (*skb2 != NULL) { - if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) { - skb_set_owner_r(*skb2, sk); - skb_queue_tail(&sk->sk_receive_queue, *skb2); - sk->sk_data_ready(sk); - *skb2 = NULL; - err = 0; - } - } - sock_put(sk); return err; } @@ -234,7 +226,6 @@ static int pfkey_broadcast(struct sk_buff *skb, gfp_t allocation, { struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id); struct sock *sk; - struct sk_buff *skb2 = NULL; int err = -ESRCH; /* XXX Do we need something like netlink_overrun? I think @@ -253,7 +244,7 @@ static int pfkey_broadcast(struct sk_buff *skb, gfp_t allocation, * socket. */ if (pfk->promisc) - pfkey_broadcast_one(skb, &skb2, GFP_ATOMIC, sk); + pfkey_broadcast_one(skb, GFP_ATOMIC, sk); /* the exact target will be processed later */ if (sk == one_sk) @@ -268,7 +259,7 @@ static int pfkey_broadcast(struct sk_buff *skb, gfp_t allocation, continue; } - err2 = pfkey_broadcast_one(skb, &skb2, GFP_ATOMIC, sk); + err2 = pfkey_broadcast_one(skb, GFP_ATOMIC, sk); /* Error is cleared after successful sending to at least one * registered KM */ @@ -278,9 +269,8 @@ static int pfkey_broadcast(struct sk_buff *skb, gfp_t allocation, rcu_read_unlock(); if (one_sk != NULL) - err = pfkey_broadcast_one(skb, &skb2, allocation, one_sk); + err = pfkey_broadcast_one(skb, allocation, one_sk); - kfree_skb(skb2); kfree_skb(skb); return err; } From 0ac569bf6a7983c0c5747d6df8db9dc05bc92b6c Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Tue, 5 Feb 2019 16:37:40 +0100 Subject: [PATCH 019/298] ARM: 8834/1: Fix: kprobes: optimized kprobes illegal instruction commit e46daee53bb5 ("ARM: 8806/1: kprobes: Fix false positive with FORTIFY_SOURCE") introduced a regression in optimized kprobes. It triggers "invalid instruction" oopses when using kprobes instrumentation through lttng and perf. This commit was introduced in kernel v4.20, and has been backported to stable kernels 4.19 and 4.14. This crash was also reported by Hongzhi Song on the redhat bugzilla where the patch was originally introduced. Link: https://bugzilla.redhat.com/show_bug.cgi?id=1639397 Link: https://bugs.lttng.org/issues/1174 Link: https://lore.kernel.org/lkml/342740659.2887.1549307721609.JavaMail.zimbra@efficios.com Fixes: e46daee53bb5 ("ARM: 8806/1: kprobes: Fix false positive with FORTIFY_SOURCE") Signed-off-by: Mathieu Desnoyers Reported-by: Robert Berger Tested-by: Robert Berger Acked-by: Kees Cook Cc: Robert Berger Cc: Masami Hiramatsu Cc: William Cohen Cc: Laura Abbott Cc: Kees Cook Cc: # v4.14+ Cc: linux-arm-kernel@lists.infradead.org Cc: patches@armlinux.org.uk Signed-off-by: Russell King --- arch/arm/probes/kprobes/opt-arm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/probes/kprobes/opt-arm.c b/arch/arm/probes/kprobes/opt-arm.c index 2c118a6ab358..0dc23fc227ed 100644 --- a/arch/arm/probes/kprobes/opt-arm.c +++ b/arch/arm/probes/kprobes/opt-arm.c @@ -247,7 +247,7 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *or } /* Copy arch-dep-instance from template. */ - memcpy(code, (unsigned char *)optprobe_template_entry, + memcpy(code, (unsigned long *)&optprobe_template_entry, TMPL_END_IDX * sizeof(kprobe_opcode_t)); /* Adjust buffer according to instruction. */ From fc67e6f120a388b611d94cc40baf99a5cc56b283 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Wed, 6 Feb 2019 18:43:24 +0100 Subject: [PATCH 020/298] ARM: 8835/1: dma-mapping: Clear DMA ops on teardown Installing the appropriate non-IOMMU DMA ops in arm_iommu_detch_device() serves the case where IOMMU-aware drivers choose to control their own mapping but still make DMA API calls, however it also affects the case when the arch code itself tears down the mapping upon driver unbinding, where the ops now get left in place and can inhibit arch_setup_dma_ops() on subsequent re-probe attempts. Fix the latter case by making sure that arch_teardown_dma_ops() cleans up whenever the ops were automatically installed by its counterpart. Reported-by: Tobias Jakobi Reported-by: Marek Szyprowski Fixes: 1874619a7df4 "ARM: dma-mapping: Set proper DMA ops in arm_iommu_detach_device()" Tested-by: Tobias Jakobi Tested-by: Thierry Reding Signed-off-by: Robin Murphy Signed-off-by: Russell King --- arch/arm/mm/dma-mapping.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c index f1e2922e447c..1e3e08a1c456 100644 --- a/arch/arm/mm/dma-mapping.c +++ b/arch/arm/mm/dma-mapping.c @@ -2390,4 +2390,6 @@ void arch_teardown_dma_ops(struct device *dev) return; arm_teardown_iommu_dma_ops(dev); + /* Let arch_setup_dma_ops() start again from scratch upon re-probe */ + set_dma_ops(dev, NULL); } From 1e405c1a3f667bf152905127b94e9c8f454a343e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Tue, 12 Feb 2019 08:51:14 +0100 Subject: [PATCH 021/298] xsk: do not remove umem from netdevice on fall-back to copy-mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit c9b47cc1fabc ("xsk: fix bug when trying to use both copy and zero-copy on one queue id") stores the umem into the netdev._rx struct. However, the patch incorrectly removed the umem from the netdev._rx struct when user-space passed "best-effort" mode (i.e. select the fastest possible option available), and zero-copy mode was not available. This commit fixes that. Fixes: c9b47cc1fabc ("xsk: fix bug when trying to use both copy and zero-copy on one queue id") Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann --- net/xdp/xdp_umem.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 597866e7c441..37e1fe180769 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -125,9 +125,10 @@ int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev, return 0; err_unreg_umem: - xdp_clear_umem_at_qid(dev, queue_id); if (!force_zc) err = 0; /* fallback to copy mode */ + if (err) + xdp_clear_umem_at_qid(dev, queue_id); out_rtnl_unlock: rtnl_unlock(); return err; From 323fb7b947b265753de34703dbbf8acc8ea3a4de Mon Sep 17 00:00:00 2001 From: Sylwester Nawrocki Date: Thu, 7 Feb 2019 18:00:12 +0100 Subject: [PATCH 022/298] ASoC: samsung: i2s: Fix prescaler setting for the secondary DAI Make sure i2s->rclk_srcrate is properly initialized also during playback through the secondary DAI. Signed-off-by: Sylwester Nawrocki Acked-by: Krzysztof Kozlowski Signed-off-by: Mark Brown --- sound/soc/samsung/i2s.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/sound/soc/samsung/i2s.c b/sound/soc/samsung/i2s.c index ce00fe2f6aae..d4bde4834ce5 100644 --- a/sound/soc/samsung/i2s.c +++ b/sound/soc/samsung/i2s.c @@ -604,6 +604,7 @@ static int i2s_set_fmt(struct snd_soc_dai *dai, unsigned int fmt) { struct i2s_dai *i2s = to_info(dai); + struct i2s_dai *other = get_other_dai(i2s); int lrp_shift, sdf_shift, sdf_mask, lrp_rlow, mod_slave; u32 mod, tmp = 0; unsigned long flags; @@ -661,7 +662,8 @@ static int i2s_set_fmt(struct snd_soc_dai *dai, * CLK_I2S_RCLK_SRC clock is not exposed so we ensure any * clock configuration assigned in DT is not overwritten. */ - if (i2s->rclk_srcrate == 0 && i2s->clk_data.clks == NULL) + if (i2s->rclk_srcrate == 0 && i2s->clk_data.clks == NULL && + other->clk_data.clks == NULL) i2s_set_sysclk(dai, SAMSUNG_I2S_RCLKSRC_0, 0, SND_SOC_CLOCK_IN); break; @@ -699,6 +701,7 @@ static int i2s_hw_params(struct snd_pcm_substream *substream, struct snd_pcm_hw_params *params, struct snd_soc_dai *dai) { struct i2s_dai *i2s = to_info(dai); + struct i2s_dai *other = get_other_dai(i2s); u32 mod, mask = 0, val = 0; struct clk *rclksrc; unsigned long flags; @@ -784,6 +787,9 @@ static int i2s_hw_params(struct snd_pcm_substream *substream, i2s->frmclk = params_rate(params); rclksrc = i2s->clk_table[CLK_I2S_RCLK_SRC]; + if (!rclksrc || IS_ERR(rclksrc)) + rclksrc = other->clk_table[CLK_I2S_RCLK_SRC]; + if (rclksrc && !IS_ERR(rclksrc)) i2s->rclk_srcrate = clk_get_rate(rclksrc); From 74f03104ed465ff71b11076ef620e4eaa53dbf74 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 1 Feb 2019 09:47:44 +0100 Subject: [PATCH 023/298] MIPS: lantiq: pass struct device to DMA API functions The DMA API generally relies on a struct device to work properly, and only barely works without one for legacy reasons. Pass the easily available struct device from the platform_device to remedy this. Also use GFP_KERNEL instead of GFP_ATOMIC as the gfp_t for the memory allocation, as we aren't in interrupt context or under a lock. Note that this whole function looks somewhat bogus given that we never even look at the returned dma address, and the CPHYSADDR magic on a returned noncached mapping looks "interesting". But I'll leave that to people more familiar with the code to sort out. Signed-off-by: Christoph Hellwig Signed-off-by: Paul Burton Cc: John Crispin Cc: Vinod Koul Cc: Dmitry Tarnyagin Cc: Nicolas Ferre Cc: Sudip Mukherjee Cc: Felipe Balbi Cc: linux-mips@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: dmaengine@vger.kernel.org Cc: netdev@vger.kernel.org Cc: linux-usb@vger.kernel.org Cc: linux-fbdev@vger.kernel.org Cc: alsa-devel@alsa-project.org Cc: iommu@lists.linux-foundation.org --- arch/mips/lantiq/xway/vmmc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/mips/lantiq/xway/vmmc.c b/arch/mips/lantiq/xway/vmmc.c index 577ec81b557d..3deab9a77718 100644 --- a/arch/mips/lantiq/xway/vmmc.c +++ b/arch/mips/lantiq/xway/vmmc.c @@ -31,8 +31,8 @@ static int vmmc_probe(struct platform_device *pdev) dma_addr_t dma; cp1_base = - (void *) CPHYSADDR(dma_alloc_coherent(NULL, CP1_SIZE, - &dma, GFP_ATOMIC)); + (void *) CPHYSADDR(dma_alloc_coherent(&pdev->dev, CP1_SIZE, + &dma, GFP_KERNEL)); gpio_count = of_gpio_count(pdev->dev.of_node); while (gpio_count > 0) { From 6e9526852fad7eb77b7755114951237312258ec7 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Mon, 11 Feb 2019 19:03:37 +0530 Subject: [PATCH 024/298] mtd: Use mtd->name when registering nvmem device With this patch, we use the mtd->name instead of concatenating the name with '0'. Fixes: c4dfa25ab307 ("mtd: add support for reading MTD devices via the nvmem API") Signed-off-by: Aneesh Kumar K.V Signed-off-by: Boris Brezillon --- drivers/mtd/mtdcore.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c index 999b705769a8..3ef01baef9b6 100644 --- a/drivers/mtd/mtdcore.c +++ b/drivers/mtd/mtdcore.c @@ -507,6 +507,7 @@ static int mtd_nvmem_add(struct mtd_info *mtd) { struct nvmem_config config = {}; + config.id = -1; config.dev = &mtd->dev; config.name = mtd->name; config.owner = THIS_MODULE; From 3e35730dd7540bad2d4e002703996391d9be49a0 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Mon, 11 Feb 2019 19:03:38 +0530 Subject: [PATCH 025/298] mtd: powernv_flash: Fix device registration error This change helps me to get multiple mtd device registered. Without this I get sysfs: cannot create duplicate filename '/bus/nvmem/devices/flash0' CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.0.0-rc2-00557-g1ef20ef21f22 #13 Call Trace: [c0000000b38e3220] [c000000000b58fe4] dump_stack+0xe8/0x164 (unreliable) [c0000000b38e3270] [c0000000004cf074] sysfs_warn_dup+0x84/0xb0 [c0000000b38e32f0] [c0000000004cf6c4] sysfs_do_create_link_sd.isra.0+0x114/0x150 [c0000000b38e3340] [c000000000726a84] bus_add_device+0x94/0x1e0 [c0000000b38e33c0] [c0000000007218f0] device_add+0x4d0/0x830 [c0000000b38e3480] [c0000000009d54a8] nvmem_register.part.2+0x1c8/0xb30 [c0000000b38e3560] [c000000000834530] mtd_nvmem_add+0x90/0x120 [c0000000b38e3650] [c000000000835bc8] add_mtd_device+0x198/0x4e0 [c0000000b38e36f0] [c00000000083619c] mtd_device_parse_register+0x11c/0x280 [c0000000b38e3780] [c000000000840830] powernv_flash_probe+0x180/0x250 [c0000000b38e3820] [c00000000072c120] platform_drv_probe+0x60/0xf0 [c0000000b38e38a0] [c0000000007283c8] really_probe+0x138/0x4d0 [c0000000b38e3930] [c000000000728acc] driver_probe_device+0x13c/0x1b0 [c0000000b38e39b0] [c000000000728c7c] __driver_attach+0x13c/0x1c0 [c0000000b38e3a30] [c000000000725130] bus_for_each_dev+0xa0/0x120 [c0000000b38e3a90] [c000000000727b2c] driver_attach+0x2c/0x40 [c0000000b38e3ab0] [c0000000007270f8] bus_add_driver+0x228/0x360 [c0000000b38e3b40] [c00000000072a2e0] driver_register+0x90/0x1a0 [c0000000b38e3bb0] [c00000000072c020] __platform_driver_register+0x50/0x70 [c0000000b38e3bd0] [c00000000105c984] powernv_flash_driver_init+0x24/0x38 [c0000000b38e3bf0] [c000000000010904] do_one_initcall+0x84/0x464 [c0000000b38e3cd0] [c000000001004548] kernel_init_freeable+0x530/0x634 [c0000000b38e3db0] [c000000000011154] kernel_init+0x1c/0x168 [c0000000b38e3e20] [c00000000000bed4] ret_from_kernel_thread+0x5c/0x68 mtd mtd1: Failed to register NVMEM device With the change we now have root@(none):/sys/bus/nvmem/devices# ls -al total 0 drwxr-xr-x 2 root root 0 Feb 6 20:49 . drwxr-xr-x 4 root root 0 Feb 6 20:49 .. lrwxrwxrwx 1 root root 0 Feb 6 20:49 flash@0 -> ../../../devices/platform/ibm,opal:flash@0/mtd/mtd0/flash@0 lrwxrwxrwx 1 root root 0 Feb 6 20:49 flash@1 -> ../../../devices/platform/ibm,opal:flash@1/mtd/mtd1/flash@1 Fixes: 1cbb4a1c433a ("mtd: powernv: Add powernv flash MTD abstraction driver") Signed-off-by: Aneesh Kumar K.V Signed-off-by: Boris Brezillon --- drivers/mtd/devices/powernv_flash.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mtd/devices/powernv_flash.c b/drivers/mtd/devices/powernv_flash.c index 22f753e555ac..83f88b8b5d9f 100644 --- a/drivers/mtd/devices/powernv_flash.c +++ b/drivers/mtd/devices/powernv_flash.c @@ -212,7 +212,7 @@ static int powernv_flash_set_driver_info(struct device *dev, * Going to have to check what details I need to set and how to * get them */ - mtd->name = devm_kasprintf(dev, GFP_KERNEL, "%pOFn", dev->of_node); + mtd->name = devm_kasprintf(dev, GFP_KERNEL, "%pOFP", dev->of_node); mtd->type = MTD_NORFLASH; mtd->flags = MTD_WRITEABLE; mtd->size = size; From 207a369e3c085799e7836221f64e7a7329985fb6 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Mon, 4 Feb 2019 18:10:55 +0900 Subject: [PATCH 026/298] sh: fix build error for invisible CONFIG_BUILTIN_DTB_SOURCE I fixed a similar build error in commit 1b1e4ee86e00 ("sh: fix build error for empty CONFIG_BUILTIN_DTB_SOURCE"), but it came back again. Since commit 37c8a5fafa3b ("kbuild: consolidate Devicetree dtb build rules"), the combination of CONFIG_OF_EARLY_FLATTREE=y and CONFIG_USE_BUILTIN_DTB=n results in the following build error: make[1]: *** No rule to make target 'arch/sh/boot/dts/.dtb.o', needed by 'arch/sh/boot/dts/built-in.a'. Stop. Prior to that commit, there was only one path to descend into arch/sh/boot/dts/, and arch/sh/Makefile correctly guards it with CONFIG_USE_BUILTIN_DTB: core-$(CONFIG_USE_BUILTIN_DTB) += arch/sh/boot/dts/ Now, there is another path to descend there from the top Makefile when CONFIG_OF_EARLY_FLATTREE=y. If CONFIG_USE_BUILTIN_DTB is disabled, CONFIG_BUILTIN_DTB_SOURCE is invisible instead of defined as "". Add obj-$(CONFIG_USE_BUILTIN_DTB) guard to avoid the attempt to build the non-existing file. Fixes: 37c8a5fafa3b ("kbuild: consolidate Devicetree dtb build rules") Reported-by: kbuild test robot Signed-off-by: Masahiro Yamada --- arch/sh/boot/dts/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/sh/boot/dts/Makefile b/arch/sh/boot/dts/Makefile index 01d0f7fb14cc..2563d1e532e2 100644 --- a/arch/sh/boot/dts/Makefile +++ b/arch/sh/boot/dts/Makefile @@ -1,3 +1,3 @@ ifneq ($(CONFIG_BUILTIN_DTB_SOURCE),"") -obj-y += $(patsubst "%",%,$(CONFIG_BUILTIN_DTB_SOURCE)).dtb.o +obj-$(CONFIG_USE_BUILTIN_DTB) += $(patsubst "%",%,$(CONFIG_BUILTIN_DTB_SOURCE)).dtb.o endif From 7f665b1c3283aae5b61843136d0a8ee808ba3199 Mon Sep 17 00:00:00 2001 From: Jeremy Soller Date: Wed, 13 Feb 2019 10:56:19 -0700 Subject: [PATCH 027/298] ALSA: hda/realtek - Headset microphone and internal speaker support for System76 oryp5 On the System76 Oryx Pro (oryp5), there is a headset microphone input attached to 0x19 that does not have a jack detect. In order to get it working, the pin configuration needs to be set correctly, and the ALC269_FIXUP_HEADSET_MODE_NO_HP_MIC fixup needs to be applied. This is similar to the MIC_NO_PRESENCE fixups for some Dell laptops, except we have a separate microphone jack that is already configured correctly. Since the ALC1220 does not have a fixup similar to ALC269_FIXUP_HEADSET_MODE_NO_HP_MIC, I have exposed the fixup from the ALC269 in a way that it can be accessed from the alc1220_fixup_system76_oryp5 function. In addition, the alc1220_fixup_clevo_p950 needs to be applied to gain speaker output. Signed-off-by: Jeremy Soller Cc: Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 6df758adff84..3ce318a3086d 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -1855,6 +1855,8 @@ enum { ALC887_FIXUP_BASS_CHMAP, ALC1220_FIXUP_GB_DUAL_CODECS, ALC1220_FIXUP_CLEVO_P950, + ALC1220_FIXUP_SYSTEM76_ORYP5, + ALC1220_FIXUP_SYSTEM76_ORYP5_PINS, }; static void alc889_fixup_coef(struct hda_codec *codec, @@ -2056,6 +2058,17 @@ static void alc1220_fixup_clevo_p950(struct hda_codec *codec, snd_hda_override_conn_list(codec, 0x1b, 1, conn1); } +static void alc_fixup_headset_mode_no_hp_mic(struct hda_codec *codec, + const struct hda_fixup *fix, int action); + +static void alc1220_fixup_system76_oryp5(struct hda_codec *codec, + const struct hda_fixup *fix, + int action) +{ + alc1220_fixup_clevo_p950(codec, fix, action); + alc_fixup_headset_mode_no_hp_mic(codec, fix, action); +} + static const struct hda_fixup alc882_fixups[] = { [ALC882_FIXUP_ABIT_AW9D_MAX] = { .type = HDA_FIXUP_PINS, @@ -2300,6 +2313,19 @@ static const struct hda_fixup alc882_fixups[] = { .type = HDA_FIXUP_FUNC, .v.func = alc1220_fixup_clevo_p950, }, + [ALC1220_FIXUP_SYSTEM76_ORYP5] = { + .type = HDA_FIXUP_FUNC, + .v.func = alc1220_fixup_system76_oryp5, + }, + [ALC1220_FIXUP_SYSTEM76_ORYP5_PINS] = { + .type = HDA_FIXUP_PINS, + .v.pins = (const struct hda_pintbl[]) { + { 0x19, 0x01a1913c }, /* use as headset mic, without its own jack detect */ + {} + }, + .chained = true, + .chain_id = ALC1220_FIXUP_SYSTEM76_ORYP5, + }, }; static const struct snd_pci_quirk alc882_fixup_tbl[] = { @@ -2376,6 +2402,8 @@ static const struct snd_pci_quirk alc882_fixup_tbl[] = { SND_PCI_QUIRK(0x1558, 0x9501, "Clevo P950HR", ALC1220_FIXUP_CLEVO_P950), SND_PCI_QUIRK(0x1558, 0x95e1, "Clevo P95xER", ALC1220_FIXUP_CLEVO_P950), SND_PCI_QUIRK(0x1558, 0x95e2, "Clevo P950ER", ALC1220_FIXUP_CLEVO_P950), + SND_PCI_QUIRK(0x1558, 0x96e1, "System76 Oryx Pro (oryp5)", ALC1220_FIXUP_SYSTEM76_ORYP5_PINS), + SND_PCI_QUIRK(0x1558, 0x97e1, "System76 Oryx Pro (oryp5)", ALC1220_FIXUP_SYSTEM76_ORYP5_PINS), SND_PCI_QUIRK_VENDOR(0x1558, "Clevo laptop", ALC882_FIXUP_EAPD), SND_PCI_QUIRK(0x161f, 0x2054, "Medion laptop", ALC883_FIXUP_EAPD), SND_PCI_QUIRK(0x17aa, 0x3a0d, "Lenovo Y530", ALC882_FIXUP_LENOVO_Y530), From c8c6ee611926685a7d753409e0a6e48b9e1b8748 Mon Sep 17 00:00:00 2001 From: Hui Wang Date: Thu, 14 Feb 2019 11:41:33 +0800 Subject: [PATCH 028/298] ALSA: hda/realtek: Disable PC beep in passthrough on alc285 It is reported that there's a constant background "hum/whitenoise" in the headset on the Lenovo X1 machines with the codec alc285, and it is confirmed that if we run the command below, the noise will stop. sudo hda-verb /dev/snd/hwC0D0 0x1d SET_PIN_WIDGET_CONTROL 0x0 Then I consulted this issue with Kailang, he told me the pin 0x1d on this codec is used for PC beep in, the noise probably comes from this pin and we can also disable the PC beep in passthrough, then the PC beep in will not affect other sound playback. Fixes: c4cfcf6f4297 ("ALSA: hda/realtek - fix the pop noise on headphone for lenovo laptops") Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1660581 Cc: Signed-off-by: Kailang Yang Signed-off-by: Hui Wang Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 3ce318a3086d..1ffa36e987b4 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -5660,6 +5660,7 @@ enum { ALC294_FIXUP_ASUS_SPK, ALC225_FIXUP_HEADSET_JACK, ALC293_FIXUP_SYSTEM76_MIC_NO_PRESENCE, + ALC285_FIXUP_LENOVO_PC_BEEP_IN_NOISE, }; static const struct hda_fixup alc269_fixups[] = { @@ -6615,6 +6616,17 @@ static const struct hda_fixup alc269_fixups[] = { .chained = true, .chain_id = ALC269_FIXUP_HEADSET_MODE_NO_HP_MIC }, + [ALC285_FIXUP_LENOVO_PC_BEEP_IN_NOISE] = { + .type = HDA_FIXUP_VERBS, + .v.verbs = (const struct hda_verb[]) { + /* Disable PCBEEP-IN passthrough */ + { 0x20, AC_VERB_SET_COEF_INDEX, 0x36 }, + { 0x20, AC_VERB_SET_PROC_COEF, 0x57d7 }, + { } + }, + .chained = true, + .chain_id = ALC285_FIXUP_LENOVO_HEADPHONE_NOISE + }, }; static const struct snd_pci_quirk alc269_fixup_tbl[] = { @@ -7300,7 +7312,7 @@ static const struct snd_hda_pin_quirk alc269_pin_fixup_tbl[] = { {0x12, 0x90a60130}, {0x19, 0x03a11020}, {0x21, 0x0321101f}), - SND_HDA_PIN_QUIRK(0x10ec0285, 0x17aa, "Lenovo", ALC285_FIXUP_LENOVO_HEADPHONE_NOISE, + SND_HDA_PIN_QUIRK(0x10ec0285, 0x17aa, "Lenovo", ALC285_FIXUP_LENOVO_PC_BEEP_IN_NOISE, {0x12, 0x90a60130}, {0x14, 0x90170110}, {0x19, 0x04a11040}, From af14b2c98adb85e9517390bb88309338b9075350 Mon Sep 17 00:00:00 2001 From: Lubomir Rintel Date: Thu, 14 Feb 2019 00:06:18 +0100 Subject: [PATCH 029/298] gpio: pxa: avoid attempting to set pin direction via pinctrl on MMP2 Similarly to PXA3xx, pinctrl-single can't set pin direction on MMP2 either. See also: commit 9dabfdd84bdfa ("gpio: pxa: disable pinctrl calls for PXA3xx") Cc: stable@vger.kernel.org Fixes: a770d946371e ("gpio: pxa: add pin control gpio direction and request") Signed-off-by: Lubomir Rintel Acked-by: Pavel Machek Signed-off-by: Linus Walleij --- drivers/gpio/gpio-pxa.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpio/gpio-pxa.c b/drivers/gpio/gpio-pxa.c index e9600b556f39..bcc6be4a5cb2 100644 --- a/drivers/gpio/gpio-pxa.c +++ b/drivers/gpio/gpio-pxa.c @@ -245,6 +245,7 @@ static bool pxa_gpio_has_pinctrl(void) { switch (gpio_type) { case PXA3XX_GPIO: + case MMP2_GPIO: return false; default: From b4c3fbe6360178dc2181b7b43b7ae793a192b282 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 14 Feb 2019 22:03:24 +0800 Subject: [PATCH 030/298] mac80211: Use linked list instead of rhashtable walk for mesh tables The mesh table code walks over hash tables for two purposes. First of all it's used as part of a netlink dump process, but it is also used for looking up entries to delete using criteria other than the hash key. The second purpose is directly contrary to the design specification of rhashtable walks. It is only meant for use by netlink dumps. This is because rhashtable is resizable and you cannot obtain a stable walk over it during a resize process. In fact mesh's use of rhashtable for dumping is bogus too. Rather than using rhashtable walk's iterator to keep track of the current position, it always converts the current position to an integer which defeats the purpose of the iterator. Therefore this patch converts all uses of rhashtable walk into a simple linked list. This patch also adds a new spin lock to protect the hash table insertion/removal as well as the walk list modifications. In fact the previous code was buggy as the removals can race with each other, potentially resulting in a double-free. Cc: stable@vger.kernel.org Signed-off-by: Herbert Xu Signed-off-by: Johannes Berg --- net/mac80211/mesh.h | 6 ++ net/mac80211/mesh_pathtbl.c | 138 ++++++++++-------------------------- 2 files changed, 43 insertions(+), 101 deletions(-) diff --git a/net/mac80211/mesh.h b/net/mac80211/mesh.h index cad6592c52a1..2ec7011a4d07 100644 --- a/net/mac80211/mesh.h +++ b/net/mac80211/mesh.h @@ -70,6 +70,7 @@ enum mesh_deferred_task_flags { * @dst: mesh path destination mac address * @mpp: mesh proxy mac address * @rhash: rhashtable list pointer + * @walk_list: linked list containing all mesh_path objects. * @gate_list: list pointer for known gates list * @sdata: mesh subif * @next_hop: mesh neighbor to which frames for this destination will be @@ -105,6 +106,7 @@ struct mesh_path { u8 dst[ETH_ALEN]; u8 mpp[ETH_ALEN]; /* used for MPP or MAP */ struct rhash_head rhash; + struct hlist_node walk_list; struct hlist_node gate_list; struct ieee80211_sub_if_data *sdata; struct sta_info __rcu *next_hop; @@ -133,12 +135,16 @@ struct mesh_path { * gate's mpath may or may not be resolved and active. * @gates_lock: protects updates to known_gates * @rhead: the rhashtable containing struct mesh_paths, keyed by dest addr + * @walk_head: linked list containging all mesh_path objects + * @walk_lock: lock protecting walk_head * @entries: number of entries in the table */ struct mesh_table { struct hlist_head known_gates; spinlock_t gates_lock; struct rhashtable rhead; + struct hlist_head walk_head; + spinlock_t walk_lock; atomic_t entries; /* Up to MAX_MESH_NEIGHBOURS */ }; diff --git a/net/mac80211/mesh_pathtbl.c b/net/mac80211/mesh_pathtbl.c index a5125624a76d..884a0d212e8b 100644 --- a/net/mac80211/mesh_pathtbl.c +++ b/net/mac80211/mesh_pathtbl.c @@ -59,8 +59,10 @@ static struct mesh_table *mesh_table_alloc(void) return NULL; INIT_HLIST_HEAD(&newtbl->known_gates); + INIT_HLIST_HEAD(&newtbl->walk_head); atomic_set(&newtbl->entries, 0); spin_lock_init(&newtbl->gates_lock); + spin_lock_init(&newtbl->walk_lock); return newtbl; } @@ -249,28 +251,15 @@ mpp_path_lookup(struct ieee80211_sub_if_data *sdata, const u8 *dst) static struct mesh_path * __mesh_path_lookup_by_idx(struct mesh_table *tbl, int idx) { - int i = 0, ret; - struct mesh_path *mpath = NULL; - struct rhashtable_iter iter; + int i = 0; + struct mesh_path *mpath; - ret = rhashtable_walk_init(&tbl->rhead, &iter, GFP_ATOMIC); - if (ret) - return NULL; - - rhashtable_walk_start(&iter); - - while ((mpath = rhashtable_walk_next(&iter))) { - if (IS_ERR(mpath) && PTR_ERR(mpath) == -EAGAIN) - continue; - if (IS_ERR(mpath)) - break; + hlist_for_each_entry_rcu(mpath, &tbl->walk_head, walk_list) { if (i++ == idx) break; } - rhashtable_walk_stop(&iter); - rhashtable_walk_exit(&iter); - if (IS_ERR(mpath) || !mpath) + if (!mpath) return NULL; if (mpath_expired(mpath)) { @@ -432,6 +421,7 @@ struct mesh_path *mesh_path_add(struct ieee80211_sub_if_data *sdata, return ERR_PTR(-ENOMEM); tbl = sdata->u.mesh.mesh_paths; + spin_lock_bh(&tbl->walk_lock); do { ret = rhashtable_lookup_insert_fast(&tbl->rhead, &new_mpath->rhash, @@ -441,8 +431,10 @@ struct mesh_path *mesh_path_add(struct ieee80211_sub_if_data *sdata, mpath = rhashtable_lookup_fast(&tbl->rhead, dst, mesh_rht_params); - + else if (!ret) + hlist_add_head(&new_mpath->walk_list, &tbl->walk_head); } while (unlikely(ret == -EEXIST && !mpath)); + spin_unlock_bh(&tbl->walk_lock); if (ret && ret != -EEXIST) return ERR_PTR(ret); @@ -480,9 +472,14 @@ int mpp_path_add(struct ieee80211_sub_if_data *sdata, memcpy(new_mpath->mpp, mpp, ETH_ALEN); tbl = sdata->u.mesh.mpp_paths; + + spin_lock_bh(&tbl->walk_lock); ret = rhashtable_lookup_insert_fast(&tbl->rhead, &new_mpath->rhash, mesh_rht_params); + if (!ret) + hlist_add_head_rcu(&new_mpath->walk_list, &tbl->walk_head); + spin_unlock_bh(&tbl->walk_lock); sdata->u.mesh.mpp_paths_generation++; return ret; @@ -503,20 +500,9 @@ void mesh_plink_broken(struct sta_info *sta) struct mesh_table *tbl = sdata->u.mesh.mesh_paths; static const u8 bcast[ETH_ALEN] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; struct mesh_path *mpath; - struct rhashtable_iter iter; - int ret; - ret = rhashtable_walk_init(&tbl->rhead, &iter, GFP_ATOMIC); - if (ret) - return; - - rhashtable_walk_start(&iter); - - while ((mpath = rhashtable_walk_next(&iter))) { - if (IS_ERR(mpath) && PTR_ERR(mpath) == -EAGAIN) - continue; - if (IS_ERR(mpath)) - break; + rcu_read_lock(); + hlist_for_each_entry_rcu(mpath, &tbl->walk_head, walk_list) { if (rcu_access_pointer(mpath->next_hop) == sta && mpath->flags & MESH_PATH_ACTIVE && !(mpath->flags & MESH_PATH_FIXED)) { @@ -530,8 +516,7 @@ void mesh_plink_broken(struct sta_info *sta) WLAN_REASON_MESH_PATH_DEST_UNREACHABLE, bcast); } } - rhashtable_walk_stop(&iter); - rhashtable_walk_exit(&iter); + rcu_read_unlock(); } static void mesh_path_free_rcu(struct mesh_table *tbl, @@ -551,6 +536,7 @@ static void mesh_path_free_rcu(struct mesh_table *tbl, static void __mesh_path_del(struct mesh_table *tbl, struct mesh_path *mpath) { + hlist_del_rcu(&mpath->walk_list); rhashtable_remove_fast(&tbl->rhead, &mpath->rhash, mesh_rht_params); mesh_path_free_rcu(tbl, mpath); } @@ -571,27 +557,14 @@ void mesh_path_flush_by_nexthop(struct sta_info *sta) struct ieee80211_sub_if_data *sdata = sta->sdata; struct mesh_table *tbl = sdata->u.mesh.mesh_paths; struct mesh_path *mpath; - struct rhashtable_iter iter; - int ret; - - ret = rhashtable_walk_init(&tbl->rhead, &iter, GFP_ATOMIC); - if (ret) - return; - - rhashtable_walk_start(&iter); - - while ((mpath = rhashtable_walk_next(&iter))) { - if (IS_ERR(mpath) && PTR_ERR(mpath) == -EAGAIN) - continue; - if (IS_ERR(mpath)) - break; + struct hlist_node *n; + spin_lock_bh(&tbl->walk_lock); + hlist_for_each_entry_safe(mpath, n, &tbl->walk_head, walk_list) { if (rcu_access_pointer(mpath->next_hop) == sta) __mesh_path_del(tbl, mpath); } - - rhashtable_walk_stop(&iter); - rhashtable_walk_exit(&iter); + spin_unlock_bh(&tbl->walk_lock); } static void mpp_flush_by_proxy(struct ieee80211_sub_if_data *sdata, @@ -599,51 +572,26 @@ static void mpp_flush_by_proxy(struct ieee80211_sub_if_data *sdata, { struct mesh_table *tbl = sdata->u.mesh.mpp_paths; struct mesh_path *mpath; - struct rhashtable_iter iter; - int ret; - - ret = rhashtable_walk_init(&tbl->rhead, &iter, GFP_ATOMIC); - if (ret) - return; - - rhashtable_walk_start(&iter); - - while ((mpath = rhashtable_walk_next(&iter))) { - if (IS_ERR(mpath) && PTR_ERR(mpath) == -EAGAIN) - continue; - if (IS_ERR(mpath)) - break; + struct hlist_node *n; + spin_lock_bh(&tbl->walk_lock); + hlist_for_each_entry_safe(mpath, n, &tbl->walk_head, walk_list) { if (ether_addr_equal(mpath->mpp, proxy)) __mesh_path_del(tbl, mpath); } - - rhashtable_walk_stop(&iter); - rhashtable_walk_exit(&iter); + spin_unlock_bh(&tbl->walk_lock); } static void table_flush_by_iface(struct mesh_table *tbl) { struct mesh_path *mpath; - struct rhashtable_iter iter; - int ret; + struct hlist_node *n; - ret = rhashtable_walk_init(&tbl->rhead, &iter, GFP_ATOMIC); - if (ret) - return; - - rhashtable_walk_start(&iter); - - while ((mpath = rhashtable_walk_next(&iter))) { - if (IS_ERR(mpath) && PTR_ERR(mpath) == -EAGAIN) - continue; - if (IS_ERR(mpath)) - break; + spin_lock_bh(&tbl->walk_lock); + hlist_for_each_entry_safe(mpath, n, &tbl->walk_head, walk_list) { __mesh_path_del(tbl, mpath); } - - rhashtable_walk_stop(&iter); - rhashtable_walk_exit(&iter); + spin_unlock_bh(&tbl->walk_lock); } /** @@ -675,7 +623,7 @@ static int table_path_del(struct mesh_table *tbl, { struct mesh_path *mpath; - rcu_read_lock(); + spin_lock_bh(&tbl->walk_lock); mpath = rhashtable_lookup_fast(&tbl->rhead, addr, mesh_rht_params); if (!mpath) { rcu_read_unlock(); @@ -683,7 +631,7 @@ static int table_path_del(struct mesh_table *tbl, } __mesh_path_del(tbl, mpath); - rcu_read_unlock(); + spin_unlock_bh(&tbl->walk_lock); return 0; } @@ -854,28 +802,16 @@ void mesh_path_tbl_expire(struct ieee80211_sub_if_data *sdata, struct mesh_table *tbl) { struct mesh_path *mpath; - struct rhashtable_iter iter; - int ret; + struct hlist_node *n; - ret = rhashtable_walk_init(&tbl->rhead, &iter, GFP_KERNEL); - if (ret) - return; - - rhashtable_walk_start(&iter); - - while ((mpath = rhashtable_walk_next(&iter))) { - if (IS_ERR(mpath) && PTR_ERR(mpath) == -EAGAIN) - continue; - if (IS_ERR(mpath)) - break; + spin_lock_bh(&tbl->walk_lock); + hlist_for_each_entry_safe(mpath, n, &tbl->walk_head, walk_list) { if ((!(mpath->flags & MESH_PATH_RESOLVING)) && (!(mpath->flags & MESH_PATH_FIXED)) && time_after(jiffies, mpath->exp_time + MESH_PATH_EXPIRE)) __mesh_path_del(tbl, mpath); } - - rhashtable_walk_stop(&iter); - rhashtable_walk_exit(&iter); + spin_unlock_bh(&tbl->walk_lock); } void mesh_path_expire(struct ieee80211_sub_if_data *sdata) From 4ff3a9d14c6c06eaa4e5976c61599ea2bd9e81b2 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 14 Feb 2019 22:03:25 +0800 Subject: [PATCH 031/298] mac80211: Free mpath object when rhashtable insertion fails When rhashtable insertion fails the mesh table code doesn't free the now-orphan mesh path object. This patch fixes that. Cc: stable@vger.kernel.org Signed-off-by: Herbert Xu Signed-off-by: Johannes Berg --- net/mac80211/mesh_pathtbl.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/net/mac80211/mesh_pathtbl.c b/net/mac80211/mesh_pathtbl.c index 884a0d212e8b..c3a7396fb955 100644 --- a/net/mac80211/mesh_pathtbl.c +++ b/net/mac80211/mesh_pathtbl.c @@ -436,17 +436,15 @@ struct mesh_path *mesh_path_add(struct ieee80211_sub_if_data *sdata, } while (unlikely(ret == -EEXIST && !mpath)); spin_unlock_bh(&tbl->walk_lock); - if (ret && ret != -EEXIST) - return ERR_PTR(ret); - - /* At this point either new_mpath was added, or we found a - * matching entry already in the table; in the latter case - * free the unnecessary new entry. - */ - if (ret == -EEXIST) { + if (ret) { kfree(new_mpath); + + if (ret != -EEXIST) + return ERR_PTR(ret); + new_mpath = mpath; } + sdata->u.mesh.mesh_paths_generation++; return new_mpath; } @@ -481,6 +479,9 @@ int mpp_path_add(struct ieee80211_sub_if_data *sdata, hlist_add_head_rcu(&new_mpath->walk_list, &tbl->walk_head); spin_unlock_bh(&tbl->walk_lock); + if (ret) + kfree(new_mpath); + sdata->u.mesh.mpp_paths_generation++; return ret; } From 83e37e0bdd1470bbe6612250b745ad39b1a7b130 Mon Sep 17 00:00:00 2001 From: Rakesh Pillai Date: Fri, 15 Feb 2019 14:16:02 +0530 Subject: [PATCH 032/298] mac80211: Restore vif beacon interval if start ap fails The starting of AP interface can fail due to invalid beacon interval, which does not match the minimum gcd requirement set by the wifi driver. In such case, the beacon interval of that interface gets updated with that invalid beacon interval. The next time that interface is brought up in AP mode, an interface combination check is performed and the beacon interval is taken from the previously set value. In a case where an invalid beacon interval, i.e. a beacon interval value which does not satisfy the minimum gcd criteria set by the driver, is set, all the subsequent trials to bring that interface in AP mode will fail, even if the subsequent trials have a valid beacon interval. To avoid this, in case of a failure in bringing up an interface in AP mode due to interface combination error, the interface beacon interval which is stored in bss conf, needs to be restored with the last working value of beacon interval. Tested on ath10k using WCN3990. Cc: stable@vger.kernel.org Fixes: 0c317a02ca98 ("cfg80211: support virtual interfaces with different beacon intervals") Signed-off-by: Rakesh Pillai Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 2493c74c2d37..96496b2c1670 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -941,6 +941,7 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, BSS_CHANGED_P2P_PS | BSS_CHANGED_TXPOWER; int err; + int prev_beacon_int; old = sdata_dereference(sdata->u.ap.beacon, sdata); if (old) @@ -963,6 +964,7 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, sdata->needed_rx_chains = sdata->local->rx_chains; + prev_beacon_int = sdata->vif.bss_conf.beacon_int; sdata->vif.bss_conf.beacon_int = params->beacon_interval; if (params->he_cap) @@ -974,8 +976,10 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, if (!err) ieee80211_vif_copy_chanctx_to_vlans(sdata, false); mutex_unlock(&local->mtx); - if (err) + if (err) { + sdata->vif.bss_conf.beacon_int = prev_beacon_int; return err; + } /* * Apply control port protocol, this allows us to From 23b7ca4f745f21c2b9cfcb67fdd33733b3ae7e66 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 15 Feb 2019 12:50:24 +0100 Subject: [PATCH 033/298] netfilter: nf_tables: fix flush after rule deletion in the same batch Flush after rule deletion bogusly hits -ENOENT. Skip rules that have been already from nft_delrule_by_chain() which is always called from the flush path. Fixes: cf9dc09d0949 ("netfilter: nf_tables: fix missing rules flushing per table") Reported-by: Phil Sutter Acked-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 5a92f23f179f..4893f248dfdc 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -313,6 +313,9 @@ static int nft_delrule_by_chain(struct nft_ctx *ctx) int err; list_for_each_entry(rule, &ctx->chain->rules, list) { + if (!nft_is_active_next(ctx->net, rule)) + continue; + err = nft_delrule(ctx, rule); if (err < 0) return err; From fc4144e7815b7747b6aba140d7a91da45ee9dd8c Mon Sep 17 00:00:00 2001 From: Raju Rangoju Date: Thu, 14 Feb 2019 17:40:53 +0530 Subject: [PATCH 034/298] cxgb4: Export sge_host_page_size to ulds Export the sge_host_page_size field to ULDs via cxgb4_lld_info, so that iw_cxgb4 can make use of this in calculating the correct qp/cq mask. Fixes: 2391b0030e24 ("cxgb4: Remove SGE_HOST_PAGE_SIZE dependency on page size") Signed-off-by: Raju Rangoju Signed-off-by: Jason Gunthorpe --- drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.c | 1 + drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.c index c041f44324db..b3654598a2d5 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.c @@ -660,6 +660,7 @@ static void uld_init(struct adapter *adap, struct cxgb4_lld_info *lld) lld->cclk_ps = 1000000000 / adap->params.vpd.cclk; lld->udb_density = 1 << adap->params.sge.eq_qpp; lld->ucq_density = 1 << adap->params.sge.iq_qpp; + lld->sge_host_page_size = 1 << (adap->params.sge.hps + 10); lld->filt_mode = adap->params.tp.vlan_pri_map; /* MODQ_REQ_MAP sets queues 0-3 to chan 0-3 */ for (i = 0; i < NCHAN; i++) diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h index 5fa9a2d5fc4b..21da34a4ca24 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h @@ -336,6 +336,7 @@ struct cxgb4_lld_info { unsigned int cclk_ps; /* Core clock period in psec */ unsigned short udb_density; /* # of user DB/page */ unsigned short ucq_density; /* # of user CQs/page */ + unsigned int sge_host_page_size; /* SGE host page size */ unsigned short filt_mode; /* filter optional components */ unsigned short tx_modq[NCHAN]; /* maps each tx channel to a */ /* scheduler queue */ From f09ef134a7ca3f0d2ce485a757f5b79809ebb803 Mon Sep 17 00:00:00 2001 From: Raju Rangoju Date: Thu, 14 Feb 2019 17:40:54 +0530 Subject: [PATCH 035/298] iw_cxgb4: cq/qp mask depends on bar2 pages in a host page Adjust the cq/qp mask based on the number of bar2 pages in a host page. For user-mode rdma, the granularity of the BAR2 memory mapped to a user rdma process during queue allocation must be based on the host page size. The lld attributes udb_density and ucq_density are used to figure out how many sge contexts are in a bar2 page. So the rdev->qpmask and rdev->cqmask in iw_cxgb4 need to now be adjusted based on how many sge bar2 pages are in a host page. Otherwise the device fails to work on non 4k page size systems. Fixes: 2391b0030e24 ("cxgb4: Remove SGE_HOST_PAGE_SIZE dependency on page size") Signed-off-by: Raju Rangoju Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/cxgb4/device.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/cxgb4/device.c b/drivers/infiniband/hw/cxgb4/device.c index c13c0ba30f63..d499cd61c0e8 100644 --- a/drivers/infiniband/hw/cxgb4/device.c +++ b/drivers/infiniband/hw/cxgb4/device.c @@ -783,6 +783,7 @@ void c4iw_init_dev_ucontext(struct c4iw_rdev *rdev, static int c4iw_rdev_open(struct c4iw_rdev *rdev) { int err; + unsigned int factor; c4iw_init_dev_ucontext(rdev, &rdev->uctx); @@ -806,8 +807,18 @@ static int c4iw_rdev_open(struct c4iw_rdev *rdev) return -EINVAL; } - rdev->qpmask = rdev->lldi.udb_density - 1; - rdev->cqmask = rdev->lldi.ucq_density - 1; + /* This implementation requires a sge_host_page_size <= PAGE_SIZE. */ + if (rdev->lldi.sge_host_page_size > PAGE_SIZE) { + pr_err("%s: unsupported sge host page size %u\n", + pci_name(rdev->lldi.pdev), + rdev->lldi.sge_host_page_size); + return -EINVAL; + } + + factor = PAGE_SIZE / rdev->lldi.sge_host_page_size; + rdev->qpmask = (rdev->lldi.udb_density * factor) - 1; + rdev->cqmask = (rdev->lldi.ucq_density * factor) - 1; + pr_debug("dev %s stag start 0x%0x size 0x%0x num stags %d pbl start 0x%0x size 0x%0x rq start 0x%0x size 0x%0x qp qid start %u size %u cq qid start %u size %u srq size %u\n", pci_name(rdev->lldi.pdev), rdev->lldi.vr->stag.start, rdev->lldi.vr->stag.size, c4iw_num_stags(rdev), From 2c4f1fcbef0bc324830bc2fb1a264c08ec93dec5 Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Fri, 25 Jan 2019 23:10:50 +0800 Subject: [PATCH 036/298] kprobe: Do not use uaccess functions to access kernel memory that can fault The userspace can ask kprobe to intercept strings at any memory address, including invalid kernel address. In this case, fetch_store_strlen() would crash since it uses general usercopy function, and user access functions are no longer allowed to access kernel memory. For example, we can crash the kernel by doing something as below: $ sudo kprobe 'p:do_sys_open +0(+0(%si)):string' [ 103.620391] BUG: GPF in non-whitelisted uaccess (non-canonical address?) [ 103.622104] general protection fault: 0000 [#1] SMP PTI [ 103.623424] CPU: 10 PID: 1046 Comm: cat Not tainted 5.0.0-rc3-00130-gd73aba1-dirty #96 [ 103.625321] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.12.0-2-g628b2e6-dirty-20190104_103505-linux 04/01/2014 [ 103.628284] RIP: 0010:process_fetch_insn+0x1ab/0x4b0 [ 103.629518] Code: 10 83 80 28 2e 00 00 01 31 d2 31 ff 48 8b 74 24 28 eb 0c 81 fa ff 0f 00 00 7f 1c 85 c0 75 18 66 66 90 0f ae e8 48 63 ca 89 f8 <8a> 0c 31 66 66 90 83 c2 01 84 c9 75 dc 89 54 24 34 89 44 24 28 48 [ 103.634032] RSP: 0018:ffff88845eb37ce0 EFLAGS: 00010246 [ 103.635312] RAX: 0000000000000000 RBX: ffff888456c4e5a8 RCX: 0000000000000000 [ 103.637057] RDX: 0000000000000000 RSI: 2e646c2f6374652f RDI: 0000000000000000 [ 103.638795] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000000 [ 103.640556] R10: 0000000000000001 R11: 0000000000000000 R12: 0000000000000000 [ 103.642297] R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 [ 103.644040] FS: 0000000000000000(0000) GS:ffff88846f000000(0000) knlGS:0000000000000000 [ 103.646019] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 103.647436] CR2: 00007ffc79758038 CR3: 0000000463360006 CR4: 0000000000020ee0 [ 103.649147] Call Trace: [ 103.649781] ? sched_clock_cpu+0xc/0xa0 [ 103.650747] ? do_sys_open+0x5/0x220 [ 103.651635] kprobe_trace_func+0x303/0x380 [ 103.652645] ? do_sys_open+0x5/0x220 [ 103.653528] kprobe_dispatcher+0x45/0x50 [ 103.654682] ? do_sys_open+0x1/0x220 [ 103.655875] kprobe_ftrace_handler+0x90/0xf0 [ 103.657282] ftrace_ops_assist_func+0x54/0xf0 [ 103.658564] ? __call_rcu+0x1dc/0x280 [ 103.659482] 0xffffffffc00000bf [ 103.660384] ? __ia32_sys_open+0x20/0x20 [ 103.661682] ? do_sys_open+0x1/0x220 [ 103.662863] do_sys_open+0x5/0x220 [ 103.663988] do_syscall_64+0x60/0x210 [ 103.665201] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 103.666862] RIP: 0033:0x7fc22fadccdd [ 103.668034] Code: 48 89 54 24 e0 41 83 e2 40 75 32 89 f0 25 00 00 41 00 3d 00 00 41 00 74 24 89 f2 b8 01 01 00 00 48 89 fe bf 9c ff ff ff 0f 05 <48> 3d 00 f0 ff ff 77 33 f3 c3 66 0f 1f 84 00 00 00 00 00 48 8d 44 [ 103.674029] RSP: 002b:00007ffc7972c3a8 EFLAGS: 00000287 ORIG_RAX: 0000000000000101 [ 103.676512] RAX: ffffffffffffffda RBX: 0000562f86147a21 RCX: 00007fc22fadccdd [ 103.678853] RDX: 0000000000080000 RSI: 00007fc22fae1428 RDI: 00000000ffffff9c [ 103.681151] RBP: ffffffffffffffff R08: 0000000000000000 R09: 0000000000000000 [ 103.683489] R10: 0000000000000000 R11: 0000000000000287 R12: 00007fc22fce90a8 [ 103.685774] R13: 0000000000000001 R14: 0000000000000000 R15: 0000000000000000 [ 103.688056] Modules linked in: [ 103.689131] ---[ end trace 43792035c28984a1 ]--- This can be fixed by using probe_mem_read() instead, as it can handle faulting kernel memory addresses, which kprobes can legitimately do. Link: http://lkml.kernel.org/r/20190125151051.7381-1-changbin.du@gmail.com Cc: stable@vger.kernel.org Fixes: 9da3f2b7405 ("x86/fault: BUG() when uaccess helpers fault on kernel addresses") Signed-off-by: Changbin Du Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index d5fb09ebba8b..9eaf07f99212 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -861,22 +861,14 @@ static const struct file_operations kprobe_profile_ops = { static nokprobe_inline int fetch_store_strlen(unsigned long addr) { - mm_segment_t old_fs; int ret, len = 0; u8 c; - old_fs = get_fs(); - set_fs(KERNEL_DS); - pagefault_disable(); - do { - ret = __copy_from_user_inatomic(&c, (u8 *)addr + len, 1); + ret = probe_mem_read(&c, (u8 *)addr + len, 1); len++; } while (c && ret == 0 && len < MAX_STRING_SIZE); - pagefault_enable(); - set_fs(old_fs); - return (ret < 0) ? ret : len; } From 9e7382153f80ba45a0bbcd540fb77d4b15f6e966 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Thu, 14 Feb 2019 15:29:50 +0000 Subject: [PATCH 037/298] tracing: Fix number of entries in trace header The following commit 441dae8f2f29 ("tracing: Add support for display of tgid in trace output") removed the call to print_event_info() from print_func_help_header_irq() which results in the ftrace header not reporting the number of entries written in the buffer. As this wasn't the original intent of the patch, re-introduce the call to print_event_info() to restore the orginal behaviour. Link: http://lkml.kernel.org/r/20190214152950.4179-1-quentin.perret@arm.com Acked-by: Joel Fernandes Cc: stable@vger.kernel.org Fixes: 441dae8f2f29 ("tracing: Add support for display of tgid in trace output") Signed-off-by: Quentin Perret Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c521b7347482..c4238b441624 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3384,6 +3384,8 @@ static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file const char tgid_space[] = " "; const char space[] = " "; + print_event_info(buf, m); + seq_printf(m, "# %s _-----=> irqs-off\n", tgid ? tgid_space : space); seq_printf(m, "# %s / _----=> need-resched\n", From a08bf91ce28ed3ae7b6fef35d843fef8dc8c2cd9 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 14 Feb 2019 16:20:01 +0000 Subject: [PATCH 038/298] KEYS: allow reaching the keys quotas exactly If the sysctl 'kernel.keys.maxkeys' is set to some number n, then actually users can only add up to 'n - 1' keys. Likewise for 'kernel.keys.maxbytes' and the root_* versions of these sysctls. But these sysctls are apparently supposed to be *maximums*, as per their names and all documentation I could find -- the keyrings(7) man page, Documentation/security/keys/core.rst, and all the mentions of EDQUOT meaning that the key quota was *exceeded* (as opposed to reached). Thus, fix the code to allow reaching the quotas exactly. Fixes: 0b77f5bfb45c ("keys: make the keyring quotas controllable through /proc/sys") Cc: stable@vger.kernel.org Signed-off-by: Eric Biggers Signed-off-by: David Howells Signed-off-by: James Morris --- security/keys/key.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/security/keys/key.c b/security/keys/key.c index 44a80d6741a1..0ec9322af4f9 100644 --- a/security/keys/key.c +++ b/security/keys/key.c @@ -265,8 +265,8 @@ struct key *key_alloc(struct key_type *type, const char *desc, spin_lock(&user->lock); if (!(flags & KEY_ALLOC_QUOTA_OVERRUN)) { - if (user->qnkeys + 1 >= maxkeys || - user->qnbytes + quotalen >= maxbytes || + if (user->qnkeys + 1 > maxkeys || + user->qnbytes + quotalen > maxbytes || user->qnbytes + quotalen < user->qnbytes) goto no_quota; } From bb2ba2d75a2d673e76ddaf13a9bd30d6a8b1bb08 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 14 Feb 2019 16:20:15 +0000 Subject: [PATCH 039/298] assoc_array: Fix shortcut creation Fix the creation of shortcuts for which the length of the index key value is an exact multiple of the machine word size. The problem is that the code that blanks off the unused bits of the shortcut value malfunctions if the number of bits in the last word equals machine word size. This is due to the "<<" operator being given a shift of zero in this case, and so the mask that should be all zeros is all ones instead. This causes the subsequent masking operation to clear everything rather than clearing nothing. Ordinarily, the presence of the hash at the beginning of the tree index key makes the issue very hard to test for, but in this case, it was encountered due to a development mistake that caused the hash output to be either 0 (keyring) or 1 (non-keyring) only. This made it susceptible to the keyctl/unlink/valid test in the keyutils package. The fix is simply to skip the blanking if the shift would be 0. For example, an index key that is 64 bits long would produce a 0 shift and thus a 'blank' of all 1s. This would then be inverted and AND'd onto the index_key, incorrectly clearing the entire last word. Fixes: 3cb989501c26 ("Add a generic associative array implementation.") Signed-off-by: David Howells Signed-off-by: James Morris --- lib/assoc_array.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/lib/assoc_array.c b/lib/assoc_array.c index c6659cb37033..59875eb278ea 100644 --- a/lib/assoc_array.c +++ b/lib/assoc_array.c @@ -768,9 +768,11 @@ static bool assoc_array_insert_into_terminal_node(struct assoc_array_edit *edit, new_s0->index_key[i] = ops->get_key_chunk(index_key, i * ASSOC_ARRAY_KEY_CHUNK_SIZE); - blank = ULONG_MAX << (level & ASSOC_ARRAY_KEY_CHUNK_MASK); - pr_devel("blank off [%zu] %d: %lx\n", keylen - 1, level, blank); - new_s0->index_key[keylen - 1] &= ~blank; + if (level & ASSOC_ARRAY_KEY_CHUNK_MASK) { + blank = ULONG_MAX << (level & ASSOC_ARRAY_KEY_CHUNK_MASK); + pr_devel("blank off [%zu] %d: %lx\n", keylen - 1, level, blank); + new_s0->index_key[keylen - 1] &= ~blank; + } /* This now reduces to a node splitting exercise for which we'll need * to regenerate the disparity table. From 822ad64d7e46a8e2c8b8a796738d7b657cbb146d Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 14 Feb 2019 16:20:25 +0000 Subject: [PATCH 040/298] keys: Fix dependency loop between construction record and auth key In the request_key() upcall mechanism there's a dependency loop by which if a key type driver overrides the ->request_key hook and the userspace side manages to lose the authorisation key, the auth key and the internal construction record (struct key_construction) can keep each other pinned. Fix this by the following changes: (1) Killing off the construction record and using the auth key instead. (2) Including the operation name in the auth key payload and making the payload available outside of security/keys/. (3) The ->request_key hook is given the authkey instead of the cons record and operation name. Changes (2) and (3) allow the auth key to naturally be cleaned up if the keyring it is in is destroyed or cleared or the auth key is unlinked. Fixes: 7ee02a316600 ("keys: Fix dependency loop between construction record and auth key") Signed-off-by: David Howells Signed-off-by: James Morris --- fs/nfs/nfs4idmap.c | 31 ++++++------ include/keys/request_key_auth-type.h | 36 ++++++++++++++ include/linux/key-type.h | 22 +++------ security/keys/internal.h | 13 +---- security/keys/keyctl.c | 1 + security/keys/process_keys.c | 1 + security/keys/request_key.c | 72 +++++++++++----------------- security/keys/request_key_auth.c | 16 ++++--- 8 files changed, 100 insertions(+), 92 deletions(-) create mode 100644 include/keys/request_key_auth-type.h diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c index 3f23b6840547..bf34ddaa2ad7 100644 --- a/fs/nfs/nfs4idmap.c +++ b/fs/nfs/nfs4idmap.c @@ -44,6 +44,7 @@ #include #include #include +#include #include #include "internal.h" @@ -59,7 +60,7 @@ static struct key_type key_type_id_resolver_legacy; struct idmap_legacy_upcalldata { struct rpc_pipe_msg pipe_msg; struct idmap_msg idmap_msg; - struct key_construction *key_cons; + struct key *authkey; struct idmap *idmap; }; @@ -384,7 +385,7 @@ static const match_table_t nfs_idmap_tokens = { { Opt_find_err, NULL } }; -static int nfs_idmap_legacy_upcall(struct key_construction *, const char *, void *); +static int nfs_idmap_legacy_upcall(struct key *, void *); static ssize_t idmap_pipe_downcall(struct file *, const char __user *, size_t); static void idmap_release_pipe(struct inode *); @@ -549,11 +550,12 @@ nfs_idmap_prepare_pipe_upcall(struct idmap *idmap, static void nfs_idmap_complete_pipe_upcall_locked(struct idmap *idmap, int ret) { - struct key_construction *cons = idmap->idmap_upcall_data->key_cons; + struct key *authkey = idmap->idmap_upcall_data->authkey; kfree(idmap->idmap_upcall_data); idmap->idmap_upcall_data = NULL; - complete_request_key(cons, ret); + complete_request_key(authkey, ret); + key_put(authkey); } static void @@ -563,15 +565,14 @@ nfs_idmap_abort_pipe_upcall(struct idmap *idmap, int ret) nfs_idmap_complete_pipe_upcall_locked(idmap, ret); } -static int nfs_idmap_legacy_upcall(struct key_construction *cons, - const char *op, - void *aux) +static int nfs_idmap_legacy_upcall(struct key *authkey, void *aux) { struct idmap_legacy_upcalldata *data; + struct request_key_auth *rka = get_request_key_auth(authkey); struct rpc_pipe_msg *msg; struct idmap_msg *im; struct idmap *idmap = (struct idmap *)aux; - struct key *key = cons->key; + struct key *key = rka->target_key; int ret = -ENOKEY; if (!aux) @@ -586,7 +587,7 @@ static int nfs_idmap_legacy_upcall(struct key_construction *cons, msg = &data->pipe_msg; im = &data->idmap_msg; data->idmap = idmap; - data->key_cons = cons; + data->authkey = key_get(authkey); ret = nfs_idmap_prepare_message(key->description, idmap, im, msg); if (ret < 0) @@ -604,7 +605,7 @@ static int nfs_idmap_legacy_upcall(struct key_construction *cons, out2: kfree(data); out1: - complete_request_key(cons, ret); + complete_request_key(authkey, ret); return ret; } @@ -651,9 +652,10 @@ static int nfs_idmap_read_and_verify_message(struct idmap_msg *im, static ssize_t idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) { + struct request_key_auth *rka; struct rpc_inode *rpci = RPC_I(file_inode(filp)); struct idmap *idmap = (struct idmap *)rpci->private; - struct key_construction *cons; + struct key *authkey; struct idmap_msg im; size_t namelen_in; int ret = -ENOKEY; @@ -665,7 +667,8 @@ idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) if (idmap->idmap_upcall_data == NULL) goto out_noupcall; - cons = idmap->idmap_upcall_data->key_cons; + authkey = idmap->idmap_upcall_data->authkey; + rka = get_request_key_auth(authkey); if (mlen != sizeof(im)) { ret = -ENOSPC; @@ -690,9 +693,9 @@ idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) ret = nfs_idmap_read_and_verify_message(&im, &idmap->idmap_upcall_data->idmap_msg, - cons->key, cons->authkey); + rka->target_key, authkey); if (ret >= 0) { - key_set_timeout(cons->key, nfs_idmap_cache_timeout); + key_set_timeout(rka->target_key, nfs_idmap_cache_timeout); ret = mlen; } diff --git a/include/keys/request_key_auth-type.h b/include/keys/request_key_auth-type.h new file mode 100644 index 000000000000..a726dd3f1dc6 --- /dev/null +++ b/include/keys/request_key_auth-type.h @@ -0,0 +1,36 @@ +/* request_key authorisation token key type + * + * Copyright (C) 2005 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#ifndef _KEYS_REQUEST_KEY_AUTH_TYPE_H +#define _KEYS_REQUEST_KEY_AUTH_TYPE_H + +#include + +/* + * Authorisation record for request_key(). + */ +struct request_key_auth { + struct key *target_key; + struct key *dest_keyring; + const struct cred *cred; + void *callout_info; + size_t callout_len; + pid_t pid; + char op[8]; +} __randomize_layout; + +static inline struct request_key_auth *get_request_key_auth(const struct key *key) +{ + return key->payload.data[0]; +} + + +#endif /* _KEYS_REQUEST_KEY_AUTH_TYPE_H */ diff --git a/include/linux/key-type.h b/include/linux/key-type.h index bc9af551fc83..e49d1de0614e 100644 --- a/include/linux/key-type.h +++ b/include/linux/key-type.h @@ -20,15 +20,6 @@ struct kernel_pkey_query; struct kernel_pkey_params; -/* - * key under-construction record - * - passed to the request_key actor if supplied - */ -struct key_construction { - struct key *key; /* key being constructed */ - struct key *authkey;/* authorisation for key being constructed */ -}; - /* * Pre-parsed payload, used by key add, update and instantiate. * @@ -50,8 +41,7 @@ struct key_preparsed_payload { time64_t expiry; /* Expiry time of key */ } __randomize_layout; -typedef int (*request_key_actor_t)(struct key_construction *key, - const char *op, void *aux); +typedef int (*request_key_actor_t)(struct key *auth_key, void *aux); /* * Preparsed matching criterion. @@ -181,20 +171,20 @@ extern int key_instantiate_and_link(struct key *key, const void *data, size_t datalen, struct key *keyring, - struct key *instkey); + struct key *authkey); extern int key_reject_and_link(struct key *key, unsigned timeout, unsigned error, struct key *keyring, - struct key *instkey); -extern void complete_request_key(struct key_construction *cons, int error); + struct key *authkey); +extern void complete_request_key(struct key *authkey, int error); static inline int key_negate_and_link(struct key *key, unsigned timeout, struct key *keyring, - struct key *instkey) + struct key *authkey) { - return key_reject_and_link(key, timeout, ENOKEY, keyring, instkey); + return key_reject_and_link(key, timeout, ENOKEY, keyring, authkey); } extern int generic_key_instantiate(struct key *key, struct key_preparsed_payload *prep); diff --git a/security/keys/internal.h b/security/keys/internal.h index 479909b858c7..8f533c81aa8d 100644 --- a/security/keys/internal.h +++ b/security/keys/internal.h @@ -186,20 +186,9 @@ static inline int key_permission(const key_ref_t key_ref, unsigned perm) return key_task_permission(key_ref, current_cred(), perm); } -/* - * Authorisation record for request_key(). - */ -struct request_key_auth { - struct key *target_key; - struct key *dest_keyring; - const struct cred *cred; - void *callout_info; - size_t callout_len; - pid_t pid; -} __randomize_layout; - extern struct key_type key_type_request_key_auth; extern struct key *request_key_auth_new(struct key *target, + const char *op, const void *callout_info, size_t callout_len, struct key *dest_keyring); diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c index e8093d025966..7bbe03593e58 100644 --- a/security/keys/keyctl.c +++ b/security/keys/keyctl.c @@ -25,6 +25,7 @@ #include #include #include +#include #include "internal.h" #define KEY_MAX_DESC_SIZE 4096 diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c index 02c77e928f68..0e0b9ccad2f8 100644 --- a/security/keys/process_keys.c +++ b/security/keys/process_keys.c @@ -19,6 +19,7 @@ #include #include #include +#include #include "internal.h" /* Session keyring create vs join semaphore */ diff --git a/security/keys/request_key.c b/security/keys/request_key.c index 301f0e300dbd..3f56a312dd35 100644 --- a/security/keys/request_key.c +++ b/security/keys/request_key.c @@ -18,31 +18,30 @@ #include #include #include "internal.h" +#include #define key_negative_timeout 60 /* default timeout on a negative key's existence */ /** * complete_request_key - Complete the construction of a key. - * @cons: The key construction record. + * @auth_key: The authorisation key. * @error: The success or failute of the construction. * * Complete the attempt to construct a key. The key will be negated * if an error is indicated. The authorisation key will be revoked * unconditionally. */ -void complete_request_key(struct key_construction *cons, int error) +void complete_request_key(struct key *authkey, int error) { - kenter("{%d,%d},%d", cons->key->serial, cons->authkey->serial, error); + struct request_key_auth *rka = get_request_key_auth(authkey); + struct key *key = rka->target_key; + + kenter("%d{%d},%d", authkey->serial, key->serial, error); if (error < 0) - key_negate_and_link(cons->key, key_negative_timeout, NULL, - cons->authkey); + key_negate_and_link(key, key_negative_timeout, NULL, authkey); else - key_revoke(cons->authkey); - - key_put(cons->key); - key_put(cons->authkey); - kfree(cons); + key_revoke(authkey); } EXPORT_SYMBOL(complete_request_key); @@ -91,21 +90,19 @@ static int call_usermodehelper_keys(const char *path, char **argv, char **envp, * Request userspace finish the construction of a key * - execute "/sbin/request-key " */ -static int call_sbin_request_key(struct key_construction *cons, - const char *op, - void *aux) +static int call_sbin_request_key(struct key *authkey, void *aux) { static char const request_key[] = "/sbin/request-key"; + struct request_key_auth *rka = get_request_key_auth(authkey); const struct cred *cred = current_cred(); key_serial_t prkey, sskey; - struct key *key = cons->key, *authkey = cons->authkey, *keyring, - *session; + struct key *key = rka->target_key, *keyring, *session; char *argv[9], *envp[3], uid_str[12], gid_str[12]; char key_str[12], keyring_str[3][12]; char desc[20]; int ret, i; - kenter("{%d},{%d},%s", key->serial, authkey->serial, op); + kenter("{%d},{%d},%s", key->serial, authkey->serial, rka->op); ret = install_user_keyrings(); if (ret < 0) @@ -163,7 +160,7 @@ static int call_sbin_request_key(struct key_construction *cons, /* set up the argument list */ i = 0; argv[i++] = (char *)request_key; - argv[i++] = (char *) op; + argv[i++] = (char *)rka->op; argv[i++] = key_str; argv[i++] = uid_str; argv[i++] = gid_str; @@ -191,7 +188,7 @@ static int call_sbin_request_key(struct key_construction *cons, key_put(keyring); error_alloc: - complete_request_key(cons, ret); + complete_request_key(authkey, ret); kleave(" = %d", ret); return ret; } @@ -205,42 +202,31 @@ static int construct_key(struct key *key, const void *callout_info, size_t callout_len, void *aux, struct key *dest_keyring) { - struct key_construction *cons; request_key_actor_t actor; struct key *authkey; int ret; kenter("%d,%p,%zu,%p", key->serial, callout_info, callout_len, aux); - cons = kmalloc(sizeof(*cons), GFP_KERNEL); - if (!cons) - return -ENOMEM; - /* allocate an authorisation key */ - authkey = request_key_auth_new(key, callout_info, callout_len, + authkey = request_key_auth_new(key, "create", callout_info, callout_len, dest_keyring); - if (IS_ERR(authkey)) { - kfree(cons); - ret = PTR_ERR(authkey); - authkey = NULL; - } else { - cons->authkey = key_get(authkey); - cons->key = key_get(key); + if (IS_ERR(authkey)) + return PTR_ERR(authkey); - /* make the call */ - actor = call_sbin_request_key; - if (key->type->request_key) - actor = key->type->request_key; + /* Make the call */ + actor = call_sbin_request_key; + if (key->type->request_key) + actor = key->type->request_key; - ret = actor(cons, "create", aux); + ret = actor(authkey, aux); - /* check that the actor called complete_request_key() prior to - * returning an error */ - WARN_ON(ret < 0 && - !test_bit(KEY_FLAG_REVOKED, &authkey->flags)); - key_put(authkey); - } + /* check that the actor called complete_request_key() prior to + * returning an error */ + WARN_ON(ret < 0 && + !test_bit(KEY_FLAG_REVOKED, &authkey->flags)); + key_put(authkey); kleave(" = %d", ret); return ret; } @@ -275,7 +261,7 @@ static int construct_get_dest_keyring(struct key **_dest_keyring) if (cred->request_key_auth) { authkey = cred->request_key_auth; down_read(&authkey->sem); - rka = authkey->payload.data[0]; + rka = get_request_key_auth(authkey); if (!test_bit(KEY_FLAG_REVOKED, &authkey->flags)) dest_keyring = diff --git a/security/keys/request_key_auth.c b/security/keys/request_key_auth.c index 87ea2f54dedc..afc304e8b61e 100644 --- a/security/keys/request_key_auth.c +++ b/security/keys/request_key_auth.c @@ -17,7 +17,7 @@ #include #include #include "internal.h" -#include +#include static int request_key_auth_preparse(struct key_preparsed_payload *); static void request_key_auth_free_preparse(struct key_preparsed_payload *); @@ -68,7 +68,7 @@ static int request_key_auth_instantiate(struct key *key, static void request_key_auth_describe(const struct key *key, struct seq_file *m) { - struct request_key_auth *rka = key->payload.data[0]; + struct request_key_auth *rka = get_request_key_auth(key); seq_puts(m, "key:"); seq_puts(m, key->description); @@ -83,7 +83,7 @@ static void request_key_auth_describe(const struct key *key, static long request_key_auth_read(const struct key *key, char __user *buffer, size_t buflen) { - struct request_key_auth *rka = key->payload.data[0]; + struct request_key_auth *rka = get_request_key_auth(key); size_t datalen; long ret; @@ -109,7 +109,7 @@ static long request_key_auth_read(const struct key *key, */ static void request_key_auth_revoke(struct key *key) { - struct request_key_auth *rka = key->payload.data[0]; + struct request_key_auth *rka = get_request_key_auth(key); kenter("{%d}", key->serial); @@ -136,7 +136,7 @@ static void free_request_key_auth(struct request_key_auth *rka) */ static void request_key_auth_destroy(struct key *key) { - struct request_key_auth *rka = key->payload.data[0]; + struct request_key_auth *rka = get_request_key_auth(key); kenter("{%d}", key->serial); @@ -147,8 +147,9 @@ static void request_key_auth_destroy(struct key *key) * Create an authorisation token for /sbin/request-key or whoever to gain * access to the caller's security data. */ -struct key *request_key_auth_new(struct key *target, const void *callout_info, - size_t callout_len, struct key *dest_keyring) +struct key *request_key_auth_new(struct key *target, const char *op, + const void *callout_info, size_t callout_len, + struct key *dest_keyring) { struct request_key_auth *rka, *irka; const struct cred *cred = current->cred; @@ -166,6 +167,7 @@ struct key *request_key_auth_new(struct key *target, const void *callout_info, if (!rka->callout_info) goto error_free_rka; rka->callout_len = callout_len; + strlcpy(rka->op, op, sizeof(rka->op)); /* see if the calling process is already servicing the key request of * another process */ From 7c1857bdbdf1e4c541e45eab477ee23ed4333ea4 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 14 Feb 2019 16:20:37 +0000 Subject: [PATCH 041/298] keys: Timestamp new keys Set the timestamp on new keys rather than leaving it unset. Fixes: 31d5a79d7f3d ("KEYS: Do LRU discard in full keyrings") Signed-off-by: David Howells Signed-off-by: James Morris --- security/keys/key.c | 1 + 1 file changed, 1 insertion(+) diff --git a/security/keys/key.c b/security/keys/key.c index 0ec9322af4f9..696f1c092c50 100644 --- a/security/keys/key.c +++ b/security/keys/key.c @@ -297,6 +297,7 @@ struct key *key_alloc(struct key_type *type, const char *desc, key->gid = gid; key->perm = perm; key->restrict_link = restrict_link; + key->last_used_at = ktime_get_real_seconds(); if (!(flags & KEY_ALLOC_NOT_IN_QUOTA)) key->flags |= 1 << KEY_FLAG_IN_QUOTA; From 13443154f6cac61d148471ede6d7f1f6b5ea946a Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Fri, 15 Feb 2019 20:14:15 +0000 Subject: [PATCH 042/298] MIPS: eBPF: Always return sign extended 32b values The function prototype used to call JITed eBPF code (ie. the type of the struct bpf_prog bpf_func field) returns an unsigned int. The MIPS n64 ABI that MIPS64 kernels target defines that 32 bit integers should always be sign extended when passed in registers as either arguments or return values. This means that when returning any value which may not already be sign extended (ie. of type REG_64BIT or REG_32BIT_ZERO_EX) we need to perform that sign extension in order to comply with the n64 ABI. Without this we see strange looking test failures from test_bpf.ko, such as: test_bpf: #65 ALU64_MOV_X: dst = 4294967295 jited:1 ret -1 != -1 FAIL (1 times) Although the return value printed matches the expected value, this is only because printf is only examining the least significant 32 bits of the 64 bit register value we returned. The register holding the expected value is sign extended whilst the v0 register was set to a zero extended value by our JITed code, so when compared by a conditional branch instruction the values are not equal. We already handle this when the return value register is of type REG_32BIT_ZERO_EX, so simply extend this to also cover REG_64BIT. Signed-off-by: Paul Burton Fixes: b6bd53f9c4e8 ("MIPS: Add missing file for eBPF JIT.") Cc: stable@vger.kernel.org # v4.13+ Signed-off-by: Daniel Borkmann --- arch/mips/net/ebpf_jit.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/mips/net/ebpf_jit.c b/arch/mips/net/ebpf_jit.c index b16710a8a9e7..715415fa2345 100644 --- a/arch/mips/net/ebpf_jit.c +++ b/arch/mips/net/ebpf_jit.c @@ -343,12 +343,15 @@ static int build_int_epilogue(struct jit_ctx *ctx, int dest_reg) const struct bpf_prog *prog = ctx->skf; int stack_adjust = ctx->stack_size; int store_offset = stack_adjust - 8; + enum reg_val_type td; int r0 = MIPS_R_V0; - if (dest_reg == MIPS_R_RA && - get_reg_val_type(ctx, prog->len, BPF_REG_0) == REG_32BIT_ZERO_EX) + if (dest_reg == MIPS_R_RA) { /* Don't let zero extended value escape. */ - emit_instr(ctx, sll, r0, r0, 0); + td = get_reg_val_type(ctx, prog->len, BPF_REG_0); + if (td == REG_64BIT || td == REG_32BIT_ZERO_EX) + emit_instr(ctx, sll, r0, r0, 0); + } if (ctx->flags & EBPF_SAVE_RA) { emit_instr(ctx, ld, MIPS_R_RA, store_offset, MIPS_R_SP); From 1910faebf61d85a5b7138c0c1c600672e41f82a3 Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Fri, 15 Feb 2019 20:14:16 +0000 Subject: [PATCH 043/298] MIPS: eBPF: Remove REG_32BIT_ZERO_EX REG_32BIT_ZERO_EX and REG_64BIT are always handled in exactly the same way, and reg_val_propagate_range() never actually sets any register to type REG_32BIT_ZERO_EX. Remove the redundant & unused REG_32BIT_ZERO_EX. Signed-off-by: Paul Burton Signed-off-by: Daniel Borkmann --- arch/mips/net/ebpf_jit.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/arch/mips/net/ebpf_jit.c b/arch/mips/net/ebpf_jit.c index 715415fa2345..76e9bf88d3b9 100644 --- a/arch/mips/net/ebpf_jit.c +++ b/arch/mips/net/ebpf_jit.c @@ -79,8 +79,6 @@ enum reg_val_type { REG_64BIT_32BIT, /* 32-bit compatible, need truncation for 64-bit ops. */ REG_32BIT, - /* 32-bit zero extended. */ - REG_32BIT_ZERO_EX, /* 32-bit no sign/zero extension needed. */ REG_32BIT_POS }; @@ -349,7 +347,7 @@ static int build_int_epilogue(struct jit_ctx *ctx, int dest_reg) if (dest_reg == MIPS_R_RA) { /* Don't let zero extended value escape. */ td = get_reg_val_type(ctx, prog->len, BPF_REG_0); - if (td == REG_64BIT || td == REG_32BIT_ZERO_EX) + if (td == REG_64BIT) emit_instr(ctx, sll, r0, r0, 0); } @@ -695,7 +693,7 @@ static int build_one_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, if (dst < 0) return dst; td = get_reg_val_type(ctx, this_idx, insn->dst_reg); - if (td == REG_64BIT || td == REG_32BIT_ZERO_EX) { + if (td == REG_64BIT) { /* sign extend */ emit_instr(ctx, sll, dst, dst, 0); } @@ -710,7 +708,7 @@ static int build_one_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, if (dst < 0) return dst; td = get_reg_val_type(ctx, this_idx, insn->dst_reg); - if (td == REG_64BIT || td == REG_32BIT_ZERO_EX) { + if (td == REG_64BIT) { /* sign extend */ emit_instr(ctx, sll, dst, dst, 0); } @@ -724,7 +722,7 @@ static int build_one_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, if (dst < 0) return dst; td = get_reg_val_type(ctx, this_idx, insn->dst_reg); - if (td == REG_64BIT || td == REG_32BIT_ZERO_EX) + if (td == REG_64BIT) /* sign extend */ emit_instr(ctx, sll, dst, dst, 0); if (insn->imm == 1) { @@ -863,13 +861,13 @@ static int build_one_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, if (src < 0 || dst < 0) return -EINVAL; td = get_reg_val_type(ctx, this_idx, insn->dst_reg); - if (td == REG_64BIT || td == REG_32BIT_ZERO_EX) { + if (td == REG_64BIT) { /* sign extend */ emit_instr(ctx, sll, dst, dst, 0); } did_move = false; ts = get_reg_val_type(ctx, this_idx, insn->src_reg); - if (ts == REG_64BIT || ts == REG_32BIT_ZERO_EX) { + if (ts == REG_64BIT) { int tmp_reg = MIPS_R_AT; if (bpf_op == BPF_MOV) { @@ -1257,8 +1255,7 @@ static int build_one_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, if (insn->imm == 64 && td == REG_32BIT) emit_instr(ctx, dinsu, dst, MIPS_R_ZERO, 32, 32); - if (insn->imm != 64 && - (td == REG_64BIT || td == REG_32BIT_ZERO_EX)) { + if (insn->imm != 64 && td == REG_64BIT) { /* sign extend */ emit_instr(ctx, sll, dst, dst, 0); } From 79edd00dc6a96644d76b4a1cb97d94d49e026768 Mon Sep 17 00:00:00 2001 From: Anoob Soman Date: Wed, 13 Feb 2019 13:21:39 +0800 Subject: [PATCH 044/298] scsi: libiscsi: Fix race between iscsi_xmit_task and iscsi_complete_task When a target sends Check Condition, whilst initiator is busy xmiting re-queued data, could lead to race between iscsi_complete_task() and iscsi_xmit_task() and eventually crashing with the following kernel backtrace. [3326150.987523] ALERT: BUG: unable to handle kernel NULL pointer dereference at 0000000000000078 [3326150.987549] ALERT: IP: [] iscsi_xmit_task+0x2d/0xc0 [libiscsi] [3326150.987571] WARN: PGD 569c8067 PUD 569c9067 PMD 0 [3326150.987582] WARN: Oops: 0002 [#1] SMP [3326150.987593] WARN: Modules linked in: tun nfsv3 nfs fscache dm_round_robin [3326150.987762] WARN: CPU: 2 PID: 8399 Comm: kworker/u32:1 Tainted: G O 4.4.0+2 #1 [3326150.987774] WARN: Hardware name: Dell Inc. PowerEdge R720/0W7JN5, BIOS 2.5.4 01/22/2016 [3326150.987790] WARN: Workqueue: iscsi_q_13 iscsi_xmitworker [libiscsi] [3326150.987799] WARN: task: ffff8801d50f3800 ti: ffff8801f5458000 task.ti: ffff8801f5458000 [3326150.987810] WARN: RIP: e030:[] [] iscsi_xmit_task+0x2d/0xc0 [libiscsi] [3326150.987825] WARN: RSP: e02b:ffff8801f545bdb0 EFLAGS: 00010246 [3326150.987831] WARN: RAX: 00000000ffffffc3 RBX: ffff880282d2ab20 RCX: ffff88026b6ac480 [3326150.987842] WARN: RDX: 0000000000000000 RSI: 00000000fffffe01 RDI: ffff880282d2ab20 [3326150.987852] WARN: RBP: ffff8801f545bdc8 R08: 0000000000000000 R09: 0000000000000008 [3326150.987862] WARN: R10: 0000000000000000 R11: 000000000000fe88 R12: 0000000000000000 [3326150.987872] WARN: R13: ffff880282d2abe8 R14: ffff880282d2abd8 R15: ffff880282d2ac08 [3326150.987890] WARN: FS: 00007f5a866b4840(0000) GS:ffff88028a640000(0000) knlGS:0000000000000000 [3326150.987900] WARN: CS: e033 DS: 0000 ES: 0000 CR0: 0000000080050033 [3326150.987907] WARN: CR2: 0000000000000078 CR3: 0000000070244000 CR4: 0000000000042660 [3326150.987918] WARN: Stack: [3326150.987924] WARN: ffff880282d2ad58 ffff880282d2ab20 ffff880282d2abe8 ffff8801f545be18 [3326150.987938] WARN: ffffffffa05cea90 ffff880282d2abf8 ffff88026b59cc80 ffff88026b59cc00 [3326150.987951] WARN: ffff88022acf32c0 ffff880289491800 ffff880255a80800 0000000000000400 [3326150.987964] WARN: Call Trace: [3326150.987975] WARN: [] iscsi_xmitworker+0x2f0/0x360 [libiscsi] [3326150.987988] WARN: [] process_one_work+0x1fc/0x3b0 [3326150.987997] WARN: [] worker_thread+0x2a5/0x470 [3326150.988006] WARN: [] ? __schedule+0x648/0x870 [3326150.988015] WARN: [] ? rescuer_thread+0x300/0x300 [3326150.988023] WARN: [] kthread+0xd5/0xe0 [3326150.988031] WARN: [] ? kthread_stop+0x110/0x110 [3326150.988040] WARN: [] ret_from_fork+0x3f/0x70 [3326150.988048] WARN: [] ? kthread_stop+0x110/0x110 [3326150.988127] ALERT: RIP [] iscsi_xmit_task+0x2d/0xc0 [libiscsi] [3326150.988138] WARN: RSP [3326150.988144] WARN: CR2: 0000000000000078 [3326151.020366] WARN: ---[ end trace 1c60974d4678d81b ]--- Commit 6f8830f5bbab ("scsi: libiscsi: add lock around task lists to fix list corruption regression") introduced "taskqueuelock" to fix list corruption during the race, but this wasn't enough. Re-setting of conn->task to NULL, could race with iscsi_xmit_task(). iscsi_complete_task() { .... if (conn->task == task) conn->task = NULL; } conn->task in iscsi_xmit_task() could be NULL and so will be task. __iscsi_get_task(task) will crash (NullPtr de-ref), trying to access refcount. iscsi_xmit_task() { struct iscsi_task *task = conn->task; __iscsi_get_task(task); } This commit will take extra conn->session->back_lock in iscsi_xmit_task() to ensure iscsi_xmit_task() waits for iscsi_complete_task(), if iscsi_complete_task() wins the race. If iscsi_xmit_task() wins the race, iscsi_xmit_task() increments task->refcount (__iscsi_get_task) ensuring iscsi_complete_task() will not iscsi_free_task(). Signed-off-by: Anoob Soman Signed-off-by: Bob Liu Acked-by: Lee Duncan Signed-off-by: Martin K. Petersen --- drivers/scsi/libiscsi.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/scsi/libiscsi.c b/drivers/scsi/libiscsi.c index b8d325ce8754..120fc520f27a 100644 --- a/drivers/scsi/libiscsi.c +++ b/drivers/scsi/libiscsi.c @@ -1459,7 +1459,13 @@ static int iscsi_xmit_task(struct iscsi_conn *conn) if (test_bit(ISCSI_SUSPEND_BIT, &conn->suspend_tx)) return -ENODATA; + spin_lock_bh(&conn->session->back_lock); + if (conn->task == NULL) { + spin_unlock_bh(&conn->session->back_lock); + return -ENODATA; + } __iscsi_get_task(task); + spin_unlock_bh(&conn->session->back_lock); spin_unlock_bh(&conn->session->frwd_lock); rc = conn->session->tt->xmit_task(task); spin_lock_bh(&conn->session->frwd_lock); From 515ce60613128be7a176a8b82b20c7624f3b440d Mon Sep 17 00:00:00 2001 From: Masato Suzuki Date: Thu, 14 Feb 2019 15:01:18 +0900 Subject: [PATCH 045/298] scsi: sd_zbc: Fix sd_zbc_report_zones() buffer allocation The function sd_zbc_do_report_zones() issues a REPORT ZONES command with a buffer size calculated based on the number of zones requested by the caller. This value should however not exceed the capabilities of the hardware maximum command size, that is, should not exceed the max_hw_sectors limit of the device. This problem leads to failures of report zones commands when re-validating disks with some SAS HBAs. Fix this by limiting a report zone command buffer size to the minimum of the device max_hw_sectors and calculated value based on the requested number of zones. This does not change the semantic of the report_zones file operation as report zones can always return less zone reports than requested. Short reports are handled using a loop execution of the report_zones file operation in the function blk_report_zones(). [Damien] Before patch 'e76239a3748c ("block: add a report_zones method")', report zones buffer allocation was limited to max_sectors when allocated in blk_report_zones(). This however does not consider the actual format of the device reply which is interface dependent. Limiting the allocation based on the size of the expected reply format rather than the size of the array of generic sturct blkzone passed by blk_report_zones() makes more sense. Fixes: e76239a3748c ("block: add a report_zones method") Cc: stable@vger.kernel.org Signed-off-by: Masato Suzuki Signed-off-by: Damien Le Moal Signed-off-by: Martin K. Petersen --- drivers/scsi/sd_zbc.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c index fff86940388b..a340af797a85 100644 --- a/drivers/scsi/sd_zbc.c +++ b/drivers/scsi/sd_zbc.c @@ -142,10 +142,12 @@ int sd_zbc_report_zones(struct gendisk *disk, sector_t sector, return -EOPNOTSUPP; /* - * Get a reply buffer for the number of requested zones plus a header. - * For ATA, buffers must be aligned to 512B. + * Get a reply buffer for the number of requested zones plus a header, + * without exceeding the device maximum command size. For ATA disks, + * buffers must be aligned to 512B. */ - buflen = roundup((nrz + 1) * 64, 512); + buflen = min(queue_max_hw_sectors(disk->queue) << 9, + roundup((nrz + 1) * 64, 512)); buf = kmalloc(buflen, gfp_mask); if (!buf) return -ENOMEM; From ffeafdd2bf0b280d67ec1a47ea6287910d271f3f Mon Sep 17 00:00:00 2001 From: John Garry Date: Fri, 15 Feb 2019 00:37:57 +0800 Subject: [PATCH 046/298] scsi: libsas: Fix rphy phy_identifier for PHYs with end devices attached The sysfs phy_identifier attribute for a sas_end_device comes from the rphy phy_identifier value. Currently this is not being set for rphys with an end device attached, so we see incorrect symlinks from systemd disk/by-path: root@localhost:~# ls -l /dev/disk/by-path/ total 0 lrwxrwxrwx 1 root root 9 Feb 13 12:26 platform-HISI0162:01-sas-exp0x500e004aaaaaaa1f-phy0-lun-0 -> ../../sdb lrwxrwxrwx 1 root root 10 Feb 13 12:26 platform-HISI0162:01-sas-exp0x500e004aaaaaaa1f-phy0-lun-0-part1 -> ../../sdb1 lrwxrwxrwx 1 root root 10 Feb 13 12:26 platform-HISI0162:01-sas-exp0x500e004aaaaaaa1f-phy0-lun-0-part2 -> ../../sdb2 lrwxrwxrwx 1 root root 10 Feb 13 12:26 platform-HISI0162:01-sas-exp0x500e004aaaaaaa1f-phy0-lun-0-part3 -> ../../sdc3 Indeed, each sas_end_device phy_identifier value is 0: root@localhost:/# more sys/class/sas_device/end_device-0\:0\:2/phy_identifier 0 root@localhost:/# more sys/class/sas_device/end_device-0\:0\:10/phy_identifier 0 This patch fixes the discovery code to set the phy_identifier. With this, we now get proper symlinks: root@localhost:~# ls -l /dev/disk/by-path/ total 0 lrwxrwxrwx 1 root root 9 Feb 13 11:53 platform-HISI0162:01-sas-exp0x500e004aaaaaaa1f-phy10-lun-0 -> ../../sdg lrwxrwxrwx 1 root root 9 Feb 13 11:53 platform-HISI0162:01-sas-exp0x500e004aaaaaaa1f-phy11-lun-0 -> ../../sdh lrwxrwxrwx 1 root root 9 Feb 13 11:53 platform-HISI0162:01-sas-exp0x500e004aaaaaaa1f-phy2-lun-0 -> ../../sda lrwxrwxrwx 1 root root 10 Feb 13 11:53 platform-HISI0162:01-sas-exp0x500e004aaaaaaa1f-phy2-lun-0-part1 -> ../../sda1 lrwxrwxrwx 1 root root 9 Feb 13 11:53 platform-HISI0162:01-sas-exp0x500e004aaaaaaa1f-phy3-lun-0 -> ../../sdb lrwxrwxrwx 1 root root 10 Feb 13 11:53 platform-HISI0162:01-sas-exp0x500e004aaaaaaa1f-phy3-lun-0-part1 -> ../../sdb1 lrwxrwxrwx 1 root root 10 Feb 13 11:53 platform-HISI0162:01-sas-exp0x500e004aaaaaaa1f-phy3-lun-0-part2 -> ../../sdb2 lrwxrwxrwx 1 root root 9 Feb 13 11:53 platform-HISI0162:01-sas-exp0x500e004aaaaaaa1f-phy4-lun-0 -> ../../sdc lrwxrwxrwx 1 root root 10 Feb 13 11:53 platform-HISI0162:01-sas-exp0x500e004aaaaaaa1f-phy4-lun-0-part1 -> ../../sdc1 lrwxrwxrwx 1 root root 10 Feb 13 11:53 platform-HISI0162:01-sas-exp0x500e004aaaaaaa1f-phy4-lun-0-part2 -> ../../sdc2 lrwxrwxrwx 1 root root 10 Feb 13 11:53 platform-HISI0162:01-sas-exp0x500e004aaaaaaa1f-phy4-lun-0-part3 -> ../../sdc3 lrwxrwxrwx 1 root root 9 Feb 13 11:53 platform-HISI0162:01-sas-exp0x500e004aaaaaaa1f-phy5-lun-0 -> ../../sdd lrwxrwxrwx 1 root root 9 Feb 13 11:53 platform-HISI0162:01-sas-exp0x500e004aaaaaaa1f-phy7-lun-0 -> ../../sde lrwxrwxrwx 1 root root 10 Feb 13 11:53 platform-HISI0162:01-sas-exp0x500e004aaaaaaa1f-phy7-lun-0-part1 -> ../../sde1 lrwxrwxrwx 1 root root 10 Feb 13 11:53 platform-HISI0162:01-sas-exp0x500e004aaaaaaa1f-phy7-lun-0-part2 -> ../../sde2 lrwxrwxrwx 1 root root 10 Feb 13 11:53 platform-HISI0162:01-sas-exp0x500e004aaaaaaa1f-phy7-lun-0-part3 -> ../../sde3 lrwxrwxrwx 1 root root 9 Feb 13 11:53 platform-HISI0162:01-sas-exp0x500e004aaaaaaa1f-phy8-lun-0 -> ../../sdf lrwxrwxrwx 1 root root 10 Feb 13 11:53 platform-HISI0162:01-sas-exp0x500e004aaaaaaa1f-phy8-lun-0-part1 -> ../../sdf1 lrwxrwxrwx 1 root root 10 Feb 13 11:53 platform-HISI0162:01-sas-exp0x500e004aaaaaaa1f-phy8-lun-0-part2 -> ../../sdf2 lrwxrwxrwx 1 root root 10 Feb 13 11:53 platform-HISI0162:01-sas-exp0x500e004aaaaaaa1f-phy8-lun-0-part3 -> ../../sdf3 Fixes: 2908d778ab3e ("[SCSI] aic94xx: new driver") Reported-by: dann frazier Signed-off-by: John Garry Reviewed-by: Jason Yan Tested-by: dann frazier Signed-off-by: Martin K. Petersen --- drivers/scsi/libsas/sas_expander.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/scsi/libsas/sas_expander.c b/drivers/scsi/libsas/sas_expander.c index 17eb4185f29d..f21c93bbb35c 100644 --- a/drivers/scsi/libsas/sas_expander.c +++ b/drivers/scsi/libsas/sas_expander.c @@ -828,6 +828,7 @@ static struct domain_device *sas_ex_discover_end_dev( rphy = sas_end_device_alloc(phy->port); if (!rphy) goto out_free; + rphy->identify.phy_identifier = phy_id; child->rphy = rphy; get_device(&rphy->dev); @@ -854,6 +855,7 @@ static struct domain_device *sas_ex_discover_end_dev( child->rphy = rphy; get_device(&rphy->dev); + rphy->identify.phy_identifier = phy_id; sas_fill_in_rphy(child, rphy); list_add_tail(&child->disco_list_node, &parent->port->disco_list); From 4a067cf823d9d8e50d41cfb618011c0d4a969c72 Mon Sep 17 00:00:00 2001 From: Martin Wilck Date: Thu, 14 Feb 2019 22:57:41 +0100 Subject: [PATCH 047/298] scsi: core: reset host byte in DID_NEXUS_FAILURE case Up to 4.12, __scsi_error_from_host_byte() would reset the host byte to DID_OK for various cases including DID_NEXUS_FAILURE. Commit 2a842acab109 ("block: introduce new block status code type") replaced this function with scsi_result_to_blk_status() and removed the host-byte resetting code for the DID_NEXUS_FAILURE case. As the line set_host_byte(cmd, DID_OK) was preserved for the other cases, I suppose this was an editing mistake. The fact that the host byte remains set after 4.13 is causing problems with the sg_persist tool, which now returns success rather then exit status 24 when a RESERVATION CONFLICT error is encountered. Fixes: 2a842acab109 "block: introduce new block status code type" Signed-off-by: Martin Wilck Reviewed-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi_lib.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 6d65ac584eba..f8d51c3d5582 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -655,6 +655,7 @@ static blk_status_t scsi_result_to_blk_status(struct scsi_cmnd *cmd, int result) set_host_byte(cmd, DID_OK); return BLK_STS_TARGET; case DID_NEXUS_FAILURE: + set_host_byte(cmd, DID_OK); return BLK_STS_NEXUS; case DID_ALLOC_FAILURE: set_host_byte(cmd, DID_OK); From 4974d5f678abb34401558559d47e2ea3d1c15cba Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Fri, 15 Feb 2019 15:10:32 +0100 Subject: [PATCH 048/298] net: ip6_gre: initialize erspan_ver just for erspan tunnels After commit c706863bc890 ("net: ip6_gre: always reports o_key to userspace"), ip6gre and ip6gretap tunnels started reporting TUNNEL_KEY output flag even if it is not configured. ip6gre_fill_info checks erspan_ver value to add TUNNEL_KEY for erspan tunnels, however in commit 84581bdae9587 ("erspan: set erspan_ver to 1 by default when adding an erspan dev") erspan_ver is initialized to 1 even for ip6gre or ip6gretap Fix the issue moving erspan_ver initialization in a dedicated routine Fixes: c706863bc890 ("net: ip6_gre: always reports o_key to userspace") Signed-off-by: Lorenzo Bianconi Reviewed-by: Greg Rose Signed-off-by: David S. Miller --- net/ipv6/ip6_gre.c | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 801a9a0c217e..43890898b0b5 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -1719,6 +1719,24 @@ static int ip6erspan_tap_validate(struct nlattr *tb[], struct nlattr *data[], return 0; } +static void ip6erspan_set_version(struct nlattr *data[], + struct __ip6_tnl_parm *parms) +{ + parms->erspan_ver = 1; + if (data[IFLA_GRE_ERSPAN_VER]) + parms->erspan_ver = nla_get_u8(data[IFLA_GRE_ERSPAN_VER]); + + if (parms->erspan_ver == 1) { + if (data[IFLA_GRE_ERSPAN_INDEX]) + parms->index = nla_get_u32(data[IFLA_GRE_ERSPAN_INDEX]); + } else if (parms->erspan_ver == 2) { + if (data[IFLA_GRE_ERSPAN_DIR]) + parms->dir = nla_get_u8(data[IFLA_GRE_ERSPAN_DIR]); + if (data[IFLA_GRE_ERSPAN_HWID]) + parms->hwid = nla_get_u16(data[IFLA_GRE_ERSPAN_HWID]); + } +} + static void ip6gre_netlink_parms(struct nlattr *data[], struct __ip6_tnl_parm *parms) { @@ -1767,20 +1785,6 @@ static void ip6gre_netlink_parms(struct nlattr *data[], if (data[IFLA_GRE_COLLECT_METADATA]) parms->collect_md = true; - - parms->erspan_ver = 1; - if (data[IFLA_GRE_ERSPAN_VER]) - parms->erspan_ver = nla_get_u8(data[IFLA_GRE_ERSPAN_VER]); - - if (parms->erspan_ver == 1) { - if (data[IFLA_GRE_ERSPAN_INDEX]) - parms->index = nla_get_u32(data[IFLA_GRE_ERSPAN_INDEX]); - } else if (parms->erspan_ver == 2) { - if (data[IFLA_GRE_ERSPAN_DIR]) - parms->dir = nla_get_u8(data[IFLA_GRE_ERSPAN_DIR]); - if (data[IFLA_GRE_ERSPAN_HWID]) - parms->hwid = nla_get_u16(data[IFLA_GRE_ERSPAN_HWID]); - } } static int ip6gre_tap_init(struct net_device *dev) @@ -2203,6 +2207,7 @@ static int ip6erspan_newlink(struct net *src_net, struct net_device *dev, int err; ip6gre_netlink_parms(data, &nt->parms); + ip6erspan_set_version(data, &nt->parms); ign = net_generic(net, ip6gre_net_id); if (nt->parms.collect_md) { @@ -2248,6 +2253,7 @@ static int ip6erspan_changelink(struct net_device *dev, struct nlattr *tb[], if (IS_ERR(t)) return PTR_ERR(t); + ip6erspan_set_version(data, &p); ip6gre_tunnel_unlink_md(ign, t); ip6gre_tunnel_unlink(ign, t); ip6erspan_tnl_change(t, &p, !tb[IFLA_MTU]); From 197f9ab7f08ce4b9ece662f747c3991b2f0fbb57 Mon Sep 17 00:00:00 2001 From: Paul Kocialkowski Date: Fri, 15 Feb 2019 17:17:08 +0100 Subject: [PATCH 049/298] net: phy: xgmiitorgmii: Support generic PHY status read Some PHY drivers like the generic one do not provide a read_status callback on their own but rely on genphy_read_status being called directly. With the current code, this results in a NULL function pointer call. Call genphy_read_status instead when there is no specific callback. Signed-off-by: Paul Kocialkowski Signed-off-by: David S. Miller --- drivers/net/phy/xilinx_gmii2rgmii.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/phy/xilinx_gmii2rgmii.c b/drivers/net/phy/xilinx_gmii2rgmii.c index 74a8782313cf..bd6084e315de 100644 --- a/drivers/net/phy/xilinx_gmii2rgmii.c +++ b/drivers/net/phy/xilinx_gmii2rgmii.c @@ -44,7 +44,10 @@ static int xgmiitorgmii_read_status(struct phy_device *phydev) u16 val = 0; int err; - err = priv->phy_drv->read_status(phydev); + if (priv->phy_drv->read_status) + err = priv->phy_drv->read_status(phydev); + else + err = genphy_read_status(phydev); if (err < 0) return err; From 3b89ea9c5902acccdbbdec307c85edd1bf52515e Mon Sep 17 00:00:00 2001 From: Hauke Mehrtens Date: Fri, 15 Feb 2019 17:58:54 +0100 Subject: [PATCH 050/298] net: Fix for_each_netdev_feature on Big endian The features attribute is of type u64 and stored in the native endianes on the system. The for_each_set_bit() macro takes a pointer to a 32 bit array and goes over the bits in this area. On little Endian systems this also works with an u64 as the most significant bit is on the highest address, but on big endian the words are swapped. When we expect bit 15 here we get bit 47 (15 + 32). This patch converts it more or less to its own for_each_set_bit() implementation which works on 64 bit integers directly. This is then completely in host endianness and should work like expected. Fixes: fd867d51f ("net/core: generic support for disabling netdev features down stack") Signed-off-by: Hauke Mehrtens Signed-off-by: David S. Miller --- include/linux/netdev_features.h | 23 +++++++++++++++++++++-- net/core/dev.c | 4 ++-- 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h index 2b2a6dce1630..fce28562bed2 100644 --- a/include/linux/netdev_features.h +++ b/include/linux/netdev_features.h @@ -11,6 +11,7 @@ #define _LINUX_NETDEV_FEATURES_H #include +#include typedef u64 netdev_features_t; @@ -154,8 +155,26 @@ enum { #define NETIF_F_HW_TLS_TX __NETIF_F(HW_TLS_TX) #define NETIF_F_HW_TLS_RX __NETIF_F(HW_TLS_RX) -#define for_each_netdev_feature(mask_addr, bit) \ - for_each_set_bit(bit, (unsigned long *)mask_addr, NETDEV_FEATURE_COUNT) +/* Finds the next feature with the highest number of the range of start till 0. + */ +static inline int find_next_netdev_feature(u64 feature, unsigned long start) +{ + /* like BITMAP_LAST_WORD_MASK() for u64 + * this sets the most significant 64 - start to 0. + */ + feature &= ~0ULL >> (-start & ((sizeof(feature) * 8) - 1)); + + return fls64(feature) - 1; +} + +/* This goes for the MSB to the LSB through the set feature bits, + * mask_addr should be a u64 and bit an int + */ +#define for_each_netdev_feature(mask_addr, bit) \ + for ((bit) = find_next_netdev_feature((mask_addr), \ + NETDEV_FEATURE_COUNT); \ + (bit) >= 0; \ + (bit) = find_next_netdev_feature((mask_addr), (bit) - 1)) /* Features valid for ethtool to change */ /* = all defined minus driver/device-class-related */ diff --git a/net/core/dev.c b/net/core/dev.c index 8e276e0192a1..5d03889502eb 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -8152,7 +8152,7 @@ static netdev_features_t netdev_sync_upper_features(struct net_device *lower, netdev_features_t feature; int feature_bit; - for_each_netdev_feature(&upper_disables, feature_bit) { + for_each_netdev_feature(upper_disables, feature_bit) { feature = __NETIF_F_BIT(feature_bit); if (!(upper->wanted_features & feature) && (features & feature)) { @@ -8172,7 +8172,7 @@ static void netdev_sync_lower_features(struct net_device *upper, netdev_features_t feature; int feature_bit; - for_each_netdev_feature(&upper_disables, feature_bit) { + for_each_netdev_feature(upper_disables, feature_bit) { feature = __NETIF_F_BIT(feature_bit); if (!(features & feature) && (lower->features & feature)) { netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n", From d5be7f632bad0f489879eed0ff4b99bd7fe0b74c Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 15 Feb 2019 12:15:47 -0500 Subject: [PATCH 051/298] net: validate untrusted gso packets without csum offload Syzkaller again found a path to a kernel crash through bad gso input. By building an excessively large packet to cause an skb field to wrap. If VIRTIO_NET_HDR_F_NEEDS_CSUM was set this would have been dropped in skb_partial_csum_set. GSO packets that do not set checksum offload are suspicious and rare. Most callers of virtio_net_hdr_to_skb already pass them to skb_probe_transport_header. Move that test forward, change it to detect parse failure and drop packets on failure as those cleary are not one of the legitimate VIRTIO_NET_HDR_GSO types. Fixes: bfd5f4a3d605 ("packet: Add GSO/csum offload support.") Fixes: f43798c27684 ("tun: Allow GSO using virtio_net_hdr") Reported-by: syzbot Signed-off-by: Willem de Bruijn Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/skbuff.h | 2 +- include/linux/virtio_net.h | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 95d25b010a25..4c1c82a5678c 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2434,7 +2434,7 @@ static inline void skb_probe_transport_header(struct sk_buff *skb, if (skb_flow_dissect_flow_keys_basic(skb, &keys, NULL, 0, 0, 0, 0)) skb_set_transport_header(skb, keys.control.thoff); - else + else if (offset_hint >= 0) skb_set_transport_header(skb, offset_hint); } diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h index cb462f9ab7dd..71f2394abbf7 100644 --- a/include/linux/virtio_net.h +++ b/include/linux/virtio_net.h @@ -57,6 +57,15 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb, if (!skb_partial_csum_set(skb, start, off)) return -EINVAL; + } else { + /* gso packets without NEEDS_CSUM do not set transport_offset. + * probe and drop if does not match one of the above types. + */ + if (gso_type) { + skb_probe_transport_header(skb, -1); + if (!skb_transport_header_was_set(skb)) + return -EINVAL; + } } if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) { From fea83353177a55540c71c140887737c282137aa2 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 15 Feb 2019 12:16:49 -0800 Subject: [PATCH 052/298] net: dsa: b53: Fix default VLAN ID We were not consistent in how the default VID of a given port was defined, b53_br_leave() would make sure the VLAN ID would be either 0/1 depending on the switch generation, but b53_configure_vlan(), which is the default configuration would unconditionally set it to 1. The correct value is 1 for 5325/5365 series and 0 otherwise. To avoid repeating that mistake ever again, introduce a helper function: b53_default_pvid() to factor that out. Fixes: 967dd82ffc52 ("net: dsa: b53: Add support for Broadcom RoboSwitch") Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/dsa/b53/b53_common.c | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index 0e4bbdcc614f..964a9ec4652a 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -632,15 +632,25 @@ static void b53_enable_mib(struct b53_device *dev) b53_write8(dev, B53_MGMT_PAGE, B53_GLOBAL_CONFIG, gc); } +static u16 b53_default_pvid(struct b53_device *dev) +{ + if (is5325(dev) || is5365(dev)) + return 1; + else + return 0; +} + int b53_configure_vlan(struct dsa_switch *ds) { struct b53_device *dev = ds->priv; struct b53_vlan vl = { 0 }; - int i; + int i, def_vid; + + def_vid = b53_default_pvid(dev); /* clear all vlan entries */ if (is5325(dev) || is5365(dev)) { - for (i = 1; i < dev->num_vlans; i++) + for (i = def_vid; i < dev->num_vlans; i++) b53_set_vlan_entry(dev, i, &vl); } else { b53_do_vlan_op(dev, VTA_CMD_CLEAR); @@ -650,7 +660,7 @@ int b53_configure_vlan(struct dsa_switch *ds) b53_for_each_port(dev, i) b53_write16(dev, B53_VLAN_PAGE, - B53_VLAN_PORT_DEF_TAG(i), 1); + B53_VLAN_PORT_DEF_TAG(i), def_vid); if (!is5325(dev) && !is5365(dev)) b53_set_jumbo(dev, dev->enable_jumbo, false); @@ -1326,12 +1336,8 @@ int b53_vlan_del(struct dsa_switch *ds, int port, vl->members &= ~BIT(port); - if (pvid == vid) { - if (is5325(dev) || is5365(dev)) - pvid = 1; - else - pvid = 0; - } + if (pvid == vid) + pvid = b53_default_pvid(dev); if (untagged && !dsa_is_cpu_port(ds, port)) vl->untag &= ~(BIT(port)); @@ -1644,10 +1650,7 @@ void b53_br_leave(struct dsa_switch *ds, int port, struct net_device *br) b53_write16(dev, B53_PVLAN_PAGE, B53_PVLAN_PORT_MASK(port), pvlan); dev->ports[port].vlan_ctl_mask = pvlan; - if (is5325(dev) || is5365(dev)) - pvid = 1; - else - pvid = 0; + pvid = b53_default_pvid(dev); /* Make this port join all VLANs without VLAN entries */ if (is58xx(dev)) { From dad8d7c6452b5b9f9828c9e2c7ca143205fd40c7 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 15 Feb 2019 12:16:50 -0800 Subject: [PATCH 053/298] net: dsa: b53: Properly account for VLAN filtering VLAN filtering can be built into the kernel, and also dynamically turned on/off through the bridge master device. Allow re-configuring the switch appropriately to account for that by deciding whether VLAN table (v_table) misses should lead to a drop or forward. Fixes: a2482d2ce349 ("net: dsa: b53: Plug in VLAN support") Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/dsa/b53/b53_common.c | 59 +++++++++++++++++++++++++++++--- drivers/net/dsa/b53/b53_priv.h | 3 ++ 2 files changed, 57 insertions(+), 5 deletions(-) diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index 964a9ec4652a..2fef4c564420 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -344,7 +344,8 @@ static void b53_set_forwarding(struct b53_device *dev, int enable) b53_write8(dev, B53_CTRL_PAGE, B53_SWITCH_CTRL, mgmt); } -static void b53_enable_vlan(struct b53_device *dev, bool enable) +static void b53_enable_vlan(struct b53_device *dev, bool enable, + bool enable_filtering) { u8 mgmt, vc0, vc1, vc4 = 0, vc5; @@ -369,8 +370,13 @@ static void b53_enable_vlan(struct b53_device *dev, bool enable) vc0 |= VC0_VLAN_EN | VC0_VID_CHK_EN | VC0_VID_HASH_VID; vc1 |= VC1_RX_MCST_UNTAG_EN | VC1_RX_MCST_FWD_EN; vc4 &= ~VC4_ING_VID_CHECK_MASK; - vc4 |= VC4_ING_VID_VIO_DROP << VC4_ING_VID_CHECK_S; - vc5 |= VC5_DROP_VTABLE_MISS; + if (enable_filtering) { + vc4 |= VC4_ING_VID_VIO_DROP << VC4_ING_VID_CHECK_S; + vc5 |= VC5_DROP_VTABLE_MISS; + } else { + vc4 |= VC4_ING_VID_VIO_FWD << VC4_ING_VID_CHECK_S; + vc5 &= ~VC5_DROP_VTABLE_MISS; + } if (is5325(dev)) vc0 &= ~VC0_RESERVED_1; @@ -420,6 +426,9 @@ static void b53_enable_vlan(struct b53_device *dev, bool enable) } b53_write8(dev, B53_CTRL_PAGE, B53_SWITCH_MODE, mgmt); + + dev->vlan_enabled = enable; + dev->vlan_filtering_enabled = enable_filtering; } static int b53_set_jumbo(struct b53_device *dev, bool enable, bool allow_10_100) @@ -656,7 +665,7 @@ int b53_configure_vlan(struct dsa_switch *ds) b53_do_vlan_op(dev, VTA_CMD_CLEAR); } - b53_enable_vlan(dev, false); + b53_enable_vlan(dev, false, dev->vlan_filtering_enabled); b53_for_each_port(dev, i) b53_write16(dev, B53_VLAN_PAGE, @@ -1265,6 +1274,46 @@ EXPORT_SYMBOL(b53_phylink_mac_link_up); int b53_vlan_filtering(struct dsa_switch *ds, int port, bool vlan_filtering) { + struct b53_device *dev = ds->priv; + struct net_device *bridge_dev; + unsigned int i; + u16 pvid, new_pvid; + + /* Handle the case were multiple bridges span the same switch device + * and one of them has a different setting than what is being requested + * which would be breaking filtering semantics for any of the other + * bridge devices. + */ + b53_for_each_port(dev, i) { + bridge_dev = dsa_to_port(ds, i)->bridge_dev; + if (bridge_dev && + bridge_dev != dsa_to_port(ds, port)->bridge_dev && + br_vlan_enabled(bridge_dev) != vlan_filtering) { + netdev_err(bridge_dev, + "VLAN filtering is global to the switch!\n"); + return -EINVAL; + } + } + + b53_read16(dev, B53_VLAN_PAGE, B53_VLAN_PORT_DEF_TAG(port), &pvid); + new_pvid = pvid; + if (dev->vlan_filtering_enabled && !vlan_filtering) { + /* Filtering is currently enabled, use the default PVID since + * the bridge does not expect tagging anymore + */ + dev->ports[port].pvid = pvid; + new_pvid = b53_default_pvid(dev); + } else if (!dev->vlan_filtering_enabled && vlan_filtering) { + /* Filtering is currently disabled, restore the previous PVID */ + new_pvid = dev->ports[port].pvid; + } + + if (pvid != new_pvid) + b53_write16(dev, B53_VLAN_PAGE, B53_VLAN_PORT_DEF_TAG(port), + new_pvid); + + b53_enable_vlan(dev, dev->vlan_enabled, vlan_filtering); + return 0; } EXPORT_SYMBOL(b53_vlan_filtering); @@ -1280,7 +1329,7 @@ int b53_vlan_prepare(struct dsa_switch *ds, int port, if (vlan->vid_end > dev->num_vlans) return -ERANGE; - b53_enable_vlan(dev, true); + b53_enable_vlan(dev, true, dev->vlan_filtering_enabled); return 0; } diff --git a/drivers/net/dsa/b53/b53_priv.h b/drivers/net/dsa/b53/b53_priv.h index ec796482792d..4dc7ee38b258 100644 --- a/drivers/net/dsa/b53/b53_priv.h +++ b/drivers/net/dsa/b53/b53_priv.h @@ -91,6 +91,7 @@ enum { struct b53_port { u16 vlan_ctl_mask; struct ethtool_eee eee; + u16 pvid; }; struct b53_vlan { @@ -137,6 +138,8 @@ struct b53_device { unsigned int num_vlans; struct b53_vlan *vlans; + bool vlan_enabled; + bool vlan_filtering_enabled; unsigned int num_ports; struct b53_port *ports; }; From a40061ea2e39494104602b3048751341bda374a1 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 15 Feb 2019 12:16:51 -0800 Subject: [PATCH 054/298] net: systemport: Fix reception of BPDUs SYSTEMPORT has its RXCHK parser block that attempts to validate the packet structures, unfortunately setting the L2 header check bit will cause Bridge PDUs (BPDUs) to be incorrectly rejected because they look like LLC/SNAP packets with a non-IPv4 or non-IPv6 Ethernet Type. Fixes: 4e8aedfe78c7 ("net: systemport: Turn on offloads by default") Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bcmsysport.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/broadcom/bcmsysport.c b/drivers/net/ethernet/broadcom/bcmsysport.c index 28c9b0bdf2f6..bc3ac369cbe3 100644 --- a/drivers/net/ethernet/broadcom/bcmsysport.c +++ b/drivers/net/ethernet/broadcom/bcmsysport.c @@ -134,6 +134,10 @@ static void bcm_sysport_set_rx_csum(struct net_device *dev, priv->rx_chk_en = !!(wanted & NETIF_F_RXCSUM); reg = rxchk_readl(priv, RXCHK_CONTROL); + /* Clear L2 header checks, which would prevent BPDUs + * from being received. + */ + reg &= ~RXCHK_L2_HDR_DIS; if (priv->rx_chk_en) reg |= RXCHK_EN; else From c3152ec4c0691e351f35a2f63347a464b5f35151 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 15 Feb 2019 12:16:52 -0800 Subject: [PATCH 055/298] net: dsa: bcm_sf2: Do not assume DSA master supports WoL We assume in the bcm_sf2 driver that the DSA master network device supports ethtool_ops::{get,set}_wol operations, which is not a given. Avoid de-referencing potentially non-existent function pointers and check them as we should. Fixes: 96e65d7f3f88 ("net: dsa: bcm_sf2: add support for Wake-on-LAN") Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/dsa/bcm_sf2.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c index 17ec32b0a1cc..14138d423cf1 100644 --- a/drivers/net/dsa/bcm_sf2.c +++ b/drivers/net/dsa/bcm_sf2.c @@ -726,10 +726,11 @@ static void bcm_sf2_sw_get_wol(struct dsa_switch *ds, int port, { struct net_device *p = ds->ports[port].cpu_dp->master; struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds); - struct ethtool_wolinfo pwol; + struct ethtool_wolinfo pwol = { }; /* Get the parent device WoL settings */ - p->ethtool_ops->get_wol(p, &pwol); + if (p->ethtool_ops->get_wol) + p->ethtool_ops->get_wol(p, &pwol); /* Advertise the parent device supported settings */ wol->supported = pwol.supported; @@ -750,9 +751,10 @@ static int bcm_sf2_sw_set_wol(struct dsa_switch *ds, int port, struct net_device *p = ds->ports[port].cpu_dp->master; struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds); s8 cpu_port = ds->ports[port].cpu_dp->index; - struct ethtool_wolinfo pwol; + struct ethtool_wolinfo pwol = { }; - p->ethtool_ops->get_wol(p, &pwol); + if (p->ethtool_ops->get_wol) + p->ethtool_ops->get_wol(p, &pwol); if (wol->wolopts & ~pwol.supported) return -EINVAL; From 10163aaee9671b01b2f4737922e1a4f43581047a Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 15 Feb 2019 12:16:53 -0800 Subject: [PATCH 056/298] net: dsa: b53: Do not program CPU port's PVID The CPU port is special and does not need to obey VLAN restrictions as far as untagged traffic goes, also, having the CPU port be part of a particular PVID is against the idea of keeping it tagged in all VLANs. Fixes: ca8931948344 ("net: dsa: b53: Keep CPU port as tagged in all VLANs") Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/dsa/b53/b53_common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index 2fef4c564420..c76892ac4e69 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -1359,7 +1359,7 @@ void b53_vlan_add(struct dsa_switch *ds, int port, b53_fast_age_vlan(dev, vid); } - if (pvid) { + if (pvid && !dsa_is_cpu_port(ds, port)) { b53_write16(dev, B53_VLAN_PAGE, B53_VLAN_PORT_DEF_TAG(port), vlan->vid_end); b53_fast_age_vlan(dev, vid); From c93a49b9769e435990c82297aa0baa31e1538790 Mon Sep 17 00:00:00 2001 From: Andrea Claudi Date: Fri, 15 Feb 2019 17:51:48 +0100 Subject: [PATCH 057/298] ipvs: fix warning on unused variable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When CONFIG_IP_VS_IPV6 is not defined, build produced this warning: net/netfilter/ipvs/ip_vs_ctl.c:899:6: warning: unused variable ‘ret’ [-Wunused-variable] int ret = 0; ^~~ Fix this by moving the declaration of 'ret' in the CONFIG_IP_VS_IPV6 section in the same function. While at it, drop its unneeded initialisation. Fixes: 098e13f5b21d ("ipvs: fix dependency on nf_defrag_ipv6") Reported-by: Stefano Brivio Signed-off-by: Andrea Claudi Reviewed-by: Stefano Brivio Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipvs/ip_vs_ctl.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 86afacb07e5f..ac8d848d7624 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -896,12 +896,13 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest, { struct ip_vs_dest *dest; unsigned int atype, i; - int ret = 0; EnterFunction(2); #ifdef CONFIG_IP_VS_IPV6 if (udest->af == AF_INET6) { + int ret; + atype = ipv6_addr_type(&udest->addr.in6); if ((!(atype & IPV6_ADDR_UNICAST) || atype & IPV6_ADDR_LINKLOCAL) && From 8681ef1f3d295bd3600315325f3b3396d76d02f6 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sat, 16 Feb 2019 13:44:39 -0800 Subject: [PATCH 058/298] net: Add header for usage of fls64() Fixes: 3b89ea9c5902 ("net: Fix for_each_netdev_feature on Big endian") Suggested-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdev_features.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h index fce28562bed2..4c76fe2c8488 100644 --- a/include/linux/netdev_features.h +++ b/include/linux/netdev_features.h @@ -11,6 +11,7 @@ #define _LINUX_NETDEV_FEATURES_H #include +#include #include typedef u64 netdev_features_t; From 289460404f6947ef1c38e67d680be9a84161250b Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Sun, 17 Feb 2019 07:18:41 +0000 Subject: [PATCH 059/298] mlxsw: __mlxsw_sp_port_headroom_set(): Fix a use of local variable The function-local variable "delay" enters the loop interpreted as delay in bits. However, inside the loop it gets overwritten by the result of mlxsw_sp_pg_buf_delay_get(), and thus leaves the loop as quantity in cells. Thus on second and further loop iterations, the headroom for a given priority is configured with a wrong size. Fix by introducing a loop-local variable, delay_cells. Rename thres to thres_cells for consistency. Fixes: f417f04da589 ("mlxsw: spectrum: Refactor port buffer configuration") Signed-off-by: Petr Machata Acked-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index 32519c93df17..b65e274b02e9 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -862,8 +862,9 @@ int __mlxsw_sp_port_headroom_set(struct mlxsw_sp_port *mlxsw_sp_port, int mtu, for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++) { bool configure = false; bool pfc = false; + u16 thres_cells; + u16 delay_cells; bool lossy; - u16 thres; for (j = 0; j < IEEE_8021QAZ_MAX_TCS; j++) { if (prio_tc[j] == i) { @@ -877,10 +878,11 @@ int __mlxsw_sp_port_headroom_set(struct mlxsw_sp_port *mlxsw_sp_port, int mtu, continue; lossy = !(pfc || pause_en); - thres = mlxsw_sp_pg_buf_threshold_get(mlxsw_sp, mtu); - delay = mlxsw_sp_pg_buf_delay_get(mlxsw_sp, mtu, delay, pfc, - pause_en); - mlxsw_sp_pg_buf_pack(pbmc_pl, i, thres + delay, thres, lossy); + thres_cells = mlxsw_sp_pg_buf_threshold_get(mlxsw_sp, mtu); + delay_cells = mlxsw_sp_pg_buf_delay_get(mlxsw_sp, mtu, delay, + pfc, pause_en); + mlxsw_sp_pg_buf_pack(pbmc_pl, i, thres_cells + delay_cells, + thres_cells, lossy); } return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(pbmc), pbmc_pl); From c17abcfa93bf0be5e48bb011607d237ac2bfc839 Mon Sep 17 00:00:00 2001 From: Martin Blumenstingl Date: Sat, 9 Feb 2019 02:01:01 +0100 Subject: [PATCH 060/298] pinctrl: meson: meson8b: fix the sdxc_a data 1..3 pins Fix the mismatch between the "sdxc_d13_1_a" pin group definition from meson8b_cbus_groups and the entry in sdxc_a_groups ("sdxc_d0_13_1_a"). This makes it possible to use "sdxc_d13_1_a" in device-tree files to route the MMC data 1..3 pins to GPIOX_1..3. Fixes: 0fefcb6876d0d6 ("pinctrl: Add support for Meson8b") Signed-off-by: Martin Blumenstingl Signed-off-by: Linus Walleij --- drivers/pinctrl/meson/pinctrl-meson8b.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pinctrl/meson/pinctrl-meson8b.c b/drivers/pinctrl/meson/pinctrl-meson8b.c index c69ca95b1ad5..0f140a802137 100644 --- a/drivers/pinctrl/meson/pinctrl-meson8b.c +++ b/drivers/pinctrl/meson/pinctrl-meson8b.c @@ -693,7 +693,7 @@ static const char * const sd_a_groups[] = { static const char * const sdxc_a_groups[] = { "sdxc_d0_0_a", "sdxc_d13_0_a", "sdxc_d47_a", "sdxc_clk_a", - "sdxc_cmd_a", "sdxc_d0_1_a", "sdxc_d0_13_1_a" + "sdxc_cmd_a", "sdxc_d0_1_a", "sdxc_d13_1_a" }; static const char * const pcm_a_groups[] = { From 0358affb5cd8bbd685a6ab163a36dd28a818da73 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 12 Feb 2019 15:41:01 +0100 Subject: [PATCH 061/298] Documentation: change linux-4.x references to 5.x As linux-5.0.x is coming up soon, the documentation should match, in particular the README.rst file, so change all 4.x references accordingly. There was a mix of lowercase and uppercase X here, which I changed to using lowercase consistently. Signed-off-by: Arnd Bergmann Signed-off-by: Jonathan Corbet --- Documentation/admin-guide/README.rst | 32 ++--- Documentation/process/applying-patches.rst | 117 +++++++++--------- .../translations/it_IT/admin-guide/README.rst | 2 +- 3 files changed, 78 insertions(+), 73 deletions(-) diff --git a/Documentation/admin-guide/README.rst b/Documentation/admin-guide/README.rst index 0797eec76be1..47e577264198 100644 --- a/Documentation/admin-guide/README.rst +++ b/Documentation/admin-guide/README.rst @@ -1,9 +1,9 @@ .. _readme: -Linux kernel release 4.x +Linux kernel release 5.x ============================================= -These are the release notes for Linux version 4. Read them carefully, +These are the release notes for Linux version 5. Read them carefully, as they tell you what this is all about, explain how to install the kernel, and what to do if something goes wrong. @@ -63,7 +63,7 @@ Installing the kernel source directory where you have permissions (e.g. your home directory) and unpack it:: - xz -cd linux-4.X.tar.xz | tar xvf - + xz -cd linux-5.x.tar.xz | tar xvf - Replace "X" with the version number of the latest kernel. @@ -72,26 +72,26 @@ Installing the kernel source files. They should match the library, and not get messed up by whatever the kernel-du-jour happens to be. - - You can also upgrade between 4.x releases by patching. Patches are + - You can also upgrade between 5.x releases by patching. Patches are distributed in the xz format. To install by patching, get all the newer patch files, enter the top level directory of the kernel source - (linux-4.X) and execute:: + (linux-5.x) and execute:: - xz -cd ../patch-4.x.xz | patch -p1 + xz -cd ../patch-5.x.xz | patch -p1 - Replace "x" for all versions bigger than the version "X" of your current + Replace "x" for all versions bigger than the version "x" of your current source tree, **in_order**, and you should be ok. You may want to remove the backup files (some-file-name~ or some-file-name.orig), and make sure that there are no failed patches (some-file-name# or some-file-name.rej). If there are, either you or I have made a mistake. - Unlike patches for the 4.x kernels, patches for the 4.x.y kernels + Unlike patches for the 5.x kernels, patches for the 5.x.y kernels (also known as the -stable kernels) are not incremental but instead apply - directly to the base 4.x kernel. For example, if your base kernel is 4.0 - and you want to apply the 4.0.3 patch, you must not first apply the 4.0.1 - and 4.0.2 patches. Similarly, if you are running kernel version 4.0.2 and - want to jump to 4.0.3, you must first reverse the 4.0.2 patch (that is, - patch -R) **before** applying the 4.0.3 patch. You can read more on this in + directly to the base 5.x kernel. For example, if your base kernel is 5.0 + and you want to apply the 5.0.3 patch, you must not first apply the 5.0.1 + and 5.0.2 patches. Similarly, if you are running kernel version 5.0.2 and + want to jump to 5.0.3, you must first reverse the 5.0.2 patch (that is, + patch -R) **before** applying the 5.0.3 patch. You can read more on this in :ref:`Documentation/process/applying-patches.rst `. Alternatively, the script patch-kernel can be used to automate this @@ -114,7 +114,7 @@ Installing the kernel source Software requirements --------------------- - Compiling and running the 4.x kernels requires up-to-date + Compiling and running the 5.x kernels requires up-to-date versions of various software packages. Consult :ref:`Documentation/process/changes.rst ` for the minimum version numbers required and how to get updates for these packages. Beware that using @@ -132,12 +132,12 @@ Build directory for the kernel place for the output files (including .config). Example:: - kernel source code: /usr/src/linux-4.X + kernel source code: /usr/src/linux-5.x build directory: /home/name/build/kernel To configure and build the kernel, use:: - cd /usr/src/linux-4.X + cd /usr/src/linux-5.x make O=/home/name/build/kernel menuconfig make O=/home/name/build/kernel sudo make O=/home/name/build/kernel modules_install install diff --git a/Documentation/process/applying-patches.rst b/Documentation/process/applying-patches.rst index dc2ddc345044..fbb9297e6360 100644 --- a/Documentation/process/applying-patches.rst +++ b/Documentation/process/applying-patches.rst @@ -216,14 +216,14 @@ You can use the ``interdiff`` program (http://cyberelk.net/tim/patchutils/) to generate a patch representing the differences between two patches and then apply the result. -This will let you move from something like 4.7.2 to 4.7.3 in a single +This will let you move from something like 5.7.2 to 5.7.3 in a single step. The -z flag to interdiff will even let you feed it patches in gzip or bzip2 compressed form directly without the use of zcat or bzcat or manual decompression. -Here's how you'd go from 4.7.2 to 4.7.3 in a single step:: +Here's how you'd go from 5.7.2 to 5.7.3 in a single step:: - interdiff -z ../patch-4.7.2.gz ../patch-4.7.3.gz | patch -p1 + interdiff -z ../patch-5.7.2.gz ../patch-5.7.3.gz | patch -p1 Although interdiff may save you a step or two you are generally advised to do the additional steps since interdiff can get things wrong in some cases. @@ -245,62 +245,67 @@ The patches are available at http://kernel.org/ Most recent patches are linked from the front page, but they also have specific homes. -The 4.x.y (-stable) and 4.x patches live at +The 5.x.y (-stable) and 5.x patches live at - https://www.kernel.org/pub/linux/kernel/v4.x/ + https://www.kernel.org/pub/linux/kernel/v5.x/ -The -rc patches live at +The -rc patches are not stored on the webserver but are generated on +demand from git tags such as - https://www.kernel.org/pub/linux/kernel/v4.x/testing/ + https://git.kernel.org/torvalds/p/v5.1-rc1/v5.0 + +The stable -rc patches live at + + https://www.kernel.org/pub/linux/kernel/v5.x/stable-review/ -The 4.x kernels +The 5.x kernels =============== These are the base stable releases released by Linus. The highest numbered release is the most recent. If regressions or other serious flaws are found, then a -stable fix patch -will be released (see below) on top of this base. Once a new 4.x base +will be released (see below) on top of this base. Once a new 5.x base kernel is released, a patch is made available that is a delta between the -previous 4.x kernel and the new one. +previous 5.x kernel and the new one. -To apply a patch moving from 4.6 to 4.7, you'd do the following (note -that such patches do **NOT** apply on top of 4.x.y kernels but on top of the -base 4.x kernel -- if you need to move from 4.x.y to 4.x+1 you need to -first revert the 4.x.y patch). +To apply a patch moving from 5.6 to 5.7, you'd do the following (note +that such patches do **NOT** apply on top of 5.x.y kernels but on top of the +base 5.x kernel -- if you need to move from 5.x.y to 5.x+1 you need to +first revert the 5.x.y patch). Here are some examples:: - # moving from 4.6 to 4.7 + # moving from 5.6 to 5.7 - $ cd ~/linux-4.6 # change to kernel source dir - $ patch -p1 < ../patch-4.7 # apply the 4.7 patch + $ cd ~/linux-5.6 # change to kernel source dir + $ patch -p1 < ../patch-5.7 # apply the 5.7 patch $ cd .. - $ mv linux-4.6 linux-4.7 # rename source dir + $ mv linux-5.6 linux-5.7 # rename source dir - # moving from 4.6.1 to 4.7 + # moving from 5.6.1 to 5.7 - $ cd ~/linux-4.6.1 # change to kernel source dir - $ patch -p1 -R < ../patch-4.6.1 # revert the 4.6.1 patch - # source dir is now 4.6 - $ patch -p1 < ../patch-4.7 # apply new 4.7 patch + $ cd ~/linux-5.6.1 # change to kernel source dir + $ patch -p1 -R < ../patch-5.6.1 # revert the 5.6.1 patch + # source dir is now 5.6 + $ patch -p1 < ../patch-5.7 # apply new 5.7 patch $ cd .. - $ mv linux-4.6.1 linux-4.7 # rename source dir + $ mv linux-5.6.1 linux-5.7 # rename source dir -The 4.x.y kernels +The 5.x.y kernels ================= Kernels with 3-digit versions are -stable kernels. They contain small(ish) critical fixes for security problems or significant regressions discovered -in a given 4.x kernel. +in a given 5.x kernel. This is the recommended branch for users who want the most recent stable kernel and are not interested in helping test development/experimental versions. -If no 4.x.y kernel is available, then the highest numbered 4.x kernel is +If no 5.x.y kernel is available, then the highest numbered 5.x kernel is the current stable kernel. .. note:: @@ -308,23 +313,23 @@ the current stable kernel. The -stable team usually do make incremental patches available as well as patches against the latest mainline release, but I only cover the non-incremental ones below. The incremental ones can be found at - https://www.kernel.org/pub/linux/kernel/v4.x/incr/ + https://www.kernel.org/pub/linux/kernel/v5.x/incr/ -These patches are not incremental, meaning that for example the 4.7.3 -patch does not apply on top of the 4.7.2 kernel source, but rather on top -of the base 4.7 kernel source. +These patches are not incremental, meaning that for example the 5.7.3 +patch does not apply on top of the 5.7.2 kernel source, but rather on top +of the base 5.7 kernel source. -So, in order to apply the 4.7.3 patch to your existing 4.7.2 kernel -source you have to first back out the 4.7.2 patch (so you are left with a -base 4.7 kernel source) and then apply the new 4.7.3 patch. +So, in order to apply the 5.7.3 patch to your existing 5.7.2 kernel +source you have to first back out the 5.7.2 patch (so you are left with a +base 5.7 kernel source) and then apply the new 5.7.3 patch. Here's a small example:: - $ cd ~/linux-4.7.2 # change to the kernel source dir - $ patch -p1 -R < ../patch-4.7.2 # revert the 4.7.2 patch - $ patch -p1 < ../patch-4.7.3 # apply the new 4.7.3 patch + $ cd ~/linux-5.7.2 # change to the kernel source dir + $ patch -p1 -R < ../patch-5.7.2 # revert the 5.7.2 patch + $ patch -p1 < ../patch-5.7.3 # apply the new 5.7.3 patch $ cd .. - $ mv linux-4.7.2 linux-4.7.3 # rename the kernel source dir + $ mv linux-5.7.2 linux-5.7.3 # rename the kernel source dir The -rc kernels =============== @@ -343,38 +348,38 @@ This is a good branch to run for people who want to help out testing development kernels but do not want to run some of the really experimental stuff (such people should see the sections about -next and -mm kernels below). -The -rc patches are not incremental, they apply to a base 4.x kernel, just -like the 4.x.y patches described above. The kernel version before the -rcN +The -rc patches are not incremental, they apply to a base 5.x kernel, just +like the 5.x.y patches described above. The kernel version before the -rcN suffix denotes the version of the kernel that this -rc kernel will eventually turn into. -So, 4.8-rc5 means that this is the fifth release candidate for the 4.8 -kernel and the patch should be applied on top of the 4.7 kernel source. +So, 5.8-rc5 means that this is the fifth release candidate for the 5.8 +kernel and the patch should be applied on top of the 5.7 kernel source. Here are 3 examples of how to apply these patches:: - # first an example of moving from 4.7 to 4.8-rc3 + # first an example of moving from 5.7 to 5.8-rc3 - $ cd ~/linux-4.7 # change to the 4.7 source dir - $ patch -p1 < ../patch-4.8-rc3 # apply the 4.8-rc3 patch + $ cd ~/linux-5.7 # change to the 5.7 source dir + $ patch -p1 < ../patch-5.8-rc3 # apply the 5.8-rc3 patch $ cd .. - $ mv linux-4.7 linux-4.8-rc3 # rename the source dir + $ mv linux-5.7 linux-5.8-rc3 # rename the source dir - # now let's move from 4.8-rc3 to 4.8-rc5 + # now let's move from 5.8-rc3 to 5.8-rc5 - $ cd ~/linux-4.8-rc3 # change to the 4.8-rc3 dir - $ patch -p1 -R < ../patch-4.8-rc3 # revert the 4.8-rc3 patch - $ patch -p1 < ../patch-4.8-rc5 # apply the new 4.8-rc5 patch + $ cd ~/linux-5.8-rc3 # change to the 5.8-rc3 dir + $ patch -p1 -R < ../patch-5.8-rc3 # revert the 5.8-rc3 patch + $ patch -p1 < ../patch-5.8-rc5 # apply the new 5.8-rc5 patch $ cd .. - $ mv linux-4.8-rc3 linux-4.8-rc5 # rename the source dir + $ mv linux-5.8-rc3 linux-5.8-rc5 # rename the source dir - # finally let's try and move from 4.7.3 to 4.8-rc5 + # finally let's try and move from 5.7.3 to 5.8-rc5 - $ cd ~/linux-4.7.3 # change to the kernel source dir - $ patch -p1 -R < ../patch-4.7.3 # revert the 4.7.3 patch - $ patch -p1 < ../patch-4.8-rc5 # apply new 4.8-rc5 patch + $ cd ~/linux-5.7.3 # change to the kernel source dir + $ patch -p1 -R < ../patch-5.7.3 # revert the 5.7.3 patch + $ patch -p1 < ../patch-5.8-rc5 # apply new 5.8-rc5 patch $ cd .. - $ mv linux-4.7.3 linux-4.8-rc5 # rename the kernel source dir + $ mv linux-5.7.3 linux-5.8-rc5 # rename the kernel source dir The -mm patches and the linux-next tree diff --git a/Documentation/translations/it_IT/admin-guide/README.rst b/Documentation/translations/it_IT/admin-guide/README.rst index 80f5ffc94a9e..b37166817842 100644 --- a/Documentation/translations/it_IT/admin-guide/README.rst +++ b/Documentation/translations/it_IT/admin-guide/README.rst @@ -4,7 +4,7 @@ .. _it_readme: -Rilascio del kernel Linux 4.x +Rilascio del kernel Linux 5.x =================================================== .. warning:: From 31a1b8d528fa4aedaa207b38d7fafc4e9b0a0d6c Mon Sep 17 00:00:00 2001 From: Petr Vorel Date: Fri, 15 Feb 2019 00:43:27 +0100 Subject: [PATCH 062/298] doc: Mention MSG_ZEROCOPY implementation for UDP MSG_ZEROCOPY implementation for UDP was merged in v5.0, 6e360f733113 ("Merge branch 'udp-msg_zerocopy'"). Signed-off-by: Petr Vorel Signed-off-by: David S. Miller --- Documentation/networking/msg_zerocopy.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/networking/msg_zerocopy.rst b/Documentation/networking/msg_zerocopy.rst index fe46d4867e2d..18c1415e7bfa 100644 --- a/Documentation/networking/msg_zerocopy.rst +++ b/Documentation/networking/msg_zerocopy.rst @@ -7,7 +7,7 @@ Intro ===== The MSG_ZEROCOPY flag enables copy avoidance for socket send calls. -The feature is currently implemented for TCP sockets. +The feature is currently implemented for TCP and UDP sockets. Opportunity and Caveats From 4012e7d09d99b62d80046790657c0b0e32310d50 Mon Sep 17 00:00:00 2001 From: Alexandre Torgue Date: Fri, 15 Feb 2019 10:49:09 +0100 Subject: [PATCH 063/298] net: stmmac: handle endianness in dwmac4_get_timestamp GMAC IP is little-endian and used on several kind of CPU (big or little endian). Main callbacks functions of the stmmac drivers take care about it. It was not the case for dwmac4_get_timestamp function. Fixes: ba1ffd74df74 ("stmmac: fix PTP support for GMAC4") Signed-off-by: Alexandre Torgue Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/dwmac4_descs.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_descs.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_descs.c index 20299f6f65fc..736e29635b77 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_descs.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_descs.c @@ -241,15 +241,18 @@ static inline void dwmac4_get_timestamp(void *desc, u32 ats, u64 *ts) static int dwmac4_rx_check_timestamp(void *desc) { struct dma_desc *p = (struct dma_desc *)desc; + unsigned int rdes0 = le32_to_cpu(p->des0); + unsigned int rdes1 = le32_to_cpu(p->des1); + unsigned int rdes3 = le32_to_cpu(p->des3); u32 own, ctxt; int ret = 1; - own = p->des3 & RDES3_OWN; - ctxt = ((p->des3 & RDES3_CONTEXT_DESCRIPTOR) + own = rdes3 & RDES3_OWN; + ctxt = ((rdes3 & RDES3_CONTEXT_DESCRIPTOR) >> RDES3_CONTEXT_DESCRIPTOR_SHIFT); if (likely(!own && ctxt)) { - if ((p->des0 == 0xffffffff) && (p->des1 == 0xffffffff)) + if ((rdes0 == 0xffffffff) && (rdes1 == 0xffffffff)) /* Corrupted value */ ret = -EINVAL; else From 97dc47a1308a3af46a09b1546cfb869f2e382a81 Mon Sep 17 00:00:00 2001 From: Beniamino Galvani Date: Fri, 15 Feb 2019 13:20:42 +0100 Subject: [PATCH 064/298] qmi_wwan: apply SET_DTR quirk to Sierra WP7607 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 1199:68C0 USB ID is reused by Sierra WP7607 which requires the DTR quirk to be detected. Apply QMI_QUIRK_SET_DTR unconditionally as already done for other IDs shared between different devices. Signed-off-by: Beniamino Galvani Acked-by: Bjørn Mork Signed-off-by: David S. Miller --- drivers/net/usb/qmi_wwan.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c index 735ad838e2ba..18af2f8eee96 100644 --- a/drivers/net/usb/qmi_wwan.c +++ b/drivers/net/usb/qmi_wwan.c @@ -1201,8 +1201,8 @@ static const struct usb_device_id products[] = { {QMI_FIXED_INTF(0x114f, 0x68a2, 8)}, /* Sierra Wireless MC7750 */ {QMI_FIXED_INTF(0x1199, 0x68a2, 8)}, /* Sierra Wireless MC7710 in QMI mode */ {QMI_FIXED_INTF(0x1199, 0x68a2, 19)}, /* Sierra Wireless MC7710 in QMI mode */ - {QMI_FIXED_INTF(0x1199, 0x68c0, 8)}, /* Sierra Wireless MC7304/MC7354 */ - {QMI_FIXED_INTF(0x1199, 0x68c0, 10)}, /* Sierra Wireless MC7304/MC7354 */ + {QMI_QUIRK_SET_DTR(0x1199, 0x68c0, 8)}, /* Sierra Wireless MC7304/MC7354, WP76xx */ + {QMI_QUIRK_SET_DTR(0x1199, 0x68c0, 10)},/* Sierra Wireless MC7304/MC7354 */ {QMI_FIXED_INTF(0x1199, 0x901c, 8)}, /* Sierra Wireless EM7700 */ {QMI_FIXED_INTF(0x1199, 0x901f, 8)}, /* Sierra Wireless EM7355 */ {QMI_FIXED_INTF(0x1199, 0x9041, 8)}, /* Sierra Wireless MC7305/MC7355 */ From e928b5d6b75e239feb9c6d5488974b6646a0ebc8 Mon Sep 17 00:00:00 2001 From: Alexey Khoroshilov Date: Sat, 16 Feb 2019 00:20:54 +0300 Subject: [PATCH 065/298] net: mv643xx_eth: disable clk on error path in mv643xx_eth_shared_probe() If mv643xx_eth_shared_of_probe() fails, mv643xx_eth_shared_probe() leaves clk enabled. Found by Linux Driver Verification project (linuxtesting.org). Signed-off-by: Alexey Khoroshilov Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/mv643xx_eth.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/mv643xx_eth.c b/drivers/net/ethernet/marvell/mv643xx_eth.c index 2f427271a793..292a668ce88e 100644 --- a/drivers/net/ethernet/marvell/mv643xx_eth.c +++ b/drivers/net/ethernet/marvell/mv643xx_eth.c @@ -2879,7 +2879,7 @@ static int mv643xx_eth_shared_probe(struct platform_device *pdev) ret = mv643xx_eth_shared_of_probe(pdev); if (ret) - return ret; + goto err_put_clk; pd = dev_get_platdata(&pdev->dev); msp->tx_csum_limit = (pd != NULL && pd->tx_csum_limit) ? @@ -2887,6 +2887,11 @@ static int mv643xx_eth_shared_probe(struct platform_device *pdev) infer_hw_params(msp); return 0; + +err_put_clk: + if (!IS_ERR(msp->clk)) + clk_disable_unprepare(msp->clk); + return ret; } static int mv643xx_eth_shared_remove(struct platform_device *pdev) From 04c03114be82194d4a4858d41dba8e286ad1787c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 15 Feb 2019 13:36:20 -0800 Subject: [PATCH 066/298] tcp: clear icsk_backoff in tcp_write_queue_purge() soukjin bae reported a crash in tcp_v4_err() handling ICMP_DEST_UNREACH after tcp_write_queue_head(sk) returned a NULL pointer. Current logic should have prevented this : if (seq != tp->snd_una || !icsk->icsk_retransmits || !icsk->icsk_backoff || fastopen) break; Problem is the write queue might have been purged and icsk_backoff has not been cleared. Signed-off-by: Eric Dumazet Reported-by: soukjin bae Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 2079145a3b7c..cf3c5095c10e 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2528,6 +2528,7 @@ void tcp_write_queue_purge(struct sock *sk) sk_mem_reclaim(sk); tcp_clear_all_retrans_hints(tcp_sk(sk)); tcp_sk(sk)->packets_out = 0; + inet_csk(sk)->icsk_backoff = 0; } int tcp_disconnect(struct sock *sk, int flags) @@ -2576,7 +2577,6 @@ int tcp_disconnect(struct sock *sk, int flags) tp->write_seq += tp->max_window + 2; if (tp->write_seq == 0) tp->write_seq = 1; - icsk->icsk_backoff = 0; tp->snd_cwnd = 2; icsk->icsk_probes_out = 0; tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; From 2c4cc9712364c051b1de2d175d5fbea6be948ebf Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 15 Feb 2019 13:36:21 -0800 Subject: [PATCH 067/298] tcp: tcp_v4_err() should be more careful ICMP handlers are not very often stressed, we should make them more resilient to bugs that might surface in the future. If there is no packet in retransmit queue, we should avoid a NULL deref. Signed-off-by: Eric Dumazet Reported-by: soukjin bae Acked-by: Neal Cardwell Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index efc6fef692ff..ec3cea9d6828 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -536,12 +536,15 @@ int tcp_v4_err(struct sk_buff *icmp_skb, u32 info) if (sock_owned_by_user(sk)) break; + skb = tcp_rtx_queue_head(sk); + if (WARN_ON_ONCE(!skb)) + break; + icsk->icsk_backoff--; icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT; icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX); - skb = tcp_rtx_queue_head(sk); tcp_mstamp_refresh(tp); delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb)); From 8644772637deb121f7ac2df690cbf83fa63d3b70 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Fri, 15 Feb 2019 14:44:12 -0800 Subject: [PATCH 068/298] mm: Use fixed constant in page_frag_alloc instead of size + 1 This patch replaces the size + 1 value introduced with the recent fix for 1 byte allocs with a constant value. The idea here is to reduce code overhead as the previous logic would have to read size into a register, then increment it, and write it back to whatever field was being used. By using a constant we can avoid those memory reads and arithmetic operations in favor of just encoding the maximum value into the operation itself. Fixes: 2c2ade81741c ("mm: page_alloc: fix ref bias in page_frag_alloc() for 1-byte allocs") Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- mm/page_alloc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 46285d28e43b..7f79b78bc829 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4675,11 +4675,11 @@ void *page_frag_alloc(struct page_frag_cache *nc, /* Even if we own the page, we do not use atomic_set(). * This would break get_page_unless_zero() users. */ - page_ref_add(page, size); + page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE); /* reset page count bias and offset to start of new frag */ nc->pfmemalloc = page_is_pfmemalloc(page); - nc->pagecnt_bias = size + 1; + nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1; nc->offset = size; } @@ -4695,10 +4695,10 @@ void *page_frag_alloc(struct page_frag_cache *nc, size = nc->size; #endif /* OK, page count is 0, we can safely set it */ - set_page_count(page, size + 1); + set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1); /* reset page count bias and offset to start of new frag */ - nc->pagecnt_bias = size + 1; + nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1; offset = size - fragsz; } From 3bed3cc4156eedf652b4df72bdb35d4f1a2a739d Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Fri, 15 Feb 2019 14:44:18 -0800 Subject: [PATCH 069/298] net: Do not allocate page fragments that are not skb aligned This patch addresses the fact that there are drivers, specifically tun, that will call into the network page fragment allocators with buffer sizes that are not cache aligned. Doing this could result in data alignment and DMA performance issues as these fragment pools are also shared with the skb allocator and any other devices that will use napi_alloc_frags or netdev_alloc_frags. Fixes: ffde7328a36d ("net: Split netdev_alloc_frag into __alloc_page_frag and add __napi_alloc_frag") Reported-by: Jann Horn Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- net/core/skbuff.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 26d848484912..2415d9cb9b89 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -356,6 +356,8 @@ static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) */ void *netdev_alloc_frag(unsigned int fragsz) { + fragsz = SKB_DATA_ALIGN(fragsz); + return __netdev_alloc_frag(fragsz, GFP_ATOMIC); } EXPORT_SYMBOL(netdev_alloc_frag); @@ -369,6 +371,8 @@ static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) void *napi_alloc_frag(unsigned int fragsz) { + fragsz = SKB_DATA_ALIGN(fragsz); + return __napi_alloc_frag(fragsz, GFP_ATOMIC); } EXPORT_SYMBOL(napi_alloc_frag); From 660899ddf06ae8bb5bbbd0a19418b739375430c5 Mon Sep 17 00:00:00 2001 From: Tobias Brunner Date: Mon, 18 Feb 2019 10:49:39 +0100 Subject: [PATCH 070/298] xfrm: Fix inbound traffic via XFRM interfaces across network namespaces After moving an XFRM interface to another namespace it stays associated with the original namespace (net in `struct xfrm_if` and the list keyed with `xfrmi_net_id`), allowing processes in the new namespace to use SAs/policies that were created in the original namespace. For instance, this allows a keying daemon in one namespace to establish IPsec SAs for other namespaces without processes there having access to the keys or IKE credentials. This worked fine for outbound traffic, however, for inbound traffic the lookup for the interfaces and the policies used the incorrect namespace (the one the XFRM interface was moved to). Fixes: f203b76d7809 ("xfrm: Add virtual xfrm interfaces") Signed-off-by: Tobias Brunner Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_interface.c | 4 ++-- net/xfrm/xfrm_policy.c | 4 +++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c index 6be8c7df15bb..dbb3c1945b5c 100644 --- a/net/xfrm/xfrm_interface.c +++ b/net/xfrm/xfrm_interface.c @@ -76,10 +76,10 @@ static struct xfrm_if *xfrmi_decode_session(struct sk_buff *skb) int ifindex; struct xfrm_if *xi; - if (!skb->dev) + if (!secpath_exists(skb) || !skb->dev) return NULL; - xfrmn = net_generic(dev_net(skb->dev), xfrmi_net_id); + xfrmn = net_generic(xs_net(xfrm_input_state(skb)), xfrmi_net_id); ifindex = skb->dev->ifindex; for_each_xfrmi_rcu(xfrmn->xfrmi[0], xi) { diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index ba0a4048c846..8d1a898d0ba5 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -3314,8 +3314,10 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, if (ifcb) { xi = ifcb->decode_session(skb); - if (xi) + if (xi) { if_id = xi->p.if_id; + net = xi->net; + } } rcu_read_unlock(); From f54dada8274643e3ff4436df0ea124aeedc43cae Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Fri, 15 Feb 2019 16:34:27 +0000 Subject: [PATCH 071/298] arm64: fix SSBS sanitization In valid_user_regs() we treat SSBS as a RES0 bit, and consequently it is unexpectedly cleared when we restore a sigframe or fiddle with GPRs via ptrace. This patch fixes valid_user_regs() to account for this, updating the function to refer to the latest ARM ARM (ARM DDI 0487D.a). For AArch32 tasks, SSBS appears in bit 23 of SPSR_EL1, matching its position in the AArch32-native PSR format, and we don't need to translate it as we have to for DIT. There are no other bit assignments that we need to account for today. As the recent documentation describes the DIT bit, we can drop our comment regarding DIT. While removing SSBS from the RES0 masks, existing inconsistent whitespace is corrected. Fixes: d71be2b6c0e19180 ("arm64: cpufeature: Detect SSBS and advertise to userspace") Signed-off-by: Mark Rutland Cc: Catalin Marinas Cc: Suzuki K Poulose Cc: Will Deacon Signed-off-by: Will Deacon --- arch/arm64/kernel/ptrace.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index 9dce33b0e260..ddaea0fd2fa4 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c @@ -1702,19 +1702,20 @@ void syscall_trace_exit(struct pt_regs *regs) } /* - * SPSR_ELx bits which are always architecturally RES0 per ARM DDI 0487C.a - * We also take into account DIT (bit 24), which is not yet documented, and - * treat PAN and UAO as RES0 bits, as they are meaningless at EL0, and may be - * allocated an EL0 meaning in future. + * SPSR_ELx bits which are always architecturally RES0 per ARM DDI 0487D.a. + * We permit userspace to set SSBS (AArch64 bit 12, AArch32 bit 23) which is + * not described in ARM DDI 0487D.a. + * We treat PAN and UAO as RES0 bits, as they are meaningless at EL0, and may + * be allocated an EL0 meaning in future. * Userspace cannot use these until they have an architectural meaning. * Note that this follows the SPSR_ELx format, not the AArch32 PSR format. * We also reserve IL for the kernel; SS is handled dynamically. */ #define SPSR_EL1_AARCH64_RES0_BITS \ - (GENMASK_ULL(63,32) | GENMASK_ULL(27, 25) | GENMASK_ULL(23, 22) | \ - GENMASK_ULL(20, 10) | GENMASK_ULL(5, 5)) + (GENMASK_ULL(63, 32) | GENMASK_ULL(27, 25) | GENMASK_ULL(23, 22) | \ + GENMASK_ULL(20, 13) | GENMASK_ULL(11, 10) | GENMASK_ULL(5, 5)) #define SPSR_EL1_AARCH32_RES0_BITS \ - (GENMASK_ULL(63,32) | GENMASK_ULL(23, 22) | GENMASK_ULL(20,20)) + (GENMASK_ULL(63, 32) | GENMASK_ULL(22, 22) | GENMASK_ULL(20, 20)) static int valid_compat_regs(struct user_pt_regs *regs) { From 0738c8b5915c7eaf1e6007b441008e8f3b460443 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 14 Feb 2019 18:39:59 -0700 Subject: [PATCH 072/298] arm64/neon: Disable -Wincompatible-pointer-types when building with Clang After commit cc9f8349cb33 ("arm64: crypto: add NEON accelerated XOR implementation"), Clang builds for arm64 started failing with the following error message. arch/arm64/lib/xor-neon.c:58:28: error: incompatible pointer types assigning to 'const unsigned long *' from 'uint64_t *' (aka 'unsigned long long *') [-Werror,-Wincompatible-pointer-types] v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6)); ^~~~~~~~ /usr/lib/llvm-9/lib/clang/9.0.0/include/arm_neon.h:7538:47: note: expanded from macro 'vld1q_u64' __ret = (uint64x2_t) __builtin_neon_vld1q_v(__p0, 51); \ ^~~~ There has been quite a bit of debate and triage that has gone into figuring out what the proper fix is, viewable at the link below, which is still ongoing. Ard suggested disabling this warning with Clang with a pragma so no neon code will have this type of error. While this is not at all an ideal solution, this build error is the only thing preventing KernelCI from having successful arm64 defconfig and allmodconfig builds on linux-next. Getting continuous integration running is more important so new warnings/errors or boot failures can be caught and fixed quickly. Link: https://github.com/ClangBuiltLinux/linux/issues/283 Suggested-by: Ard Biesheuvel Acked-by: Ard Biesheuvel Signed-off-by: Nathan Chancellor Signed-off-by: Will Deacon --- arch/arm64/include/asm/neon-intrinsics.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/arm64/include/asm/neon-intrinsics.h b/arch/arm64/include/asm/neon-intrinsics.h index 2ba6c6b9541f..71abfc7612b2 100644 --- a/arch/arm64/include/asm/neon-intrinsics.h +++ b/arch/arm64/include/asm/neon-intrinsics.h @@ -36,4 +36,8 @@ #include #endif +#ifdef CONFIG_CC_IS_CLANG +#pragma clang diagnostic ignored "-Wincompatible-pointer-types" +#endif + #endif /* __ASM_NEON_INTRINSICS_H */ From 4f0557795e76d049f0a1687f1f050addf4df2dac Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Mon, 4 Feb 2019 15:07:06 +0100 Subject: [PATCH 073/298] mailbox: Export mbox_flush() The mbox_flush() function can be used by drivers that are built as modules, so the function needs to be exported. Reported-by: Mark Brown Signed-off-by: Thierry Reding Signed-off-by: Jassi Brar --- drivers/mailbox/mailbox.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/mailbox/mailbox.c b/drivers/mailbox/mailbox.c index c6a7d4582dc6..38d9df3fb199 100644 --- a/drivers/mailbox/mailbox.c +++ b/drivers/mailbox/mailbox.c @@ -310,6 +310,7 @@ int mbox_flush(struct mbox_chan *chan, unsigned long timeout) return ret; } +EXPORT_SYMBOL_GPL(mbox_flush); /** * mbox_request_channel - Request a mailbox channel. From d7bf31a0f85faaf63c63c39d55154825a1eaaea9 Mon Sep 17 00:00:00 2001 From: Rayagonda Kokatanur Date: Mon, 4 Feb 2019 11:21:29 -0800 Subject: [PATCH 074/298] mailbox: bcm-flexrm-mailbox: Fix FlexRM ring flush timeout issue RING_CONTROL reg was not written due to wrong address, hence all the subsequent ring flush was timing out. Fixes: a371c10ea4b3 ("mailbox: bcm-flexrm-mailbox: Fix FlexRM ring flush sequence") Signed-off-by: Rayagonda Kokatanur Signed-off-by: Ray Jui Reviewed-by: Scott Branden Signed-off-by: Jassi Brar --- drivers/mailbox/bcm-flexrm-mailbox.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/mailbox/bcm-flexrm-mailbox.c b/drivers/mailbox/bcm-flexrm-mailbox.c index d713271ebf7c..a64116586b4c 100644 --- a/drivers/mailbox/bcm-flexrm-mailbox.c +++ b/drivers/mailbox/bcm-flexrm-mailbox.c @@ -1396,9 +1396,9 @@ static void flexrm_shutdown(struct mbox_chan *chan) /* Clear ring flush state */ timeout = 1000; /* timeout of 1s */ - writel_relaxed(0x0, ring + RING_CONTROL); + writel_relaxed(0x0, ring->regs + RING_CONTROL); do { - if (!(readl_relaxed(ring + RING_FLUSH_DONE) & + if (!(readl_relaxed(ring->regs + RING_FLUSH_DONE) & FLUSH_DONE_MASK)) break; mdelay(1); From 0fd3fd0a9bb0b02b6435bb7070e9f7b82a23f068 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 5 Feb 2019 20:30:27 +0100 Subject: [PATCH 075/298] libceph: handle an empty authorize reply The authorize reply can be empty, for example when the ticket used to build the authorizer is too old and TAG_BADAUTHORIZER is returned from the service. Calling ->verify_authorizer_reply() results in an attempt to decrypt and validate (somewhat) random data in au->buf (most likely the signature block from calc_signature()), which fails and ends up in con_fault_finish() with !con->auth_retry. The ticket isn't invalidated and the connection is retried again and again until a new ticket is obtained from the monitor: libceph: osd2 192.168.122.1:6809 bad authorize reply libceph: osd2 192.168.122.1:6809 bad authorize reply libceph: osd2 192.168.122.1:6809 bad authorize reply libceph: osd2 192.168.122.1:6809 bad authorize reply Let TAG_BADAUTHORIZER handler kick in and increment con->auth_retry. Cc: stable@vger.kernel.org Fixes: 5c056fdc5b47 ("libceph: verify authorize reply on connect") Link: https://tracker.ceph.com/issues/20164 Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- net/ceph/messenger.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 3661cdd927f1..7e71b0df1fbc 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -2058,6 +2058,8 @@ static int process_connect(struct ceph_connection *con) dout("process_connect on %p tag %d\n", con, (int)con->in_tag); if (con->auth) { + int len = le32_to_cpu(con->in_reply.authorizer_len); + /* * Any connection that defines ->get_authorizer() * should also define ->add_authorizer_challenge() and @@ -2067,8 +2069,7 @@ static int process_connect(struct ceph_connection *con) */ if (con->in_reply.tag == CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER) { ret = con->ops->add_authorizer_challenge( - con, con->auth->authorizer_reply_buf, - le32_to_cpu(con->in_reply.authorizer_len)); + con, con->auth->authorizer_reply_buf, len); if (ret < 0) return ret; @@ -2078,10 +2079,12 @@ static int process_connect(struct ceph_connection *con) return 0; } - ret = con->ops->verify_authorizer_reply(con); - if (ret < 0) { - con->error_msg = "bad authorize reply"; - return ret; + if (len) { + ret = con->ops->verify_authorizer_reply(con); + if (ret < 0) { + con->error_msg = "bad authorize reply"; + return ret; + } } } From 04242ff3ac0abbaa4362f97781dac268e6c3541a Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Mon, 11 Feb 2019 15:18:52 +0800 Subject: [PATCH 076/298] ceph: avoid repeatedly adding inode to mdsc->snap_flush_list Otherwise, mdsc->snap_flush_list may get corrupted. Cc: stable@vger.kernel.org Signed-off-by: "Yan, Zheng" Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- fs/ceph/snap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 041c27ea8de1..f74193da0e09 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -616,7 +616,8 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci, capsnap->size); spin_lock(&mdsc->snap_flush_lock); - list_add_tail(&ci->i_snap_flush_item, &mdsc->snap_flush_list); + if (list_empty(&ci->i_snap_flush_item)) + list_add_tail(&ci->i_snap_flush_item, &mdsc->snap_flush_list); spin_unlock(&mdsc->snap_flush_lock); return 1; /* caller may want to ceph_flush_snaps */ } From 304017d31df36fb61eb2ed3ebf65fb6870b3c731 Mon Sep 17 00:00:00 2001 From: Bard liao Date: Sun, 17 Feb 2019 21:23:47 +0800 Subject: [PATCH 077/298] ASoC: topology: free created components in tplg load error Topology resources are no longer needed if any element failed to load. Signed-off-by: Bard liao Signed-off-by: Mark Brown --- sound/soc/soc-topology.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/sound/soc/soc-topology.c b/sound/soc/soc-topology.c index fc79ec6927e3..731b963b6995 100644 --- a/sound/soc/soc-topology.c +++ b/sound/soc/soc-topology.c @@ -2487,6 +2487,7 @@ int snd_soc_tplg_component_load(struct snd_soc_component *comp, struct snd_soc_tplg_ops *ops, const struct firmware *fw, u32 id) { struct soc_tplg tplg; + int ret; /* setup parsing context */ memset(&tplg, 0, sizeof(tplg)); @@ -2500,7 +2501,12 @@ int snd_soc_tplg_component_load(struct snd_soc_component *comp, tplg.bytes_ext_ops = ops->bytes_ext_ops; tplg.bytes_ext_ops_count = ops->bytes_ext_ops_count; - return soc_tplg_load(&tplg); + ret = soc_tplg_load(&tplg); + /* free the created components if fail to load topology */ + if (ret) + snd_soc_tplg_component_remove(comp, SND_SOC_TPLG_INDEX_ALL); + + return ret; } EXPORT_SYMBOL_GPL(snd_soc_tplg_component_load); From 19dd0777773ab17b4d97f7105e836867c0cdecb4 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Fri, 15 Feb 2019 15:31:29 +0900 Subject: [PATCH 078/298] ASoC: simple-card: fixup refcount_t underflow commit da215354eb55c ("ASoC: simple-card: merge simple-scu-card") merged simple-card and simple-scu-card. Then it had refcount underflow bug. This patch fixup it. We will get below error without this patch. OF: ERROR: Bad of_node_put() on /sound CPU: 3 PID: 237 Comm: kworker/3:1 Not tainted 5.0.0-rc6+ #1514 Hardware name: Renesas H3ULCB Kingfisher board based on r8a7795 ES2.0+ (DT) Workqueue: events deferred_probe_work_func Call trace: dump_backtrace+0x0/0x150 show_stack+0x24/0x30 dump_stack+0xb0/0xec of_node_release+0xd0/0xd8 kobject_put+0x74/0xe8 of_node_put+0x24/0x30 __of_get_next_child+0x50/0x70 of_get_next_child+0x40/0x68 asoc_simple_card_probe+0x604/0x730 platform_drv_probe+0x58/0xa8 ... Reported-by: Vicente Bergas Signed-off-by: Kuninori Morimoto Signed-off-by: Mark Brown --- sound/soc/generic/simple-card.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/generic/simple-card.c b/sound/soc/generic/simple-card.c index 37e001cf9cd1..3fe34417ec89 100644 --- a/sound/soc/generic/simple-card.c +++ b/sound/soc/generic/simple-card.c @@ -462,7 +462,7 @@ static int asoc_simple_card_parse_of(struct simple_card_data *priv) conf_idx = 0; node = of_get_child_by_name(top, PREFIX "dai-link"); if (!node) { - node = dev->of_node; + node = of_node_get(top); loop = 0; } From 9060cb719e61b685ec0102574e10337fa5f445ea Mon Sep 17 00:00:00 2001 From: Mao Wenan Date: Mon, 18 Feb 2019 10:44:44 +0800 Subject: [PATCH 079/298] net: crypto set sk to NULL when af_alg_release. KASAN has found use-after-free in sockfs_setattr. The existed commit 6d8c50dcb029 ("socket: close race condition between sock_close() and sockfs_setattr()") is to fix this simillar issue, but it seems to ignore that crypto module forgets to set the sk to NULL after af_alg_release. KASAN report details as below: BUG: KASAN: use-after-free in sockfs_setattr+0x120/0x150 Write of size 4 at addr ffff88837b956128 by task syz-executor0/4186 CPU: 2 PID: 4186 Comm: syz-executor0 Not tainted xxx + #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1ubuntu1 04/01/2014 Call Trace: dump_stack+0xca/0x13e print_address_description+0x79/0x330 ? vprintk_func+0x5e/0xf0 kasan_report+0x18a/0x2e0 ? sockfs_setattr+0x120/0x150 sockfs_setattr+0x120/0x150 ? sock_register+0x2d0/0x2d0 notify_change+0x90c/0xd40 ? chown_common+0x2ef/0x510 chown_common+0x2ef/0x510 ? chmod_common+0x3b0/0x3b0 ? __lock_is_held+0xbc/0x160 ? __sb_start_write+0x13d/0x2b0 ? __mnt_want_write+0x19a/0x250 do_fchownat+0x15c/0x190 ? __ia32_sys_chmod+0x80/0x80 ? trace_hardirqs_on_thunk+0x1a/0x1c __x64_sys_fchownat+0xbf/0x160 ? lockdep_hardirqs_on+0x39a/0x5e0 do_syscall_64+0xc8/0x580 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x462589 Code: f7 d8 64 89 02 b8 ff ff ff ff c3 66 0f 1f 44 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 bc ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007fb4b2c83c58 EFLAGS: 00000246 ORIG_RAX: 0000000000000104 RAX: ffffffffffffffda RBX: 000000000072bfa0 RCX: 0000000000462589 RDX: 0000000000000000 RSI: 00000000200000c0 RDI: 0000000000000007 RBP: 0000000000000005 R08: 0000000000001000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 00007fb4b2c846bc R13: 00000000004bc733 R14: 00000000006f5138 R15: 00000000ffffffff Allocated by task 4185: kasan_kmalloc+0xa0/0xd0 __kmalloc+0x14a/0x350 sk_prot_alloc+0xf6/0x290 sk_alloc+0x3d/0xc00 af_alg_accept+0x9e/0x670 hash_accept+0x4a3/0x650 __sys_accept4+0x306/0x5c0 __x64_sys_accept4+0x98/0x100 do_syscall_64+0xc8/0x580 entry_SYSCALL_64_after_hwframe+0x49/0xbe Freed by task 4184: __kasan_slab_free+0x12e/0x180 kfree+0xeb/0x2f0 __sk_destruct+0x4e6/0x6a0 sk_destruct+0x48/0x70 __sk_free+0xa9/0x270 sk_free+0x2a/0x30 af_alg_release+0x5c/0x70 __sock_release+0xd3/0x280 sock_close+0x1a/0x20 __fput+0x27f/0x7f0 task_work_run+0x136/0x1b0 exit_to_usermode_loop+0x1a7/0x1d0 do_syscall_64+0x461/0x580 entry_SYSCALL_64_after_hwframe+0x49/0xbe Syzkaller reproducer: r0 = perf_event_open(&(0x7f0000000000)={0x0, 0x70, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, @perf_config_ext}, 0x0, 0x0, 0xffffffffffffffff, 0x0) r1 = socket$alg(0x26, 0x5, 0x0) getrusage(0x0, 0x0) bind(r1, &(0x7f00000001c0)=@alg={0x26, 'hash\x00', 0x0, 0x0, 'sha256-ssse3\x00'}, 0x80) r2 = accept(r1, 0x0, 0x0) r3 = accept4$unix(r2, 0x0, 0x0, 0x0) r4 = dup3(r3, r0, 0x0) fchownat(r4, &(0x7f00000000c0)='\x00', 0x0, 0x0, 0x1000) Fixes: 6d8c50dcb029 ("socket: close race condition between sock_close() and sockfs_setattr()") Signed-off-by: Mao Wenan Signed-off-by: David S. Miller --- crypto/af_alg.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/crypto/af_alg.c b/crypto/af_alg.c index 17eb09d222ff..ec78a04eb136 100644 --- a/crypto/af_alg.c +++ b/crypto/af_alg.c @@ -122,8 +122,10 @@ static void alg_do_release(const struct af_alg_type *type, void *private) int af_alg_release(struct socket *sock) { - if (sock->sk) + if (sock->sk) { sock_put(sock->sk); + sock->sk = NULL; + } return 0; } EXPORT_SYMBOL_GPL(af_alg_release); From 21d2cb491b9e10bfdf10424673b43cd9eddc2da1 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Sun, 17 Feb 2019 23:03:31 +0000 Subject: [PATCH 080/298] net/mlx4_en: fix spelling mistake: "quiting" -> "quitting" There is a spelling mistake in a en_err error message. Fix it. Signed-off-by: Colin Ian King Reviewed-by: Tariq Toukan Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c index 6b88881b8e35..c1438ae52a11 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c @@ -3360,7 +3360,7 @@ int mlx4_en_init_netdev(struct mlx4_en_dev *mdev, int port, dev->addr_len = ETH_ALEN; mlx4_en_u64_to_mac(dev->dev_addr, mdev->dev->caps.def_mac[priv->port]); if (!is_valid_ether_addr(dev->dev_addr)) { - en_err(priv, "Port: %d, invalid mac burned: %pM, quiting\n", + en_err(priv, "Port: %d, invalid mac burned: %pM, quitting\n", priv->port, dev->dev_addr); err = -EINVAL; goto out; From df1a2cb7c74b3d3abc8d8c2d690f82c8ebc3490a Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 12 Feb 2019 15:42:38 -0800 Subject: [PATCH 081/298] bpf/test_run: fix unkillable BPF_PROG_TEST_RUN Syzbot found out that running BPF_PROG_TEST_RUN with repeat=0xffffffff makes process unkillable. The problem is that when CONFIG_PREEMPT is enabled, we never see need_resched() return true. This is due to the fact that preempt_enable() (which we do in bpf_test_run_one on each iteration) now handles resched if it's needed. Let's disable preemption for the whole run, not per test. In this case we can properly see whether resched is needed. Let's also properly return -EINTR to the userspace in case of a signal interrupt. See recent discussion: http://lore.kernel.org/netdev/CAH3MdRWHr4N8jei8jxDppXjmw-Nw=puNDLbu1dQOFQHxfU2onA@mail.gmail.com I'll follow up with the same fix bpf_prog_test_run_flow_dissector in bpf-next. Reported-by: syzbot Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann --- net/bpf/test_run.c | 45 ++++++++++++++++++++++++--------------------- 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index fa2644d276ef..e31e1b20f7f4 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -13,27 +13,13 @@ #include #include -static __always_inline u32 bpf_test_run_one(struct bpf_prog *prog, void *ctx, - struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE]) -{ - u32 ret; - - preempt_disable(); - rcu_read_lock(); - bpf_cgroup_storage_set(storage); - ret = BPF_PROG_RUN(prog, ctx); - rcu_read_unlock(); - preempt_enable(); - - return ret; -} - -static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, u32 *ret, - u32 *time) +static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, + u32 *retval, u32 *time) { struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = { 0 }; enum bpf_cgroup_storage_type stype; u64 time_start, time_spent = 0; + int ret = 0; u32 i; for_each_cgroup_storage_type(stype) { @@ -48,25 +34,42 @@ static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, u32 *ret, if (!repeat) repeat = 1; + + rcu_read_lock(); + preempt_disable(); time_start = ktime_get_ns(); for (i = 0; i < repeat; i++) { - *ret = bpf_test_run_one(prog, ctx, storage); + bpf_cgroup_storage_set(storage); + *retval = BPF_PROG_RUN(prog, ctx); + + if (signal_pending(current)) { + ret = -EINTR; + break; + } + if (need_resched()) { - if (signal_pending(current)) - break; time_spent += ktime_get_ns() - time_start; + preempt_enable(); + rcu_read_unlock(); + cond_resched(); + + rcu_read_lock(); + preempt_disable(); time_start = ktime_get_ns(); } } time_spent += ktime_get_ns() - time_start; + preempt_enable(); + rcu_read_unlock(); + do_div(time_spent, repeat); *time = time_spent > U32_MAX ? U32_MAX : (u32)time_spent; for_each_cgroup_storage_type(stype) bpf_cgroup_storage_free(storage[stype]); - return 0; + return ret; } static int bpf_test_finish(const union bpf_attr *kattr, From f2ffff085d287eec499f1fccd682796ad8010303 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Mon, 18 Feb 2019 11:29:29 +0100 Subject: [PATCH 082/298] mac80211: mesh: fix missing unlock on error in table_path_del() spin_lock_bh() is used in table_path_del() but rcu_read_unlock() is used for unlocking. Fix it by using spin_unlock_bh() instead of rcu_read_unlock() in the error handling case. Fixes: b4c3fbe63601 ("mac80211: Use linked list instead of rhashtable walk for mesh tables") Acked-by: Herbert Xu Signed-off-by: Wei Yongjun Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- net/mac80211/mesh_pathtbl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/mac80211/mesh_pathtbl.c b/net/mac80211/mesh_pathtbl.c index c3a7396fb955..88a6d5e18ccc 100644 --- a/net/mac80211/mesh_pathtbl.c +++ b/net/mac80211/mesh_pathtbl.c @@ -627,7 +627,7 @@ static int table_path_del(struct mesh_table *tbl, spin_lock_bh(&tbl->walk_lock); mpath = rhashtable_lookup_fast(&tbl->rhead, addr, mesh_rht_params); if (!mpath) { - rcu_read_unlock(); + spin_unlock_bh(&tbl->walk_lock); return -ENXIO; } From 8e29d23e28ee7fb995a00c1ca7e1a4caf5070b12 Mon Sep 17 00:00:00 2001 From: David Chen Date: Sat, 16 Feb 2019 17:16:42 +0800 Subject: [PATCH 083/298] r8152: Add support for MAC address pass through on RTL8153-BD RTL8153-BD is used in Dell DA300 type-C dongle. It should be added to the whitelist of devices to activate MAC address pass through. Per confirming with Realtek all devices containing RTL8153-BD should activate MAC pass through and there won't use pass through bit on efuse like in RTL8153-AD. Signed-off-by: David Chen Signed-off-by: David S. Miller --- drivers/net/usb/r8152.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c index 60dd1ec1665f..ada6baf8847a 100644 --- a/drivers/net/usb/r8152.c +++ b/drivers/net/usb/r8152.c @@ -557,6 +557,7 @@ enum spd_duplex { /* MAC PASSTHRU */ #define AD_MASK 0xfee0 #define BND_MASK 0x0004 +#define BD_MASK 0x0001 #define EFUSE 0xcfdb #define PASS_THRU_MASK 0x1 @@ -1176,9 +1177,9 @@ static int vendor_mac_passthru_addr_read(struct r8152 *tp, struct sockaddr *sa) return -ENODEV; } } else { - /* test for RTL8153-BND */ + /* test for RTL8153-BND and RTL8153-BD */ ocp_data = ocp_read_byte(tp, MCU_TYPE_USB, USB_MISC_1); - if ((ocp_data & BND_MASK) == 0) { + if ((ocp_data & BND_MASK) == 0 && (ocp_data & BD_MASK)) { netif_dbg(tp, probe, tp->netdev, "Invalid variant for MAC pass through\n"); return -ENODEV; From b5372fe5dc84235dbe04998efdede3c4daa866a9 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 18 Feb 2019 16:36:48 -0800 Subject: [PATCH 084/298] exec: load_script: Do not exec truncated interpreter path Commit 8099b047ecc4 ("exec: load_script: don't blindly truncate shebang string") was trying to protect against a confused exec of a truncated interpreter path. However, it was overeager and also refused to truncate arguments as well, which broke userspace, and it was reverted. This attempts the protection again, but allows arguments to remain truncated. In an effort to improve readability, helper functions and comments have been added. Co-developed-by: Linus Torvalds Signed-off-by: Kees Cook Cc: Andrew Morton Cc: Oleg Nesterov Cc: Samuel Dionne-Riel Cc: Richard Weinberger Cc: Graham Christensen Cc: Michal Hocko Signed-off-by: Linus Torvalds --- fs/binfmt_script.c | 57 ++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 48 insertions(+), 9 deletions(-) diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c index 7cde3f46ad26..e996174cbfc0 100644 --- a/fs/binfmt_script.c +++ b/fs/binfmt_script.c @@ -14,13 +14,30 @@ #include #include +static inline bool spacetab(char c) { return c == ' ' || c == '\t'; } +static inline char *next_non_spacetab(char *first, const char *last) +{ + for (; first <= last; first++) + if (!spacetab(*first)) + return first; + return NULL; +} +static inline char *next_terminator(char *first, const char *last) +{ + for (; first <= last; first++) + if (spacetab(*first) || !*first) + return first; + return NULL; +} + static int load_script(struct linux_binprm *bprm) { const char *i_arg, *i_name; - char *cp; + char *cp, *buf_end; struct file *file; int retval; + /* Not ours to exec if we don't start with "#!". */ if ((bprm->buf[0] != '#') || (bprm->buf[1] != '!')) return -ENOEXEC; @@ -33,18 +50,40 @@ static int load_script(struct linux_binprm *bprm) if (bprm->interp_flags & BINPRM_FLAGS_PATH_INACCESSIBLE) return -ENOENT; - /* - * This section does the #! interpretation. - * Sorta complicated, but hopefully it will work. -TYT - */ - + /* Release since we are not mapping a binary into memory. */ allow_write_access(bprm->file); fput(bprm->file); bprm->file = NULL; - bprm->buf[BINPRM_BUF_SIZE - 1] = '\0'; - if ((cp = strchr(bprm->buf, '\n')) == NULL) - cp = bprm->buf+BINPRM_BUF_SIZE-1; + /* + * This section handles parsing the #! line into separate + * interpreter path and argument strings. We must be careful + * because bprm->buf is not yet guaranteed to be NUL-terminated + * (though the buffer will have trailing NUL padding when the + * file size was smaller than the buffer size). + * + * We do not want to exec a truncated interpreter path, so either + * we find a newline (which indicates nothing is truncated), or + * we find a space/tab/NUL after the interpreter path (which + * itself may be preceded by spaces/tabs). Truncating the + * arguments is fine: the interpreter can re-read the script to + * parse them on its own. + */ + buf_end = bprm->buf + sizeof(bprm->buf) - 1; + cp = strnchr(bprm->buf, sizeof(bprm->buf), '\n'); + if (!cp) { + cp = next_non_spacetab(bprm->buf + 2, buf_end); + if (!cp) + return -ENOEXEC; /* Entire buf is spaces/tabs */ + /* + * If there is no later space/tab/NUL we must assume the + * interpreter path is truncated. + */ + if (!next_terminator(cp, buf_end)) + return -ENOEXEC; + cp = buf_end; + } + /* NUL-terminate the buffer and any trailing spaces/tabs. */ *cp = '\0'; while (cp > bprm->buf) { cp--; From 8f5b27347e88b171c755562f0090ce40e514fc00 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Mon, 18 Feb 2019 16:58:01 +1100 Subject: [PATCH 085/298] powerpc/powernv/sriov: Register IOMMU groups for VFs The compound IOMMU group rework moved iommu_register_group() together in pnv_pci_ioda_setup_iommu_api() (which is a part of ppc_md.pcibios_fixup). As the result, pnv_ioda_setup_bus_iommu_group() does not create groups any more, it only adds devices to groups. This works fine for boot time devices. However IOMMU groups for SRIOV's VFs were added by pnv_ioda_setup_bus_iommu_group() so this got broken: pnv_tce_iommu_bus_notifier() expects a group to be registered for VF and it is not. This adds missing group registration and adds a NULL pointer check into the bus notifier so we won't crash if there is no group, although it is not expected to happen now because of the change above. Example oops seen prior to this patch: $ echo 1 > /sys/bus/pci/devices/0000\:01\:00.0/sriov_numvfs Unable to handle kernel paging request for data at address 0x00000030 Faulting instruction address: 0xc0000000004a6018 Oops: Kernel access of bad area, sig: 11 [#1] LE SMP NR_CPUS=2048 NUMA PowerNV CPU: 46 PID: 7006 Comm: bash Not tainted 4.15-ish NIP: c0000000004a6018 LR: c0000000004a6014 CTR: 0000000000000000 REGS: c000008fc876b400 TRAP: 0300 Not tainted (4.15-ish) MSR: 900000000280b033 CFAR: c000000000d0be20 DAR: 0000000000000030 DSISR: 40000000 SOFTE: 1 ... NIP sysfs_do_create_link_sd.isra.0+0x68/0x150 LR sysfs_do_create_link_sd.isra.0+0x64/0x150 Call Trace: pci_dev_type+0x0/0x30 (unreliable) iommu_group_add_device+0x8c/0x600 iommu_add_device+0xe8/0x180 pnv_tce_iommu_bus_notifier+0xb0/0xf0 notifier_call_chain+0x9c/0x110 blocking_notifier_call_chain+0x64/0xa0 device_add+0x524/0x7d0 pci_device_add+0x248/0x450 pci_iov_add_virtfn+0x294/0x3e0 pci_enable_sriov+0x43c/0x580 mlx5_core_sriov_configure+0x15c/0x2f0 [mlx5_core] sriov_numvfs_store+0x180/0x240 dev_attr_store+0x3c/0x60 sysfs_kf_write+0x64/0x90 kernfs_fop_write+0x1ac/0x240 __vfs_write+0x3c/0x70 vfs_write+0xd8/0x220 SyS_write+0x6c/0x110 system_call+0x58/0x6c Fixes: 0bd971676e68 ("powerpc/powernv/npu: Add compound IOMMU groups") Signed-off-by: Alexey Kardashevskiy Reported-by: Santwana Samantray Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/pci-ioda.c | 2 ++ arch/powerpc/platforms/powernv/pci.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 7db3119f8a5b..145373f0e5dc 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1593,6 +1593,8 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs) pnv_pci_ioda2_setup_dma_pe(phb, pe); #ifdef CONFIG_IOMMU_API + iommu_register_group(&pe->table_group, + pe->phb->hose->global_number, pe->pe_number); pnv_ioda_setup_bus_iommu_group(pe, &pe->table_group, NULL); #endif } diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index 45fb70b4bfa7..ef9448a907c6 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -1147,6 +1147,8 @@ static int pnv_tce_iommu_bus_notifier(struct notifier_block *nb, return 0; pe = &phb->ioda.pe_array[pdn->pe_number]; + if (!pe->table_group.group) + return 0; iommu_add_device(&pe->table_group, dev); return 0; case BUS_NOTIFY_DEL_DEVICE: From 9addc92730df55e2c05e8d3f69267a89d65bcba8 Mon Sep 17 00:00:00 2001 From: Michal Kalderon Date: Mon, 18 Feb 2019 15:24:02 +0200 Subject: [PATCH 086/298] qed: Fix iWARP buffer size provided for syn packet processing. The assumption that the maximum size of a syn packet is 128 bytes is wrong. Tunneling headers were not accounted for. Allocate buffers large enough for mtu. Signed-off-by: Ariel Elior Signed-off-by: Michal Kalderon Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qed/qed_iwarp.c | 12 ++++++------ drivers/net/ethernet/qlogic/qed/qed_iwarp.h | 1 - 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/qlogic/qed/qed_iwarp.c b/drivers/net/ethernet/qlogic/qed/qed_iwarp.c index beb8e5d6401a..e84fb01b91fd 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_iwarp.c +++ b/drivers/net/ethernet/qlogic/qed/qed_iwarp.c @@ -2605,7 +2605,7 @@ qed_iwarp_ll2_start(struct qed_hwfn *p_hwfn, struct qed_iwarp_info *iwarp_info; struct qed_ll2_acquire_data data; struct qed_ll2_cbs cbs; - u32 mpa_buff_size; + u32 buff_size; u16 n_ooo_bufs; int rc = 0; int i; @@ -2632,7 +2632,7 @@ qed_iwarp_ll2_start(struct qed_hwfn *p_hwfn, memset(&data, 0, sizeof(data)); data.input.conn_type = QED_LL2_TYPE_IWARP; - data.input.mtu = QED_IWARP_MAX_SYN_PKT_SIZE; + data.input.mtu = params->max_mtu; data.input.rx_num_desc = QED_IWARP_LL2_SYN_RX_SIZE; data.input.tx_num_desc = QED_IWARP_LL2_SYN_TX_SIZE; data.input.tx_max_bds_per_packet = 1; /* will never be fragmented */ @@ -2654,9 +2654,10 @@ qed_iwarp_ll2_start(struct qed_hwfn *p_hwfn, goto err; } + buff_size = QED_IWARP_MAX_BUF_SIZE(params->max_mtu); rc = qed_iwarp_ll2_alloc_buffers(p_hwfn, QED_IWARP_LL2_SYN_RX_SIZE, - QED_IWARP_MAX_SYN_PKT_SIZE, + buff_size, iwarp_info->ll2_syn_handle); if (rc) goto err; @@ -2710,10 +2711,9 @@ qed_iwarp_ll2_start(struct qed_hwfn *p_hwfn, if (rc) goto err; - mpa_buff_size = QED_IWARP_MAX_BUF_SIZE(params->max_mtu); rc = qed_iwarp_ll2_alloc_buffers(p_hwfn, data.input.rx_num_desc, - mpa_buff_size, + buff_size, iwarp_info->ll2_mpa_handle); if (rc) goto err; @@ -2726,7 +2726,7 @@ qed_iwarp_ll2_start(struct qed_hwfn *p_hwfn, iwarp_info->max_num_partial_fpdus = (u16)p_hwfn->p_rdma_info->num_qps; - iwarp_info->mpa_intermediate_buf = kzalloc(mpa_buff_size, GFP_KERNEL); + iwarp_info->mpa_intermediate_buf = kzalloc(buff_size, GFP_KERNEL); if (!iwarp_info->mpa_intermediate_buf) goto err; diff --git a/drivers/net/ethernet/qlogic/qed/qed_iwarp.h b/drivers/net/ethernet/qlogic/qed/qed_iwarp.h index b8f612d00241..7ac959038324 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_iwarp.h +++ b/drivers/net/ethernet/qlogic/qed/qed_iwarp.h @@ -46,7 +46,6 @@ enum qed_iwarp_qp_state qed_roce2iwarp_state(enum qed_roce_qp_state state); #define QED_IWARP_LL2_SYN_TX_SIZE (128) #define QED_IWARP_LL2_SYN_RX_SIZE (256) -#define QED_IWARP_MAX_SYN_PKT_SIZE (128) #define QED_IWARP_LL2_OOO_DEF_TX_SIZE (256) #define QED_IWARP_MAX_OOO (16) From 8be3dadf04050c2907760ec1955ca1c8fbc25585 Mon Sep 17 00:00:00 2001 From: Michal Kalderon Date: Mon, 18 Feb 2019 15:24:03 +0200 Subject: [PATCH 087/298] qed: Fix iWARP syn packet mac address validation. The ll2 forwards all syn packets to the driver without validating the mac address. Add validation check in the driver's iWARP listener flow and drop the packet if it isn't intended for the device. Signed-off-by: Ariel Elior Signed-off-by: Michal Kalderon Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qed/qed_iwarp.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/net/ethernet/qlogic/qed/qed_iwarp.c b/drivers/net/ethernet/qlogic/qed/qed_iwarp.c index e84fb01b91fd..ded556b7bab5 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_iwarp.c +++ b/drivers/net/ethernet/qlogic/qed/qed_iwarp.c @@ -1688,6 +1688,15 @@ qed_iwarp_parse_rx_pkt(struct qed_hwfn *p_hwfn, eth_hlen = ETH_HLEN + (vlan_valid ? sizeof(u32) : 0); + if (!ether_addr_equal(ethh->h_dest, + p_hwfn->p_rdma_info->iwarp.mac_addr)) { + DP_VERBOSE(p_hwfn, + QED_MSG_RDMA, + "Got unexpected mac %pM instead of %pM\n", + ethh->h_dest, p_hwfn->p_rdma_info->iwarp.mac_addr); + return -EINVAL; + } + ether_addr_copy(remote_mac_addr, ethh->h_source); ether_addr_copy(local_mac_addr, ethh->h_dest); From 8a7493e58ad688eb23b81e45461c5d314f4402f1 Mon Sep 17 00:00:00 2001 From: Jose Abreu Date: Mon, 18 Feb 2019 14:35:03 +0100 Subject: [PATCH 088/298] net: stmmac: Fix a race in EEE enable callback We are saving the status of EEE even before we try to enable it. This leads to a race with XMIT function that tries to arm EEE timer before we set it up. Fix this by only saving the EEE parameters after all operations are performed with success. Signed-off-by: Jose Abreu Fixes: d765955d2ae0 ("stmmac: add the Energy Efficient Ethernet support") Cc: Joao Pinto Cc: David S. Miller Cc: Giuseppe Cavallaro Cc: Alexandre Torgue Signed-off-by: David S. Miller --- .../ethernet/stmicro/stmmac/stmmac_ethtool.c | 22 ++++++++++--------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c index 5d85742a2be0..3c749c327cbd 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c @@ -696,25 +696,27 @@ static int stmmac_ethtool_op_set_eee(struct net_device *dev, struct ethtool_eee *edata) { struct stmmac_priv *priv = netdev_priv(dev); + int ret; - priv->eee_enabled = edata->eee_enabled; - - if (!priv->eee_enabled) + if (!edata->eee_enabled) { stmmac_disable_eee_mode(priv); - else { + } else { /* We are asking for enabling the EEE but it is safe * to verify all by invoking the eee_init function. * In case of failure it will return an error. */ - priv->eee_enabled = stmmac_eee_init(priv); - if (!priv->eee_enabled) + edata->eee_enabled = stmmac_eee_init(priv); + if (!edata->eee_enabled) return -EOPNOTSUPP; - - /* Do not change tx_lpi_timer in case of failure */ - priv->tx_lpi_timer = edata->tx_lpi_timer; } - return phy_ethtool_set_eee(dev->phydev, edata); + ret = phy_ethtool_set_eee(dev->phydev, edata); + if (ret) + return ret; + + priv->eee_enabled = edata->eee_enabled; + priv->tx_lpi_timer = edata->tx_lpi_timer; + return 0; } static u32 stmmac_usec2riwt(u32 usec, struct stmmac_priv *priv) From 4d96e13ee9cd1f7f801e8c7f4b12f09d1da4a5d8 Mon Sep 17 00:00:00 2001 From: Salil Mehta Date: Mon, 18 Feb 2019 17:40:32 +0000 Subject: [PATCH 089/298] net: hns: Fixes the missing put_device in positive leg for roce reset This patch fixes the missing device reference release-after-use in the positive leg of the roce reset API of the HNS DSAF. Fixes: c969c6e7ab8c ("net: hns: Fix object reference leaks in hns_dsaf_roce_reset()") Reported-by: John Garry Signed-off-by: Salil Mehta Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/hns/hns_dsaf_main.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_main.c b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_main.c index b8155f5e71b4..ac55db065f16 100644 --- a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_main.c +++ b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_main.c @@ -3128,6 +3128,9 @@ int hns_dsaf_roce_reset(struct fwnode_handle *dsaf_fwnode, bool dereset) dsaf_set_bit(credit, DSAF_SBM_ROCEE_CFG_CRD_EN_B, 1); dsaf_write_dev(dsaf_dev, DSAF_SBM_ROCEE_CFG_REG_REG, credit); } + + put_device(&pdev->dev); + return 0; } EXPORT_SYMBOL(hns_dsaf_roce_reset); From 1f43f400a2cbb02f3d34de8fe30075c070254816 Mon Sep 17 00:00:00 2001 From: Murali Karicheri Date: Mon, 18 Feb 2019 15:10:51 -0500 Subject: [PATCH 090/298] net: netcp: Fix ethss driver probe issue Recent commit below has introduced a bug in netcp driver that causes the ethss driver probe failure and thus break the networking function on K2 SoCs such as K2HK, K2L, K2E etc. This patch fixes the issue to restore networking on the above SoCs. Fixes: 21c328dcecfc ("net: ethernet: Convert to using %pOFn instead of device_node.name") Signed-off-by: Murali Karicheri Signed-off-by: David S. Miller --- drivers/net/ethernet/ti/netcp_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/ti/netcp_core.c b/drivers/net/ethernet/ti/netcp_core.c index 1f612268c998..d847f672a705 100644 --- a/drivers/net/ethernet/ti/netcp_core.c +++ b/drivers/net/ethernet/ti/netcp_core.c @@ -259,7 +259,7 @@ static int netcp_module_probe(struct netcp_device *netcp_device, const char *name; char node_name[32]; - if (of_property_read_string(node, "label", &name) < 0) { + if (of_property_read_string(child, "label", &name) < 0) { snprintf(node_name, sizeof(node_name), "%pOFn", child); name = node_name; } From 8cbd468bdeb5ed3acac2d7a9f7494d5b77e46297 Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Sat, 16 Feb 2019 11:31:48 -0500 Subject: [PATCH 091/298] cpufreq: scmi: Fix use-after-free in scmi_cpufreq_exit() This issue was detected with the help of Coccinelle. So change the order of function calls to fix it. Fixes: 1690d8bb91e37 (cpufreq: scpi/scmi: Fix freeing of dynamic OPPs) Signed-off-by: Yangtao Li Acked-by: Viresh Kumar Acked-by: Sudeep Holla Cc: 4.20+ # 4.20+ Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/scmi-cpufreq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cpufreq/scmi-cpufreq.c b/drivers/cpufreq/scmi-cpufreq.c index 242c3370544e..9ed46d188cb5 100644 --- a/drivers/cpufreq/scmi-cpufreq.c +++ b/drivers/cpufreq/scmi-cpufreq.c @@ -187,8 +187,8 @@ static int scmi_cpufreq_exit(struct cpufreq_policy *policy) cpufreq_cooling_unregister(priv->cdev); dev_pm_opp_free_cpufreq_table(priv->cpu_dev, &policy->freq_table); - kfree(priv); dev_pm_opp_remove_all_dynamic(priv->cpu_dev); + kfree(priv); return 0; } From 6fc979179c98d2591784937d5618edc3e5cd31c1 Mon Sep 17 00:00:00 2001 From: Thomas Petazzoni Date: Fri, 15 Feb 2019 16:30:42 +0100 Subject: [PATCH 092/298] ARM: dts: armada-xp: fix Armada XP boards NAND description Commit 3b79919946cd2cf4dac47842afc9a893acec4ed7 ("ARM: dts: armada-370-xp: update NAND node with new bindings") updated some Marvell Armada DT description to use the new NAND controller bindings, but did it incorrectly for a number of boards: armada-xp-gp, armada-xp-db and armada-xp-lenovo-ix4-300d. Due to this, the NAND is no longer detected on those platforms. This commit fixes that by properly using the new NAND DT binding. This commit was runtime-tested on Armada XP GP, the two other platforms are only compile-tested. Fixes: 3b79919946cd2 ("ARM: dts: armada-370-xp: update NAND node with new bindings") Cc: Miquel Raynal Signed-off-by: Thomas Petazzoni Signed-off-by: Gregory CLEMENT --- arch/arm/boot/dts/armada-xp-db.dts | 42 ++++++----- arch/arm/boot/dts/armada-xp-gp.dts | 13 ++-- .../boot/dts/armada-xp-lenovo-ix4-300d.dts | 73 ++++++++++--------- 3 files changed, 68 insertions(+), 60 deletions(-) diff --git a/arch/arm/boot/dts/armada-xp-db.dts b/arch/arm/boot/dts/armada-xp-db.dts index f3ac7483afed..5d04dc68cf57 100644 --- a/arch/arm/boot/dts/armada-xp-db.dts +++ b/arch/arm/boot/dts/armada-xp-db.dts @@ -144,30 +144,32 @@ usb@52000 { status = "okay"; }; - nand@d0000 { + nand-controller@d0000 { status = "okay"; - label = "pxa3xx_nand-0"; - num-cs = <1>; - marvell,nand-keep-config; - nand-on-flash-bbt; - partitions { - compatible = "fixed-partitions"; - #address-cells = <1>; - #size-cells = <1>; + nand@0 { + reg = <0>; + label = "pxa3xx_nand-0"; + nand-rb = <0>; + nand-on-flash-bbt; - partition@0 { - label = "U-Boot"; - reg = <0 0x800000>; - }; - partition@800000 { - label = "Linux"; - reg = <0x800000 0x800000>; - }; - partition@1000000 { - label = "Filesystem"; - reg = <0x1000000 0x3f000000>; + partitions { + compatible = "fixed-partitions"; + #address-cells = <1>; + #size-cells = <1>; + partition@0 { + label = "U-Boot"; + reg = <0 0x800000>; + }; + partition@800000 { + label = "Linux"; + reg = <0x800000 0x800000>; + }; + partition@1000000 { + label = "Filesystem"; + reg = <0x1000000 0x3f000000>; + }; }; }; }; diff --git a/arch/arm/boot/dts/armada-xp-gp.dts b/arch/arm/boot/dts/armada-xp-gp.dts index 1139e9469a83..b4cca507cf13 100644 --- a/arch/arm/boot/dts/armada-xp-gp.dts +++ b/arch/arm/boot/dts/armada-xp-gp.dts @@ -160,12 +160,15 @@ bm@c0000 { status = "okay"; }; - nand@d0000 { + nand-controller@d0000 { status = "okay"; - label = "pxa3xx_nand-0"; - num-cs = <1>; - marvell,nand-keep-config; - nand-on-flash-bbt; + + nand@0 { + reg = <0>; + label = "pxa3xx_nand-0"; + nand-rb = <0>; + nand-on-flash-bbt; + }; }; }; diff --git a/arch/arm/boot/dts/armada-xp-lenovo-ix4-300d.dts b/arch/arm/boot/dts/armada-xp-lenovo-ix4-300d.dts index bbbb38888bb8..87dcb502f72d 100644 --- a/arch/arm/boot/dts/armada-xp-lenovo-ix4-300d.dts +++ b/arch/arm/boot/dts/armada-xp-lenovo-ix4-300d.dts @@ -81,49 +81,52 @@ pcf8563@51 { }; - nand@d0000 { + nand-controller@d0000 { status = "okay"; - label = "pxa3xx_nand-0"; - num-cs = <1>; - marvell,nand-keep-config; - nand-on-flash-bbt; - partitions { - compatible = "fixed-partitions"; - #address-cells = <1>; - #size-cells = <1>; + nand@0 { + reg = <0>; + label = "pxa3xx_nand-0"; + nand-rb = <0>; + nand-on-flash-bbt; - partition@0 { - label = "u-boot"; - reg = <0x00000000 0x000e0000>; - read-only; - }; + partitions { + compatible = "fixed-partitions"; + #address-cells = <1>; + #size-cells = <1>; - partition@e0000 { - label = "u-boot-env"; - reg = <0x000e0000 0x00020000>; - read-only; - }; + partition@0 { + label = "u-boot"; + reg = <0x00000000 0x000e0000>; + read-only; + }; - partition@100000 { - label = "u-boot-env2"; - reg = <0x00100000 0x00020000>; - read-only; - }; + partition@e0000 { + label = "u-boot-env"; + reg = <0x000e0000 0x00020000>; + read-only; + }; - partition@120000 { - label = "zImage"; - reg = <0x00120000 0x00400000>; - }; + partition@100000 { + label = "u-boot-env2"; + reg = <0x00100000 0x00020000>; + read-only; + }; - partition@520000 { - label = "initrd"; - reg = <0x00520000 0x00400000>; - }; + partition@120000 { + label = "zImage"; + reg = <0x00120000 0x00400000>; + }; - partition@e00000 { - label = "boot"; - reg = <0x00e00000 0x3f200000>; + partition@520000 { + label = "initrd"; + reg = <0x00520000 0x00400000>; + }; + + partition@e00000 { + label = "boot"; + reg = <0x00e00000 0x3f200000>; + }; }; }; }; From bdd22a41d55bb0068c8685e28839ed9492e96aba Mon Sep 17 00:00:00 2001 From: Baruch Siach Date: Sun, 17 Feb 2019 20:21:40 +0200 Subject: [PATCH 093/298] arm64: dts: clearfog-gt-8k: fix SGMII PHY reset signal The PHY reset signal goes to mpp43 on CP0. Fixes: babc5544c293 ("arm64: dts: clearfog-gt-8k: 1G eth PHY reset signal") Reported-by: Denis Odintsov Signed-off-by: Baruch Siach Signed-off-by: Gregory CLEMENT --- arch/arm64/boot/dts/marvell/armada-8040-clearfog-gt-8k.dts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/marvell/armada-8040-clearfog-gt-8k.dts b/arch/arm64/boot/dts/marvell/armada-8040-clearfog-gt-8k.dts index 5b4a9609e31f..2468762283a5 100644 --- a/arch/arm64/boot/dts/marvell/armada-8040-clearfog-gt-8k.dts +++ b/arch/arm64/boot/dts/marvell/armada-8040-clearfog-gt-8k.dts @@ -351,7 +351,7 @@ ge_phy: ethernet-phy@0 { reg = <0>; pinctrl-names = "default"; pinctrl-0 = <&cp0_copper_eth_phy_reset>; - reset-gpios = <&cp1_gpio1 11 GPIO_ACTIVE_LOW>; + reset-gpios = <&cp0_gpio2 11 GPIO_ACTIVE_LOW>; reset-assert-us = <10000>; }; From 759c962d3c9bb1a60e3b4b780daa66ee6d4be13a Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Tue, 19 Feb 2019 08:46:32 -0800 Subject: [PATCH 094/298] ARM: dts: am335x-evmsk: Fix PHY mode for ethernet The PHY must add both tx and rx delay and not only on the tx clock. The board uses AR8031_AL1A PHY where the rx delay is enabled by default, the tx dealy is disabled. The reason why rgmii-txid worked because the rx delay was not disabled by the driver so essentially we ended up with rgmii-id PHY mode. Signed-off-by: Peter Ujfalusi Signed-off-by: Tony Lindgren --- arch/arm/boot/dts/am335x-evmsk.dts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm/boot/dts/am335x-evmsk.dts b/arch/arm/boot/dts/am335x-evmsk.dts index 172c0224e7f6..b128998097ce 100644 --- a/arch/arm/boot/dts/am335x-evmsk.dts +++ b/arch/arm/boot/dts/am335x-evmsk.dts @@ -651,13 +651,13 @@ ethphy1: ethernet-phy@1 { &cpsw_emac0 { phy-handle = <ðphy0>; - phy-mode = "rgmii-txid"; + phy-mode = "rgmii-id"; dual_emac_res_vlan = <1>; }; &cpsw_emac1 { phy-handle = <ðphy1>; - phy-mode = "rgmii-txid"; + phy-mode = "rgmii-id"; dual_emac_res_vlan = <2>; }; From 37685f6a63eeca2135d1f704e7638409a071b1f6 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Tue, 19 Feb 2019 08:46:33 -0800 Subject: [PATCH 095/298] ARM: dts: am335x-evm: Fix PHY mode for ethernet The PHY must add both tx and rx delay and not only on the tx clock. The board uses AR8031_AL1A PHY where the rx delay is enabled by default, the tx dealy is disabled. The reason why rgmii-txid worked because the rx delay was not disabled by the driver so essentially we ended up with rgmii-id PHY mode. Signed-off-by: Peter Ujfalusi Signed-off-by: Tony Lindgren --- arch/arm/boot/dts/am335x-evm.dts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/boot/dts/am335x-evm.dts b/arch/arm/boot/dts/am335x-evm.dts index b67f5fee1469..dce5be5df97b 100644 --- a/arch/arm/boot/dts/am335x-evm.dts +++ b/arch/arm/boot/dts/am335x-evm.dts @@ -729,7 +729,7 @@ ethphy0: ethernet-phy@0 { &cpsw_emac0 { phy-handle = <ðphy0>; - phy-mode = "rgmii-txid"; + phy-mode = "rgmii-id"; }; &tscadc { From 450d007d199e632a1a4c4b91302deacd7d56815f Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 14 Feb 2019 23:46:19 +0100 Subject: [PATCH 096/298] gpu: drm: radeon: Set DPM_FLAG_NEVER_SKIP when enabling PM-runtime MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On HP ProBook 4540s, if PM-runtime is enabled in the radeon driver and the direct-complete optimization is used for the radeon device during system-wide suspend, the system doesn't resume. Preventing direct-complete from being used with the radeon device by setting the DPM_FLAG_NEVER_SKIP driver flag for it makes the problem go away, which indicates that direct-complete is not safe for the radeon driver in general and should not be used with it (at least for now). This fixes a regression introduced by commit c62ec4610c40 ("PM / core: Fix direct_complete handling for devices with no callbacks") which allowed direct-complete to be applied to devices without PM callbacks (again) which in turn unlocked direct-complete for radeon on HP ProBook 4540s. Fixes: c62ec4610c40 ("PM / core: Fix direct_complete handling for devices with no callbacks") Link: https://bugzilla.kernel.org/show_bug.cgi?id=201519 Reported-by: Ярослав Семченко Tested-by: Ярослав Семченко Signed-off-by: Rafael J. Wysocki Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/radeon/radeon_kms.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/radeon/radeon_kms.c b/drivers/gpu/drm/radeon/radeon_kms.c index dec1e081f529..6a8fb6fd183c 100644 --- a/drivers/gpu/drm/radeon/radeon_kms.c +++ b/drivers/gpu/drm/radeon/radeon_kms.c @@ -172,6 +172,7 @@ int radeon_driver_load_kms(struct drm_device *dev, unsigned long flags) } if (radeon_is_px(dev)) { + dev_pm_set_driver_flags(dev->dev, DPM_FLAG_NEVER_SKIP); pm_runtime_use_autosuspend(dev->dev); pm_runtime_set_autosuspend_delay(dev->dev, 5000); pm_runtime_set_active(dev->dev); From d33158530660bc89be3cc870a2152e4e9a76cac7 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Mon, 18 Feb 2019 17:11:38 -0500 Subject: [PATCH 097/298] drm/amdgpu: Set DPM_FLAG_NEVER_SKIP when enabling PM-runtime Based on a similar patch from Rafael for radeon. When using ATPX to control dGPU power, the state is not retained across suspend and resume cycles by default. This can probably be loosened for Hybrid Graphics (_PR3) laptops where I think the state is properly retained. Fixes: c62ec4610c40 ("PM / core: Fix direct_complete handling for devices with no callbacks") Cc: Rafael J. Wysocki Acked-by: Rafael J. Wysocki Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c index bc62bf41b7e9..5dc349173e4f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c @@ -212,6 +212,7 @@ int amdgpu_driver_load_kms(struct drm_device *dev, unsigned long flags) } if (amdgpu_device_is_px(dev)) { + dev_pm_set_driver_flags(dev->dev, DPM_FLAG_NEVER_SKIP); pm_runtime_use_autosuspend(dev->dev); pm_runtime_set_autosuspend_delay(dev->dev, 5000); pm_runtime_set_active(dev->dev); From 9db97d8aa8f8a518c421196a504dbfc942ef8d40 Mon Sep 17 00:00:00 2001 From: shaoyunl Date: Fri, 15 Feb 2019 11:05:04 -0500 Subject: [PATCH 098/298] drm/amdgpu: Update sdma golden setting for vega20 According to hardware engineer, WRITE_BURST_LENGTH [9:8] in register SDMA0_CHICKEN_BITS need to change to 3 for better performance Signed-off-by: shaoyunl Acked-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c index 6811a5d05b27..aa2f71cc1eba 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c @@ -128,7 +128,7 @@ static const struct soc15_reg_golden golden_settings_sdma0_4_2_init[] = { static const struct soc15_reg_golden golden_settings_sdma0_4_2[] = { - SOC15_REG_GOLDEN_VALUE(SDMA0, 0, mmSDMA0_CHICKEN_BITS, 0xfe931f07, 0x02831d07), + SOC15_REG_GOLDEN_VALUE(SDMA0, 0, mmSDMA0_CHICKEN_BITS, 0xfe931f07, 0x02831f07), SOC15_REG_GOLDEN_VALUE(SDMA0, 0, mmSDMA0_CLK_CTRL, 0xffffffff, 0x3f000100), SOC15_REG_GOLDEN_VALUE(SDMA0, 0, mmSDMA0_GB_ADDR_CONFIG, 0x0000773f, 0x00004002), SOC15_REG_GOLDEN_VALUE(SDMA0, 0, mmSDMA0_GB_ADDR_CONFIG_READ, 0x0000773f, 0x00004002), @@ -158,7 +158,7 @@ static const struct soc15_reg_golden golden_settings_sdma0_4_2[] = }; static const struct soc15_reg_golden golden_settings_sdma1_4_2[] = { - SOC15_REG_GOLDEN_VALUE(SDMA1, 0, mmSDMA1_CHICKEN_BITS, 0xfe931f07, 0x02831d07), + SOC15_REG_GOLDEN_VALUE(SDMA1, 0, mmSDMA1_CHICKEN_BITS, 0xfe931f07, 0x02831f07), SOC15_REG_GOLDEN_VALUE(SDMA1, 0, mmSDMA1_CLK_CTRL, 0xffffffff, 0x3f000100), SOC15_REG_GOLDEN_VALUE(SDMA1, 0, mmSDMA1_GB_ADDR_CONFIG, 0x0000773f, 0x00004002), SOC15_REG_GOLDEN_VALUE(SDMA1, 0, mmSDMA1_GB_ADDR_CONFIG_READ, 0x0000773f, 0x00004002), From d2f0b53bda3193874f3905bc839888f895d1c0cf Mon Sep 17 00:00:00 2001 From: "Leo (Hanghong) Ma" Date: Thu, 24 Jan 2019 15:07:52 -0500 Subject: [PATCH 099/298] drm/amd/display: Fix MST reboot/poweroff sequence [Why] drm_dp_mst_topology_mgr_suspend() is added into the new reboot sequence, which disables the UP request at the beginning. Therefore sideband messages are blocked. [How] Finish MST sideband message transaction before UP request is suppressed. Signed-off-by: Leo (Hanghong) Ma Reviewed-by: Roman Li Acked-by: Leo Li Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 0b392bfca284..5296b8f3e0ab 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -786,12 +786,13 @@ static int dm_suspend(void *handle) struct amdgpu_display_manager *dm = &adev->dm; int ret = 0; + WARN_ON(adev->dm.cached_state); + adev->dm.cached_state = drm_atomic_helper_suspend(adev->ddev); + s3_handle_mst(adev->ddev, true); amdgpu_dm_irq_suspend(adev); - WARN_ON(adev->dm.cached_state); - adev->dm.cached_state = drm_atomic_helper_suspend(adev->ddev); dc_set_power_state(dm->dc, DC_ACPI_CM_POWER_STATE_D3); From 8852ae9a82498207c15262b6294d14aea1796966 Mon Sep 17 00:00:00 2001 From: Roman Li Date: Mon, 28 Jan 2019 10:59:34 -0500 Subject: [PATCH 100/298] drm/amd/display: Raise dispclk value for dce11 [Why] The visual corruption due to low display clock value. Observed on Carrizo 4K@60Hz. [How] There was earlier patch for dce_update_clocks: Adding +15% workaround also to to dce11_update_clocks Signed-off-by: Roman Li Reviewed-by: Nicholas Kazlauskas Acked-by: Leo Li Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/dce/dce_clk_mgr.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/dce/dce_clk_mgr.c b/drivers/gpu/drm/amd/display/dc/dce/dce_clk_mgr.c index 19801bdba0d2..7a72ee46f14b 100644 --- a/drivers/gpu/drm/amd/display/dc/dce/dce_clk_mgr.c +++ b/drivers/gpu/drm/amd/display/dc/dce/dce_clk_mgr.c @@ -662,6 +662,11 @@ static void dce11_update_clocks(struct clk_mgr *clk_mgr, { struct dce_clk_mgr *clk_mgr_dce = TO_DCE_CLK_MGR(clk_mgr); struct dm_pp_power_level_change_request level_change_req; + int patched_disp_clk = context->bw.dce.dispclk_khz; + + /*TODO: W/A for dal3 linux, investigate why this works */ + if (!clk_mgr_dce->dfs_bypass_active) + patched_disp_clk = patched_disp_clk * 115 / 100; level_change_req.power_level = dce_get_required_clocks_state(clk_mgr, context); /* get max clock state from PPLIB */ @@ -671,9 +676,9 @@ static void dce11_update_clocks(struct clk_mgr *clk_mgr, clk_mgr_dce->cur_min_clks_state = level_change_req.power_level; } - if (should_set_clock(safe_to_lower, context->bw.dce.dispclk_khz, clk_mgr->clks.dispclk_khz)) { - context->bw.dce.dispclk_khz = dce_set_clock(clk_mgr, context->bw.dce.dispclk_khz); - clk_mgr->clks.dispclk_khz = context->bw.dce.dispclk_khz; + if (should_set_clock(safe_to_lower, patched_disp_clk, clk_mgr->clks.dispclk_khz)) { + context->bw.dce.dispclk_khz = dce_set_clock(clk_mgr, patched_disp_clk); + clk_mgr->clks.dispclk_khz = patched_disp_clk; } dce11_pplib_apply_display_requirements(clk_mgr->ctx->dc, context); } From 816db7663565cd23f74ed3d5c9240522e3fb0dda Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Tue, 19 Feb 2019 14:53:44 +0800 Subject: [PATCH 101/298] vhost: correctly check the return value of translate_desc() in log_used() When fail, translate_desc() returns negative value, otherwise the number of iovs. So we should fail when the return value is negative instead of a blindly check against zero. Detected by CoverityScan, CID# 1442593: Control flow issues (DEADCODE) Fixes: cc5e71075947 ("vhost: log dirty page correctly") Acked-by: Michael S. Tsirkin Reported-by: Stephen Hemminger Signed-off-by: Jason Wang Signed-off-by: David S. Miller --- drivers/vhost/vhost.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 24a129fcdd61..a2e5dc7716e2 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -1788,7 +1788,7 @@ static int log_used(struct vhost_virtqueue *vq, u64 used_offset, u64 len) ret = translate_desc(vq, (uintptr_t)vq->used + used_offset, len, iov, 64, VHOST_ACCESS_WO); - if (ret) + if (ret < 0) return ret; for (i = 0; i < ret; i++) { From 1765f5dcd00963e33f1b8a4e0f34061fbc0e2f7f Mon Sep 17 00:00:00 2001 From: Kai-Heng Feng Date: Tue, 19 Feb 2019 23:45:29 +0800 Subject: [PATCH 102/298] sky2: Increase D3 delay again Another platform requires even longer delay to make the device work correctly after S3. So increase the delay to 300ms. BugLink: https://bugs.launchpad.net/bugs/1798921 Signed-off-by: Kai-Heng Feng Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/sky2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/sky2.c b/drivers/net/ethernet/marvell/sky2.c index f3a5fa84860f..57727fe1501e 100644 --- a/drivers/net/ethernet/marvell/sky2.c +++ b/drivers/net/ethernet/marvell/sky2.c @@ -5073,7 +5073,7 @@ static int sky2_probe(struct pci_dev *pdev, const struct pci_device_id *ent) INIT_WORK(&hw->restart_work, sky2_restart); pci_set_drvdata(pdev, hw); - pdev->d3_delay = 200; + pdev->d3_delay = 300; return 0; From d179b88deb3bf6fed4991a31fd6f0f2cad21fab5 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Fri, 15 Feb 2019 12:30:19 +0000 Subject: [PATCH 103/298] drm/i915/fbdev: Actually configure untiled displays If we skipped all the connectors that were not part of a tile, we would leave conn_seq=0 and conn_configured=0, convincing ourselves that we had stagnated in our configuration attempts. Avoid this situation by starting conn_seq=ALL_CONNECTORS, and repeating until we find no more connectors to configure. Fixes: 754a76591b12 ("drm/i915/fbdev: Stop repeating tile configuration on stagnation") Reported-by: Maarten Lankhorst Signed-off-by: Chris Wilson Cc: Maarten Lankhorst Reviewed-by: Maarten Lankhorst Link: https://patchwork.freedesktop.org/patch/msgid/20190215123019.32283-1-chris@chris-wilson.co.uk Cc: # v3.19+ (cherry picked from commit d9b308b1f8a1acc0c3279f443d4fe0f9f663252e) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/intel_fbdev.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_fbdev.c b/drivers/gpu/drm/i915/intel_fbdev.c index 7f365ac0b549..4ee16b264dbe 100644 --- a/drivers/gpu/drm/i915/intel_fbdev.c +++ b/drivers/gpu/drm/i915/intel_fbdev.c @@ -336,8 +336,8 @@ static bool intel_fb_initial_config(struct drm_fb_helper *fb_helper, bool *enabled, int width, int height) { struct drm_i915_private *dev_priv = to_i915(fb_helper->dev); - unsigned long conn_configured, conn_seq, mask; unsigned int count = min(fb_helper->connector_count, BITS_PER_LONG); + unsigned long conn_configured, conn_seq; int i, j; bool *save_enabled; bool fallback = true, ret = true; @@ -355,10 +355,9 @@ static bool intel_fb_initial_config(struct drm_fb_helper *fb_helper, drm_modeset_backoff(&ctx); memcpy(save_enabled, enabled, count); - mask = GENMASK(count - 1, 0); + conn_seq = GENMASK(count - 1, 0); conn_configured = 0; retry: - conn_seq = conn_configured; for (i = 0; i < count; i++) { struct drm_fb_helper_connector *fb_conn; struct drm_connector *connector; @@ -371,7 +370,8 @@ static bool intel_fb_initial_config(struct drm_fb_helper *fb_helper, if (conn_configured & BIT(i)) continue; - if (conn_seq == 0 && !connector->has_tile) + /* First pass, only consider tiled connectors */ + if (conn_seq == GENMASK(count - 1, 0) && !connector->has_tile) continue; if (connector->status == connector_status_connected) @@ -475,8 +475,10 @@ static bool intel_fb_initial_config(struct drm_fb_helper *fb_helper, conn_configured |= BIT(i); } - if ((conn_configured & mask) != mask && conn_configured != conn_seq) + if (conn_configured != conn_seq) { /* repeat until no more are found */ + conn_seq = conn_configured; goto retry; + } /* * If the BIOS didn't enable everything it could, fall back to have the From 74698f6971f25d045301139413578865fc2bd8f9 Mon Sep 17 00:00:00 2001 From: Vladimir Murzin Date: Wed, 20 Feb 2019 11:43:05 +0000 Subject: [PATCH 104/298] arm64: Relax GIC version check during early boot Updates to the GIC architecture allow ID_AA64PFR0_EL1.GIC to have values other than 0 or 1. At the moment, Linux is quite strict in the way it handles this field at early boot stage (cpufeature is fine) and will refuse to use the system register CPU interface if it doesn't find the value 1. Fixes: 021f653791ad17e03f98aaa7fb933816ae16f161 ("irqchip: gic-v3: Initial support for GICv3") Reported-by: Chase Conklin Reviewed-by: Marc Zyngier Signed-off-by: Vladimir Murzin Signed-off-by: Will Deacon --- arch/arm64/kernel/head.S | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 15d79a8e5e5e..eecf7927dab0 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -539,8 +539,7 @@ set_hcr: /* GICv3 system register access */ mrs x0, id_aa64pfr0_el1 ubfx x0, x0, #24, #4 - cmp x0, #1 - b.ne 3f + cbz x0, 3f mrs_s x0, SYS_ICC_SRE_EL2 orr x0, x0, #ICC_SRE_EL2_SRE // Set ICC_SRE_EL2.SRE==1 From 94d9b9337d09bdd27735005b3251d97ab29f7273 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Mon, 11 Feb 2019 12:09:19 +0100 Subject: [PATCH 105/298] ARM: tegra: Restore DT ABI on Tegra124 Chromebooks Commit 482997699ef0 ("ARM: tegra: Fix unit_address_vs_reg DTC warnings for /memory") inadventently broke device tree ABI by adding a unit- address to the "/memory" node because the device tree compiler flagged the missing unit-address as a warning. Tegra124 Chromebooks (a.k.a. Nyan) use a bootloader that relies on the full name of the memory node in device tree being exactly "/memory". It can be argued whether this was a good decision or not, and some other bootloaders (such as U-Boot) do accept a unit-address in the name of the node, but the device tree is an ABI and we can't break existing setups just because the device tree compiler considers it bad practice to omit the unit-address nowadays. This partially reverts the offending commit and restores device tree ABI compatibility. Fixes: 482997699ef0 ("ARM: tegra: Fix unit_address_vs_reg DTC warnings for /memory") Reported-by: Tristan Bastian Signed-off-by: Thierry Reding Tested-by: Tristan Bastian Signed-off-by: Arnd Bergmann --- arch/arm/boot/dts/tegra124-nyan.dtsi | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/arch/arm/boot/dts/tegra124-nyan.dtsi b/arch/arm/boot/dts/tegra124-nyan.dtsi index d5f11d6d987e..bc85b6a166c7 100644 --- a/arch/arm/boot/dts/tegra124-nyan.dtsi +++ b/arch/arm/boot/dts/tegra124-nyan.dtsi @@ -13,10 +13,25 @@ chosen { stdout-path = "serial0:115200n8"; }; - memory@80000000 { + /* + * Note that recent version of the device tree compiler (starting with + * version 1.4.2) warn about this node containing a reg property, but + * missing a unit-address. However, the bootloader on these Chromebook + * devices relies on the full name of this node to be exactly /memory. + * Adding the unit-address causes the bootloader to create a /memory + * node and write the memory bank configuration to that node, which in + * turn leads the kernel to believe that the device has 2 GiB of + * memory instead of the amount detected by the bootloader. + * + * The name of this node is effectively ABI and must not be changed. + */ + memory { + device_type = "memory"; reg = <0x0 0x80000000 0x0 0x80000000>; }; + /delete-node/ memory@80000000; + host1x@50000000 { hdmi@54280000 { status = "okay"; From 9c2054a5cf415a9dc32c91ffde78399955deb571 Mon Sep 17 00:00:00 2001 From: Russell King Date: Wed, 20 Feb 2019 10:32:52 +0000 Subject: [PATCH 106/298] net: dsa: fix unintended change of bridge interface STP state When a DSA port is added to a bridge and brought up, the resulting STP state programmed into the hardware depends on the order that these operations are performed. However, the Linux bridge code believes that the port is in disabled mode. If the DSA port is first added to a bridge and then brought up, it will be in blocking mode. If it is brought up and then added to the bridge, it will be in disabled mode. This difference is caused by DSA always setting the STP mode in dsa_port_enable() whether or not this port is part of a bridge. Since bridge always sets the STP state when the port is added, brought up or taken down, it is unnecessary for us to manipulate the STP state. Apparently, this code was copied from Rocker, and the very next day a similar fix for Rocker was merged but was not propagated to DSA. See e47172ab7e41 ("rocker: put port in FORWADING state after leaving bridge") Fixes: b73adef67765 ("net: dsa: integrate with SWITCHDEV for HW bridging") Signed-off-by: Russell King Reviewed-by: Vivien Didelot Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/port.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/net/dsa/port.c b/net/dsa/port.c index 2d7e01b23572..2a2a878b5ce3 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -69,7 +69,6 @@ static void dsa_port_set_state_now(struct dsa_port *dp, u8 state) int dsa_port_enable(struct dsa_port *dp, struct phy_device *phy) { - u8 stp_state = dp->bridge_dev ? BR_STATE_BLOCKING : BR_STATE_FORWARDING; struct dsa_switch *ds = dp->ds; int port = dp->index; int err; @@ -80,7 +79,8 @@ int dsa_port_enable(struct dsa_port *dp, struct phy_device *phy) return err; } - dsa_port_set_state_now(dp, stp_state); + if (!dp->bridge_dev) + dsa_port_set_state_now(dp, BR_STATE_FORWARDING); return 0; } @@ -90,7 +90,8 @@ void dsa_port_disable(struct dsa_port *dp, struct phy_device *phy) struct dsa_switch *ds = dp->ds; int port = dp->index; - dsa_port_set_state_now(dp, BR_STATE_DISABLED); + if (!dp->bridge_dev) + dsa_port_set_state_now(dp, BR_STATE_DISABLED); if (ds->ops->port_disable) ds->ops->port_disable(ds, port, phy); From 1b328a2e095a009518ebac05e937cc0fc242fede Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Tue, 19 Feb 2019 17:51:14 +0100 Subject: [PATCH 107/298] clk: at91: fix at91sam9x5 peripheral clock number nck() looks at the last id in an array and unfortunately, at91sam9x35_periphck has a sentinel, hence the id is 0 and the calculated number of peripheral clocks is 1 instead of a maximum of 31. Fixes: 1eabdc2f9dd8 ("clk: at91: add at91sam9x5 PMCs driver") Signed-off-by: Alexandre Belloni Acked-by: Nicolas Ferre Cc: # v4.20+ Signed-off-by: Stephen Boyd --- drivers/clk/at91/at91sam9x5.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/clk/at91/at91sam9x5.c b/drivers/clk/at91/at91sam9x5.c index 2fe225a697df..d37e7ed9eb90 100644 --- a/drivers/clk/at91/at91sam9x5.c +++ b/drivers/clk/at91/at91sam9x5.c @@ -144,8 +144,7 @@ static void __init at91sam9x5_pmc_setup(struct device_node *np, return; at91sam9x5_pmc = pmc_data_allocate(PMC_MAIN + 1, - nck(at91sam9x5_systemck), - nck(at91sam9x35_periphck), 0); + nck(at91sam9x5_systemck), 31, 0); if (!at91sam9x5_pmc) return; From 65a91e2e597dea62a798a8b771edc44859037e7f Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Fri, 8 Feb 2019 15:40:59 +0100 Subject: [PATCH 108/298] clk: at91: fix masterck name The master clock is actually named masterck earlier in the driver. Having "mck" in the parent list means that it can never be selected. Fixes: 1eabdc2f9dd8 ("clk: at91: add at91sam9x5 PMCs driver") Fixes: a2038077de9a ("clk: at91: add sama5d2 PMC driver") Fixes: 084b696bb509 ("clk: at91: add sama5d4 pmc driver") Signed-off-by: Alexandre Belloni Acked-by: Nicolas Ferre Cc: # v4.20+ Signed-off-by: Stephen Boyd --- drivers/clk/at91/at91sam9x5.c | 2 +- drivers/clk/at91/sama5d2.c | 4 ++-- drivers/clk/at91/sama5d4.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/clk/at91/at91sam9x5.c b/drivers/clk/at91/at91sam9x5.c index d37e7ed9eb90..3487e03d4bc6 100644 --- a/drivers/clk/at91/at91sam9x5.c +++ b/drivers/clk/at91/at91sam9x5.c @@ -209,7 +209,7 @@ static void __init at91sam9x5_pmc_setup(struct device_node *np, parent_names[1] = "mainck"; parent_names[2] = "plladivck"; parent_names[3] = "utmick"; - parent_names[4] = "mck"; + parent_names[4] = "masterck"; for (i = 0; i < 2; i++) { char name[6]; diff --git a/drivers/clk/at91/sama5d2.c b/drivers/clk/at91/sama5d2.c index d69ad96fe988..cd0ef7274fdb 100644 --- a/drivers/clk/at91/sama5d2.c +++ b/drivers/clk/at91/sama5d2.c @@ -240,7 +240,7 @@ static void __init sama5d2_pmc_setup(struct device_node *np) parent_names[1] = "mainck"; parent_names[2] = "plladivck"; parent_names[3] = "utmick"; - parent_names[4] = "mck"; + parent_names[4] = "masterck"; for (i = 0; i < 3; i++) { char name[6]; @@ -291,7 +291,7 @@ static void __init sama5d2_pmc_setup(struct device_node *np) parent_names[1] = "mainck"; parent_names[2] = "plladivck"; parent_names[3] = "utmick"; - parent_names[4] = "mck"; + parent_names[4] = "masterck"; parent_names[5] = "audiopll_pmcck"; for (i = 0; i < ARRAY_SIZE(sama5d2_gck); i++) { hw = at91_clk_register_generated(regmap, &pmc_pcr_lock, diff --git a/drivers/clk/at91/sama5d4.c b/drivers/clk/at91/sama5d4.c index e358be7f6c8d..b645a9d59cdb 100644 --- a/drivers/clk/at91/sama5d4.c +++ b/drivers/clk/at91/sama5d4.c @@ -207,7 +207,7 @@ static void __init sama5d4_pmc_setup(struct device_node *np) parent_names[1] = "mainck"; parent_names[2] = "plladivck"; parent_names[3] = "utmick"; - parent_names[4] = "mck"; + parent_names[4] = "masterck"; for (i = 0; i < 3; i++) { char name[6]; From 6e356d45950e2d26b63531a2fd112c987da7a933 Mon Sep 17 00:00:00 2001 From: Mike Marshall Date: Tue, 5 Feb 2019 14:13:34 -0500 Subject: [PATCH 109/298] orangefs: remove two un-needed BUG_ONs... Signed-off-by: Mike Marshall --- fs/orangefs/file.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c index a5a2fe76568f..b094d3d79354 100644 --- a/fs/orangefs/file.c +++ b/fs/orangefs/file.c @@ -398,8 +398,6 @@ static ssize_t orangefs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter loff_t pos = iocb->ki_pos; ssize_t rc = 0; - BUG_ON(iocb->private); - gossip_debug(GOSSIP_FILE_DEBUG, "orangefs_file_read_iter\n"); orangefs_stats.reads++; @@ -416,8 +414,6 @@ static ssize_t orangefs_file_write_iter(struct kiocb *iocb, struct iov_iter *ite loff_t pos; ssize_t rc; - BUG_ON(iocb->private); - gossip_debug(GOSSIP_FILE_DEBUG, "orangefs_file_write_iter\n"); inode_lock(file->f_mapping->host); From 0921c41e19028314830b33daa681e46b46477c5e Mon Sep 17 00:00:00 2001 From: Nicholas Kazlauskas Date: Fri, 1 Feb 2019 09:36:59 -0500 Subject: [PATCH 110/298] drm/amd/display: Fix negative cursor pos programming [Why] If the cursor pos passed from DM is less than the plane_state->dst_rect top left corner then the unsigned cursor pos wraps around to a large positive number since cursor pos is a u32. There was an attempt to guard against this in hubp1_cursor_set_position by checking the src_x_offset and src_y_offset and offseting the cursor hotspot within hubp1_cursor_set_position. However, the cursor position itself is still being programmed incorrectly as a large value. This manifests itself visually as the cursor disappearing or containing strange artifacts near the middle of the screen on raven. [How] Don't subtract the destination rect top left corner from the pos but add it to the hotspot instead. This happens before the pos gets passed into hubp1_cursor_set_position. This achieves the same result but avoids the subtraction wrap around. With this fix the original cursor programming logic can be used again. Signed-off-by: Nicholas Kazlauskas Reviewed-by: Charlene Liu Acked-by: Leo Li Acked-by: Murton Liu Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/dcn10/dcn10_hw_sequencer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_hw_sequencer.c b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_hw_sequencer.c index 58a12ddf12f3..41883c981789 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_hw_sequencer.c +++ b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_hw_sequencer.c @@ -2658,8 +2658,8 @@ static void dcn10_set_cursor_position(struct pipe_ctx *pipe_ctx) .mirror = pipe_ctx->plane_state->horizontal_mirror }; - pos_cpy.x -= pipe_ctx->plane_state->dst_rect.x; - pos_cpy.y -= pipe_ctx->plane_state->dst_rect.y; + pos_cpy.x_hotspot += pipe_ctx->plane_state->dst_rect.x; + pos_cpy.y_hotspot += pipe_ctx->plane_state->dst_rect.y; if (pipe_ctx->plane_state->address.type == PLN_ADDR_TYPE_VIDEO_PROGRESSIVE) From 9f7ddbea2bb826a2147309f735726a8b09950944 Mon Sep 17 00:00:00 2001 From: Bhawanpreet Lakha Date: Tue, 5 Feb 2019 13:55:20 -0500 Subject: [PATCH 111/298] drm/amd/display: fix optimize_bandwidth func pointer for dce80 [Why] optimize_bandwidth was using dce100_prepare_bandwidth this is incorrect [How] change it to dce100_optimize_bandwidth Signed-off-by: Bhawanpreet Lakha Reviewed-by: Charlene Liu Acked-by: Leo Li Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/display/dc/dce100/dce100_hw_sequencer.h | 4 ++++ drivers/gpu/drm/amd/display/dc/dce80/dce80_hw_sequencer.c | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/dc/dce100/dce100_hw_sequencer.h b/drivers/gpu/drm/amd/display/dc/dce100/dce100_hw_sequencer.h index acd418515346..a6b80fdaa666 100644 --- a/drivers/gpu/drm/amd/display/dc/dce100/dce100_hw_sequencer.h +++ b/drivers/gpu/drm/amd/display/dc/dce100/dce100_hw_sequencer.h @@ -37,6 +37,10 @@ void dce100_prepare_bandwidth( struct dc *dc, struct dc_state *context); +void dce100_optimize_bandwidth( + struct dc *dc, + struct dc_state *context); + bool dce100_enable_display_power_gating(struct dc *dc, uint8_t controller_id, struct dc_bios *dcb, enum pipe_gating_control power_gating); diff --git a/drivers/gpu/drm/amd/display/dc/dce80/dce80_hw_sequencer.c b/drivers/gpu/drm/amd/display/dc/dce80/dce80_hw_sequencer.c index a60a90e68d91..c4543178ba20 100644 --- a/drivers/gpu/drm/amd/display/dc/dce80/dce80_hw_sequencer.c +++ b/drivers/gpu/drm/amd/display/dc/dce80/dce80_hw_sequencer.c @@ -77,6 +77,6 @@ void dce80_hw_sequencer_construct(struct dc *dc) dc->hwss.enable_display_power_gating = dce100_enable_display_power_gating; dc->hwss.pipe_control_lock = dce_pipe_control_lock; dc->hwss.prepare_bandwidth = dce100_prepare_bandwidth; - dc->hwss.optimize_bandwidth = dce100_prepare_bandwidth; + dc->hwss.optimize_bandwidth = dce100_optimize_bandwidth; } From 4ece61a22be5ab5d49cc5fc20a19a0afa24a019d Mon Sep 17 00:00:00 2001 From: Bhawanpreet Lakha Date: Tue, 5 Feb 2019 14:03:52 -0500 Subject: [PATCH 112/298] drm/amd/display: set clocks to 0 on suspend on dce80 [Why] When a dce80 asic was suspended, the clocks were not set to 0. Upon resume, the new clock was compared to the existing clock, they were found to be the same, and so the clock was not set. This resulted in a blackscreen. [How] In atomic commit, check to see if there are any active pipes. If no, set clocks to 0 Signed-off-by: Bhawanpreet Lakha Reviewed-by: Nicholas Kazlauskas Acked-by: Leo Li Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- .../drm/amd/display/dc/dce80/dce80_resource.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/dce80/dce80_resource.c b/drivers/gpu/drm/amd/display/dc/dce80/dce80_resource.c index cdd1d6b7b9f2..4e9ea50141bd 100644 --- a/drivers/gpu/drm/amd/display/dc/dce80/dce80_resource.c +++ b/drivers/gpu/drm/amd/display/dc/dce80/dce80_resource.c @@ -790,9 +790,22 @@ bool dce80_validate_bandwidth( struct dc *dc, struct dc_state *context) { - /* TODO implement when needed but for now hardcode max value*/ - context->bw.dce.dispclk_khz = 681000; - context->bw.dce.yclk_khz = 250000 * MEMORY_TYPE_MULTIPLIER_CZ; + int i; + bool at_least_one_pipe = false; + + for (i = 0; i < dc->res_pool->pipe_count; i++) { + if (context->res_ctx.pipe_ctx[i].stream) + at_least_one_pipe = true; + } + + if (at_least_one_pipe) { + /* TODO implement when needed but for now hardcode max value*/ + context->bw.dce.dispclk_khz = 681000; + context->bw.dce.yclk_khz = 250000 * MEMORY_TYPE_MULTIPLIER_CZ; + } else { + context->bw.dce.dispclk_khz = 0; + context->bw.dce.yclk_khz = 0; + } return true; } From a213c2c7e235cfc0e0a161a558f7fdf2fb3a624a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Wed, 20 Feb 2019 15:16:06 +0100 Subject: [PATCH 113/298] drm/amdgpu: disable bulk moves for now MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The changes to fix those are two invasive for backporting. Just disable the feature in 4.20 and 5.0. Acked-by: Alex Deucher Signed-off-by: Christian König Cc: [4.20+] Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index 7c108e687683..698bcb8ce61d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -638,12 +638,14 @@ void amdgpu_vm_move_to_lru_tail(struct amdgpu_device *adev, struct ttm_bo_global *glob = adev->mman.bdev.glob; struct amdgpu_vm_bo_base *bo_base; +#if 0 if (vm->bulk_moveable) { spin_lock(&glob->lru_lock); ttm_bo_bulk_move_lru_tail(&vm->lru_bulk_move); spin_unlock(&glob->lru_lock); return; } +#endif memset(&vm->lru_bulk_move, 0, sizeof(vm->lru_bulk_move)); From a8fef9ba58c9966ddb1fec916d8d8137c9d8bc89 Mon Sep 17 00:00:00 2001 From: Russell King Date: Fri, 15 Feb 2019 13:55:47 +0000 Subject: [PATCH 114/298] net: marvell: mvneta: fix DMA debug warning Booting 4.20 on SolidRun Clearfog issues this warning with DMA API debug enabled: WARNING: CPU: 0 PID: 555 at kernel/dma/debug.c:1230 check_sync+0x514/0x5bc mvneta f1070000.ethernet: DMA-API: device driver tries to sync DMA memory it has not allocated [device address=0x000000002dd7dc00] [size=240 bytes] Modules linked in: ahci mv88e6xxx dsa_core xhci_plat_hcd xhci_hcd devlink armada_thermal marvell_cesa des_generic ehci_orion phy_armada38x_comphy mcp3021 spi_orion evbug sfp mdio_i2c ip_tables x_tables CPU: 0 PID: 555 Comm: bridge-network- Not tainted 4.20.0+ #291 Hardware name: Marvell Armada 380/385 (Device Tree) [] (unwind_backtrace) from [] (show_stack+0x10/0x14) [] (show_stack) from [] (dump_stack+0x9c/0xd4) [] (dump_stack) from [] (__warn+0xf8/0x124) [] (__warn) from [] (warn_slowpath_fmt+0x38/0x48) [] (warn_slowpath_fmt) from [] (check_sync+0x514/0x5bc) [] (check_sync) from [] (debug_dma_sync_single_range_for_cpu+0x6c/0x74) [] (debug_dma_sync_single_range_for_cpu) from [] (mvneta_poll+0x298/0xf58) [] (mvneta_poll) from [] (net_rx_action+0x128/0x424) [] (net_rx_action) from [] (__do_softirq+0xf0/0x540) [] (__do_softirq) from [] (irq_exit+0x124/0x144) [] (irq_exit) from [] (__handle_domain_irq+0x58/0xb0) [] (__handle_domain_irq) from [] (gic_handle_irq+0x48/0x98) [] (gic_handle_irq) from [] (__irq_svc+0x70/0x98) ... This appears to be caused by mvneta_rx_hwbm() calling dma_sync_single_range_for_cpu() with the wrong struct device pointer, as the buffer manager device pointer is used to map and unmap the buffer. Fix this. Signed-off-by: Russell King Signed-off-by: David S. Miller --- drivers/net/ethernet/marvell/mvneta.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c index 9d4568eb2297..8433fb9c3eee 100644 --- a/drivers/net/ethernet/marvell/mvneta.c +++ b/drivers/net/ethernet/marvell/mvneta.c @@ -2146,7 +2146,7 @@ static int mvneta_rx_hwbm(struct napi_struct *napi, if (unlikely(!skb)) goto err_drop_frame_ret_pool; - dma_sync_single_range_for_cpu(dev->dev.parent, + dma_sync_single_range_for_cpu(&pp->bm_priv->pdev->dev, rx_desc->buf_phys_addr, MVNETA_MH_SIZE + NET_SKB_PAD, rx_bytes, From ae3b564179bfd06f32d051b9e5d72ce4b2a07c37 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 15 Feb 2019 20:09:35 +0000 Subject: [PATCH 115/298] missing barriers in some of unix_sock ->addr and ->path accesses Several u->addr and u->path users are not holding any locks in common with unix_bind(). unix_state_lock() is useless for those purposes. u->addr is assign-once and *(u->addr) is fully set up by the time we set u->addr (all under unix_table_lock). u->path is also set in the same critical area, also before setting u->addr, and any unix_sock with ->path filled will have non-NULL ->addr. So setting ->addr with smp_store_release() is all we need for those "lockless" users - just have them fetch ->addr with smp_load_acquire() and don't even bother looking at ->path if they see NULL ->addr. Users of ->addr and ->path fall into several classes now: 1) ones that do smp_load_acquire(u->addr) and access *(u->addr) and u->path only if smp_load_acquire() has returned non-NULL. 2) places holding unix_table_lock. These are guaranteed that *(u->addr) is seen fully initialized. If unix_sock is in one of the "bound" chains, so's ->path. 3) unix_sock_destructor() using ->addr is safe. All places that set u->addr are guaranteed to have seen all stores *(u->addr) while holding a reference to u and unix_sock_destructor() is called when (atomic) refcount hits zero. 4) unix_release_sock() using ->path is safe. unix_bind() is serialized wrt unix_release() (normally - by struct file refcount), and for the instances that had ->path set by unix_bind() unix_release_sock() comes from unix_release(), so they are fine. Instances that had it set in unix_stream_connect() either end up attached to a socket (in unix_accept()), in which case the call chain to unix_release_sock() and serialization are the same as in the previous case, or they never get accept'ed and unix_release_sock() is called when the listener is shut down and its queue gets purged. In that case the listener's queue lock provides the barriers needed - unix_stream_connect() shoves our unix_sock into listener's queue under that lock right after having set ->path and eventual unix_release_sock() caller picks them from that queue under the same lock right before calling unix_release_sock(). 5) unix_find_other() use of ->path is pointless, but safe - it happens with successful lookup by (abstract) name, so ->path.dentry is guaranteed to be NULL there. earlier-variant-reviewed-by: "Paul E. McKenney" Signed-off-by: Al Viro Signed-off-by: David S. Miller --- net/unix/af_unix.c | 59 +++++++++++++++++++++++++------------------- net/unix/diag.c | 3 ++- security/lsm_audit.c | 10 +++++--- 3 files changed, 42 insertions(+), 30 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 74d1eed7cbd4..a95d479caeea 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -890,7 +890,7 @@ static int unix_autobind(struct socket *sock) addr->hash ^= sk->sk_type; __unix_remove_socket(sk); - u->addr = addr; + smp_store_release(&u->addr, addr); __unix_insert_socket(&unix_socket_table[addr->hash], sk); spin_unlock(&unix_table_lock); err = 0; @@ -1060,7 +1060,7 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) err = 0; __unix_remove_socket(sk); - u->addr = addr; + smp_store_release(&u->addr, addr); __unix_insert_socket(list, sk); out_unlock: @@ -1331,15 +1331,29 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq); otheru = unix_sk(other); - /* copy address information from listening to new sock*/ - if (otheru->addr) { - refcount_inc(&otheru->addr->refcnt); - newu->addr = otheru->addr; - } + /* copy address information from listening to new sock + * + * The contents of *(otheru->addr) and otheru->path + * are seen fully set up here, since we have found + * otheru in hash under unix_table_lock. Insertion + * into the hash chain we'd found it in had been done + * in an earlier critical area protected by unix_table_lock, + * the same one where we'd set *(otheru->addr) contents, + * as well as otheru->path and otheru->addr itself. + * + * Using smp_store_release() here to set newu->addr + * is enough to make those stores, as well as stores + * to newu->path visible to anyone who gets newu->addr + * by smp_load_acquire(). IOW, the same warranties + * as for unix_sock instances bound in unix_bind() or + * in unix_autobind(). + */ if (otheru->path.dentry) { path_get(&otheru->path); newu->path = otheru->path; } + refcount_inc(&otheru->addr->refcnt); + smp_store_release(&newu->addr, otheru->addr); /* Set credentials */ copy_peercred(sk, other); @@ -1453,7 +1467,7 @@ static int unix_accept(struct socket *sock, struct socket *newsock, int flags, static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer) { struct sock *sk = sock->sk; - struct unix_sock *u; + struct unix_address *addr; DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr); int err = 0; @@ -1468,19 +1482,15 @@ static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer) sock_hold(sk); } - u = unix_sk(sk); - unix_state_lock(sk); - if (!u->addr) { + addr = smp_load_acquire(&unix_sk(sk)->addr); + if (!addr) { sunaddr->sun_family = AF_UNIX; sunaddr->sun_path[0] = 0; err = sizeof(short); } else { - struct unix_address *addr = u->addr; - err = addr->len; memcpy(sunaddr, addr->name, addr->len); } - unix_state_unlock(sk); sock_put(sk); out: return err; @@ -2073,11 +2083,11 @@ static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg, static void unix_copy_addr(struct msghdr *msg, struct sock *sk) { - struct unix_sock *u = unix_sk(sk); + struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr); - if (u->addr) { - msg->msg_namelen = u->addr->len; - memcpy(msg->msg_name, u->addr->name, u->addr->len); + if (addr) { + msg->msg_namelen = addr->len; + memcpy(msg->msg_name, addr->name, addr->len); } } @@ -2581,15 +2591,14 @@ static int unix_open_file(struct sock *sk) if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) return -EPERM; - unix_state_lock(sk); - path = unix_sk(sk)->path; - if (!path.dentry) { - unix_state_unlock(sk); + if (!smp_load_acquire(&unix_sk(sk)->addr)) + return -ENOENT; + + path = unix_sk(sk)->path; + if (!path.dentry) return -ENOENT; - } path_get(&path); - unix_state_unlock(sk); fd = get_unused_fd_flags(O_CLOEXEC); if (fd < 0) @@ -2830,7 +2839,7 @@ static int unix_seq_show(struct seq_file *seq, void *v) (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING), sock_i_ino(s)); - if (u->addr) { + if (u->addr) { // under unix_table_lock here int i, len; seq_putc(seq, ' '); diff --git a/net/unix/diag.c b/net/unix/diag.c index 384c84e83462..3183d9b8ab33 100644 --- a/net/unix/diag.c +++ b/net/unix/diag.c @@ -10,7 +10,8 @@ static int sk_diag_dump_name(struct sock *sk, struct sk_buff *nlskb) { - struct unix_address *addr = unix_sk(sk)->addr; + /* might or might not have unix_table_lock */ + struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr); if (!addr) return 0; diff --git a/security/lsm_audit.c b/security/lsm_audit.c index f84001019356..33028c098ef3 100644 --- a/security/lsm_audit.c +++ b/security/lsm_audit.c @@ -321,6 +321,7 @@ static void dump_common_audit_data(struct audit_buffer *ab, if (a->u.net->sk) { struct sock *sk = a->u.net->sk; struct unix_sock *u; + struct unix_address *addr; int len = 0; char *p = NULL; @@ -351,14 +352,15 @@ static void dump_common_audit_data(struct audit_buffer *ab, #endif case AF_UNIX: u = unix_sk(sk); + addr = smp_load_acquire(&u->addr); + if (!addr) + break; if (u->path.dentry) { audit_log_d_path(ab, " path=", &u->path); break; } - if (!u->addr) - break; - len = u->addr->len-sizeof(short); - p = &u->addr->name->sun_path[0]; + len = addr->len-sizeof(short); + p = &addr->name->sun_path[0]; audit_log_format(ab, " path="); if (*p) audit_log_untrustedstring(ab, p); From 74fb44863084275b952f21ec6a024af0e2e75cb8 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Thu, 21 Feb 2019 08:59:02 +0100 Subject: [PATCH 116/298] PM-runtime: Fix deadlock when canceling hrtimer When rpm_resume() desactivates the autosuspend timer, it should only try to cancel hrtimer but not wait for the handler to finish, because both rpm_resume() and pm_suspend_timer_fn() take the power.lock. A deadlock is possible as follows: CPU0 CPU1 rpm_resume() spin_lock_irqsave pm_suspend_timer_fn() spin_lock_irqsave pm_runtime_deactivate_timer() hrtimer_cancel() It is sufficient to call hrtimer_try_to_cancel() from pm_runtime_deactivate_timer(), because dev->power.timer_expires reset to 0 by it, so use that function instead of hrtimer_cancel(). Fixes: 8234f6734c5d ("PM-runtime: Switch autosuspend over to using hrtimers") Reported-by: Sunzhaosheng Sun(Zhaosheng) Signed-off-by: Vincent Guittot [ rjw: Changelog ] Signed-off-by: Rafael J. Wysocki --- drivers/base/power/runtime.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c index 0ea2139c50d8..ccd296dbb95c 100644 --- a/drivers/base/power/runtime.c +++ b/drivers/base/power/runtime.c @@ -95,7 +95,7 @@ static void __update_runtime_status(struct device *dev, enum rpm_status status) static void pm_runtime_deactivate_timer(struct device *dev) { if (dev->power.timer_expires > 0) { - hrtimer_cancel(&dev->power.suspend_timer); + hrtimer_try_to_cancel(&dev->power.suspend_timer); dev->power.timer_expires = 0; } } From 11fe9262ed226c127f67ca4bd85977b22589b68a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Thu, 21 Feb 2019 13:07:38 +0100 Subject: [PATCH 117/298] Revert "xsk: simplify AF_XDP socket teardown" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit e2ce3674883ecba2605370404208c9d4a07ae1c3. It turns out that the sock destructor xsk_destruct was needed after all. The cleanup simplification broke the skb transmit cleanup path, due to that the umem was prematurely destroyed. The umem cannot be destroyed until all outstanding skbs are freed, which means that we cannot remove the umem until the sk_destruct has been called. Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann --- net/xdp/xsk.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 45f3b528dc09..85e4fe4f18cc 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -366,7 +366,6 @@ static int xsk_release(struct socket *sock) xskq_destroy(xs->rx); xskq_destroy(xs->tx); - xdp_put_umem(xs->umem); sock_orphan(sk); sock->sk = NULL; @@ -718,6 +717,18 @@ static const struct proto_ops xsk_proto_ops = { .sendpage = sock_no_sendpage, }; +static void xsk_destruct(struct sock *sk) +{ + struct xdp_sock *xs = xdp_sk(sk); + + if (!sock_flag(sk, SOCK_DEAD)) + return; + + xdp_put_umem(xs->umem); + + sk_refcnt_debug_dec(sk); +} + static int xsk_create(struct net *net, struct socket *sock, int protocol, int kern) { @@ -744,6 +755,9 @@ static int xsk_create(struct net *net, struct socket *sock, int protocol, sk->sk_family = PF_XDP; + sk->sk_destruct = xsk_destruct; + sk_refcnt_debug_inc(sk); + sock_set_flag(sk, SOCK_RCU_FREE); xs = xdp_sk(sk); From a841c673f1352f607fd3ba85de6c9c49ff2c1e12 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 20 Feb 2019 22:18:52 -0800 Subject: [PATCH 118/298] revert "initramfs: cleanup incomplete rootfs" Revert ff1522bb7d9845 ("initramfs: cleanup incomplete rootfs"). Andy reports : This breaks my setup where I have U-boot provided more size of initramfs : than needed. This allows a bit of flexibility to increase or decrease : initramfs compressed image without taking care of bootloader. The proper : solution is to do this if we sure that we didn't get enough memory, : otherwise I can't consider the error fatal to clean up rootfs. Fixes: ff1522bb7d9845 ("initramfs: cleanup incomplete rootfs") Reported-by: Andy Shevchenko Tested-by: Andy Shevchenko Cc: David Engraf Cc: Dominik Brodowski Cc: Greg Kroah-Hartman Cc: Philippe Ombredanne Cc: Arnd Bergmann Cc: Luc Van Oostenryck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- init/initramfs.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/init/initramfs.c b/init/initramfs.c index 7cea802d00ef..fca899622937 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -550,6 +550,7 @@ static void __init free_initrd(void) initrd_end = 0; } +#ifdef CONFIG_BLK_DEV_RAM #define BUF_SIZE 1024 static void __init clean_rootfs(void) { @@ -596,6 +597,7 @@ static void __init clean_rootfs(void) ksys_close(fd); kfree(buf); } +#endif static int __init populate_rootfs(void) { @@ -638,10 +640,8 @@ static int __init populate_rootfs(void) printk(KERN_INFO "Unpacking initramfs...\n"); err = unpack_to_rootfs((char *)initrd_start, initrd_end - initrd_start); - if (err) { + if (err) printk(KERN_EMERG "Initramfs unpacking failed: %s\n", err); - clean_rootfs(); - } free_initrd(); #endif } From 050c17f239fd53adb55aa768d4f41bc76c0fe045 Mon Sep 17 00:00:00 2001 From: Ralph Campbell Date: Wed, 20 Feb 2019 22:18:58 -0800 Subject: [PATCH 119/298] numa: change get_mempolicy() to use nr_node_ids instead of MAX_NUMNODES The system call, get_mempolicy() [1], passes an unsigned long *nodemask pointer and an unsigned long maxnode argument which specifies the length of the user's nodemask array in bits (which is rounded up). The manual page says that if the maxnode value is too small, get_mempolicy will return EINVAL but there is no system call to return this minimum value. To determine this value, some programs search /proc//status for a line starting with "Mems_allowed:" and use the number of digits in the mask to determine the minimum value. A recent change to the way this line is formatted [2] causes these programs to compute a value less than MAX_NUMNODES so get_mempolicy() returns EINVAL. Change get_mempolicy(), the older compat version of get_mempolicy(), and the copy_nodes_to_user() function to use nr_node_ids instead of MAX_NUMNODES, thus preserving the defacto method of computing the minimum size for the nodemask array and the maxnode argument. [1] http://man7.org/linux/man-pages/man2/get_mempolicy.2.html [2] https://lore.kernel.org/lkml/1545405631-6808-1-git-send-email-longman@redhat.com Link: http://lkml.kernel.org/r/20190211180245.22295-1-rcampbell@nvidia.com Fixes: 4fb8e5b89bcbbbb ("include/linux/nodemask.h: use nr_node_ids (not MAX_NUMNODES) in __nodemask_pr_numnodes()") Signed-off-by: Ralph Campbell Suggested-by: Alexander Duyck Cc: Waiman Long Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index d4496d9d34f5..ee2bce59d2bf 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1314,7 +1314,7 @@ static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode, nodemask_t *nodes) { unsigned long copy = ALIGN(maxnode-1, 64) / 8; - const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long); + unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long); if (copy > nbytes) { if (copy > PAGE_SIZE) @@ -1491,7 +1491,7 @@ static int kernel_get_mempolicy(int __user *policy, int uninitialized_var(pval); nodemask_t nodes; - if (nmask != NULL && maxnode < MAX_NUMNODES) + if (nmask != NULL && maxnode < nr_node_ids) return -EINVAL; err = do_get_mempolicy(&pval, &nodes, addr, flags); @@ -1527,7 +1527,7 @@ COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, unsigned long nr_bits, alloc_size; DECLARE_BITMAP(bm, MAX_NUMNODES); - nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES); + nr_bits = min_t(unsigned long, maxnode-1, nr_node_ids); alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8; if (nmask) From e1db95befb3e9e3476629afec6e0f5d0707b9825 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 20 Feb 2019 22:19:01 -0800 Subject: [PATCH 120/298] kasan: fix assigning tags twice When an object is kmalloc()'ed, two hooks are called: kasan_slab_alloc() and kasan_kmalloc(). Right now we assign a tag twice, once in each of the hooks. Fix it by assigning a tag only in the former hook. Link: http://lkml.kernel.org/r/ce8c6431da735aa7ec051fd6497153df690eb021.1549921721.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Evgeniy Stepanov Cc: Joonsoo Kim Cc: Kostya Serebryany Cc: Pekka Enberg Cc: Qian Cai Cc: Vincenzo Frascino Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/common.c | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 73c9cbfdedf4..09b534fbba17 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -361,10 +361,15 @@ void kasan_poison_object_data(struct kmem_cache *cache, void *object) * get different tags. */ static u8 assign_tag(struct kmem_cache *cache, const void *object, - bool init, bool krealloc) + bool init, bool keep_tag) { - /* Reuse the same tag for krealloc'ed objects. */ - if (krealloc) + /* + * 1. When an object is kmalloc()'ed, two hooks are called: + * kasan_slab_alloc() and kasan_kmalloc(). We assign the + * tag only in the first one. + * 2. We reuse the same tag for krealloc'ed objects. + */ + if (keep_tag) return get_tag(object); /* @@ -405,12 +410,6 @@ void * __must_check kasan_init_slab_obj(struct kmem_cache *cache, return (void *)object; } -void * __must_check kasan_slab_alloc(struct kmem_cache *cache, void *object, - gfp_t flags) -{ - return kasan_kmalloc(cache, object, cache->object_size, flags); -} - static inline bool shadow_invalid(u8 tag, s8 shadow_byte) { if (IS_ENABLED(CONFIG_KASAN_GENERIC)) @@ -467,7 +466,7 @@ bool kasan_slab_free(struct kmem_cache *cache, void *object, unsigned long ip) } static void *__kasan_kmalloc(struct kmem_cache *cache, const void *object, - size_t size, gfp_t flags, bool krealloc) + size_t size, gfp_t flags, bool keep_tag) { unsigned long redzone_start; unsigned long redzone_end; @@ -485,7 +484,7 @@ static void *__kasan_kmalloc(struct kmem_cache *cache, const void *object, KASAN_SHADOW_SCALE_SIZE); if (IS_ENABLED(CONFIG_KASAN_SW_TAGS)) - tag = assign_tag(cache, object, false, krealloc); + tag = assign_tag(cache, object, false, keep_tag); /* Tag is ignored in set_tag without CONFIG_KASAN_SW_TAGS */ kasan_unpoison_shadow(set_tag(object, tag), size); @@ -498,10 +497,16 @@ static void *__kasan_kmalloc(struct kmem_cache *cache, const void *object, return set_tag(object, tag); } +void * __must_check kasan_slab_alloc(struct kmem_cache *cache, void *object, + gfp_t flags) +{ + return __kasan_kmalloc(cache, object, cache->object_size, flags, false); +} + void * __must_check kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size, gfp_t flags) { - return __kasan_kmalloc(cache, object, size, flags, false); + return __kasan_kmalloc(cache, object, size, flags, true); } EXPORT_SYMBOL(kasan_kmalloc); From 53128245b43daad600d9fe72940206570e064112 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 20 Feb 2019 22:19:11 -0800 Subject: [PATCH 121/298] kasan, kmemleak: pass tagged pointers to kmemleak Right now we call kmemleak hooks before assigning tags to pointers in KASAN hooks. As a result, when an objects gets allocated, kmemleak sees a differently tagged pointer, compared to the one it sees when the object gets freed. Fix it by calling KASAN hooks before kmemleak's ones. Link: http://lkml.kernel.org/r/cd825aa4897b0fc37d3316838993881daccbe9f5.1549921721.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reported-by: Qian Cai Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Evgeniy Stepanov Cc: Joonsoo Kim Cc: Kostya Serebryany Cc: Pekka Enberg Cc: Vincenzo Frascino Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.h | 6 ++---- mm/slab_common.c | 2 +- mm/slub.c | 3 ++- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/mm/slab.h b/mm/slab.h index 4190c24ef0e9..638ea1b25d39 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -437,11 +437,9 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, flags &= gfp_allowed_mask; for (i = 0; i < size; i++) { - void *object = p[i]; - - kmemleak_alloc_recursive(object, s->object_size, 1, + p[i] = kasan_slab_alloc(s, p[i], flags); + kmemleak_alloc_recursive(p[i], s->object_size, 1, s->flags, flags); - p[i] = kasan_slab_alloc(s, object, flags); } if (memcg_kmem_enabled()) diff --git a/mm/slab_common.c b/mm/slab_common.c index 81732d05e74a..fe524c8d0246 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -1228,8 +1228,8 @@ void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) flags |= __GFP_COMP; page = alloc_pages(flags, order); ret = page ? page_address(page) : NULL; - kmemleak_alloc(ret, size, 1, flags); ret = kasan_kmalloc_large(ret, size, flags); + kmemleak_alloc(ret, size, 1, flags); return ret; } EXPORT_SYMBOL(kmalloc_order); diff --git a/mm/slub.c b/mm/slub.c index 1e3d0ec4e200..4a3d7686902f 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1374,8 +1374,9 @@ static inline void dec_slabs_node(struct kmem_cache *s, int node, */ static inline void *kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags) { + ptr = kasan_kmalloc_large(ptr, size, flags); kmemleak_alloc(ptr, size, 1, flags); - return kasan_kmalloc_large(ptr, size, flags); + return ptr; } static __always_inline void kfree_hook(void *x) From a2f775751d964e638818487544fa8320180d106e Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 20 Feb 2019 22:19:16 -0800 Subject: [PATCH 122/298] kmemleak: account for tagged pointers when calculating pointer range kmemleak keeps two global variables, min_addr and max_addr, which store the range of valid (encountered by kmemleak) pointer values, which it later uses to speed up pointer lookup when scanning blocks. With tagged pointers this range will get bigger than it needs to be. This patch makes kmemleak untag pointers before saving them to min_addr and max_addr and when performing a lookup. Link: http://lkml.kernel.org/r/16e887d442986ab87fe87a755815ad92fa431a5f.1550066133.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Tested-by: Qian Cai Acked-by: Catalin Marinas Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Evgeniy Stepanov Cc: Joonsoo Kim Cc: Kostya Serebryany Cc: Pekka Enberg Cc: Vincenzo Frascino Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kmemleak.c | 10 +++++++--- mm/slab.h | 1 + mm/slab_common.c | 1 + mm/slub.c | 1 + 4 files changed, 10 insertions(+), 3 deletions(-) diff --git a/mm/kmemleak.c b/mm/kmemleak.c index f9d9dc250428..707fa5579f66 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -574,6 +574,7 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size, unsigned long flags; struct kmemleak_object *object, *parent; struct rb_node **link, *rb_parent; + unsigned long untagged_ptr; object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp)); if (!object) { @@ -619,8 +620,9 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size, write_lock_irqsave(&kmemleak_lock, flags); - min_addr = min(min_addr, ptr); - max_addr = max(max_addr, ptr + size); + untagged_ptr = (unsigned long)kasan_reset_tag((void *)ptr); + min_addr = min(min_addr, untagged_ptr); + max_addr = max(max_addr, untagged_ptr + size); link = &object_tree_root.rb_node; rb_parent = NULL; while (*link) { @@ -1333,6 +1335,7 @@ static void scan_block(void *_start, void *_end, unsigned long *start = PTR_ALIGN(_start, BYTES_PER_POINTER); unsigned long *end = _end - (BYTES_PER_POINTER - 1); unsigned long flags; + unsigned long untagged_ptr; read_lock_irqsave(&kmemleak_lock, flags); for (ptr = start; ptr < end; ptr++) { @@ -1347,7 +1350,8 @@ static void scan_block(void *_start, void *_end, pointer = *ptr; kasan_enable_current(); - if (pointer < min_addr || pointer >= max_addr) + untagged_ptr = (unsigned long)kasan_reset_tag((void *)pointer); + if (untagged_ptr < min_addr || untagged_ptr >= max_addr) continue; /* diff --git a/mm/slab.h b/mm/slab.h index 638ea1b25d39..384105318779 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -438,6 +438,7 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, flags &= gfp_allowed_mask; for (i = 0; i < size; i++) { p[i] = kasan_slab_alloc(s, p[i], flags); + /* As p[i] might get tagged, call kmemleak hook after KASAN. */ kmemleak_alloc_recursive(p[i], s->object_size, 1, s->flags, flags); } diff --git a/mm/slab_common.c b/mm/slab_common.c index fe524c8d0246..f9d89c1b5977 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -1229,6 +1229,7 @@ void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) page = alloc_pages(flags, order); ret = page ? page_address(page) : NULL; ret = kasan_kmalloc_large(ret, size, flags); + /* As ret might get tagged, call kmemleak hook after KASAN. */ kmemleak_alloc(ret, size, 1, flags); return ret; } diff --git a/mm/slub.c b/mm/slub.c index 4a3d7686902f..f5a451c49190 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1375,6 +1375,7 @@ static inline void dec_slabs_node(struct kmem_cache *s, int node, static inline void *kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags) { ptr = kasan_kmalloc_large(ptr, size, flags); + /* As ptr might get tagged, call kmemleak hook after KASAN. */ kmemleak_alloc(ptr, size, 1, flags); return ptr; } From a71012242837fe5e67d8c999cfc357174ed5dba0 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 20 Feb 2019 22:19:23 -0800 Subject: [PATCH 123/298] kasan, slub: move kasan_poison_slab hook before page_address With tag based KASAN page_address() looks at the page flags to see whether the resulting pointer needs to have a tag set. Since we don't want to set a tag when page_address() is called on SLAB pages, we call page_kasan_tag_reset() in kasan_poison_slab(). However in allocate_slab() page_address() is called before kasan_poison_slab(). Fix it by changing the order. [andreyknvl@google.com: fix compilation error when CONFIG_SLUB_DEBUG=n] Link: http://lkml.kernel.org/r/ac27cc0bbaeb414ed77bcd6671a877cf3546d56e.1550066133.git.andreyknvl@google.com Link: http://lkml.kernel.org/r/cd895d627465a3f1c712647072d17f10883be2a1.1549921721.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Evgeniy Stepanov Cc: Joonsoo Kim Cc: Kostya Serebryany Cc: Pekka Enberg Cc: Qian Cai Cc: Vincenzo Frascino Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index f5a451c49190..a7e7c7f719f9 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1075,6 +1075,16 @@ static void setup_object_debug(struct kmem_cache *s, struct page *page, init_tracking(s, object); } +static void setup_page_debug(struct kmem_cache *s, void *addr, int order) +{ + if (!(s->flags & SLAB_POISON)) + return; + + metadata_access_enable(); + memset(addr, POISON_INUSE, PAGE_SIZE << order); + metadata_access_disable(); +} + static inline int alloc_consistency_checks(struct kmem_cache *s, struct page *page, void *object, unsigned long addr) @@ -1330,6 +1340,8 @@ slab_flags_t kmem_cache_flags(unsigned int object_size, #else /* !CONFIG_SLUB_DEBUG */ static inline void setup_object_debug(struct kmem_cache *s, struct page *page, void *object) {} +static inline void setup_page_debug(struct kmem_cache *s, + void *addr, int order) {} static inline int alloc_debug_processing(struct kmem_cache *s, struct page *page, void *object, unsigned long addr) { return 0; } @@ -1643,12 +1655,11 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) if (page_is_pfmemalloc(page)) SetPageSlabPfmemalloc(page); + kasan_poison_slab(page); + start = page_address(page); - if (unlikely(s->flags & SLAB_POISON)) - memset(start, POISON_INUSE, PAGE_SIZE << order); - - kasan_poison_slab(page); + setup_page_debug(s, start, order); shuffle = shuffle_freelist(s, page); From 18e506610238eda2b0c5a19a123d3d6ec0ab2de6 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 20 Feb 2019 22:19:28 -0800 Subject: [PATCH 124/298] kasan, slub: fix conflicts with CONFIG_SLAB_FREELIST_HARDENED CONFIG_SLAB_FREELIST_HARDENED hashes freelist pointer with the address of the object where the pointer gets stored. With tag based KASAN we don't account for that when building freelist, as we call set_freepointer() with the first argument untagged. This patch changes the code to properly propagate tags throughout the loop. Link: http://lkml.kernel.org/r/3df171559c52201376f246bf7ce3184fe21c1dc7.1549921721.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reported-by: Qian Cai Cc: Andrey Ryabinin Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Catalin Marinas Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Vincenzo Frascino Cc: Kostya Serebryany Cc: Evgeniy Stepanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index a7e7c7f719f9..80da3a40b74d 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -303,11 +303,6 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) __p < (__addr) + (__objects) * (__s)->size; \ __p += (__s)->size) -#define for_each_object_idx(__p, __idx, __s, __addr, __objects) \ - for (__p = fixup_red_left(__s, __addr), __idx = 1; \ - __idx <= __objects; \ - __p += (__s)->size, __idx++) - /* Determine object index from a given position */ static inline unsigned int slab_index(void *p, struct kmem_cache *s, void *addr) { @@ -1664,17 +1659,16 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) shuffle = shuffle_freelist(s, page); if (!shuffle) { - for_each_object_idx(p, idx, s, start, page->objects) { - if (likely(idx < page->objects)) { - next = p + s->size; - next = setup_object(s, page, next); - set_freepointer(s, p, next); - } else - set_freepointer(s, p, NULL); - } start = fixup_red_left(s, start); start = setup_object(s, page, start); page->freelist = start; + for (idx = 0, p = start; idx < page->objects - 1; idx++) { + next = p + s->size; + next = setup_object(s, page, next); + set_freepointer(s, p, next); + p = next; + } + set_freepointer(s, p, NULL); } page->inuse = page->objects; From d36a63a943e37081e92e4abdf4a207fd2e83a006 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 20 Feb 2019 22:19:32 -0800 Subject: [PATCH 125/298] kasan, slub: fix more conflicts with CONFIG_SLAB_FREELIST_HARDENED When CONFIG_KASAN_SW_TAGS is enabled, ptr_addr might be tagged. Normally, this doesn't cause any issues, as both set_freepointer() and get_freepointer() are called with a pointer with the same tag. However, there are some issues with CONFIG_SLUB_DEBUG code. For example, when __free_slub() iterates over objects in a cache, it passes untagged pointers to check_object(). check_object() in turns calls get_freepointer() with an untagged pointer, which causes the freepointer to be restored incorrectly. Add kasan_reset_tag to freelist_ptr(). Also add a detailed comment. Link: http://lkml.kernel.org/r/bf858f26ef32eb7bd24c665755b3aee4bc58d0e4.1550103861.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reported-by: Qian Cai Tested-by: Qian Cai Cc: Andrey Ryabinin Cc: Alexander Potapenko Cc: Dmitry Vyukov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/mm/slub.c b/mm/slub.c index 80da3a40b74d..c80e6699357c 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -249,7 +249,18 @@ static inline void *freelist_ptr(const struct kmem_cache *s, void *ptr, unsigned long ptr_addr) { #ifdef CONFIG_SLAB_FREELIST_HARDENED - return (void *)((unsigned long)ptr ^ s->random ^ ptr_addr); + /* + * When CONFIG_KASAN_SW_TAGS is enabled, ptr_addr might be tagged. + * Normally, this doesn't cause any issues, as both set_freepointer() + * and get_freepointer() are called with a pointer with the same tag. + * However, there are some issues with CONFIG_SLUB_DEBUG code. For + * example, when __free_slub() iterates over objects in a cache, it + * passes untagged pointers to check_object(). check_object() in turns + * calls get_freepointer() with an untagged pointer, which causes the + * freepointer to be restored incorrectly. + */ + return (void *)((unsigned long)ptr ^ s->random ^ + (unsigned long)kasan_reset_tag((void *)ptr_addr)); #else return ptr; #endif From 338cfaad4993d3bc35a740e28981747770a65f90 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Wed, 20 Feb 2019 22:19:36 -0800 Subject: [PATCH 126/298] slub: fix SLAB_CONSISTENCY_CHECKS + KASAN_SW_TAGS Enabling SLUB_DEBUG's SLAB_CONSISTENCY_CHECKS with KASAN_SW_TAGS triggers endless false positives during boot below due to check_valid_pointer() checks tagged pointers which have no addresses that is valid within slab pages: BUG radix_tree_node (Tainted: G B ): Freelist Pointer check fails ----------------------------------------------------------------------------- INFO: Slab objects=69 used=69 fp=0x (null) flags=0x7ffffffc000200 INFO: Object @offset=15060037153926966016 fp=0x Redzone: bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb bb ................ Object : 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ Object : 00 00 00 00 00 00 00 00 18 6b 06 00 08 80 ff d0 .........k...... Object : 18 6b 06 00 08 80 ff d0 00 00 00 00 00 00 00 00 .k.............. Object : 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ Object : 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ Object : 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ Object : 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ Object : 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ Object : 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ Object : 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ Object : 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ Object : 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ Object : 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ Object : 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ Object : 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ Object : 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ Object : 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ Object : 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ Object : 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ Object : 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ Object : 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ Object : 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ Object : 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ Object : 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ Object : 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ Object : 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ Object : 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ Object : 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ Object : 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ Object : 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ Object : 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ Object : 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ Object : 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ Object : 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ Object : 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ Object : 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ Redzone: bb bb bb bb bb bb bb bb ........ Padding: 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a 5a ZZZZZZZZZZZZZZZZ CPU: 0 PID: 0 Comm: swapper/0 Tainted: G B 5.0.0-rc5+ #18 Call trace: dump_backtrace+0x0/0x450 show_stack+0x20/0x2c __dump_stack+0x20/0x28 dump_stack+0xa0/0xfc print_trailer+0x1bc/0x1d0 object_err+0x40/0x50 alloc_debug_processing+0xf0/0x19c ___slab_alloc+0x554/0x704 kmem_cache_alloc+0x2f8/0x440 radix_tree_node_alloc+0x90/0x2fc idr_get_free+0x1e8/0x6d0 idr_alloc_u32+0x11c/0x2a4 idr_alloc+0x74/0xe0 worker_pool_assign_id+0x5c/0xbc workqueue_init_early+0x49c/0xd50 start_kernel+0x52c/0xac4 FIX radix_tree_node: Marking all objects used Link: http://lkml.kernel.org/r/20190209044128.3290-1-cai@lca.pw Signed-off-by: Qian Cai Reviewed-by: Andrey Konovalov Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/slub.c b/mm/slub.c index c80e6699357c..2d2830134e60 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -513,6 +513,7 @@ static inline int check_valid_pointer(struct kmem_cache *s, return 1; base = page_address(page); + object = kasan_reset_tag(object); object = restore_red_left(s, object); if (object < base || object >= base + page->objects * s->size || (object - base) % s->size) { From b2b469939e93458753cfbf8282ad52636495965e Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 20 Feb 2019 22:19:42 -0800 Subject: [PATCH 127/298] proc, oom: do not report alien mms when setting oom_score_adj Tetsuo has reported that creating a thousands of processes sharing MM without SIGHAND (aka alien threads) and setting /proc//oom_score_adj will swamp the kernel log and takes ages [1] to finish. This is especially worrisome that all that printing is done under RCU lock and this can potentially trigger RCU stall or softlockup detector. The primary reason for the printk was to catch potential users who might depend on the behavior prior to 44a70adec910 ("mm, oom_adj: make sure processes sharing mm have same view of oom_score_adj") but after more than 2 years without a single report I guess it is safe to simply remove the printk altogether. The next step should be moving oom_score_adj over to the mm struct and remove all the tasks crawling as suggested by [2] [1] http://lkml.kernel.org/r/97fce864-6f75-bca5-14bc-12c9f890e740@i-love.sakura.ne.jp [2] http://lkml.kernel.org/r/20190117155159.GA4087@dhcp22.suse.cz Link: http://lkml.kernel.org/r/20190212102129.26288-1-mhocko@kernel.org Signed-off-by: Michal Hocko Reported-by: Tetsuo Handa Acked-by: Johannes Weiner Cc: David Rientjes Cc: Yong-Taek Lee Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index 633a63462573..f5ed9512d193 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1086,10 +1086,6 @@ static int __set_oom_adj(struct file *file, int oom_adj, bool legacy) task_lock(p); if (!p->vfork_done && process_shares_mm(p, mm)) { - pr_info("updating oom_score_adj for %d (%s) from %d to %d because it shares mm with %d (%s). Report if this is unexpected.\n", - task_pid_nr(p), p->comm, - p->signal->oom_score_adj, oom_adj, - task_pid_nr(task), task->comm); p->signal->oom_score_adj = oom_adj; if (!legacy && has_capability_noaudit(current, CAP_SYS_RESOURCE)) p->signal->oom_score_adj_min = (short)oom_adj; From 311ade0eab192f0abee2f70bce761bf0d66990c4 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Wed, 20 Feb 2019 22:19:45 -0800 Subject: [PATCH 128/298] mm/debug.c: fix __dump_page() for poisoned pages Evaluating page_mapping() on a poisoned page ends up dereferencing junk and making PF_POISONED_CHECK() considerably crashier than intended: Unable to handle kernel NULL pointer dereference at virtual address 0000000000000006 Mem abort info: ESR = 0x96000005 Exception class = DABT (current EL), IL = 32 bits SET = 0, FnV = 0 EA = 0, S1PTW = 0 Data abort info: ISV = 0, ISS = 0x00000005 CM = 0, WnR = 0 user pgtable: 4k pages, 39-bit VAs, pgdp = 00000000c2f6ac38 [0000000000000006] pgd=0000000000000000, pud=0000000000000000 Internal error: Oops: 96000005 [#1] PREEMPT SMP Modules linked in: CPU: 2 PID: 491 Comm: bash Not tainted 5.0.0-rc1+ #1 Hardware name: ARM LTD ARM Juno Development Platform/ARM Juno Development Platform, BIOS EDK II Dec 17 2018 pstate: 00000005 (nzcv daif -PAN -UAO) pc : page_mapping+0x18/0x118 lr : __dump_page+0x1c/0x398 Process bash (pid: 491, stack limit = 0x000000004ebd4ecd) Call trace: page_mapping+0x18/0x118 __dump_page+0x1c/0x398 dump_page+0xc/0x18 remove_store+0xbc/0x120 dev_attr_store+0x18/0x28 sysfs_kf_write+0x40/0x50 kernfs_fop_write+0x130/0x1d8 __vfs_write+0x30/0x180 vfs_write+0xb4/0x1a0 ksys_write+0x60/0xd0 __arm64_sys_write+0x18/0x20 el0_svc_common+0x94/0xf8 el0_svc_handler+0x68/0x70 el0_svc+0x8/0xc Code: f9400401 d1000422 f240003f 9a801040 (f9400402) ---[ end trace cdb5eb5bf435cecb ]--- Fix that by not inspecting the mapping until we've determined that it's likely to be valid. Now the above condition still ends up stopping the kernel, but in the correct manner: page:ffffffbf20000000 is uninitialized and poisoned raw: ffffffffffffffff ffffffffffffffff ffffffffffffffff ffffffffffffffff raw: ffffffffffffffff ffffffffffffffff ffffffffffffffff ffffffffffffffff page dumped because: VM_BUG_ON_PAGE(PagePoisoned(p)) ------------[ cut here ]------------ kernel BUG at ./include/linux/mm.h:1006! Internal error: Oops - BUG: 0 [#1] PREEMPT SMP Modules linked in: CPU: 1 PID: 483 Comm: bash Not tainted 5.0.0-rc1+ #3 Hardware name: ARM LTD ARM Juno Development Platform/ARM Juno Development Platform, BIOS EDK II Dec 17 2018 pstate: 40000005 (nZcv daif -PAN -UAO) pc : remove_store+0xbc/0x120 lr : remove_store+0xbc/0x120 ... Link: http://lkml.kernel.org/r/03b53ee9d7e76cda4b9b5e1e31eea080db033396.1550071778.git.robin.murphy@arm.com Fixes: 1c6fb1d89e73 ("mm: print more information about mapping in __dump_page") Signed-off-by: Robin Murphy Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/debug.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/debug.c b/mm/debug.c index 0abb987dad9b..1611cf00a137 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -44,7 +44,7 @@ const struct trace_print_flags vmaflag_names[] = { void __dump_page(struct page *page, const char *reason) { - struct address_space *mapping = page_mapping(page); + struct address_space *mapping; bool page_poisoned = PagePoisoned(page); int mapcount; @@ -58,6 +58,8 @@ void __dump_page(struct page *page, const char *reason) goto hex_only; } + mapping = page_mapping(page); + /* * Avoid VM_BUG_ON() in page_mapcount(). * page->_mapcount space in struct page is used by sl[aou]b pages to From 94b3334cbebea34d56a7e6321c6fe9d89b309a49 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 20 Feb 2019 22:19:49 -0800 Subject: [PATCH 129/298] mm, page_alloc: fix a division by zero error when boosting watermarks v2 Yury Norov reported that an arm64 KVM instance could not boot since after v5.0-rc1 and could addressed by reverting the patches 1c30844d2dfe272d58c ("mm: reclaim small amounts of memory when an external 73444bc4d8f92e46a20 ("mm, page_alloc: do not wake kswapd with zone lock held") The problem is that a division by zero error is possible if boosting occurs very early in boot if the system has very little memory. This patch avoids the division by zero error. Link: http://lkml.kernel.org/r/20190213143012.GT9565@techsingularity.net Fixes: 1c30844d2dfe ("mm: reclaim small amounts of memory when an external fragmentation event occurs") Signed-off-by: Mel Gorman Reported-by: Yury Norov Tested-by: Yury Norov Tested-by: Will Deacon Acked-by: Vlastimil Babka Cc: Andrea Arcangeli Cc: David Rientjes Cc: Michal Hocko Cc: Catalin Marinas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7f79b78bc829..0b9f577b1a2a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2170,6 +2170,18 @@ static inline void boost_watermark(struct zone *zone) max_boost = mult_frac(zone->_watermark[WMARK_HIGH], watermark_boost_factor, 10000); + + /* + * high watermark may be uninitialised if fragmentation occurs + * very early in boot so do not boost. We do not fall + * through and boost by pageblock_nr_pages as failing + * allocations that early means that reclaim is not going + * to help and it may even be impossible to reclaim the + * boosted watermark resulting in a hang. + */ + if (!max_boost) + return; + max_boost = max(pageblock_nr_pages, max_boost); zone->watermark_boost = min(zone->watermark_boost + pageblock_nr_pages, From 6ea183d60c469560e7b08a83c9804299e84ec9eb Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 20 Feb 2019 22:19:54 -0800 Subject: [PATCH 130/298] mm: handle lru_add_drain_all for UP properly Since for_each_cpu(cpu, mask) added by commit 2d3854a37e8b767a ("cpumask: introduce new API, without changing anything") did not evaluate the mask argument if NR_CPUS == 1 due to CONFIG_SMP=n, lru_add_drain_all() is hitting WARN_ON() at __flush_work() added by commit 4d43d395fed12463 ("workqueue: Try to catch flush_work() without INIT_WORK().") by unconditionally calling flush_work() [1]. Workaround this issue by using CONFIG_SMP=n specific lru_add_drain_all implementation. There is no real need to defer the implementation to the workqueue as the draining is going to happen on the local cpu. So alias lru_add_drain_all to lru_add_drain which does all the necessary work. [akpm@linux-foundation.org: fix various build warnings] [1] https://lkml.kernel.org/r/18a30387-6aa5-6123-e67c-57579ecc3f38@roeck-us.net Link: http://lkml.kernel.org/r/20190213124334.GH4525@dhcp22.suse.cz Signed-off-by: Michal Hocko Reported-by: Guenter Roeck Debugged-by: Tetsuo Handa Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/mm/swap.c b/mm/swap.c index 4929bc1be60e..4d7d37eb3c40 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -320,11 +320,6 @@ static inline void activate_page_drain(int cpu) { } -static bool need_activate_page_drain(int cpu) -{ - return false; -} - void activate_page(struct page *page) { struct zone *zone = page_zone(page); @@ -653,13 +648,15 @@ void lru_add_drain(void) put_cpu(); } +#ifdef CONFIG_SMP + +static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work); + static void lru_add_drain_per_cpu(struct work_struct *dummy) { lru_add_drain(); } -static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work); - /* * Doesn't need any cpu hotplug locking because we do rely on per-cpu * kworkers being shut down before our page_alloc_cpu_dead callback is @@ -702,6 +699,12 @@ void lru_add_drain_all(void) mutex_unlock(&lock); } +#else +void lru_add_drain_all(void) +{ + lru_add_drain(); +} +#endif /** * release_pages - batched put_page() From 4e37504d1c49eec6434d0cc97278d2b51c9e8763 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 20 Feb 2019 22:19:59 -0800 Subject: [PATCH 131/298] psi: avoid divide-by-zero crash inside virtual machines MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We've been seeing hard-to-trigger psi crashes when running inside VM instances: divide error: 0000 [#1] SMP PTI Modules linked in: [...] CPU: 0 PID: 212 Comm: kworker/0:2 Not tainted 4.16.18-119_fbk9_3817_gfe944c98d695 #119 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 0.0.0 02/06/2015 Workqueue: events psi_clock RIP: 0010:psi_update_stats+0x270/0x490 RSP: 0018:ffffc90001117e10 EFLAGS: 00010246 RAX: 0000000000000000 RBX: 0000000000000000 RCX: ffff8800a35a13f8 RDX: 0000000000000000 RSI: ffff8800a35a1340 RDI: 0000000000000000 RBP: 0000000000000658 R08: ffff8800a35a1470 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000 R13: 0000000000000000 R14: 0000000000000000 R15: 00000000000f8502 FS: 0000000000000000(0000) GS:ffff88023fc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fbe370fa000 CR3: 00000000b1e3a000 CR4: 00000000000006f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: psi_clock+0x12/0x50 process_one_work+0x1e0/0x390 worker_thread+0x2b/0x3c0 ? rescuer_thread+0x330/0x330 kthread+0x113/0x130 ? kthread_create_worker_on_cpu+0x40/0x40 ? SyS_exit_group+0x10/0x10 ret_from_fork+0x35/0x40 Code: 48 0f 47 c7 48 01 c2 45 85 e4 48 89 16 0f 85 e6 00 00 00 4c 8b 49 10 4c 8b 51 08 49 69 d9 f2 07 00 00 48 6b c0 64 4c 8b 29 31 d2 <48> f7 f7 49 69 d5 8d 06 00 00 48 89 c5 4c 69 f0 00 98 0b 00 48 The Code-line points to `period` being 0 inside update_stats(), and we divide by that when calculating that period's pressure percentage. The elapsed period should never be 0. The reason this can happen is due to an off-by-one in the idle time / missing period calculation combined with a coarse sched_clock() in the virtual machine. The target time for aggregation is advanced into the future on a fixed grid to prevent clock drift. So when an aggregation runs after some idle period, we can not just set it to "now + psi_period", but have to calculate the downtime and advance the target time relative to itself. However, if the aggregator was disabled exactly one psi_period (ns), we drop one idle period in the calculation due to a > when we should do >=. In that case, next_update will be advanced from 'now - psi_period' to 'now' when it should be moved to 'now + psi_period'. The run finishes with last_update == next_update == sched_clock(). With hardware clocks, this exact nanosecond match isn't likely in the first place; but if it does happen, the clock will still have moved on and the period non-zero by the time the worker runs. A pointlessly short period, but besides the extra work, no harm no foul. However, a slow sched_clock() like we have on VMs might not have advanced either by the time the worker runs again. And when we calculate the elapsed period, the result, our pressure divisor, will be 0. Ouch. Fix this by correctly handling the situation when the elapsed time between aggregation runs is precisely two periods, and advance the expiration timestamp correctly to period into the future. Link: http://lkml.kernel.org/r/20190214193157.15788-1-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reported-by: Łukasz Siudut Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched/psi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index c3484785b179..0e97ca9306ef 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -322,7 +322,7 @@ static bool update_stats(struct psi_group *group) expires = group->next_update; if (now < expires) goto out; - if (now - expires > psi_period) + if (now - expires >= psi_period) missed_periods = div_u64(now - expires, psi_period); /* From 1062af920c07f5b54cf5060fde3339da6df0cf6b Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 21 Feb 2019 08:48:09 -0800 Subject: [PATCH 132/298] tmpfs: fix link accounting when a tmpfile is linked in tmpfs has a peculiarity of accounting hard links as if they were separate inodes: so that when the number of inodes is limited, as it is by default, a user cannot soak up an unlimited amount of unreclaimable dcache memory just by repeatedly linking a file. But when v3.11 added O_TMPFILE, and the ability to use linkat() on the fd, we missed accommodating this new case in tmpfs: "df -i" shows that an extra "inode" remains accounted after the file is unlinked and the fd closed and the actual inode evicted. If a user repeatedly links tmpfiles into a tmpfs, the limit will be hit (ENOSPC) even after they are deleted. Just skip the extra reservation from shmem_link() in this case: there's a sense in which this first link of a tmpfile is then cheaper than a hard link of another file, but the accounting works out, and there's still good limiting, so no need to do anything more complicated. Link: http://lkml.kernel.org/r/alpine.LSU.2.11.1902182134370.7035@eggly.anvils Fixes: f4e0c30c191 ("allow the temp files created by open() to be linked to") Signed-off-by: Darrick J. Wong Signed-off-by: Hugh Dickins Reported-by: Matej Kupljen Acked-by: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 6ece1e2fe76e..0905215fb016 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2854,10 +2854,14 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentr * No ordinary (disk based) filesystem counts links as inodes; * but each new link needs a new dentry, pinning lowmem, and * tmpfs dentries cannot be pruned until they are unlinked. + * But if an O_TMPFILE file is linked into the tmpfs, the + * first link must skip that, to get the accounting right. */ - ret = shmem_reserve_inode(inode->i_sb); - if (ret) - goto out; + if (inode->i_nlink) { + ret = shmem_reserve_inode(inode->i_sb); + if (ret) + goto out; + } dir->i_size += BOGO_DIRENT_SIZE; inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode); From 3f41b609382388f95c0a05b69b8db0d706adafb4 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 20 Feb 2019 22:20:15 -0800 Subject: [PATCH 133/298] kasan: fix random seed generation for tag-based mode There are two issues with assigning random percpu seeds right now: 1. We use for_each_possible_cpu() to iterate over cpus, but cpumask is not set up yet at the moment of kasan_init(), and thus we only set the seed for cpu #0. 2. A call to get_random_u32() always returns the same number and produces a message in dmesg, since the random subsystem is not yet initialized. Fix 1 by calling kasan_init_tags() after cpumask is set up. Fix 2 by using get_cycles() instead of get_random_u32(). This gives us lower quality random numbers, but it's good enough, as KASAN is meant to be used as a debugging tool and not a mitigation. Link: http://lkml.kernel.org/r/1f815cc914b61f3516ed4cc9bfd9eeca9bd5d9de.1550677973.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Cc: Catalin Marinas Cc: Will Deacon Cc: Andrey Ryabinin Cc: Alexander Potapenko Cc: Dmitry Vyukov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/kernel/setup.c | 3 +++ arch/arm64/mm/kasan_init.c | 2 -- mm/kasan/tags.c | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index d09ec76f08cf..009849328289 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -339,6 +339,9 @@ void __init setup_arch(char **cmdline_p) smp_init_cpus(); smp_build_mpidr_hash(); + /* Init percpu seeds for random tags after cpus are set up. */ + kasan_init_tags(); + #ifdef CONFIG_ARM64_SW_TTBR0_PAN /* * Make sure init_thread_info.ttbr0 always generates translation diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c index 4b55b15707a3..f37a86d2a69d 100644 --- a/arch/arm64/mm/kasan_init.c +++ b/arch/arm64/mm/kasan_init.c @@ -252,8 +252,6 @@ void __init kasan_init(void) memset(kasan_early_shadow_page, KASAN_SHADOW_INIT, PAGE_SIZE); cpu_replace_ttbr1(lm_alias(swapper_pg_dir)); - kasan_init_tags(); - /* At this point kasan is fully initialized. Enable error messages */ init_task.kasan_depth = 0; pr_info("KernelAddressSanitizer initialized\n"); diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c index 0777649e07c4..63fca3172659 100644 --- a/mm/kasan/tags.c +++ b/mm/kasan/tags.c @@ -46,7 +46,7 @@ void kasan_init_tags(void) int cpu; for_each_possible_cpu(cpu) - per_cpu(prng_state, cpu) = get_random_u32(); + per_cpu(prng_state, cpu) = (u32)get_cycles(); } /* From dc15a8a2543cb9ebe67998eefe2880ce1a20d42e Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 20 Feb 2019 22:20:20 -0800 Subject: [PATCH 134/298] kasan: prevent tracing of tags.c Similarly to commit 0d0c8de8788b ("kasan: mark file common so ftrace doesn't trace it") add the -pg flag to mm/kasan/tags.c to prevent conflicts with tracing. Link: http://lkml.kernel.org/r/9c4c3ce5ccfb894c7fe66d91de7c1da2787b4da4.1550602886.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reported-by: Qian Cai Tested-by: Qian Cai Cc: Andrey Ryabinin Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Catalin Marinas Cc: Vincenzo Frascino Cc: Kostya Serebryany Cc: Evgeniy Stepanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile index e2bb06c1b45e..5d1065efbd47 100644 --- a/mm/kasan/Makefile +++ b/mm/kasan/Makefile @@ -7,6 +7,8 @@ KCOV_INSTRUMENT := n CFLAGS_REMOVE_common.o = -pg CFLAGS_REMOVE_generic.o = -pg +CFLAGS_REMOVE_tags.o = -pg + # Function splitter causes unnecessary splits in __asan_load1/__asan_store1 # see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63533 From 219667c23c68eb3dbc0d5662b9246f28477fe529 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 20 Feb 2019 22:20:25 -0800 Subject: [PATCH 135/298] kasan, slab: fix conflicts with CONFIG_HARDENED_USERCOPY Similarly to commit 96fedce27e13 ("kasan: make tag based mode work with CONFIG_HARDENED_USERCOPY"), we need to reset pointer tags in __check_heap_object() in mm/slab.c before doing any pointer math. Link: http://lkml.kernel.org/r/9a5c0f958db10e69df5ff9f2b997866b56b7effc.1550602886.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Tested-by: Qian Cai Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgeniy Stepanov Cc: Kostya Serebryany Cc: Vincenzo Frascino Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/slab.c b/mm/slab.c index 78eb8c5bf4e4..c84458281a88 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -4408,6 +4408,8 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page, unsigned int objnr; unsigned long offset; + ptr = kasan_reset_tag(ptr); + /* Find and validate object. */ cachep = page->slab_cache; objnr = obj_to_index(cachep, page, (void *)ptr); From 51dedad06b5f6c3eea7ec1069631b1ef7796912a Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 20 Feb 2019 22:20:28 -0800 Subject: [PATCH 136/298] kasan, slab: make freelist stored without tags Similarly to "kasan, slub: move kasan_poison_slab hook before page_address", move kasan_poison_slab() before alloc_slabmgmt(), which calls page_address(), to make page_address() return value to be non-tagged. This, combined with calling kasan_reset_tag() for off-slab slab management object, leads to freelist being stored non-tagged. Link: http://lkml.kernel.org/r/dfb53b44a4d00de3879a05a9f04c1f55e584f7a1.1550602886.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Tested-by: Qian Cai Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgeniy Stepanov Cc: Kostya Serebryany Cc: Vincenzo Frascino Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/mm/slab.c b/mm/slab.c index c84458281a88..4ad95fcb1686 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2359,7 +2359,7 @@ static void *alloc_slabmgmt(struct kmem_cache *cachep, void *freelist; void *addr = page_address(page); - page->s_mem = kasan_reset_tag(addr) + colour_off; + page->s_mem = addr + colour_off; page->active = 0; if (OBJFREELIST_SLAB(cachep)) @@ -2368,6 +2368,7 @@ static void *alloc_slabmgmt(struct kmem_cache *cachep, /* Slab management obj is off-slab. */ freelist = kmem_cache_alloc_node(cachep->freelist_cache, local_flags, nodeid); + freelist = kasan_reset_tag(freelist); if (!freelist) return NULL; } else { @@ -2681,6 +2682,13 @@ static struct page *cache_grow_begin(struct kmem_cache *cachep, offset *= cachep->colour_off; + /* + * Call kasan_poison_slab() before calling alloc_slabmgmt(), so + * page_address() in the latter returns a non-tagged pointer, + * as it should be for slab pages. + */ + kasan_poison_slab(page); + /* Get slab management. */ freelist = alloc_slabmgmt(cachep, page, offset, local_flags & ~GFP_CONSTRAINT_MASK, page_node); @@ -2689,7 +2697,6 @@ static struct page *cache_grow_begin(struct kmem_cache *cachep, slab_map_pages(cachep, page, freelist); - kasan_poison_slab(page); cache_init_objs(cachep, page); if (gfpflags_allow_blocking(local_flags)) From 557ea25383a231fe3ffc72881ada35c24b960dbc Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 20 Feb 2019 22:20:33 -0800 Subject: [PATCH 137/298] kasan, slab: remove redundant kasan_slab_alloc hooks kasan_slab_alloc() calls in kmem_cache_alloc() and kmem_cache_alloc_node() are redundant as they are already called via slab_alloc/slab_alloc_node()-> slab_post_alloc_hook()->kasan_slab_alloc(). Remove them. Link: http://lkml.kernel.org/r/4ca1655cdcfc4379c49c50f7bf80f81c4ad01485.1550602886.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Tested-by: Qian Cai Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgeniy Stepanov Cc: Kostya Serebryany Cc: Vincenzo Frascino Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/slab.c b/mm/slab.c index 4ad95fcb1686..91c1863df93d 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3547,7 +3547,6 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) { void *ret = slab_alloc(cachep, flags, _RET_IP_); - ret = kasan_slab_alloc(cachep, ret, flags); trace_kmem_cache_alloc(_RET_IP_, ret, cachep->object_size, cachep->size, flags); @@ -3637,7 +3636,6 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) { void *ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_); - ret = kasan_slab_alloc(cachep, ret, flags); trace_kmem_cache_alloc_node(_RET_IP_, ret, cachep->object_size, cachep->size, flags, nodeid); From 6373dca16c911b2828ef8d836d7f6f1800e1bbbc Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Wed, 20 Feb 2019 22:20:37 -0800 Subject: [PATCH 138/298] slub: fix a crash with SLUB_DEBUG + KASAN_SW_TAGS In process_slab(), "p = get_freepointer()" could return a tagged pointer, but "addr = page_address()" always return a native pointer. As the result, slab_index() is messed up here, return (p - addr) / s->size; All other callers of slab_index() have the same situation where "addr" is from page_address(), so just need to untag "p". # cat /sys/kernel/slab/hugetlbfs_inode_cache/alloc_calls Unable to handle kernel paging request at virtual address 2bff808aa4856d48 Mem abort info: ESR = 0x96000007 Exception class = DABT (current EL), IL = 32 bits SET = 0, FnV = 0 EA = 0, S1PTW = 0 Data abort info: ISV = 0, ISS = 0x00000007 CM = 0, WnR = 0 swapper pgtable: 64k pages, 48-bit VAs, pgdp = 0000000002498338 [2bff808aa4856d48] pgd=00000097fcfd0003, pud=00000097fcfd0003, pmd=00000097fca30003, pte=00e8008b24850712 Internal error: Oops: 96000007 [#1] SMP CPU: 3 PID: 79210 Comm: read_all Tainted: G L 5.0.0-rc7+ #84 Hardware name: HPE Apollo 70 /C01_APACHE_MB , BIOS L50_5.13_1.0.6 07/10/2018 pstate: 00400089 (nzcv daIf +PAN -UAO) pc : get_map+0x78/0xec lr : get_map+0xa0/0xec sp : aeff808989e3f8e0 x29: aeff808989e3f940 x28: ffff800826200000 x27: ffff100012d47000 x26: 9700000000002500 x25: 0000000000000001 x24: 52ff8008200131f8 x23: 52ff8008200130a0 x22: 52ff800820013098 x21: ffff800826200000 x20: ffff100013172ba0 x19: 2bff808a8971bc00 x18: ffff1000148f5538 x17: 000000000000001b x16: 00000000000000ff x15: ffff1000148f5000 x14: 00000000000000d2 x13: 0000000000000001 x12: 0000000000000000 x11: 0000000020000002 x10: 2bff808aa4856d48 x9 : 0000020000000000 x8 : 68ff80082620ebb0 x7 : 0000000000000000 x6 : ffff1000105da1dc x5 : 0000000000000000 x4 : 0000000000000000 x3 : 0000000000000010 x2 : 2bff808a8971bc00 x1 : ffff7fe002098800 x0 : ffff80082620ceb0 Process read_all (pid: 79210, stack limit = 0x00000000f65b9361) Call trace: get_map+0x78/0xec process_slab+0x7c/0x47c list_locations+0xb0/0x3c8 alloc_calls_show+0x34/0x40 slab_attr_show+0x34/0x48 sysfs_kf_seq_show+0x2e4/0x570 kernfs_seq_show+0x12c/0x1a0 seq_read+0x48c/0xf84 kernfs_fop_read+0xd4/0x448 __vfs_read+0x94/0x5d4 vfs_read+0xcc/0x194 ksys_read+0x6c/0xe8 __arm64_sys_read+0x68/0xb0 el0_svc_handler+0x230/0x3bc el0_svc+0x8/0xc Code: d3467d2a 9ac92329 8b0a0e6a f9800151 (c85f7d4b) ---[ end trace a383a9a44ff13176 ]--- Kernel panic - not syncing: Fatal exception SMP: stopping secondary CPUs SMP: failed to stop secondary CPUs 1-7,32,40,127 Kernel Offset: disabled CPU features: 0x002,20000c18 Memory Limit: none ---[ end Kernel panic - not syncing: Fatal exception ]--- Link: http://lkml.kernel.org/r/20190220020251.82039-1-cai@lca.pw Signed-off-by: Qian Cai Reviewed-by: Andrey Konovalov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/slub.c b/mm/slub.c index 2d2830134e60..dc777761b6b7 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -317,7 +317,7 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) /* Determine object index from a given position */ static inline unsigned int slab_index(void *p, struct kmem_cache *s, void *addr) { - return (p - addr) / s->size; + return (kasan_reset_tag(p) - addr) / s->size; } static inline unsigned int order_objects(unsigned int order, unsigned int size) From 6c8fcc096be9d02f478c508052a41a4430506ab3 Mon Sep 17 00:00:00 2001 From: Daniel Vetter Date: Wed, 20 Feb 2019 22:20:42 -0800 Subject: [PATCH 139/298] mm: don't let userspace spam allocations warnings memdump_user usually gets fed unchecked userspace input. Blasting a full backtrace into dmesg every time is a bit excessive - I'm not sure on the kernel rule in general, but at least in drm we're trying not to let unpriviledge userspace spam the logs freely. Definitely not entire warning backtraces. It also means more filtering for our CI, because our testsuite exercises these corner cases and so hits these a lot. Link: http://lkml.kernel.org/r/20190220204058.11676-1-daniel.vetter@ffwll.ch Signed-off-by: Daniel Vetter Reviewed-by: Andrew Morton Acked-by: Michal Hocko Reviewed-by: Kees Cook Cc: Mike Rapoport Cc: Roman Gushchin Cc: Vlastimil Babka Cc: Jan Stancek Cc: Andrey Ryabinin Cc: "Michael S. Tsirkin" Cc: Huang Ying Cc: Bartosz Golaszewski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/util.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/util.c b/mm/util.c index 1ea055138043..379319b1bcfd 100644 --- a/mm/util.c +++ b/mm/util.c @@ -150,7 +150,7 @@ void *memdup_user(const void __user *src, size_t len) { void *p; - p = kmalloc_track_caller(len, GFP_USER); + p = kmalloc_track_caller(len, GFP_USER | __GFP_NOWARN); if (!p) return ERR_PTR(-ENOMEM); From 891cb2a72d821f930a39d5900cb7a3aa752c1d5b Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 20 Feb 2019 22:20:46 -0800 Subject: [PATCH 140/298] mm, memory_hotplug: fix off-by-one in is_pageblock_removable Rong Chen has reported the following boot crash: PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP PTI CPU: 1 PID: 239 Comm: udevd Not tainted 5.0.0-rc4-00149-gefad4e4 #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014 RIP: 0010:page_mapping+0x12/0x80 Code: 5d c3 48 89 df e8 0e ad 02 00 85 c0 75 da 89 e8 5b 5d c3 0f 1f 44 00 00 53 48 89 fb 48 8b 43 08 48 8d 50 ff a8 01 48 0f 45 da <48> 8b 53 08 48 8d 42 ff 83 e2 01 48 0f 44 c3 48 83 38 ff 74 2f 48 RSP: 0018:ffff88801fa87cd8 EFLAGS: 00010202 RAX: ffffffffffffffff RBX: fffffffffffffffe RCX: 000000000000000a RDX: fffffffffffffffe RSI: ffffffff820b9a20 RDI: ffff88801e5c0000 RBP: 6db6db6db6db6db7 R08: ffff88801e8bb000 R09: 0000000001b64d13 R10: ffff88801fa87cf8 R11: 0000000000000001 R12: ffff88801e640000 R13: ffffffff820b9a20 R14: ffff88801f145258 R15: 0000000000000001 FS: 00007fb2079817c0(0000) GS:ffff88801dd00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000006 CR3: 000000001fa82000 CR4: 00000000000006a0 Call Trace: __dump_page+0x14/0x2c0 is_mem_section_removable+0x24c/0x2c0 removable_show+0x87/0xa0 dev_attr_show+0x25/0x60 sysfs_kf_seq_show+0xba/0x110 seq_read+0x196/0x3f0 __vfs_read+0x34/0x180 vfs_read+0xa0/0x150 ksys_read+0x44/0xb0 do_syscall_64+0x5e/0x4a0 entry_SYSCALL_64_after_hwframe+0x49/0xbe and bisected it down to commit efad4e475c31 ("mm, memory_hotplug: is_mem_section_removable do not pass the end of a zone"). The reason for the crash is that the mapping is garbage for poisoned (uninitialized) page. This shouldn't happen as all pages in the zone's boundary should be initialized. Later debugging revealed that the actual problem is an off-by-one when evaluating the end_page. 'start_pfn + nr_pages' resp 'zone_end_pfn' refers to a pfn after the range and as such it might belong to a differen memory section. This along with CONFIG_SPARSEMEM then makes the loop condition completely bogus because a pointer arithmetic doesn't work for pages from two different sections in that memory model. Fix the issue by reworking is_pageblock_removable to be pfn based and only use struct page where necessary. This makes the code slightly easier to follow and we will remove the problematic pointer arithmetic completely. Link: http://lkml.kernel.org/r/20190218181544.14616-1-mhocko@kernel.org Fixes: efad4e475c31 ("mm, memory_hotplug: is_mem_section_removable do not pass the end of a zone") Signed-off-by: Michal Hocko Reported-by: Tested-by: Acked-by: Mike Rapoport Reviewed-by: Oscar Salvador Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 124e794867c5..1ad28323fb9f 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1188,11 +1188,13 @@ static inline int pageblock_free(struct page *page) return PageBuddy(page) && page_order(page) >= pageblock_order; } -/* Return the start of the next active pageblock after a given page */ -static struct page *next_active_pageblock(struct page *page) +/* Return the pfn of the start of the next active pageblock after a given pfn */ +static unsigned long next_active_pageblock(unsigned long pfn) { + struct page *page = pfn_to_page(pfn); + /* Ensure the starting page is pageblock-aligned */ - BUG_ON(page_to_pfn(page) & (pageblock_nr_pages - 1)); + BUG_ON(pfn & (pageblock_nr_pages - 1)); /* If the entire pageblock is free, move to the end of free page */ if (pageblock_free(page)) { @@ -1200,16 +1202,16 @@ static struct page *next_active_pageblock(struct page *page) /* be careful. we don't have locks, page_order can be changed.*/ order = page_order(page); if ((order < MAX_ORDER) && (order >= pageblock_order)) - return page + (1 << order); + return pfn + (1 << order); } - return page + pageblock_nr_pages; + return pfn + pageblock_nr_pages; } -static bool is_pageblock_removable_nolock(struct page *page) +static bool is_pageblock_removable_nolock(unsigned long pfn) { + struct page *page = pfn_to_page(pfn); struct zone *zone; - unsigned long pfn; /* * We have to be careful here because we are iterating over memory @@ -1232,13 +1234,14 @@ static bool is_pageblock_removable_nolock(struct page *page) /* Checks if this range of memory is likely to be hot-removable. */ bool is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages) { - struct page *page = pfn_to_page(start_pfn); - unsigned long end_pfn = min(start_pfn + nr_pages, zone_end_pfn(page_zone(page))); - struct page *end_page = pfn_to_page(end_pfn); + unsigned long end_pfn, pfn; + + end_pfn = min(start_pfn + nr_pages, + zone_end_pfn(page_zone(pfn_to_page(start_pfn)))); /* Check the starting page of each pageblock within the range */ - for (; page < end_page; page = next_active_pageblock(page)) { - if (!is_pageblock_removable_nolock(page)) + for (pfn = start_pfn; pfn < end_pfn; pfn = next_active_pageblock(pfn)) { + if (!is_pageblock_removable_nolock(pfn)) return false; cond_resched(); } From 193f3685d0546b0cea20c99894aadb70098e47bf Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 21 Feb 2019 11:19:41 +0100 Subject: [PATCH 141/298] ipv6: route: enforce RCU protection in rt6_update_exception_stamp_rt() We must access rt6_info->from under RCU read lock: move the dereference under such lock, with proper annotation. v1 -> v2: - avoid using multiple, racy, fetch operations for rt->from Fixes: a68886a69180 ("net/ipv6: Make from in rt6_info rcu protected") Signed-off-by: Paolo Abeni Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/route.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 964491cf3672..2b1ed8c6fcab 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1599,15 +1599,15 @@ static int rt6_remove_exception_rt(struct rt6_info *rt) static void rt6_update_exception_stamp_rt(struct rt6_info *rt) { struct rt6_exception_bucket *bucket; - struct fib6_info *from = rt->from; struct in6_addr *src_key = NULL; struct rt6_exception *rt6_ex; - - if (!from || - !(rt->rt6i_flags & RTF_CACHE)) - return; + struct fib6_info *from; rcu_read_lock(); + from = rcu_dereference(rt->from); + if (!from || !(rt->rt6i_flags & RTF_CACHE)) + goto unlock; + bucket = rcu_dereference(from->rt6i_exception_bucket); #ifdef CONFIG_IPV6_SUBTREES @@ -1626,6 +1626,7 @@ static void rt6_update_exception_stamp_rt(struct rt6_info *rt) if (rt6_ex) rt6_ex->stamp = jiffies; +unlock: rcu_read_unlock(); } From bf1dc8bad1d42287164d216d8efb51c5cd381b18 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 21 Feb 2019 11:19:42 +0100 Subject: [PATCH 142/298] ipv6: route: enforce RCU protection in ip6_route_check_nh_onlink() We need a RCU critical section around rt6_info->from deference, and proper annotation. Fixes: 4ed591c8ab44 ("net/ipv6: Allow onlink routes to have a device mismatch if it is the default route") Signed-off-by: Paolo Abeni Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/route.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 2b1ed8c6fcab..74b9b6fd4168 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2743,20 +2743,24 @@ static int ip6_route_check_nh_onlink(struct net *net, u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN; const struct in6_addr *gw_addr = &cfg->fc_gateway; u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT; + struct fib6_info *from; struct rt6_info *grt; int err; err = 0; grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0); if (grt) { + rcu_read_lock(); + from = rcu_dereference(grt->from); if (!grt->dst.error && /* ignore match if it is the default route */ - grt->from && !ipv6_addr_any(&grt->from->fib6_dst.addr) && + from && !ipv6_addr_any(&from->fib6_dst.addr) && (grt->rt6i_flags & flags || dev != grt->dst.dev)) { NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway or device mismatch"); err = -EINVAL; } + rcu_read_unlock(); ip6_rt_put(grt); } From d7cf4a3bf3a83c977a29055e1c4ffada7697b31f Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Thu, 21 Feb 2019 12:56:54 +0100 Subject: [PATCH 143/298] net/smc: fix smc_poll in SMC_INIT state smc_poll() returns with mask bit EPOLLPRI if the connection urg_state is SMC_URG_VALID. Since SMC_URG_VALID is zero, smc_poll signals EPOLLPRI errorneously if called in state SMC_INIT before the connection is created, for instance in a non-blocking connect scenario. This patch switches to non-zero values for the urg states. Reviewed-by: Karsten Graul Fixes: de8474eb9d50 ("net/smc: urgent data support") Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/smc.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/smc/smc.h b/net/smc/smc.h index 5721416d0605..adbdf195eb08 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -113,9 +113,9 @@ struct smc_host_cdc_msg { /* Connection Data Control message */ } __aligned(8); enum smc_urg_state { - SMC_URG_VALID, /* data present */ - SMC_URG_NOTYET, /* data pending */ - SMC_URG_READ /* data was already read */ + SMC_URG_VALID = 1, /* data present */ + SMC_URG_NOTYET = 2, /* data pending */ + SMC_URG_READ = 3, /* data was already read */ }; struct smc_connection { From 156a67a9065e3339be85f811d1b13b920e50d73b Mon Sep 17 00:00:00 2001 From: Jeff Kirsher Date: Mon, 28 Jan 2019 09:45:01 -0800 Subject: [PATCH 144/298] ixgbe: fix older devices that do not support IXGBE_MRQC_L3L4TXSWEN The enabling L3/L4 filtering for transmit switched packets for all devices caused unforeseen issue on older devices when trying to send UDP traffic in an ordered sequence. This bit was originally intended for X550 devices, which supported this feature, so limit the scope of this bit to only X550 devices. Signed-off-by: Jeff Kirsher Tested-by: Andrew Bowers --- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index daff8183534b..3cbb7e0324fd 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -3953,8 +3953,11 @@ static void ixgbe_setup_mrqc(struct ixgbe_adapter *adapter) else mrqc = IXGBE_MRQC_VMDQRSS64EN; - /* Enable L3/L4 for Tx Switched packets */ - mrqc |= IXGBE_MRQC_L3L4TXSWEN; + /* Enable L3/L4 for Tx Switched packets only for X550, + * older devices do not support this feature + */ + if (hw->mac.type >= ixgbe_mac_X550) + mrqc |= IXGBE_MRQC_L3L4TXSWEN; } else { if (tcs > 4) mrqc = IXGBE_MRQC_RTRSS8TCEN; From 14ffeb52f3693ae0b674e59453452a2365ae9fd9 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Tue, 29 Jan 2019 15:03:17 +0100 Subject: [PATCH 145/298] i40e: fix potential RX buffer starvation for AF_XDP When the RX rings are created they are also populated with buffers so that packets can be received. Usually these are kernel buffers, but for AF_XDP in zero-copy mode, these are user-space buffers and in this case the application might not have sent down any buffers to the driver at this point. And if no buffers are allocated at ring creation time, no packets can be received and no interrupts will be generated so the NAPI poll function that allocates buffers to the rings will never get executed. To rectify this, we kick the NAPI context of any queue with an attached AF_XDP zero-copy socket in two places in the code. Once after an XDP program has loaded and once after the umem is registered. This take care of both cases: XDP program gets loaded first then AF_XDP socket is created, and the reverse, AF_XDP socket is created first, then XDP program is loaded. Fixes: 0a714186d3c0 ("i40e: add AF_XDP zero-copy Rx support") Signed-off-by: Magnus Karlsson Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/i40e/i40e_main.c | 13 ++++++++++++- drivers/net/ethernet/intel/i40e/i40e_xsk.c | 5 +++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index f52e2c46e6a7..3a0990de81c1 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -3289,8 +3289,11 @@ static int i40e_configure_rx_ring(struct i40e_ring *ring) i40e_alloc_rx_buffers_zc(ring, I40E_DESC_UNUSED(ring)) : !i40e_alloc_rx_buffers(ring, I40E_DESC_UNUSED(ring)); if (!ok) { + /* Log this in case the user has forgotten to give the kernel + * any buffers, even later in the application. + */ dev_info(&vsi->back->pdev->dev, - "Failed allocate some buffers on %sRx ring %d (pf_q %d)\n", + "Failed to allocate some buffers on %sRx ring %d (pf_q %d)\n", ring->xsk_umem ? "UMEM enabled " : "", ring->queue_index, pf_q); } @@ -11895,6 +11898,14 @@ static int i40e_xdp_setup(struct i40e_vsi *vsi, if (old_prog) bpf_prog_put(old_prog); + /* Kick start the NAPI context if there is an AF_XDP socket open + * on that queue id. This so that receiving will start. + */ + if (need_reset && prog) + for (i = 0; i < vsi->num_queue_pairs; i++) + if (vsi->xdp_rings[i]->xsk_umem) + (void)i40e_xsk_async_xmit(vsi->netdev, i); + return 0; } diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c index 870cf654e436..3827f16e6923 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c +++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c @@ -183,6 +183,11 @@ static int i40e_xsk_umem_enable(struct i40e_vsi *vsi, struct xdp_umem *umem, err = i40e_queue_pair_enable(vsi, qid); if (err) return err; + + /* Kick start the NAPI context so that receiving will start */ + err = i40e_xsk_async_xmit(vsi->netdev, qid); + if (err) + return err; } return 0; From 4a9b32f30f805ca596d76605903a48eab58e0b88 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Tue, 29 Jan 2019 15:03:50 +0100 Subject: [PATCH 146/298] ixgbe: fix potential RX buffer starvation for AF_XDP When the RX rings are created they are also populated with buffers so that packets can be received. Usually these are kernel buffers, but for AF_XDP in zero-copy mode, these are user-space buffers and in this case the application might not have sent down any buffers to the driver at this point. And if no buffers are allocated at ring creation time, no packets can be received and no interrupts will be generated so the NAPI poll function that allocates buffers to the rings will never get executed. To rectify this, we kick the NAPI context of any queue with an attached AF_XDP zero-copy socket in two places in the code. Once after an XDP program has loaded and once after the umem is registered. This take care of both cases: XDP program gets loaded first then AF_XDP socket is created, and the reverse, AF_XDP socket is created first, then XDP program is loaded. Fixes: d0bcacd0a130 ("ixgbe: add AF_XDP zero-copy Rx support") Signed-off-by: Magnus Karlsson Tested-by: Andrew Bowers Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 12 +++++++++++- drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c | 12 ++++++++++-- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 3cbb7e0324fd..cb35d8202572 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -10228,6 +10228,7 @@ static int ixgbe_xdp_setup(struct net_device *dev, struct bpf_prog *prog) int i, frame_size = dev->mtu + ETH_HLEN + ETH_FCS_LEN + VLAN_HLEN; struct ixgbe_adapter *adapter = netdev_priv(dev); struct bpf_prog *old_prog; + bool need_reset; if (adapter->flags & IXGBE_FLAG_SRIOV_ENABLED) return -EINVAL; @@ -10250,9 +10251,10 @@ static int ixgbe_xdp_setup(struct net_device *dev, struct bpf_prog *prog) return -ENOMEM; old_prog = xchg(&adapter->xdp_prog, prog); + need_reset = (!!prog != !!old_prog); /* If transitioning XDP modes reconfigure rings */ - if (!!prog != !!old_prog) { + if (need_reset) { int err = ixgbe_setup_tc(dev, adapter->hw_tcs); if (err) { @@ -10268,6 +10270,14 @@ static int ixgbe_xdp_setup(struct net_device *dev, struct bpf_prog *prog) if (old_prog) bpf_prog_put(old_prog); + /* Kick start the NAPI context if there is an AF_XDP socket open + * on that queue id. This so that receiving will start. + */ + if (need_reset && prog) + for (i = 0; i < adapter->num_rx_queues; i++) + if (adapter->xdp_ring[i]->xsk_umem) + (void)ixgbe_xsk_async_xmit(adapter->netdev, i); + return 0; } diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c index 65c3e2c979d4..654ae92342ea 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c @@ -144,11 +144,19 @@ static int ixgbe_xsk_umem_enable(struct ixgbe_adapter *adapter, ixgbe_txrx_ring_disable(adapter, qid); err = ixgbe_add_xsk_umem(adapter, umem, qid); + if (err) + return err; - if (if_running) + if (if_running) { ixgbe_txrx_ring_enable(adapter, qid); - return err; + /* Kick start the NAPI context so that receiving will start */ + err = ixgbe_xsk_async_xmit(adapter->netdev, qid); + if (err) + return err; + } + + return 0; } static int ixgbe_xsk_umem_disable(struct ixgbe_adapter *adapter, u16 qid) From 252f6e8eae909bc075a1b1e3b9efb095ae4c0b56 Mon Sep 17 00:00:00 2001 From: Eugeniy Paltsev Date: Wed, 16 Jan 2019 14:29:50 +0300 Subject: [PATCH 147/298] ARCv2: Enable unaligned access in early ASM code It is currently done in arc_init_IRQ() which might be too late considering gcc 7.3.1 onwards (GNU 2018.03) generates unaligned memory accesses by default Cc: stable@vger.kernel.org #4.4+ Signed-off-by: Eugeniy Paltsev Signed-off-by: Vineet Gupta [vgupta: rewrote changelog] --- arch/arc/kernel/head.S | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/arch/arc/kernel/head.S b/arch/arc/kernel/head.S index 8b90d25a15cc..26e33a8b2d18 100644 --- a/arch/arc/kernel/head.S +++ b/arch/arc/kernel/head.S @@ -17,6 +17,7 @@ #include #include #include +#include .macro CPU_EARLY_SETUP @@ -47,6 +48,15 @@ sr r5, [ARC_REG_DC_CTRL] 1: + +#ifdef CONFIG_ISA_ARCV2 + ; Unaligned access is disabled at reset, so re-enable early as + ; gcc 7.3.1 (ARC GNU 2018.03) onwards generates unaligned access + ; by default + lr r5, [status32] + bset r5, r5, STATUS_AD_BIT + kflag r5 +#endif .endm .section .init.text, "ax",@progbits From f8a15f97664178f27dfbf86a38f780a532cb6df0 Mon Sep 17 00:00:00 2001 From: Eugeniy Paltsev Date: Wed, 30 Jan 2019 19:32:40 +0300 Subject: [PATCH 148/298] ARCv2: lib: memcpy: fix doing prefetchw outside of buffer ARCv2 optimized memcpy uses PREFETCHW instruction for prefetching the next cache line but doesn't ensure that the line is not past the end of the buffer. PRETECHW changes the line ownership and marks it dirty, which can cause data corruption if this area is used for DMA IO. Fix the issue by avoiding the PREFETCHW. This leads to performance degradation but it is OK as we'll introduce new memcpy implementation optimized for unaligned memory access using. We also cut off all PREFETCH instructions at they are quite useless here: * we call PREFETCH right before LOAD instruction call. * we copy 16 or 32 bytes of data (depending on CONFIG_ARC_HAS_LL64) in a main logical loop. so we call PREFETCH 4 times (or 2 times) for each L1 cache line (in case of 64B L1 cache Line which is default case). Obviously this is not optimal. Signed-off-by: Eugeniy Paltsev Signed-off-by: Vineet Gupta --- arch/arc/lib/memcpy-archs.S | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/arch/arc/lib/memcpy-archs.S b/arch/arc/lib/memcpy-archs.S index d61044dd8b58..ea14b0bf3116 100644 --- a/arch/arc/lib/memcpy-archs.S +++ b/arch/arc/lib/memcpy-archs.S @@ -25,15 +25,11 @@ #endif #ifdef CONFIG_ARC_HAS_LL64 -# define PREFETCH_READ(RX) prefetch [RX, 56] -# define PREFETCH_WRITE(RX) prefetchw [RX, 64] # define LOADX(DST,RX) ldd.ab DST, [RX, 8] # define STOREX(SRC,RX) std.ab SRC, [RX, 8] # define ZOLSHFT 5 # define ZOLAND 0x1F #else -# define PREFETCH_READ(RX) prefetch [RX, 28] -# define PREFETCH_WRITE(RX) prefetchw [RX, 32] # define LOADX(DST,RX) ld.ab DST, [RX, 4] # define STOREX(SRC,RX) st.ab SRC, [RX, 4] # define ZOLSHFT 4 @@ -41,8 +37,6 @@ #endif ENTRY_CFI(memcpy) - prefetch [r1] ; Prefetch the read location - prefetchw [r0] ; Prefetch the write location mov.f 0, r2 ;;; if size is zero jz.d [blink] @@ -72,8 +66,6 @@ ENTRY_CFI(memcpy) lpnz @.Lcopy32_64bytes ;; LOOP START LOADX (r6, r1) - PREFETCH_READ (r1) - PREFETCH_WRITE (r3) LOADX (r8, r1) LOADX (r10, r1) LOADX (r4, r1) @@ -117,9 +109,7 @@ ENTRY_CFI(memcpy) lpnz @.Lcopy8bytes_1 ;; LOOP START ld.ab r6, [r1, 4] - prefetch [r1, 28] ;Prefetch the next read location ld.ab r8, [r1,4] - prefetchw [r3, 32] ;Prefetch the next write location SHIFT_1 (r7, r6, 24) or r7, r7, r5 @@ -162,9 +152,7 @@ ENTRY_CFI(memcpy) lpnz @.Lcopy8bytes_2 ;; LOOP START ld.ab r6, [r1, 4] - prefetch [r1, 28] ;Prefetch the next read location ld.ab r8, [r1,4] - prefetchw [r3, 32] ;Prefetch the next write location SHIFT_1 (r7, r6, 16) or r7, r7, r5 @@ -204,9 +192,7 @@ ENTRY_CFI(memcpy) lpnz @.Lcopy8bytes_3 ;; LOOP START ld.ab r6, [r1, 4] - prefetch [r1, 28] ;Prefetch the next read location ld.ab r8, [r1,4] - prefetchw [r3, 32] ;Prefetch the next write location SHIFT_1 (r7, r6, 8) or r7, r7, r5 From cdf92962adb0cb23efc3c8bcf6465d16ab7c3a81 Mon Sep 17 00:00:00 2001 From: Eugeniy Paltsev Date: Mon, 4 Feb 2019 21:41:51 +0300 Subject: [PATCH 149/298] ARC: fix actionpoints configuration detection Fix reversed logic while actionpoints configuration (full/min) detection. Fixies: 7dd380c338f1e ("ARC: boot log: print Action point details") Signed-off-by: Eugeniy Paltsev Signed-off-by: Vineet Gupta --- arch/arc/kernel/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arc/kernel/setup.c b/arch/arc/kernel/setup.c index feb90093e6b1..def19b0ef8c6 100644 --- a/arch/arc/kernel/setup.c +++ b/arch/arc/kernel/setup.c @@ -212,7 +212,7 @@ static void read_arc_build_cfg_regs(void) READ_BCR(ARC_REG_AP_BCR, ap); if (ap.ver) { cpu->extn.ap_num = 2 << ap.num; - cpu->extn.ap_full = !!ap.min; + cpu->extn.ap_full = !ap.min; } READ_BCR(ARC_REG_SMART_BCR, bcr); From d5e3c55e01d8b1774b37b4647c30fb22f1d39077 Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Tue, 5 Feb 2019 10:07:07 -0800 Subject: [PATCH 150/298] ARC: uacces: remove lp_start, lp_end from clobber list Newer ARC gcc handles lp_start, lp_end in a different way and doesn't like them in the clobber list. Signed-off-by: Vineet Gupta --- arch/arc/include/asm/uaccess.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/arc/include/asm/uaccess.h b/arch/arc/include/asm/uaccess.h index c9173c02081c..eabc3efa6c6d 100644 --- a/arch/arc/include/asm/uaccess.h +++ b/arch/arc/include/asm/uaccess.h @@ -207,7 +207,7 @@ raw_copy_from_user(void *to, const void __user *from, unsigned long n) */ "=&r" (tmp), "+r" (to), "+r" (from) : - : "lp_count", "lp_start", "lp_end", "memory"); + : "lp_count", "memory"); return n; } @@ -433,7 +433,7 @@ raw_copy_to_user(void __user *to, const void *from, unsigned long n) */ "=&r" (tmp), "+r" (to), "+r" (from) : - : "lp_count", "lp_start", "lp_end", "memory"); + : "lp_count", "memory"); return n; } @@ -653,7 +653,7 @@ static inline unsigned long __arc_clear_user(void __user *to, unsigned long n) " .previous \n" : "+r"(d_char), "+r"(res) : "i"(0) - : "lp_count", "lp_start", "lp_end", "memory"); + : "lp_count", "memory"); return res; } @@ -686,7 +686,7 @@ __arc_strncpy_from_user(char *dst, const char __user *src, long count) " .previous \n" : "+r"(res), "+r"(dst), "+r"(src), "=r"(val) : "g"(-EFAULT), "r"(count) - : "lp_count", "lp_start", "lp_end", "memory"); + : "lp_count", "memory"); return res; } From e494239a007e601448110ac304fe055951f9de3b Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Wed, 6 Jun 2018 10:20:37 -0700 Subject: [PATCH 151/298] ARCv2: support manual regfile save on interrupts There's a hardware bug which affects the HSDK platform, triggered by micro-ops for auto-saving regfile on taken interrupt. The workaround is to inhibit autosave. Signed-off-by: Vineet Gupta --- arch/arc/Kconfig | 8 +++++ arch/arc/include/asm/entry-arcv2.h | 54 ++++++++++++++++++++++++++++++ arch/arc/kernel/entry-arcv2.S | 4 ++- arch/arc/kernel/intc-arcv2.c | 2 ++ arch/arc/plat-hsdk/Kconfig | 1 + 5 files changed, 68 insertions(+), 1 deletion(-) diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig index 376366a7db81..7215f52b3413 100644 --- a/arch/arc/Kconfig +++ b/arch/arc/Kconfig @@ -407,6 +407,14 @@ config ARC_HAS_ACCL_REGS (also referred to as r58:r59). These can also be used by gcc as GPR so kernel needs to save/restore per process +config ARC_IRQ_NO_AUTOSAVE + bool "Disable hardware autosave regfile on interrupts" + default n + help + On HS cores, taken interrupt auto saves the regfile on stack. + This is programmable and can be optionally disabled in which case + software INTERRUPT_PROLOGUE/EPILGUE do the needed work + endif # ISA_ARCV2 endmenu # "ARC CPU Configuration" diff --git a/arch/arc/include/asm/entry-arcv2.h b/arch/arc/include/asm/entry-arcv2.h index 309f4e6721b3..225e7df2d8ed 100644 --- a/arch/arc/include/asm/entry-arcv2.h +++ b/arch/arc/include/asm/entry-arcv2.h @@ -17,6 +17,33 @@ ; ; Now manually save: r12, sp, fp, gp, r25 +#ifdef CONFIG_ARC_IRQ_NO_AUTOSAVE +.ifnc \called_from, exception + st.as r9, [sp, -10] ; save r9 in it's final stack slot + sub sp, sp, 12 ; skip JLI, LDI, EI + + PUSH lp_count + PUSHAX lp_start + PUSHAX lp_end + PUSH blink + + PUSH r11 + PUSH r10 + + sub sp, sp, 4 ; skip r9 + + PUSH r8 + PUSH r7 + PUSH r6 + PUSH r5 + PUSH r4 + PUSH r3 + PUSH r2 + PUSH r1 + PUSH r0 +.endif +#endif + #ifdef CONFIG_ARC_HAS_ACCL_REGS PUSH r59 PUSH r58 @@ -86,6 +113,33 @@ POP r59 #endif +#ifdef CONFIG_ARC_IRQ_NO_AUTOSAVE +.ifnc \called_from, exception + POP r0 + POP r1 + POP r2 + POP r3 + POP r4 + POP r5 + POP r6 + POP r7 + POP r8 + POP r9 + POP r10 + POP r11 + + POP blink + POPAX lp_end + POPAX lp_start + + POP r9 + mov lp_count, r9 + + add sp, sp, 12 ; skip JLI, LDI, EI + ld.as r9, [sp, -10] ; reload r9 which got clobbered +.endif +#endif + .endm /*------------------------------------------------------------------------*/ diff --git a/arch/arc/kernel/entry-arcv2.S b/arch/arc/kernel/entry-arcv2.S index cc558a25b8fa..562089d62d9d 100644 --- a/arch/arc/kernel/entry-arcv2.S +++ b/arch/arc/kernel/entry-arcv2.S @@ -209,7 +209,9 @@ restore_regs: ;####### Return from Intr ####### debug_marker_l1: - bbit1.nt r0, STATUS_DE_BIT, .Lintr_ret_to_delay_slot + ; bbit1.nt r0, STATUS_DE_BIT, .Lintr_ret_to_delay_slot + btst r0, STATUS_DE_BIT ; Z flag set if bit clear + bnz .Lintr_ret_to_delay_slot ; branch if STATUS_DE_BIT set .Lisr_ret_fast_path: ; Handle special case #1: (Entry via Exception, Return via IRQ) diff --git a/arch/arc/kernel/intc-arcv2.c b/arch/arc/kernel/intc-arcv2.c index 067ea362fb3e..cf18b3e5a934 100644 --- a/arch/arc/kernel/intc-arcv2.c +++ b/arch/arc/kernel/intc-arcv2.c @@ -49,11 +49,13 @@ void arc_init_IRQ(void) *(unsigned int *)&ictrl = 0; +#ifndef CONFIG_ARC_IRQ_NO_AUTOSAVE ictrl.save_nr_gpr_pairs = 6; /* r0 to r11 (r12 saved manually) */ ictrl.save_blink = 1; ictrl.save_lp_regs = 1; /* LP_COUNT, LP_START, LP_END */ ictrl.save_u_to_u = 0; /* user ctxt saved on kernel stack */ ictrl.save_idx_regs = 1; /* JLI, LDI, EI */ +#endif WRITE_AUX(AUX_IRQ_CTRL, ictrl); diff --git a/arch/arc/plat-hsdk/Kconfig b/arch/arc/plat-hsdk/Kconfig index f25c085b9874..23e00216e5a5 100644 --- a/arch/arc/plat-hsdk/Kconfig +++ b/arch/arc/plat-hsdk/Kconfig @@ -9,6 +9,7 @@ menuconfig ARC_SOC_HSDK bool "ARC HS Development Kit SOC" depends on ISA_ARCV2 select ARC_HAS_ACCL_REGS + select ARC_IRQ_NO_AUTOSAVE select CLK_HSDK select RESET_HSDK select HAVE_PCI From a66f2e57bd566240d8b3884eedf503928fbbe557 Mon Sep 17 00:00:00 2001 From: Eugeniy Paltsev Date: Thu, 14 Feb 2019 18:07:44 +0300 Subject: [PATCH 152/298] ARC: U-boot: check arguments paranoidly Handle U-boot arguments paranoidly: * don't allow to pass unknown tag. * try to use external device tree blob only if corresponding tag (TAG_DTB) is set. * don't check uboot_tag if kernel build with no ARC_UBOOT_SUPPORT. NOTE: If U-boot args are invalid we skip them and try to use embedded device tree blob. We can't panic on invalid U-boot args as we really pass invalid args due to bug in U-boot code. This happens if we don't provide external DTB to U-boot and don't set 'bootargs' U-boot environment variable (which is default case at least for HSDK board) In that case we will pass {r0 = 1 (bootargs in r2); r1 = 0; r2 = 0;} to linux which is invalid. While I'm at it refactor U-boot arguments handling code. Cc: stable@vger.kernel.org Tested-by: Corentin LABBE Signed-off-by: Eugeniy Paltsev Signed-off-by: Vineet Gupta --- arch/arc/kernel/head.S | 4 +- arch/arc/kernel/setup.c | 101 +++++++++++++++++++++++++++------------- 2 files changed, 71 insertions(+), 34 deletions(-) diff --git a/arch/arc/kernel/head.S b/arch/arc/kernel/head.S index 26e33a8b2d18..1f945d0f40da 100644 --- a/arch/arc/kernel/head.S +++ b/arch/arc/kernel/head.S @@ -103,9 +103,9 @@ ENTRY(stext) #ifdef CONFIG_ARC_UBOOT_SUPPORT ; Uboot - kernel ABI ; r0 = [0] No uboot interaction, [1] cmdline in r2, [2] DTB in r2 - ; r1 = magic number (board identity, unused as of now + ; r1 = magic number (always zero as of now) ; r2 = pointer to uboot provided cmdline or external DTB in mem - ; These are handled later in setup_arch() + ; These are handled later in handle_uboot_args() st r0, [@uboot_tag] st r2, [@uboot_arg] #endif diff --git a/arch/arc/kernel/setup.c b/arch/arc/kernel/setup.c index def19b0ef8c6..8bb156164556 100644 --- a/arch/arc/kernel/setup.c +++ b/arch/arc/kernel/setup.c @@ -462,43 +462,80 @@ void setup_processor(void) arc_chk_core_config(); } -static inline int is_kernel(unsigned long addr) +static inline bool uboot_arg_invalid(unsigned long addr) { - if (addr >= (unsigned long)_stext && addr <= (unsigned long)_end) - return 1; - return 0; + /* + * Check that it is a untranslated address (although MMU is not enabled + * yet, it being a high address ensures this is not by fluke) + */ + if (addr < PAGE_OFFSET) + return true; + + /* Check that address doesn't clobber resident kernel image */ + return addr >= (unsigned long)_stext && addr <= (unsigned long)_end; +} + +#define IGNORE_ARGS "Ignore U-boot args: " + +/* uboot_tag values for U-boot - kernel ABI revision 0; see head.S */ +#define UBOOT_TAG_NONE 0 +#define UBOOT_TAG_CMDLINE 1 +#define UBOOT_TAG_DTB 2 + +void __init handle_uboot_args(void) +{ + bool use_embedded_dtb = true; + bool append_cmdline = false; + +#ifdef CONFIG_ARC_UBOOT_SUPPORT + /* check that we know this tag */ + if (uboot_tag != UBOOT_TAG_NONE && + uboot_tag != UBOOT_TAG_CMDLINE && + uboot_tag != UBOOT_TAG_DTB) { + pr_warn(IGNORE_ARGS "invalid uboot tag: '%08x'\n", uboot_tag); + goto ignore_uboot_args; + } + + if (uboot_tag != UBOOT_TAG_NONE && + uboot_arg_invalid((unsigned long)uboot_arg)) { + pr_warn(IGNORE_ARGS "invalid uboot arg: '%px'\n", uboot_arg); + goto ignore_uboot_args; + } + + /* see if U-boot passed an external Device Tree blob */ + if (uboot_tag == UBOOT_TAG_DTB) { + machine_desc = setup_machine_fdt((void *)uboot_arg); + + /* external Device Tree blob is invalid - use embedded one */ + use_embedded_dtb = !machine_desc; + } + + if (uboot_tag == UBOOT_TAG_CMDLINE) + append_cmdline = true; + +ignore_uboot_args: +#endif + + if (use_embedded_dtb) { + machine_desc = setup_machine_fdt(__dtb_start); + if (!machine_desc) + panic("Embedded DT invalid\n"); + } + + /* + * NOTE: @boot_command_line is populated by setup_machine_fdt() so this + * append processing can only happen after. + */ + if (append_cmdline) { + /* Ensure a whitespace between the 2 cmdlines */ + strlcat(boot_command_line, " ", COMMAND_LINE_SIZE); + strlcat(boot_command_line, uboot_arg, COMMAND_LINE_SIZE); + } } void __init setup_arch(char **cmdline_p) { -#ifdef CONFIG_ARC_UBOOT_SUPPORT - /* make sure that uboot passed pointer to cmdline/dtb is valid */ - if (uboot_tag && is_kernel((unsigned long)uboot_arg)) - panic("Invalid uboot arg\n"); - - /* See if u-boot passed an external Device Tree blob */ - machine_desc = setup_machine_fdt(uboot_arg); /* uboot_tag == 2 */ - if (!machine_desc) -#endif - { - /* No, so try the embedded one */ - machine_desc = setup_machine_fdt(__dtb_start); - if (!machine_desc) - panic("Embedded DT invalid\n"); - - /* - * If we are here, it is established that @uboot_arg didn't - * point to DT blob. Instead if u-boot says it is cmdline, - * append to embedded DT cmdline. - * setup_machine_fdt() would have populated @boot_command_line - */ - if (uboot_tag == 1) { - /* Ensure a whitespace between the 2 cmdlines */ - strlcat(boot_command_line, " ", COMMAND_LINE_SIZE); - strlcat(boot_command_line, uboot_arg, - COMMAND_LINE_SIZE); - } - } + handle_uboot_args(); /* Save unparsed command line copy for /proc/cmdline */ *cmdline_p = boot_command_line; From 493a2f812446e92bcb1e69a77381b4d39808d730 Mon Sep 17 00:00:00 2001 From: Eugeniy Paltsev Date: Thu, 14 Feb 2019 18:07:45 +0300 Subject: [PATCH 153/298] ARC: enable uboot support unconditionally After reworking U-boot args handling code and adding paranoid arguments check we can eliminate CONFIG_ARC_UBOOT_SUPPORT and enable uboot support unconditionally. For JTAG case we can assume that core registers will come up reset value of 0 or in worst case we rely on user passing '-on=clear_regs' to Metaware debugger. Cc: stable@vger.kernel.org Tested-by: Corentin LABBE Signed-off-by: Eugeniy Paltsev Signed-off-by: Vineet Gupta --- arch/arc/Kconfig | 12 ------------ arch/arc/configs/nps_defconfig | 1 - arch/arc/configs/vdk_hs38_defconfig | 1 - arch/arc/configs/vdk_hs38_smp_defconfig | 2 -- arch/arc/kernel/head.S | 2 -- arch/arc/kernel/setup.c | 2 -- 6 files changed, 20 deletions(-) diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig index 7215f52b3413..d750b302d5ab 100644 --- a/arch/arc/Kconfig +++ b/arch/arc/Kconfig @@ -191,7 +191,6 @@ config NR_CPUS config ARC_SMP_HALT_ON_RESET bool "Enable Halt-on-reset boot mode" - default y if ARC_UBOOT_SUPPORT help In SMP configuration cores can be configured as Halt-on-reset or they could all start at same time. For Halt-on-reset, non @@ -523,17 +522,6 @@ config ARC_DBG_TLB_PARANOIA endif -config ARC_UBOOT_SUPPORT - bool "Support uboot arg Handling" - help - ARC Linux by default checks for uboot provided args as pointers to - external cmdline or DTB. This however breaks in absence of uboot, - when booting from Metaware debugger directly, as the registers are - not zeroed out on reset by mdb and/or ARCv2 based cores. The bogus - registers look like uboot args to kernel which then chokes. - So only enable the uboot arg checking/processing if users are sure - of uboot being in play. - config ARC_BUILTIN_DTB_NAME string "Built in DTB" help diff --git a/arch/arc/configs/nps_defconfig b/arch/arc/configs/nps_defconfig index 6e84060e7c90..621f59407d76 100644 --- a/arch/arc/configs/nps_defconfig +++ b/arch/arc/configs/nps_defconfig @@ -31,7 +31,6 @@ CONFIG_ARC_CACHE_LINE_SHIFT=5 # CONFIG_ARC_HAS_LLSC is not set CONFIG_ARC_KVADDR_SIZE=402 CONFIG_ARC_EMUL_UNALIGNED=y -CONFIG_ARC_UBOOT_SUPPORT=y CONFIG_PREEMPT=y CONFIG_NET=y CONFIG_UNIX=y diff --git a/arch/arc/configs/vdk_hs38_defconfig b/arch/arc/configs/vdk_hs38_defconfig index 1e59a2e9c602..e447ace6fa1c 100644 --- a/arch/arc/configs/vdk_hs38_defconfig +++ b/arch/arc/configs/vdk_hs38_defconfig @@ -13,7 +13,6 @@ CONFIG_PARTITION_ADVANCED=y CONFIG_ARC_PLAT_AXS10X=y CONFIG_AXS103=y CONFIG_ISA_ARCV2=y -CONFIG_ARC_UBOOT_SUPPORT=y CONFIG_ARC_BUILTIN_DTB_NAME="vdk_hs38" CONFIG_PREEMPT=y CONFIG_NET=y diff --git a/arch/arc/configs/vdk_hs38_smp_defconfig b/arch/arc/configs/vdk_hs38_smp_defconfig index b5c3f6c54b03..c82cdb10aaf4 100644 --- a/arch/arc/configs/vdk_hs38_smp_defconfig +++ b/arch/arc/configs/vdk_hs38_smp_defconfig @@ -15,8 +15,6 @@ CONFIG_AXS103=y CONFIG_ISA_ARCV2=y CONFIG_SMP=y # CONFIG_ARC_TIMERS_64BIT is not set -# CONFIG_ARC_SMP_HALT_ON_RESET is not set -CONFIG_ARC_UBOOT_SUPPORT=y CONFIG_ARC_BUILTIN_DTB_NAME="vdk_hs38_smp" CONFIG_PREEMPT=y CONFIG_NET=y diff --git a/arch/arc/kernel/head.S b/arch/arc/kernel/head.S index 1f945d0f40da..30e090625916 100644 --- a/arch/arc/kernel/head.S +++ b/arch/arc/kernel/head.S @@ -100,7 +100,6 @@ ENTRY(stext) st.ab 0, [r5, 4] 1: -#ifdef CONFIG_ARC_UBOOT_SUPPORT ; Uboot - kernel ABI ; r0 = [0] No uboot interaction, [1] cmdline in r2, [2] DTB in r2 ; r1 = magic number (always zero as of now) @@ -108,7 +107,6 @@ ENTRY(stext) ; These are handled later in handle_uboot_args() st r0, [@uboot_tag] st r2, [@uboot_arg] -#endif ; setup "current" tsk and optionally cache it in dedicated r25 mov r9, @init_task diff --git a/arch/arc/kernel/setup.c b/arch/arc/kernel/setup.c index 8bb156164556..93d4d6639873 100644 --- a/arch/arc/kernel/setup.c +++ b/arch/arc/kernel/setup.c @@ -487,7 +487,6 @@ void __init handle_uboot_args(void) bool use_embedded_dtb = true; bool append_cmdline = false; -#ifdef CONFIG_ARC_UBOOT_SUPPORT /* check that we know this tag */ if (uboot_tag != UBOOT_TAG_NONE && uboot_tag != UBOOT_TAG_CMDLINE && @@ -514,7 +513,6 @@ void __init handle_uboot_args(void) append_cmdline = true; ignore_uboot_args: -#endif if (use_embedded_dtb) { machine_desc = setup_machine_fdt(__dtb_start); From b6835ea77729e7faf4656ca637ba53f42b8ee3fd Mon Sep 17 00:00:00 2001 From: Alexey Brodkin Date: Fri, 8 Feb 2019 13:55:19 +0300 Subject: [PATCH 154/298] ARC: define ARCH_SLAB_MINALIGN = 8 The default value of ARCH_SLAB_MINALIGN in "include/linux/slab.h" is "__alignof__(unsigned long long)" which for ARC unexpectedly turns out to be 4. This is not a compiler bug, but as defined by ARC ABI [1] Thus slab allocator would allocate a struct which is 32-bit aligned, which is generally OK even if struct has long long members. There was however potetial problem when it had any atomic64_t which use LLOCKD/SCONDD instructions which are required by ISA to take 64-bit addresses. This is the problem we ran into [ 4.015732] EXT4-fs (mmcblk0p2): re-mounted. Opts: (null) [ 4.167881] Misaligned Access [ 4.172356] Path: /bin/busybox.nosuid [ 4.176004] CPU: 2 PID: 171 Comm: rm Not tainted 4.19.14-yocto-standard #1 [ 4.182851] [ 4.182851] [ECR ]: 0x000d0000 => Check Programmer's Manual [ 4.190061] [EFA ]: 0xbeaec3fc [ 4.190061] [BLINK ]: ext4_delete_entry+0x210/0x234 [ 4.190061] [ERET ]: ext4_delete_entry+0x13e/0x234 [ 4.202985] [STAT32]: 0x80080002 : IE K [ 4.207236] BTA: 0x9009329c SP: 0xbe5b1ec4 FP: 0x00000000 [ 4.212790] LPS: 0x9074b118 LPE: 0x9074b120 LPC: 0x00000000 [ 4.218348] r00: 0x00000040 r01: 0x00000021 r02: 0x00000001 ... ... [ 4.270510] Stack Trace: [ 4.274510] ext4_delete_entry+0x13e/0x234 [ 4.278695] ext4_rmdir+0xe0/0x238 [ 4.282187] vfs_rmdir+0x50/0xf0 [ 4.285492] do_rmdir+0x9e/0x154 [ 4.288802] EV_Trap+0x110/0x114 The fix is to make sure slab allocations are 64-bit aligned. Do note that atomic64_t is __attribute__((aligned(8)) which means gcc does generate 64-bit aligned references, relative to beginning of container struct. However the issue is if the container itself is not 64-bit aligned, atomic64_t ends up unaligned which is what this patch ensures. [1] https://github.com/foss-for-synopsys-dwc-arc-processors/toolchain/wiki/files/ARCv2_ABI.pdf Signed-off-by: Alexey Brodkin Cc: # 4.8+ Signed-off-by: Vineet Gupta [vgupta: reworked changelog, added dependency on LL64+LLSC] --- arch/arc/include/asm/cache.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/arch/arc/include/asm/cache.h b/arch/arc/include/asm/cache.h index f393b663413e..2ad77fb43639 100644 --- a/arch/arc/include/asm/cache.h +++ b/arch/arc/include/asm/cache.h @@ -52,6 +52,17 @@ #define cache_line_size() SMP_CACHE_BYTES #define ARCH_DMA_MINALIGN SMP_CACHE_BYTES +/* + * Make sure slab-allocated buffers are 64-bit aligned when atomic64_t uses + * ARCv2 64-bit atomics (LLOCKD/SCONDD). This guarantess runtime 64-bit + * alignment for any atomic64_t embedded in buffer. + * Default ARCH_SLAB_MINALIGN is __alignof__(long long) which has a relaxed + * value of 4 (and not 8) in ARC ABI. + */ +#if defined(CONFIG_ARC_HAS_LL64) && defined(CONFIG_ARC_HAS_LLSC) +#define ARCH_SLAB_MINALIGN 8 +#endif + extern void arc_cache_init(void); extern char *arc_cache_mumbojumbo(int cpu_id, char *buf, int len); extern void read_decode_cache_bcr(void); From 59eb2a884f5380011179acc4662fc2cc2d850454 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Thu, 14 Feb 2019 14:03:02 +0100 Subject: [PATCH 155/298] i40e: fix XDP_REDIRECT/XDP xmit ring cleanup race MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the driver clears the XDP xmit ring due to re-configuration or teardown, in-progress ndo_xdp_xmit must be taken into consideration. The ndo_xdp_xmit function is typically called from a NAPI context that the driver does not control. Therefore, we must be careful not to clear the XDP ring, while the call is on-going. This patch adds a synchronize_rcu() to wait for napi(s) (preempt-disable regions and softirqs), prior clearing the queue. Further, the __I40E_CONFIG_BUSY flag is checked in the ndo_xdp_xmit implementation to avoid touching the XDP xmit queue during re-configuration. Fixes: d9314c474d4f ("i40e: add support for XDP_REDIRECT") Fixes: 123cecd427b6 ("i40e: added queue pair disable/enable functions") Reported-by: Maciej Fijalkowski Signed-off-by: Björn Töpel Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/i40e/i40e_main.c | 14 ++++++++++++-- drivers/net/ethernet/intel/i40e/i40e_txrx.c | 4 +++- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 3a0990de81c1..e4ff531db14a 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -6728,8 +6728,13 @@ void i40e_down(struct i40e_vsi *vsi) for (i = 0; i < vsi->num_queue_pairs; i++) { i40e_clean_tx_ring(vsi->tx_rings[i]); - if (i40e_enabled_xdp_vsi(vsi)) + if (i40e_enabled_xdp_vsi(vsi)) { + /* Make sure that in-progress ndo_xdp_xmit + * calls are completed. + */ + synchronize_rcu(); i40e_clean_tx_ring(vsi->xdp_rings[i]); + } i40e_clean_rx_ring(vsi->rx_rings[i]); } @@ -11966,8 +11971,13 @@ static void i40e_queue_pair_reset_stats(struct i40e_vsi *vsi, int queue_pair) static void i40e_queue_pair_clean_rings(struct i40e_vsi *vsi, int queue_pair) { i40e_clean_tx_ring(vsi->tx_rings[queue_pair]); - if (i40e_enabled_xdp_vsi(vsi)) + if (i40e_enabled_xdp_vsi(vsi)) { + /* Make sure that in-progress ndo_xdp_xmit calls are + * completed. + */ + synchronize_rcu(); i40e_clean_tx_ring(vsi->xdp_rings[queue_pair]); + } i40e_clean_rx_ring(vsi->rx_rings[queue_pair]); } diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c index a7e14e98889f..6c97667d20ef 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c @@ -3709,6 +3709,7 @@ int i40e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames, struct i40e_netdev_priv *np = netdev_priv(dev); unsigned int queue_index = smp_processor_id(); struct i40e_vsi *vsi = np->vsi; + struct i40e_pf *pf = vsi->back; struct i40e_ring *xdp_ring; int drops = 0; int i; @@ -3716,7 +3717,8 @@ int i40e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames, if (test_bit(__I40E_VSI_DOWN, vsi->state)) return -ENETDOWN; - if (!i40e_enabled_xdp_vsi(vsi) || queue_index >= vsi->num_queue_pairs) + if (!i40e_enabled_xdp_vsi(vsi) || queue_index >= vsi->num_queue_pairs || + test_bit(__I40E_CONFIG_BUSY, pf->state)) return -ENXIO; if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) From b7dc5a071ddf69c0350396b203cba32fe5bab510 Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Sat, 16 Feb 2019 16:10:39 +0300 Subject: [PATCH 156/298] parisc: Fix ptrace syscall number modification Commit 910cd32e552e ("parisc: Fix and enable seccomp filter support") introduced a regression in ptrace-based syscall tampering: when tracer changes syscall number to -1, the kernel fails to initialize %r28 with -ENOSYS and subsequently fails to return the error code of the failed syscall to userspace. This erroneous behaviour could be observed with a simple strace syscall fault injection command which is expected to print something like this: $ strace -a0 -ewrite -einject=write:error=enospc echo hello write(1, "hello\n", 6) = -1 ENOSPC (No space left on device) (INJECTED) write(2, "echo: ", 6) = -1 ENOSPC (No space left on device) (INJECTED) write(2, "write error", 11) = -1 ENOSPC (No space left on device) (INJECTED) write(2, "\n", 1) = -1 ENOSPC (No space left on device) (INJECTED) +++ exited with 1 +++ After commit 910cd32e552ea09caa89cdbe328e468979b030dd it loops printing something like this instead: write(1, "hello\n", 6../strace: Failed to tamper with process 12345: unexpectedly got no error (return value 0, error 0) ) = 0 (INJECTED) This bug was found by strace test suite. Fixes: 910cd32e552e ("parisc: Fix and enable seccomp filter support") Cc: stable@vger.kernel.org # v4.5+ Signed-off-by: Dmitry V. Levin Tested-by: Helge Deller Signed-off-by: Helge Deller --- arch/parisc/kernel/ptrace.c | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/arch/parisc/kernel/ptrace.c b/arch/parisc/kernel/ptrace.c index 2582df1c529b..0964c236e3e5 100644 --- a/arch/parisc/kernel/ptrace.c +++ b/arch/parisc/kernel/ptrace.c @@ -308,15 +308,29 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, long do_syscall_trace_enter(struct pt_regs *regs) { - if (test_thread_flag(TIF_SYSCALL_TRACE) && - tracehook_report_syscall_entry(regs)) { + if (test_thread_flag(TIF_SYSCALL_TRACE)) { + int rc = tracehook_report_syscall_entry(regs); + /* - * Tracing decided this syscall should not happen or the - * debugger stored an invalid system call number. Skip - * the system call and the system call restart handling. + * As tracesys_next does not set %r28 to -ENOSYS + * when %r20 is set to -1, initialize it here. */ - regs->gr[20] = -1UL; - goto out; + regs->gr[28] = -ENOSYS; + + if (rc) { + /* + * A nonzero return code from + * tracehook_report_syscall_entry() tells us + * to prevent the syscall execution. Skip + * the syscall call and the syscall restart handling. + * + * Note that the tracer may also just change + * regs->gr[20] to an invalid syscall number, + * that is handled by tracesys_next. + */ + regs->gr[20] = -1UL; + return -1; + } } /* Do the secure computing check after ptrace. */ @@ -340,7 +354,6 @@ long do_syscall_trace_enter(struct pt_regs *regs) regs->gr[24] & 0xffffffff, regs->gr[23] & 0xffffffff); -out: /* * Sign extend the syscall number to 64bit since it may have been * modified by a compat ptrace call From c685c69fba71462c3f9f6a1fb6151cded6c74d42 Mon Sep 17 00:00:00 2001 From: Jan Sokolowski Date: Wed, 20 Feb 2019 15:20:14 +0000 Subject: [PATCH 157/298] ixgbe: don't do any AF_XDP zero-copy transmit if netif is not OK An issue has been found while testing zero-copy XDP that causes a reset to be triggered. As it takes some time to turn the carrier on after setting zc, and we already start trying to transmit some packets, watchdog considers this as an erroneous state and triggers a reset. Don't do any work if netif carrier is not OK. Fixes: 8221c5eba8c13 (ixgbe: add AF_XDP zero-copy Tx support) Signed-off-by: Jan Sokolowski Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c index 654ae92342ea..36a8879536a4 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c @@ -642,7 +642,8 @@ static bool ixgbe_xmit_zc(struct ixgbe_ring *xdp_ring, unsigned int budget) dma_addr_t dma; while (budget-- > 0) { - if (unlikely(!ixgbe_desc_unused(xdp_ring))) { + if (unlikely(!ixgbe_desc_unused(xdp_ring)) || + !netif_carrier_ok(xdp_ring->netdev)) { work_done = false; break; } From 71d73a0b43c2b101a960c624290c8a053d174cac Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Thu, 21 Feb 2019 20:16:10 +0100 Subject: [PATCH 158/298] CREDITS/MAINTAINERS: Retire parisc-linux.org email domain Retire the parisc-linux.org email domain and provide alternative email addresses for the remaining users, as agreed upon with them. Signed-off-by: Helge Deller --- CREDITS | 20 +++++++++----------- MAINTAINERS | 5 ++--- 2 files changed, 11 insertions(+), 14 deletions(-) diff --git a/CREDITS b/CREDITS index e818eb6a3e71..0175098d4776 100644 --- a/CREDITS +++ b/CREDITS @@ -842,10 +842,9 @@ D: ax25-utils maintainer. N: Helge Deller E: deller@gmx.de -E: hdeller@redhat.de -D: PA-RISC Linux hacker, LASI-, ASP-, WAX-, LCD/LED-driver -S: Schimmelsrain 1 -S: D-69231 Rauenberg +W: http://www.parisc-linux.org/ +D: PA-RISC Linux architecture maintainer +D: LASI-, ASP-, WAX-, LCD/LED-driver S: Germany N: Jean Delvare @@ -1361,7 +1360,7 @@ S: Stellenbosch, Western Cape S: South Africa N: Grant Grundler -E: grundler@parisc-linux.org +E: grantgrundler@gmail.com W: http://obmouse.sourceforge.net/ W: http://www.parisc-linux.org/ D: obmouse - rewrote Olivier Florent's Omnibook 600 "pop-up" mouse driver @@ -2492,7 +2491,7 @@ S: Syracuse, New York 13206 S: USA N: Kyle McMartin -E: kyle@parisc-linux.org +E: kyle@mcmartin.ca D: Linux/PARISC hacker D: AD1889 sound driver S: Ottawa, Canada @@ -3780,14 +3779,13 @@ S: 21513 Conradia Ct S: Cupertino, CA 95014 S: USA -N: Thibaut Varene -E: T-Bone@parisc-linux.org -W: http://www.parisc-linux.org/~varenet/ -P: 1024D/B7D2F063 E67C 0D43 A75E 12A5 BB1C FA2F 1E32 C3DA B7D2 F063 +N: Thibaut Varène +E: hacks+kernel@slashdirt.org +W: http://hacks.slashdirt.org/ D: PA-RISC port minion, PDC and GSCPS2 drivers, debuglocks and other bits D: Some ARM at91rm9200 bits, S1D13XXX FB driver, random patches here and there D: AD1889 sound driver -S: Paris, France +S: France N: Heikki Vatiainen E: hessu@cs.tut.fi diff --git a/MAINTAINERS b/MAINTAINERS index 41ce5f4ad838..e6e17d8c5aae 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -409,8 +409,7 @@ F: drivers/platform/x86/wmi.c F: include/uapi/linux/wmi.h AD1889 ALSA SOUND DRIVER -M: Thibaut Varene -W: http://wiki.parisc-linux.org/AD1889 +W: https://parisc.wiki.kernel.org/index.php/AD1889 L: linux-parisc@vger.kernel.org S: Maintained F: sound/pci/ad1889.* @@ -11488,7 +11487,7 @@ F: Documentation/blockdev/paride.txt F: drivers/block/paride/ PARISC ARCHITECTURE -M: "James E.J. Bottomley" +M: "James E.J. Bottomley" M: Helge Deller L: linux-parisc@vger.kernel.org W: http://www.parisc-linux.org/ From 18de100ed6f0d3bf74036de9fd4528f208d585e6 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Thu, 21 Feb 2019 20:58:16 +0100 Subject: [PATCH 159/298] MAINTAINERS: mark CAIF as orphan The listed address for the CAIF maintainer bounces with "553 5.3.0 ... No such user here", and the only existing email address of the maintainer in git history hasn't responded in a week. Therefore, remove the listed maintainer and mark CAIF as orphan. Signed-off-by: Jann Horn Signed-off-by: David S. Miller --- MAINTAINERS | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 41ce5f4ad838..98457a87b238 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3390,9 +3390,8 @@ F: Documentation/media/v4l-drivers/cafe_ccic* F: drivers/media/platform/marvell-ccic/ CAIF NETWORK LAYER -M: Dmitry Tarnyagin L: netdev@vger.kernel.org -S: Supported +S: Orphan F: Documentation/networking/caif/ F: drivers/net/caif/ F: include/uapi/linux/caif/ From ad49bc6361ca29e3318b6f71a6fc361d2a8c9f26 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Mon, 18 Feb 2019 17:14:25 +0800 Subject: [PATCH 160/298] net: vrf: remove MTU limits for vrf device Similiar to commit e94cd8113ce63 ("net: remove MTU limits for dummy and ifb device"), MTU is irrelevant for VRF device. We init it as 64K while limit it to [68, 1500] may make users feel confused. Reported-by: Jianlin Shi Signed-off-by: Hangbin Liu Reviewed-by: David Ahern Signed-off-by: David S. Miller --- drivers/net/vrf.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 95909e262ba4..7c1430ed0244 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -1273,6 +1273,9 @@ static void vrf_setup(struct net_device *dev) /* default to no qdisc; user can add if desired */ dev->priv_flags |= IFF_NO_QUEUE; + + dev->min_mtu = 0; + dev->max_mtu = 0; } static int vrf_validate(struct nlattr *tb[], struct nlattr *data[], From 3c963a3306eada999be5ebf4f293dfa3d3945487 Mon Sep 17 00:00:00 2001 From: Michal Soltys Date: Mon, 18 Feb 2019 17:55:28 +0100 Subject: [PATCH 161/298] bonding: fix PACKET_ORIGDEV regression MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch fixes a subtle PACKET_ORIGDEV regression which was a side effect of fixes introduced by: 6a9e461f6fe4 bonding: pass link-local packets to bonding master also. ... to: b89f04c61efe bonding: deliver link-local packets with skb->dev set to link that packets arrived on While 6a9e461f6fe4 restored pre-b89f04c61efe presence of link-local packets on bonding masters (which is required e.g. by linux bridges participating in spanning tree or needed for lab-like setups created with group_fwd_mask) it also caused the originating device information to be lost due to cloning. Maciej Żenczykowski proposed another solution that doesn't require packet cloning and retains original device information - instead of returning RX_HANDLER_PASS for all link-local packets it's now limited only to packets from inactive slaves. At the same time, packets passed to bonding masters retain correct information about the originating device and PACKET_ORIGDEV can be used to determine it. This elegantly solves all issues so far: - link-local packets that were removed from bonding masters - LLDP daemons being forced to explicitly bind to slave interfaces - PACKET_ORIGDEV having no effect on bond interfaces Fixes: 6a9e461f6fe4 (bonding: pass link-local packets to bonding master also.) Reported-by: Vincent Bernat Signed-off-by: Michal Soltys Signed-off-by: Maciej Żenczykowski Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 35 +++++++++++++-------------------- 1 file changed, 14 insertions(+), 21 deletions(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 485462d3087f..537c90c8eb0a 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1183,29 +1183,22 @@ static rx_handler_result_t bond_handle_frame(struct sk_buff **pskb) } } - /* Link-local multicast packets should be passed to the - * stack on the link they arrive as well as pass them to the - * bond-master device. These packets are mostly usable when - * stack receives it with the link on which they arrive - * (e.g. LLDP) they also must be available on master. Some of - * the use cases include (but are not limited to): LLDP agents - * that must be able to operate both on enslaved interfaces as - * well as on bonds themselves; linux bridges that must be able - * to process/pass BPDUs from attached bonds when any kind of - * STP version is enabled on the network. + /* + * For packets determined by bond_should_deliver_exact_match() call to + * be suppressed we want to make an exception for link-local packets. + * This is necessary for e.g. LLDP daemons to be able to monitor + * inactive slave links without being forced to bind to them + * explicitly. + * + * At the same time, packets that are passed to the bonding master + * (including link-local ones) can have their originating interface + * determined via PACKET_ORIGDEV socket option. */ - if (is_link_local_ether_addr(eth_hdr(skb)->h_dest)) { - struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC); - - if (nskb) { - nskb->dev = bond->dev; - nskb->queue_mapping = 0; - netif_rx(nskb); - } - return RX_HANDLER_PASS; - } - if (bond_should_deliver_exact_match(skb, slave, bond)) + if (bond_should_deliver_exact_match(skb, slave, bond)) { + if (is_link_local_ether_addr(eth_hdr(skb)->h_dest)) + return RX_HANDLER_PASS; return RX_HANDLER_EXACT; + } skb->dev = bond->dev; From 223b7329ec6a0dae1b7f7db7b770e93f4a069ef9 Mon Sep 17 00:00:00 2001 From: Tung Nguyen Date: Tue, 19 Feb 2019 11:20:47 +0700 Subject: [PATCH 162/298] tipc: improve function tipc_wait_for_cond() Commit 844cf763fba6 ("tipc: make macro tipc_wait_for_cond() smp safe") replaced finish_wait() with remove_wait_queue() but still used prepare_to_wait(). This causes unnecessary conditional checking before adding to wait queue in prepare_to_wait(). This commit replaces prepare_to_wait() with add_wait_queue() as the pair function with remove_wait_queue(). Acked-by: Ying Xue Acked-by: Jon Maloy Signed-off-by: Tung Nguyen Signed-off-by: David S. Miller --- net/tipc/socket.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 1217c90a363b..81b87916a0eb 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -388,7 +388,7 @@ static int tipc_sk_sock_err(struct socket *sock, long *timeout) rc_ = tipc_sk_sock_err((sock_), timeo_); \ if (rc_) \ break; \ - prepare_to_wait(sk_sleep(sk_), &wait_, TASK_INTERRUPTIBLE); \ + add_wait_queue(sk_sleep(sk_), &wait_); \ release_sock(sk_); \ *(timeo_) = wait_woken(&wait_, TASK_INTERRUPTIBLE, *(timeo_)); \ sched_annotate_sleep(); \ From 48766a583c7961af080de2df692f476624a9a21a Mon Sep 17 00:00:00 2001 From: Tung Nguyen Date: Tue, 19 Feb 2019 11:20:48 +0700 Subject: [PATCH 163/298] tipc: improve function tipc_wait_for_rcvmsg() This commit replaces schedule_timeout() with wait_woken() in function tipc_wait_for_rcvmsg(). wait_woken() uses memory barriers in its implementation to avoid potential race condition when putting a process into sleeping state and then waking it up. Acked-by: Ying Xue Acked-by: Jon Maloy Signed-off-by: Tung Nguyen Signed-off-by: David S. Miller --- net/tipc/socket.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 81b87916a0eb..684f2125fc6b 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -1677,7 +1677,7 @@ static void tipc_sk_send_ack(struct tipc_sock *tsk) static int tipc_wait_for_rcvmsg(struct socket *sock, long *timeop) { struct sock *sk = sock->sk; - DEFINE_WAIT(wait); + DEFINE_WAIT_FUNC(wait, woken_wake_function); long timeo = *timeop; int err = sock_error(sk); @@ -1685,15 +1685,17 @@ static int tipc_wait_for_rcvmsg(struct socket *sock, long *timeop) return err; for (;;) { - prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); if (timeo && skb_queue_empty(&sk->sk_receive_queue)) { if (sk->sk_shutdown & RCV_SHUTDOWN) { err = -ENOTCONN; break; } + add_wait_queue(sk_sleep(sk), &wait); release_sock(sk); - timeo = schedule_timeout(timeo); + timeo = wait_woken(&wait, TASK_INTERRUPTIBLE, timeo); + sched_annotate_sleep(); lock_sock(sk); + remove_wait_queue(sk_sleep(sk), &wait); } err = 0; if (!skb_queue_empty(&sk->sk_receive_queue)) @@ -1709,7 +1711,6 @@ static int tipc_wait_for_rcvmsg(struct socket *sock, long *timeop) if (err) break; } - finish_wait(sk_sleep(sk), &wait); *timeop = timeo; return err; } From 9e8db5913264d3967b93c765a6a9e464d9c473db Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Mon, 18 Feb 2019 23:37:12 -0500 Subject: [PATCH 164/298] net: avoid false positives in untrusted gso validation GSO packets with vnet_hdr must conform to a small set of gso_types. The below commit uses flow dissection to drop packets that do not. But it has false positives when the skb is not fully initialized. Dissection needs skb->protocol and skb->network_header. Infer skb->protocol from gso_type as the two must agree. SKB_GSO_UDP can use both ipv4 and ipv6, so try both. Exclude callers for which network header offset is not known. Fixes: d5be7f632bad ("net: validate untrusted gso packets without csum offload") Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/linux/virtio_net.h | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h index 71f2394abbf7..e0348cb0a1dd 100644 --- a/include/linux/virtio_net.h +++ b/include/linux/virtio_net.h @@ -61,10 +61,20 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb, /* gso packets without NEEDS_CSUM do not set transport_offset. * probe and drop if does not match one of the above types. */ - if (gso_type) { + if (gso_type && skb->network_header) { + if (!skb->protocol) + virtio_net_hdr_set_proto(skb, hdr); +retry: skb_probe_transport_header(skb, -1); - if (!skb_transport_header_was_set(skb)) + if (!skb_transport_header_was_set(skb)) { + /* UFO does not specify ipv4 or 6: try both */ + if (gso_type & SKB_GSO_UDP && + skb->protocol == htons(ETH_P_IP)) { + skb->protocol = htons(ETH_P_IPV6); + goto retry; + } return -EINVAL; + } } } From 7b2e932f633bcb7b190fc7031ce6dac75f8c3472 Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Thu, 21 Feb 2019 13:44:49 -0800 Subject: [PATCH 165/298] ARCv2: don't assume core 0x54 has dual issue The first release of core4 (0x54) was dual issue only (HS4x). Newer releases allow hardware to be configured as single issue (HS3x) or dual issue. Prevent accessing a HS4x only aux register in HS3x, which otherwise leads to illegal instruction exceptions Signed-off-by: Vineet Gupta --- arch/arc/include/asm/arcregs.h | 8 ++++++++ arch/arc/kernel/setup.c | 26 +++++++++++++++++++++----- 2 files changed, 29 insertions(+), 5 deletions(-) diff --git a/arch/arc/include/asm/arcregs.h b/arch/arc/include/asm/arcregs.h index f1b86cef0905..a27eafdc8260 100644 --- a/arch/arc/include/asm/arcregs.h +++ b/arch/arc/include/asm/arcregs.h @@ -151,6 +151,14 @@ struct bcr_isa_arcv2 { #endif }; +struct bcr_uarch_build_arcv2 { +#ifdef CONFIG_CPU_BIG_ENDIAN + unsigned int pad:8, prod:8, maj:8, min:8; +#else + unsigned int min:8, maj:8, prod:8, pad:8; +#endif +}; + struct bcr_mpy { #ifdef CONFIG_CPU_BIG_ENDIAN unsigned int pad:8, x1616:8, dsp:4, cycles:2, type:2, ver:8; diff --git a/arch/arc/kernel/setup.c b/arch/arc/kernel/setup.c index 93d4d6639873..7b2340996cf8 100644 --- a/arch/arc/kernel/setup.c +++ b/arch/arc/kernel/setup.c @@ -199,13 +199,29 @@ static void read_arc_build_cfg_regs(void) cpu->bpu.ret_stk = 4 << bpu.rse; if (cpu->core.family >= 0x54) { - unsigned int exec_ctrl; - READ_BCR(AUX_EXEC_CTRL, exec_ctrl); - cpu->extn.dual_enb = !(exec_ctrl & 1); + struct bcr_uarch_build_arcv2 uarch; - /* dual issue always present for this core */ - cpu->extn.dual = 1; + /* + * The first 0x54 core (uarch maj:min 0:1 or 0:2) was + * dual issue only (HS4x). But next uarch rev (1:0) + * allows it be configured for single issue (HS3x) + * Ensure we fiddle with dual issue only on HS4x + */ + READ_BCR(ARC_REG_MICRO_ARCH_BCR, uarch); + + if (uarch.prod == 4) { + unsigned int exec_ctrl; + + /* dual issue hardware always present */ + cpu->extn.dual = 1; + + READ_BCR(AUX_EXEC_CTRL, exec_ctrl); + + /* dual issue hardware enabled ? */ + cpu->extn.dual_enb = !(exec_ctrl & 1); + + } } } From 2bdf700e538828d6456150b9319e5f689b062d54 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Tue, 19 Feb 2019 17:42:05 +0100 Subject: [PATCH 166/298] net: ip_gre: do not report erspan_ver for gre or gretap Report erspan version field to userspace in ipgre_fill_info just for erspan tunnels. The issue can be triggered with the following reproducer: $ip link add name gre1 type gre local 192.168.0.1 remote 192.168.1.1 $ip link set dev gre1 up $ip -d link sh gre1 13: gre1@NONE: mtu 1476 qdisc noqueue state UNKNOWN mode DEFAULT group default qlen 1000 link/gre 192.168.0.1 peer 192.168.1.1 promiscuity 0 minmtu 0 maxmtu 0 gre remote 192.168.1.1 local 192.168.0.1 ttl inherit erspan_ver 0 addrgenmode eui64 numtxqueues 1 numrxqueues 1 Fixes: f551c91de262 ("net: erspan: introduce erspan v2 for ip_gre") Signed-off-by: Lorenzo Bianconi Signed-off-by: David S. Miller --- net/ipv4/ip_gre.c | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 3978f807fa8b..6ae89f2b541b 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -1457,9 +1457,23 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev) struct ip_tunnel_parm *p = &t->parms; __be16 o_flags = p->o_flags; - if ((t->erspan_ver == 1 || t->erspan_ver == 2) && - !t->collect_md) - o_flags |= TUNNEL_KEY; + if (t->erspan_ver == 1 || t->erspan_ver == 2) { + if (!t->collect_md) + o_flags |= TUNNEL_KEY; + + if (nla_put_u8(skb, IFLA_GRE_ERSPAN_VER, t->erspan_ver)) + goto nla_put_failure; + + if (t->erspan_ver == 1) { + if (nla_put_u32(skb, IFLA_GRE_ERSPAN_INDEX, t->index)) + goto nla_put_failure; + } else { + if (nla_put_u8(skb, IFLA_GRE_ERSPAN_DIR, t->dir)) + goto nla_put_failure; + if (nla_put_u16(skb, IFLA_GRE_ERSPAN_HWID, t->hwid)) + goto nla_put_failure; + } + } if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) || nla_put_be16(skb, IFLA_GRE_IFLAGS, @@ -1495,19 +1509,6 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev) goto nla_put_failure; } - if (nla_put_u8(skb, IFLA_GRE_ERSPAN_VER, t->erspan_ver)) - goto nla_put_failure; - - if (t->erspan_ver == 1) { - if (nla_put_u32(skb, IFLA_GRE_ERSPAN_INDEX, t->index)) - goto nla_put_failure; - } else if (t->erspan_ver == 2) { - if (nla_put_u8(skb, IFLA_GRE_ERSPAN_DIR, t->dir)) - goto nla_put_failure; - if (nla_put_u16(skb, IFLA_GRE_ERSPAN_HWID, t->hwid)) - goto nla_put_failure; - } - return 0; nla_put_failure: From 103d0244d29fcaf38f1339d4538919bbbc051490 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Tue, 19 Feb 2019 17:42:06 +0100 Subject: [PATCH 167/298] net: ip6_gre: do not report erspan_ver for ip6gre or ip6gretap Report erspan version field to userspace in ip6gre_fill_info just for erspan_v6 tunnels. Moreover report IFLA_GRE_ERSPAN_INDEX only for erspan version 1. The issue can be triggered with the following reproducer: $ip link add name gre6 type ip6gre local 2001::1 remote 2002::2 $ip link set gre6 up $ip -d link sh gre6 14: grep6@NONE: mtu 1448 qdisc noqueue state UNKNOWN mode DEFAULT group default qlen 1000 link/gre6 2001::1 peer 2002::2 promiscuity 0 minmtu 0 maxmtu 0 ip6gre remote 2002::2 local 2001::1 hoplimit 64 encaplimit 4 tclass 0x00 flowlabel 0x00000 erspan_index 0 erspan_ver 0 addrgenmode eui64 Fixes: 94d7d8f29287 ("ip6_gre: add erspan v2 support") Signed-off-by: Lorenzo Bianconi Signed-off-by: David S. Miller --- net/ipv6/ip6_gre.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 43890898b0b5..0fdd0109d131 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -2104,9 +2104,23 @@ static int ip6gre_fill_info(struct sk_buff *skb, const struct net_device *dev) struct __ip6_tnl_parm *p = &t->parms; __be16 o_flags = p->o_flags; - if ((p->erspan_ver == 1 || p->erspan_ver == 2) && - !p->collect_md) - o_flags |= TUNNEL_KEY; + if (p->erspan_ver == 1 || p->erspan_ver == 2) { + if (!p->collect_md) + o_flags |= TUNNEL_KEY; + + if (nla_put_u8(skb, IFLA_GRE_ERSPAN_VER, p->erspan_ver)) + goto nla_put_failure; + + if (p->erspan_ver == 1) { + if (nla_put_u32(skb, IFLA_GRE_ERSPAN_INDEX, p->index)) + goto nla_put_failure; + } else { + if (nla_put_u8(skb, IFLA_GRE_ERSPAN_DIR, p->dir)) + goto nla_put_failure; + if (nla_put_u16(skb, IFLA_GRE_ERSPAN_HWID, p->hwid)) + goto nla_put_failure; + } + } if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) || nla_put_be16(skb, IFLA_GRE_IFLAGS, @@ -2121,8 +2135,7 @@ static int ip6gre_fill_info(struct sk_buff *skb, const struct net_device *dev) nla_put_u8(skb, IFLA_GRE_ENCAP_LIMIT, p->encap_limit) || nla_put_be32(skb, IFLA_GRE_FLOWINFO, p->flowinfo) || nla_put_u32(skb, IFLA_GRE_FLAGS, p->flags) || - nla_put_u32(skb, IFLA_GRE_FWMARK, p->fwmark) || - nla_put_u32(skb, IFLA_GRE_ERSPAN_INDEX, p->index)) + nla_put_u32(skb, IFLA_GRE_FWMARK, p->fwmark)) goto nla_put_failure; if (nla_put_u16(skb, IFLA_GRE_ENCAP_TYPE, @@ -2140,19 +2153,6 @@ static int ip6gre_fill_info(struct sk_buff *skb, const struct net_device *dev) goto nla_put_failure; } - if (nla_put_u8(skb, IFLA_GRE_ERSPAN_VER, p->erspan_ver)) - goto nla_put_failure; - - if (p->erspan_ver == 1) { - if (nla_put_u32(skb, IFLA_GRE_ERSPAN_INDEX, p->index)) - goto nla_put_failure; - } else if (p->erspan_ver == 2) { - if (nla_put_u8(skb, IFLA_GRE_ERSPAN_DIR, p->dir)) - goto nla_put_failure; - if (nla_put_u16(skb, IFLA_GRE_ERSPAN_HWID, p->hwid)) - goto nla_put_failure; - } - return 0; nla_put_failure: From 6321aa197547da397753757bd84c6ce64b3e3d89 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 19 Feb 2019 22:53:50 +0100 Subject: [PATCH 168/298] phonet: fix building with clang MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit clang warns about overflowing the data[] member in the struct pnpipehdr: net/phonet/pep.c:295:8: warning: array index 4 is past the end of the array (which contains 1 element) [-Warray-bounds] if (hdr->data[4] == PEP_IND_READY) ^ ~ include/net/phonet/pep.h:66:3: note: array 'data' declared here u8 data[1]; Using a flexible array member at the end of the struct avoids the warning, but since we cannot have a flexible array member inside of the union, each index now has to be moved back by one, which makes it a little uglier. Signed-off-by: Arnd Bergmann Acked-by: Rémi Denis-Courmont Signed-off-by: David S. Miller --- include/net/phonet/pep.h | 5 +++-- net/phonet/pep.c | 32 ++++++++++++++++---------------- 2 files changed, 19 insertions(+), 18 deletions(-) diff --git a/include/net/phonet/pep.h b/include/net/phonet/pep.h index b669fe6dbc3b..98f31c7ea23d 100644 --- a/include/net/phonet/pep.h +++ b/include/net/phonet/pep.h @@ -63,10 +63,11 @@ struct pnpipehdr { u8 state_after_reset; /* reset request */ u8 error_code; /* any response */ u8 pep_type; /* status indication */ - u8 data[1]; + u8 data0; /* anything else */ }; + u8 data[]; }; -#define other_pep_type data[1] +#define other_pep_type data[0] static inline struct pnpipehdr *pnp_hdr(struct sk_buff *skb) { diff --git a/net/phonet/pep.c b/net/phonet/pep.c index 9fc76b19cd3c..db3473540303 100644 --- a/net/phonet/pep.c +++ b/net/phonet/pep.c @@ -132,7 +132,7 @@ static int pep_indicate(struct sock *sk, u8 id, u8 code, ph->utid = 0; ph->message_id = id; ph->pipe_handle = pn->pipe_handle; - ph->data[0] = code; + ph->error_code = code; return pn_skb_send(sk, skb, NULL); } @@ -153,7 +153,7 @@ static int pipe_handler_request(struct sock *sk, u8 id, u8 code, ph->utid = id; /* whatever */ ph->message_id = id; ph->pipe_handle = pn->pipe_handle; - ph->data[0] = code; + ph->error_code = code; return pn_skb_send(sk, skb, NULL); } @@ -208,7 +208,7 @@ static int pep_ctrlreq_error(struct sock *sk, struct sk_buff *oskb, u8 code, struct pnpipehdr *ph; struct sockaddr_pn dst; u8 data[4] = { - oph->data[0], /* PEP type */ + oph->pep_type, /* PEP type */ code, /* error code, at an unusual offset */ PAD, PAD, }; @@ -221,7 +221,7 @@ static int pep_ctrlreq_error(struct sock *sk, struct sk_buff *oskb, u8 code, ph->utid = oph->utid; ph->message_id = PNS_PEP_CTRL_RESP; ph->pipe_handle = oph->pipe_handle; - ph->data[0] = oph->data[1]; /* CTRL id */ + ph->data0 = oph->data[0]; /* CTRL id */ pn_skb_get_src_sockaddr(oskb, &dst); return pn_skb_send(sk, skb, &dst); @@ -272,17 +272,17 @@ static int pipe_rcv_status(struct sock *sk, struct sk_buff *skb) return -EINVAL; hdr = pnp_hdr(skb); - if (hdr->data[0] != PN_PEP_TYPE_COMMON) { + if (hdr->pep_type != PN_PEP_TYPE_COMMON) { net_dbg_ratelimited("Phonet unknown PEP type: %u\n", - (unsigned int)hdr->data[0]); + (unsigned int)hdr->pep_type); return -EOPNOTSUPP; } - switch (hdr->data[1]) { + switch (hdr->data[0]) { case PN_PEP_IND_FLOW_CONTROL: switch (pn->tx_fc) { case PN_LEGACY_FLOW_CONTROL: - switch (hdr->data[4]) { + switch (hdr->data[3]) { case PEP_IND_BUSY: atomic_set(&pn->tx_credits, 0); break; @@ -292,7 +292,7 @@ static int pipe_rcv_status(struct sock *sk, struct sk_buff *skb) } break; case PN_ONE_CREDIT_FLOW_CONTROL: - if (hdr->data[4] == PEP_IND_READY) + if (hdr->data[3] == PEP_IND_READY) atomic_set(&pn->tx_credits, wake = 1); break; } @@ -301,12 +301,12 @@ static int pipe_rcv_status(struct sock *sk, struct sk_buff *skb) case PN_PEP_IND_ID_MCFC_GRANT_CREDITS: if (pn->tx_fc != PN_MULTI_CREDIT_FLOW_CONTROL) break; - atomic_add(wake = hdr->data[4], &pn->tx_credits); + atomic_add(wake = hdr->data[3], &pn->tx_credits); break; default: net_dbg_ratelimited("Phonet unknown PEP indication: %u\n", - (unsigned int)hdr->data[1]); + (unsigned int)hdr->data[0]); return -EOPNOTSUPP; } if (wake) @@ -318,7 +318,7 @@ static int pipe_rcv_created(struct sock *sk, struct sk_buff *skb) { struct pep_sock *pn = pep_sk(sk); struct pnpipehdr *hdr = pnp_hdr(skb); - u8 n_sb = hdr->data[0]; + u8 n_sb = hdr->data0; pn->rx_fc = pn->tx_fc = PN_LEGACY_FLOW_CONTROL; __skb_pull(skb, sizeof(*hdr)); @@ -506,7 +506,7 @@ static int pep_connresp_rcv(struct sock *sk, struct sk_buff *skb) return -ECONNREFUSED; /* Parse sub-blocks */ - n_sb = hdr->data[4]; + n_sb = hdr->data[3]; while (n_sb > 0) { u8 type, buf[6], len = sizeof(buf); const u8 *data = pep_get_sb(skb, &type, &len, buf); @@ -739,7 +739,7 @@ static int pipe_do_remove(struct sock *sk) ph->utid = 0; ph->message_id = PNS_PIPE_REMOVE_REQ; ph->pipe_handle = pn->pipe_handle; - ph->data[0] = PAD; + ph->data0 = PAD; return pn_skb_send(sk, skb, NULL); } @@ -817,7 +817,7 @@ static struct sock *pep_sock_accept(struct sock *sk, int flags, int *errp, peer_type = hdr->other_pep_type << 8; /* Parse sub-blocks (options) */ - n_sb = hdr->data[4]; + n_sb = hdr->data[3]; while (n_sb > 0) { u8 type, buf[1], len = sizeof(buf); const u8 *data = pep_get_sb(skb, &type, &len, buf); @@ -1109,7 +1109,7 @@ static int pipe_skb_send(struct sock *sk, struct sk_buff *skb) ph->utid = 0; if (pn->aligned) { ph->message_id = PNS_PIPE_ALIGNED_DATA; - ph->data[0] = 0; /* padding */ + ph->data0 = 0; /* padding */ } else ph->message_id = PNS_PIPE_DATA; ph->pipe_handle = pn->pipe_handle; From f1071c3e2473ae19a7f5d892a187c4cab1a61f2e Mon Sep 17 00:00:00 2001 From: Gilad Ben-Yossef Date: Mon, 11 Feb 2019 16:27:58 +0200 Subject: [PATCH 169/298] crypto: ccree - add missing inline qualifier Commit 1358c13a48c4 ("crypto: ccree - fix resume race condition on init") was missing a "inline" qualifier for stub function used when CONFIG_PM is not set causing a build warning. Fixes: 1358c13a48c4 ("crypto: ccree - fix resume race condition on init") Cc: stable@kernel.org # v4.20 Signed-off-by: Gilad Ben-Yossef Acked-by: Geert Uytterhoeven Signed-off-by: Herbert Xu --- drivers/crypto/ccree/cc_pm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/crypto/ccree/cc_pm.h b/drivers/crypto/ccree/cc_pm.h index f62624357020..907a6db4d6c0 100644 --- a/drivers/crypto/ccree/cc_pm.h +++ b/drivers/crypto/ccree/cc_pm.h @@ -30,7 +30,7 @@ static inline int cc_pm_init(struct cc_drvdata *drvdata) return 0; } -static void cc_pm_go(struct cc_drvdata *drvdata) {} +static inline void cc_pm_go(struct cc_drvdata *drvdata) {} static inline void cc_pm_fini(struct cc_drvdata *drvdata) {} From 69216a545cf81b2b32d01948f7039315abaf75a0 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Sat, 16 Feb 2019 14:51:25 +0100 Subject: [PATCH 170/298] crypto: sha256/arm - fix crash bug in Thumb2 build The SHA256 code we adopted from the OpenSSL project uses a rather peculiar way to take the address of the round constant table: it takes the address of the sha256_block_data_order() routine, and substracts a constant known quantity to arrive at the base of the table, which is emitted by the same assembler code right before the routine's entry point. However, recent versions of binutils have helpfully changed the behavior of references emitted via an ADR instruction when running in Thumb2 mode: it now takes the Thumb execution mode bit into account, which is bit 0 af the address. This means the produced table address also has bit 0 set, and so we end up with an address value pointing 1 byte past the start of the table, which results in crashes such as Unable to handle kernel paging request at virtual address bf825000 pgd = 42f44b11 [bf825000] *pgd=80000040206003, *pmd=5f1bd003, *pte=00000000 Internal error: Oops: 207 [#1] PREEMPT SMP THUMB2 Modules linked in: sha256_arm(+) sha1_arm_ce sha1_arm ... CPU: 7 PID: 396 Comm: cryptomgr_test Not tainted 5.0.0-rc6+ #144 Hardware name: QEMU KVM Virtual Machine, BIOS 0.0.0 02/06/2015 PC is at sha256_block_data_order+0xaaa/0xb30 [sha256_arm] LR is at __this_module+0x17fd/0xffffe800 [sha256_arm] pc : [] lr : [] psr: 800b0033 sp : ebc8bbe8 ip : faaabe1c fp : 2fdd3433 r10: 4c5f1692 r9 : e43037df r8 : b04b0a5a r7 : c369d722 r6 : 39c3693e r5 : 7a013189 r4 : 1580d26b r3 : 8762a9b0 r2 : eea9c2cd r1 : 3e9ab536 r0 : 1dea4ae7 Flags: Nzcv IRQs on FIQs on Mode SVC_32 ISA Thumb Segment user Control: 70c5383d Table: 6b8467c0 DAC: dbadc0de Process cryptomgr_test (pid: 396, stack limit = 0x69e1fe23) Stack: (0xebc8bbe8 to 0xebc8c000) ... unwind: Unknown symbol address bf820bca unwind: Index not found bf820bca Code: 441a ea80 40f9 440a (f85e) 3b04 ---[ end trace e560cce92700ef8a ]--- Given that this affects older kernels as well, in case they are built with a recent toolchain, apply a minimal backportable fix, which is to emit another non-code label at the start of the routine, and reference that instead. (This is similar to the current upstream state of this file in OpenSSL) Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm/crypto/sha256-armv4.pl | 3 ++- arch/arm/crypto/sha256-core.S_shipped | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/arm/crypto/sha256-armv4.pl b/arch/arm/crypto/sha256-armv4.pl index b9ec44060ed3..a03cf4dfb781 100644 --- a/arch/arm/crypto/sha256-armv4.pl +++ b/arch/arm/crypto/sha256-armv4.pl @@ -212,10 +212,11 @@ K256: .global sha256_block_data_order .type sha256_block_data_order,%function sha256_block_data_order: +.Lsha256_block_data_order: #if __ARM_ARCH__<7 sub r3,pc,#8 @ sha256_block_data_order #else - adr r3,sha256_block_data_order + adr r3,.Lsha256_block_data_order #endif #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) ldr r12,.LOPENSSL_armcap diff --git a/arch/arm/crypto/sha256-core.S_shipped b/arch/arm/crypto/sha256-core.S_shipped index 3b58300d611c..054aae0edfce 100644 --- a/arch/arm/crypto/sha256-core.S_shipped +++ b/arch/arm/crypto/sha256-core.S_shipped @@ -93,10 +93,11 @@ K256: .global sha256_block_data_order .type sha256_block_data_order,%function sha256_block_data_order: +.Lsha256_block_data_order: #if __ARM_ARCH__<7 sub r3,pc,#8 @ sha256_block_data_order #else - adr r3,sha256_block_data_order + adr r3,.Lsha256_block_data_order #endif #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) ldr r12,.LOPENSSL_armcap From c64316502008064c158fa40cc250665e461b0f2a Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Sat, 16 Feb 2019 14:51:26 +0100 Subject: [PATCH 171/298] crypto: sha512/arm - fix crash bug in Thumb2 build The SHA512 code we adopted from the OpenSSL project uses a rather peculiar way to take the address of the round constant table: it takes the address of the sha256_block_data_order() routine, and substracts a constant known quantity to arrive at the base of the table, which is emitted by the same assembler code right before the routine's entry point. However, recent versions of binutils have helpfully changed the behavior of references emitted via an ADR instruction when running in Thumb2 mode: it now takes the Thumb execution mode bit into account, which is bit 0 af the address. This means the produced table address also has bit 0 set, and so we end up with an address value pointing 1 byte past the start of the table, which results in crashes such as Unable to handle kernel paging request at virtual address bf825000 pgd = 42f44b11 [bf825000] *pgd=80000040206003, *pmd=5f1bd003, *pte=00000000 Internal error: Oops: 207 [#1] PREEMPT SMP THUMB2 Modules linked in: sha256_arm(+) sha1_arm_ce sha1_arm ... CPU: 7 PID: 396 Comm: cryptomgr_test Not tainted 5.0.0-rc6+ #144 Hardware name: QEMU KVM Virtual Machine, BIOS 0.0.0 02/06/2015 PC is at sha256_block_data_order+0xaaa/0xb30 [sha256_arm] LR is at __this_module+0x17fd/0xffffe800 [sha256_arm] pc : [] lr : [] psr: 800b0033 sp : ebc8bbe8 ip : faaabe1c fp : 2fdd3433 r10: 4c5f1692 r9 : e43037df r8 : b04b0a5a r7 : c369d722 r6 : 39c3693e r5 : 7a013189 r4 : 1580d26b r3 : 8762a9b0 r2 : eea9c2cd r1 : 3e9ab536 r0 : 1dea4ae7 Flags: Nzcv IRQs on FIQs on Mode SVC_32 ISA Thumb Segment user Control: 70c5383d Table: 6b8467c0 DAC: dbadc0de Process cryptomgr_test (pid: 396, stack limit = 0x69e1fe23) Stack: (0xebc8bbe8 to 0xebc8c000) ... unwind: Unknown symbol address bf820bca unwind: Index not found bf820bca Code: 441a ea80 40f9 440a (f85e) 3b04 ---[ end trace e560cce92700ef8a ]--- Given that this affects older kernels as well, in case they are built with a recent toolchain, apply a minimal backportable fix, which is to emit another non-code label at the start of the routine, and reference that instead. (This is similar to the current upstream state of this file in OpenSSL) Signed-off-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm/crypto/sha512-armv4.pl | 3 ++- arch/arm/crypto/sha512-core.S_shipped | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/arm/crypto/sha512-armv4.pl b/arch/arm/crypto/sha512-armv4.pl index fb5d15048c0b..788c17b56ecc 100644 --- a/arch/arm/crypto/sha512-armv4.pl +++ b/arch/arm/crypto/sha512-armv4.pl @@ -274,10 +274,11 @@ WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817) .global sha512_block_data_order .type sha512_block_data_order,%function sha512_block_data_order: +.Lsha512_block_data_order: #if __ARM_ARCH__<7 sub r3,pc,#8 @ sha512_block_data_order #else - adr r3,sha512_block_data_order + adr r3,.Lsha512_block_data_order #endif #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) ldr r12,.LOPENSSL_armcap diff --git a/arch/arm/crypto/sha512-core.S_shipped b/arch/arm/crypto/sha512-core.S_shipped index b1c334a49cda..710ea309769e 100644 --- a/arch/arm/crypto/sha512-core.S_shipped +++ b/arch/arm/crypto/sha512-core.S_shipped @@ -141,10 +141,11 @@ WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817) .global sha512_block_data_order .type sha512_block_data_order,%function sha512_block_data_order: +.Lsha512_block_data_order: #if __ARM_ARCH__<7 sub r3,pc,#8 @ sha512_block_data_order #else - adr r3,sha512_block_data_order + adr r3,.Lsha512_block_data_order #endif #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) ldr r12,.LOPENSSL_armcap From 17407715240456448e4989bee46ffc93991add83 Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Tue, 19 Feb 2019 13:12:40 +0800 Subject: [PATCH 172/298] mac80211_hwsim: propagate genlmsg_reply return code genlmsg_reply can fail, so propagate its return code Signed-off-by: Li RongQing Signed-off-by: Johannes Berg --- drivers/net/wireless/mac80211_hwsim.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/mac80211_hwsim.c b/drivers/net/wireless/mac80211_hwsim.c index 320edcac4699..6359053bd0c7 100644 --- a/drivers/net/wireless/mac80211_hwsim.c +++ b/drivers/net/wireless/mac80211_hwsim.c @@ -3554,7 +3554,7 @@ static int hwsim_get_radio_nl(struct sk_buff *msg, struct genl_info *info) goto out_err; } - genlmsg_reply(skb, info); + res = genlmsg_reply(skb, info); break; } From 5c14a4d05f68415af9e41a4e667d1748d41d1baf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Thu, 21 Feb 2019 18:29:36 +0100 Subject: [PATCH 173/298] mac80211: Change default tx_sk_pacing_shift to 7 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When we did the original tests for the optimal value of sk_pacing_shift, we came up with 6 ms of buffering as the default. Sadly, 6 is not a power of two, so when picking the shift value I erred on the size of less buffering and picked 4 ms instead of 8. This was probably wrong; those 2 ms of extra buffering makes a larger difference than I thought. So, change the default pacing shift to 7, which corresponds to 8 ms of buffering. The point of diminishing returns really kicks in after 8 ms, and so having this as a default should cut down on the need for extensive per-device testing and overrides needed in the drivers. Cc: stable@vger.kernel.org Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Johannes Berg --- net/mac80211/main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 87a729926734..977dea436ee8 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -615,13 +615,13 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len, * We need a bit of data queued to build aggregates properly, so * instruct the TCP stack to allow more than a single ms of data * to be queued in the stack. The value is a bit-shift of 1 - * second, so 8 is ~4ms of queued data. Only affects local TCP + * second, so 7 is ~8ms of queued data. Only affects local TCP * sockets. * This is the default, anyhow - drivers may need to override it * for local reasons (longer buffers, longer completion time, or * similar). */ - local->hw.tx_sk_pacing_shift = 8; + local->hw.tx_sk_pacing_shift = 7; /* set up some defaults */ local->hw.queues = 1; From 51d0af222f6fa43134c6187ab4f374630f6e0d96 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Fri, 22 Feb 2019 13:21:15 +0100 Subject: [PATCH 174/298] mac80211: allocate tailroom for forwarded mesh packets Forwarded packets enter the tx path through ieee80211_add_pending_skb, which skips the ieee80211_skb_resize call. Fixes WARN_ON in ccmp_encrypt_skb and resulting packet loss. Cc: stable@vger.kernel.org Signed-off-by: Felix Fietkau Signed-off-by: Johannes Berg --- net/mac80211/rx.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index bb4d71efb6fb..c2a6da5d80da 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -2644,6 +2644,7 @@ ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx) struct ieee80211_sub_if_data *sdata = rx->sdata; struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; u16 ac, q, hdrlen; + int tailroom = 0; hdr = (struct ieee80211_hdr *) skb->data; hdrlen = ieee80211_hdrlen(hdr->frame_control); @@ -2732,8 +2733,12 @@ ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx) if (!ifmsh->mshcfg.dot11MeshForwarding) goto out; + if (sdata->crypto_tx_tailroom_needed_cnt) + tailroom = IEEE80211_ENCRYPT_TAILROOM; + fwd_skb = skb_copy_expand(skb, local->tx_headroom + - sdata->encrypt_headroom, 0, GFP_ATOMIC); + sdata->encrypt_headroom, + tailroom, GFP_ATOMIC); if (!fwd_skb) goto out; From 7c0cdf0b3940f63d9777c3fcf250a2f83859ca54 Mon Sep 17 00:00:00 2001 From: Alban Crequy Date: Fri, 22 Feb 2019 14:19:08 +0100 Subject: [PATCH 175/298] bpf, lpm: fix lookup bug in map_delete_elem trie_delete_elem() was deleting an entry even though it was not matching if the prefixlen was correct. This patch adds a check on matchlen. Reproducer: $ sudo bpftool map create /sys/fs/bpf/mylpm type lpm_trie key 8 value 1 entries 128 name mylpm flags 1 $ sudo bpftool map update pinned /sys/fs/bpf/mylpm key hex 10 00 00 00 aa bb cc dd value hex 01 $ sudo bpftool map dump pinned /sys/fs/bpf/mylpm key: 10 00 00 00 aa bb cc dd value: 01 Found 1 element $ sudo bpftool map delete pinned /sys/fs/bpf/mylpm key hex 10 00 00 00 ff ff ff ff $ echo $? 0 $ sudo bpftool map dump pinned /sys/fs/bpf/mylpm Found 0 elements A similar reproducer is added in the selftests. Without the patch: $ sudo ./tools/testing/selftests/bpf/test_lpm_map test_lpm_map: test_lpm_map.c:485: test_lpm_delete: Assertion `bpf_map_delete_elem(map_fd, key) == -1 && errno == ENOENT' failed. Aborted With the patch: test_lpm_map runs without errors. Fixes: e454cf595853 ("bpf: Implement map_delete_elem for BPF_MAP_TYPE_LPM_TRIE") Cc: Craig Gallek Signed-off-by: Alban Crequy Acked-by: Craig Gallek Signed-off-by: Daniel Borkmann --- kernel/bpf/lpm_trie.c | 1 + tools/testing/selftests/bpf/test_lpm_map.c | 10 ++++++++++ 2 files changed, 11 insertions(+) diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index abf1002080df..93a5cbbde421 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -471,6 +471,7 @@ static int trie_delete_elem(struct bpf_map *map, void *_key) } if (!node || node->prefixlen != key->prefixlen || + node->prefixlen != matchlen || (node->flags & LPM_TREE_NODE_FLAG_IM)) { ret = -ENOENT; goto out; diff --git a/tools/testing/selftests/bpf/test_lpm_map.c b/tools/testing/selftests/bpf/test_lpm_map.c index 147e34cfceb7..02d7c871862a 100644 --- a/tools/testing/selftests/bpf/test_lpm_map.c +++ b/tools/testing/selftests/bpf/test_lpm_map.c @@ -474,6 +474,16 @@ static void test_lpm_delete(void) assert(bpf_map_lookup_elem(map_fd, key, &value) == -1 && errno == ENOENT); + key->prefixlen = 30; // unused prefix so far + inet_pton(AF_INET, "192.255.0.0", key->data); + assert(bpf_map_delete_elem(map_fd, key) == -1 && + errno == ENOENT); + + key->prefixlen = 16; // same prefix as the root node + inet_pton(AF_INET, "192.255.0.0", key->data); + assert(bpf_map_delete_elem(map_fd, key) == -1 && + errno == ENOENT); + /* assert initial lookup */ key->prefixlen = 32; inet_pton(AF_INET, "192.168.0.1", key->data); From cc1780fc42c76c705dd07ea123f1143dc5057630 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 20 Feb 2019 13:32:11 +0000 Subject: [PATCH 176/298] KEYS: user: Align the payload buffer Align the payload of "user" and "logon" keys so that users of the keyrings service can access it as a struct that requires more than 2-byte alignment. fscrypt currently does this which results in the read of fscrypt_key::size being misaligned as it needs 4-byte alignment. Align to __alignof__(u64) rather than __alignof__(long) since in the future it's conceivable that people would use structs beginning with u64, which on some platforms would require more than 'long' alignment. Reported-by: Aaro Koskinen Fixes: 2aa349f6e37c ("[PATCH] Keys: Export user-defined keyring operations") Fixes: 88bd6ccdcdd6 ("ext4 crypto: add encryption key management facilities") Cc: stable@vger.kernel.org Signed-off-by: Eric Biggers Tested-by: Aaro Koskinen Signed-off-by: David Howells Signed-off-by: James Morris --- include/keys/user-type.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/keys/user-type.h b/include/keys/user-type.h index e098cbe27db5..12babe991594 100644 --- a/include/keys/user-type.h +++ b/include/keys/user-type.h @@ -31,7 +31,7 @@ struct user_key_payload { struct rcu_head rcu; /* RCU destructor */ unsigned short datalen; /* length of this data */ - char data[0]; /* actual data */ + char data[0] __aligned(__alignof__(u64)); /* actual data */ }; extern struct key_type key_type_user; From ede0fa98a900e657d1fcd80b50920efc896c1a4c Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 22 Feb 2019 15:36:18 +0000 Subject: [PATCH 177/298] KEYS: always initialize keyring_index_key::desc_len syzbot hit the 'BUG_ON(index_key->desc_len == 0);' in __key_link_begin() called from construct_alloc_key() during sys_request_key(), because the length of the key description was never calculated. The problem is that we rely on ->desc_len being initialized by search_process_keyrings(), specifically by search_nested_keyrings(). But, if the process isn't subscribed to any keyrings that never happens. Fix it by always initializing keyring_index_key::desc_len as soon as the description is set, like we already do in some places. The following program reproduces the BUG_ON() when it's run as root and no session keyring has been installed. If it doesn't work, try removing pam_keyinit.so from /etc/pam.d/login and rebooting. #include #include #include int main(void) { int id = add_key("keyring", "syz", NULL, 0, KEY_SPEC_USER_KEYRING); keyctl_setperm(id, KEY_OTH_WRITE); setreuid(5000, 5000); request_key("user", "desc", "", id); } Reported-by: syzbot+ec24e95ea483de0a24da@syzkaller.appspotmail.com Fixes: b2a4df200d57 ("KEYS: Expand the capacity of a keyring") Signed-off-by: Eric Biggers Signed-off-by: David Howells Cc: stable@vger.kernel.org Signed-off-by: James Morris --- security/keys/keyring.c | 4 +--- security/keys/proc.c | 3 +-- security/keys/request_key.c | 1 + security/keys/request_key_auth.c | 2 +- 4 files changed, 4 insertions(+), 6 deletions(-) diff --git a/security/keys/keyring.c b/security/keys/keyring.c index eadebb92986a..f81372f53dd7 100644 --- a/security/keys/keyring.c +++ b/security/keys/keyring.c @@ -661,9 +661,6 @@ static bool search_nested_keyrings(struct key *keyring, BUG_ON((ctx->flags & STATE_CHECKS) == 0 || (ctx->flags & STATE_CHECKS) == STATE_CHECKS); - if (ctx->index_key.description) - ctx->index_key.desc_len = strlen(ctx->index_key.description); - /* Check to see if this top-level keyring is what we are looking for * and whether it is valid or not. */ @@ -914,6 +911,7 @@ key_ref_t keyring_search(key_ref_t keyring, struct keyring_search_context ctx = { .index_key.type = type, .index_key.description = description, + .index_key.desc_len = strlen(description), .cred = current_cred(), .match_data.cmp = key_default_cmp, .match_data.raw_data = description, diff --git a/security/keys/proc.c b/security/keys/proc.c index d2b802072693..78ac305d715e 100644 --- a/security/keys/proc.c +++ b/security/keys/proc.c @@ -165,8 +165,7 @@ static int proc_keys_show(struct seq_file *m, void *v) int rc; struct keyring_search_context ctx = { - .index_key.type = key->type, - .index_key.description = key->description, + .index_key = key->index_key, .cred = m->file->f_cred, .match_data.cmp = lookup_user_key_possessed, .match_data.raw_data = key, diff --git a/security/keys/request_key.c b/security/keys/request_key.c index 3f56a312dd35..7a0c6b666ff0 100644 --- a/security/keys/request_key.c +++ b/security/keys/request_key.c @@ -531,6 +531,7 @@ struct key *request_key_and_link(struct key_type *type, struct keyring_search_context ctx = { .index_key.type = type, .index_key.description = description, + .index_key.desc_len = strlen(description), .cred = current_cred(), .match_data.cmp = key_default_cmp, .match_data.raw_data = description, diff --git a/security/keys/request_key_auth.c b/security/keys/request_key_auth.c index afc304e8b61e..bda6201c6c45 100644 --- a/security/keys/request_key_auth.c +++ b/security/keys/request_key_auth.c @@ -247,7 +247,7 @@ struct key *key_get_instantiation_authkey(key_serial_t target_id) struct key *authkey; key_ref_t authkey_ref; - sprintf(description, "%x", target_id); + ctx.index_key.desc_len = sprintf(description, "%x", target_id); authkey_ref = search_process_keyrings(&ctx); From ad7dc69aeb23138cc23c406cac25003b97e8ee17 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Fri, 22 Feb 2019 17:45:01 +0100 Subject: [PATCH 178/298] x86/kvm/mmu: fix switch between root and guest MMUs Commit 14c07ad89f4d ("x86/kvm/mmu: introduce guest_mmu") brought one subtle change: previously, when switching back from L2 to L1, we were resetting MMU hooks (like mmu->get_cr3()) in kvm_init_mmu() called from nested_vmx_load_cr3() and now we do that in nested_ept_uninit_mmu_context() when we re-target vcpu->arch.mmu pointer. The change itself looks logical: if nested_ept_init_mmu_context() changes something than nested_ept_uninit_mmu_context() restores it back. There is, however, one thing: the following call chain: nested_vmx_load_cr3() kvm_mmu_new_cr3() __kvm_mmu_new_cr3() fast_cr3_switch() cached_root_available() now happens with MMU hooks pointing to the new MMU (root MMU in our case) while previously it was happening with the old one. cached_root_available() tries to stash current root but it is incorrect to read current CR3 with mmu->get_cr3(), we need to use old_mmu->get_cr3() which in case we're switching from L2 to L1 is guest_mmu. (BTW, in shadow page tables case this is a non-issue because we don't switch MMU). While we could've tried to guess that we're switching between MMUs and call the right ->get_cr3() from cached_root_available() this seems to be overly complicated. Instead, just stash the corresponding CR3 when setting root_hpa and make cached_root_available() use the stashed value. Fixes: 14c07ad89f4d ("x86/kvm/mmu: introduce guest_mmu") Signed-off-by: Vitaly Kuznetsov Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/mmu.c | 17 +++++++++++++---- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 4660ce90de7f..593e17b7797e 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -397,6 +397,7 @@ struct kvm_mmu { void (*update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *spte, const void *pte); hpa_t root_hpa; + gpa_t root_cr3; union kvm_mmu_role mmu_role; u8 root_level; u8 shadow_root_level; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index da9c42349b1f..6e62ed3852ac 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3555,6 +3555,7 @@ void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, &invalid_list); mmu->root_hpa = INVALID_PAGE; } + mmu->root_cr3 = 0; } kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); @@ -3610,6 +3611,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root); } else BUG(); + vcpu->arch.mmu->root_cr3 = vcpu->arch.mmu->get_cr3(vcpu); return 0; } @@ -3618,10 +3620,11 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) { struct kvm_mmu_page *sp; u64 pdptr, pm_mask; - gfn_t root_gfn; + gfn_t root_gfn, root_cr3; int i; - root_gfn = vcpu->arch.mmu->get_cr3(vcpu) >> PAGE_SHIFT; + root_cr3 = vcpu->arch.mmu->get_cr3(vcpu); + root_gfn = root_cr3 >> PAGE_SHIFT; if (mmu_check_root(vcpu, root_gfn)) return 1; @@ -3646,7 +3649,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) ++sp->root_count; spin_unlock(&vcpu->kvm->mmu_lock); vcpu->arch.mmu->root_hpa = root; - return 0; + goto set_root_cr3; } /* @@ -3712,6 +3715,9 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->lm_root); } +set_root_cr3: + vcpu->arch.mmu->root_cr3 = root_cr3; + return 0; } @@ -4163,7 +4169,7 @@ static bool cached_root_available(struct kvm_vcpu *vcpu, gpa_t new_cr3, struct kvm_mmu_root_info root; struct kvm_mmu *mmu = vcpu->arch.mmu; - root.cr3 = mmu->get_cr3(vcpu); + root.cr3 = mmu->root_cr3; root.hpa = mmu->root_hpa; for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { @@ -4176,6 +4182,7 @@ static bool cached_root_available(struct kvm_vcpu *vcpu, gpa_t new_cr3, } mmu->root_hpa = root.hpa; + mmu->root_cr3 = root.cr3; return i < KVM_MMU_NUM_PREV_ROOTS; } @@ -5516,11 +5523,13 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu) vcpu->arch.walk_mmu = &vcpu->arch.root_mmu; vcpu->arch.root_mmu.root_hpa = INVALID_PAGE; + vcpu->arch.root_mmu.root_cr3 = 0; vcpu->arch.root_mmu.translate_gpa = translate_gpa; for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) vcpu->arch.root_mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID; vcpu->arch.guest_mmu.root_hpa = INVALID_PAGE; + vcpu->arch.guest_mmu.root_cr3 = 0; vcpu->arch.guest_mmu.translate_gpa = translate_gpa; for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) vcpu->arch.guest_mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID; From 511da98d207d5c0675a10351b01e37cbe50a79e5 Mon Sep 17 00:00:00 2001 From: Yu Zhang Date: Fri, 1 Feb 2019 00:09:43 +0800 Subject: [PATCH 179/298] kvm: x86: Return LA57 feature based on hardware capability Previously, 'commit 372fddf70904 ("x86/mm: Introduce the 'no5lvl' kernel parameter")' cleared X86_FEATURE_LA57 in boot_cpu_data, if Linux chooses to not run in 5-level paging mode. Yet boot_cpu_data is queried by do_cpuid_ent() as the host capability later when creating vcpus, and Qemu will not be able to detect this feature and create VMs with LA57 feature. As discussed earlier, VMs can still benefit from extended linear address width, e.g. to enhance features like ASLR. So we would like to fix this, by return the true hardware capability when Qemu queries. Signed-off-by: Yu Zhang Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index bbffa6c54697..c07958b59f50 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -335,6 +335,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, unsigned f_xsaves = kvm_x86_ops->xsaves_supported() ? F(XSAVES) : 0; unsigned f_umip = kvm_x86_ops->umip_emulated() ? F(UMIP) : 0; unsigned f_intel_pt = kvm_x86_ops->pt_supported() ? F(INTEL_PT) : 0; + unsigned f_la57 = 0; /* cpuid 1.edx */ const u32 kvm_cpuid_1_edx_x86_features = @@ -489,7 +490,10 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, // TSC_ADJUST is emulated entry->ebx |= F(TSC_ADJUST); entry->ecx &= kvm_cpuid_7_0_ecx_x86_features; + f_la57 = entry->ecx & F(LA57); cpuid_mask(&entry->ecx, CPUID_7_ECX); + /* Set LA57 based on hardware capability. */ + entry->ecx |= f_la57; entry->ecx |= f_umip; /* PKU is not yet implemented for shadow paging. */ if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE)) From de3ccd26fafc707b09792d9b633c8b5b48865315 Mon Sep 17 00:00:00 2001 From: Yu Zhang Date: Fri, 1 Feb 2019 00:09:23 +0800 Subject: [PATCH 180/298] KVM: MMU: record maximum physical address width in kvm_mmu_extended_role Previously, commit 7dcd57552008 ("x86/kvm/mmu: check if tdp/shadow MMU reconfiguration is needed") offered some optimization to avoid the unnecessary reconfiguration. Yet one scenario is broken - when cpuid changes VM's maximum physical address width, reconfiguration is needed to reset the reserved bits. Also, the TDP may need to reset its shadow_root_level when this value is changed. To fix this, a new field, maxphyaddr, is introduced in the extended role structure to keep track of the configured guest physical address width. Signed-off-by: Yu Zhang Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/mmu.c | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 593e17b7797e..180373360e34 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -299,6 +299,7 @@ union kvm_mmu_extended_role { unsigned int cr4_smap:1; unsigned int cr4_smep:1; unsigned int cr4_la57:1; + unsigned int maxphyaddr:6; }; }; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 6e62ed3852ac..f2d1d230d5b8 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -4777,6 +4777,7 @@ static union kvm_mmu_extended_role kvm_calc_mmu_role_ext(struct kvm_vcpu *vcpu) ext.cr4_pse = !!is_pse(vcpu); ext.cr4_pke = !!kvm_read_cr4_bits(vcpu, X86_CR4_PKE); ext.cr4_la57 = !!kvm_read_cr4_bits(vcpu, X86_CR4_LA57); + ext.maxphyaddr = cpuid_maxphyaddr(vcpu); ext.valid = 1; From d1f20c03f48102e52eb98b8651d129b83134cae4 Mon Sep 17 00:00:00 2001 From: Maciej Kwiecien Date: Fri, 22 Feb 2019 09:45:26 +0100 Subject: [PATCH 181/298] sctp: don't compare hb_timer expire date before starting it hb_timer might not start at all for a particular transport because its start is conditional. In a result a node is not sending heartbeats. Function sctp_transport_reset_hb_timer has two roles: - initial start of hb_timer for a given transport, - update expire date of hb_timer for a given transport. The function is optimized to update timer's expire only if it is before a new calculated one but this comparison is invalid for a timer which has not yet started. Such a timer has expire == 0 and if a new expire value is bigger than (MAX_JIFFIES / 2 + 2) then "time_before" macro will fail and timer will not start resulting in no heartbeat packets send by the node. This was found when association was initialized within first 5 mins after system boot due to jiffies init value which is near to MAX_JIFFIES. Test kernel version: 4.9.154 (ARCH=arm) hb_timer.expire = 0; //initialized, not started timer new_expire = MAX_JIFFIES / 2 + 2; //or more time_before(hb_timer.expire, new_expire) == false Fixes: ba6f5e33bdbb ("sctp: avoid refreshing heartbeat timer too often") Reported-by: Marcin Stojek Tested-by: Marcin Stojek Signed-off-by: Maciej Kwiecien Reviewed-by: Alexander Sverdlin Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/transport.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/sctp/transport.c b/net/sctp/transport.c index 033696e6f74f..ad158d311ffa 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -207,7 +207,8 @@ void sctp_transport_reset_hb_timer(struct sctp_transport *transport) /* When a data chunk is sent, reset the heartbeat interval. */ expires = jiffies + sctp_transport_timeout(transport); - if (time_before(transport->hb_timer.expires, expires) && + if ((time_before(transport->hb_timer.expires, expires) || + !timer_pending(&transport->hb_timer)) && !mod_timer(&transport->hb_timer, expires + prandom_u32_max(transport->rto))) sctp_transport_hold(transport); From 7cc9f7003a969d359f608ebb701d42cafe75b84a Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 20 Feb 2019 00:15:30 +0100 Subject: [PATCH 182/298] ipvlan: disallow userns cap_net_admin to change global mode/flags When running Docker with userns isolation e.g. --userns-remap="default" and spawning up some containers with CAP_NET_ADMIN under this realm, I noticed that link changes on ipvlan slave device inside that container can affect all devices from this ipvlan group which are in other net namespaces where the container should have no permission to make changes to, such as the init netns, for example. This effectively allows to undo ipvlan private mode and switch globally to bridge mode where slaves can communicate directly without going through hostns, or it allows to switch between global operation mode (l2/l3/l3s) for everyone bound to the given ipvlan master device. libnetwork plugin here is creating an ipvlan master and ipvlan slave in hostns and a slave each that is moved into the container's netns upon creation event. * In hostns: # ip -d a [...] 8: cilium_host@bond0: mtu 1500 qdisc noqueue state UNKNOWN group default qlen 1000 link/ether 0c:c4:7a:e1:3d:cc brd ff:ff:ff:ff:ff:ff promiscuity 0 minmtu 68 maxmtu 65535 ipvlan mode l3 bridge numtxqueues 1 numrxqueues 1 gso_max_size 65536 gso_max_segs 65535 inet 10.41.0.1/32 scope link cilium_host valid_lft forever preferred_lft forever [...] * Spawn container & change ipvlan mode setting inside of it: # docker run -dt --cap-add=NET_ADMIN --network cilium-net --name client -l app=test cilium/netperf 9fff485d69dcb5ce37c9e33ca20a11ccafc236d690105aadbfb77e4f4170879c # docker exec -ti client ip -d a [...] 10: cilium0@if4: mtu 1500 qdisc noqueue state UNKNOWN group default qlen 1000 link/ether 0c:c4:7a:e1:3d:cc brd ff:ff:ff:ff:ff:ff promiscuity 0 minmtu 68 maxmtu 65535 ipvlan mode l3 bridge numtxqueues 1 numrxqueues 1 gso_max_size 65536 gso_max_segs 65535 inet 10.41.197.43/32 brd 10.41.197.43 scope global cilium0 valid_lft forever preferred_lft forever # docker exec -ti client ip link change link cilium0 name cilium0 type ipvlan mode l2 # docker exec -ti client ip -d a [...] 10: cilium0@if4: mtu 1500 qdisc noqueue state UNKNOWN group default qlen 1000 link/ether 0c:c4:7a:e1:3d:cc brd ff:ff:ff:ff:ff:ff promiscuity 0 minmtu 68 maxmtu 65535 ipvlan mode l2 bridge numtxqueues 1 numrxqueues 1 gso_max_size 65536 gso_max_segs 65535 inet 10.41.197.43/32 brd 10.41.197.43 scope global cilium0 valid_lft forever preferred_lft forever * In hostns (mode switched to l2): # ip -d a [...] 8: cilium_host@bond0: mtu 1500 qdisc noqueue state UNKNOWN group default qlen 1000 link/ether 0c:c4:7a:e1:3d:cc brd ff:ff:ff:ff:ff:ff promiscuity 0 minmtu 68 maxmtu 65535 ipvlan mode l2 bridge numtxqueues 1 numrxqueues 1 gso_max_size 65536 gso_max_segs 65535 inet 10.41.0.1/32 scope link cilium_host valid_lft forever preferred_lft forever [...] Same l3 -> l2 switch would also happen by creating another slave inside the container's network namespace when specifying the existing cilium0 link to derive the actual (bond0) master: # docker exec -ti client ip link add link cilium0 name cilium1 type ipvlan mode l2 # docker exec -ti client ip -d a [...] 2: cilium1@if4: mtu 1500 qdisc noop state DOWN group default qlen 1000 link/ether 0c:c4:7a:e1:3d:cc brd ff:ff:ff:ff:ff:ff promiscuity 0 minmtu 68 maxmtu 65535 ipvlan mode l2 bridge numtxqueues 1 numrxqueues 1 gso_max_size 65536 gso_max_segs 65535 10: cilium0@if4: mtu 1500 qdisc noqueue state UNKNOWN group default qlen 1000 link/ether 0c:c4:7a:e1:3d:cc brd ff:ff:ff:ff:ff:ff promiscuity 0 minmtu 68 maxmtu 65535 ipvlan mode l2 bridge numtxqueues 1 numrxqueues 1 gso_max_size 65536 gso_max_segs 65535 inet 10.41.197.43/32 brd 10.41.197.43 scope global cilium0 valid_lft forever preferred_lft forever * In hostns: # ip -d a [...] 8: cilium_host@bond0: mtu 1500 qdisc noqueue state UNKNOWN group default qlen 1000 link/ether 0c:c4:7a:e1:3d:cc brd ff:ff:ff:ff:ff:ff promiscuity 0 minmtu 68 maxmtu 65535 ipvlan mode l2 bridge numtxqueues 1 numrxqueues 1 gso_max_size 65536 gso_max_segs 65535 inet 10.41.0.1/32 scope link cilium_host valid_lft forever preferred_lft forever [...] One way to mitigate it is to check CAP_NET_ADMIN permissions of the ipvlan master device's ns, and only then allow to change mode or flags for all devices bound to it. Above two cases are then disallowed after the patch. Signed-off-by: Daniel Borkmann Acked-by: Mahesh Bandewar Signed-off-by: David S. Miller --- drivers/net/ipvlan/ipvlan_main.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c index 7cdac77d0c68..07e41c42bcf5 100644 --- a/drivers/net/ipvlan/ipvlan_main.c +++ b/drivers/net/ipvlan/ipvlan_main.c @@ -499,6 +499,8 @@ static int ipvlan_nl_changelink(struct net_device *dev, if (!data) return 0; + if (!ns_capable(dev_net(ipvlan->phy_dev)->user_ns, CAP_NET_ADMIN)) + return -EPERM; if (data[IFLA_IPVLAN_MODE]) { u16 nmode = nla_get_u16(data[IFLA_IPVLAN_MODE]); @@ -601,6 +603,8 @@ int ipvlan_link_new(struct net *src_net, struct net_device *dev, struct ipvl_dev *tmp = netdev_priv(phy_dev); phy_dev = tmp->phy_dev; + if (!ns_capable(dev_net(phy_dev)->user_ns, CAP_NET_ADMIN)) + return -EPERM; } else if (!netif_is_ipvlan_port(phy_dev)) { /* Exit early if the underlying link is invalid or busy */ if (phy_dev->type != ARPHRD_ETHER || From c286909fe5458f69e533c845b757fd2c35064d26 Mon Sep 17 00:00:00 2001 From: David Chen Date: Wed, 20 Feb 2019 13:47:19 +0800 Subject: [PATCH 183/298] r8152: Fix an error on RTL8153-BD MAC Address Passthrough support RTL8153-BD is used in Dell DA300 type-C dongle. Added RTL8153-BD support to activate MAC address pass through on DA300. Apply correction on previously submitted patch in net.git tree. Signed-off-by: David Chen Signed-off-by: David S. Miller --- drivers/net/usb/r8152.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c index ada6baf8847a..86c8c64fbb0f 100644 --- a/drivers/net/usb/r8152.c +++ b/drivers/net/usb/r8152.c @@ -1179,7 +1179,7 @@ static int vendor_mac_passthru_addr_read(struct r8152 *tp, struct sockaddr *sa) } else { /* test for RTL8153-BND and RTL8153-BD */ ocp_data = ocp_read_byte(tp, MCU_TYPE_USB, USB_MISC_1); - if ((ocp_data & BND_MASK) == 0 && (ocp_data & BD_MASK)) { + if ((ocp_data & BND_MASK) == 0 && (ocp_data & BD_MASK) == 0) { netif_dbg(tp, probe, tp->netdev, "Invalid variant for MAC pass through\n"); return -ENODEV; From 8c7a77267eec81dd81af8412f29e50c0b1082548 Mon Sep 17 00:00:00 2001 From: George Wilkie Date: Wed, 20 Feb 2019 08:19:11 +0000 Subject: [PATCH 184/298] team: use operstate consistently for linkup When a port is added to a team, its initial state is derived from netif_carrier_ok rather than netif_oper_up. If it is carrier up but operationally down at the time of being added, the port state.linkup will be set prematurely. port state.linkup should be set consistently using netif_oper_up rather than netif_carrier_ok. Fixes: f1d22a1e0595 ("team: account for oper state") Signed-off-by: George Wilkie Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/team/team.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c index 958f1cf67282..6ce3f666d142 100644 --- a/drivers/net/team/team.c +++ b/drivers/net/team/team.c @@ -1256,7 +1256,7 @@ static int team_port_add(struct team *team, struct net_device *port_dev, list_add_tail_rcu(&port->list, &team->port_list); team_port_enable(team, port); __team_compute_features(team); - __team_port_change_port_added(port, !!netif_carrier_ok(port_dev)); + __team_port_change_port_added(port, !!netif_oper_up(port_dev)); __team_options_change_check(team); netdev_info(dev, "Port device %s added\n", portname); @@ -2915,7 +2915,7 @@ static int team_device_event(struct notifier_block *unused, switch (event) { case NETDEV_UP: - if (netif_carrier_ok(dev)) + if (netif_oper_up(dev)) team_port_change_check(port, true); break; case NETDEV_DOWN: From efcc9bcaf77c07df01371a7c34e50424c291f3ac Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Wed, 20 Feb 2019 09:23:03 +0100 Subject: [PATCH 185/298] net: ip6_gre: fix possible NULL pointer dereference in ip6erspan_set_version Fix a possible NULL pointer dereference in ip6erspan_set_version checking nlattr data pointer kasan: CONFIG_KASAN_INLINE enabled kasan: GPF could be caused by NULL-ptr deref or user memory access general protection fault: 0000 [#1] PREEMPT SMP KASAN CPU: 1 PID: 7549 Comm: syz-executor432 Not tainted 5.0.0-rc6-next-20190218 #37 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:ip6erspan_set_version+0x5c/0x350 net/ipv6/ip6_gre.c:1726 Code: 07 38 d0 7f 08 84 c0 0f 85 9f 02 00 00 49 8d bc 24 b0 00 00 00 c6 43 54 01 48 b8 00 00 00 00 00 fc ff df 48 89 fa 48 c1 ea 03 <80> 3c 02 00 0f 85 9a 02 00 00 4d 8b ac 24 b0 00 00 00 4d 85 ed 0f RSP: 0018:ffff888089ed7168 EFLAGS: 00010202 RAX: dffffc0000000000 RBX: ffff8880869d6e58 RCX: 0000000000000000 RDX: 0000000000000016 RSI: ffffffff862736b4 RDI: 00000000000000b0 RBP: ffff888089ed7180 R08: 1ffff11010d3adcb R09: ffff8880869d6e58 R10: ffffed1010d3add5 R11: ffff8880869d6eaf R12: 0000000000000000 R13: ffffffff8931f8c0 R14: ffffffff862825d0 R15: ffff8880869d6e58 FS: 0000000000b3d880(0000) GS:ffff8880ae900000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000020000184 CR3: 0000000092cc5000 CR4: 00000000001406e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: ip6erspan_newlink+0x66/0x7b0 net/ipv6/ip6_gre.c:2210 __rtnl_newlink+0x107b/0x16c0 net/core/rtnetlink.c:3176 rtnl_newlink+0x69/0xa0 net/core/rtnetlink.c:3234 rtnetlink_rcv_msg+0x465/0xb00 net/core/rtnetlink.c:5192 netlink_rcv_skb+0x17a/0x460 net/netlink/af_netlink.c:2485 rtnetlink_rcv+0x1d/0x30 net/core/rtnetlink.c:5210 netlink_unicast_kernel net/netlink/af_netlink.c:1310 [inline] netlink_unicast+0x536/0x720 net/netlink/af_netlink.c:1336 netlink_sendmsg+0x8ae/0xd70 net/netlink/af_netlink.c:1925 sock_sendmsg_nosec net/socket.c:621 [inline] sock_sendmsg+0xdd/0x130 net/socket.c:631 ___sys_sendmsg+0x806/0x930 net/socket.c:2136 __sys_sendmsg+0x105/0x1d0 net/socket.c:2174 __do_sys_sendmsg net/socket.c:2183 [inline] __se_sys_sendmsg net/socket.c:2181 [inline] __x64_sys_sendmsg+0x78/0xb0 net/socket.c:2181 do_syscall_64+0x103/0x610 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x440159 Code: 18 89 d0 c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 fb 13 fc ff c3 66 2e 0f 1f 84 00 00 00 00 RSP: 002b:00007fffa69156e8 EFLAGS: 00000246 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 00000000004002c8 RCX: 0000000000440159 RDX: 0000000000000000 RSI: 0000000020001340 RDI: 0000000000000003 RBP: 00000000006ca018 R08: 0000000000000001 R09: 00000000004002c8 R10: 0000000000000011 R11: 0000000000000246 R12: 00000000004019e0 R13: 0000000000401a70 R14: 0000000000000000 R15: 0000000000000000 Modules linked in: ---[ end trace 09f8a7d13b4faaa1 ]--- RIP: 0010:ip6erspan_set_version+0x5c/0x350 net/ipv6/ip6_gre.c:1726 Code: 07 38 d0 7f 08 84 c0 0f 85 9f 02 00 00 49 8d bc 24 b0 00 00 00 c6 43 54 01 48 b8 00 00 00 00 00 fc ff df 48 89 fa 48 c1 ea 03 <80> 3c 02 00 0f 85 9a 02 00 00 4d 8b ac 24 b0 00 00 00 4d 85 ed 0f RSP: 0018:ffff888089ed7168 EFLAGS: 00010202 RAX: dffffc0000000000 RBX: ffff8880869d6e58 RCX: 0000000000000000 RDX: 0000000000000016 RSI: ffffffff862736b4 RDI: 00000000000000b0 RBP: ffff888089ed7180 R08: 1ffff11010d3adcb R09: ffff8880869d6e58 R10: ffffed1010d3add5 R11: ffff8880869d6eaf R12: 0000000000000000 R13: ffffffff8931f8c0 R14: ffffffff862825d0 R15: ffff8880869d6e58 FS: 0000000000b3d880(0000) GS:ffff8880ae900000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000020000184 CR3: 0000000092cc5000 CR4: 00000000001406e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Fixes: 4974d5f678ab ("net: ip6_gre: initialize erspan_ver just for erspan tunnels") Reported-and-tested-by: syzbot+30191cf1057abd3064af@syzkaller.appspotmail.com Signed-off-by: Lorenzo Bianconi Reviewed-by: Greg Rose Signed-off-by: David S. Miller --- net/ipv6/ip6_gre.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 0fdd0109d131..26f25b6e2833 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -1722,6 +1722,9 @@ static int ip6erspan_tap_validate(struct nlattr *tb[], struct nlattr *data[], static void ip6erspan_set_version(struct nlattr *data[], struct __ip6_tnl_parm *parms) { + if (!data) + return; + parms->erspan_ver = 1; if (data[IFLA_GRE_ERSPAN_VER]) parms->erspan_ver = nla_get_u8(data[IFLA_GRE_ERSPAN_VER]); From f6d25aca1ba3f46b76dabf6023a0dc2062dc792e Mon Sep 17 00:00:00 2001 From: Vadim Lomovtsev Date: Wed, 20 Feb 2019 11:02:43 +0000 Subject: [PATCH 186/298] net: thunderx: correct typo in macro name Correct STREERING to STEERING at macro name for BGX steering register. Signed-off-by: Vadim Lomovtsev Signed-off-by: David S. Miller --- drivers/net/ethernet/cavium/thunder/thunder_bgx.c | 2 +- drivers/net/ethernet/cavium/thunder/thunder_bgx.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/cavium/thunder/thunder_bgx.c b/drivers/net/ethernet/cavium/thunder/thunder_bgx.c index e337da6ba2a4..673c57b8023f 100644 --- a/drivers/net/ethernet/cavium/thunder/thunder_bgx.c +++ b/drivers/net/ethernet/cavium/thunder/thunder_bgx.c @@ -1217,7 +1217,7 @@ static void bgx_init_hw(struct bgx *bgx) /* Disable MAC steering (NCSI traffic) */ for (i = 0; i < RX_TRAFFIC_STEER_RULE_COUNT; i++) - bgx_reg_write(bgx, 0, BGX_CMR_RX_STREERING + (i * 8), 0x00); + bgx_reg_write(bgx, 0, BGX_CMR_RX_STEERING + (i * 8), 0x00); } static u8 bgx_get_lane2sds_cfg(struct bgx *bgx, struct lmac *lmac) diff --git a/drivers/net/ethernet/cavium/thunder/thunder_bgx.h b/drivers/net/ethernet/cavium/thunder/thunder_bgx.h index cbdd20b9ee6f..5cbc54e9eb19 100644 --- a/drivers/net/ethernet/cavium/thunder/thunder_bgx.h +++ b/drivers/net/ethernet/cavium/thunder/thunder_bgx.h @@ -60,7 +60,7 @@ #define RX_DMACX_CAM_EN BIT_ULL(48) #define RX_DMACX_CAM_LMACID(x) (((u64)x) << 49) #define RX_DMAC_COUNT 32 -#define BGX_CMR_RX_STREERING 0x300 +#define BGX_CMR_RX_STEERING 0x300 #define RX_TRAFFIC_STEER_RULE_COUNT 8 #define BGX_CMR_CHAN_MSK_AND 0x450 #define BGX_CMR_BIST_STATUS 0x460 From 2ecbe4f4a027890a5d74a5100075aa6a373bea2c Mon Sep 17 00:00:00 2001 From: Vadim Lomovtsev Date: Wed, 20 Feb 2019 11:02:43 +0000 Subject: [PATCH 187/298] net: thunderx: replace global nicvf_rx_mode_wq work queue for all VFs to private for each of them. Having one work queue for receive mode configuration ndo_set_rx_mode() call for all VFs results in making each of them wait till the set_rx_mode() call completes for another VF if any of close, set receive mode and change flags calls being already invoked. Potentially this could cause device state change before appropriate call of receive mode configuration completes, so the call itself became meaningless, corrupt data or break configuration sequence. We don't need any delays in NIC VF configuration sequence so having delayed work call with 0 delay has no sense. This commit is to implement one work queue for each NIC VF for set_rx_mode task and to let them work independently and replacing delayed_work with work_struct. Signed-off-by: Vadim Lomovtsev Signed-off-by: David S. Miller --- drivers/net/ethernet/cavium/thunder/nic.h | 4 ++- .../net/ethernet/cavium/thunder/nicvf_main.c | 30 ++++++++++--------- 2 files changed, 19 insertions(+), 15 deletions(-) diff --git a/drivers/net/ethernet/cavium/thunder/nic.h b/drivers/net/ethernet/cavium/thunder/nic.h index f4d81765221e..376a96bce33f 100644 --- a/drivers/net/ethernet/cavium/thunder/nic.h +++ b/drivers/net/ethernet/cavium/thunder/nic.h @@ -271,7 +271,7 @@ struct xcast_addr_list { }; struct nicvf_work { - struct delayed_work work; + struct work_struct work; u8 mode; struct xcast_addr_list *mc; }; @@ -327,6 +327,8 @@ struct nicvf { struct nicvf_work rx_mode_work; /* spinlock to protect workqueue arguments from concurrent access */ spinlock_t rx_mode_wq_lock; + /* workqueue for handling kernel ndo_set_rx_mode() calls */ + struct workqueue_struct *nicvf_rx_mode_wq; /* PTP timestamp */ struct cavium_ptp *ptp_clock; diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c b/drivers/net/ethernet/cavium/thunder/nicvf_main.c index 88f8a8fa93cd..abf24e7dff2d 100644 --- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c +++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c @@ -68,9 +68,6 @@ module_param(cpi_alg, int, 0444); MODULE_PARM_DESC(cpi_alg, "PFC algorithm (0=none, 1=VLAN, 2=VLAN16, 3=IP Diffserv)"); -/* workqueue for handling kernel ndo_set_rx_mode() calls */ -static struct workqueue_struct *nicvf_rx_mode_wq; - static inline u8 nicvf_netdev_qidx(struct nicvf *nic, u8 qidx) { if (nic->sqs_mode) @@ -1311,6 +1308,9 @@ int nicvf_stop(struct net_device *netdev) struct nicvf_cq_poll *cq_poll = NULL; union nic_mbx mbx = {}; + /* wait till all queued set_rx_mode tasks completes */ + drain_workqueue(nic->nicvf_rx_mode_wq); + mbx.msg.msg = NIC_MBOX_MSG_SHUTDOWN; nicvf_send_msg_to_pf(nic, &mbx); @@ -1418,6 +1418,9 @@ int nicvf_open(struct net_device *netdev) struct nicvf_cq_poll *cq_poll = NULL; union nic_mbx mbx = {}; + /* wait till all queued set_rx_mode tasks completes if any */ + drain_workqueue(nic->nicvf_rx_mode_wq); + netif_carrier_off(netdev); err = nicvf_register_misc_interrupt(nic); @@ -1973,7 +1976,7 @@ static void __nicvf_set_rx_mode_task(u8 mode, struct xcast_addr_list *mc_addrs, static void nicvf_set_rx_mode_task(struct work_struct *work_arg) { struct nicvf_work *vf_work = container_of(work_arg, struct nicvf_work, - work.work); + work); struct nicvf *nic = container_of(vf_work, struct nicvf, rx_mode_work); u8 mode; struct xcast_addr_list *mc; @@ -2030,7 +2033,7 @@ static void nicvf_set_rx_mode(struct net_device *netdev) kfree(nic->rx_mode_work.mc); nic->rx_mode_work.mc = mc_list; nic->rx_mode_work.mode = mode; - queue_delayed_work(nicvf_rx_mode_wq, &nic->rx_mode_work.work, 0); + queue_work(nic->nicvf_rx_mode_wq, &nic->rx_mode_work.work); spin_unlock(&nic->rx_mode_wq_lock); } @@ -2187,7 +2190,10 @@ static int nicvf_probe(struct pci_dev *pdev, const struct pci_device_id *ent) INIT_WORK(&nic->reset_task, nicvf_reset_task); - INIT_DELAYED_WORK(&nic->rx_mode_work.work, nicvf_set_rx_mode_task); + nic->nicvf_rx_mode_wq = alloc_ordered_workqueue("nicvf_rx_mode_wq_VF%d", + WQ_MEM_RECLAIM, + nic->vf_id); + INIT_WORK(&nic->rx_mode_work.work, nicvf_set_rx_mode_task); spin_lock_init(&nic->rx_mode_wq_lock); err = register_netdev(netdev); @@ -2228,13 +2234,15 @@ static void nicvf_remove(struct pci_dev *pdev) nic = netdev_priv(netdev); pnetdev = nic->pnicvf->netdev; - cancel_delayed_work_sync(&nic->rx_mode_work.work); - /* Check if this Qset is assigned to different VF. * If yes, clean primary and all secondary Qsets. */ if (pnetdev && (pnetdev->reg_state == NETREG_REGISTERED)) unregister_netdev(pnetdev); + if (nic->nicvf_rx_mode_wq) { + destroy_workqueue(nic->nicvf_rx_mode_wq); + nic->nicvf_rx_mode_wq = NULL; + } nicvf_unregister_interrupts(nic); pci_set_drvdata(pdev, NULL); if (nic->drv_stats) @@ -2261,17 +2269,11 @@ static struct pci_driver nicvf_driver = { static int __init nicvf_init_module(void) { pr_info("%s, ver %s\n", DRV_NAME, DRV_VERSION); - nicvf_rx_mode_wq = alloc_ordered_workqueue("nicvf_generic", - WQ_MEM_RECLAIM); return pci_register_driver(&nicvf_driver); } static void __exit nicvf_cleanup_module(void) { - if (nicvf_rx_mode_wq) { - destroy_workqueue(nicvf_rx_mode_wq); - nicvf_rx_mode_wq = NULL; - } pci_unregister_driver(&nicvf_driver); } From 0dd563b9a62c4cbabf5d4fd6596440c2491e72b1 Mon Sep 17 00:00:00 2001 From: Vadim Lomovtsev Date: Wed, 20 Feb 2019 11:02:43 +0000 Subject: [PATCH 188/298] net: thunderx: make CFG_DONE message to run through generic send-ack sequence At the end of NIC VF initialization VF sends CFG_DONE message to PF without using nicvf_msg_send_to_pf routine. This potentially could re-write data in mailbox. This commit is to implement common way of sending CFG_DONE message by the same way with other configuration messages by using nicvf_send_msg_to_pf() routine. Signed-off-by: Vadim Lomovtsev Signed-off-by: David S. Miller --- drivers/net/ethernet/cavium/thunder/nic_main.c | 2 +- drivers/net/ethernet/cavium/thunder/nicvf_main.c | 15 ++++++++++++--- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/cavium/thunder/nic_main.c b/drivers/net/ethernet/cavium/thunder/nic_main.c index 6c8dcb65ff03..90497a27df18 100644 --- a/drivers/net/ethernet/cavium/thunder/nic_main.c +++ b/drivers/net/ethernet/cavium/thunder/nic_main.c @@ -1039,7 +1039,7 @@ static void nic_handle_mbx_intr(struct nicpf *nic, int vf) case NIC_MBOX_MSG_CFG_DONE: /* Last message of VF config msg sequence */ nic_enable_vf(nic, vf, true); - goto unlock; + break; case NIC_MBOX_MSG_SHUTDOWN: /* First msg in VF teardown sequence */ if (vf >= nic->num_vf_en) diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c b/drivers/net/ethernet/cavium/thunder/nicvf_main.c index abf24e7dff2d..19b58fc3ca41 100644 --- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c +++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c @@ -169,6 +169,17 @@ static int nicvf_check_pf_ready(struct nicvf *nic) return 1; } +static void nicvf_send_cfg_done(struct nicvf *nic) +{ + union nic_mbx mbx = {}; + + mbx.msg.msg = NIC_MBOX_MSG_CFG_DONE; + if (nicvf_send_msg_to_pf(nic, &mbx)) { + netdev_err(nic->netdev, + "PF didn't respond to CFG DONE msg\n"); + } +} + static void nicvf_read_bgx_stats(struct nicvf *nic, struct bgx_stats_msg *bgx) { if (bgx->rx) @@ -1416,7 +1427,6 @@ int nicvf_open(struct net_device *netdev) struct nicvf *nic = netdev_priv(netdev); struct queue_set *qs = nic->qs; struct nicvf_cq_poll *cq_poll = NULL; - union nic_mbx mbx = {}; /* wait till all queued set_rx_mode tasks completes if any */ drain_workqueue(nic->nicvf_rx_mode_wq); @@ -1515,8 +1525,7 @@ int nicvf_open(struct net_device *netdev) nicvf_enable_intr(nic, NICVF_INTR_RBDR, qidx); /* Send VF config done msg to PF */ - mbx.msg.msg = NIC_MBOX_MSG_CFG_DONE; - nicvf_write_to_mbx(nic, &mbx); + nicvf_send_cfg_done(nic); return 0; cleanup: From 7db730d9d2f7b6af6aeac621b1890ea477a0cb8d Mon Sep 17 00:00:00 2001 From: Vadim Lomovtsev Date: Wed, 20 Feb 2019 11:02:44 +0000 Subject: [PATCH 189/298] net: thunderx: add nicvf_send_msg_to_pf result check for set_rx_mode_task The rx_set_mode invokes number of messages to be send to PF for receive mode configuration. In case if there any issues we need to stop sending messages and release allocated memory. This commit is to implement check of nicvf_msg_send_to_pf() result. Signed-off-by: Vadim Lomovtsev Signed-off-by: David S. Miller --- drivers/net/ethernet/cavium/thunder/nicvf_main.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c b/drivers/net/ethernet/cavium/thunder/nicvf_main.c index 19b58fc3ca41..45f06504a61b 100644 --- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c +++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c @@ -1953,7 +1953,8 @@ static void __nicvf_set_rx_mode_task(u8 mode, struct xcast_addr_list *mc_addrs, /* flush DMAC filters and reset RX mode */ mbx.xcast.msg = NIC_MBOX_MSG_RESET_XCAST; - nicvf_send_msg_to_pf(nic, &mbx); + if (nicvf_send_msg_to_pf(nic, &mbx) < 0) + goto free_mc; if (mode & BGX_XCAST_MCAST_FILTER) { /* once enabling filtering, we need to signal to PF to add @@ -1961,7 +1962,8 @@ static void __nicvf_set_rx_mode_task(u8 mode, struct xcast_addr_list *mc_addrs, */ mbx.xcast.msg = NIC_MBOX_MSG_ADD_MCAST; mbx.xcast.data.mac = 0; - nicvf_send_msg_to_pf(nic, &mbx); + if (nicvf_send_msg_to_pf(nic, &mbx) < 0) + goto free_mc; } /* check if we have any specific MACs to be added to PF DMAC filter */ @@ -1970,9 +1972,9 @@ static void __nicvf_set_rx_mode_task(u8 mode, struct xcast_addr_list *mc_addrs, for (idx = 0; idx < mc_addrs->count; idx++) { mbx.xcast.msg = NIC_MBOX_MSG_ADD_MCAST; mbx.xcast.data.mac = mc_addrs->mc[idx]; - nicvf_send_msg_to_pf(nic, &mbx); + if (nicvf_send_msg_to_pf(nic, &mbx) < 0) + goto free_mc; } - kfree(mc_addrs); } /* and finally set rx mode for PF accordingly */ @@ -1980,6 +1982,8 @@ static void __nicvf_set_rx_mode_task(u8 mode, struct xcast_addr_list *mc_addrs, mbx.xcast.data.mode = mode; nicvf_send_msg_to_pf(nic, &mbx); +free_mc: + kfree(mc_addrs); } static void nicvf_set_rx_mode_task(struct work_struct *work_arg) From 5354439612894033e3f3b934f0bc03afb5f4ddc5 Mon Sep 17 00:00:00 2001 From: Vadim Lomovtsev Date: Wed, 20 Feb 2019 11:02:44 +0000 Subject: [PATCH 190/298] net: thunderx: rework xcast message structure to make it fit into 64 bit To communicate to PF each of ThunderX NIC VF uses mailbox which is pair of 64 bit registers available to both VFn and PF. This commit is to change the xcast message structure in order to fit it into 64 bit. Signed-off-by: Vadim Lomovtsev Signed-off-by: David S. Miller --- drivers/net/ethernet/cavium/thunder/nic.h | 6 ++---- drivers/net/ethernet/cavium/thunder/nic_main.c | 4 ++-- drivers/net/ethernet/cavium/thunder/nicvf_main.c | 6 +++--- 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/cavium/thunder/nic.h b/drivers/net/ethernet/cavium/thunder/nic.h index 376a96bce33f..227343625e83 100644 --- a/drivers/net/ethernet/cavium/thunder/nic.h +++ b/drivers/net/ethernet/cavium/thunder/nic.h @@ -577,10 +577,8 @@ struct set_ptp { struct xcast { u8 msg; - union { - u8 mode; - u64 mac; - } data; + u8 mode; + u64 mac:48; }; /* 128 bit shared memory between PF and each VF */ diff --git a/drivers/net/ethernet/cavium/thunder/nic_main.c b/drivers/net/ethernet/cavium/thunder/nic_main.c index 90497a27df18..620dbe082ca0 100644 --- a/drivers/net/ethernet/cavium/thunder/nic_main.c +++ b/drivers/net/ethernet/cavium/thunder/nic_main.c @@ -1094,7 +1094,7 @@ static void nic_handle_mbx_intr(struct nicpf *nic, int vf) bgx = NIC_GET_BGX_FROM_VF_LMAC_MAP(nic->vf_lmac_map[vf]); lmac = NIC_GET_LMAC_FROM_VF_LMAC_MAP(nic->vf_lmac_map[vf]); bgx_set_dmac_cam_filter(nic->node, bgx, lmac, - mbx.xcast.data.mac, + mbx.xcast.mac, vf < NIC_VF_PER_MBX_REG ? vf : vf - NIC_VF_PER_MBX_REG); break; @@ -1106,7 +1106,7 @@ static void nic_handle_mbx_intr(struct nicpf *nic, int vf) } bgx = NIC_GET_BGX_FROM_VF_LMAC_MAP(nic->vf_lmac_map[vf]); lmac = NIC_GET_LMAC_FROM_VF_LMAC_MAP(nic->vf_lmac_map[vf]); - bgx_set_xcast_mode(nic->node, bgx, lmac, mbx.xcast.data.mode); + bgx_set_xcast_mode(nic->node, bgx, lmac, mbx.xcast.mode); break; default: dev_err(&nic->pdev->dev, diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c b/drivers/net/ethernet/cavium/thunder/nicvf_main.c index 45f06504a61b..da5986ca7bee 100644 --- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c +++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c @@ -1961,7 +1961,7 @@ static void __nicvf_set_rx_mode_task(u8 mode, struct xcast_addr_list *mc_addrs, * its' own LMAC to the filter to accept packets for it. */ mbx.xcast.msg = NIC_MBOX_MSG_ADD_MCAST; - mbx.xcast.data.mac = 0; + mbx.xcast.mac = 0; if (nicvf_send_msg_to_pf(nic, &mbx) < 0) goto free_mc; } @@ -1971,7 +1971,7 @@ static void __nicvf_set_rx_mode_task(u8 mode, struct xcast_addr_list *mc_addrs, /* now go through kernel list of MACs and add them one by one */ for (idx = 0; idx < mc_addrs->count; idx++) { mbx.xcast.msg = NIC_MBOX_MSG_ADD_MCAST; - mbx.xcast.data.mac = mc_addrs->mc[idx]; + mbx.xcast.mac = mc_addrs->mc[idx]; if (nicvf_send_msg_to_pf(nic, &mbx) < 0) goto free_mc; } @@ -1979,7 +1979,7 @@ static void __nicvf_set_rx_mode_task(u8 mode, struct xcast_addr_list *mc_addrs, /* and finally set rx mode for PF accordingly */ mbx.xcast.msg = NIC_MBOX_MSG_SET_XCAST; - mbx.xcast.data.mode = mode; + mbx.xcast.mode = mode; nicvf_send_msg_to_pf(nic, &mbx); free_mc: From 609ea65c65a0f801c285abf524d36d1f4635d942 Mon Sep 17 00:00:00 2001 From: Vadim Lomovtsev Date: Wed, 20 Feb 2019 11:02:44 +0000 Subject: [PATCH 191/298] net: thunderx: add mutex to protect mailbox from concurrent calls for same VF In some cases it could happen that nicvf_send_msg_to_pf() could be called concurrently for the same NIC VF, and thus re-writing mailbox contents and breaking messaging sequence with PF by re-writing NICVF data. This commit is to implement mutex for NICVF to protect mailbox registers and NICVF messaging control data from concurrent access. Signed-off-by: Vadim Lomovtsev Signed-off-by: David S. Miller --- drivers/net/ethernet/cavium/thunder/nic.h | 2 ++ drivers/net/ethernet/cavium/thunder/nicvf_main.c | 13 ++++++++++--- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/cavium/thunder/nic.h b/drivers/net/ethernet/cavium/thunder/nic.h index 227343625e83..86cda3f4b37b 100644 --- a/drivers/net/ethernet/cavium/thunder/nic.h +++ b/drivers/net/ethernet/cavium/thunder/nic.h @@ -329,6 +329,8 @@ struct nicvf { spinlock_t rx_mode_wq_lock; /* workqueue for handling kernel ndo_set_rx_mode() calls */ struct workqueue_struct *nicvf_rx_mode_wq; + /* mutex to protect VF's mailbox contents from concurrent access */ + struct mutex rx_mode_mtx; /* PTP timestamp */ struct cavium_ptp *ptp_clock; diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c b/drivers/net/ethernet/cavium/thunder/nicvf_main.c index da5986ca7bee..2332e3e95e0e 100644 --- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c +++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c @@ -124,6 +124,9 @@ int nicvf_send_msg_to_pf(struct nicvf *nic, union nic_mbx *mbx) { int timeout = NIC_MBOX_MSG_TIMEOUT; int sleep = 10; + int ret = 0; + + mutex_lock(&nic->rx_mode_mtx); nic->pf_acked = false; nic->pf_nacked = false; @@ -136,7 +139,8 @@ int nicvf_send_msg_to_pf(struct nicvf *nic, union nic_mbx *mbx) netdev_err(nic->netdev, "PF NACK to mbox msg 0x%02x from VF%d\n", (mbx->msg.msg & 0xFF), nic->vf_id); - return -EINVAL; + ret = -EINVAL; + break; } msleep(sleep); if (nic->pf_acked) @@ -146,10 +150,12 @@ int nicvf_send_msg_to_pf(struct nicvf *nic, union nic_mbx *mbx) netdev_err(nic->netdev, "PF didn't ACK to mbox msg 0x%02x from VF%d\n", (mbx->msg.msg & 0xFF), nic->vf_id); - return -EBUSY; + ret = -EBUSY; + break; } } - return 0; + mutex_unlock(&nic->rx_mode_mtx); + return ret; } /* Checks if VF is able to comminicate with PF @@ -2208,6 +2214,7 @@ static int nicvf_probe(struct pci_dev *pdev, const struct pci_device_id *ent) nic->vf_id); INIT_WORK(&nic->rx_mode_work.work, nicvf_set_rx_mode_task); spin_lock_init(&nic->rx_mode_wq_lock); + mutex_init(&nic->rx_mode_mtx); err = register_netdev(netdev); if (err) { From 2c632ad8bc744d2ad59dd381ce56fae143cf1e0a Mon Sep 17 00:00:00 2001 From: Vadim Lomovtsev Date: Wed, 20 Feb 2019 11:02:45 +0000 Subject: [PATCH 192/298] net: thunderx: move link state polling function to VF Move the link change polling task to VF side in order to prevent races between VF and PF while sending link change message(s). This commit is to implement link change request to be initiated by VF. Signed-off-by: Vadim Lomovtsev Signed-off-by: David S. Miller --- drivers/net/ethernet/cavium/thunder/nic.h | 2 +- .../net/ethernet/cavium/thunder/nic_main.c | 39 ++++++++++++-- .../net/ethernet/cavium/thunder/nicvf_main.c | 52 +++++++++++++------ 3 files changed, 74 insertions(+), 19 deletions(-) diff --git a/drivers/net/ethernet/cavium/thunder/nic.h b/drivers/net/ethernet/cavium/thunder/nic.h index 86cda3f4b37b..62636c1ed141 100644 --- a/drivers/net/ethernet/cavium/thunder/nic.h +++ b/drivers/net/ethernet/cavium/thunder/nic.h @@ -331,7 +331,7 @@ struct nicvf { struct workqueue_struct *nicvf_rx_mode_wq; /* mutex to protect VF's mailbox contents from concurrent access */ struct mutex rx_mode_mtx; - + struct delayed_work link_change_work; /* PTP timestamp */ struct cavium_ptp *ptp_clock; /* Inbound timestamping is on */ diff --git a/drivers/net/ethernet/cavium/thunder/nic_main.c b/drivers/net/ethernet/cavium/thunder/nic_main.c index 620dbe082ca0..8ab71dae3988 100644 --- a/drivers/net/ethernet/cavium/thunder/nic_main.c +++ b/drivers/net/ethernet/cavium/thunder/nic_main.c @@ -929,6 +929,35 @@ static void nic_config_timestamp(struct nicpf *nic, int vf, struct set_ptp *ptp) nic_reg_write(nic, NIC_PF_PKIND_0_15_CFG | (pkind_idx << 3), pkind_val); } +static void nic_link_status_get(struct nicpf *nic, u8 vf) +{ + union nic_mbx mbx = {}; + struct bgx_link_status link; + u8 bgx, lmac; + + mbx.link_status.msg = NIC_MBOX_MSG_BGX_LINK_CHANGE; + + /* Get BGX, LMAC indices for the VF */ + bgx = NIC_GET_BGX_FROM_VF_LMAC_MAP(nic->vf_lmac_map[vf]); + lmac = NIC_GET_LMAC_FROM_VF_LMAC_MAP(nic->vf_lmac_map[vf]); + + /* Get interface link status */ + bgx_get_lmac_link_state(nic->node, bgx, lmac, &link); + + nic->link[vf] = link.link_up; + nic->duplex[vf] = link.duplex; + nic->speed[vf] = link.speed; + + /* Send a mbox message to VF with current link status */ + mbx.link_status.link_up = link.link_up; + mbx.link_status.duplex = link.duplex; + mbx.link_status.speed = link.speed; + mbx.link_status.mac_type = link.mac_type; + + /* reply with link status */ + nic_send_msg_to_vf(nic, vf, &mbx); +} + /* Interrupt handler to handle mailbox messages from VFs */ static void nic_handle_mbx_intr(struct nicpf *nic, int vf) { @@ -1108,6 +1137,13 @@ static void nic_handle_mbx_intr(struct nicpf *nic, int vf) lmac = NIC_GET_LMAC_FROM_VF_LMAC_MAP(nic->vf_lmac_map[vf]); bgx_set_xcast_mode(nic->node, bgx, lmac, mbx.xcast.mode); break; + case NIC_MBOX_MSG_BGX_LINK_CHANGE: + if (vf >= nic->num_vf_en) { + ret = -1; /* NACK */ + break; + } + nic_link_status_get(nic, vf); + goto unlock; default: dev_err(&nic->pdev->dev, "Invalid msg from VF%d, msg 0x%x\n", vf, mbx.msg.msg); @@ -1419,9 +1455,6 @@ static int nic_probe(struct pci_dev *pdev, const struct pci_device_id *ent) goto err_disable_sriov; } - INIT_DELAYED_WORK(&nic->dwork, nic_poll_for_link); - queue_delayed_work(nic->check_link, &nic->dwork, 0); - return 0; err_disable_sriov: diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c b/drivers/net/ethernet/cavium/thunder/nicvf_main.c index 2332e3e95e0e..503cfadff4ac 100644 --- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c +++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c @@ -242,21 +242,24 @@ static void nicvf_handle_mbx_intr(struct nicvf *nic) break; case NIC_MBOX_MSG_BGX_LINK_CHANGE: nic->pf_acked = true; - nic->link_up = mbx.link_status.link_up; - nic->duplex = mbx.link_status.duplex; - nic->speed = mbx.link_status.speed; - nic->mac_type = mbx.link_status.mac_type; - if (nic->link_up) { - netdev_info(nic->netdev, "Link is Up %d Mbps %s duplex\n", - nic->speed, - nic->duplex == DUPLEX_FULL ? - "Full" : "Half"); - netif_carrier_on(nic->netdev); - netif_tx_start_all_queues(nic->netdev); - } else { - netdev_info(nic->netdev, "Link is Down\n"); - netif_carrier_off(nic->netdev); - netif_tx_stop_all_queues(nic->netdev); + if (nic->link_up != mbx.link_status.link_up) { + nic->link_up = mbx.link_status.link_up; + nic->duplex = mbx.link_status.duplex; + nic->speed = mbx.link_status.speed; + nic->mac_type = mbx.link_status.mac_type; + if (nic->link_up) { + netdev_info(nic->netdev, + "Link is Up %d Mbps %s duplex\n", + nic->speed, + nic->duplex == DUPLEX_FULL ? + "Full" : "Half"); + netif_carrier_on(nic->netdev); + netif_tx_start_all_queues(nic->netdev); + } else { + netdev_info(nic->netdev, "Link is Down\n"); + netif_carrier_off(nic->netdev); + netif_tx_stop_all_queues(nic->netdev); + } } break; case NIC_MBOX_MSG_ALLOC_SQS: @@ -1325,6 +1328,8 @@ int nicvf_stop(struct net_device *netdev) struct nicvf_cq_poll *cq_poll = NULL; union nic_mbx mbx = {}; + cancel_delayed_work_sync(&nic->link_change_work); + /* wait till all queued set_rx_mode tasks completes */ drain_workqueue(nic->nicvf_rx_mode_wq); @@ -1427,6 +1432,18 @@ static int nicvf_update_hw_max_frs(struct nicvf *nic, int mtu) return nicvf_send_msg_to_pf(nic, &mbx); } +static void nicvf_link_status_check_task(struct work_struct *work_arg) +{ + struct nicvf *nic = container_of(work_arg, + struct nicvf, + link_change_work.work); + union nic_mbx mbx = {}; + mbx.msg.msg = NIC_MBOX_MSG_BGX_LINK_CHANGE; + nicvf_send_msg_to_pf(nic, &mbx); + queue_delayed_work(nic->nicvf_rx_mode_wq, + &nic->link_change_work, 2 * HZ); +} + int nicvf_open(struct net_device *netdev) { int cpu, err, qidx; @@ -1533,6 +1550,11 @@ int nicvf_open(struct net_device *netdev) /* Send VF config done msg to PF */ nicvf_send_cfg_done(nic); + INIT_DELAYED_WORK(&nic->link_change_work, + nicvf_link_status_check_task); + queue_delayed_work(nic->nicvf_rx_mode_wq, + &nic->link_change_work, 0); + return 0; cleanup: nicvf_disable_intr(nic, NICVF_INTR_MBOX, 0); From 2e1c3fff5e496621ccbb1a207b775c1dd1d524ac Mon Sep 17 00:00:00 2001 From: Vadim Lomovtsev Date: Wed, 20 Feb 2019 11:02:45 +0000 Subject: [PATCH 193/298] net: thunderx: remove link change polling code and info from nicpf Since link change polling routine was moved to nicvf side, we don't need anymore polling function at nicpf side along with link status info for all enabled Vfs as at VF side this info is already tracked. This commit is to remove unnecessary code & fields from nicpf structure. Signed-off-by: Vadim Lomovtsev Signed-off-by: David S. Miller --- .../net/ethernet/cavium/thunder/nic_main.c | 114 ++---------------- 1 file changed, 12 insertions(+), 102 deletions(-) diff --git a/drivers/net/ethernet/cavium/thunder/nic_main.c b/drivers/net/ethernet/cavium/thunder/nic_main.c index 8ab71dae3988..c90252829ed3 100644 --- a/drivers/net/ethernet/cavium/thunder/nic_main.c +++ b/drivers/net/ethernet/cavium/thunder/nic_main.c @@ -57,14 +57,8 @@ struct nicpf { #define NIC_GET_BGX_FROM_VF_LMAC_MAP(map) ((map >> 4) & 0xF) #define NIC_GET_LMAC_FROM_VF_LMAC_MAP(map) (map & 0xF) u8 *vf_lmac_map; - struct delayed_work dwork; - struct workqueue_struct *check_link; - u8 *link; - u8 *duplex; - u32 *speed; u16 cpi_base[MAX_NUM_VFS_SUPPORTED]; u16 rssi_base[MAX_NUM_VFS_SUPPORTED]; - bool mbx_lock[MAX_NUM_VFS_SUPPORTED]; /* MSI-X */ u8 num_vec; @@ -929,6 +923,10 @@ static void nic_config_timestamp(struct nicpf *nic, int vf, struct set_ptp *ptp) nic_reg_write(nic, NIC_PF_PKIND_0_15_CFG | (pkind_idx << 3), pkind_val); } +/* Get BGX LMAC link status and update corresponding VF + * if there is a change, valid only if internal L2 switch + * is not present otherwise VF link is always treated as up + */ static void nic_link_status_get(struct nicpf *nic, u8 vf) { union nic_mbx mbx = {}; @@ -944,10 +942,6 @@ static void nic_link_status_get(struct nicpf *nic, u8 vf) /* Get interface link status */ bgx_get_lmac_link_state(nic->node, bgx, lmac, &link); - nic->link[vf] = link.link_up; - nic->duplex[vf] = link.duplex; - nic->speed[vf] = link.speed; - /* Send a mbox message to VF with current link status */ mbx.link_status.link_up = link.link_up; mbx.link_status.duplex = link.duplex; @@ -970,8 +964,6 @@ static void nic_handle_mbx_intr(struct nicpf *nic, int vf) int i; int ret = 0; - nic->mbx_lock[vf] = true; - mbx_addr = nic_get_mbx_addr(vf); mbx_data = (u64 *)&mbx; @@ -986,12 +978,7 @@ static void nic_handle_mbx_intr(struct nicpf *nic, int vf) switch (mbx.msg.msg) { case NIC_MBOX_MSG_READY: nic_mbx_send_ready(nic, vf); - if (vf < nic->num_vf_en) { - nic->link[vf] = 0; - nic->duplex[vf] = 0; - nic->speed[vf] = 0; - } - goto unlock; + return; case NIC_MBOX_MSG_QS_CFG: reg_addr = NIC_PF_QSET_0_127_CFG | (mbx.qs.num << NIC_QS_ID_SHIFT); @@ -1060,7 +1047,7 @@ static void nic_handle_mbx_intr(struct nicpf *nic, int vf) break; case NIC_MBOX_MSG_RSS_SIZE: nic_send_rss_size(nic, vf); - goto unlock; + return; case NIC_MBOX_MSG_RSS_CFG: case NIC_MBOX_MSG_RSS_CFG_CONT: nic_config_rss(nic, &mbx.rss_cfg); @@ -1078,19 +1065,19 @@ static void nic_handle_mbx_intr(struct nicpf *nic, int vf) break; case NIC_MBOX_MSG_ALLOC_SQS: nic_alloc_sqs(nic, &mbx.sqs_alloc); - goto unlock; + return; case NIC_MBOX_MSG_NICVF_PTR: nic->nicvf[vf] = mbx.nicvf.nicvf; break; case NIC_MBOX_MSG_PNICVF_PTR: nic_send_pnicvf(nic, vf); - goto unlock; + return; case NIC_MBOX_MSG_SNICVF_PTR: nic_send_snicvf(nic, &mbx.nicvf); - goto unlock; + return; case NIC_MBOX_MSG_BGX_STATS: nic_get_bgx_stats(nic, &mbx.bgx_stats); - goto unlock; + return; case NIC_MBOX_MSG_LOOPBACK: ret = nic_config_loopback(nic, &mbx.lbk); break; @@ -1099,7 +1086,7 @@ static void nic_handle_mbx_intr(struct nicpf *nic, int vf) break; case NIC_MBOX_MSG_PFC: nic_pause_frame(nic, vf, &mbx.pfc); - goto unlock; + return; case NIC_MBOX_MSG_PTP_CFG: nic_config_timestamp(nic, vf, &mbx.ptp); break; @@ -1143,7 +1130,7 @@ static void nic_handle_mbx_intr(struct nicpf *nic, int vf) break; } nic_link_status_get(nic, vf); - goto unlock; + return; default: dev_err(&nic->pdev->dev, "Invalid msg from VF%d, msg 0x%x\n", vf, mbx.msg.msg); @@ -1157,8 +1144,6 @@ static void nic_handle_mbx_intr(struct nicpf *nic, int vf) mbx.msg.msg, vf); nic_mbx_send_nack(nic, vf); } -unlock: - nic->mbx_lock[vf] = false; } static irqreturn_t nic_mbx_intr_handler(int irq, void *nic_irq) @@ -1306,52 +1291,6 @@ static int nic_sriov_init(struct pci_dev *pdev, struct nicpf *nic) return 0; } -/* Poll for BGX LMAC link status and update corresponding VF - * if there is a change, valid only if internal L2 switch - * is not present otherwise VF link is always treated as up - */ -static void nic_poll_for_link(struct work_struct *work) -{ - union nic_mbx mbx = {}; - struct nicpf *nic; - struct bgx_link_status link; - u8 vf, bgx, lmac; - - nic = container_of(work, struct nicpf, dwork.work); - - mbx.link_status.msg = NIC_MBOX_MSG_BGX_LINK_CHANGE; - - for (vf = 0; vf < nic->num_vf_en; vf++) { - /* Poll only if VF is UP */ - if (!nic->vf_enabled[vf]) - continue; - - /* Get BGX, LMAC indices for the VF */ - bgx = NIC_GET_BGX_FROM_VF_LMAC_MAP(nic->vf_lmac_map[vf]); - lmac = NIC_GET_LMAC_FROM_VF_LMAC_MAP(nic->vf_lmac_map[vf]); - /* Get interface link status */ - bgx_get_lmac_link_state(nic->node, bgx, lmac, &link); - - /* Inform VF only if link status changed */ - if (nic->link[vf] == link.link_up) - continue; - - if (!nic->mbx_lock[vf]) { - nic->link[vf] = link.link_up; - nic->duplex[vf] = link.duplex; - nic->speed[vf] = link.speed; - - /* Send a mbox message to VF with current link status */ - mbx.link_status.link_up = link.link_up; - mbx.link_status.duplex = link.duplex; - mbx.link_status.speed = link.speed; - mbx.link_status.mac_type = link.mac_type; - nic_send_msg_to_vf(nic, vf, &mbx); - } - } - queue_delayed_work(nic->check_link, &nic->dwork, HZ * 2); -} - static int nic_probe(struct pci_dev *pdev, const struct pci_device_id *ent) { struct device *dev = &pdev->dev; @@ -1420,18 +1359,6 @@ static int nic_probe(struct pci_dev *pdev, const struct pci_device_id *ent) if (!nic->vf_lmac_map) goto err_release_regions; - nic->link = devm_kmalloc_array(dev, max_lmac, sizeof(u8), GFP_KERNEL); - if (!nic->link) - goto err_release_regions; - - nic->duplex = devm_kmalloc_array(dev, max_lmac, sizeof(u8), GFP_KERNEL); - if (!nic->duplex) - goto err_release_regions; - - nic->speed = devm_kmalloc_array(dev, max_lmac, sizeof(u32), GFP_KERNEL); - if (!nic->speed) - goto err_release_regions; - /* Initialize hardware */ nic_init_hw(nic); @@ -1447,19 +1374,8 @@ static int nic_probe(struct pci_dev *pdev, const struct pci_device_id *ent) if (err) goto err_unregister_interrupts; - /* Register a physical link status poll fn() */ - nic->check_link = alloc_workqueue("check_link_status", - WQ_UNBOUND | WQ_MEM_RECLAIM, 1); - if (!nic->check_link) { - err = -ENOMEM; - goto err_disable_sriov; - } - return 0; -err_disable_sriov: - if (nic->flags & NIC_SRIOV_ENABLED) - pci_disable_sriov(pdev); err_unregister_interrupts: nic_unregister_interrupts(nic); err_release_regions: @@ -1480,12 +1396,6 @@ static void nic_remove(struct pci_dev *pdev) if (nic->flags & NIC_SRIOV_ENABLED) pci_disable_sriov(pdev); - if (nic->check_link) { - /* Destroy work Queue */ - cancel_delayed_work_sync(&nic->dwork); - destroy_workqueue(nic->check_link); - } - nic_unregister_interrupts(nic); pci_release_regions(pdev); From f5b51fe804ec2a6edce0f8f6b11ea57283f5857b Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 20 Feb 2019 18:18:12 +0100 Subject: [PATCH 194/298] ipv6: route: purge exception on removal When a netdevice is unregistered, we flush the relevant exception via rt6_sync_down_dev() -> fib6_ifdown() -> fib6_del() -> fib6_del_route(). Finally, we end-up calling rt6_remove_exception(), where we release the relevant dst, while we keep the references to the related fib6_info and dev. Such references should be released later when the dst will be destroyed. There are a number of caches that can keep the exception around for an unlimited amount of time - namely dst_cache, possibly even socket cache. As a result device registration may hang, as demonstrated by this script: ip netns add cl ip netns add rt ip netns add srv ip netns exec rt sysctl -w net.ipv6.conf.all.forwarding=1 ip link add name cl_veth type veth peer name cl_rt_veth ip link set dev cl_veth netns cl ip -n cl link set dev cl_veth up ip -n cl addr add dev cl_veth 2001::2/64 ip -n cl route add default via 2001::1 ip -n cl link add tunv6 type ip6tnl mode ip6ip6 local 2001::2 remote 2002::1 hoplimit 64 dev cl_veth ip -n cl link set tunv6 up ip -n cl addr add 2013::2/64 dev tunv6 ip link set dev cl_rt_veth netns rt ip -n rt link set dev cl_rt_veth up ip -n rt addr add dev cl_rt_veth 2001::1/64 ip link add name rt_srv_veth type veth peer name srv_veth ip link set dev srv_veth netns srv ip -n srv link set dev srv_veth up ip -n srv addr add dev srv_veth 2002::1/64 ip -n srv route add default via 2002::2 ip -n srv link add tunv6 type ip6tnl mode ip6ip6 local 2002::1 remote 2001::2 hoplimit 64 dev srv_veth ip -n srv link set tunv6 up ip -n srv addr add 2013::1/64 dev tunv6 ip link set dev rt_srv_veth netns rt ip -n rt link set dev rt_srv_veth up ip -n rt addr add dev rt_srv_veth 2002::2/64 ip netns exec srv netserver & sleep 0.1 ip netns exec cl ping6 -c 4 2013::1 ip netns exec cl netperf -H 2013::1 -t TCP_STREAM -l 3 & sleep 1 ip -n rt link set dev rt_srv_veth mtu 1400 wait %2 ip -n cl link del cl_veth This commit addresses the issue purging all the references held by the exception at time, as we currently do for e.g. ipv6 pcpu dst entries. v1 -> v2: - re-order the code to avoid accessing dst and net after dst_dev_put() Fixes: 93531c674315 ("net/ipv6: separate handling of FIB entries from dst based routes") Signed-off-by: Paolo Abeni Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/route.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 74b9b6fd4168..047c47224dba 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1274,18 +1274,29 @@ static DEFINE_SPINLOCK(rt6_exception_lock); static void rt6_remove_exception(struct rt6_exception_bucket *bucket, struct rt6_exception *rt6_ex) { + struct fib6_info *from; struct net *net; if (!bucket || !rt6_ex) return; net = dev_net(rt6_ex->rt6i->dst.dev); + net->ipv6.rt6_stats->fib_rt_cache--; + + /* purge completely the exception to allow releasing the held resources: + * some [sk] cache may keep the dst around for unlimited time + */ + from = rcu_dereference_protected(rt6_ex->rt6i->from, + lockdep_is_held(&rt6_exception_lock)); + rcu_assign_pointer(rt6_ex->rt6i->from, NULL); + fib6_info_release(from); + dst_dev_put(&rt6_ex->rt6i->dst); + hlist_del_rcu(&rt6_ex->hlist); dst_release(&rt6_ex->rt6i->dst); kfree_rcu(rt6_ex, rcu); WARN_ON_ONCE(!bucket->depth); bucket->depth--; - net->ipv6.rt6_stats->fib_rt_cache--; } /* Remove oldest rt6_ex in bucket and free the memory From 52baf9878b65872a7fc735d7fae3350ea9f30646 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Wed, 20 Feb 2019 22:34:54 +0100 Subject: [PATCH 195/298] net: socket: add check for negative optlen in compat setsockopt __sys_setsockopt() already checks for `optlen < 0`. Add an equivalent check to the compat path for robustness. This has to be `> INT_MAX` instead of `< 0` because the signedness of `optlen` is different here. Signed-off-by: Jann Horn Signed-off-by: David S. Miller --- net/compat.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/net/compat.c b/net/compat.c index 959d1c51826d..3d348198004f 100644 --- a/net/compat.c +++ b/net/compat.c @@ -388,8 +388,12 @@ static int __compat_sys_setsockopt(int fd, int level, int optname, char __user *optval, unsigned int optlen) { int err; - struct socket *sock = sockfd_lookup(fd, &err); + struct socket *sock; + if (optlen > INT_MAX) + return -EINVAL; + + sock = sockfd_lookup(fd, &err); if (sock) { err = security_socket_setsockopt(sock, level, optname); if (err) { From 80d79ad224ba22381e8d26b54674a86433e75d18 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 20 Feb 2019 14:58:50 -0800 Subject: [PATCH 196/298] Documentation: networking: switchdev: Update port parent ID section Update the section about switchdev drivers having to implement a switchdev_port_attr_get() function to return SWITCHDEV_ATTR_ID_PORT_PARENT_ID since that is no longer valid after commit bccb30254a4a ("net: Get rid of SWITCHDEV_ATTR_ID_PORT_PARENT_ID"). Fixes: bccb30254a4a ("net: Get rid of SWITCHDEV_ATTR_ID_PORT_PARENT_ID") Reviewed-by: Ido Schimmel Acked-by: Jiri Pirko Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- Documentation/networking/switchdev.txt | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Documentation/networking/switchdev.txt b/Documentation/networking/switchdev.txt index 82236a17b5e6..97b7ca8b9b86 100644 --- a/Documentation/networking/switchdev.txt +++ b/Documentation/networking/switchdev.txt @@ -92,11 +92,11 @@ device. Switch ID ^^^^^^^^^ -The switchdev driver must implement the switchdev op switchdev_port_attr_get -for SWITCHDEV_ATTR_ID_PORT_PARENT_ID for each port netdev, returning the same -physical ID for each port of a switch. The ID must be unique between switches -on the same system. The ID does not need to be unique between switches on -different systems. +The switchdev driver must implement the net_device operation +ndo_get_port_parent_id for each port netdev, returning the same physical ID for +each port of a switch. The ID must be unique between switches on the same +system. The ID does not need to be unique between switches on different +systems. The switch ID is used to locate ports on a switch and to know if aggregated ports belong to the same switch. From 71c190249f0ced5b26377ea6bf829ab3af77a40c Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Fri, 22 Feb 2019 22:36:03 +0000 Subject: [PATCH 197/298] nfp: bpf: fix code-gen bug on BPF_ALU | BPF_XOR | BPF_K The intended optimization should be A ^ 0 = A, not A ^ -1 = A. Fixes: cd7df56ed3e6 ("nfp: add BPF to NFP code translator") Reviewed-by: Jakub Kicinski Signed-off-by: Jiong Wang Signed-off-by: Daniel Borkmann --- drivers/net/ethernet/netronome/nfp/bpf/jit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c b/drivers/net/ethernet/netronome/nfp/bpf/jit.c index e23ca90289f7..a09696540171 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c @@ -2309,7 +2309,7 @@ static int xor_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) static int xor_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) { - return wrp_alu32_imm(nfp_prog, meta, ALU_OP_XOR, !~meta->insn.imm); + return wrp_alu32_imm(nfp_prog, meta, ALU_OP_XOR, !meta->insn.imm); } static int and_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) From f036ebd9bfbe1e91a3d855e85e05fc5ff156b641 Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Fri, 22 Feb 2019 22:36:04 +0000 Subject: [PATCH 198/298] nfp: bpf: fix ALU32 high bits clearance bug NFP BPF JIT compiler is doing a couple of small optimizations when jitting ALU imm instructions, some of these optimizations could save code-gen, for example: A & -1 = A A | 0 = A A ^ 0 = A However, for ALU32, high 32-bit of the 64-bit register should still be cleared according to ISA semantics. Fixes: cd7df56ed3e6 ("nfp: add BPF to NFP code translator") Reviewed-by: Jakub Kicinski Signed-off-by: Jiong Wang Signed-off-by: Daniel Borkmann --- drivers/net/ethernet/netronome/nfp/bpf/jit.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c b/drivers/net/ethernet/netronome/nfp/bpf/jit.c index a09696540171..0a868c829b90 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c @@ -1291,15 +1291,10 @@ wrp_alu64_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, static int wrp_alu32_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, - enum alu_op alu_op, bool skip) + enum alu_op alu_op) { const struct bpf_insn *insn = &meta->insn; - if (skip) { - meta->skip = true; - return 0; - } - wrp_alu_imm(nfp_prog, insn->dst_reg * 2, alu_op, insn->imm); wrp_immed(nfp_prog, reg_both(insn->dst_reg * 2 + 1), 0); @@ -2309,7 +2304,7 @@ static int xor_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) static int xor_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) { - return wrp_alu32_imm(nfp_prog, meta, ALU_OP_XOR, !meta->insn.imm); + return wrp_alu32_imm(nfp_prog, meta, ALU_OP_XOR); } static int and_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) @@ -2319,7 +2314,7 @@ static int and_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) static int and_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) { - return wrp_alu32_imm(nfp_prog, meta, ALU_OP_AND, !~meta->insn.imm); + return wrp_alu32_imm(nfp_prog, meta, ALU_OP_AND); } static int or_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) @@ -2329,7 +2324,7 @@ static int or_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) static int or_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) { - return wrp_alu32_imm(nfp_prog, meta, ALU_OP_OR, !meta->insn.imm); + return wrp_alu32_imm(nfp_prog, meta, ALU_OP_OR); } static int add_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) @@ -2339,7 +2334,7 @@ static int add_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) static int add_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) { - return wrp_alu32_imm(nfp_prog, meta, ALU_OP_ADD, !meta->insn.imm); + return wrp_alu32_imm(nfp_prog, meta, ALU_OP_ADD); } static int sub_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) @@ -2349,7 +2344,7 @@ static int sub_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) static int sub_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) { - return wrp_alu32_imm(nfp_prog, meta, ALU_OP_SUB, !meta->insn.imm); + return wrp_alu32_imm(nfp_prog, meta, ALU_OP_SUB); } static int mul_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) From 67681d02aaa1db9044a16df4ca9c77cde1221a3e Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Wed, 20 Feb 2019 19:07:31 -0500 Subject: [PATCH 199/298] bnxt_en: Fix typo in firmware message timeout logic. The logic that polls for the firmware message response uses a shorter sleep interval for the first few passes. But there was a typo so it was using the wrong counter (larger counter) for these short sleep passes. The result is a slightly shorter timeout period for these firmware messages than intended. Fix it by using the proper counter. Fixes: 9751e8e71487 ("bnxt_en: reduce timeout on initial HWRM calls") Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 8bc7e495b027..1ddd6721d7cd 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -3903,7 +3903,7 @@ static int bnxt_hwrm_do_send_msg(struct bnxt *bp, void *msg, u32 msg_len, if (len) break; /* on first few passes, just barely sleep */ - if (i < DFLT_HWRM_CMD_TIMEOUT) + if (i < HWRM_SHORT_TIMEOUT_COUNTER) usleep_range(HWRM_SHORT_MIN_TIMEOUT, HWRM_SHORT_MAX_TIMEOUT); else From 0000b81a063b5f3ab82fa18041c28327ce72c312 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Wed, 20 Feb 2019 19:07:32 -0500 Subject: [PATCH 200/298] bnxt_en: Wait longer for the firmware message response to complete. The code waits up to 20 usec for the firmware response to complete once we've seen the valid response header in the buffer. It turns out that in some scenarios, this wait time is not long enough. Extend it to 150 usec and use usleep_range() instead of udelay(). Fixes: 9751e8e71487 ("bnxt_en: reduce timeout on initial HWRM calls") Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 2 +- drivers/net/ethernet/broadcom/bnxt/bnxt.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 1ddd6721d7cd..d95730c6e0f2 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -3926,7 +3926,7 @@ static int bnxt_hwrm_do_send_msg(struct bnxt *bp, void *msg, u32 msg_len, dma_rmb(); if (*valid) break; - udelay(1); + usleep_range(1, 5); } if (j >= HWRM_VALID_BIT_DELAY_USEC) { diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index a451796deefe..2fb653e0048d 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -582,7 +582,7 @@ struct nqe_cn { (HWRM_SHORT_TIMEOUT_COUNTER * HWRM_SHORT_MIN_TIMEOUT + \ ((n) - HWRM_SHORT_TIMEOUT_COUNTER) * HWRM_MIN_TIMEOUT)) -#define HWRM_VALID_BIT_DELAY_USEC 20 +#define HWRM_VALID_BIT_DELAY_USEC 150 #define BNXT_HWRM_CHNL_CHIMP 0 #define BNXT_HWRM_CHNL_KONG 1 From 97f0082a0592212fc15d4680f5a4d80f79a1687c Mon Sep 17 00:00:00 2001 From: Kalash Nainwal Date: Wed, 20 Feb 2019 16:23:04 -0800 Subject: [PATCH 201/298] net: Set rtm_table to RT_TABLE_COMPAT for ipv6 for tables > 255 Set rtm_table to RT_TABLE_COMPAT for ipv6 for tables > 255 to keep legacy software happy. This is similar to what was done for ipv4 in commit 709772e6e065 ("net: Fix routing tables with id > 255 for legacy software"). Signed-off-by: Kalash Nainwal Signed-off-by: David S. Miller --- net/ipv6/route.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 047c47224dba..ce15dc4ccbfa 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -4665,7 +4665,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, table = rt->fib6_table->tb6_id; else table = RT6_TABLE_UNSPEC; - rtm->rtm_table = table; + rtm->rtm_table = table < 256 ? table : RT_TABLE_COMPAT; if (nla_put_u32(skb, RTA_TABLE, table)) goto nla_put_failure; From 6ff7b060535e87c2ae14dd8548512abfdda528fb Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Thu, 21 Feb 2019 22:42:01 +0800 Subject: [PATCH 202/298] mdio_bus: Fix use-after-free on device_register fails KASAN has found use-after-free in fixed_mdio_bus_init, commit 0c692d07842a ("drivers/net/phy/mdio_bus.c: call put_device on device_register() failure") call put_device() while device_register() fails,give up the last reference to the device and allow mdiobus_release to be executed ,kfreeing the bus. However in most drives, mdiobus_free be called to free the bus while mdiobus_register fails. use-after-free occurs when access bus again, this patch revert it to let mdiobus_free free the bus. KASAN report details as below: BUG: KASAN: use-after-free in mdiobus_free+0x85/0x90 drivers/net/phy/mdio_bus.c:482 Read of size 4 at addr ffff8881dc824d78 by task syz-executor.0/3524 CPU: 1 PID: 3524 Comm: syz-executor.0 Not tainted 5.0.0-rc7+ #45 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1ubuntu1 04/01/2014 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0xfa/0x1ce lib/dump_stack.c:113 print_address_description+0x65/0x270 mm/kasan/report.c:187 kasan_report+0x149/0x18d mm/kasan/report.c:317 mdiobus_free+0x85/0x90 drivers/net/phy/mdio_bus.c:482 fixed_mdio_bus_init+0x283/0x1000 [fixed_phy] ? 0xffffffffc0e40000 ? 0xffffffffc0e40000 ? 0xffffffffc0e40000 do_one_initcall+0xfa/0x5ca init/main.c:887 do_init_module+0x204/0x5f6 kernel/module.c:3460 load_module+0x66b2/0x8570 kernel/module.c:3808 __do_sys_finit_module+0x238/0x2a0 kernel/module.c:3902 do_syscall_64+0x147/0x600 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x462e99 Code: f7 d8 64 89 02 b8 ff ff ff ff c3 66 0f 1f 44 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 bc ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007f6215c19c58 EFLAGS: 00000246 ORIG_RAX: 0000000000000139 RAX: ffffffffffffffda RBX: 000000000073bf00 RCX: 0000000000462e99 RDX: 0000000000000000 RSI: 0000000020000080 RDI: 0000000000000003 RBP: 00007f6215c19c70 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 00007f6215c1a6bc R13: 00000000004bcefb R14: 00000000006f7030 R15: 0000000000000004 Allocated by task 3524: set_track mm/kasan/common.c:85 [inline] __kasan_kmalloc.constprop.3+0xa0/0xd0 mm/kasan/common.c:496 kmalloc include/linux/slab.h:545 [inline] kzalloc include/linux/slab.h:740 [inline] mdiobus_alloc_size+0x54/0x1b0 drivers/net/phy/mdio_bus.c:143 fixed_mdio_bus_init+0x163/0x1000 [fixed_phy] do_one_initcall+0xfa/0x5ca init/main.c:887 do_init_module+0x204/0x5f6 kernel/module.c:3460 load_module+0x66b2/0x8570 kernel/module.c:3808 __do_sys_finit_module+0x238/0x2a0 kernel/module.c:3902 do_syscall_64+0x147/0x600 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe Freed by task 3524: set_track mm/kasan/common.c:85 [inline] __kasan_slab_free+0x130/0x180 mm/kasan/common.c:458 slab_free_hook mm/slub.c:1409 [inline] slab_free_freelist_hook mm/slub.c:1436 [inline] slab_free mm/slub.c:2986 [inline] kfree+0xe1/0x270 mm/slub.c:3938 device_release+0x78/0x200 drivers/base/core.c:919 kobject_cleanup lib/kobject.c:662 [inline] kobject_release lib/kobject.c:691 [inline] kref_put include/linux/kref.h:67 [inline] kobject_put+0x146/0x240 lib/kobject.c:708 put_device+0x1c/0x30 drivers/base/core.c:2060 __mdiobus_register+0x483/0x560 drivers/net/phy/mdio_bus.c:382 fixed_mdio_bus_init+0x26b/0x1000 [fixed_phy] do_one_initcall+0xfa/0x5ca init/main.c:887 do_init_module+0x204/0x5f6 kernel/module.c:3460 load_module+0x66b2/0x8570 kernel/module.c:3808 __do_sys_finit_module+0x238/0x2a0 kernel/module.c:3902 do_syscall_64+0x147/0x600 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe The buggy address belongs to the object at ffff8881dc824c80 which belongs to the cache kmalloc-2k of size 2048 The buggy address is located 248 bytes inside of 2048-byte region [ffff8881dc824c80, ffff8881dc825480) The buggy address belongs to the page: page:ffffea0007720800 count:1 mapcount:0 mapping:ffff8881f6c02800 index:0x0 compound_mapcount: 0 flags: 0x2fffc0000010200(slab|head) raw: 02fffc0000010200 0000000000000000 0000000500000001 ffff8881f6c02800 raw: 0000000000000000 00000000800f000f 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff8881dc824c00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffff8881dc824c80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb >ffff8881dc824d00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff8881dc824d80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff8881dc824e00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb Fixes: 0c692d07842a ("drivers/net/phy/mdio_bus.c: call put_device on device_register() failure") Signed-off-by: YueHaibing Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/phy/mdio_bus.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/phy/mdio_bus.c b/drivers/net/phy/mdio_bus.c index 66b9cfe692fc..7368616286ae 100644 --- a/drivers/net/phy/mdio_bus.c +++ b/drivers/net/phy/mdio_bus.c @@ -379,7 +379,6 @@ int __mdiobus_register(struct mii_bus *bus, struct module *owner) err = device_register(&bus->dev); if (err) { pr_err("mii_bus %s failed to register\n", bus->id); - put_device(&bus->dev); return -EINVAL; } From 543fc3fb41834a7f2e4cfa1dcf8aa9c472a52e9a Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 21 Feb 2019 17:43:57 +0100 Subject: [PATCH 203/298] udpv6: add the required annotation to mib type In commit 029a37434880 ("udp6: cleanup stats accounting in recvmsg()") I forgot to add the percpu annotation for the mib pointer. Add it, and make sparse happy. Fixes: 029a37434880 ("udp6: cleanup stats accounting in recvmsg()") Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- net/ipv6/udp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 2596ffdeebea..e6c52c27f354 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -288,8 +288,8 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int peeked, peeking, off; int err; int is_udplite = IS_UDPLITE(sk); + struct udp_mib __percpu *mib; bool checksum_valid = false; - struct udp_mib *mib; int is_udp4; if (flags & MSG_ERRQUEUE) From 5de362df44d71fc8f6b153ae4eaa2a1284c84490 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 21 Feb 2019 17:43:58 +0100 Subject: [PATCH 204/298] fou6: fix proto error handler argument type Last argument of gue6_err_proto_handler() has a wrong type annotation, fix it and make sparse happy again. Fixes: b8a51b38e4d4 ("fou, fou6: ICMP error handlers for FoU and GUE") Signed-off-by: Paolo Abeni Acked-by: Stefano Brivio Signed-off-by: David S. Miller --- net/ipv6/fou6.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv6/fou6.c b/net/ipv6/fou6.c index b858bd5280bf..867474abe269 100644 --- a/net/ipv6/fou6.c +++ b/net/ipv6/fou6.c @@ -72,7 +72,7 @@ static int gue6_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, static int gue6_err_proto_handler(int proto, struct sk_buff *skb, struct inet6_skb_parm *opt, - u8 type, u8 code, int offset, u32 info) + u8 type, u8 code, int offset, __be32 info) { const struct inet6_protocol *ipprot; From 424a7cd078401591fc45587ffb2c012d7f402fb7 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 21 Feb 2019 17:43:59 +0100 Subject: [PATCH 205/298] udpv6: fix possible user after free in error handler Before derefencing the encap pointer, commit e7cc082455cb ("udp: Support for error handlers of tunnels with arbitrary destination port") checks for a NULL value, but the two fetch operation can race with removal. Fix the above using a single access. Also fix a couple of type annotations, to make sparse happy. Fixes: e7cc082455cb ("udp: Support for error handlers of tunnels with arbitrary destination port") Signed-off-by: Paolo Abeni Acked-by: Stefano Brivio Signed-off-by: David S. Miller --- net/ipv6/udp.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index e6c52c27f354..b444483cdb2b 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -420,17 +420,19 @@ EXPORT_SYMBOL(udpv6_encap_enable); */ static int __udp6_lib_err_encap_no_sk(struct sk_buff *skb, struct inet6_skb_parm *opt, - u8 type, u8 code, int offset, u32 info) + u8 type, u8 code, int offset, __be32 info) { int i; for (i = 0; i < MAX_IPTUN_ENCAP_OPS; i++) { int (*handler)(struct sk_buff *skb, struct inet6_skb_parm *opt, - u8 type, u8 code, int offset, u32 info); + u8 type, u8 code, int offset, __be32 info); + const struct ip6_tnl_encap_ops *encap; - if (!ip6tun_encaps[i]) + encap = rcu_dereference(ip6tun_encaps[i]); + if (!encap) continue; - handler = rcu_dereference(ip6tun_encaps[i]->err_handler); + handler = encap->err_handler; if (handler && !handler(skb, opt, type, code, offset, info)) return 0; } From 92b95364235b6441a36861ff0ca4541a13351d60 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 21 Feb 2019 17:44:00 +0100 Subject: [PATCH 206/298] udp: fix possible user after free in error handler Similar to the previous commit, this addresses the same issue for ipv4: use a single fetch operation and use the correct rcu annotation. Fixes: e7cc082455cb ("udp: Support for error handlers of tunnels with arbitrary destination port") Signed-off-by: Paolo Abeni Acked-by: Stefano Brivio Signed-off-by: David S. Miller --- net/ipv4/udp.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 5c3cd5d84a6f..372fdc5381a9 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -562,10 +562,12 @@ static int __udp4_lib_err_encap_no_sk(struct sk_buff *skb, u32 info) for (i = 0; i < MAX_IPTUN_ENCAP_OPS; i++) { int (*handler)(struct sk_buff *skb, u32 info); + const struct ip_tunnel_encap_ops *encap; - if (!iptun_encaps[i]) + encap = rcu_dereference(iptun_encaps[i]); + if (!encap) continue; - handler = rcu_dereference(iptun_encaps[i]->err_handler); + handler = encap->err_handler; if (handler && !handler(skb, info)) return 0; } From b4b8bb69c104a9345c528692cde5aa520d885360 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 23 Feb 2019 00:03:44 +0100 Subject: [PATCH 207/298] bpf, doc: add bpf list as secondary entry to maintainers file We recently created a bpf@vger.kernel.org list (https://lore.kernel.org/bpf/) for BPF related discussions, originally in context of BPF track at LSF/MM for topic discussions. It's *optional* but *desirable* to keep it in Cc for BPF related kernel/loader/llvm/tooling threads, meaning also infrastructure like llvm that sits on top of kernel but is crucial to BPF. In any case, netdev with it's bpf delegate is *as-is* today primary list for patches, so nothing changes in the workflow. Main purpose is to have some more awareness for the bpf@vger.kernel.org list that folks can Cc for BPF specific topics. Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- MAINTAINERS | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 41ce5f4ad838..d78f3714de08 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2852,7 +2852,7 @@ R: Martin KaFai Lau R: Song Liu R: Yonghong Song L: netdev@vger.kernel.org -L: linux-kernel@vger.kernel.org +L: bpf@vger.kernel.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf.git T: git git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git Q: https://patchwork.ozlabs.org/project/netdev/list/?delegate=77147 @@ -2882,6 +2882,7 @@ N: bpf BPF JIT for ARM M: Shubham Bansal L: netdev@vger.kernel.org +L: bpf@vger.kernel.org S: Maintained F: arch/arm/net/ @@ -2890,18 +2891,21 @@ M: Daniel Borkmann M: Alexei Starovoitov M: Zi Shen Lim L: netdev@vger.kernel.org +L: bpf@vger.kernel.org S: Supported F: arch/arm64/net/ BPF JIT for MIPS (32-BIT AND 64-BIT) M: Paul Burton L: netdev@vger.kernel.org +L: bpf@vger.kernel.org S: Maintained F: arch/mips/net/ BPF JIT for NFP NICs M: Jakub Kicinski L: netdev@vger.kernel.org +L: bpf@vger.kernel.org S: Supported F: drivers/net/ethernet/netronome/nfp/bpf/ @@ -2909,6 +2913,7 @@ BPF JIT for POWERPC (32-BIT AND 64-BIT) M: Naveen N. Rao M: Sandipan Das L: netdev@vger.kernel.org +L: bpf@vger.kernel.org S: Maintained F: arch/powerpc/net/ @@ -2916,6 +2921,7 @@ BPF JIT for S390 M: Martin Schwidefsky M: Heiko Carstens L: netdev@vger.kernel.org +L: bpf@vger.kernel.org S: Maintained F: arch/s390/net/ X: arch/s390/net/pnet.c @@ -2923,12 +2929,14 @@ X: arch/s390/net/pnet.c BPF JIT for SPARC (32-BIT AND 64-BIT) M: David S. Miller L: netdev@vger.kernel.org +L: bpf@vger.kernel.org S: Maintained F: arch/sparc/net/ BPF JIT for X86 32-BIT M: Wang YanQing L: netdev@vger.kernel.org +L: bpf@vger.kernel.org S: Maintained F: arch/x86/net/bpf_jit_comp32.c @@ -2936,6 +2944,7 @@ BPF JIT for X86 64-BIT M: Alexei Starovoitov M: Daniel Borkmann L: netdev@vger.kernel.org +L: bpf@vger.kernel.org S: Supported F: arch/x86/net/ X: arch/x86/net/bpf_jit_comp32.c @@ -8487,6 +8496,7 @@ L7 BPF FRAMEWORK M: John Fastabend M: Daniel Borkmann L: netdev@vger.kernel.org +L: bpf@vger.kernel.org S: Maintained F: include/linux/skmsg.h F: net/core/skmsg.c @@ -16714,6 +16724,7 @@ M: Jesper Dangaard Brouer M: John Fastabend L: netdev@vger.kernel.org L: xdp-newbies@vger.kernel.org +L: bpf@vger.kernel.org S: Supported F: net/core/xdp.c F: include/net/xdp.h @@ -16727,6 +16738,7 @@ XDP SOCKETS (AF_XDP) M: Björn Töpel M: Magnus Karlsson L: netdev@vger.kernel.org +L: bpf@vger.kernel.org S: Maintained F: kernel/bpf/xskmap.c F: net/xdp/ From 61a65d32fe91c2b6ea3aed47c5f1efc7acd89ba2 Mon Sep 17 00:00:00 2001 From: Maxime Chevallier Date: Thu, 21 Feb 2019 17:54:11 +0100 Subject: [PATCH 208/298] net: phy: marvell10g: Fix Multi-G advertisement to only advertise 10G Some Marvell Alaska PHYs support 2.5G, 5G and 10G BaseT links. Their default behaviour is to advertise all of these modes, but at the moment, only 10GBaseT is supported. To prevent link partners from establishing link at that speed, clear these modes upon configuring aneg parameters. Fixes: 20b2af32ff3f ("net: phy: add Marvell Alaska X 88X3310 10Gigabit PHY support") Signed-off-by: Maxime Chevallier Reported-by: Russell King Signed-off-by: David S. Miller --- drivers/net/phy/marvell10g.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/net/phy/marvell10g.c b/drivers/net/phy/marvell10g.c index 82ab6ed3b74e..6bac602094bd 100644 --- a/drivers/net/phy/marvell10g.c +++ b/drivers/net/phy/marvell10g.c @@ -26,6 +26,8 @@ #include #include +#define MDIO_AN_10GBT_CTRL_ADV_NBT_MASK 0x01e0 + enum { MV_PCS_BASE_T = 0x0000, MV_PCS_BASE_R = 0x1000, @@ -386,8 +388,10 @@ static int mv3310_config_aneg(struct phy_device *phydev) else reg = 0; + /* Make sure we clear unsupported 2.5G/5G advertising */ ret = mv3310_modify(phydev, MDIO_MMD_AN, MDIO_AN_10GBT_CTRL, - MDIO_AN_10GBT_CTRL_ADV10G, reg); + MDIO_AN_10GBT_CTRL_ADV10G | + MDIO_AN_10GBT_CTRL_ADV_NBT_MASK, reg); if (ret < 0) return ret; if (ret > 0) From 4593403fa516a5a4cffe6883c5062d60932cbfbe Mon Sep 17 00:00:00 2001 From: Mao Wenan Date: Fri, 22 Feb 2019 14:57:23 +0800 Subject: [PATCH 209/298] net: set static variable an initial value in atl2_probe() cards_found is a static variable, but when it enters atl2_probe(), cards_found is set to zero, the value is not consistent with last probe, so next behavior is not our expect. Signed-off-by: Mao Wenan Signed-off-by: David S. Miller --- drivers/net/ethernet/atheros/atlx/atl2.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/ethernet/atheros/atlx/atl2.c b/drivers/net/ethernet/atheros/atlx/atl2.c index bb41becb6609..31ff1e0d1baa 100644 --- a/drivers/net/ethernet/atheros/atlx/atl2.c +++ b/drivers/net/ethernet/atheros/atlx/atl2.c @@ -1335,13 +1335,11 @@ static int atl2_probe(struct pci_dev *pdev, const struct pci_device_id *ent) { struct net_device *netdev; struct atl2_adapter *adapter; - static int cards_found; + static int cards_found = 0; unsigned long mmio_start; int mmio_len; int err; - cards_found = 0; - err = pci_enable_device(pdev); if (err) return err; From af548a27b158d548d41e56255e6eaca1658cc3be Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo Date: Fri, 22 Feb 2019 07:27:41 -0300 Subject: [PATCH 210/298] selftests: fib_tests: sleep after changing carrier. again. Just like commit e2ba732a1681 ("selftests: fib_tests: sleep after changing carrier"), wait one second to allow linkwatch to propagate the carrier change to the stack. There are two sets of carrier tests. The first slept after the carrier was set to off, and when the second set ran, it was likely that the linkwatch would be able to run again without much delay, reducing the likelihood of a race. However, if you run 'fib_tests.sh -t carrier' on a loop, you will quickly notice the failures. Sleeping on the second set of tests make the failures go away. Cc: David Ahern Signed-off-by: Thadeu Lima de Souza Cascardo Reviewed-by: David Ahern Signed-off-by: David S. Miller --- tools/testing/selftests/net/fib_tests.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/net/fib_tests.sh b/tools/testing/selftests/net/fib_tests.sh index 802b4af18729..1080ff55a788 100755 --- a/tools/testing/selftests/net/fib_tests.sh +++ b/tools/testing/selftests/net/fib_tests.sh @@ -388,6 +388,7 @@ fib_carrier_unicast_test() set -e $IP link set dev dummy0 carrier off + sleep 1 set +e echo " Carrier down" From 278e2148c07559dd4ad8602f22366d61eb2ee7b7 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Fri, 22 Feb 2019 21:22:32 +0800 Subject: [PATCH 211/298] Revert "bridge: do not add port to router list when receives query with source 0.0.0.0" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 5a2de63fd1a5 ("bridge: do not add port to router list when receives query with source 0.0.0.0") and commit 0fe5119e267f ("net: bridge: remove ipv6 zero address check in mcast queries") The reason is RFC 4541 is not a standard but suggestive. Currently we will elect 0.0.0.0 as Querier if there is no ip address configured on bridge. If we do not add the port which recives query with source 0.0.0.0 to router list, the IGMP reports will not be about to forward to Querier, IGMP data will also not be able to forward to dest. As Nikolay suggested, revert this change first and add a boolopt api to disable none-zero election in future if needed. Reported-by: Linus Lüssing Reported-by: Sebastian Gottschall Fixes: 5a2de63fd1a5 ("bridge: do not add port to router list when receives query with source 0.0.0.0") Fixes: 0fe5119e267f ("net: bridge: remove ipv6 zero address check in mcast queries") Signed-off-by: Hangbin Liu Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br_multicast.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 3aeff0895669..ac92b2eb32b1 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -1204,14 +1204,7 @@ static void br_multicast_query_received(struct net_bridge *br, return; br_multicast_update_query_timer(br, query, max_delay); - - /* Based on RFC4541, section 2.1.1 IGMP Forwarding Rules, - * the arrival port for IGMP Queries where the source address - * is 0.0.0.0 should not be added to router port list. - */ - if ((saddr->proto == htons(ETH_P_IP) && saddr->u.ip4) || - saddr->proto == htons(ETH_P_IPV6)) - br_multicast_mark_router(br, port); + br_multicast_mark_router(br, port); } static void br_ip4_multicast_query(struct net_bridge *br, From 99407d8fa3abfe41b04d9321a9df0a0e30a57fae Mon Sep 17 00:00:00 2001 From: Hauke Mehrtens Date: Fri, 22 Feb 2019 20:07:45 +0100 Subject: [PATCH 212/298] net: dsa: Remove documentation for port_fdb_prepare This callback was removed some time ago, also remove the documentation. Fixes: 1b6dd556c304 ("net: dsa: Remove prepare phase for FDB") Signed-off-by: Hauke Mehrtens Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- Documentation/networking/dsa/dsa.txt | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/Documentation/networking/dsa/dsa.txt b/Documentation/networking/dsa/dsa.txt index 25170ad7d25b..101f2b2c69ad 100644 --- a/Documentation/networking/dsa/dsa.txt +++ b/Documentation/networking/dsa/dsa.txt @@ -533,16 +533,12 @@ Bridge VLAN filtering function that the driver has to call for each VLAN the given port is a member of. A switchdev object is used to carry the VID and bridge flags. -- port_fdb_prepare: bridge layer function invoked when the bridge prepares the - installation of a Forwarding Database entry. If the operation is not - supported, this function should return -EOPNOTSUPP to inform the bridge code - to fallback to a software implementation. No hardware setup must be done in - this function. See port_fdb_add for this and details. - - port_fdb_add: bridge layer function invoked when the bridge wants to install a Forwarding Database entry, the switch hardware should be programmed with the specified address in the specified VLAN Id in the forwarding database - associated with this VLAN ID + associated with this VLAN ID. If the operation is not supported, this + function should return -EOPNOTSUPP to inform the bridge code to fallback to + a software implementation. Note: VLAN ID 0 corresponds to the port private database, which, in the context of DSA, would be the its port-based VLAN, used by the associated bridge device. From 797a22bd5298c2674d927893f46cadf619dad11d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 23 Feb 2019 13:24:59 -0800 Subject: [PATCH 213/298] net/x25: fix a race in x25_bind() syzbot was able to trigger another soft lockup [1] I first thought it was the O(N^2) issue I mentioned in my prior fix (f657d22ee1f "net/x25: do not hold the cpu too long in x25_new_lci()"), but I eventually found that x25_bind() was not checking SOCK_ZAPPED state under socket lock protection. This means that multiple threads can end up calling x25_insert_socket() for the same socket, and corrupt x25_list [1] watchdog: BUG: soft lockup - CPU#0 stuck for 123s! [syz-executor.2:10492] Modules linked in: irq event stamp: 27515 hardirqs last enabled at (27514): [] trace_hardirqs_on_thunk+0x1a/0x1c hardirqs last disabled at (27515): [] trace_hardirqs_off_thunk+0x1a/0x1c softirqs last enabled at (32): [] x25_get_neigh+0xa3/0xd0 net/x25/x25_link.c:336 softirqs last disabled at (34): [] x25_find_socket+0x23/0x140 net/x25/af_x25.c:341 CPU: 0 PID: 10492 Comm: syz-executor.2 Not tainted 5.0.0-rc7+ #88 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:__sanitizer_cov_trace_pc+0x4/0x50 kernel/kcov.c:97 Code: f4 ff ff ff e8 11 9f ea ff 48 c7 05 12 fb e5 08 00 00 00 00 e9 c8 e9 ff ff 90 90 90 90 90 90 90 90 90 90 90 90 90 55 48 89 e5 <48> 8b 75 08 65 48 8b 04 25 40 ee 01 00 65 8b 15 38 0c 92 7e 81 e2 RSP: 0018:ffff88806e94fc48 EFLAGS: 00000286 ORIG_RAX: ffffffffffffff13 RAX: 1ffff1100d84dac5 RBX: 0000000000000001 RCX: ffffc90006197000 RDX: 0000000000040000 RSI: ffffffff86324bf3 RDI: ffff88806c26d628 RBP: ffff88806e94fc48 R08: ffff88806c1c6500 R09: fffffbfff1282561 R10: fffffbfff1282560 R11: ffffffff89412b03 R12: ffff88806c26d628 R13: ffff888090455200 R14: dffffc0000000000 R15: 0000000000000000 FS: 00007f3a107e4700(0000) GS:ffff8880ae800000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f3a107e3db8 CR3: 00000000a5544000 CR4: 00000000001406f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: __x25_find_socket net/x25/af_x25.c:327 [inline] x25_find_socket+0x7d/0x140 net/x25/af_x25.c:342 x25_new_lci net/x25/af_x25.c:355 [inline] x25_connect+0x380/0xde0 net/x25/af_x25.c:784 __sys_connect+0x266/0x330 net/socket.c:1662 __do_sys_connect net/socket.c:1673 [inline] __se_sys_connect net/socket.c:1670 [inline] __x64_sys_connect+0x73/0xb0 net/socket.c:1670 do_syscall_64+0x103/0x610 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x457e29 Code: ad b8 fb ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 7b b8 fb ff c3 66 2e 0f 1f 84 00 00 00 00 RSP: 002b:00007f3a107e3c78 EFLAGS: 00000246 ORIG_RAX: 000000000000002a RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 0000000000457e29 RDX: 0000000000000012 RSI: 0000000020000200 RDI: 0000000000000005 RBP: 000000000073c040 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 00007f3a107e46d4 R13: 00000000004be362 R14: 00000000004ceb98 R15: 00000000ffffffff Sending NMI from CPU 0 to CPUs 1: NMI backtrace for cpu 1 CPU: 1 PID: 10493 Comm: syz-executor.3 Not tainted 5.0.0-rc7+ #88 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:__read_once_size include/linux/compiler.h:193 [inline] RIP: 0010:queued_write_lock_slowpath+0x143/0x290 kernel/locking/qrwlock.c:86 Code: 4c 8d 2c 01 41 83 c7 03 41 0f b6 45 00 41 38 c7 7c 08 84 c0 0f 85 0c 01 00 00 8b 03 3d 00 01 00 00 74 1a f3 90 41 0f b6 55 00 <41> 38 d7 7c eb 84 d2 74 e7 48 89 df e8 cc aa 4e 00 eb dd be 04 00 RSP: 0018:ffff888085c47bd8 EFLAGS: 00000206 RAX: 0000000000000300 RBX: ffffffff89412b00 RCX: 1ffffffff1282560 RDX: 0000000000000000 RSI: 0000000000000004 RDI: ffffffff89412b00 RBP: ffff888085c47c70 R08: 1ffffffff1282560 R09: fffffbfff1282561 R10: fffffbfff1282560 R11: ffffffff89412b03 R12: 00000000000000ff R13: fffffbfff1282560 R14: 1ffff11010b88f7d R15: 0000000000000003 FS: 00007fdd04086700(0000) GS:ffff8880ae900000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fdd04064db8 CR3: 0000000090be0000 CR4: 00000000001406e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: queued_write_lock include/asm-generic/qrwlock.h:104 [inline] do_raw_write_lock+0x1d6/0x290 kernel/locking/spinlock_debug.c:203 __raw_write_lock_bh include/linux/rwlock_api_smp.h:204 [inline] _raw_write_lock_bh+0x3b/0x50 kernel/locking/spinlock.c:312 x25_insert_socket+0x21/0xe0 net/x25/af_x25.c:267 x25_bind+0x273/0x340 net/x25/af_x25.c:703 __sys_bind+0x23f/0x290 net/socket.c:1481 __do_sys_bind net/socket.c:1492 [inline] __se_sys_bind net/socket.c:1490 [inline] __x64_sys_bind+0x73/0xb0 net/socket.c:1490 do_syscall_64+0x103/0x610 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x457e29 Fixes: 90c27297a9bf ("X.25 remove bkl in bind") Signed-off-by: Eric Dumazet Cc: andrew hendry Signed-off-by: David S. Miller --- net/x25/af_x25.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index ec3a828672ef..eff31348e20b 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -679,8 +679,7 @@ static int x25_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) struct sockaddr_x25 *addr = (struct sockaddr_x25 *)uaddr; int len, i, rc = 0; - if (!sock_flag(sk, SOCK_ZAPPED) || - addr_len != sizeof(struct sockaddr_x25) || + if (addr_len != sizeof(struct sockaddr_x25) || addr->sx25_family != AF_X25) { rc = -EINVAL; goto out; @@ -699,9 +698,13 @@ static int x25_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) } lock_sock(sk); - x25_sk(sk)->source_addr = addr->sx25_addr; - x25_insert_socket(sk); - sock_reset_flag(sk, SOCK_ZAPPED); + if (sock_flag(sk, SOCK_ZAPPED)) { + x25_sk(sk)->source_addr = addr->sx25_addr; + x25_insert_socket(sk); + sock_reset_flag(sk, SOCK_ZAPPED); + } else { + rc = -EINVAL; + } release_sock(sk); SOCK_DEBUG(sk, "x25_bind: socket is bound\n"); out: From bf50b606cfd85ac8d3d0adb711f3e22204059848 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 23 Feb 2019 15:51:51 -0800 Subject: [PATCH 214/298] tcp: repaired skbs must init their tso_segs syzbot reported a WARN_ON(!tcp_skb_pcount(skb)) in tcp_send_loss_probe() [1] This was caused by TCP_REPAIR sent skbs that inadvertenly were missing a call to tcp_init_tso_segs() [1] WARNING: CPU: 1 PID: 0 at net/ipv4/tcp_output.c:2534 tcp_send_loss_probe+0x771/0x8a0 net/ipv4/tcp_output.c:2534 Kernel panic - not syncing: panic_on_warn set ... CPU: 1 PID: 0 Comm: swapper/1 Not tainted 5.0.0-rc7+ #77 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x172/0x1f0 lib/dump_stack.c:113 panic+0x2cb/0x65c kernel/panic.c:214 __warn.cold+0x20/0x45 kernel/panic.c:571 report_bug+0x263/0x2b0 lib/bug.c:186 fixup_bug arch/x86/kernel/traps.c:178 [inline] fixup_bug arch/x86/kernel/traps.c:173 [inline] do_error_trap+0x11b/0x200 arch/x86/kernel/traps.c:271 do_invalid_op+0x37/0x50 arch/x86/kernel/traps.c:290 invalid_op+0x14/0x20 arch/x86/entry/entry_64.S:973 RIP: 0010:tcp_send_loss_probe+0x771/0x8a0 net/ipv4/tcp_output.c:2534 Code: 88 fc ff ff 4c 89 ef e8 ed 75 c8 fb e9 c8 fc ff ff e8 43 76 c8 fb e9 63 fd ff ff e8 d9 75 c8 fb e9 94 f9 ff ff e8 bf 03 91 fb <0f> 0b e9 7d fa ff ff e8 b3 03 91 fb 0f b6 1d 37 43 7a 03 31 ff 89 RSP: 0018:ffff8880ae907c60 EFLAGS: 00010206 RAX: ffff8880a989c340 RBX: 0000000000000000 RCX: ffffffff85dedbdb RDX: 0000000000000100 RSI: ffffffff85dee0b1 RDI: 0000000000000005 RBP: ffff8880ae907c90 R08: ffff8880a989c340 R09: ffffed10147d1ae1 R10: ffffed10147d1ae0 R11: ffff8880a3e8d703 R12: ffff888091b90040 R13: ffff8880a3e8d540 R14: 0000000000008000 R15: ffff888091b90860 tcp_write_timer_handler+0x5c0/0x8a0 net/ipv4/tcp_timer.c:583 tcp_write_timer+0x10e/0x1d0 net/ipv4/tcp_timer.c:607 call_timer_fn+0x190/0x720 kernel/time/timer.c:1325 expire_timers kernel/time/timer.c:1362 [inline] __run_timers kernel/time/timer.c:1681 [inline] __run_timers kernel/time/timer.c:1649 [inline] run_timer_softirq+0x652/0x1700 kernel/time/timer.c:1694 __do_softirq+0x266/0x95a kernel/softirq.c:292 invoke_softirq kernel/softirq.c:373 [inline] irq_exit+0x180/0x1d0 kernel/softirq.c:413 exiting_irq arch/x86/include/asm/apic.h:536 [inline] smp_apic_timer_interrupt+0x14a/0x570 arch/x86/kernel/apic/apic.c:1062 apic_timer_interrupt+0xf/0x20 arch/x86/entry/entry_64.S:807 RIP: 0010:native_safe_halt+0x2/0x10 arch/x86/include/asm/irqflags.h:58 Code: ff ff ff 48 89 c7 48 89 45 d8 e8 59 0c a1 fa 48 8b 45 d8 e9 ce fe ff ff 48 89 df e8 48 0c a1 fa eb 82 90 90 90 90 90 90 fb f4 0f 1f 00 66 2e 0f 1f 84 00 00 00 00 00 f4 c3 90 90 90 90 90 90 RSP: 0018:ffff8880a98afd78 EFLAGS: 00000286 ORIG_RAX: ffffffffffffff13 RAX: 1ffffffff1125061 RBX: ffff8880a989c340 RCX: 0000000000000000 RDX: dffffc0000000000 RSI: 0000000000000001 RDI: ffff8880a989cbbc RBP: ffff8880a98afda8 R08: ffff8880a989c340 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000001 R13: ffffffff889282f8 R14: 0000000000000001 R15: 0000000000000000 arch_cpu_idle+0x10/0x20 arch/x86/kernel/process.c:555 default_idle_call+0x36/0x90 kernel/sched/idle.c:93 cpuidle_idle_call kernel/sched/idle.c:153 [inline] do_idle+0x386/0x570 kernel/sched/idle.c:262 cpu_startup_entry+0x1b/0x20 kernel/sched/idle.c:353 start_secondary+0x404/0x5c0 arch/x86/kernel/smpboot.c:271 secondary_startup_64+0xa4/0xb0 arch/x86/kernel/head_64.S:243 Kernel Offset: disabled Rebooting in 86400 seconds.. Fixes: 79861919b889 ("tcp: fix TCP_REPAIR xmit queue setup") Signed-off-by: Eric Dumazet Reported-by: syzbot Cc: Andrey Vagin Cc: Soheil Hassas Yeganeh Cc: Neal Cardwell Acked-by: Soheil Hassas Yeganeh Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 730bc44dbad9..ccc78f3a4b60 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2347,6 +2347,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, /* "skb_mstamp_ns" is used as a start point for the retransmit timer */ skb->skb_mstamp_ns = tp->tcp_wstamp_ns = tp->tcp_clock_cache; list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); + tcp_init_tso_segs(skb, mss_now); goto repair; /* Skip network transmission */ } From 4c8e0459b585e2a7b367545be3e102737f1e489f Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Sun, 24 Feb 2019 01:11:15 +0100 Subject: [PATCH 215/298] net: phy: realtek: Dummy IRQ calls for RTL8366RB This fixes a regression introduced by commit 0d2e778e38e0ddffab4bb2b0e9ed2ad5165c4bf7 "net: phy: replace PHY_HAS_INTERRUPT with a check for config_intr and ack_interrupt". This assumes that a PHY cannot trigger interrupt unless it has .config_intr() or .ack_interrupt() implemented. A later patch makes the code assume both need to be implemented for interrupts to be present. But this PHY (which is inside a DSA) will happily fire interrupts without either callback. Implement dummy callbacks for .config_intr() and .ack_interrupt() in the phy header to fix this. Tested on the RTL8366RB on D-Link DIR-685. Fixes: 0d2e778e38e0 ("net: phy: replace PHY_HAS_INTERRUPT with a check for config_intr and ack_interrupt") Cc: Heiner Kallweit Signed-off-by: Linus Walleij Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/phy/realtek.c | 7 +++++++ include/linux/phy.h | 8 ++++++++ 2 files changed, 15 insertions(+) diff --git a/drivers/net/phy/realtek.c b/drivers/net/phy/realtek.c index c6010fb1aa0f..cb4a23041a94 100644 --- a/drivers/net/phy/realtek.c +++ b/drivers/net/phy/realtek.c @@ -282,6 +282,13 @@ static struct phy_driver realtek_drvs[] = { .name = "RTL8366RB Gigabit Ethernet", .features = PHY_GBIT_FEATURES, .config_init = &rtl8366rb_config_init, + /* These interrupts are handled by the irq controller + * embedded inside the RTL8366RB, they get unmasked when the + * irq is requested and ACKed by reading the status register, + * which is done by the irqchip code. + */ + .ack_interrupt = genphy_no_ack_interrupt, + .config_intr = genphy_no_config_intr, .suspend = genphy_suspend, .resume = genphy_resume, }, diff --git a/include/linux/phy.h b/include/linux/phy.h index 127fcc9c3778..333b56d8f746 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -992,6 +992,14 @@ static inline int genphy_no_soft_reset(struct phy_device *phydev) { return 0; } +static inline int genphy_no_ack_interrupt(struct phy_device *phydev) +{ + return 0; +} +static inline int genphy_no_config_intr(struct phy_device *phydev) +{ + return 0; +} int genphy_read_mmd_unsupported(struct phy_device *phdev, int devad, u16 regnum); int genphy_write_mmd_unsupported(struct phy_device *phdev, int devnum, From 5908e6b738e3357af42c10e1183753c70a0117a9 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 24 Feb 2019 16:46:45 -0800 Subject: [PATCH 216/298] Linux 5.0-rc8 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 96c5335e7ee4..ac5ac28a24e9 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 5 PATCHLEVEL = 0 SUBLEVEL = 0 -EXTRAVERSION = -rc7 +EXTRAVERSION = -rc8 NAME = Shy Crocodile # *DOCUMENTATION* From 8f67c90ee9148eab3d2b4393c3cf76489b27f87c Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Fri, 22 Feb 2019 12:33:25 +0100 Subject: [PATCH 217/298] net/sched: act_ipt: fix refcount leak when replace fails After commit 4e8ddd7f1758 ("net: sched: don't release reference on action overwrite"), the error path of all actions was converted to drop refcount also when the action was being overwritten. But we forgot act_ipt_init(), in case allocation of 'tname' was not successful: # tc action add action xt -j LOG --log-prefix hello index 100 tablename: mangle hook: NF_IP_POST_ROUTING target: LOG level warning prefix "hello" index 100 # tc action show action xt total acts 1 action order 0: tablename: mangle hook: NF_IP_POST_ROUTING target LOG level warning prefix "hello" index 100 ref 1 bind 0 # tc action replace action xt -j LOG --log-prefix world index 100 tablename: mangle hook: NF_IP_POST_ROUTING target: LOG level warning prefix "world" index 100 RTNETLINK answers: Cannot allocate memory We have an error talking to the kernel # tc action show action xt total acts 1 action order 0: tablename: mangle hook: NF_IP_POST_ROUTING target LOG level warning prefix "hello" index 100 ref 2 bind 0 Ensure we call tcf_idr_release(), in case 'tname' allocation failed, also when the action is being replaced. Fixes: 4e8ddd7f1758 ("net: sched: don't release reference on action overwrite") Signed-off-by: Davide Caratti Signed-off-by: David S. Miller --- net/sched/act_ipt.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index 8af6c11d2482..faa1addf89b3 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -199,8 +199,7 @@ static int __tcf_ipt_init(struct net *net, unsigned int id, struct nlattr *nla, err2: kfree(tname); err1: - if (ret == ACT_P_CREATED) - tcf_idr_release(*a, bind); + tcf_idr_release(*a, bind); return err; } From 6191da98062d25276a3b88fb2a94dcbcfb3ea65d Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Fri, 22 Feb 2019 12:33:26 +0100 Subject: [PATCH 218/298] net/sched: act_skbedit: fix refcount leak when replace fails when act_skbedit was converted to use RCU in the data plane, we added an error path, but we forgot to drop the action refcount in case of failure during a 'replace' operation: # tc actions add action skbedit ptype otherhost pass index 100 # tc action show action skbedit total acts 1 action order 0: skbedit ptype otherhost pass index 100 ref 1 bind 0 # tc actions replace action skbedit ptype otherhost drop index 100 RTNETLINK answers: Cannot allocate memory We have an error talking to the kernel # tc action show action skbedit total acts 1 action order 0: skbedit ptype otherhost pass index 100 ref 2 bind 0 Ensure we call tcf_idr_release(), in case 'params_new' allocation failed, also when the action is being replaced. Fixes: c749cdda9089 ("net/sched: act_skbedit: don't use spinlock in the data path") Signed-off-by: Davide Caratti Signed-off-by: David S. Miller --- net/sched/act_skbedit.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index 64dba3708fce..cfceed28c333 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -189,8 +189,7 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, params_new = kzalloc(sizeof(*params_new), GFP_KERNEL); if (unlikely(!params_new)) { - if (ret == ACT_P_CREATED) - tcf_idr_release(*a, bind); + tcf_idr_release(*a, bind); return -ENOMEM; } From cffde20164d2a1e7d647b63b4d8fac11bf48b500 Mon Sep 17 00:00:00 2001 From: Hauke Mehrtens Date: Fri, 22 Feb 2019 20:11:13 +0100 Subject: [PATCH 219/298] net: dsa: lantiq: Add GPHY firmware files This adds the file names of the FW files which this driver handles into the module description. Signed-off-by: Hauke Mehrtens Signed-off-by: David S. Miller --- drivers/net/dsa/lantiq_gswip.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/dsa/lantiq_gswip.c b/drivers/net/dsa/lantiq_gswip.c index 693a67f45bef..ddc1f9ca8ebc 100644 --- a/drivers/net/dsa/lantiq_gswip.c +++ b/drivers/net/dsa/lantiq_gswip.c @@ -1162,6 +1162,12 @@ static struct platform_driver gswip_driver = { module_platform_driver(gswip_driver); +MODULE_FIRMWARE("lantiq/xrx300_phy11g_a21.bin"); +MODULE_FIRMWARE("lantiq/xrx300_phy22f_a21.bin"); +MODULE_FIRMWARE("lantiq/xrx200_phy11g_a14.bin"); +MODULE_FIRMWARE("lantiq/xrx200_phy11g_a22.bin"); +MODULE_FIRMWARE("lantiq/xrx200_phy22f_a14.bin"); +MODULE_FIRMWARE("lantiq/xrx200_phy22f_a22.bin"); MODULE_AUTHOR("Hauke Mehrtens "); MODULE_DESCRIPTION("Lantiq / Intel GSWIP driver"); MODULE_LICENSE("GPL v2"); From 71828b2240692cec0e68b8d867bc00e1745e7fae Mon Sep 17 00:00:00 2001 From: Timur Celik Date: Sat, 23 Feb 2019 12:53:13 +0100 Subject: [PATCH 220/298] tun: fix blocking read This patch moves setting of the current state into the loop. Otherwise the task may end up in a busy wait loop if none of the break conditions are met. Signed-off-by: Timur Celik Signed-off-by: David S. Miller --- drivers/net/tun.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index fed298c0cb39..d291762b9e9d 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -2167,9 +2167,9 @@ static void *tun_ring_recv(struct tun_file *tfile, int noblock, int *err) } add_wait_queue(&tfile->wq.wait, &wait); - current->state = TASK_INTERRUPTIBLE; while (1) { + set_current_state(TASK_INTERRUPTIBLE); ptr = ptr_ring_consume(&tfile->tx_ring); if (ptr) break; @@ -2185,7 +2185,7 @@ static void *tun_ring_recv(struct tun_file *tfile, int noblock, int *err) schedule(); } - current->state = TASK_RUNNING; + set_current_state(TASK_RUNNING); remove_wait_queue(&tfile->wq.wait, &wait); out: From 014e90ca44eeb8434f135e26a3cc17e57d64d85d Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Sun, 24 Feb 2019 00:04:52 +0100 Subject: [PATCH 221/298] ARM: dts: gemini: Re-enable display controller commit 137cd7100ec6fa36d610e106df00acb4d8af99df "ARM: dts: Enable Gemini flash access" contained a bug by disabling the display controller, while the whole idea with the patch was to enable flash access AND the display controller, simultaneously. Fix it up. Fixes: 137cd7100ec6 ("ARM: dts: Enable Gemini flash access") Signed-off-by: Linus Walleij Signed-off-by: Arnd Bergmann --- arch/arm/boot/dts/gemini-dlink-dir-685.dts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/boot/dts/gemini-dlink-dir-685.dts b/arch/arm/boot/dts/gemini-dlink-dir-685.dts index cc0c3cf89eaa..592111c8d6fd 100644 --- a/arch/arm/boot/dts/gemini-dlink-dir-685.dts +++ b/arch/arm/boot/dts/gemini-dlink-dir-685.dts @@ -443,7 +443,7 @@ ata@63000000 { }; display-controller@6a000000 { - status = "disabled"; + status = "okay"; port@0 { reg = <0>; From c9bd505dbd9d3dc80c496f88eafe70affdcf1ba6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jonathan=20Neusch=C3=A4fer?= Date: Sun, 10 Feb 2019 18:31:07 +0100 Subject: [PATCH 222/298] mmc: spi: Fix card detection during probe MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When using the mmc_spi driver with a card-detect pin, I noticed that the card was not detected immediately after probe, but only after it was unplugged and plugged back in (and the CD IRQ fired). The call tree looks something like this: mmc_spi_probe mmc_add_host mmc_start_host _mmc_detect_change mmc_schedule_delayed_work(&host->detect, 0) mmc_rescan host->bus_ops->detect(host) mmc_detect _mmc_detect_card_removed host->ops->get_cd(host) mmc_gpio_get_cd -> -ENOSYS (ctx->cd_gpio not set) mmc_gpiod_request_cd ctx->cd_gpio = desc To fix this issue, call mmc_detect_change after the card-detect GPIO/IRQ is registered. Signed-off-by: Jonathan Neuschäfer Reviewed-by: Linus Walleij Cc: stable@vger.kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/mmc_spi.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/mmc/host/mmc_spi.c b/drivers/mmc/host/mmc_spi.c index 10ba46b728e8..8ade14fb2148 100644 --- a/drivers/mmc/host/mmc_spi.c +++ b/drivers/mmc/host/mmc_spi.c @@ -1450,6 +1450,7 @@ static int mmc_spi_probe(struct spi_device *spi) mmc->caps &= ~MMC_CAP_NEEDS_POLL; mmc_gpiod_request_cd_irq(mmc); } + mmc_detect_change(mmc, 0); /* Index 1 is write protect/read only */ status = mmc_gpiod_request_ro(mmc, NULL, 1, false, 0, NULL); From 5c27ff5db1491a947264d6d4e4cbe43ae6535bae Mon Sep 17 00:00:00 2001 From: Sergei Shtylyov Date: Mon, 18 Feb 2019 20:45:40 +0300 Subject: [PATCH 223/298] mmc: tmio_mmc_core: don't claim spurious interrupts I have encountered an interrupt storm during the eMMC chip probing (and the chip finally didn't get detected). It turned out that U-Boot left the DMAC interrupts enabled while the Linux driver didn't use those. The SDHI driver's interrupt handler somehow assumes that, even if an SDIO interrupt didn't happen, it should return IRQ_HANDLED. I think that if none of the enabled interrupts happened and got handled, we should return IRQ_NONE -- that way the kernel IRQ code recoginizes a spurious interrupt and masks it off pretty quickly... Fixes: 7729c7a232a9 ("mmc: tmio: Provide separate interrupt handlers") Signed-off-by: Sergei Shtylyov Reviewed-by: Wolfram Sang Tested-by: Wolfram Sang Reviewed-by: Simon Horman Cc: stable@vger.kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/tmio_mmc_core.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/mmc/host/tmio_mmc_core.c b/drivers/mmc/host/tmio_mmc_core.c index 085a0fab769c..41d79720e745 100644 --- a/drivers/mmc/host/tmio_mmc_core.c +++ b/drivers/mmc/host/tmio_mmc_core.c @@ -629,7 +629,7 @@ static bool __tmio_mmc_sdcard_irq(struct tmio_mmc_host *host, int ireg, return false; } -static void __tmio_mmc_sdio_irq(struct tmio_mmc_host *host) +static bool __tmio_mmc_sdio_irq(struct tmio_mmc_host *host) { struct mmc_host *mmc = host->mmc; struct tmio_mmc_data *pdata = host->pdata; @@ -637,7 +637,7 @@ static void __tmio_mmc_sdio_irq(struct tmio_mmc_host *host) unsigned int sdio_status; if (!(pdata->flags & TMIO_MMC_SDIO_IRQ)) - return; + return false; status = sd_ctrl_read16(host, CTL_SDIO_STATUS); ireg = status & TMIO_SDIO_MASK_ALL & ~host->sdio_irq_mask; @@ -650,6 +650,8 @@ static void __tmio_mmc_sdio_irq(struct tmio_mmc_host *host) if (mmc->caps & MMC_CAP_SDIO_IRQ && ireg & TMIO_SDIO_STAT_IOIRQ) mmc_signal_sdio_irq(mmc); + + return ireg; } irqreturn_t tmio_mmc_irq(int irq, void *devid) @@ -668,9 +670,10 @@ irqreturn_t tmio_mmc_irq(int irq, void *devid) if (__tmio_mmc_sdcard_irq(host, ireg, status)) return IRQ_HANDLED; - __tmio_mmc_sdio_irq(host); + if (__tmio_mmc_sdio_irq(host)) + return IRQ_HANDLED; - return IRQ_HANDLED; + return IRQ_NONE; } EXPORT_SYMBOL_GPL(tmio_mmc_irq); From 53a41cb7ed381edee91029cdcabe9b3250f43f4d Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 25 Feb 2019 09:10:51 -0800 Subject: [PATCH 224/298] Revert "x86/fault: BUG() when uaccess helpers fault on kernel addresses" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 9da3f2b74054406f87dff7101a569217ffceb29b. It was well-intentioned, but wrong. Overriding the exception tables for instructions for random reasons is just wrong, and that is what the new code did. It caused problems for tracing, and it caused problems for strncpy_from_user(), because the new checks made perfectly valid use cases break, rather than catch things that did bad things. Unchecked user space accesses are a problem, but that's not a reason to add invalid checks that then people have to work around with silly flags (in this case, that 'kernel_uaccess_faults_ok' flag, which is just an odd way to say "this commit was wrong" and was sprinked into random places to hide the wrongness). The real fix to unchecked user space accesses is to get rid of the special "let's not check __get_user() and __put_user() at all" logic. Make __{get|put}_user() be just aliases to the regular {get|put}_user() functions, and make it impossible to access user space without having the proper checks in places. The raison d'être of the special double-underscore versions used to be that the range check was expensive, and if you did multiple user accesses, you'd do the range check up front (like the signal frame handling code, for example). But SMAP (on x86) and PAN (on ARM) have made that optimization pointless, because the _real_ expense is the "set CPU flag to allow user space access". Do let's not break the valid cases to catch invalid cases that shouldn't even exist. Cc: Thomas Gleixner Cc: Kees Cook Cc: Tobin C. Harding Cc: Borislav Petkov Cc: Peter Zijlstra Cc: Andy Lutomirski Cc: Jann Horn Signed-off-by: Linus Torvalds --- arch/x86/mm/extable.c | 58 ------------------------------------------- fs/namespace.c | 2 -- include/linux/sched.h | 6 ----- mm/maccess.c | 6 ----- 4 files changed, 72 deletions(-) diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 6521134057e8..856fa409c536 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -117,67 +117,11 @@ __visible bool ex_handler_fprestore(const struct exception_table_entry *fixup, } EXPORT_SYMBOL_GPL(ex_handler_fprestore); -/* Helper to check whether a uaccess fault indicates a kernel bug. */ -static bool bogus_uaccess(struct pt_regs *regs, int trapnr, - unsigned long fault_addr) -{ - /* This is the normal case: #PF with a fault address in userspace. */ - if (trapnr == X86_TRAP_PF && fault_addr < TASK_SIZE_MAX) - return false; - - /* - * This code can be reached for machine checks, but only if the #MC - * handler has already decided that it looks like a candidate for fixup. - * This e.g. happens when attempting to access userspace memory which - * the CPU can't access because of uncorrectable bad memory. - */ - if (trapnr == X86_TRAP_MC) - return false; - - /* - * There are two remaining exception types we might encounter here: - * - #PF for faulting accesses to kernel addresses - * - #GP for faulting accesses to noncanonical addresses - * Complain about anything else. - */ - if (trapnr != X86_TRAP_PF && trapnr != X86_TRAP_GP) { - WARN(1, "unexpected trap %d in uaccess\n", trapnr); - return false; - } - - /* - * This is a faulting memory access in kernel space, on a kernel - * address, in a usercopy function. This can e.g. be caused by improper - * use of helpers like __put_user and by improper attempts to access - * userspace addresses in KERNEL_DS regions. - * The one (semi-)legitimate exception are probe_kernel_{read,write}(), - * which can be invoked from places like kgdb, /dev/mem (for reading) - * and privileged BPF code (for reading). - * The probe_kernel_*() functions set the kernel_uaccess_faults_ok flag - * to tell us that faulting on kernel addresses, and even noncanonical - * addresses, in a userspace accessor does not necessarily imply a - * kernel bug, root might just be doing weird stuff. - */ - if (current->kernel_uaccess_faults_ok) - return false; - - /* This is bad. Refuse the fixup so that we go into die(). */ - if (trapnr == X86_TRAP_PF) { - pr_emerg("BUG: pagefault on kernel address 0x%lx in non-whitelisted uaccess\n", - fault_addr); - } else { - pr_emerg("BUG: GPF in non-whitelisted uaccess (non-canonical address?)\n"); - } - return true; -} - __visible bool ex_handler_uaccess(const struct exception_table_entry *fixup, struct pt_regs *regs, int trapnr, unsigned long error_code, unsigned long fault_addr) { - if (bogus_uaccess(regs, trapnr, fault_addr)) - return false; regs->ip = ex_fixup_addr(fixup); return true; } @@ -188,8 +132,6 @@ __visible bool ex_handler_ext(const struct exception_table_entry *fixup, unsigned long error_code, unsigned long fault_addr) { - if (bogus_uaccess(regs, trapnr, fault_addr)) - return false; /* Special hack for uaccess_err */ current->thread.uaccess_err = 1; regs->ip = ex_fixup_addr(fixup); diff --git a/fs/namespace.c b/fs/namespace.c index a677b59efd74..678ef175d63a 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2698,7 +2698,6 @@ static long exact_copy_from_user(void *to, const void __user * from, if (!access_ok(from, n)) return n; - current->kernel_uaccess_faults_ok++; while (n) { if (__get_user(c, f)) { memset(t, 0, n); @@ -2708,7 +2707,6 @@ static long exact_copy_from_user(void *to, const void __user * from, f++; n--; } - current->kernel_uaccess_faults_ok--; return n; } diff --git a/include/linux/sched.h b/include/linux/sched.h index bba3afb4e9bf..f9b43c989577 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -739,12 +739,6 @@ struct task_struct { unsigned use_memdelay:1; #endif - /* - * May usercopy functions fault on kernel addresses? - * This is not just a single bit because this can potentially nest. - */ - unsigned int kernel_uaccess_faults_ok; - unsigned long atomic_flags; /* Flags requiring atomic access. */ struct restart_block restart_block; diff --git a/mm/maccess.c b/mm/maccess.c index f3416632e5a4..ec00be51a24f 100644 --- a/mm/maccess.c +++ b/mm/maccess.c @@ -30,10 +30,8 @@ long __probe_kernel_read(void *dst, const void *src, size_t size) set_fs(KERNEL_DS); pagefault_disable(); - current->kernel_uaccess_faults_ok++; ret = __copy_from_user_inatomic(dst, (__force const void __user *)src, size); - current->kernel_uaccess_faults_ok--; pagefault_enable(); set_fs(old_fs); @@ -60,9 +58,7 @@ long __probe_kernel_write(void *dst, const void *src, size_t size) set_fs(KERNEL_DS); pagefault_disable(); - current->kernel_uaccess_faults_ok++; ret = __copy_to_user_inatomic((__force void __user *)dst, src, size); - current->kernel_uaccess_faults_ok--; pagefault_enable(); set_fs(old_fs); @@ -98,13 +94,11 @@ long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count) set_fs(KERNEL_DS); pagefault_disable(); - current->kernel_uaccess_faults_ok++; do { ret = __get_user(*dst++, (const char __user __force *)src++); } while (dst[-1] && ret == 0 && src - unsafe_addr < count); - current->kernel_uaccess_faults_ok--; dst[-1] = '\0'; pagefault_enable(); set_fs(old_fs); From 9919a363a5cb57c2b64c4803b4d2dd45e90bf230 Mon Sep 17 00:00:00 2001 From: Wen Yang Date: Mon, 25 Feb 2019 15:22:19 +0800 Subject: [PATCH 225/298] net: dsa: fix a leaked reference by adding missing of_node_put The call to of_parse_phandle returns a node pointer with refcount incremented thus it must be explicitly decremented after the last usage. Detected by coccinelle with the following warnings: ./net/dsa/port.c:294:1-7: ERROR: missing of_node_put; acquired a node pointer with refcount incremented on line 284, but without a corresponding object release within this function. ./net/dsa/dsa2.c:627:3-9: ERROR: missing of_node_put; acquired a node pointer with refcount incremented on line 618, but without a corresponding object release within this function. ./net/dsa/dsa2.c:630:3-9: ERROR: missing of_node_put; acquired a node pointer with refcount incremented on line 618, but without a corresponding object release within this function. ./net/dsa/dsa2.c:636:3-9: ERROR: missing of_node_put; acquired a node pointer with refcount incremented on line 618, but without a corresponding object release within this function. ./net/dsa/dsa2.c:639:1-7: ERROR: missing of_node_put; acquired a node pointer with refcount incremented on line 618, but without a corresponding object release within this function. Signed-off-by: Wen Yang Reviewed-by: Vivien Didelot Cc: Andrew Lunn Cc: Vivien Didelot Cc: Florian Fainelli Cc: "David S. Miller" Cc: Vivien Didelot Cc: netdev@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: David S. Miller --- net/dsa/dsa2.c | 16 ++++++++++------ net/dsa/port.c | 1 + 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index a1917025e155..410f19148106 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -612,8 +612,8 @@ static int dsa_switch_parse_ports_of(struct dsa_switch *ds, { struct device_node *ports, *port; struct dsa_port *dp; + int err = 0; u32 reg; - int err; ports = of_get_child_by_name(dn, "ports"); if (!ports) { @@ -624,19 +624,23 @@ static int dsa_switch_parse_ports_of(struct dsa_switch *ds, for_each_available_child_of_node(ports, port) { err = of_property_read_u32(port, "reg", ®); if (err) - return err; + goto out_put_node; - if (reg >= ds->num_ports) - return -EINVAL; + if (reg >= ds->num_ports) { + err = -EINVAL; + goto out_put_node; + } dp = &ds->ports[reg]; err = dsa_port_parse_of(dp, port); if (err) - return err; + goto out_put_node; } - return 0; +out_put_node: + of_node_put(ports); + return err; } static int dsa_switch_parse_member_of(struct dsa_switch *ds, diff --git a/net/dsa/port.c b/net/dsa/port.c index 2a2a878b5ce3..c2261697ee83 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -292,6 +292,7 @@ static struct phy_device *dsa_port_get_phy_device(struct dsa_port *dp) return ERR_PTR(-EPROBE_DEFER); } + of_node_put(phy_dn); return phydev; } From a3df633a3c92bb96b06552c3f828d7c267774379 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Mon, 25 Feb 2019 17:28:27 +0200 Subject: [PATCH 226/298] net: sched: act_tunnel_key: fix NULL pointer dereference during init Metadata pointer is only initialized for action TCA_TUNNEL_KEY_ACT_SET, but it is unconditionally dereferenced in tunnel_key_init() error handler. Verify that metadata pointer is not NULL before dereferencing it in tunnel_key_init error handling code. Fixes: ee28bb56ac5b ("net/sched: fix memory leak in act_tunnel_key_init()") Signed-off-by: Vlad Buslov Reviewed-by: Davide Caratti Signed-off-by: David S. Miller --- net/sched/act_tunnel_key.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index 8b43fe0130f7..3f943de9a2c9 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -377,7 +377,8 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, return ret; release_tun_meta: - dst_release(&metadata->dst); + if (metadata) + dst_release(&metadata->dst); err_out: if (exists) From ff7b11aa481f682e0e9711abfeb7d03f5cd612bf Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 21 Feb 2019 14:13:56 -0800 Subject: [PATCH 227/298] net: socket: set sock->sk to NULL after calling proto_ops::release() Commit 9060cb719e61 ("net: crypto set sk to NULL when af_alg_release.") fixed a use-after-free in sockfs_setattr() when an AF_ALG socket is closed concurrently with fchownat(). However, it ignored that many other proto_ops::release() methods don't set sock->sk to NULL and therefore allow the same use-after-free: - base_sock_release - bnep_sock_release - cmtp_sock_release - data_sock_release - dn_release - hci_sock_release - hidp_sock_release - iucv_sock_release - l2cap_sock_release - llcp_sock_release - llc_ui_release - rawsock_release - rfcomm_sock_release - sco_sock_release - svc_release - vcc_release - x25_release Rather than fixing all these and relying on every socket type to get this right forever, just make __sock_release() set sock->sk to NULL itself after calling proto_ops::release(). Reproducer that produces the KASAN splat when any of these socket types are configured into the kernel: #include #include #include #include pthread_t t; volatile int fd; void *close_thread(void *arg) { for (;;) { usleep(rand() % 100); close(fd); } } int main() { pthread_create(&t, NULL, close_thread, NULL); for (;;) { fd = socket(rand() % 50, rand() % 11, 0); fchownat(fd, "", 1000, 1000, 0x1000); close(fd); } } Fixes: 86741ec25462 ("net: core: Add a UID field to struct sock.") Signed-off-by: Eric Biggers Acked-by: Cong Wang Signed-off-by: David S. Miller --- net/socket.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/socket.c b/net/socket.c index d80d87a395ea..320f51b22b19 100644 --- a/net/socket.c +++ b/net/socket.c @@ -577,6 +577,7 @@ static void __sock_release(struct socket *sock, struct inode *inode) if (inode) inode_lock(inode); sock->ops->release(sock); + sock->sk = NULL; if (inode) inode_unlock(inode); sock->ops = NULL; From 2a418cf3f5f1caf911af288e978d61c9844b0695 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 22 Feb 2019 17:17:04 -0800 Subject: [PATCH 228/298] x86/uaccess: Don't leak the AC flag into __put_user() value evaluation When calling __put_user(foo(), ptr), the __put_user() macro would call foo() in between __uaccess_begin() and __uaccess_end(). If that code were buggy, then those bugs would be run without SMAP protection. Fortunately, there seem to be few instances of the problem in the kernel. Nevertheless, __put_user() should be fixed to avoid doing this. Therefore, evaluate __put_user()'s argument before setting AC. This issue was noticed when an objtool hack by Peter Zijlstra complained about genregs_get() and I compared the assembly output to the C source. [ bp: Massage commit message and fixed up whitespace. ] Fixes: 11f1a4b9755f ("x86: reorganize SMAP handling in user space accesses") Signed-off-by: Andy Lutomirski Signed-off-by: Borislav Petkov Acked-by: Linus Torvalds Cc: Peter Zijlstra Cc: Brian Gerst Cc: Josh Poimboeuf Cc: Denys Vlasenko Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/20190225125231.845656645@infradead.org --- arch/x86/include/asm/uaccess.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index a77445d1b034..28376aa2d053 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -284,7 +284,7 @@ do { \ __put_user_goto(x, ptr, "l", "k", "ir", label); \ break; \ case 8: \ - __put_user_goto_u64((__typeof__(*ptr))(x), ptr, label); \ + __put_user_goto_u64(x, ptr, label); \ break; \ default: \ __put_user_bad(); \ @@ -431,8 +431,10 @@ do { \ ({ \ __label__ __pu_label; \ int __pu_err = -EFAULT; \ + __typeof__(*(ptr)) __pu_val; \ + __pu_val = x; \ __uaccess_begin(); \ - __put_user_size((x), (ptr), (size), __pu_label); \ + __put_user_size(__pu_val, (ptr), (size), __pu_label); \ __pu_err = 0; \ __pu_label: \ __uaccess_end(); \ From 29b00e609960ae0fcff382f4c7079dd0874a5311 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 22 Feb 2019 22:35:32 -0800 Subject: [PATCH 229/298] tmpfs: fix uninitialized return value in shmem_link When we made the shmem_reserve_inode call in shmem_link conditional, we forgot to update the declaration for ret so that it always has a known value. Dan Carpenter pointed out this deficiency in the original patch. Fixes: 1062af920c07 ("tmpfs: fix link accounting when a tmpfile is linked in") Reported-by: Dan Carpenter Signed-off-by: Darrick J. Wong Signed-off-by: Hugh Dickins Cc: Matej Kupljen Cc: Al Viro Cc: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/shmem.c b/mm/shmem.c index 0905215fb016..2c012eee133d 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2848,7 +2848,7 @@ static int shmem_create(struct inode *dir, struct dentry *dentry, umode_t mode, static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { struct inode *inode = d_inode(old_dentry); - int ret; + int ret = 0; /* * No ordinary (disk based) filesystem counts links as inodes; From 7d762d69145a54d169f58e56d6dac57a5508debc Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 21 Feb 2019 22:04:32 +0000 Subject: [PATCH 230/298] afs: Fix manually set volume location server list When a cell with a volume location server list is added manually by echoing the details into /proc/net/afs/cells, a record is added but the flag saying it has been looked up isn't set. This causes the VL server rotation code to wait forever, with the top of /proc/pid/stack looking like: afs_select_vlserver+0x3a6/0x6f3 afs_vl_lookup_vldb+0x4b/0x92 afs_create_volume+0x25/0x1b9 ... with the thread stuck in afs_start_vl_iteration() waiting for AFS_CELL_FL_NO_LOOKUP_YET to be cleared. Fix this by clearing AFS_CELL_FL_NO_LOOKUP_YET when setting up a record if that record's details were supplied manually. Fixes: 0a5143f2f89c ("afs: Implement VL server rotation") Reported-by: Dave Botsch Signed-off-by: David Howells Signed-off-by: Linus Torvalds --- fs/afs/cell.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/afs/cell.c b/fs/afs/cell.c index cf445dbd5f2e..9de46116c749 100644 --- a/fs/afs/cell.c +++ b/fs/afs/cell.c @@ -173,6 +173,7 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net, rcu_assign_pointer(cell->vl_servers, vllist); cell->dns_expiry = TIME64_MAX; + __clear_bit(AFS_CELL_FL_NO_LOOKUP_YET, &cell->flags); } else { cell->dns_expiry = ktime_get_real_seconds(); } From 18836b48ebae20850631ee2916d0cdbb86df813d Mon Sep 17 00:00:00 2001 From: Jonas Gorski Date: Thu, 21 Feb 2019 10:56:42 +0100 Subject: [PATCH 231/298] MIPS: BCM63XX: provide DMA masks for ethernet devices The switch to the generic dma ops made dma masks mandatory, breaking devices having them not set. In case of bcm63xx, it broke ethernet with the following warning when trying to up the device: [ 2.633123] ------------[ cut here ]------------ [ 2.637949] WARNING: CPU: 0 PID: 325 at ./include/linux/dma-mapping.h:516 bcm_enetsw_open+0x160/0xbbc [ 2.647423] Modules linked in: gpio_button_hotplug [ 2.652361] CPU: 0 PID: 325 Comm: ip Not tainted 4.19.16 #0 [ 2.658080] Stack : 80520000 804cd3ec 00000000 00000000 804ccc00 87085bdc 87d3f9d4 804f9a17 [ 2.666707] 8049cf18 00000145 80a942a0 00000204 80ac0000 10008400 87085b90 eb3d5ab7 [ 2.675325] 00000000 00000000 80ac0000 000022b0 00000000 00000000 00000007 00000000 [ 2.683954] 0000007a 80500000 0013b381 00000000 80000000 00000000 804a1664 80289878 [ 2.692572] 00000009 00000204 80ac0000 00000200 00000002 00000000 00000000 80a90000 [ 2.701191] ... [ 2.703701] Call Trace: [ 2.706244] [<8001f3c8>] show_stack+0x58/0x100 [ 2.710840] [<800336e4>] __warn+0xe4/0x118 [ 2.715049] [<800337d4>] warn_slowpath_null+0x48/0x64 [ 2.720237] [<80289878>] bcm_enetsw_open+0x160/0xbbc [ 2.725347] [<802d1d4c>] __dev_open+0xf8/0x16c [ 2.729913] [<802d20cc>] __dev_change_flags+0x100/0x1c4 [ 2.735290] [<802d21b8>] dev_change_flags+0x28/0x70 [ 2.740326] [<803539e0>] devinet_ioctl+0x310/0x7b0 [ 2.745250] [<80355fd8>] inet_ioctl+0x1f8/0x224 [ 2.749939] [<802af290>] sock_ioctl+0x30c/0x488 [ 2.754632] [<80112b34>] do_vfs_ioctl+0x740/0x7dc [ 2.759459] [<80112c20>] ksys_ioctl+0x50/0x94 [ 2.763955] [<800240b8>] syscall_common+0x34/0x58 [ 2.768782] ---[ end trace fb1a6b14d74e28b6 ]--- [ 2.773544] bcm63xx_enetsw bcm63xx_enetsw.0: cannot allocate rx ring 512 Fix this by adding appropriate DMA masks for the platform devices. Fixes: f8c55dc6e828 ("MIPS: use generic dma noncoherent ops for simple noncoherent platforms") Signed-off-by: Jonas Gorski Reviewed-by: Christoph Hellwig Reviewed-by: Florian Fainelli Signed-off-by: Paul Burton Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org Cc: Ralf Baechle Cc: James Hogan Cc: stable@vger.kernel.org # v4.19+ --- arch/mips/bcm63xx/dev-enet.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/mips/bcm63xx/dev-enet.c b/arch/mips/bcm63xx/dev-enet.c index 07b4c65a88a4..8e73d65f3480 100644 --- a/arch/mips/bcm63xx/dev-enet.c +++ b/arch/mips/bcm63xx/dev-enet.c @@ -70,6 +70,8 @@ static struct platform_device bcm63xx_enet_shared_device = { static int shared_device_registered; +static u64 enet_dmamask = DMA_BIT_MASK(32); + static struct resource enet0_res[] = { { .start = -1, /* filled at runtime */ @@ -99,6 +101,8 @@ static struct platform_device bcm63xx_enet0_device = { .resource = enet0_res, .dev = { .platform_data = &enet0_pd, + .dma_mask = &enet_dmamask, + .coherent_dma_mask = DMA_BIT_MASK(32), }, }; @@ -131,6 +135,8 @@ static struct platform_device bcm63xx_enet1_device = { .resource = enet1_res, .dev = { .platform_data = &enet1_pd, + .dma_mask = &enet_dmamask, + .coherent_dma_mask = DMA_BIT_MASK(32), }, }; @@ -157,6 +163,8 @@ static struct platform_device bcm63xx_enetsw_device = { .resource = enetsw_res, .dev = { .platform_data = &enetsw_pd, + .dma_mask = &enet_dmamask, + .coherent_dma_mask = DMA_BIT_MASK(32), }, }; From ecef67cb10db7b83b3b71c61dbb29aa070ab0112 Mon Sep 17 00:00:00 2001 From: Timur Celik Date: Mon, 25 Feb 2019 21:13:13 +0100 Subject: [PATCH 232/298] tun: remove unnecessary memory barrier Replace set_current_state with __set_current_state since no memory barrier is needed at this point. Signed-off-by: Timur Celik Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- drivers/net/tun.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index d291762b9e9d..53f4f37b0ffd 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -2185,7 +2185,7 @@ static void *tun_ring_recv(struct tun_file *tfile, int noblock, int *err) schedule(); } - set_current_state(TASK_RUNNING); + __set_current_state(TASK_RUNNING); remove_wait_queue(&tfile->wq.wait, &wait); out: From 9ef6b42ad6fd7929dd1b6092cb02014e382c6a91 Mon Sep 17 00:00:00 2001 From: Nazarov Sergey Date: Mon, 25 Feb 2019 19:24:15 +0300 Subject: [PATCH 233/298] net: Add __icmp_send helper. Add __icmp_send function having ip_options struct parameter Signed-off-by: Sergey Nazarov Reviewed-by: Paul Moore Signed-off-by: David S. Miller --- include/net/icmp.h | 9 ++++++++- net/ipv4/icmp.c | 7 ++++--- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/include/net/icmp.h b/include/net/icmp.h index 6ac3a5bd0117..e0f709d26dde 100644 --- a/include/net/icmp.h +++ b/include/net/icmp.h @@ -22,6 +22,7 @@ #include #include +#include struct icmp_err { int errno; @@ -39,7 +40,13 @@ struct net_proto_family; struct sk_buff; struct net; -void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info); +void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, + const struct ip_options *opt); +static inline void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) +{ + __icmp_send(skb_in, type, code, info, &IPCB(skb_in)->opt); +} + int icmp_rcv(struct sk_buff *skb); int icmp_err(struct sk_buff *skb, u32 info); int icmp_init(void); diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 065997f414e6..3f24414150e2 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -570,7 +570,8 @@ static struct rtable *icmp_route_lookup(struct net *net, * MUST reply to only the first fragment. */ -void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) +void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, + const struct ip_options *opt) { struct iphdr *iph; int room; @@ -691,7 +692,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) iph->tos; mark = IP4_REPLY_MARK(net, skb_in->mark); - if (ip_options_echo(net, &icmp_param.replyopts.opt.opt, skb_in)) + if (__ip_options_echo(net, &icmp_param.replyopts.opt.opt, skb_in, opt)) goto out_unlock; @@ -742,7 +743,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) local_bh_enable(); out:; } -EXPORT_SYMBOL(icmp_send); +EXPORT_SYMBOL(__icmp_send); static void icmp_socket_deliver(struct sk_buff *skb, u32 info) From 3da1ed7ac398f34fff1694017a07054d69c5f5c5 Mon Sep 17 00:00:00 2001 From: Nazarov Sergey Date: Mon, 25 Feb 2019 19:27:15 +0300 Subject: [PATCH 234/298] net: avoid use IPCB in cipso_v4_error Extract IP options in cipso_v4_error and use __icmp_send. Signed-off-by: Sergey Nazarov Acked-by: Paul Moore Signed-off-by: David S. Miller --- include/net/ip.h | 2 ++ net/ipv4/cipso_ipv4.c | 17 +++++++++++++++-- net/ipv4/ip_options.c | 22 +++++++++++++++++----- 3 files changed, 34 insertions(+), 7 deletions(-) diff --git a/include/net/ip.h b/include/net/ip.h index 8866bfce6121..f0e8d064e249 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -667,6 +667,8 @@ static inline int ip_options_echo(struct net *net, struct ip_options *dopt, } void ip_options_fragment(struct sk_buff *skb); +int __ip_options_compile(struct net *net, struct ip_options *opt, + struct sk_buff *skb, __be32 *info); int ip_options_compile(struct net *net, struct ip_options *opt, struct sk_buff *skb); int ip_options_get(struct net *net, struct ip_options_rcu **optp, diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index 777fa3b7fb13..eff86a71c1b0 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -1735,13 +1735,26 @@ int cipso_v4_validate(const struct sk_buff *skb, unsigned char **option) */ void cipso_v4_error(struct sk_buff *skb, int error, u32 gateway) { + unsigned char optbuf[sizeof(struct ip_options) + 40]; + struct ip_options *opt = (struct ip_options *)optbuf; + if (ip_hdr(skb)->protocol == IPPROTO_ICMP || error != -EACCES) return; + /* + * We might be called above the IP layer, + * so we can not use icmp_send and IPCB here. + */ + + memset(opt, 0, sizeof(struct ip_options)); + opt->optlen = ip_hdr(skb)->ihl*4 - sizeof(struct iphdr); + if (__ip_options_compile(dev_net(skb->dev), opt, skb, NULL)) + return; + if (gateway) - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_NET_ANO, 0); + __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_NET_ANO, 0, opt); else - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_ANO, 0); + __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_ANO, 0, opt); } /** diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index ed194d46c00e..32a35043c9f5 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -251,8 +251,9 @@ static void spec_dst_fill(__be32 *spec_dst, struct sk_buff *skb) * If opt == NULL, then skb->data should point to IP header. */ -int ip_options_compile(struct net *net, - struct ip_options *opt, struct sk_buff *skb) +int __ip_options_compile(struct net *net, + struct ip_options *opt, struct sk_buff *skb, + __be32 *info) { __be32 spec_dst = htonl(INADDR_ANY); unsigned char *pp_ptr = NULL; @@ -468,11 +469,22 @@ int ip_options_compile(struct net *net, return 0; error: - if (skb) { - icmp_send(skb, ICMP_PARAMETERPROB, 0, htonl((pp_ptr-iph)<<24)); - } + if (info) + *info = htonl((pp_ptr-iph)<<24); return -EINVAL; } + +int ip_options_compile(struct net *net, + struct ip_options *opt, struct sk_buff *skb) +{ + int ret; + __be32 info; + + ret = __ip_options_compile(net, opt, skb, &info); + if (ret != 0 && skb) + icmp_send(skb, ICMP_PARAMETERPROB, 0, info); + return ret; +} EXPORT_SYMBOL(ip_options_compile); /* From 56de8357049c707e5c881cc9d0e5ffed76388423 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 18 Feb 2019 08:34:19 +0100 Subject: [PATCH 235/298] scsi: lpfc: fix calls to dma_set_mask_and_coherent() The change to use dma_set_mask_and_coherent() incorrectly made a second call with the 32 bit DMA mask value when the call with the 64 bit DMA mask value succeeded. This resulted in NVMe/FC connections failing due to corrupted data buffers, and various other SCSI/FCP I/O errors. Fixes: f30e1bfd6154 ("scsi: lpfc: use dma_set_mask_and_coherent") Cc: Suggested-by: Don Dutile Signed-off-by: Ewan D. Milne Signed-off-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Reviewed-by: Ewan D. Milne Signed-off-by: Martin K. Petersen --- drivers/scsi/lpfc/lpfc_init.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/drivers/scsi/lpfc/lpfc_init.c b/drivers/scsi/lpfc/lpfc_init.c index c1c36812c3d2..a588dfad4b11 100644 --- a/drivers/scsi/lpfc/lpfc_init.c +++ b/drivers/scsi/lpfc/lpfc_init.c @@ -7361,15 +7361,18 @@ lpfc_sli_pci_mem_setup(struct lpfc_hba *phba) unsigned long bar0map_len, bar2map_len; int i, hbq_count; void *ptr; - int error = -ENODEV; + int error; if (!pdev) - return error; + return -ENODEV; /* Set the device DMA mask size */ - if (dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)) || - dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32))) + error = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)); + if (error) + error = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32)); + if (error) return error; + error = -ENODEV; /* Get the bus address of Bar0 and Bar2 and the number of bytes * required by each mapping. @@ -9742,11 +9745,13 @@ lpfc_sli4_pci_mem_setup(struct lpfc_hba *phba) uint32_t if_type; if (!pdev) - return error; + return -ENODEV; /* Set the device DMA mask size */ - if (dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)) || - dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32))) + error = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)); + if (error) + error = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32)); + if (error) return error; /* From 33d6667416c73eb0b37f0f10f56d825b15389dab Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 18 Feb 2019 08:34:20 +0100 Subject: [PATCH 236/298] scsi: 3w-9xxx: fix calls to dma_set_mask_and_coherent() The change to use dma_set_mask_and_coherent() incorrectly made a second call with the 32 bit DMA mask value when the call with the 64 bit DMA mask value succeeded. Fixes: b000bced5739 ("scsi: 3w-9xxx: fully convert to the generic DMA API") Cc: Signed-off-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Reviewed-by: Ewan D. Milne Signed-off-by: Martin K. Petersen --- drivers/scsi/3w-9xxx.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/drivers/scsi/3w-9xxx.c b/drivers/scsi/3w-9xxx.c index a3c20e3a8b7c..3337b1e80412 100644 --- a/drivers/scsi/3w-9xxx.c +++ b/drivers/scsi/3w-9xxx.c @@ -2009,7 +2009,7 @@ static int twa_probe(struct pci_dev *pdev, const struct pci_device_id *dev_id) struct Scsi_Host *host = NULL; TW_Device_Extension *tw_dev; unsigned long mem_addr, mem_len; - int retval = -ENODEV; + int retval; retval = pci_enable_device(pdev); if (retval) { @@ -2020,8 +2020,10 @@ static int twa_probe(struct pci_dev *pdev, const struct pci_device_id *dev_id) pci_set_master(pdev); pci_try_set_mwi(pdev); - if (dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)) || - dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32))) { + retval = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)); + if (retval) + retval = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32)); + if (retval) { TW_PRINTK(host, TW_DRIVER, 0x23, "Failed to set dma mask"); retval = -ENODEV; goto out_disable_device; @@ -2240,8 +2242,10 @@ static int twa_resume(struct pci_dev *pdev) pci_set_master(pdev); pci_try_set_mwi(pdev); - if (dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)) || - dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32))) { + retval = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)); + if (retval) + retval = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32)); + if (retval) { TW_PRINTK(host, TW_DRIVER, 0x40, "Failed to set dma mask during resume"); retval = -ENODEV; goto out_disable_device; From 1feb3b02294994ee3a067c9b6cda6c244fd0ee18 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 18 Feb 2019 08:34:21 +0100 Subject: [PATCH 237/298] scsi: 3w-sas: fix calls to dma_set_mask_and_coherent() The change to use dma_set_mask_and_coherent() incorrectly made a second call with the 32 bit DMA mask value when the call with the 64 bit DMA mask value succeeded. Fixes: b1fa122930c4 ("scsi: 3w-sas: fully convert to the generic DMA API") Cc: Signed-off-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Reviewed-by: Ewan D. Milne Signed-off-by: Martin K. Petersen --- drivers/scsi/3w-sas.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/scsi/3w-sas.c b/drivers/scsi/3w-sas.c index e8f5f7c63190..2d15b878bd94 100644 --- a/drivers/scsi/3w-sas.c +++ b/drivers/scsi/3w-sas.c @@ -1572,8 +1572,10 @@ static int twl_probe(struct pci_dev *pdev, const struct pci_device_id *dev_id) pci_set_master(pdev); pci_try_set_mwi(pdev); - if (dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)) || - dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32))) { + retval = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)); + if (retval) + retval = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32)); + if (retval) { TW_PRINTK(host, TW_DRIVER, 0x18, "Failed to set dma mask"); retval = -ENODEV; goto out_disable_device; @@ -1804,8 +1806,10 @@ static int twl_resume(struct pci_dev *pdev) pci_set_master(pdev); pci_try_set_mwi(pdev); - if (dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)) || - dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32))) { + retval = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)); + if (retval) + retval = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32)); + if (retval) { TW_PRINTK(host, TW_DRIVER, 0x25, "Failed to set dma mask during resume"); retval = -ENODEV; goto out_disable_device; From c326de562f1fc149da4855a1b9d0433300c2a85d Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 18 Feb 2019 08:34:22 +0100 Subject: [PATCH 238/298] scsi: aic94xx: fix calls to dma_set_mask_and_coherent() The change to use dma_set_mask_and_coherent() incorrectly made a second call with the 32 bit DMA mask value when the call with the 64 bit DMA mask value succeeded. [mkp: fixed subject] Fixes: 3a21986f1a59 ("scsi: aic94xx: fully convert to the generic DMA API") Cc: Signed-off-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Reviewed-by: Ewan D. Milne Signed-off-by: Martin K. Petersen --- drivers/scsi/aic94xx/aic94xx_init.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/scsi/aic94xx/aic94xx_init.c b/drivers/scsi/aic94xx/aic94xx_init.c index 07efcb9b5b94..bbdae67774f0 100644 --- a/drivers/scsi/aic94xx/aic94xx_init.c +++ b/drivers/scsi/aic94xx/aic94xx_init.c @@ -769,9 +769,11 @@ static int asd_pci_probe(struct pci_dev *dev, const struct pci_device_id *id) if (err) goto Err_remove; - err = -ENODEV; - if (dma_set_mask_and_coherent(&dev->dev, DMA_BIT_MASK(64)) || - dma_set_mask_and_coherent(&dev->dev, DMA_BIT_MASK(32))) { + err = dma_set_mask_and_coherent(&dev->dev, DMA_BIT_MASK(64)); + if (err) + err = dma_set_mask_and_coherent(&dev->dev, DMA_BIT_MASK(32)); + if (err) { + err = -ENODEV; asd_printk("no suitable DMA mask for %s\n", pci_name(dev)); goto Err_remove; } From 11ea3824140ca994f4560c4bec6a32d257ef3e83 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 18 Feb 2019 08:34:23 +0100 Subject: [PATCH 239/298] scsi: bfa: fix calls to dma_set_mask_and_coherent() The change to use dma_set_mask_and_coherent() incorrectly made a second call with the 32 bit DMA mask value when the call with the 64 bit DMA mask value succeeded. [mkp: fixed commit message] Fixes: a69b080025ea ("scsi: bfa: use dma_set_mask_and_coherent") Cc: Suggested-by: Ewan D. Milne Signed-off-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Reviewed-by: Ewan D. Milne Signed-off-by: Martin K. Petersen --- drivers/scsi/bfa/bfad.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/drivers/scsi/bfa/bfad.c b/drivers/scsi/bfa/bfad.c index 42a0caf6740d..88880a66a189 100644 --- a/drivers/scsi/bfa/bfad.c +++ b/drivers/scsi/bfa/bfad.c @@ -727,7 +727,7 @@ bfad_init_timer(struct bfad_s *bfad) int bfad_pci_init(struct pci_dev *pdev, struct bfad_s *bfad) { - int rc = -ENODEV; + int rc = -ENODEV; if (pci_enable_device(pdev)) { printk(KERN_ERR "pci_enable_device fail %p\n", pdev); @@ -739,8 +739,12 @@ bfad_pci_init(struct pci_dev *pdev, struct bfad_s *bfad) pci_set_master(pdev); - if (dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)) || - dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32))) { + rc = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)); + if (rc) + rc = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32)); + + if (rc) { + rc = -ENODEV; printk(KERN_ERR "dma_set_mask_and_coherent fail %p\n", pdev); goto out_release_region; } @@ -1534,6 +1538,7 @@ bfad_pci_slot_reset(struct pci_dev *pdev) { struct bfad_s *bfad = pci_get_drvdata(pdev); u8 byte; + int rc; dev_printk(KERN_ERR, &pdev->dev, "bfad_pci_slot_reset flags: 0x%x\n", bfad->bfad_flags); @@ -1561,8 +1566,11 @@ bfad_pci_slot_reset(struct pci_dev *pdev) pci_save_state(pdev); pci_set_master(pdev); - if (dma_set_mask_and_coherent(&bfad->pcidev->dev, DMA_BIT_MASK(64)) || - dma_set_mask_and_coherent(&bfad->pcidev->dev, DMA_BIT_MASK(32))) + rc = dma_set_mask_and_coherent(&bfad->pcidev->dev, DMA_BIT_MASK(64)); + if (rc) + rc = dma_set_mask_and_coherent(&bfad->pcidev->dev, + DMA_BIT_MASK(32)); + if (rc) goto out_disable_device; if (restart_bfa(bfad) == -1) From 732f3238dcf27acb92959a99b7923dc49395980e Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 18 Feb 2019 08:34:24 +0100 Subject: [PATCH 240/298] scsi: csiostor: fix calls to dma_set_mask_and_coherent() The change to use dma_set_mask_and_coherent() incorrectly made a second call with the 32 bit DMA mask value when the call with the 64 bit DMA mask value succeeded. Fixes: c22b332d811b ("scsi: csiostor: switch to generic DMA API") Cc: Signed-off-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Reviewed-by: Ewan D. Milne Signed-off-by: Martin K. Petersen --- drivers/scsi/csiostor/csio_init.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/csiostor/csio_init.c b/drivers/scsi/csiostor/csio_init.c index cf629380a981..616b25bf7941 100644 --- a/drivers/scsi/csiostor/csio_init.c +++ b/drivers/scsi/csiostor/csio_init.c @@ -210,8 +210,11 @@ csio_pci_init(struct pci_dev *pdev, int *bars) pci_set_master(pdev); pci_try_set_mwi(pdev); - if (dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)) || - dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32))) { + rv = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)); + if (rv) + rv = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32)); + if (rv) { + rv = -ENODEV; dev_err(&pdev->dev, "No suitable DMA available.\n"); goto err_release_regions; } From d9a00459effc30f6de2cdd887b64f15c6c54ae71 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 18 Feb 2019 08:34:25 +0100 Subject: [PATCH 241/298] scsi: hisi_sas: fix calls to dma_set_mask_and_coherent() The change to use dma_set_mask_and_coherent() incorrectly made a second call with the 32 bit DMA mask value when the call with the 64 bit DMA mask value succeeded. [mkp: fixed commit message] Fixes: e4db40e7a1a2 ("scsi: hisi_sas: use dma_set_mask_and_coherent") Cc: Suggested-by: Ewan D. Milne Signed-off-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Signed-off-by: Hannes Reinecke Signed-off-by: Martin K. Petersen --- drivers/scsi/hisi_sas/hisi_sas_main.c | 8 ++++++-- drivers/scsi/hisi_sas/hisi_sas_v3_hw.c | 8 +++++--- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/drivers/scsi/hisi_sas/hisi_sas_main.c b/drivers/scsi/hisi_sas/hisi_sas_main.c index eed7fc5b3389..bc17fa0d8375 100644 --- a/drivers/scsi/hisi_sas/hisi_sas_main.c +++ b/drivers/scsi/hisi_sas/hisi_sas_main.c @@ -2323,6 +2323,7 @@ static struct Scsi_Host *hisi_sas_shost_alloc(struct platform_device *pdev, struct Scsi_Host *shost; struct hisi_hba *hisi_hba; struct device *dev = &pdev->dev; + int error; shost = scsi_host_alloc(hw->sht, sizeof(*hisi_hba)); if (!shost) { @@ -2343,8 +2344,11 @@ static struct Scsi_Host *hisi_sas_shost_alloc(struct platform_device *pdev, if (hisi_sas_get_fw_info(hisi_hba) < 0) goto err_out; - if (dma_set_mask_and_coherent(dev, DMA_BIT_MASK(64)) && - dma_set_mask_and_coherent(dev, DMA_BIT_MASK(32))) { + error = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(64)); + if (error) + error = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(32)); + + if (error) { dev_err(dev, "No usable DMA addressing method\n"); goto err_out; } diff --git a/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c b/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c index c92b3822c408..e0570fd8466e 100644 --- a/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c +++ b/drivers/scsi/hisi_sas/hisi_sas_v3_hw.c @@ -2447,10 +2447,12 @@ hisi_sas_v3_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (rc) goto err_out_disable_device; - if (dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)) || - dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32))) { + rc = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)); + if (rc) + rc = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32)); + if (rc) { dev_err(dev, "No usable DMA addressing method\n"); - rc = -EIO; + rc = -ENODEV; goto err_out_regions; } From 3e344b6cec8e675e692ffddf9977c52638337006 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 18 Feb 2019 08:34:26 +0100 Subject: [PATCH 242/298] scsi: hptiop: fix calls to dma_set_mask() The change to use dma_set_mask() incorrectly made a second call with the 32 bit DMA mask value when the call with the 64 bit DMA mask value succeeded. Fixes: 453cd3700ca3 ("scsi: hptiop: use dma_set_mask") Cc: Suggested-by: Ewan D. Milne Signed-off-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Reviewed-by: Ewan D. Milne Signed-off-by: Martin K. Petersen --- drivers/scsi/hptiop.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/scsi/hptiop.c b/drivers/scsi/hptiop.c index 3eedfd4f8f57..251c084a6ff0 100644 --- a/drivers/scsi/hptiop.c +++ b/drivers/scsi/hptiop.c @@ -1292,6 +1292,7 @@ static int hptiop_probe(struct pci_dev *pcidev, const struct pci_device_id *id) dma_addr_t start_phy; void *start_virt; u32 offset, i, req_size; + int rc; dprintk("hptiop_probe(%p)\n", pcidev); @@ -1308,9 +1309,12 @@ static int hptiop_probe(struct pci_dev *pcidev, const struct pci_device_id *id) /* Enable 64bit DMA if possible */ iop_ops = (struct hptiop_adapter_ops *)id->driver_data; - if (dma_set_mask(&pcidev->dev, - DMA_BIT_MASK(iop_ops->hw_dma_bit_mask)) || - dma_set_mask(&pcidev->dev, DMA_BIT_MASK(32))) { + rc = dma_set_mask(&pcidev->dev, + DMA_BIT_MASK(iop_ops->hw_dma_bit_mask)); + if (rc) + rc = dma_set_mask(&pcidev->dev, DMA_BIT_MASK(32)); + + if (rc) { printk(KERN_ERR "hptiop: fail to set dma_mask\n"); goto disable_pci_device; } From 5603731a15ef9ca317c122cc8c959f1dee1798b4 Mon Sep 17 00:00:00 2001 From: Takeshi Saito Date: Thu, 21 Feb 2019 20:38:05 +0100 Subject: [PATCH 243/298] mmc: tmio: fix access width of Block Count Register In R-Car Gen2 or later, the maximum number of transfer blocks are changed from 0xFFFF to 0xFFFFFFFF. Therefore, Block Count Register should use iowrite32(). If another system (U-boot, Hypervisor OS, etc) uses bit[31:16], this value will not be cleared. So, SD/MMC card initialization fails. So, check for the bigger register and use apropriate write. Also, mark the register as extended on Gen2. Signed-off-by: Takeshi Saito [wsa: use max_blk_count in if(), add Gen2, update commit message] Signed-off-by: Wolfram Sang Cc: stable@kernel.org Reviewed-by: Simon Horman [Ulf: Fixed build error] Signed-off-by: Ulf Hansson --- drivers/mmc/host/renesas_sdhi_sys_dmac.c | 1 + drivers/mmc/host/tmio_mmc.h | 5 +++++ drivers/mmc/host/tmio_mmc_core.c | 6 +++++- 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/mmc/host/renesas_sdhi_sys_dmac.c b/drivers/mmc/host/renesas_sdhi_sys_dmac.c index 8471160316e0..02cd878e209f 100644 --- a/drivers/mmc/host/renesas_sdhi_sys_dmac.c +++ b/drivers/mmc/host/renesas_sdhi_sys_dmac.c @@ -65,6 +65,7 @@ static const struct renesas_sdhi_of_data of_rcar_gen2_compatible = { .scc_offset = 0x0300, .taps = rcar_gen2_scc_taps, .taps_num = ARRAY_SIZE(rcar_gen2_scc_taps), + .max_blk_count = 0xffffffff, }; /* Definitions for sampling clocks */ diff --git a/drivers/mmc/host/tmio_mmc.h b/drivers/mmc/host/tmio_mmc.h index c03529e3f01a..2adb0d24360f 100644 --- a/drivers/mmc/host/tmio_mmc.h +++ b/drivers/mmc/host/tmio_mmc.h @@ -277,6 +277,11 @@ static inline void sd_ctrl_write32_as_16_and_16(struct tmio_mmc_host *host, iowrite16(val >> 16, host->ctl + ((addr + 2) << host->bus_shift)); } +static inline void sd_ctrl_write32(struct tmio_mmc_host *host, int addr, u32 val) +{ + iowrite32(val, host->ctl + (addr << host->bus_shift)); +} + static inline void sd_ctrl_write32_rep(struct tmio_mmc_host *host, int addr, const u32 *buf, int count) { diff --git a/drivers/mmc/host/tmio_mmc_core.c b/drivers/mmc/host/tmio_mmc_core.c index 41d79720e745..f7a6f005899a 100644 --- a/drivers/mmc/host/tmio_mmc_core.c +++ b/drivers/mmc/host/tmio_mmc_core.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #include #include @@ -703,7 +704,10 @@ static int tmio_mmc_start_data(struct tmio_mmc_host *host, /* Set transfer length / blocksize */ sd_ctrl_write16(host, CTL_SD_XFER_LEN, data->blksz); - sd_ctrl_write16(host, CTL_XFER_BLK_COUNT, data->blocks); + if (host->mmc->max_blk_count >= SZ_64K) + sd_ctrl_write32(host, CTL_XFER_BLK_COUNT, data->blocks); + else + sd_ctrl_write16(host, CTL_XFER_BLK_COUNT, data->blocks); tmio_mmc_start_dma(host, data); From cffaaf0c816238c45cd2d06913476c83eb50f682 Mon Sep 17 00:00:00 2001 From: Julia Cartwright Date: Wed, 20 Feb 2019 16:46:31 +0000 Subject: [PATCH 244/298] iommu/dmar: Fix buffer overflow during PCI bus notification Commit 57384592c433 ("iommu/vt-d: Store bus information in RMRR PCI device path") changed the type of the path data, however, the change in path type was not reflected in size calculations. Update to use the correct type and prevent a buffer overflow. This bug manifests in systems with deep PCI hierarchies, and can lead to an overflow of the static allocated buffer (dmar_pci_notify_info_buf), or can lead to overflow of slab-allocated data. BUG: KASAN: global-out-of-bounds in dmar_alloc_pci_notify_info+0x1d5/0x2e0 Write of size 1 at addr ffffffff90445d80 by task swapper/0/1 CPU: 0 PID: 1 Comm: swapper/0 Tainted: G W 4.14.87-rt49-02406-gd0a0e96 #1 Call Trace: ? dump_stack+0x46/0x59 ? print_address_description+0x1df/0x290 ? dmar_alloc_pci_notify_info+0x1d5/0x2e0 ? kasan_report+0x256/0x340 ? dmar_alloc_pci_notify_info+0x1d5/0x2e0 ? e820__memblock_setup+0xb0/0xb0 ? dmar_dev_scope_init+0x424/0x48f ? __down_write_common+0x1ec/0x230 ? dmar_dev_scope_init+0x48f/0x48f ? dmar_free_unused_resources+0x109/0x109 ? cpumask_next+0x16/0x20 ? __kmem_cache_create+0x392/0x430 ? kmem_cache_create+0x135/0x2f0 ? e820__memblock_setup+0xb0/0xb0 ? intel_iommu_init+0x170/0x1848 ? _raw_spin_unlock_irqrestore+0x32/0x60 ? migrate_enable+0x27a/0x5b0 ? sched_setattr+0x20/0x20 ? migrate_disable+0x1fc/0x380 ? task_rq_lock+0x170/0x170 ? try_to_run_init_process+0x40/0x40 ? locks_remove_file+0x85/0x2f0 ? dev_prepare_static_identity_mapping+0x78/0x78 ? rt_spin_unlock+0x39/0x50 ? lockref_put_or_lock+0x2a/0x40 ? dput+0x128/0x2f0 ? __rcu_read_unlock+0x66/0x80 ? __fput+0x250/0x300 ? __rcu_read_lock+0x1b/0x30 ? mntput_no_expire+0x38/0x290 ? e820__memblock_setup+0xb0/0xb0 ? pci_iommu_init+0x25/0x63 ? pci_iommu_init+0x25/0x63 ? do_one_initcall+0x7e/0x1c0 ? initcall_blacklisted+0x120/0x120 ? kernel_init_freeable+0x27b/0x307 ? rest_init+0xd0/0xd0 ? kernel_init+0xf/0x120 ? rest_init+0xd0/0xd0 ? ret_from_fork+0x1f/0x40 The buggy address belongs to the variable: dmar_pci_notify_info_buf+0x40/0x60 Fixes: 57384592c433 ("iommu/vt-d: Store bus information in RMRR PCI device path") Signed-off-by: Julia Cartwright Signed-off-by: Joerg Roedel --- drivers/iommu/dmar.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/dmar.c b/drivers/iommu/dmar.c index dc9f14811e0f..58dc70bffd5b 100644 --- a/drivers/iommu/dmar.c +++ b/drivers/iommu/dmar.c @@ -144,7 +144,7 @@ dmar_alloc_pci_notify_info(struct pci_dev *dev, unsigned long event) for (tmp = dev; tmp; tmp = tmp->bus->self) level++; - size = sizeof(*info) + level * sizeof(struct acpi_dmar_pci_path); + size = sizeof(*info) + level * sizeof(info->path[0]); if (size <= sizeof(dmar_pci_notify_info_buf)) { info = (struct dmar_pci_notify_info *)dmar_pci_notify_info_buf; } else { From 781e62823cb81b972dc8652c1827205cda2ac9ac Mon Sep 17 00:00:00 2001 From: Peng Sun Date: Tue, 26 Feb 2019 22:15:37 +0800 Subject: [PATCH 245/298] bpf: decrease usercnt if bpf_map_new_fd() fails in bpf_map_get_fd_by_id() In bpf/syscall.c, bpf_map_get_fd_by_id() use bpf_map_inc_not_zero() to increase the refcount, both map->refcnt and map->usercnt. Then, if bpf_map_new_fd() fails, should handle map->usercnt too. Fixes: bd5f5f4ecb78 ("bpf: Add BPF_MAP_GET_FD_BY_ID") Signed-off-by: Peng Sun Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- kernel/bpf/syscall.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 8577bb7f8be6..f98328abe8fa 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1986,7 +1986,7 @@ static int bpf_map_get_fd_by_id(const union bpf_attr *attr) fd = bpf_map_new_fd(map, f_flags); if (fd < 0) - bpf_map_put(map); + bpf_map_put_with_uref(map); return fd; } From b6e9e5df4ecf100f6a10ab2ade8e46d47a4b9779 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 26 Feb 2019 09:00:02 -0800 Subject: [PATCH 246/298] ipv4: Return error for RTA_VIA attribute IPv4 currently does not support nexthops outside of the AF_INET family. Specifically, it does not handle RTA_VIA attribute. If it is passed in a route add request, the actual route added only uses the device which is clearly not what the user intended: $ ip ro add 172.16.1.0/24 via inet6 2001:db8:1::1 dev eth0 $ ip ro ls ... 172.16.1.0/24 dev eth0 Catch this and fail the route add: $ ip ro add 172.16.1.0/24 via inet6 2001:db8:1::1 dev eth0 Error: IPv4 does not support RTA_VIA attribute. Fixes: 03c0566542f4c ("mpls: Netlink commands to add, remove, and dump routes") Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/fib_frontend.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index fe4f6a624238..ed14ec245584 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -710,6 +710,10 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, case RTA_GATEWAY: cfg->fc_gw = nla_get_be32(attr); break; + case RTA_VIA: + NL_SET_ERR_MSG(extack, "IPv4 does not support RTA_VIA attribute"); + err = -EINVAL; + goto errout; case RTA_PRIORITY: cfg->fc_priority = nla_get_u32(attr); break; From e3818541b49fb88650ba339d33cc53e4095da5b3 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 26 Feb 2019 09:00:03 -0800 Subject: [PATCH 247/298] ipv6: Return error for RTA_VIA attribute IPv6 currently does not support nexthops outside of the AF_INET6 family. Specifically, it does not handle RTA_VIA attribute. If it is passed in a route add request, the actual route added only uses the device which is clearly not what the user intended: $ ip -6 ro add 2001:db8:2::/64 via inet 172.16.1.1 dev eth0 $ ip ro ls ... 2001:db8:2::/64 dev eth0 metric 1024 pref medium Catch this and fail the route add: $ ip -6 ro add 2001:db8:2::/64 via inet 172.16.1.1 dev eth0 Error: IPv6 does not support RTA_VIA attribute. Fixes: 03c0566542f4c ("mpls: Netlink commands to add, remove, and dump routes") Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/route.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index ce15dc4ccbfa..b7a620023a52 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -4182,6 +4182,10 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]); cfg->fc_flags |= RTF_GATEWAY; } + if (tb[RTA_VIA]) { + NL_SET_ERR_MSG(extack, "IPv6 does not support RTA_VIA attribute"); + goto errout; + } if (tb[RTA_DST]) { int plen = (rtm->rtm_dst_len + 7) >> 3; From be48220edd48ca0d569782992840488a52373a24 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 26 Feb 2019 09:00:04 -0800 Subject: [PATCH 248/298] mpls: Return error for RTA_GATEWAY attribute MPLS does not support nexthops with an MPLS address family. Specifically, it does not handle RTA_GATEWAY attribute. Make it clear by returning an error. Fixes: 03c0566542f4c ("mpls: Netlink commands to add, remove, and dump routes") Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/mpls/af_mpls.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index 7d55d4c04088..fa763e2e50ec 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -1838,6 +1838,9 @@ static int rtm_to_route_config(struct sk_buff *skb, goto errout; break; } + case RTA_GATEWAY: + NL_SET_ERR_MSG(extack, "MPLS does not support RTA_GATEWAY attribute"); + goto errout; case RTA_VIA: { if (nla_get_via(nla, &cfg->rc_via_alen, From bf48648d650db1146b75b9bd358502431e86cf4f Mon Sep 17 00:00:00 2001 From: Haiyang Zhang Date: Fri, 22 Feb 2019 18:25:03 +0000 Subject: [PATCH 249/298] hv_netvsc: Fix IP header checksum for coalesced packets Incoming packets may have IP header checksum verified by the host. They may not have IP header checksum computed after coalescing. This patch re-compute the checksum when necessary, otherwise the packets may be dropped, because Linux network stack always checks it. Signed-off-by: Haiyang Zhang Signed-off-by: David S. Miller --- drivers/net/hyperv/netvsc_drv.c | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index 256adbd044f5..cf4897043e83 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -744,6 +744,14 @@ void netvsc_linkstatus_callback(struct net_device *net, schedule_delayed_work(&ndev_ctx->dwork, 0); } +static void netvsc_comp_ipcsum(struct sk_buff *skb) +{ + struct iphdr *iph = (struct iphdr *)skb->data; + + iph->check = 0; + iph->check = ip_fast_csum(iph, iph->ihl); +} + static struct sk_buff *netvsc_alloc_recv_skb(struct net_device *net, struct netvsc_channel *nvchan) { @@ -770,9 +778,17 @@ static struct sk_buff *netvsc_alloc_recv_skb(struct net_device *net, /* skb is already created with CHECKSUM_NONE */ skb_checksum_none_assert(skb); - /* - * In Linux, the IP checksum is always checked. - * Do L4 checksum offload if enabled and present. + /* Incoming packets may have IP header checksum verified by the host. + * They may not have IP header checksum computed after coalescing. + * We compute it here if the flags are set, because on Linux, the IP + * checksum is always checked. + */ + if (csum_info && csum_info->receive.ip_checksum_value_invalid && + csum_info->receive.ip_checksum_succeeded && + skb->protocol == htons(ETH_P_IP)) + netvsc_comp_ipcsum(skb); + + /* Do L4 checksum offload if enabled and present. */ if (csum_info && (net->features & NETIF_F_RXCSUM)) { if (csum_info->receive.tcp_checksum_succeeded || From bfd07f3dd4f111b884d7922b37eb239280f83d8c Mon Sep 17 00:00:00 2001 From: Tung Nguyen Date: Mon, 25 Feb 2019 10:57:20 +0700 Subject: [PATCH 250/298] tipc: fix race condition causing hung sendto When sending multicast messages via blocking socket, if sending link is congested (tsk->cong_link_cnt is set to 1), the sending thread will be put into sleeping state. However, tipc_sk_filter_rcv() is called under socket spin lock but tipc_wait_for_cond() is not. So, there is no guarantee that the setting of tsk->cong_link_cnt to 0 in tipc_sk_proto_rcv() in CPU-1 will be perceived by CPU-0. If that is the case, the sending thread in CPU-0 after being waken up, will continue to see tsk->cong_link_cnt as 1 and put the sending thread into sleeping state again. The sending thread will sleep forever. CPU-0 | CPU-1 tipc_wait_for_cond() | { | // condition_ = !tsk->cong_link_cnt | while ((rc_ = !(condition_))) { | ... | release_sock(sk_); | wait_woken(); | | if (!sock_owned_by_user(sk)) | tipc_sk_filter_rcv() | { | ... | tipc_sk_proto_rcv() | { | ... | tsk->cong_link_cnt--; | ... | sk->sk_write_space(sk); | ... | } | ... | } sched_annotate_sleep(); | lock_sock(sk_); | remove_wait_queue(); | } | } | This commit fixes it by adding memory barrier to tipc_sk_proto_rcv() and tipc_wait_for_cond(). Acked-by: Jon Maloy Signed-off-by: Tung Nguyen Signed-off-by: David S. Miller --- net/tipc/socket.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 684f2125fc6b..70343ac448b1 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -379,11 +379,13 @@ static int tipc_sk_sock_err(struct socket *sock, long *timeout) #define tipc_wait_for_cond(sock_, timeo_, condition_) \ ({ \ + DEFINE_WAIT_FUNC(wait_, woken_wake_function); \ struct sock *sk_; \ int rc_; \ \ while ((rc_ = !(condition_))) { \ - DEFINE_WAIT_FUNC(wait_, woken_wake_function); \ + /* coupled with smp_wmb() in tipc_sk_proto_rcv() */ \ + smp_rmb(); \ sk_ = (sock_)->sk; \ rc_ = tipc_sk_sock_err((sock_), timeo_); \ if (rc_) \ @@ -1983,6 +1985,8 @@ static void tipc_sk_proto_rcv(struct sock *sk, return; case SOCK_WAKEUP: tipc_dest_del(&tsk->cong_links, msg_orignode(hdr), 0); + /* coupled with smp_rmb() in tipc_wait_for_cond() */ + smp_wmb(); tsk->cong_link_cnt--; wakeup = true; break; From 6e53330909672bd9a8c9f826e989a5945d9d9fdf Mon Sep 17 00:00:00 2001 From: Marc Gonzalez Date: Mon, 25 Feb 2019 11:41:33 +0100 Subject: [PATCH 251/298] arm64: dts: qcom: msm8998: Extend TZ reserved memory area My console locks up as soon as Linux writes to [88800000,88f00000[ AFAIU, that memory area is reserved for trustzone. Extend TZ reserved memory range, to prevent Linux from stepping on trustzone's toes. Cc: stable@vger.kernel.org # 4.20+ Reviewed-by: Sibi Sankar Fixes: c7833949564ec ("arm64: dts: qcom: msm8998: Add smem related nodes") Signed-off-by: Marc Gonzalez Signed-off-by: Andy Gross --- arch/arm64/boot/dts/qcom/msm8998.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/qcom/msm8998.dtsi b/arch/arm64/boot/dts/qcom/msm8998.dtsi index 8d41b69ec2da..99bccaac31ad 100644 --- a/arch/arm64/boot/dts/qcom/msm8998.dtsi +++ b/arch/arm64/boot/dts/qcom/msm8998.dtsi @@ -37,7 +37,7 @@ smem_mem: smem-mem@86000000 { }; memory@86200000 { - reg = <0x0 0x86200000 0x0 0x2600000>; + reg = <0x0 0x86200000 0x0 0x2d00000>; no-map; }; From e5723f95d6b493dd437f1199cacb41459713b32f Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Fri, 22 Feb 2019 19:21:34 +0530 Subject: [PATCH 252/298] mmc: core: Fix NULL ptr crash from mmc_should_fail_request In case of CQHCI, mrq->cmd may be NULL for data requests (non DCMD). In such case mmc_should_fail_request is directly dereferencing mrq->cmd while cmd is NULL. Fix this by checking for mrq->cmd pointer. Fixes: 72a5af554df8 ("mmc: core: Add support for handling CQE requests") Signed-off-by: Ritesh Harjani Cc: stable@vger.kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/core/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c index 5bd58b95d318..b27a1e620233 100644 --- a/drivers/mmc/core/core.c +++ b/drivers/mmc/core/core.c @@ -95,7 +95,7 @@ static void mmc_should_fail_request(struct mmc_host *host, if (!data) return; - if (cmd->error || data->error || + if ((cmd && cmd->error) || data->error || !should_fail(&host->fail_mmc_request, data->blksz * data->blocks)) return; From 388b4e6a00bb3097278ed1648ac5a1cb48c894e6 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 26 Feb 2019 16:35:26 -0800 Subject: [PATCH 253/298] scsi: core: Avoid that system resume triggers a kernel warning scsi_device_quiesce() and scsi_device_resume() are called during system-wide suspend and resume. scsi_device_quiesce() only succeeds for SCSI devices that are in one of the RUNNING, OFFLINE or TRANSPORT_OFFLINE states (see also scsi_set_device_state()). This patch avoids that the following warning is triggered when resuming a system for which quiescing a SCSI device failed: WARNING: CPU: 2 PID: 11303 at drivers/scsi/scsi_lib.c:2600 scsi_device_resume+0x4f/0x58 CPU: 2 PID: 11303 Comm: kworker/u8:70 Not tainted 5.0.0-rc1+ #50 Hardware name: LENOVO 80E3/Lancer 5B2, BIOS A2CN45WW(V2.13) 08/04/2016 Workqueue: events_unbound async_run_entry_fn Call Trace: scsi_dev_type_resume+0x2e/0x60 async_run_entry_fn+0x32/0xd8 process_one_work+0x1f4/0x420 worker_thread+0x28/0x3c0 kthread+0x118/0x130 ret_from_fork+0x22/0x40 Cc: Przemek Socha Reported-by: Przemek Socha Fixes: 3a0a529971ec ("block, scsi: Make SCSI quiesce and resume work reliably") # v4.15 Signed-off-by: Bart Van Assche Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi_lib.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index f8d51c3d5582..a6828391d6b3 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -2598,7 +2598,6 @@ void scsi_device_resume(struct scsi_device *sdev) * device deleted during suspend) */ mutex_lock(&sdev->state_mutex); - WARN_ON_ONCE(!sdev->quiesced_by); sdev->quiesced_by = NULL; blk_clear_pm_only(sdev->request_queue); if (sdev->sdev_state == SDEV_QUIESCE) From 27ec9dc17c48ea2e642ccb90b4ebf7fd47468911 Mon Sep 17 00:00:00 2001 From: Alamy Liu Date: Mon, 25 Feb 2019 11:22:13 -0800 Subject: [PATCH 254/298] mmc: cqhci: fix space allocated for transfer descriptor There is not enough space being allocated when DCMD is disabled. CQE_DCMD is not necessary to be enabled when CQE is enabled. (Software could halt CQE to send command) In the case that CQE_DCMD is not enabled, it still needs to allocate space for data transfer. For instance: CQE_DCMD is enabled: 31 slots space (one slot used by DCMD) CQE_DCMD is disabled: 32 slots space Fixes: a4080225f51d ("mmc: cqhci: support for command queue enabled host") Signed-off-by: Alamy Liu Acked-by: Adrian Hunter Cc: stable@vger.kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/cqhci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mmc/host/cqhci.c b/drivers/mmc/host/cqhci.c index 159270e947cf..bd9a59bc0e31 100644 --- a/drivers/mmc/host/cqhci.c +++ b/drivers/mmc/host/cqhci.c @@ -201,7 +201,7 @@ static int cqhci_host_alloc_tdl(struct cqhci_host *cq_host) cq_host->desc_size = cq_host->slot_sz * cq_host->num_slots; cq_host->data_size = cq_host->trans_desc_len * cq_host->mmc->max_segs * - (cq_host->num_slots - 1); + cq_host->mmc->cqe_qdepth; pr_debug("%s: cqhci: desc_size: %zu data_sz: %zu slot-sz: %d\n", mmc_hostname(cq_host->mmc), cq_host->desc_size, cq_host->data_size, From d07e9fadf3a6b466ca3ae90fa4859089ff20530f Mon Sep 17 00:00:00 2001 From: Alamy Liu Date: Mon, 25 Feb 2019 11:22:14 -0800 Subject: [PATCH 255/298] mmc: cqhci: Fix a tiny potential memory leak on error condition Free up the allocated memory in the case of error return The value of mmc_host->cqe_enabled stays 'false'. Thus, cqhci_disable (mmc_cqe_ops->cqe_disable) won't be called to free the memory. Also, cqhci_disable() seems to be designed to disable and free all resources, not suitable to handle this corner case. Fixes: a4080225f51d ("mmc: cqhci: support for command queue enabled host") Signed-off-by: Alamy Liu Acked-by: Adrian Hunter Cc: stable@vger.kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/cqhci.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/mmc/host/cqhci.c b/drivers/mmc/host/cqhci.c index bd9a59bc0e31..a8af682a9182 100644 --- a/drivers/mmc/host/cqhci.c +++ b/drivers/mmc/host/cqhci.c @@ -217,12 +217,21 @@ static int cqhci_host_alloc_tdl(struct cqhci_host *cq_host) cq_host->desc_size, &cq_host->desc_dma_base, GFP_KERNEL); + if (!cq_host->desc_base) + return -ENOMEM; + cq_host->trans_desc_base = dmam_alloc_coherent(mmc_dev(cq_host->mmc), cq_host->data_size, &cq_host->trans_desc_dma_base, GFP_KERNEL); - if (!cq_host->desc_base || !cq_host->trans_desc_base) + if (!cq_host->trans_desc_base) { + dmam_free_coherent(mmc_dev(cq_host->mmc), cq_host->desc_size, + cq_host->desc_base, + cq_host->desc_dma_base); + cq_host->desc_base = NULL; + cq_host->desc_dma_base = 0; return -ENOMEM; + } pr_debug("%s: cqhci: desc-base: 0x%p trans-base: 0x%p\n desc_dma 0x%llx trans_dma: 0x%llx\n", mmc_hostname(cq_host->mmc), cq_host->desc_base, cq_host->trans_desc_base, From c53336c8f5f29043fded57912cc06c24e12613d7 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 28 Feb 2019 00:02:11 +0800 Subject: [PATCH 256/298] mmc: core: align max segment size with logical block size Logical block size is the lowest possible block size that the storage device can address. Max segment size is often related with controller's DMA capability. And it is reasonable to align max segment size with logical block size. SDHCI sets un-aligned max segment size, and causes ADMA error, so fix it by aligning max segment size with logical block size. Reported-by: Naresh Kamboju Cc: Christoph Hellwig Cc: Naresh Kamboju Cc: Faiz Abbas Cc: linux-block@vger.kernel.org Signed-off-by: Ming Lei Signed-off-by: Ulf Hansson --- drivers/mmc/core/block.c | 6 ------ drivers/mmc/core/queue.c | 9 ++++++++- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c index 14f3fdb8c6bb..9ce8eb51a60f 100644 --- a/drivers/mmc/core/block.c +++ b/drivers/mmc/core/block.c @@ -2380,12 +2380,6 @@ static struct mmc_blk_data *mmc_blk_alloc_req(struct mmc_card *card, snprintf(md->disk->disk_name, sizeof(md->disk->disk_name), "mmcblk%u%s", card->host->index, subname ? subname : ""); - if (mmc_card_mmc(card)) - blk_queue_logical_block_size(md->queue.queue, - card->ext_csd.data_sector_size); - else - blk_queue_logical_block_size(md->queue.queue, 512); - set_capacity(md->disk, size); if (mmc_host_cmd23(card->host)) { diff --git a/drivers/mmc/core/queue.c b/drivers/mmc/core/queue.c index 35cc138b096d..15a45ec6518d 100644 --- a/drivers/mmc/core/queue.c +++ b/drivers/mmc/core/queue.c @@ -355,6 +355,7 @@ static void mmc_setup_queue(struct mmc_queue *mq, struct mmc_card *card) { struct mmc_host *host = card->host; u64 limit = BLK_BOUNCE_HIGH; + unsigned block_size = 512; if (mmc_dev(host)->dma_mask && *mmc_dev(host)->dma_mask) limit = (u64)dma_max_pfn(mmc_dev(host)) << PAGE_SHIFT; @@ -368,7 +369,13 @@ static void mmc_setup_queue(struct mmc_queue *mq, struct mmc_card *card) blk_queue_max_hw_sectors(mq->queue, min(host->max_blk_count, host->max_req_size / 512)); blk_queue_max_segments(mq->queue, host->max_segs); - blk_queue_max_segment_size(mq->queue, host->max_seg_size); + + if (mmc_card_mmc(card)) + block_size = card->ext_csd.data_sector_size; + + blk_queue_logical_block_size(mq->queue, block_size); + blk_queue_max_segment_size(mq->queue, + round_down(host->max_seg_size, block_size)); INIT_WORK(&mq->recovery_work, mmc_mq_recovery_handler); INIT_WORK(&mq->complete_work, mmc_blk_mq_complete_work); From 2b3c6885386020b1b9d92d45e8349637e27d1f66 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Wed, 27 Feb 2019 03:58:53 -0500 Subject: [PATCH 257/298] bnxt_en: Drop oversize TX packets to prevent errors. There have been reports of oversize UDP packets being sent to the driver to be transmitted, causing error conditions. The issue is likely caused by the dst of the SKB switching between 'lo' with 64K MTU and the hardware device with a smaller MTU. Patches are being proposed by Mahesh Bandewar to fix the issue. In the meantime, add a quick length check in the driver to prevent the error. The driver uses the TX packet size as index to look up an array to setup the TX BD. The array is large enough to support all MTU sizes supported by the driver. The oversize TX packet causes the driver to index beyond the array and put garbage values into the TX BD. Add a simple check to prevent this. Signed-off-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index d95730c6e0f2..803f7990d32b 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -500,6 +500,12 @@ static netdev_tx_t bnxt_start_xmit(struct sk_buff *skb, struct net_device *dev) } length >>= 9; + if (unlikely(length >= ARRAY_SIZE(bnxt_lhint_arr))) { + dev_warn_ratelimited(&pdev->dev, "Dropped oversize %d bytes TX packet.\n", + skb->len); + i = 0; + goto tx_dma_error; + } flags |= bnxt_lhint_arr[length]; txbd->tx_bd_len_flags_type = cpu_to_le32(flags); From f4d7b3e23d259c44f1f1c39645450680fcd935d6 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 27 Feb 2019 13:37:26 +0300 Subject: [PATCH 258/298] net: dev: Use unsigned integer as an argument to left-shift 1 << 31 is Undefined Behaviour according to the C standard. Use U type modifier to avoid theoretical overflow. Signed-off-by: Andy Shevchenko Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 86dbb3e29139..848b54b7ec91 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3861,7 +3861,7 @@ static inline u32 netif_msg_init(int debug_value, int default_msg_enable_bits) if (debug_value == 0) /* no output */ return 0; /* set low N bits */ - return (1 << debug_value) - 1; + return (1U << debug_value) - 1; } static inline void __netif_tx_lock(struct netdev_queue *txq, int cpu) From 287beb284f14796160be00db15f87ab3531715a2 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 27 Feb 2019 13:45:35 +0300 Subject: [PATCH 259/298] enc28j60: Correct description of debug module parameter The netif_msg_init() API takes on input the amount of bits to be set. The description of debug parameter in the enc28j60 module is misleading in this sense and passing 0xffff does not give an expected behaviour. Fix the description of debug module parameter to show what exactly is expected. Signed-off-by: Andy Shevchenko Signed-off-by: David S. Miller --- drivers/net/ethernet/microchip/enc28j60.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/microchip/enc28j60.c b/drivers/net/ethernet/microchip/enc28j60.c index f6ecfa778660..8f72587b5a2c 100644 --- a/drivers/net/ethernet/microchip/enc28j60.c +++ b/drivers/net/ethernet/microchip/enc28j60.c @@ -1681,5 +1681,5 @@ MODULE_DESCRIPTION(DRV_NAME " ethernet driver"); MODULE_AUTHOR("Claudio Lanconelli "); MODULE_LICENSE("GPL"); module_param_named(debug, debug.msg_enable, int, 0); -MODULE_PARM_DESC(debug, "Debug verbosity level (0=none, ..., ffff=all)"); +MODULE_PARM_DESC(debug, "Debug verbosity level in amount of bits set (0=none, ..., 31=all)"); MODULE_ALIAS("spi:" DRV_NAME); From 232ba3a51cc224b339c7114888ed7f0d4d95695e Mon Sep 17 00:00:00 2001 From: Rajasingh Thavamani Date: Wed, 27 Feb 2019 17:43:19 +0530 Subject: [PATCH 260/298] net: phy: Micrel KSZ8061: link failure after cable connect With Micrel KSZ8061 PHY, the link may occasionally not come up after Ethernet cable connect. The vendor's (Microchip, former Micrel) errata sheet 80000688A.pdf descripes the problem and possible workarounds in detail, see below. The batch implements workaround 1, which permanently fixes the issue. DESCRIPTION Link-up may not occur properly when the Ethernet cable is initially connected. This issue occurs more commonly when the cable is connected slowly, but it may occur any time a cable is connected. This issue occurs in the auto-negotiation circuit, and will not occur if auto-negotiation is disabled (which requires that the two link partners be set to the same speed and duplex). END USER IMPLICATIONS When this issue occurs, link is not established. Subsequent cable plug/unplaug cycle will not correct the issue. WORk AROUND There are four approaches to work around this issue: 1. This issue can be prevented by setting bit 15 in MMD device address 1, register 2, prior to connecting the cable or prior to setting the Restart Auto-negotiation bit in register 0h. The MMD registers are accessed via the indirect access registers Dh and Eh, or via the Micrel EthUtil utility as shown here: . if using the EthUtil utility (usually with a Micrel KSZ8061 Evaluation Board), type the following commands: > address 1 > mmd 1 > iw 2 b61a . Alternatively, write the following registers to write to the indirect MMD register: Write register Dh, data 0001h Write register Eh, data 0002h Write register Dh, data 4001h Write register Eh, data B61Ah 2. The issue can be avoided by disabling auto-negotiation in the KSZ8061, either by the strapping option, or by clearing bit 12 in register 0h. Care must be taken to ensure that the KSZ8061 and the link partner will link with the same speed and duplex. Note that the KSZ8061 defaults to full-duplex when auto-negotiation is off, but other devices may default to half-duplex in the event of failed auto-negotiation. 3. The issue can be avoided by connecting the cable prior to powering-up or resetting the KSZ8061, and leaving it plugged in thereafter. 4. If the above measures are not taken and the problem occurs, link can be recovered by setting the Restart Auto-Negotiation bit in register 0h, or by resetting or power cycling the device. Reset may be either hardware reset or software reset (register 0h, bit 15). PLAN This errata will not be corrected in the future revision. Fixes: 7ab59dc15e2f ("drivers/net/phy/micrel_phy: Add support for new PHYs") Signed-off-by: Alexander Onnasch Signed-off-by: Rajasingh Thavamani Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/phy/micrel.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c index b1f959935f50..b7df0295a3ca 100644 --- a/drivers/net/phy/micrel.c +++ b/drivers/net/phy/micrel.c @@ -344,6 +344,17 @@ static int ksz8041_config_aneg(struct phy_device *phydev) return genphy_config_aneg(phydev); } +static int ksz8061_config_init(struct phy_device *phydev) +{ + int ret; + + ret = phy_write_mmd(phydev, MDIO_MMD_PMAPMD, MDIO_DEVID1, 0xB61A); + if (ret) + return ret; + + return kszphy_config_init(phydev); +} + static int ksz9021_load_values_from_of(struct phy_device *phydev, const struct device_node *of_node, u16 reg, @@ -1040,7 +1051,7 @@ static struct phy_driver ksphy_driver[] = { .name = "Micrel KSZ8061", .phy_id_mask = MICREL_PHY_ID_MASK, .features = PHY_BASIC_FEATURES, - .config_init = kszphy_config_init, + .config_init = ksz8061_config_init, .ack_interrupt = kszphy_ack_interrupt, .config_intr = kszphy_config_intr, .suspend = genphy_suspend, From d63716658ac16c515d1223a9fbf5edbf76b1b333 Mon Sep 17 00:00:00 2001 From: Mario Kleiner Date: Wed, 30 Jan 2019 06:04:46 +0100 Subject: [PATCH 261/298] drm/amd/display: Use vrr friendly pageflip throttling in DC. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In VRR mode, keep track of the vblank count of the last completed pageflip in amdgpu_crtc->last_flip_vblank, as recorded in the pageflip completion handler after each completed flip. Use that count to prevent mmio programming a new pageflip within the same vblank in which the last pageflip completed, iow. to throttle pageflips to at most one flip per video frame, while at the same time allowing to request a flip not only before start of vblank, but also anywhere within vblank. The old logic did the same, and made sense for regular fixed refresh rate flipping, but in vrr mode it prevents requesting a flip anywhere inside the possibly huge vblank, thereby reducing framerate in vrr mode instead of improving it, by delaying a slightly delayed flip requests up to a maximum vblank duration + 1 scanout duration. This would limit VRR usefulness to only help applications with a very high GPU demand, which can submit the flip request before start of vblank, but then have to wait long for fences to complete. With this method a flip can be both requested and - after fences have completed - executed, ie. it doesn't matter if the request (amdgpu_dm_do_flip()) gets delayed until deep into the extended vblank due to cpu execution delays. This also allows clients which want to regulate framerate within the vrr range a much more fine-grained control of flip timing, a feature that might be useful for video playback, and is very useful for neuroscience/vision research applications. In regular non-VRR mode, retain the old flip submission behavior. This to keep flip scheduling for fullscreen X11/GLX OpenGL clients intact, if they use the GLX_OML_sync_control extensions glXSwapBufferMscOML(, ..., target_msc,...) function with a specific target_msc target vblank count. glXSwapBuffersMscOML() or DRI3/Present PresentPixmap() will not flip at the proper target_msc for a non-zero target_msc if VRR mode is active with this patch. They'd often flip one frame too early. However, this limitation should not matter much in VRR mode, as scheduling based on vblank counts is pretty futile/unusable under variable refresh duration anyway, so no real extra harm is done. Signed-off-by: Mario Kleiner Cc: Nicholas Kazlauskas Cc: Harry Wentland Cc: Alex Deucher Cc: Michel Dänzer Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_mode.h | 1 + .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 27 ++++++++++++++++--- 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mode.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mode.h index aadd0fa42e43..3aa42c64484a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mode.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mode.h @@ -405,6 +405,7 @@ struct amdgpu_crtc { struct amdgpu_flip_work *pflip_works; enum amdgpu_flip_status pflip_status; int deferred_flip_completion; + u64 last_flip_vblank; /* pll sharing */ struct amdgpu_atom_ss ss; bool ss_enabled; diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 5296b8f3e0ab..636d14a60952 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -303,12 +303,11 @@ static void dm_pflip_high_irq(void *interrupt_params) return; } + /* Update to correct count(s) if racing with vblank irq */ + amdgpu_crtc->last_flip_vblank = drm_crtc_accurate_vblank_count(&amdgpu_crtc->base); /* wake up userspace */ if (amdgpu_crtc->event) { - /* Update to correct count(s) if racing with vblank irq */ - drm_crtc_accurate_vblank_count(&amdgpu_crtc->base); - drm_crtc_send_vblank_event(&amdgpu_crtc->base, amdgpu_crtc->event); /* page flip completed. clean up */ @@ -4828,6 +4827,8 @@ static void amdgpu_dm_commit_planes(struct drm_atomic_state *state, to_dm_crtc_state(drm_atomic_get_old_crtc_state(state, pcrtc)); int planes_count = 0; unsigned long flags; + u64 last_flip_vblank; + bool vrr_active = acrtc_state->freesync_config.state == VRR_STATE_ACTIVE_VARIABLE; /* update planes when needed */ for_each_oldnew_plane_in_state(state, plane, old_plane_state, new_plane_state, i) { @@ -4859,6 +4860,16 @@ static void amdgpu_dm_commit_planes(struct drm_atomic_state *state, /* In commit tail framework this cannot happen */ WARN_ON(1); } + + /* For variable refresh rate mode only: + * Get vblank of last completed flip to avoid > 1 vrr flips per + * video frame by use of throttling, but allow flip programming + * anywhere in the possibly large variable vrr vblank interval + * for fine-grained flip timing control and more opportunity to + * avoid stutter on late submission of amdgpu_dm_do_flip() calls. + */ + last_flip_vblank = acrtc_attach->last_flip_vblank; + spin_unlock_irqrestore(&crtc->dev->event_lock, flags); if (!pflip_needed || plane->type == DRM_PLANE_TYPE_OVERLAY) { @@ -4882,10 +4893,18 @@ static void amdgpu_dm_commit_planes(struct drm_atomic_state *state, if (plane->type == DRM_PLANE_TYPE_PRIMARY) drm_crtc_vblank_get(crtc); + /* Use old throttling in non-vrr fixed refresh rate mode + * to keep flip scheduling based on target vblank counts + * working in a backwards compatible way, e.g., clients + * using GLX_OML_sync_control extension. + */ + if (!vrr_active) + last_flip_vblank = drm_crtc_vblank_count(crtc); + amdgpu_dm_do_flip( crtc, fb, - (uint32_t)drm_crtc_vblank_count(crtc) + *wait_for_vblank, + (uint32_t) last_flip_vblank + *wait_for_vblank, dc_state); } From 58bdd544e2933a21a51eecf17c3f5f94038261b5 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Fri, 22 Feb 2019 15:37:58 +0800 Subject: [PATCH 262/298] net: nfc: Fix NULL dereference on nfc_llcp_build_tlv fails KASAN report this: BUG: KASAN: null-ptr-deref in nfc_llcp_build_gb+0x37f/0x540 [nfc] Read of size 3 at addr 0000000000000000 by task syz-executor.0/5401 CPU: 0 PID: 5401 Comm: syz-executor.0 Not tainted 5.0.0-rc7+ #45 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1ubuntu1 04/01/2014 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0xfa/0x1ce lib/dump_stack.c:113 kasan_report+0x171/0x18d mm/kasan/report.c:321 memcpy+0x1f/0x50 mm/kasan/common.c:130 nfc_llcp_build_gb+0x37f/0x540 [nfc] nfc_llcp_register_device+0x6eb/0xb50 [nfc] nfc_register_device+0x50/0x1d0 [nfc] nfcsim_device_new+0x394/0x67d [nfcsim] ? 0xffffffffc1080000 nfcsim_init+0x6b/0x1000 [nfcsim] do_one_initcall+0xfa/0x5ca init/main.c:887 do_init_module+0x204/0x5f6 kernel/module.c:3460 load_module+0x66b2/0x8570 kernel/module.c:3808 __do_sys_finit_module+0x238/0x2a0 kernel/module.c:3902 do_syscall_64+0x147/0x600 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x462e99 Code: f7 d8 64 89 02 b8 ff ff ff ff c3 66 0f 1f 44 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 bc ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007f9cb79dcc58 EFLAGS: 00000246 ORIG_RAX: 0000000000000139 RAX: ffffffffffffffda RBX: 000000000073bf00 RCX: 0000000000462e99 RDX: 0000000000000000 RSI: 0000000020000280 RDI: 0000000000000003 RBP: 00007f9cb79dcc70 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 00007f9cb79dd6bc R13: 00000000004bcefb R14: 00000000006f7030 R15: 0000000000000004 nfc_llcp_build_tlv will return NULL on fails, caller should check it, otherwise will trigger a NULL dereference. Reported-by: Hulk Robot Fixes: eda21f16a5ed ("NFC: Set MIU and RW values from CONNECT and CC LLCP frames") Fixes: d646960f7986 ("NFC: Initial LLCP support") Signed-off-by: YueHaibing Signed-off-by: David S. Miller --- net/nfc/llcp_commands.c | 20 ++++++++++++++++++++ net/nfc/llcp_core.c | 24 ++++++++++++++++++++---- 2 files changed, 40 insertions(+), 4 deletions(-) diff --git a/net/nfc/llcp_commands.c b/net/nfc/llcp_commands.c index 6a196e438b6c..d1fc019e932e 100644 --- a/net/nfc/llcp_commands.c +++ b/net/nfc/llcp_commands.c @@ -419,6 +419,10 @@ int nfc_llcp_send_connect(struct nfc_llcp_sock *sock) sock->service_name, sock->service_name_len, &service_name_tlv_length); + if (!service_name_tlv) { + err = -ENOMEM; + goto error_tlv; + } size += service_name_tlv_length; } @@ -429,9 +433,17 @@ int nfc_llcp_send_connect(struct nfc_llcp_sock *sock) miux_tlv = nfc_llcp_build_tlv(LLCP_TLV_MIUX, (u8 *)&miux, 0, &miux_tlv_length); + if (!miux_tlv) { + err = -ENOMEM; + goto error_tlv; + } size += miux_tlv_length; rw_tlv = nfc_llcp_build_tlv(LLCP_TLV_RW, &rw, 0, &rw_tlv_length); + if (!rw_tlv) { + err = -ENOMEM; + goto error_tlv; + } size += rw_tlv_length; pr_debug("SKB size %d SN length %zu\n", size, sock->service_name_len); @@ -484,9 +496,17 @@ int nfc_llcp_send_cc(struct nfc_llcp_sock *sock) miux_tlv = nfc_llcp_build_tlv(LLCP_TLV_MIUX, (u8 *)&miux, 0, &miux_tlv_length); + if (!miux_tlv) { + err = -ENOMEM; + goto error_tlv; + } size += miux_tlv_length; rw_tlv = nfc_llcp_build_tlv(LLCP_TLV_RW, &rw, 0, &rw_tlv_length); + if (!rw_tlv) { + err = -ENOMEM; + goto error_tlv; + } size += rw_tlv_length; skb = llcp_allocate_pdu(sock, LLCP_PDU_CC, size); diff --git a/net/nfc/llcp_core.c b/net/nfc/llcp_core.c index ef4026a23e80..4fa015208aab 100644 --- a/net/nfc/llcp_core.c +++ b/net/nfc/llcp_core.c @@ -532,10 +532,10 @@ static u8 nfc_llcp_reserve_sdp_ssap(struct nfc_llcp_local *local) static int nfc_llcp_build_gb(struct nfc_llcp_local *local) { - u8 *gb_cur, *version_tlv, version, version_length; - u8 *lto_tlv, lto_length; - u8 *wks_tlv, wks_length; - u8 *miux_tlv, miux_length; + u8 *gb_cur, version, version_length; + u8 lto_length, wks_length, miux_length; + u8 *version_tlv = NULL, *lto_tlv = NULL, + *wks_tlv = NULL, *miux_tlv = NULL; __be16 wks = cpu_to_be16(local->local_wks); u8 gb_len = 0; int ret = 0; @@ -543,17 +543,33 @@ static int nfc_llcp_build_gb(struct nfc_llcp_local *local) version = LLCP_VERSION_11; version_tlv = nfc_llcp_build_tlv(LLCP_TLV_VERSION, &version, 1, &version_length); + if (!version_tlv) { + ret = -ENOMEM; + goto out; + } gb_len += version_length; lto_tlv = nfc_llcp_build_tlv(LLCP_TLV_LTO, &local->lto, 1, <o_length); + if (!lto_tlv) { + ret = -ENOMEM; + goto out; + } gb_len += lto_length; pr_debug("Local wks 0x%lx\n", local->local_wks); wks_tlv = nfc_llcp_build_tlv(LLCP_TLV_WKS, (u8 *)&wks, 2, &wks_length); + if (!wks_tlv) { + ret = -ENOMEM; + goto out; + } gb_len += wks_length; miux_tlv = nfc_llcp_build_tlv(LLCP_TLV_MIUX, (u8 *)&local->miux, 0, &miux_length); + if (!miux_tlv) { + ret = -ENOMEM; + goto out; + } gb_len += miux_length; gb_len += ARRAY_SIZE(llcp_magic); From 0a1d52994d440e21def1c2174932410b4f2a98a1 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Wed, 27 Feb 2019 21:29:52 +0100 Subject: [PATCH 263/298] mm: enforce min addr even if capable() in expand_downwards() security_mmap_addr() does a capability check with current_cred(), but we can reach this code from contexts like a VFS write handler where current_cred() must not be used. This can be abused on systems without SMAP to make NULL pointer dereferences exploitable again. Fixes: 8869477a49c3 ("security: protect from stack expansion into low vm addresses") Cc: stable@kernel.org Signed-off-by: Jann Horn Signed-off-by: Linus Torvalds --- mm/mmap.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index f901065c4c64..fc1809b1bed6 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2426,12 +2426,11 @@ int expand_downwards(struct vm_area_struct *vma, { struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *prev; - int error; + int error = 0; address &= PAGE_MASK; - error = security_mmap_addr(address); - if (error) - return error; + if (address < mmap_min_addr) + return -EPERM; /* Enforce stack_guard_gap */ prev = vma->vm_prev; From e0bf304e4a00d66d90904a6c5b93141f177cf6d2 Mon Sep 17 00:00:00 2001 From: Thomas Bogendoerfer Date: Wed, 27 Feb 2019 10:42:56 +0100 Subject: [PATCH 264/298] MIPS: fix memory setup for platforms with PHYS_OFFSET != 0 For platforms, which use a PHYS_OFFSET != 0, symbol _end also contains that offset. So when calling memblock_reserve() for reserving kernel the size argument needs to be adjusted. Fixes: bcec54bf3118 ("mips: switch to NO_BOOTMEM") Acked-by: Mike Rapoport Signed-off-by: Thomas Bogendoerfer Signed-off-by: Paul Burton Cc: Ralf Baechle Cc: James Hogan Cc: linux-mips@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: Mike Rapoport Cc: stable@vger.kernel.org # v4.20+ --- arch/mips/kernel/setup.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/mips/kernel/setup.c b/arch/mips/kernel/setup.c index 8c6c48ed786a..d2e5a5ad0e6f 100644 --- a/arch/mips/kernel/setup.c +++ b/arch/mips/kernel/setup.c @@ -384,7 +384,8 @@ static void __init bootmem_init(void) init_initrd(); reserved_end = (unsigned long) PFN_UP(__pa_symbol(&_end)); - memblock_reserve(PHYS_OFFSET, reserved_end << PAGE_SHIFT); + memblock_reserve(PHYS_OFFSET, + (reserved_end << PAGE_SHIFT) - PHYS_OFFSET); /* * max_low_pfn is not a number of pages. The number of pages From 2216322919c8608a448d7ebc560a845238a5d6b6 Mon Sep 17 00:00:00 2001 From: Nicholas Kazlauskas Date: Mon, 7 Jan 2019 12:41:46 -0500 Subject: [PATCH 265/298] drm: Block fb changes for async plane updates The prepare_fb call always happens on new_plane_state. The drm_atomic_helper_cleanup_planes checks to see if plane state pointer has changed when deciding to call cleanup_fb on either the new_plane_state or the old_plane_state. For a non-async atomic commit the state pointer is swapped, so this helper calls prepare_fb on the new_plane_state and cleanup_fb on the old_plane_state. This makes sense, since we want to prepare the framebuffer we are going to use and cleanup the the framebuffer we are no longer using. For the async atomic update helpers this differs. The async atomic update helpers perform in-place updates on the existing state. They call drm_atomic_helper_cleanup_planes but the state pointer is not swapped. This means that prepare_fb is called on the new_plane_state and cleanup_fb is called on the new_plane_state (not the old). In the case where old_plane_state->fb == new_plane_state->fb then there should be no behavioral difference between an async update and a non-async commit. But there are issues that arise when old_plane_state->fb != new_plane_state->fb. The first is that the new_plane_state->fb is immediately cleaned up after it has been prepared, so we're using a fb that we shouldn't be. The second occurs during a sequence of async atomic updates and non-async regular atomic commits. Suppose there are two framebuffers being interleaved in a double-buffering scenario, fb1 and fb2: - Async update, oldfb = NULL, newfb = fb1, prepare fb1, cleanup fb1 - Async update, oldfb = fb1, newfb = fb2, prepare fb2, cleanup fb2 - Non-async commit, oldfb = fb2, newfb = fb1, prepare fb1, cleanup fb2 We call cleanup_fb on fb2 twice in this example scenario, and any further use will result in use-after-free. The simple fix to this problem is to block framebuffer changes in the drm_atomic_helper_async_check function for now. v2: Move check by itself, add a FIXME (Daniel) Cc: Daniel Vetter Cc: Harry Wentland Cc: Andrey Grodzovsky Cc: # v4.14+ Fixes: fef9df8b5945 ("drm/atomic: initial support for asynchronous plane update") Signed-off-by: Nicholas Kazlauskas Acked-by: Andrey Grodzovsky Acked-by: Harry Wentland Reviewed-by: Daniel Vetter Signed-off-by: Harry Wentland Link: https://patchwork.freedesktop.org/patch/275364/ Signed-off-by: Dave Airlie --- drivers/gpu/drm/drm_atomic_helper.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/gpu/drm/drm_atomic_helper.c b/drivers/gpu/drm/drm_atomic_helper.c index 54e2ae614dcc..f4290f6b0c38 100644 --- a/drivers/gpu/drm/drm_atomic_helper.c +++ b/drivers/gpu/drm/drm_atomic_helper.c @@ -1602,6 +1602,15 @@ int drm_atomic_helper_async_check(struct drm_device *dev, old_plane_state->crtc != new_plane_state->crtc) return -EINVAL; + /* + * FIXME: Since prepare_fb and cleanup_fb are always called on + * the new_plane_state for async updates we need to block framebuffer + * changes. This prevents use of a fb that's been cleaned up and + * double cleanups from occuring. + */ + if (old_plane_state->fb != new_plane_state->fb) + return -EINVAL; + funcs = plane->helper_private; if (!funcs->atomic_async_update) return -EINVAL; From 17fb465f16027ea22225b282295f5b8af19992e0 Mon Sep 17 00:00:00 2001 From: Alistair Francis Date: Thu, 21 Feb 2019 00:33:03 +0000 Subject: [PATCH 266/298] drm/bochs: Fix the ID mismatch error When running RISC-V QEMU with the Bochs device attached via PCIe the probe of the Bochs device fails with: [drm:bochs_hw_init] *ERROR* ID mismatch This was introduced by this commit: 7780eb9ce8 bochs: convert to drm_dev_register To fix the error we ensure that pci_enable_device() is called before bochs_load(). Fixes: 7780eb9ce80f ("bochs: convert to drm_dev_register") Signed-off-by: Alistair Francis Reported-by: David Abdurachmanov Link: http://patchwork.freedesktop.org/patch/msgid/20190221003231.31625-1-alistair.francis@wdc.com Signed-off-by: Gerd Hoffmann Signed-off-by: Dave Airlie --- drivers/gpu/drm/bochs/bochs_drv.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/gpu/drm/bochs/bochs_drv.c b/drivers/gpu/drm/bochs/bochs_drv.c index f3dd66ae990a..aa35007262cd 100644 --- a/drivers/gpu/drm/bochs/bochs_drv.c +++ b/drivers/gpu/drm/bochs/bochs_drv.c @@ -154,6 +154,10 @@ static int bochs_pci_probe(struct pci_dev *pdev, if (IS_ERR(dev)) return PTR_ERR(dev); + ret = pci_enable_device(pdev); + if (ret) + goto err_free_dev; + dev->pdev = pdev; pci_set_drvdata(pdev, dev); From 72a7d452b0f09dcd5d3e18cff1e3839f8b1acc1f Mon Sep 17 00:00:00 2001 From: Max Uvarov Date: Mon, 25 Feb 2019 12:15:10 +0300 Subject: [PATCH 267/298] net: phy: dp83867: add soft reset delay Similar to dp83640 delay after soft reset is needed to set up registers correctly. Signed-off-by: Max Uvarov Signed-off-by: David S. Miller --- drivers/net/phy/dp83867.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/phy/dp83867.c b/drivers/net/phy/dp83867.c index da6a67d47ce9..56fa3606cb9c 100644 --- a/drivers/net/phy/dp83867.c +++ b/drivers/net/phy/dp83867.c @@ -19,6 +19,7 @@ #include #include #include +#include #include @@ -325,6 +326,8 @@ static int dp83867_phy_reset(struct phy_device *phydev) if (err < 0) return err; + usleep_range(10, 20); + return dp83867_config_init(phydev); } From 651eb32e569e50564803891cf2a8d22e49e979a5 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 25 Feb 2019 16:08:36 +0100 Subject: [PATCH 268/298] selftests: pmtu: disable DAD in all namespaces Otherwise, the configured IPv6 address could be still "tentative" at test time, possibly causing tests failures. We can also drop some sleep along the code and decrease the timeout for most commands so that the test runtime decreases. v1 -> v2: - fix comment (Stefano) Fixes: d1f1b9cbf34c ("selftests: net: Introduce first PMTU test") Signed-off-by: Paolo Abeni Reviewed-by: David Ahern Signed-off-by: David S. Miller --- tools/testing/selftests/net/pmtu.sh | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/tools/testing/selftests/net/pmtu.sh b/tools/testing/selftests/net/pmtu.sh index e2c94e47707c..89aec2fdf4fa 100755 --- a/tools/testing/selftests/net/pmtu.sh +++ b/tools/testing/selftests/net/pmtu.sh @@ -263,8 +263,6 @@ setup_fou_or_gue() { ${ns_a} ip link set ${encap}_a up ${ns_b} ip link set ${encap}_b up - - sleep 1 } setup_fou44() { @@ -302,6 +300,10 @@ setup_gue66() { setup_namespaces() { for n in ${NS_A} ${NS_B} ${NS_R1} ${NS_R2}; do ip netns add ${n} || return 1 + + # Disable DAD, so that we don't have to wait to use the + # configured IPv6 addresses + ip netns exec ${n} sysctl -q net/ipv6/conf/default/accept_dad=0 done } @@ -337,8 +339,6 @@ setup_vti() { ${ns_a} ip link set vti${proto}_a up ${ns_b} ip link set vti${proto}_b up - - sleep 1 } setup_vti4() { @@ -375,8 +375,6 @@ setup_vxlan_or_geneve() { ${ns_a} ip link set ${type}_a up ${ns_b} ip link set ${type}_b up - - sleep 1 } setup_geneve4() { @@ -588,8 +586,8 @@ test_pmtu_ipvX() { mtu "${ns_b}" veth_B-R2 1500 # Create route exceptions - ${ns_a} ${ping} -q -M want -i 0.1 -w 2 -s 1800 ${dst1} > /dev/null - ${ns_a} ${ping} -q -M want -i 0.1 -w 2 -s 1800 ${dst2} > /dev/null + ${ns_a} ${ping} -q -M want -i 0.1 -w 1 -s 1800 ${dst1} > /dev/null + ${ns_a} ${ping} -q -M want -i 0.1 -w 1 -s 1800 ${dst2} > /dev/null # Check that exceptions have been created with the correct PMTU pmtu_1="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst1})" @@ -621,7 +619,7 @@ test_pmtu_ipvX() { # Decrease remote MTU on path via R2, get new exception mtu "${ns_r2}" veth_R2-B 400 mtu "${ns_b}" veth_B-R2 400 - ${ns_a} ${ping} -q -M want -i 0.1 -w 2 -s 1400 ${dst2} > /dev/null + ${ns_a} ${ping} -q -M want -i 0.1 -w 1 -s 1400 ${dst2} > /dev/null pmtu_2="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst2})" check_pmtu_value "lock 552" "${pmtu_2}" "exceeding MTU, with MTU < min_pmtu" || return 1 @@ -638,7 +636,7 @@ test_pmtu_ipvX() { check_pmtu_value "1500" "${pmtu_2}" "increasing local MTU" || return 1 # Get new exception - ${ns_a} ${ping} -q -M want -i 0.1 -w 2 -s 1400 ${dst2} > /dev/null + ${ns_a} ${ping} -q -M want -i 0.1 -w 1 -s 1400 ${dst2} > /dev/null pmtu_2="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst2})" check_pmtu_value "lock 552" "${pmtu_2}" "exceeding MTU, with MTU < min_pmtu" || return 1 } @@ -687,7 +685,7 @@ test_pmtu_ipvX_over_vxlanY_or_geneveY_exception() { mtu "${ns_a}" ${type}_a $((${ll_mtu} + 1000)) mtu "${ns_b}" ${type}_b $((${ll_mtu} + 1000)) - ${ns_a} ${ping} -q -M want -i 0.1 -w 2 -s $((${ll_mtu} + 500)) ${dst} > /dev/null + ${ns_a} ${ping} -q -M want -i 0.1 -w 1 -s $((${ll_mtu} + 500)) ${dst} > /dev/null # Check that exception was created pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst})" @@ -767,7 +765,7 @@ test_pmtu_ipvX_over_fouY_or_gueY() { mtu "${ns_a}" ${encap}_a $((${ll_mtu} + 1000)) mtu "${ns_b}" ${encap}_b $((${ll_mtu} + 1000)) - ${ns_a} ${ping} -q -M want -i 0.1 -w 2 -s $((${ll_mtu} + 500)) ${dst} > /dev/null + ${ns_a} ${ping} -q -M want -i 0.1 -w 1 -s $((${ll_mtu} + 500)) ${dst} > /dev/null # Check that exception was created pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${dst})" @@ -825,13 +823,13 @@ test_pmtu_vti4_exception() { # Send DF packet without exceeding link layer MTU, check that no # exception is created - ${ns_a} ping -q -M want -i 0.1 -w 2 -s ${ping_payload} ${tunnel4_b_addr} > /dev/null + ${ns_a} ping -q -M want -i 0.1 -w 1 -s ${ping_payload} ${tunnel4_b_addr} > /dev/null pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${tunnel4_b_addr})" check_pmtu_value "" "${pmtu}" "sending packet smaller than PMTU (IP payload length ${esp_payload_rfc4106})" || return 1 # Now exceed link layer MTU by one byte, check that exception is created # with the right PMTU value - ${ns_a} ping -q -M want -i 0.1 -w 2 -s $((ping_payload + 1)) ${tunnel4_b_addr} > /dev/null + ${ns_a} ping -q -M want -i 0.1 -w 1 -s $((ping_payload + 1)) ${tunnel4_b_addr} > /dev/null pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${tunnel4_b_addr})" check_pmtu_value "${esp_payload_rfc4106}" "${pmtu}" "exceeding PMTU (IP payload length $((esp_payload_rfc4106 + 1)))" } @@ -847,7 +845,7 @@ test_pmtu_vti6_exception() { mtu "${ns_b}" veth_b 4000 mtu "${ns_a}" vti6_a 5000 mtu "${ns_b}" vti6_b 5000 - ${ns_a} ${ping6} -q -i 0.1 -w 2 -s 60000 ${tunnel6_b_addr} > /dev/null + ${ns_a} ${ping6} -q -i 0.1 -w 1 -s 60000 ${tunnel6_b_addr} > /dev/null # Check that exception was created pmtu="$(route_get_dst_pmtu_from_exception "${ns_a}" ${tunnel6_b_addr})" From b3cc4f8a8a411bbf07a7d7213ce9c7571ded38b9 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 25 Feb 2019 16:08:37 +0100 Subject: [PATCH 269/298] selftests: pmtu: add explicit tests for PMTU exceptions cleanup Add a couple of new tests, explicitly checking that the kernel timely releases PMTU exceptions on related device removal. This is mostly a regression test vs the issue fixed by commit f5b51fe804ec ("ipv6: route: purge exception on removal") Only 2 new test cases have been added, instead of extending all the existing ones, because the reproducer requires executing several commands and would slow down too much the tests otherwise. v2 -> v3: - more cleanup, still from Stefano v1 -> v2: - several script cleanups, as suggested by Stefano Signed-off-by: Paolo Abeni Reviewed-by: David Ahern Signed-off-by: David S. Miller --- tools/testing/selftests/net/pmtu.sh | 68 ++++++++++++++++++++++++++++- 1 file changed, 67 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/pmtu.sh b/tools/testing/selftests/net/pmtu.sh index 89aec2fdf4fa..912b2dc50be3 100755 --- a/tools/testing/selftests/net/pmtu.sh +++ b/tools/testing/selftests/net/pmtu.sh @@ -103,6 +103,15 @@ # and check that configured MTU is used on link creation and changes, and # that MTU is properly calculated instead when MTU is not configured from # userspace +# +# - cleanup_ipv4_exception +# Similar to pmtu_ipv4_vxlan4_exception, but explicitly generate PMTU +# exceptions on multiple CPUs and check that the veth device tear-down +# happens in a timely manner +# +# - cleanup_ipv6_exception +# Same as above, but use IPv6 transport from A to B + # Kselftest framework requirement - SKIP code is 4. ksft_skip=4 @@ -135,7 +144,9 @@ tests=" pmtu_vti6_default_mtu vti6: default MTU assignment pmtu_vti4_link_add_mtu vti4: MTU setting on link creation pmtu_vti6_link_add_mtu vti6: MTU setting on link creation - pmtu_vti6_link_change_mtu vti6: MTU changes on link changes" + pmtu_vti6_link_change_mtu vti6: MTU changes on link changes + cleanup_ipv4_exception ipv4: cleanup of cached exceptions + cleanup_ipv6_exception ipv6: cleanup of cached exceptions" NS_A="ns-$(mktemp -u XXXXXX)" NS_B="ns-$(mktemp -u XXXXXX)" @@ -1006,6 +1017,61 @@ test_pmtu_vti6_link_change_mtu() { return ${fail} } +check_command() { + cmd=${1} + + if ! which ${cmd} > /dev/null 2>&1; then + err " missing required command: '${cmd}'" + return 1 + fi + return 0 +} + +test_cleanup_vxlanX_exception() { + outer="${1}" + encap="vxlan" + ll_mtu=4000 + + check_command taskset || return 2 + cpu_list=$(grep -m 2 processor /proc/cpuinfo | cut -d ' ' -f 2) + + setup namespaces routing ${encap}${outer} || return 2 + trace "${ns_a}" ${encap}_a "${ns_b}" ${encap}_b \ + "${ns_a}" veth_A-R1 "${ns_r1}" veth_R1-A \ + "${ns_b}" veth_B-R1 "${ns_r1}" veth_R1-B + + # Create route exception by exceeding link layer MTU + mtu "${ns_a}" veth_A-R1 $((${ll_mtu} + 1000)) + mtu "${ns_r1}" veth_R1-A $((${ll_mtu} + 1000)) + mtu "${ns_b}" veth_B-R1 ${ll_mtu} + mtu "${ns_r1}" veth_R1-B ${ll_mtu} + + mtu "${ns_a}" ${encap}_a $((${ll_mtu} + 1000)) + mtu "${ns_b}" ${encap}_b $((${ll_mtu} + 1000)) + + # Fill exception cache for multiple CPUs (2) + # we can always use inner IPv4 for that + for cpu in ${cpu_list}; do + taskset --cpu-list ${cpu} ${ns_a} ping -q -M want -i 0.1 -w 1 -s $((${ll_mtu} + 500)) ${tunnel4_b_addr} > /dev/null + done + + ${ns_a} ip link del dev veth_A-R1 & + iplink_pid=$! + sleep 1 + if [ "$(cat /proc/${iplink_pid}/cmdline 2>/dev/null | tr -d '\0')" = "iplinkdeldevveth_A-R1" ]; then + err " can't delete veth device in a timely manner, PMTU dst likely leaked" + return 1 + fi +} + +test_cleanup_ipv6_exception() { + test_cleanup_vxlanX_exception 6 +} + +test_cleanup_ipv4_exception() { + test_cleanup_vxlanX_exception 4 +} + usage() { echo echo "$0 [OPTIONS] [TEST]..." From a1fd1ad2552fad9e649eeb85fd79301e2880a886 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 25 Feb 2019 13:55:48 -0800 Subject: [PATCH 270/298] ipv4: Pass original device to ip_rcv_finish_core ip_route_input_rcu expects the original ingress device (e.g., for proper multicast handling). The skb->dev can be changed by l3mdev_ip_rcv, so dev needs to be saved prior to calling it. This was the behavior prior to the listify changes. Fixes: 5fa12739a53d0 ("net: ipv4: listify ip_rcv_finish") Cc: Edward Cree Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/ip_input.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 51d8efba6de2..1f4737b77067 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -307,11 +307,10 @@ static inline bool ip_rcv_options(struct sk_buff *skb) } static int ip_rcv_finish_core(struct net *net, struct sock *sk, - struct sk_buff *skb) + struct sk_buff *skb, struct net_device *dev) { const struct iphdr *iph = ip_hdr(skb); int (*edemux)(struct sk_buff *skb); - struct net_device *dev = skb->dev; struct rtable *rt; int err; @@ -400,6 +399,7 @@ static int ip_rcv_finish_core(struct net *net, struct sock *sk, static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { + struct net_device *dev = skb->dev; int ret; /* if ingress device is enslaved to an L3 master device pass the @@ -409,7 +409,7 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) if (!skb) return NET_RX_SUCCESS; - ret = ip_rcv_finish_core(net, sk, skb); + ret = ip_rcv_finish_core(net, sk, skb, dev); if (ret != NET_RX_DROP) ret = dst_input(skb); return ret; @@ -545,6 +545,7 @@ static void ip_list_rcv_finish(struct net *net, struct sock *sk, INIT_LIST_HEAD(&sublist); list_for_each_entry_safe(skb, next, head, list) { + struct net_device *dev = skb->dev; struct dst_entry *dst; skb_list_del_init(skb); @@ -554,7 +555,7 @@ static void ip_list_rcv_finish(struct net *net, struct sock *sk, skb = l3mdev_ip_rcv(skb); if (!skb) continue; - if (ip_rcv_finish_core(net, sk, skb) == NET_RX_DROP) + if (ip_rcv_finish_core(net, sk, skb, dev) == NET_RX_DROP) continue; dst = skb_dst(skb); From 5578de4834fe0f2a34fedc7374be691443396d1f Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Mon, 25 Feb 2019 19:06:06 -0500 Subject: [PATCH 271/298] netlabel: fix out-of-bounds memory accesses There are two array out-of-bounds memory accesses, one in cipso_v4_map_lvl_valid(), the other in netlbl_bitmap_walk(). Both errors are embarassingly simple, and the fixes are straightforward. As a FYI for anyone backporting this patch to kernels prior to v4.8, you'll want to apply the netlbl_bitmap_walk() patch to cipso_v4_bitmap_walk() as netlbl_bitmap_walk() doesn't exist before Linux v4.8. Reported-by: Jann Horn Fixes: 446fda4f2682 ("[NetLabel]: CIPSOv4 engine") Fixes: 3faa8f982f95 ("netlabel: Move bitmap manipulation functions to the NetLabel core.") Signed-off-by: Paul Moore Signed-off-by: David S. Miller --- net/ipv4/cipso_ipv4.c | 3 ++- net/netlabel/netlabel_kapi.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index eff86a71c1b0..f0165c5f376b 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -667,7 +667,8 @@ static int cipso_v4_map_lvl_valid(const struct cipso_v4_doi *doi_def, u8 level) case CIPSO_V4_MAP_PASS: return 0; case CIPSO_V4_MAP_TRANS: - if (doi_def->map.std->lvl.cipso[level] < CIPSO_V4_INV_LVL) + if ((level < doi_def->map.std->lvl.cipso_size) && + (doi_def->map.std->lvl.cipso[level] < CIPSO_V4_INV_LVL)) return 0; break; } diff --git a/net/netlabel/netlabel_kapi.c b/net/netlabel/netlabel_kapi.c index ea7c67050792..ee3e5b6471a6 100644 --- a/net/netlabel/netlabel_kapi.c +++ b/net/netlabel/netlabel_kapi.c @@ -903,7 +903,8 @@ int netlbl_bitmap_walk(const unsigned char *bitmap, u32 bitmap_len, (state == 0 && (byte & bitmask) == 0)) return bit_spot; - bit_spot++; + if (++bit_spot >= bitmap_len) + return -1; bitmask >>= 1; if (bitmask == 0) { byte = bitmap[++byte_offset]; From 4b6d196c9cec548a6b1cf5bb07b4a8b8d375829d Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 22 Feb 2019 22:54:07 -0800 Subject: [PATCH 272/298] crypto: arm64/chacha - fix chacha_4block_xor_neon() for big endian The change to encrypt a fifth ChaCha block using scalar instructions caused the chacha20-neon, xchacha20-neon, and xchacha12-neon self-tests to start failing on big endian arm64 kernels. The bug is that the keystream block produced in 32-bit scalar registers is directly XOR'd with the data words, which are loaded and stored in native endianness. Thus in big endian mode the data bytes end up XOR'd with the wrong bytes. Fix it by byte-swapping the keystream words in big endian mode. Fixes: 2fe55987b262 ("crypto: arm64/chacha - use combined SIMD/ALU routine for more speed") Signed-off-by: Eric Biggers Reviewed-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm64/crypto/chacha-neon-core.S | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/arch/arm64/crypto/chacha-neon-core.S b/arch/arm64/crypto/chacha-neon-core.S index 021bb9e9784b..bfb80e10ff7b 100644 --- a/arch/arm64/crypto/chacha-neon-core.S +++ b/arch/arm64/crypto/chacha-neon-core.S @@ -532,6 +532,10 @@ ENTRY(chacha_4block_xor_neon) add v3.4s, v3.4s, v19.4s add a2, a2, w8 add a3, a3, w9 +CPU_BE( rev a0, a0 ) +CPU_BE( rev a1, a1 ) +CPU_BE( rev a2, a2 ) +CPU_BE( rev a3, a3 ) ld4r {v24.4s-v27.4s}, [x0], #16 ld4r {v28.4s-v31.4s}, [x0] @@ -552,6 +556,10 @@ ENTRY(chacha_4block_xor_neon) add v7.4s, v7.4s, v23.4s add a6, a6, w8 add a7, a7, w9 +CPU_BE( rev a4, a4 ) +CPU_BE( rev a5, a5 ) +CPU_BE( rev a6, a6 ) +CPU_BE( rev a7, a7 ) // x8[0-3] += s2[0] // x9[0-3] += s2[1] @@ -569,6 +577,10 @@ ENTRY(chacha_4block_xor_neon) add v11.4s, v11.4s, v27.4s add a10, a10, w8 add a11, a11, w9 +CPU_BE( rev a8, a8 ) +CPU_BE( rev a9, a9 ) +CPU_BE( rev a10, a10 ) +CPU_BE( rev a11, a11 ) // x12[0-3] += s3[0] // x13[0-3] += s3[1] @@ -586,6 +598,10 @@ ENTRY(chacha_4block_xor_neon) add v15.4s, v15.4s, v31.4s add a14, a14, w8 add a15, a15, w9 +CPU_BE( rev a12, a12 ) +CPU_BE( rev a13, a13 ) +CPU_BE( rev a14, a14 ) +CPU_BE( rev a15, a15 ) // interleave 32-bit words in state n, n+1 ldp w6, w7, [x2], #64 From f86d17e9efe010b894db231329ee36b24bcc1b24 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 22 Feb 2019 22:54:08 -0800 Subject: [PATCH 273/298] crypto: arm64/chacha - fix hchacha_block_neon() for big endian On big endian arm64 kernels, the xchacha20-neon and xchacha12-neon self-tests fail because hchacha_block_neon() outputs little endian words but the C code expects native endianness. Fix it to output the words in native endianness (which also makes it match the arm32 version). Fixes: cc7cf991e9eb ("crypto: arm64/chacha20 - add XChaCha20 support") Signed-off-by: Eric Biggers Reviewed-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- arch/arm64/crypto/chacha-neon-core.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/crypto/chacha-neon-core.S b/arch/arm64/crypto/chacha-neon-core.S index bfb80e10ff7b..706c4e10e9e2 100644 --- a/arch/arm64/crypto/chacha-neon-core.S +++ b/arch/arm64/crypto/chacha-neon-core.S @@ -158,8 +158,8 @@ ENTRY(hchacha_block_neon) mov w3, w2 bl chacha_permute - st1 {v0.16b}, [x1], #16 - st1 {v3.16b}, [x1] + st1 {v0.4s}, [x1], #16 + st1 {v3.4s}, [x1] ldp x29, x30, [sp], #16 ret From c7c0d8df0b94a67377555a550b8d66ee2ad2f4ed Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sat, 23 Feb 2019 14:20:36 +0100 Subject: [PATCH 274/298] tee: optee: add missing of_node_put after of_device_is_available Add an of_node_put when a tested device node is not available. The semantic patch that fixes this problem is as follows (http://coccinelle.lip6.fr): // @@ identifier f; local idexpression e; expression x; @@ e = f(...); ... when != of_node_put(e) when != x = e when != e = x when any if (<+...of_device_is_available(e)...+>) { ... when != of_node_put(e) ( return e; | + of_node_put(e); return ...; ) } // Fixes: db878f76b9ff ("tee: optee: take DT status property into account") Signed-off-by: Julia Lawall Signed-off-by: Jens Wiklander --- drivers/tee/optee/core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/tee/optee/core.c b/drivers/tee/optee/core.c index e5efce3c08e2..947f9b28de9e 100644 --- a/drivers/tee/optee/core.c +++ b/drivers/tee/optee/core.c @@ -699,8 +699,10 @@ static int __init optee_driver_init(void) return -ENODEV; np = of_find_matching_node(fw_np, optee_match); - if (!np || !of_device_is_available(np)) + if (!np || !of_device_is_available(np)) { + of_node_put(np); return -ENODEV; + } optee = optee_probe(np); of_node_put(np); From 9cd05ad2910b55238e3c720c99ad896dc538301b Mon Sep 17 00:00:00 2001 From: Lan Tianyu Date: Mon, 25 Feb 2019 22:31:14 +0800 Subject: [PATCH 275/298] x86/hyper-v: Fix definition of HV_MAX_FLUSH_REP_COUNT The max flush rep count of HvFlushGuestPhysicalAddressList hypercall is equal with how many entries of union hv_gpa_page_range can be populated into the input parameter page. The code lacks parenthesis around PAGE_SIZE - 2 * sizeof(u64) which results in bogus computations. Add them. Fixes: cc4edae4b924 ("x86/hyper-v: Add HvFlushGuestAddressList hypercall support") Signed-off-by: Lan Tianyu Signed-off-by: Thomas Gleixner Cc: kys@microsoft.com Cc: haiyangz@microsoft.com Cc: sthemmin@microsoft.com Cc: sashal@kernel.org Cc: bp@alien8.de Cc: hpa@zytor.com Cc: gregkh@linuxfoundation.org Cc: devel@linuxdriverproject.org Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20190225143114.5149-1-Tianyu.Lan@microsoft.com --- arch/x86/include/asm/hyperv-tlfs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h index 705dafc2d11a..2bdbbbcfa393 100644 --- a/arch/x86/include/asm/hyperv-tlfs.h +++ b/arch/x86/include/asm/hyperv-tlfs.h @@ -841,7 +841,7 @@ union hv_gpa_page_range { * count is equal with how many entries of union hv_gpa_page_range can * be populated into the input parameter page. */ -#define HV_MAX_FLUSH_REP_COUNT (PAGE_SIZE - 2 * sizeof(u64) / \ +#define HV_MAX_FLUSH_REP_COUNT ((PAGE_SIZE - 2 * sizeof(u64)) / \ sizeof(union hv_gpa_page_range)) struct hv_guest_mapping_flush_list { From e30be063d6dbcc0f18b1eb25fa709fdef89201fb Mon Sep 17 00:00:00 2001 From: BOUGH CHEN Date: Thu, 28 Feb 2019 10:15:42 +0000 Subject: [PATCH 276/298] mmc: sdhci-esdhc-imx: correct the fix of ERR004536 Commit 18094430d6b5 ("mmc: sdhci-esdhc-imx: add ADMA Length Mismatch errata fix") involve the fix of ERR004536, but the fix is incorrect. Double confirm with IC, need to clear the bit 7 of register 0x6c rather than set this bit 7. Here is the definition of bit 7 of 0x6c: 0: enable the new IC fix for ERR004536 1: do not use the IC fix, keep the same as before Find this issue on i.MX845s-evk board when enable CMDQ, and let system in heavy loading. root@imx8mmevk:~# dd if=/dev/mmcblk2 of=/dev/null bs=1M & root@imx8mmevk:~# memtester 1000M > /dev/zero & root@imx8mmevk:~# [ 139.897220] mmc2: cqhci: timeout for tag 16 [ 139.901417] mmc2: cqhci: ============ CQHCI REGISTER DUMP =========== [ 139.907862] mmc2: cqhci: Caps: 0x0000310a | Version: 0x00000510 [ 139.914311] mmc2: cqhci: Config: 0x00001001 | Control: 0x00000000 [ 139.920753] mmc2: cqhci: Int stat: 0x00000000 | Int enab: 0x00000006 [ 139.927193] mmc2: cqhci: Int sig: 0x00000006 | Int Coal: 0x00000000 [ 139.933634] mmc2: cqhci: TDL base: 0x7809c000 | TDL up32: 0x00000000 [ 139.940073] mmc2: cqhci: Doorbell: 0x00030000 | TCN: 0x00000000 [ 139.946518] mmc2: cqhci: Dev queue: 0x00010000 | Dev Pend: 0x00010000 [ 139.952967] mmc2: cqhci: Task clr: 0x00000000 | SSC1: 0x00011000 [ 139.959411] mmc2: cqhci: SSC2: 0x00000001 | DCMD rsp: 0x00000000 [ 139.965857] mmc2: cqhci: RED mask: 0xfdf9a080 | TERRI: 0x00000000 [ 139.972308] mmc2: cqhci: Resp idx: 0x0000002e | Resp arg: 0x00000900 [ 139.978761] mmc2: sdhci: ============ SDHCI REGISTER DUMP =========== [ 139.985214] mmc2: sdhci: Sys addr: 0xb2c19000 | Version: 0x00000002 [ 139.991669] mmc2: sdhci: Blk size: 0x00000200 | Blk cnt: 0x00000400 [ 139.998127] mmc2: sdhci: Argument: 0x40110400 | Trn mode: 0x00000033 [ 140.004618] mmc2: sdhci: Present: 0x01088a8f | Host ctl: 0x00000030 [ 140.011113] mmc2: sdhci: Power: 0x00000002 | Blk gap: 0x00000080 [ 140.017583] mmc2: sdhci: Wake-up: 0x00000008 | Clock: 0x0000000f [ 140.024039] mmc2: sdhci: Timeout: 0x0000008f | Int stat: 0x00000000 [ 140.030497] mmc2: sdhci: Int enab: 0x107f4000 | Sig enab: 0x107f4000 [ 140.036972] mmc2: sdhci: AC12 err: 0x00000000 | Slot int: 0x00000502 [ 140.043426] mmc2: sdhci: Caps: 0x07eb0000 | Caps_1: 0x8000b407 [ 140.049867] mmc2: sdhci: Cmd: 0x00002c1a | Max curr: 0x00ffffff [ 140.056314] mmc2: sdhci: Resp[0]: 0x00000900 | Resp[1]: 0xffffffff [ 140.062755] mmc2: sdhci: Resp[2]: 0x328f5903 | Resp[3]: 0x00d00f00 [ 140.069195] mmc2: sdhci: Host ctl2: 0x00000008 [ 140.073640] mmc2: sdhci: ADMA Err: 0x00000007 | ADMA Ptr: 0x7809c108 [ 140.080079] mmc2: sdhci: ============================================ [ 140.086662] mmc2: running CQE recovery Fixes: 18094430d6b5 ("mmc: sdhci-esdhc-imx: add ADMA Length Mismatch errata fix") Signed-off-by: Haibo Chen Cc: stable@vger.kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-esdhc-imx.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/mmc/host/sdhci-esdhc-imx.c b/drivers/mmc/host/sdhci-esdhc-imx.c index d0d319398a54..00d41b312c79 100644 --- a/drivers/mmc/host/sdhci-esdhc-imx.c +++ b/drivers/mmc/host/sdhci-esdhc-imx.c @@ -1095,11 +1095,12 @@ static void sdhci_esdhc_imx_hwinit(struct sdhci_host *host) writel(readl(host->ioaddr + SDHCI_HOST_CONTROL) | ESDHC_BURST_LEN_EN_INCR, host->ioaddr + SDHCI_HOST_CONTROL); + /* - * erratum ESDHC_FLAG_ERR004536 fix for MX6Q TO1.2 and MX6DL - * TO1.1, it's harmless for MX6SL - */ - writel(readl(host->ioaddr + 0x6c) | BIT(7), + * erratum ESDHC_FLAG_ERR004536 fix for MX6Q TO1.2 and MX6DL + * TO1.1, it's harmless for MX6SL + */ + writel(readl(host->ioaddr + 0x6c) & ~BIT(7), host->ioaddr + 0x6c); /* disable DLL_CTRL delay line settings */ From 8ed0579c12b2fe56a1fac2f712f58fc26c1dc49b Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 28 Feb 2019 16:34:37 +0100 Subject: [PATCH 277/298] kvm: properly check debugfs dentry before using it MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit debugfs can now report an error code if something went wrong instead of just NULL. So if the return value is to be used as a "real" dentry, it needs to be checked if it is an error before dereferencing it. This is now happening because of ff9fb72bc077 ("debugfs: return error values, not NULL"). syzbot has found a way to trigger multiple debugfs files attempting to be created, which fails, and then the error code gets passed to dentry_path_raw() which obviously does not like it. Reported-by: Eric Biggers Reported-and-tested-by: syzbot+7857962b4d45e602b8ad@syzkaller.appspotmail.com Cc: "Radim Krčmář" Cc: kvm@vger.kernel.org Acked-by: Paolo Bonzini Signed-off-by: Greg Kroah-Hartman Signed-off-by: Linus Torvalds --- virt/kvm/kvm_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 585845203db8..076bc38963bf 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -4044,7 +4044,7 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm) } add_uevent_var(env, "PID=%d", kvm->userspace_pid); - if (kvm->debugfs_dentry) { + if (!IS_ERR_OR_NULL(kvm->debugfs_dentry)) { char *tmp, *p = kmalloc(PATH_MAX, GFP_KERNEL); if (p) { From 5845f706388a4cde0f6b80f9e5d33527e942b7d9 Mon Sep 17 00:00:00 2001 From: Sheng Lan Date: Thu, 28 Feb 2019 18:47:58 +0800 Subject: [PATCH 278/298] net: netem: fix skb length BUG_ON in __skb_to_sgvec It can be reproduced by following steps: 1. virtio_net NIC is configured with gso/tso on 2. configure nginx as http server with an index file bigger than 1M bytes 3. use tc netem to produce duplicate packets and delay: tc qdisc add dev eth0 root netem delay 100ms 10ms 30% duplicate 90% 4. continually curl the nginx http server to get index file on client 5. BUG_ON is seen quickly [10258690.371129] kernel BUG at net/core/skbuff.c:4028! [10258690.371748] invalid opcode: 0000 [#1] SMP PTI [10258690.372094] CPU: 5 PID: 0 Comm: swapper/5 Tainted: G W 5.0.0-rc6 #2 [10258690.372094] RSP: 0018:ffffa05797b43da0 EFLAGS: 00010202 [10258690.372094] RBP: 00000000000005ea R08: 0000000000000000 R09: 00000000000005ea [10258690.372094] R10: ffffa0579334d800 R11: 00000000000002c0 R12: 0000000000000002 [10258690.372094] R13: 0000000000000000 R14: ffffa05793122900 R15: ffffa0578f7cb028 [10258690.372094] FS: 0000000000000000(0000) GS:ffffa05797b40000(0000) knlGS:0000000000000000 [10258690.372094] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [10258690.372094] CR2: 00007f1a6dc00868 CR3: 000000001000e000 CR4: 00000000000006e0 [10258690.372094] Call Trace: [10258690.372094] [10258690.372094] skb_to_sgvec+0x11/0x40 [10258690.372094] start_xmit+0x38c/0x520 [virtio_net] [10258690.372094] dev_hard_start_xmit+0x9b/0x200 [10258690.372094] sch_direct_xmit+0xff/0x260 [10258690.372094] __qdisc_run+0x15e/0x4e0 [10258690.372094] net_tx_action+0x137/0x210 [10258690.372094] __do_softirq+0xd6/0x2a9 [10258690.372094] irq_exit+0xde/0xf0 [10258690.372094] smp_apic_timer_interrupt+0x74/0x140 [10258690.372094] apic_timer_interrupt+0xf/0x20 [10258690.372094] In __skb_to_sgvec(), the skb->len is not equal to the sum of the skb's linear data size and nonlinear data size, thus BUG_ON triggered. Because the skb is cloned and a part of nonlinear data is split off. Duplicate packet is cloned in netem_enqueue() and may be delayed some time in qdisc. When qdisc len reached the limit and returns NET_XMIT_DROP, the skb will be retransmit later in write queue. the skb will be fragmented by tso_fragment(), the limit size that depends on cwnd and mss decrease, the skb's nonlinear data will be split off. The length of the skb cloned by netem will not be updated. When we use virtio_net NIC and invoke skb_to_sgvec(), the BUG_ON trigger. To fix it, netem returns NET_XMIT_SUCCESS to upper stack when it clones a duplicate packet. Fixes: 35d889d1 ("sch_netem: fix skb leak in netem_enqueue()") Signed-off-by: Sheng Lan Reported-by: Qin Ji Suggested-by: Eric Dumazet Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/sched/sch_netem.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 75046ec72144..cc9d8133afcd 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -447,6 +447,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, int nb = 0; int count = 1; int rc = NET_XMIT_SUCCESS; + int rc_drop = NET_XMIT_DROP; /* Do not fool qdisc_drop_all() */ skb->prev = NULL; @@ -486,6 +487,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, q->duplicate = 0; rootq->enqueue(skb2, rootq, to_free); q->duplicate = dupsave; + rc_drop = NET_XMIT_SUCCESS; } /* @@ -498,7 +500,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, if (skb_is_gso(skb)) { segs = netem_segment(skb, sch, to_free); if (!segs) - return NET_XMIT_DROP; + return rc_drop; } else { segs = skb; } @@ -521,8 +523,10 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, 1<<(prandom_u32() % 8); } - if (unlikely(sch->q.qlen >= sch->limit)) - return qdisc_drop_all(skb, sch, to_free); + if (unlikely(sch->q.qlen >= sch->limit)) { + qdisc_drop_all(skb, sch, to_free); + return rc_drop; + } qdisc_qstats_backlog_inc(sch, skb); From ac5105052dc8be5cef34d79e1f4186d39b2f3ca3 Mon Sep 17 00:00:00 2001 From: Matthias Maennich Date: Thu, 28 Feb 2019 11:36:52 +0000 Subject: [PATCH 279/298] sctp: chunk.c: correct format string for size_t in printk According to Documentation/core-api/printk-formats.rst, size_t should be printed with %zu, rather than %Zu. In addition, using %Zu triggers a warning on clang (-Wformat-extra-args): net/sctp/chunk.c:196:25: warning: data argument not used by format string [-Wformat-extra-args] __func__, asoc, max_data); ~~~~~~~~~~~~~~~~^~~~~~~~~ ./include/linux/printk.h:440:49: note: expanded from macro 'pr_warn_ratelimited' printk_ratelimited(KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~ ./include/linux/printk.h:424:17: note: expanded from macro 'printk_ratelimited' printk(fmt, ##__VA_ARGS__); \ ~~~ ^ Fixes: 5b5e0928f742 ("lib/vsprintf.c: remove %Z support") Link: https://github.com/ClangBuiltLinux/linux/issues/378 Signed-off-by: Matthias Maennich Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/chunk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c index 64bef313d436..5cb7c1ff97e9 100644 --- a/net/sctp/chunk.c +++ b/net/sctp/chunk.c @@ -192,7 +192,7 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc, if (unlikely(!max_data)) { max_data = sctp_min_frag_point(sctp_sk(asoc->base.sk), sctp_datachk_len(&asoc->stream)); - pr_warn_ratelimited("%s: asoc:%p frag_point is zero, forcing max_data to default minimum (%Zu)", + pr_warn_ratelimited("%s: asoc:%p frag_point is zero, forcing max_data to default minimum (%zu)", __func__, asoc, max_data); } From 99e87f56b48f490fb16b6e0f74691c1e664dea95 Mon Sep 17 00:00:00 2001 From: Igor Druzhinin Date: Thu, 28 Feb 2019 12:48:03 +0000 Subject: [PATCH 280/298] xen-netback: fix occasional leak of grant ref mappings under memory pressure Zero-copy callback flag is not yet set on frag list skb at the moment xenvif_handle_frag_list() returns -ENOMEM. This eventually results in leaking grant ref mappings since xenvif_zerocopy_callback() is never called for these fragments. Those eventually build up and cause Xen to kill Dom0 as the slots get reused for new mappings: "d0v0 Attempt to implicitly unmap a granted PTE c010000329fce005" That behavior is observed under certain workloads where sudden spikes of page cache writes coexist with active atomic skb allocations from network traffic. Additionally, rework the logic to deal with frag_list deallocation in a single place. Signed-off-by: Paul Durrant Signed-off-by: Igor Druzhinin Acked-by: Wei Liu Signed-off-by: David S. Miller --- drivers/net/xen-netback/netback.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c index 80aae3a32c2a..f09948b009dd 100644 --- a/drivers/net/xen-netback/netback.c +++ b/drivers/net/xen-netback/netback.c @@ -1072,11 +1072,6 @@ static int xenvif_handle_frag_list(struct xenvif_queue *queue, struct sk_buff *s skb_frag_size_set(&frags[i], len); } - /* Copied all the bits from the frag list -- free it. */ - skb_frag_list_init(skb); - xenvif_skb_zerocopy_prepare(queue, nskb); - kfree_skb(nskb); - /* Release all the original (foreign) frags. */ for (f = 0; f < skb_shinfo(skb)->nr_frags; f++) skb_frag_unref(skb, f); @@ -1145,6 +1140,8 @@ static int xenvif_tx_submit(struct xenvif_queue *queue) xenvif_fill_frags(queue, skb); if (unlikely(skb_has_frag_list(skb))) { + struct sk_buff *nskb = skb_shinfo(skb)->frag_list; + xenvif_skb_zerocopy_prepare(queue, nskb); if (xenvif_handle_frag_list(queue, skb)) { if (net_ratelimit()) netdev_err(queue->vif->dev, @@ -1153,6 +1150,9 @@ static int xenvif_tx_submit(struct xenvif_queue *queue) kfree_skb(skb); continue; } + /* Copied all the bits from the frag list -- free it. */ + skb_frag_list_init(skb); + kfree_skb(nskb); } skb->dev = queue->vif->dev; From a2288d4e355992d369c50c45d017a85f6061ff71 Mon Sep 17 00:00:00 2001 From: Igor Druzhinin Date: Thu, 28 Feb 2019 14:11:26 +0000 Subject: [PATCH 281/298] xen-netback: don't populate the hash cache on XenBus disconnect Occasionally, during the disconnection procedure on XenBus which includes hash cache deinitialization there might be some packets still in-flight on other processors. Handling of these packets includes hashing and hash cache population that finally results in hash cache data structure corruption. In order to avoid this we prevent hashing of those packets if there are no queues initialized. In that case RCU protection of queues guards the hash cache as well. Signed-off-by: Igor Druzhinin Reviewed-by: Paul Durrant Signed-off-by: David S. Miller --- drivers/net/xen-netback/hash.c | 2 ++ drivers/net/xen-netback/interface.c | 7 +++++++ 2 files changed, 9 insertions(+) diff --git a/drivers/net/xen-netback/hash.c b/drivers/net/xen-netback/hash.c index 0ccb021f1e78..10d580c3dea3 100644 --- a/drivers/net/xen-netback/hash.c +++ b/drivers/net/xen-netback/hash.c @@ -454,6 +454,8 @@ void xenvif_init_hash(struct xenvif *vif) if (xenvif_hash_cache_size == 0) return; + BUG_ON(vif->hash.cache.count); + spin_lock_init(&vif->hash.cache.lock); INIT_LIST_HEAD(&vif->hash.cache.list); } diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c index 182d6770f102..6da12518e693 100644 --- a/drivers/net/xen-netback/interface.c +++ b/drivers/net/xen-netback/interface.c @@ -153,6 +153,13 @@ static u16 xenvif_select_queue(struct net_device *dev, struct sk_buff *skb, { struct xenvif *vif = netdev_priv(dev); unsigned int size = vif->hash.size; + unsigned int num_queues; + + /* If queues are not set up internally - always return 0 + * as the packet going to be dropped anyway */ + num_queues = READ_ONCE(vif->num_queues); + if (num_queues < 1) + return 0; if (vif->hash.alg == XEN_NETIF_CTRL_HASH_ALGORITHM_NONE) return fallback(dev, skb, NULL) % dev->real_num_tx_queues; From 6e46e2d821bb22b285ae8187959096b65d063b0d Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Thu, 28 Feb 2019 18:14:03 +0100 Subject: [PATCH 282/298] net: dsa: mv88e6xxx: Fix u64 statistics The switch maintains u64 counters for the number of octets sent and received. These are kept as two u32's which need to be combined. Fix the combing, which wrongly worked on u16's. Fixes: 80c4627b2719 ("dsa: mv88x6xxx: Refactor getting a single statistic") Reported-by: Chris Healy Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/dsa/mv88e6xxx/chip.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index 12fd7ce3f1ff..9d36d2b5fbc6 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -896,7 +896,7 @@ static uint64_t _mv88e6xxx_get_ethtool_stat(struct mv88e6xxx_chip *chip, default: return U64_MAX; } - value = (((u64)high) << 16) | low; + value = (((u64)high) << 32) | low; return value; } From d235c48b40d399328585a68f3f9bf7cc3062d586 Mon Sep 17 00:00:00 2001 From: Maxime Chevallier Date: Thu, 28 Feb 2019 22:14:33 +0100 Subject: [PATCH 283/298] net: dsa: mv88e6xxx: power serdes on/off for 10G interfaces on 6390X Upon setting the cmode on 6390 and 6390X, the associated serdes interfaces must be powered off/on. Both 6390X and 6390 share code to do so, but it currently uses the 6390 specific helper mv88e6390_serdes_power() to disable and enable the serdes interface. This call will fail silently on 6390X when trying so set a 10G interface such as XAUI or RXAUI, since mv88e6390_serdes_power() internally grabs the lane number based on modes supported by the 6390, and returns 0 when getting -ENODEV as a lane number. Using mv88e6390x_serdes_power() should be safe here, since we explicitly rule-out all ports but the 9 and 10, and because modes supported by 6390 ports 9 and 10 are a subset of those supported on 6390X. This was tested on 6390X using RXAUI mode. Fixes: 364e9d7776a3 ("net: dsa: mv88e6xxx: Power on/off SERDES on cmode change") Signed-off-by: Maxime Chevallier Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/dsa/mv88e6xxx/port.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/dsa/mv88e6xxx/port.c b/drivers/net/dsa/mv88e6xxx/port.c index ebd26b6a93e6..4b02c96f6786 100644 --- a/drivers/net/dsa/mv88e6xxx/port.c +++ b/drivers/net/dsa/mv88e6xxx/port.c @@ -408,7 +408,7 @@ int mv88e6390x_port_set_cmode(struct mv88e6xxx_chip *chip, int port, return err; } - err = mv88e6390_serdes_power(chip, port, false); + err = mv88e6390x_serdes_power(chip, port, false); if (err) return err; @@ -424,7 +424,7 @@ int mv88e6390x_port_set_cmode(struct mv88e6xxx_chip *chip, int port, if (err) return err; - err = mv88e6390_serdes_power(chip, port, true); + err = mv88e6390x_serdes_power(chip, port, true); if (err) return err; From 352d20d611414715353ee65fc206ee57ab1a6984 Mon Sep 17 00:00:00 2001 From: Peng Sun Date: Wed, 27 Feb 2019 22:36:25 +0800 Subject: [PATCH 284/298] bpf: drop refcount if bpf_map_new_fd() fails in map_create() In bpf/syscall.c, map_create() first set map->usercnt to 1, a file descriptor is supposed to return to userspace. When bpf_map_new_fd() fails, drop the refcount. Fixes: bd5f5f4ecb78 ("bpf: Add BPF_MAP_GET_FD_BY_ID") Signed-off-by: Peng Sun Acked-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/syscall.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index f98328abe8fa..84470d1480aa 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -559,12 +559,12 @@ static int map_create(union bpf_attr *attr) err = bpf_map_new_fd(map, f_flags); if (err < 0) { /* failed to allocate fd. - * bpf_map_put() is needed because the above + * bpf_map_put_with_uref() is needed because the above * bpf_map_alloc_id() has published the map * to the userspace and the userspace may * have refcnt-ed it through BPF_MAP_GET_FD_BY_ID. */ - bpf_map_put(map); + bpf_map_put_with_uref(map); return err; } From 6baec880d7a53cbc2841123e56ee31e830df9b49 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 28 Feb 2019 16:21:58 -0800 Subject: [PATCH 285/298] kasan: turn off asan-stack for clang-8 and earlier Building an arm64 allmodconfig kernel with clang results in over 140 warnings about overly large stack frames, the worst ones being: drivers/gpu/drm/panel/panel-sitronix-st7789v.c:196:12: error: stack frame size of 20224 bytes in function 'st7789v_prepare' drivers/video/fbdev/omap2/omapfb/displays/panel-tpo-td028ttec1.c:196:12: error: stack frame size of 13120 bytes in function 'td028ttec1_panel_enable' drivers/usb/host/max3421-hcd.c:1395:1: error: stack frame size of 10048 bytes in function 'max3421_spi_thread' drivers/net/wan/slic_ds26522.c:209:12: error: stack frame size of 9664 bytes in function 'slic_ds26522_probe' drivers/crypto/ccp/ccp-ops.c:2434:5: error: stack frame size of 8832 bytes in function 'ccp_run_cmd' drivers/media/dvb-frontends/stv0367.c:1005:12: error: stack frame size of 7840 bytes in function 'stv0367ter_algo' None of these happen with gcc today, and almost all of these are the result of a single known issue in llvm. Hopefully it will eventually get fixed with the clang-9 release. In the meantime, the best idea I have is to turn off asan-stack for clang-8 and earlier, so we can produce a kernel that is safe to run. I have posted three patches that address the frame overflow warnings that are not addressed by turning off asan-stack, so in combination with this change, we get much closer to a clean allmodconfig build, which in turn is necessary to do meaningful build regression testing. It is still possible to turn on the CONFIG_ASAN_STACK option on all versions of clang, and it's always enabled for gcc, but when CONFIG_COMPILE_TEST is set, the option remains invisible, so allmodconfig and randconfig builds (which are normally done with a forced CONFIG_COMPILE_TEST) will still result in a mostly clean build. Link: http://lkml.kernel.org/r/20190222222950.3997333-1-arnd@arndb.de Link: https://bugs.llvm.org/show_bug.cgi?id=38809 Signed-off-by: Arnd Bergmann Reviewed-by: Qian Cai Reviewed-by: Mark Brown Acked-by: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Nick Desaulniers Cc: Kostya Serebryany Cc: Andrey Konovalov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Kconfig.kasan | 22 ++++++++++++++++++++++ scripts/Makefile.kasan | 2 +- 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan index d8c474b6691e..9737059ec58b 100644 --- a/lib/Kconfig.kasan +++ b/lib/Kconfig.kasan @@ -113,6 +113,28 @@ config KASAN_INLINE endchoice +config KASAN_STACK_ENABLE + bool "Enable stack instrumentation (unsafe)" if CC_IS_CLANG && !COMPILE_TEST + default !(CLANG_VERSION < 90000) + depends on KASAN + help + The LLVM stack address sanitizer has a know problem that + causes excessive stack usage in a lot of functions, see + https://bugs.llvm.org/show_bug.cgi?id=38809 + Disabling asan-stack makes it safe to run kernels build + with clang-8 with KASAN enabled, though it loses some of + the functionality. + This feature is always disabled when compile-testing with clang-8 + or earlier to avoid cluttering the output in stack overflow + warnings, but clang-8 users can still enable it for builds without + CONFIG_COMPILE_TEST. On gcc and later clang versions it is + assumed to always be safe to use and enabled by default. + +config KASAN_STACK + int + default 1 if KASAN_STACK_ENABLE || CC_IS_GCC + default 0 + config KASAN_S390_4_LEVEL_PAGING bool "KASan: use 4-level paging" depends on KASAN && S390 diff --git a/scripts/Makefile.kasan b/scripts/Makefile.kasan index 25c259df8ffa..6deabedc67fc 100644 --- a/scripts/Makefile.kasan +++ b/scripts/Makefile.kasan @@ -26,7 +26,7 @@ else CFLAGS_KASAN := $(CFLAGS_KASAN_SHADOW) \ $(call cc-param,asan-globals=1) \ $(call cc-param,asan-instrumentation-with-call-threshold=$(call_threshold)) \ - $(call cc-param,asan-stack=1) \ + $(call cc-param,asan-stack=$(CONFIG_KASAN_STACK)) \ $(call cc-param,asan-use-after-scope=1) \ $(call cc-param,asan-instrument-allocas=1) endif From cb6acd01e2e43fd8bad11155752b7699c3d0fb76 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Thu, 28 Feb 2019 16:22:02 -0800 Subject: [PATCH 286/298] hugetlbfs: fix races and page leaks during migration hugetlb pages should only be migrated if they are 'active'. The routines set/clear_page_huge_active() modify the active state of hugetlb pages. When a new hugetlb page is allocated at fault time, set_page_huge_active is called before the page is locked. Therefore, another thread could race and migrate the page while it is being added to page table by the fault code. This race is somewhat hard to trigger, but can be seen by strategically adding udelay to simulate worst case scheduling behavior. Depending on 'how' the code races, various BUG()s could be triggered. To address this issue, simply delay the set_page_huge_active call until after the page is successfully added to the page table. Hugetlb pages can also be leaked at migration time if the pages are associated with a file in an explicitly mounted hugetlbfs filesystem. For example, consider a two node system with 4GB worth of huge pages available. A program mmaps a 2G file in a hugetlbfs filesystem. It then migrates the pages associated with the file from one node to another. When the program exits, huge page counts are as follows: node0 1024 free_hugepages 1024 nr_hugepages node1 0 free_hugepages 1024 nr_hugepages Filesystem Size Used Avail Use% Mounted on nodev 4.0G 2.0G 2.0G 50% /var/opt/hugepool That is as expected. 2G of huge pages are taken from the free_hugepages counts, and 2G is the size of the file in the explicitly mounted filesystem. If the file is then removed, the counts become: node0 1024 free_hugepages 1024 nr_hugepages node1 1024 free_hugepages 1024 nr_hugepages Filesystem Size Used Avail Use% Mounted on nodev 4.0G 2.0G 2.0G 50% /var/opt/hugepool Note that the filesystem still shows 2G of pages used, while there actually are no huge pages in use. The only way to 'fix' the filesystem accounting is to unmount the filesystem If a hugetlb page is associated with an explicitly mounted filesystem, this information in contained in the page_private field. At migration time, this information is not preserved. To fix, simply transfer page_private from old to new page at migration time if necessary. There is a related race with removing a huge page from a file and migration. When a huge page is removed from the pagecache, the page_mapping() field is cleared, yet page_private remains set until the page is actually freed by free_huge_page(). A page could be migrated while in this state. However, since page_mapping() is not set the hugetlbfs specific routine to transfer page_private is not called and we leak the page count in the filesystem. To fix that, check for this condition before migrating a huge page. If the condition is detected, return EBUSY for the page. Link: http://lkml.kernel.org/r/74510272-7319-7372-9ea6-ec914734c179@oracle.com Link: http://lkml.kernel.org/r/20190212221400.3512-1-mike.kravetz@oracle.com Fixes: bcc54222309c ("mm: hugetlb: introduce page_huge_active") Signed-off-by: Mike Kravetz Reviewed-by: Naoya Horiguchi Cc: Michal Hocko Cc: Andrea Arcangeli Cc: "Kirill A . Shutemov" Cc: Mel Gorman Cc: Davidlohr Bueso Cc: [mike.kravetz@oracle.com: v2] Link: http://lkml.kernel.org/r/7534d322-d782-8ac6-1c8d-a8dc380eb3ab@oracle.com [mike.kravetz@oracle.com: update comment and changelog] Link: http://lkml.kernel.org/r/420bcfd6-158b-38e4-98da-26d0cd85bd01@oracle.com Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 12 ++++++++++++ mm/hugetlb.c | 16 +++++++++++++--- mm/migrate.c | 11 +++++++++++ 3 files changed, 36 insertions(+), 3 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 32920a10100e..a7fa037b876b 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -859,6 +859,18 @@ static int hugetlbfs_migrate_page(struct address_space *mapping, rc = migrate_huge_page_move_mapping(mapping, newpage, page); if (rc != MIGRATEPAGE_SUCCESS) return rc; + + /* + * page_private is subpool pointer in hugetlb pages. Transfer to + * new page. PagePrivate is not associated with page_private for + * hugetlb pages and can not be set here as only page_huge_active + * pages can be migrated. + */ + if (page_private(page)) { + set_page_private(newpage, page_private(page)); + set_page_private(page, 0); + } + if (mode != MIGRATE_SYNC_NO_COPY) migrate_page_copy(newpage, page); else diff --git a/mm/hugetlb.c b/mm/hugetlb.c index afef61656c1e..8dfdffc34a99 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3624,7 +3624,6 @@ static vm_fault_t hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, copy_user_huge_page(new_page, old_page, address, vma, pages_per_huge_page(h)); __SetPageUptodate(new_page); - set_page_huge_active(new_page); mmu_notifier_range_init(&range, mm, haddr, haddr + huge_page_size(h)); mmu_notifier_invalidate_range_start(&range); @@ -3645,6 +3644,7 @@ static vm_fault_t hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, make_huge_pte(vma, new_page, 1)); page_remove_rmap(old_page, true); hugepage_add_new_anon_rmap(new_page, vma, haddr); + set_page_huge_active(new_page); /* Make the old page be freed below */ new_page = old_page; } @@ -3729,6 +3729,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, pte_t new_pte; spinlock_t *ptl; unsigned long haddr = address & huge_page_mask(h); + bool new_page = false; /* * Currently, we are forced to kill the process in the event the @@ -3790,7 +3791,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, } clear_huge_page(page, address, pages_per_huge_page(h)); __SetPageUptodate(page); - set_page_huge_active(page); + new_page = true; if (vma->vm_flags & VM_MAYSHARE) { int err = huge_add_to_page_cache(page, mapping, idx); @@ -3861,6 +3862,15 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, } spin_unlock(ptl); + + /* + * Only make newly allocated pages active. Existing pages found + * in the pagecache could be !page_huge_active() if they have been + * isolated for migration. + */ + if (new_page) + set_page_huge_active(page); + unlock_page(page); out: return ret; @@ -4095,7 +4105,6 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, * the set_pte_at() write. */ __SetPageUptodate(page); - set_page_huge_active(page); mapping = dst_vma->vm_file->f_mapping; idx = vma_hugecache_offset(h, dst_vma, dst_addr); @@ -4163,6 +4172,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, update_mmu_cache(dst_vma, dst_addr, dst_pte); spin_unlock(ptl); + set_page_huge_active(page); if (vm_shared) unlock_page(page); ret = 0; diff --git a/mm/migrate.c b/mm/migrate.c index d4fd680be3b0..181f5d2718a9 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1315,6 +1315,16 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, lock_page(hpage); } + /* + * Check for pages which are in the process of being freed. Without + * page_mapping() set, hugetlbfs specific move page routine will not + * be called and we could leak usage counts for subpools. + */ + if (page_private(hpage) && !page_mapping(hpage)) { + rc = -EBUSY; + goto out_unlock; + } + if (PageAnon(hpage)) anon_vma = page_get_anon_vma(hpage); @@ -1345,6 +1355,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, put_new_page = NULL; } +out_unlock: unlock_page(hpage); out: if (rc != -EAGAIN) From ada641ff6ed34a125fbf62ec79733352ffd4305d Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 26 Feb 2019 15:27:43 +0100 Subject: [PATCH 287/298] selftests: fixes for UDP GRO The current implementation for UDP GRO tests is racy: the receiver may flush the RX queue while the sending is still transmitting and incorrectly report RX errors, with a wrong number of packet received. Add explicit timeouts to the receiver for both connection activation (first packet received for UDP) and reception completion, so that in the above critical scenario the receiver will wait for the transfer completion. Fixes: 3327a9c46352 ("selftests: add functionals test for UDP GRO") Signed-off-by: Paolo Abeni Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- tools/testing/selftests/net/udpgro.sh | 8 ++-- tools/testing/selftests/net/udpgso_bench_rx.c | 42 +++++++++++++------ 2 files changed, 33 insertions(+), 17 deletions(-) diff --git a/tools/testing/selftests/net/udpgro.sh b/tools/testing/selftests/net/udpgro.sh index aeac53a99aeb..ac2a30be9b32 100755 --- a/tools/testing/selftests/net/udpgro.sh +++ b/tools/testing/selftests/net/udpgro.sh @@ -37,7 +37,7 @@ run_one() { cfg_veth - ip netns exec "${PEER_NS}" ./udpgso_bench_rx ${rx_args} && \ + ip netns exec "${PEER_NS}" ./udpgso_bench_rx -C 1000 -R 10 ${rx_args} && \ echo "ok" || \ echo "failed" & @@ -81,7 +81,7 @@ run_one_nat() { # will land on the 'plain' one ip netns exec "${PEER_NS}" ./udpgso_bench_rx -G ${family} -b ${addr1} -n 0 & pid=$! - ip netns exec "${PEER_NS}" ./udpgso_bench_rx ${family} -b ${addr2%/*} ${rx_args} && \ + ip netns exec "${PEER_NS}" ./udpgso_bench_rx -C 1000 -R 10 ${family} -b ${addr2%/*} ${rx_args} && \ echo "ok" || \ echo "failed"& @@ -99,8 +99,8 @@ run_one_2sock() { cfg_veth - ip netns exec "${PEER_NS}" ./udpgso_bench_rx ${rx_args} -p 12345 & - ip netns exec "${PEER_NS}" ./udpgso_bench_rx ${rx_args} && \ + ip netns exec "${PEER_NS}" ./udpgso_bench_rx -C 1000 -R 10 ${rx_args} -p 12345 & + ip netns exec "${PEER_NS}" ./udpgso_bench_rx -C 2000 -R 10 ${rx_args} && \ echo "ok" || \ echo "failed" & diff --git a/tools/testing/selftests/net/udpgso_bench_rx.c b/tools/testing/selftests/net/udpgso_bench_rx.c index 0c960f673324..db3d4a8b5a4c 100644 --- a/tools/testing/selftests/net/udpgso_bench_rx.c +++ b/tools/testing/selftests/net/udpgso_bench_rx.c @@ -45,6 +45,8 @@ static int cfg_alen = sizeof(struct sockaddr_in6); static int cfg_expected_pkt_nr; static int cfg_expected_pkt_len; static int cfg_expected_gso_size; +static int cfg_connect_timeout_ms; +static int cfg_rcv_timeout_ms; static struct sockaddr_storage cfg_bind_addr; static bool interrupted; @@ -87,7 +89,7 @@ static unsigned long gettimeofday_ms(void) return (tv.tv_sec * 1000) + (tv.tv_usec / 1000); } -static void do_poll(int fd) +static void do_poll(int fd, int timeout_ms) { struct pollfd pfd; int ret; @@ -102,8 +104,16 @@ static void do_poll(int fd) break; if (ret == -1) error(1, errno, "poll"); - if (ret == 0) - continue; + if (ret == 0) { + if (!timeout_ms) + continue; + + timeout_ms -= 10; + if (timeout_ms <= 0) { + interrupted = true; + break; + } + } if (pfd.revents != POLLIN) error(1, errno, "poll: 0x%x expected 0x%x\n", pfd.revents, POLLIN); @@ -134,7 +144,7 @@ static int do_socket(bool do_tcp) if (listen(accept_fd, 1)) error(1, errno, "listen"); - do_poll(accept_fd); + do_poll(accept_fd, cfg_connect_timeout_ms); if (interrupted) exit(0); @@ -273,7 +283,9 @@ static void do_flush_udp(int fd) static void usage(const char *filepath) { - error(1, 0, "Usage: %s [-Grtv] [-b addr] [-p port] [-l pktlen] [-n packetnr] [-S gsosize]", filepath); + error(1, 0, "Usage: %s [-C connect_timeout] [-Grtv] [-b addr] [-p port]" + " [-l pktlen] [-n packetnr] [-R rcv_timeout] [-S gsosize]", + filepath); } static void parse_opts(int argc, char **argv) @@ -282,7 +294,7 @@ static void parse_opts(int argc, char **argv) /* bind to any by default */ setup_sockaddr(PF_INET6, "::", &cfg_bind_addr); - while ((c = getopt(argc, argv, "4b:Gl:n:p:rS:tv")) != -1) { + while ((c = getopt(argc, argv, "4b:C:Gl:n:p:rR:S:tv")) != -1) { switch (c) { case '4': cfg_family = PF_INET; @@ -292,6 +304,9 @@ static void parse_opts(int argc, char **argv) case 'b': setup_sockaddr(cfg_family, optarg, &cfg_bind_addr); break; + case 'C': + cfg_connect_timeout_ms = strtoul(optarg, NULL, 0); + break; case 'G': cfg_gro_segment = true; break; @@ -307,6 +322,9 @@ static void parse_opts(int argc, char **argv) case 'r': cfg_read_all = true; break; + case 'R': + cfg_rcv_timeout_ms = strtoul(optarg, NULL, 0); + break; case 'S': cfg_expected_gso_size = strtol(optarg, NULL, 0); break; @@ -329,8 +347,9 @@ static void parse_opts(int argc, char **argv) static void do_recv(void) { + int timeout_ms = cfg_tcp ? cfg_rcv_timeout_ms : cfg_connect_timeout_ms; unsigned long tnow, treport; - int fd, loop = 0; + int fd; fd = do_socket(cfg_tcp); @@ -342,12 +361,7 @@ static void do_recv(void) treport = gettimeofday_ms() + 1000; do { - /* force termination after the second poll(); this cope both - * with sender slower than receiver and missing packet errors - */ - if (cfg_expected_pkt_nr && loop++) - interrupted = true; - do_poll(fd); + do_poll(fd, timeout_ms); if (cfg_tcp) do_flush_tcp(fd); @@ -365,6 +379,8 @@ static void do_recv(void) treport = tnow + 1000; } + timeout_ms = cfg_rcv_timeout_ms; + } while (!interrupted); if (cfg_expected_pkt_nr && (packets != cfg_expected_pkt_nr)) From 15f3ddf53d4d26c4e338c355abffb3eaf4b3112f Mon Sep 17 00:00:00 2001 From: Dmitry Bogdanov Date: Tue, 26 Feb 2019 15:39:13 +0000 Subject: [PATCH 288/298] net: aquantia: regression on cpus with high cores: set mode with 8 queues Recently the maximum number of queues was increased up to 8, but NIC was not fully configured for 8 queues. In setups with more than 4 CPU cores parts of TX traffic gets lost if the kernel routes it to queues 4th-8th. This patch sets a tx hw traffic mode with 8 queues. Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=202651 Fixes: 71a963cfc50b ("net: aquantia: increase max number of hw queues") Reported-by: Nicholas Johnson Signed-off-by: Dmitry Bogdanov Signed-off-by: Igor Russkikh Signed-off-by: David S. Miller --- .../ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c | 3 +++ .../ethernet/aquantia/atlantic/hw_atl/hw_atl_llh.c | 9 +++++++++ .../ethernet/aquantia/atlantic/hw_atl/hw_atl_llh.h | 4 ++++ .../aquantia/atlantic/hw_atl/hw_atl_llh_internal.h | 13 +++++++++++++ 4 files changed, 29 insertions(+) diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c index b58ca7cb8e9d..fbba300c1d01 100644 --- a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c +++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c @@ -275,6 +275,9 @@ static int hw_atl_b0_hw_offload_set(struct aq_hw_s *self, static int hw_atl_b0_hw_init_tx_path(struct aq_hw_s *self) { + /* Tx TC/Queue number config */ + hw_atl_rpb_tps_tx_tc_mode_set(self, 1U); + hw_atl_thm_lso_tcp_flag_of_first_pkt_set(self, 0x0FF6U); hw_atl_thm_lso_tcp_flag_of_middle_pkt_set(self, 0x0FF6U); hw_atl_thm_lso_tcp_flag_of_last_pkt_set(self, 0x0F7FU); diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_llh.c b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_llh.c index 939f77e2e117..8ac7a67b15c1 100644 --- a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_llh.c +++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_llh.c @@ -1274,6 +1274,15 @@ void hw_atl_tpb_tx_buff_en_set(struct aq_hw_s *aq_hw, u32 tx_buff_en) HW_ATL_TPB_TX_BUF_EN_SHIFT, tx_buff_en); } +void hw_atl_rpb_tps_tx_tc_mode_set(struct aq_hw_s *aq_hw, + u32 tx_traf_class_mode) +{ + aq_hw_write_reg_bit(aq_hw, HW_ATL_TPB_TX_TC_MODE_ADDR, + HW_ATL_TPB_TX_TC_MODE_MSK, + HW_ATL_TPB_TX_TC_MODE_SHIFT, + tx_traf_class_mode); +} + void hw_atl_tpb_tx_buff_hi_threshold_per_tc_set(struct aq_hw_s *aq_hw, u32 tx_buff_hi_threshold_per_tc, u32 buffer) diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_llh.h b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_llh.h index 03c570d115fe..f529540bfd7e 100644 --- a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_llh.h +++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_llh.h @@ -605,6 +605,10 @@ void hw_atl_thm_lso_tcp_flag_of_middle_pkt_set(struct aq_hw_s *aq_hw, /* tpb */ +/* set TX Traffic Class Mode */ +void hw_atl_rpb_tps_tx_tc_mode_set(struct aq_hw_s *aq_hw, + u32 tx_traf_class_mode); + /* set tx buffer enable */ void hw_atl_tpb_tx_buff_en_set(struct aq_hw_s *aq_hw, u32 tx_buff_en); diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_llh_internal.h b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_llh_internal.h index 8470d92db812..e91ffce005f1 100644 --- a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_llh_internal.h +++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_llh_internal.h @@ -1948,6 +1948,19 @@ /* default value of bitfield tx_buf_en */ #define HW_ATL_TPB_TX_BUF_EN_DEFAULT 0x0 +/* register address for bitfield tx_tc_mode */ +#define HW_ATL_TPB_TX_TC_MODE_ADDR 0x00007900 +/* bitmask for bitfield tx_tc_mode */ +#define HW_ATL_TPB_TX_TC_MODE_MSK 0x00000100 +/* inverted bitmask for bitfield tx_tc_mode */ +#define HW_ATL_TPB_TX_TC_MODE_MSKN 0xFFFFFEFF +/* lower bit position of bitfield tx_tc_mode */ +#define HW_ATL_TPB_TX_TC_MODE_SHIFT 8 +/* width of bitfield tx_tc_mode */ +#define HW_ATL_TPB_TX_TC_MODE_WIDTH 1 +/* default value of bitfield tx_tc_mode */ +#define HW_ATL_TPB_TX_TC_MODE_DEFAULT 0x0 + /* tx tx{b}_hi_thresh[c:0] bitfield definitions * preprocessor definitions for the bitfield "tx{b}_hi_thresh[c:0]". * parameter: buffer {b} | stride size 0x10 | range [0, 7] From d25ed413d5e51644e18f66e34eec049f17a7abcb Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Tue, 26 Feb 2019 19:29:22 +0100 Subject: [PATCH 289/298] net: phy: phylink: fix uninitialized variable in phylink_get_mac_state When debugging an issue I found implausible values in state->pause. Reason in that state->pause isn't initialized and later only single bits are changed. Also the struct itself isn't initialized in phylink_resolve(). So better initialize state->pause and other not yet initialized fields. v2: - use right function name in subject v3: - initialize additional fields Fixes: 9525ae83959b ("phylink: add phylink infrastructure") Signed-off-by: Heiner Kallweit Signed-off-by: David S. Miller --- drivers/net/phy/phylink.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c index 938803237d7f..85987aac31c4 100644 --- a/drivers/net/phy/phylink.c +++ b/drivers/net/phy/phylink.c @@ -320,6 +320,10 @@ static int phylink_get_mac_state(struct phylink *pl, struct phylink_link_state * linkmode_zero(state->lp_advertising); state->interface = pl->link_config.interface; state->an_enabled = pl->link_config.an_enabled; + state->speed = SPEED_UNKNOWN; + state->duplex = DUPLEX_UNKNOWN; + state->pause = MLO_PAUSE_NONE; + state->an_complete = 0; state->link = 1; return pl->ops->mac_link_state(ndev, state); From 90490ef7269906423a1c1b917fc24be8b1602658 Mon Sep 17 00:00:00 2001 From: Bryan Whitehead Date: Tue, 26 Feb 2019 14:06:26 -0500 Subject: [PATCH 290/298] lan743x: Fix TX Stall Issue It has been observed that tx queue stalls while downloading from certain web sites (example www.speedtest.net) The cause has been tracked down to a corner case where dma descriptors where not setup properly. And there for a tx completion interrupt was not signaled. This fix corrects the problem by properly marking the end of a multi descriptor transmission. Fixes: 23f0703c125b ("lan743x: Add main source files for new lan743x driver") Signed-off-by: Bryan Whitehead Signed-off-by: David S. Miller --- drivers/net/ethernet/microchip/lan743x_main.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/microchip/lan743x_main.c b/drivers/net/ethernet/microchip/lan743x_main.c index 310807ef328b..4d1b4a24907f 100644 --- a/drivers/net/ethernet/microchip/lan743x_main.c +++ b/drivers/net/ethernet/microchip/lan743x_main.c @@ -1400,7 +1400,8 @@ static int lan743x_tx_frame_start(struct lan743x_tx *tx, } static void lan743x_tx_frame_add_lso(struct lan743x_tx *tx, - unsigned int frame_length) + unsigned int frame_length, + int nr_frags) { /* called only from within lan743x_tx_xmit_frame. * assuming tx->ring_lock has already been acquired. @@ -1410,6 +1411,10 @@ static void lan743x_tx_frame_add_lso(struct lan743x_tx *tx, /* wrap up previous descriptor */ tx->frame_data0 |= TX_DESC_DATA0_EXT_; + if (nr_frags <= 0) { + tx->frame_data0 |= TX_DESC_DATA0_LS_; + tx->frame_data0 |= TX_DESC_DATA0_IOC_; + } tx_descriptor = &tx->ring_cpu_ptr[tx->frame_tail]; tx_descriptor->data0 = tx->frame_data0; @@ -1514,8 +1519,11 @@ static void lan743x_tx_frame_end(struct lan743x_tx *tx, u32 tx_tail_flags = 0; /* wrap up previous descriptor */ - tx->frame_data0 |= TX_DESC_DATA0_LS_; - tx->frame_data0 |= TX_DESC_DATA0_IOC_; + if ((tx->frame_data0 & TX_DESC_DATA0_DTYPE_MASK_) == + TX_DESC_DATA0_DTYPE_DATA_) { + tx->frame_data0 |= TX_DESC_DATA0_LS_; + tx->frame_data0 |= TX_DESC_DATA0_IOC_; + } tx_descriptor = &tx->ring_cpu_ptr[tx->frame_tail]; buffer_info = &tx->buffer_info[tx->frame_tail]; @@ -1600,7 +1608,7 @@ static netdev_tx_t lan743x_tx_xmit_frame(struct lan743x_tx *tx, } if (gso) - lan743x_tx_frame_add_lso(tx, frame_length); + lan743x_tx_frame_add_lso(tx, frame_length, nr_frags); if (nr_frags <= 0) goto finish; From d1a2930d8a992fb6ac2529449f81a0056e1b98d1 Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Fri, 1 Mar 2019 22:58:09 +0000 Subject: [PATCH 291/298] MIPS: eBPF: Fix icache flush end address The MIPS eBPF JIT calls flush_icache_range() in order to ensure the icache observes the code that we just wrote. Unfortunately it gets the end address calculation wrong due to some bad pointer arithmetic. The struct jit_ctx target field is of type pointer to u32, and as such adding one to it will increment the address being pointed to by 4 bytes. Therefore in order to find the address of the end of the code we simply need to add the number of 4 byte instructions emitted, but we mistakenly add the number of instructions multiplied by 4. This results in the call to flush_icache_range() operating on a memory region 4x larger than intended, which is always wasteful and can cause crashes if we overrun into an unmapped page. Fix this by correcting the pointer arithmetic to remove the bogus multiplication, and use braces to remove the need for a set of brackets whilst also making it obvious that the target field is a pointer. Signed-off-by: Paul Burton Fixes: b6bd53f9c4e8 ("MIPS: Add missing file for eBPF JIT.") Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Martin KaFai Lau Cc: Song Liu Cc: Yonghong Song Cc: netdev@vger.kernel.org Cc: bpf@vger.kernel.org Cc: linux-mips@vger.kernel.org Cc: stable@vger.kernel.org # v4.13+ Signed-off-by: Daniel Borkmann --- arch/mips/net/ebpf_jit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/mips/net/ebpf_jit.c b/arch/mips/net/ebpf_jit.c index 76e9bf88d3b9..0effd3cba9a7 100644 --- a/arch/mips/net/ebpf_jit.c +++ b/arch/mips/net/ebpf_jit.c @@ -1819,7 +1819,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) /* Update the icache */ flush_icache_range((unsigned long)ctx.target, - (unsigned long)(ctx.target + ctx.idx * sizeof(u32))); + (unsigned long)&ctx.target[ctx.idx]); if (bpf_jit_enable > 1) /* Dump JIT code */ From 5e1a99eae84999a2536f50a0beaf5d5262337f40 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Wed, 27 Feb 2019 16:15:29 +0800 Subject: [PATCH 292/298] ipv4: Add ICMPv6 support when parse route ipproto For ip rules, we need to use 'ipproto ipv6-icmp' to match ICMPv6 headers. But for ip -6 route, currently we only support tcp, udp and icmp. Add ICMPv6 support so we can match ipv6-icmp rules for route lookup. v2: As David Ahern and Sabrina Dubroca suggested, Add an argument to rtm_getroute_parse_ip_proto() to handle ICMP/ICMPv6 with different family. Reported-by: Jianlin Shi Fixes: eacb9384a3fe ("ipv6: support sport, dport and ip_proto in RTM_GETROUTE") Signed-off-by: Hangbin Liu Signed-off-by: David S. Miller --- include/net/ip.h | 2 +- net/ipv4/netlink.c | 19 ++++++++++++++----- net/ipv4/route.c | 2 +- net/ipv6/route.c | 3 ++- 4 files changed, 18 insertions(+), 8 deletions(-) diff --git a/include/net/ip.h b/include/net/ip.h index f0e8d064e249..be3cad9c2e4c 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -718,7 +718,7 @@ extern int sysctl_icmp_msgs_burst; int ip_misc_proc_init(void); #endif -int rtm_getroute_parse_ip_proto(struct nlattr *attr, u8 *ip_proto, +int rtm_getroute_parse_ip_proto(struct nlattr *attr, u8 *ip_proto, u8 family, struct netlink_ext_ack *extack); #endif /* _IP_H */ diff --git a/net/ipv4/netlink.c b/net/ipv4/netlink.c index f86bb4f06609..d8e3a1fb8e82 100644 --- a/net/ipv4/netlink.c +++ b/net/ipv4/netlink.c @@ -3,9 +3,10 @@ #include #include #include +#include #include -int rtm_getroute_parse_ip_proto(struct nlattr *attr, u8 *ip_proto, +int rtm_getroute_parse_ip_proto(struct nlattr *attr, u8 *ip_proto, u8 family, struct netlink_ext_ack *extack) { *ip_proto = nla_get_u8(attr); @@ -13,11 +14,19 @@ int rtm_getroute_parse_ip_proto(struct nlattr *attr, u8 *ip_proto, switch (*ip_proto) { case IPPROTO_TCP: case IPPROTO_UDP: - case IPPROTO_ICMP: return 0; - default: - NL_SET_ERR_MSG(extack, "Unsupported ip proto"); - return -EOPNOTSUPP; + case IPPROTO_ICMP: + if (family != AF_INET) + break; + return 0; +#if IS_ENABLED(CONFIG_IPV6) + case IPPROTO_ICMPV6: + if (family != AF_INET6) + break; + return 0; +#endif } + NL_SET_ERR_MSG(extack, "Unsupported ip proto"); + return -EOPNOTSUPP; } EXPORT_SYMBOL_GPL(rtm_getroute_parse_ip_proto); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 5163b64f8fb3..7bb9128c8363 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2803,7 +2803,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, if (tb[RTA_IP_PROTO]) { err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO], - &ip_proto, extack); + &ip_proto, AF_INET, extack); if (err) return err; } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index b7a620023a52..8dad1d690b78 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -4893,7 +4893,8 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, if (tb[RTA_IP_PROTO]) { err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO], - &fl6.flowi6_proto, extack); + &fl6.flowi6_proto, AF_INET6, + extack); if (err) goto errout; } From 3612af783cf52c74a031a2f11b82247b2599d3cd Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 1 Mar 2019 22:05:29 +0100 Subject: [PATCH 293/298] bpf: fix sanitation rewrite in case of non-pointers Marek reported that he saw an issue with the below snippet in that timing measurements where off when loaded as unpriv while results were reasonable when loaded as privileged: [...] uint64_t a = bpf_ktime_get_ns(); uint64_t b = bpf_ktime_get_ns(); uint64_t delta = b - a; if ((int64_t)delta > 0) { [...] Turns out there is a bug where a corner case is missing in the fix d3bd7413e0ca ("bpf: fix sanitation of alu op with pointer / scalar type from different paths"), namely fixup_bpf_calls() only checks whether aux has a non-zero alu_state, but it also needs to test for the case of BPF_ALU_NON_POINTER since in both occasions we need to skip the masking rewrite (as there is nothing to mask). Fixes: d3bd7413e0ca ("bpf: fix sanitation of alu op with pointer / scalar type from different paths") Reported-by: Marek Majkowski Reported-by: Arthur Fabre Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/netdev/CAJPywTJqP34cK20iLM5YmUMz9KXQOdu1-+BZrGMAGgLuBWz7fg@mail.gmail.com/T/ Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8f295b790297..5fcce2f4209d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6920,7 +6920,8 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) u32 off_reg; aux = &env->insn_aux_data[i + delta]; - if (!aux->alu_state) + if (!aux->alu_state || + aux->alu_state == BPF_ALU_NON_POINTER) continue; isneg = aux->alu_state & BPF_ALU_NEG_VALUE; From ed8fe20205ac054bf585156709de3913d1890f30 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Thu, 28 Feb 2019 07:39:15 +0100 Subject: [PATCH 294/298] net: dsa: mv88e6xxx: prevent interrupt storm caused by mv88e6390x_port_set_cmode When debugging another issue I faced an interrupt storm in this driver (88E6390, port 9 in SGMII mode), consisting of alternating link-up / link-down interrupts. Analysis showed that the driver wanted to set a cmode that was set already. But so far mv88e6390x_port_set_cmode() doesn't check this and powers down SERDES, what causes the link to break, and eventually results in the described interrupt storm. Fix this by checking whether the cmode actually changes. We want that the very first call to mv88e6390x_port_set_cmode() always configures the registers, therefore initialize port.cmode with a value that is different from any supported cmode value. We have to take care that we only init the ports cmode once chip->info->num_ports is set. v2: - add small helper and init the number of actual ports only Fixes: 364e9d7776a3 ("net: dsa: mv88e6xxx: Power on/off SERDES on cmode change") Signed-off-by: Heiner Kallweit Signed-off-by: David S. Miller --- drivers/net/dsa/mv88e6xxx/chip.c | 10 ++++++++++ drivers/net/dsa/mv88e6xxx/port.c | 4 ++++ drivers/net/dsa/mv88e6xxx/port.h | 1 + 3 files changed, 15 insertions(+) diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index 9d36d2b5fbc6..f5cad42875e9 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -4595,6 +4595,14 @@ static int mv88e6xxx_smi_init(struct mv88e6xxx_chip *chip, return 0; } +static void mv88e6xxx_ports_cmode_init(struct mv88e6xxx_chip *chip) +{ + int i; + + for (i = 0; i < mv88e6xxx_num_ports(chip); i++) + chip->ports[i].cmode = MV88E6XXX_PORT_STS_CMODE_INVALID; +} + static enum dsa_tag_protocol mv88e6xxx_get_tag_protocol(struct dsa_switch *ds, int port) { @@ -4631,6 +4639,8 @@ static const char *mv88e6xxx_drv_probe(struct device *dsa_dev, if (err) goto free; + mv88e6xxx_ports_cmode_init(chip); + mutex_lock(&chip->reg_lock); err = mv88e6xxx_switch_reset(chip); mutex_unlock(&chip->reg_lock); diff --git a/drivers/net/dsa/mv88e6xxx/port.c b/drivers/net/dsa/mv88e6xxx/port.c index 4b02c96f6786..79ab51e69aee 100644 --- a/drivers/net/dsa/mv88e6xxx/port.c +++ b/drivers/net/dsa/mv88e6xxx/port.c @@ -398,6 +398,10 @@ int mv88e6390x_port_set_cmode(struct mv88e6xxx_chip *chip, int port, cmode = 0; } + /* cmode doesn't change, nothing to do for us */ + if (cmode == chip->ports[port].cmode) + return 0; + lane = mv88e6390x_serdes_get_lane(chip, port); if (lane < 0) return lane; diff --git a/drivers/net/dsa/mv88e6xxx/port.h b/drivers/net/dsa/mv88e6xxx/port.h index e583641de758..4aadf321edb7 100644 --- a/drivers/net/dsa/mv88e6xxx/port.h +++ b/drivers/net/dsa/mv88e6xxx/port.h @@ -52,6 +52,7 @@ #define MV88E6185_PORT_STS_CMODE_1000BASE_X 0x0005 #define MV88E6185_PORT_STS_CMODE_PHY 0x0006 #define MV88E6185_PORT_STS_CMODE_DISABLED 0x0007 +#define MV88E6XXX_PORT_STS_CMODE_INVALID 0xff /* Offset 0x01: MAC (or PCS or Physical) Control Register */ #define MV88E6XXX_PORT_MAC_CTL 0x01 From cf1c9ccba7308e48a68fa77f476287d9d614e4c7 Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Thu, 28 Feb 2019 14:56:04 +0100 Subject: [PATCH 295/298] geneve: correctly handle ipv6.disable module parameter When IPv6 is compiled but disabled at runtime, geneve_sock_add returns -EAFNOSUPPORT. For metadata based tunnels, this causes failure of the whole operation of bringing up the tunnel. Ignore failure of IPv6 socket creation for metadata based tunnels caused by IPv6 not being available. This is the same fix as what commit d074bf960044 ("vxlan: correctly handle ipv6.disable module parameter") is doing for vxlan. Note there's also commit c0a47e44c098 ("geneve: should not call rt6_lookup() when ipv6 was disabled") which fixes a similar issue but for regular tunnels, while this patch is needed for metadata based tunnels. Signed-off-by: Jiri Benc Signed-off-by: David S. Miller --- drivers/net/geneve.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index 3377ac66a347..5583d993480d 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -692,15 +692,20 @@ static int geneve_sock_add(struct geneve_dev *geneve, bool ipv6) static int geneve_open(struct net_device *dev) { struct geneve_dev *geneve = netdev_priv(dev); - bool ipv6 = !!(geneve->info.mode & IP_TUNNEL_INFO_IPV6); bool metadata = geneve->collect_md; + bool ipv4, ipv6; int ret = 0; + ipv6 = geneve->info.mode & IP_TUNNEL_INFO_IPV6 || metadata; + ipv4 = !ipv6 || metadata; #if IS_ENABLED(CONFIG_IPV6) - if (ipv6 || metadata) + if (ipv6) { ret = geneve_sock_add(geneve, true); + if (ret < 0 && ret != -EAFNOSUPPORT) + ipv4 = false; + } #endif - if (!ret && (!ipv6 || metadata)) + if (ipv4) ret = geneve_sock_add(geneve, false); if (ret < 0) geneve_sock_release(geneve); From a6da21bb0eae459a375d5bd48baed821d14301d0 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Fri, 1 Mar 2019 23:43:39 +0100 Subject: [PATCH 296/298] net: dsa: mv88e6xxx: Fix statistics on mv88e6161 Despite what the datesheet says, the silicon implements the older way of snapshoting the statistics. Change the op. Reported-by: Chris.Healy@zii.aero Tested-by: Chris.Healy@zii.aero Fixes: 0ac64c394900 ("net: dsa: mv88e6xxx: mv88e6161 uses mv88e6320 stats snapshot") Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/dsa/mv88e6xxx/chip.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index f5cad42875e9..7e3c00bd9532 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -3093,7 +3093,7 @@ static const struct mv88e6xxx_ops mv88e6161_ops = { .port_disable_pri_override = mv88e6xxx_port_disable_pri_override, .port_link_state = mv88e6352_port_link_state, .port_get_cmode = mv88e6185_port_get_cmode, - .stats_snapshot = mv88e6320_g1_stats_snapshot, + .stats_snapshot = mv88e6xxx_g1_stats_snapshot, .stats_set_histogram = mv88e6095_g1_stats_set_histogram, .stats_get_sset_count = mv88e6095_stats_get_sset_count, .stats_get_strings = mv88e6095_stats_get_strings, From 07f12b26e21ab359261bf75cfcb424fdc7daeb6d Mon Sep 17 00:00:00 2001 From: Mao Wenan Date: Fri, 1 Mar 2019 23:06:40 +0800 Subject: [PATCH 297/298] net: sit: fix memory leak in sit_init_net() If register_netdev() is failed to register sitn->fb_tunnel_dev, it will go to err_reg_dev and forget to free netdev(sitn->fb_tunnel_dev). BUG: memory leak unreferenced object 0xffff888378daad00 (size 512): comm "syz-executor.1", pid 4006, jiffies 4295121142 (age 16.115s) hex dump (first 32 bytes): 00 e6 ed c0 83 88 ff ff 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [<00000000d6dcb63e>] kvmalloc include/linux/mm.h:577 [inline] [<00000000d6dcb63e>] kvzalloc include/linux/mm.h:585 [inline] [<00000000d6dcb63e>] netif_alloc_netdev_queues net/core/dev.c:8380 [inline] [<00000000d6dcb63e>] alloc_netdev_mqs+0x600/0xcc0 net/core/dev.c:8970 [<00000000867e172f>] sit_init_net+0x295/0xa40 net/ipv6/sit.c:1848 [<00000000871019fa>] ops_init+0xad/0x3e0 net/core/net_namespace.c:129 [<00000000319507f6>] setup_net+0x2ba/0x690 net/core/net_namespace.c:314 [<0000000087db4f96>] copy_net_ns+0x1dc/0x330 net/core/net_namespace.c:437 [<0000000057efc651>] create_new_namespaces+0x382/0x730 kernel/nsproxy.c:107 [<00000000676f83de>] copy_namespaces+0x2ed/0x3d0 kernel/nsproxy.c:165 [<0000000030b74bac>] copy_process.part.27+0x231e/0x6db0 kernel/fork.c:1919 [<00000000fff78746>] copy_process kernel/fork.c:1713 [inline] [<00000000fff78746>] _do_fork+0x1bc/0xe90 kernel/fork.c:2224 [<000000001c2e0d1c>] do_syscall_64+0xc8/0x580 arch/x86/entry/common.c:290 [<00000000ec48bd44>] entry_SYSCALL_64_after_hwframe+0x49/0xbe [<0000000039acff8a>] 0xffffffffffffffff Signed-off-by: Mao Wenan Signed-off-by: David S. Miller --- net/ipv6/sit.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index e8a1dabef803..09e440e8dfae 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -1873,6 +1873,7 @@ static int __net_init sit_init_net(struct net *net) err_reg_dev: ipip6_dev_free(sitn->fb_tunnel_dev); + free_netdev(sitn->fb_tunnel_dev); err_alloc_dev: return err; } From 1c163f4c7b3f621efff9b28a47abb36f7378d783 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 3 Mar 2019 15:21:29 -0800 Subject: [PATCH 298/298] Linux 5.0 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index ac5ac28a24e9..d5713e7b1e50 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 5 PATCHLEVEL = 0 SUBLEVEL = 0 -EXTRAVERSION = -rc8 +EXTRAVERSION = NAME = Shy Crocodile # *DOCUMENTATION*