From 59bf6c65a09fff74215517aecffbbdcd67df76e3 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 22 Jul 2022 11:21:59 -0700 Subject: [PATCH 1/7] tcp: Fix data-races around sk_pacing_rate. While reading sysctl_tcp_pacing_(ss|ca)_ratio, they can be changed concurrently. Thus, we need to add READ_ONCE() to their readers. Fixes: 43e122b014c9 ("tcp: refine pacing rate determination") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index c799f39cb774..dd05238f79f6 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -910,9 +910,9 @@ static void tcp_update_pacing_rate(struct sock *sk) * end of slow start and should slow down. */ if (tcp_snd_cwnd(tp) < tp->snd_ssthresh / 2) - rate *= sock_net(sk)->ipv4.sysctl_tcp_pacing_ss_ratio; + rate *= READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_pacing_ss_ratio); else - rate *= sock_net(sk)->ipv4.sysctl_tcp_pacing_ca_ratio; + rate *= READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_pacing_ca_ratio); rate *= max(tcp_snd_cwnd(tp), tp->packets_out); From 02739545951ad4c1215160db7fbf9b7a918d3c0b Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 22 Jul 2022 11:22:00 -0700 Subject: [PATCH 2/7] net: Fix data-races around sysctl_[rw]mem(_offset)?. While reading these sysctl variables, they can be changed concurrently. Thus, we need to add READ_ONCE() to their readers. - .sysctl_rmem - .sysctl_rwmem - .sysctl_rmem_offset - .sysctl_wmem_offset - sysctl_tcp_rmem[1, 2] - sysctl_tcp_wmem[1, 2] - sysctl_decnet_rmem[1] - sysctl_decnet_wmem[1] - sysctl_tipc_rmem[1] Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- include/net/sock.h | 8 ++++---- net/decnet/af_decnet.c | 4 ++-- net/ipv4/tcp.c | 6 +++--- net/ipv4/tcp_input.c | 13 +++++++------ net/ipv4/tcp_output.c | 2 +- net/mptcp/protocol.c | 6 +++--- net/tipc/socket.c | 2 +- 7 files changed, 21 insertions(+), 20 deletions(-) diff --git a/include/net/sock.h b/include/net/sock.h index 9fa54762e077..7a48991cdb19 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2843,18 +2843,18 @@ static inline int sk_get_wmem0(const struct sock *sk, const struct proto *proto) { /* Does this proto have per netns sysctl_wmem ? */ if (proto->sysctl_wmem_offset) - return *(int *)((void *)sock_net(sk) + proto->sysctl_wmem_offset); + return READ_ONCE(*(int *)((void *)sock_net(sk) + proto->sysctl_wmem_offset)); - return *proto->sysctl_wmem; + return READ_ONCE(*proto->sysctl_wmem); } static inline int sk_get_rmem0(const struct sock *sk, const struct proto *proto) { /* Does this proto have per netns sysctl_rmem ? */ if (proto->sysctl_rmem_offset) - return *(int *)((void *)sock_net(sk) + proto->sysctl_rmem_offset); + return READ_ONCE(*(int *)((void *)sock_net(sk) + proto->sysctl_rmem_offset)); - return *proto->sysctl_rmem; + return READ_ONCE(*proto->sysctl_rmem); } /* Default TCP Small queue budget is ~1 ms of data (1sec >> 10) diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index dc92a67baea3..7d542eb46172 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -480,8 +480,8 @@ static struct sock *dn_alloc_sock(struct net *net, struct socket *sock, gfp_t gf sk->sk_family = PF_DECnet; sk->sk_protocol = 0; sk->sk_allocation = gfp; - sk->sk_sndbuf = sysctl_decnet_wmem[1]; - sk->sk_rcvbuf = sysctl_decnet_rmem[1]; + sk->sk_sndbuf = READ_ONCE(sysctl_decnet_wmem[1]); + sk->sk_rcvbuf = READ_ONCE(sysctl_decnet_rmem[1]); /* Initialization of DECnet Session Control Port */ scp = DN_SK(sk); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index a11e5de3a4c3..002a4a04efbe 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -452,8 +452,8 @@ void tcp_init_sock(struct sock *sk) icsk->icsk_sync_mss = tcp_sync_mss; - WRITE_ONCE(sk->sk_sndbuf, sock_net(sk)->ipv4.sysctl_tcp_wmem[1]); - WRITE_ONCE(sk->sk_rcvbuf, sock_net(sk)->ipv4.sysctl_tcp_rmem[1]); + WRITE_ONCE(sk->sk_sndbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[1])); + WRITE_ONCE(sk->sk_rcvbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[1])); sk_sockets_allocated_inc(sk); } @@ -1724,7 +1724,7 @@ int tcp_set_rcvlowat(struct sock *sk, int val) if (sk->sk_userlocks & SOCK_RCVBUF_LOCK) cap = sk->sk_rcvbuf >> 1; else - cap = sock_net(sk)->ipv4.sysctl_tcp_rmem[2] >> 1; + cap = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]) >> 1; val = min(val, cap); WRITE_ONCE(sk->sk_rcvlowat, val ? : 1); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index dd05238f79f6..ff2e0d87aee4 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -426,7 +426,7 @@ static void tcp_sndbuf_expand(struct sock *sk) if (sk->sk_sndbuf < sndmem) WRITE_ONCE(sk->sk_sndbuf, - min(sndmem, sock_net(sk)->ipv4.sysctl_tcp_wmem[2])); + min(sndmem, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[2]))); } /* 2. Tuning advertised window (window_clamp, rcv_ssthresh) @@ -461,7 +461,7 @@ static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb, struct tcp_sock *tp = tcp_sk(sk); /* Optimize this! */ int truesize = tcp_win_from_space(sk, skbtruesize) >> 1; - int window = tcp_win_from_space(sk, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]) >> 1; + int window = tcp_win_from_space(sk, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2])) >> 1; while (tp->rcv_ssthresh <= window) { if (truesize <= skb->len) @@ -574,16 +574,17 @@ static void tcp_clamp_window(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); struct net *net = sock_net(sk); + int rmem2; icsk->icsk_ack.quick = 0; + rmem2 = READ_ONCE(net->ipv4.sysctl_tcp_rmem[2]); - if (sk->sk_rcvbuf < net->ipv4.sysctl_tcp_rmem[2] && + if (sk->sk_rcvbuf < rmem2 && !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && !tcp_under_memory_pressure(sk) && sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) { WRITE_ONCE(sk->sk_rcvbuf, - min(atomic_read(&sk->sk_rmem_alloc), - net->ipv4.sysctl_tcp_rmem[2])); + min(atomic_read(&sk->sk_rmem_alloc), rmem2)); } if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss); @@ -745,7 +746,7 @@ void tcp_rcv_space_adjust(struct sock *sk) do_div(rcvwin, tp->advmss); rcvbuf = min_t(u64, rcvwin * rcvmem, - sock_net(sk)->ipv4.sysctl_tcp_rmem[2]); + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2])); if (rcvbuf > sk->sk_rcvbuf) { WRITE_ONCE(sk->sk_rcvbuf, rcvbuf); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 2efe41c84ee8..4c376b6d8764 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -238,7 +238,7 @@ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss, *rcv_wscale = 0; if (wscale_ok) { /* Set window scaling on max possible window */ - space = max_t(u32, space, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]); + space = max_t(u32, space, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2])); space = max_t(u32, space, sysctl_rmem_max); space = min_t(u32, space, *window_clamp); *rcv_wscale = clamp_t(int, ilog2(space) - 15, diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 9bbd8cbe0acb..7e1518bb6115 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1926,7 +1926,7 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied) do_div(rcvwin, advmss); rcvbuf = min_t(u64, rcvwin * rcvmem, - sock_net(sk)->ipv4.sysctl_tcp_rmem[2]); + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2])); if (rcvbuf > sk->sk_rcvbuf) { u32 window_clamp; @@ -2669,8 +2669,8 @@ static int mptcp_init_sock(struct sock *sk) mptcp_ca_reset(sk); sk_sockets_allocated_inc(sk); - sk->sk_rcvbuf = sock_net(sk)->ipv4.sysctl_tcp_rmem[1]; - sk->sk_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[1]; + sk->sk_rcvbuf = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[1]); + sk->sk_sndbuf = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[1]); return 0; } diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 43509c7e90fc..f1c3b8eb4b3d 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -517,7 +517,7 @@ static int tipc_sk_create(struct net *net, struct socket *sock, timer_setup(&sk->sk_timer, tipc_sk_timeout, 0); sk->sk_shutdown = 0; sk->sk_backlog_rcv = tipc_sk_backlog_rcv; - sk->sk_rcvbuf = sysctl_tipc_rmem[1]; + sk->sk_rcvbuf = READ_ONCE(sysctl_tipc_rmem[1]); sk->sk_data_ready = tipc_data_ready; sk->sk_write_space = tipc_write_space; sk->sk_destruct = tipc_sock_destruct; From 4866b2b0f7672b6d760c4b8ece6fb56f965dcc8a Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 22 Jul 2022 11:22:01 -0700 Subject: [PATCH 3/7] tcp: Fix a data-race around sysctl_tcp_comp_sack_delay_ns. While reading sysctl_tcp_comp_sack_delay_ns, it can be changed concurrently. Thus, we need to add READ_ONCE() to its reader. Fixes: 6d82aa242092 ("tcp: add tcp_comp_sack_delay_ns sysctl") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index ff2e0d87aee4..813744180d87 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5542,7 +5542,8 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) if (tp->srtt_us && tp->srtt_us < rtt) rtt = tp->srtt_us; - delay = min_t(unsigned long, sock_net(sk)->ipv4.sysctl_tcp_comp_sack_delay_ns, + delay = min_t(unsigned long, + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_comp_sack_delay_ns), rtt * (NSEC_PER_USEC >> 3)/20); sock_hold(sk); hrtimer_start_range_ns(&tp->compressed_ack_timer, ns_to_ktime(delay), From 22396941a7f343d704738360f9ef0e6576489d43 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 22 Jul 2022 11:22:02 -0700 Subject: [PATCH 4/7] tcp: Fix a data-race around sysctl_tcp_comp_sack_slack_ns. While reading sysctl_tcp_comp_sack_slack_ns, it can be changed concurrently. Thus, we need to add READ_ONCE() to its reader. Fixes: a70437cc09a1 ("tcp: add hrtimer slack to sack compression") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 813744180d87..a4d8851d83ff 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5547,7 +5547,7 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) rtt * (NSEC_PER_USEC >> 3)/20); sock_hold(sk); hrtimer_start_range_ns(&tp->compressed_ack_timer, ns_to_ktime(delay), - sock_net(sk)->ipv4.sysctl_tcp_comp_sack_slack_ns, + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_comp_sack_slack_ns), HRTIMER_MODE_REL_PINNED_SOFT); } From 79f55473bfc8ac51bd6572929a679eeb4da22251 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 22 Jul 2022 11:22:03 -0700 Subject: [PATCH 5/7] tcp: Fix a data-race around sysctl_tcp_comp_sack_nr. While reading sysctl_tcp_comp_sack_nr, it can be changed concurrently. Thus, we need to add READ_ONCE() to its reader. Fixes: 9c21d2fc41c0 ("tcp: add tcp_comp_sack_nr sysctl") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index a4d8851d83ff..b1637990d570 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5521,7 +5521,7 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) } if (!tcp_is_sack(tp) || - tp->compressed_ack >= sock_net(sk)->ipv4.sysctl_tcp_comp_sack_nr) + tp->compressed_ack >= READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_comp_sack_nr)) goto send_now; if (tp->compressed_ack_rcv_nxt != tp->rcv_nxt) { From 870e3a634b6a6cb1543b359007aca73fe6a03ac5 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 22 Jul 2022 11:22:04 -0700 Subject: [PATCH 6/7] tcp: Fix data-races around sysctl_tcp_reflect_tos. While reading sysctl_tcp_reflect_tos, it can be changed concurrently. Thus, we need to add READ_ONCE() to its readers. Fixes: ac8f1710c12b ("tcp: reflect tos value received in SYN to the socket") Signed-off-by: Kuniyuki Iwashima Acked-by: Wei Wang Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 4 ++-- net/ipv6/tcp_ipv6.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index d16e6e40f47b..586c102ce152 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1006,7 +1006,7 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, if (skb) { __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr); - tos = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ? + tos = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ? (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) | (inet_sk(sk)->tos & INET_ECN_MASK) : inet_sk(sk)->tos; @@ -1526,7 +1526,7 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, /* Set ToS of the new socket based upon the value of incoming SYN. * ECT bits are set later in tcp_init_transfer(). */ - if (sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) + if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)) newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK; if (!dst) { diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 9d3ede293258..be09941fe6d9 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -546,7 +546,7 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst, if (np->repflow && ireq->pktopts) fl6->flowlabel = ip6_flowlabel(ipv6_hdr(ireq->pktopts)); - tclass = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ? + tclass = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ? (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) | (np->tclass & INET_ECN_MASK) : np->tclass; @@ -1314,7 +1314,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * /* Set ToS of the new socket based upon the value of incoming SYN. * ECT bits are set later in tcp_init_transfer(). */ - if (sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) + if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)) newnp->tclass = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK; /* Clone native IPv6 options from listening socket (if any) From 96b9bd8c6d125490f9adfb57d387ef81a55a103e Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 22 Jul 2022 11:22:05 -0700 Subject: [PATCH 7/7] ipv4: Fix data-races around sysctl_fib_notify_on_flag_change. While reading sysctl_fib_notify_on_flag_change, it can be changed concurrently. Thus, we need to add READ_ONCE() to its readers. Fixes: 680aea08e78c ("net: ipv4: Emit notification when fib hardware flags are changed") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- net/ipv4/fib_trie.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 46e8a5125853..452ff177e4da 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1042,6 +1042,7 @@ fib_find_matching_alias(struct net *net, const struct fib_rt_info *fri) void fib_alias_hw_flags_set(struct net *net, const struct fib_rt_info *fri) { + u8 fib_notify_on_flag_change; struct fib_alias *fa_match; struct sk_buff *skb; int err; @@ -1063,14 +1064,16 @@ void fib_alias_hw_flags_set(struct net *net, const struct fib_rt_info *fri) WRITE_ONCE(fa_match->offload, fri->offload); WRITE_ONCE(fa_match->trap, fri->trap); + fib_notify_on_flag_change = READ_ONCE(net->ipv4.sysctl_fib_notify_on_flag_change); + /* 2 means send notifications only if offload_failed was changed. */ - if (net->ipv4.sysctl_fib_notify_on_flag_change == 2 && + if (fib_notify_on_flag_change == 2 && READ_ONCE(fa_match->offload_failed) == fri->offload_failed) goto out; WRITE_ONCE(fa_match->offload_failed, fri->offload_failed); - if (!net->ipv4.sysctl_fib_notify_on_flag_change) + if (!fib_notify_on_flag_change) goto out; skb = nlmsg_new(fib_nlmsg_size(fa_match->fa_info), GFP_ATOMIC);