From 2b41fab70fc001d2acd89c0477d32feb8265bb32 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 22 Mar 2015 10:22:18 -0700 Subject: [PATCH 1/8] inet: cache listen_sock_qlen() and read rskq_defer_accept once Cache listen_sock_qlen() to limit false sharing, and read rskq_defer_accept once as it might change under us. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/inet_connection_sock.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 844808d9337b..7d011e825c48 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -571,8 +571,9 @@ static void reqsk_timer_handler(unsigned long data) struct inet_connection_sock *icsk = inet_csk(sk_listener); struct request_sock_queue *queue = &icsk->icsk_accept_queue; struct listen_sock *lopt = queue->listen_opt; - int expire = 0, resend = 0; + int qlen, expire = 0, resend = 0; int max_retries, thresh; + u8 defer_accept; if (sk_listener->sk_state != TCP_LISTEN || !lopt) { reqsk_put(req); @@ -598,19 +599,21 @@ static void reqsk_timer_handler(unsigned long data) * embrions; and abort old ones without pity, if old * ones are about to clog our table. */ - if (listen_sock_qlen(lopt) >> (lopt->max_qlen_log - 1)) { + qlen = listen_sock_qlen(lopt); + if (qlen >> (lopt->max_qlen_log - 1)) { int young = listen_sock_young(lopt) << 1; while (thresh > 2) { - if (listen_sock_qlen(lopt) < young) + if (qlen < young) break; thresh--; young <<= 1; } } - if (queue->rskq_defer_accept) - max_retries = queue->rskq_defer_accept; - syn_ack_recalc(req, thresh, max_retries, queue->rskq_defer_accept, + defer_accept = READ_ONCE(queue->rskq_defer_accept); + if (defer_accept) + max_retries = defer_accept; + syn_ack_recalc(req, thresh, max_retries, defer_accept, &expire, &resend); req->rsk_ops->syn_ack_timeout(sk_listener, req); if (!expire && From 42cb80a2353f42913ae78074ffa1f1b4a49e5436 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 22 Mar 2015 10:22:19 -0700 Subject: [PATCH 2/8] inet: remove sk_listener parameter from syn_ack_timeout() It is not needed, and req->sk_listener points to the listener anyway. request_sock argument can be const. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/dccp.h | 2 +- include/net/request_sock.h | 3 +-- include/net/tcp.h | 2 +- net/dccp/ipv4.c | 2 +- net/ipv4/inet_connection_sock.c | 2 +- net/ipv4/tcp_timer.c | 8 +++++--- 6 files changed, 10 insertions(+), 9 deletions(-) diff --git a/include/linux/dccp.h b/include/linux/dccp.h index 439ff698000a..3dca24d3ac67 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -317,6 +317,6 @@ static inline const char *dccp_role(const struct sock *sk) return NULL; } -extern void dccp_syn_ack_timeout(struct sock *sk, struct request_sock *req); +extern void dccp_syn_ack_timeout(const struct request_sock *req); #endif /* _LINUX_DCCP_H */ diff --git a/include/net/request_sock.h b/include/net/request_sock.h index 6a91261d9b7b..8603c350fad0 100644 --- a/include/net/request_sock.h +++ b/include/net/request_sock.h @@ -39,8 +39,7 @@ struct request_sock_ops { void (*send_reset)(struct sock *sk, struct sk_buff *skb); void (*destructor)(struct request_sock *req); - void (*syn_ack_timeout)(struct sock *sk, - struct request_sock *req); + void (*syn_ack_timeout)(const struct request_sock *req); }; int inet_rtx_syn_ack(struct sock *parent, struct request_sock *req); diff --git a/include/net/tcp.h b/include/net/tcp.h index 082fd79132b7..1876262afd59 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -433,7 +433,7 @@ int compat_tcp_getsockopt(struct sock *sk, int level, int optname, int compat_tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, unsigned int optlen); void tcp_set_keepalive(struct sock *sk, int val); -void tcp_syn_ack_timeout(struct sock *sk, struct request_sock *req); +void tcp_syn_ack_timeout(const struct request_sock *req); int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len); void tcp_parse_options(const struct sk_buff *skb, diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 25a9615b3b88..1f7161e05403 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -576,7 +576,7 @@ static void dccp_v4_reqsk_destructor(struct request_sock *req) kfree(inet_rsk(req)->opt); } -void dccp_syn_ack_timeout(struct sock *sk, struct request_sock *req) +void dccp_syn_ack_timeout(const struct request_sock *req) { } EXPORT_SYMBOL(dccp_syn_ack_timeout); diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 7d011e825c48..a12b973164d0 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -615,7 +615,7 @@ static void reqsk_timer_handler(unsigned long data) max_retries = defer_accept; syn_ack_recalc(req, thresh, max_retries, defer_accept, &expire, &resend); - req->rsk_ops->syn_ack_timeout(sk_listener, req); + req->rsk_ops->syn_ack_timeout(req); if (!expire && (!resend || !inet_rtx_syn_ack(sk_listener, req) || diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 3daa6b5d766d..2568fd282873 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -327,7 +327,7 @@ static void tcp_fastopen_synack_timer(struct sock *sk) struct request_sock *req; req = tcp_sk(sk)->fastopen_rsk; - req->rsk_ops->syn_ack_timeout(sk, req); + req->rsk_ops->syn_ack_timeout(req); if (req->num_timeout >= max_retries) { tcp_write_err(sk); @@ -539,9 +539,11 @@ static void tcp_write_timer(unsigned long data) sock_put(sk); } -void tcp_syn_ack_timeout(struct sock *sk, struct request_sock *req) +void tcp_syn_ack_timeout(const struct request_sock *req) { - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPTIMEOUTS); + struct net *net = read_pnet(&inet_rsk(req)->ireq_net); + + NET_INC_STATS_BH(net, LINUX_MIB_TCPTIMEOUTS); } EXPORT_SYMBOL(tcp_syn_ack_timeout); From 8b929ab12fb2ab960adb3c3ec8d107fef5ff3243 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 22 Mar 2015 10:22:20 -0700 Subject: [PATCH 3/8] inet: remove some sk_listener dependencies listener can be source of false sharing. request sock has some useful information like : ireq->ir_iif, ireq->ir_num, ireq->ireq_net This patch does not solve the major problem of having to read sk->sk_protocol which is sharing a cache line with sk->sk_wmem_alloc. (This same field is read later in ip_build_and_send_pkt()) One idea would be to move sk_protocol close to sk_family (using 8 bits instead of 16 for sk_family seems enough) Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/inet_connection_sock.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index a12b973164d0..711ab143d4cb 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -403,18 +403,17 @@ struct dst_entry *inet_csk_route_req(struct sock *sk, struct flowi4 *fl4, const struct request_sock *req) { - struct rtable *rt; const struct inet_request_sock *ireq = inet_rsk(req); - struct ip_options_rcu *opt = inet_rsk(req)->opt; - struct net *net = sock_net(sk); - int flags = inet_sk_flowi_flags(sk); + struct net *net = read_pnet(&ireq->ireq_net); + struct ip_options_rcu *opt = ireq->opt; + struct rtable *rt; - flowi4_init_output(fl4, sk->sk_bound_dev_if, ireq->ir_mark, + flowi4_init_output(fl4, ireq->ir_iif, ireq->ir_mark, RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, - sk->sk_protocol, - flags, + sk->sk_protocol, inet_sk_flowi_flags(sk), (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr, - ireq->ir_loc_addr, ireq->ir_rmt_port, inet_sk(sk)->inet_sport); + ireq->ir_loc_addr, ireq->ir_rmt_port, + htons(ireq->ir_num)); security_req_classify_flow(req, flowi4_to_flowi(fl4)); rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) @@ -436,9 +435,9 @@ struct dst_entry *inet_csk_route_child_sock(struct sock *sk, const struct request_sock *req) { const struct inet_request_sock *ireq = inet_rsk(req); + struct net *net = read_pnet(&ireq->ireq_net); struct inet_sock *newinet = inet_sk(newsk); struct ip_options_rcu *opt; - struct net *net = sock_net(sk); struct flowi4 *fl4; struct rtable *rt; @@ -446,11 +445,12 @@ struct dst_entry *inet_csk_route_child_sock(struct sock *sk, rcu_read_lock(); opt = rcu_dereference(newinet->inet_opt); - flowi4_init_output(fl4, sk->sk_bound_dev_if, inet_rsk(req)->ir_mark, + flowi4_init_output(fl4, ireq->ir_iif, ireq->ir_mark, RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, sk->sk_protocol, inet_sk_flowi_flags(sk), (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr, - ireq->ir_loc_addr, ireq->ir_rmt_port, inet_sk(sk)->inet_sport); + ireq->ir_loc_addr, ireq->ir_rmt_port, + htons(ireq->ir_num)); security_req_classify_flow(req, flowi4_to_flowi(fl4)); rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) From b282705336e03fc7b9377a278939594870a40f96 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 22 Mar 2015 10:22:21 -0700 Subject: [PATCH 4/8] net: convert syn_wait_lock to a spinlock This is a low hanging fruit, as we'll get rid of syn_wait_lock eventually. We hold syn_wait_lock for such small sections, that it makes no sense to use a read/write lock. A spin lock is simply faster. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/request_sock.h | 11 +++-------- net/core/request_sock.c | 14 +++++++------- net/ipv4/inet_connection_sock.c | 8 ++++---- net/ipv4/inet_diag.c | 4 ++-- net/ipv4/tcp_ipv4.c | 12 ++++++------ net/ipv6/inet6_connection_sock.c | 4 ++-- 6 files changed, 24 insertions(+), 29 deletions(-) diff --git a/include/net/request_sock.h b/include/net/request_sock.h index 8603c350fad0..fe41f3ceb008 100644 --- a/include/net/request_sock.h +++ b/include/net/request_sock.h @@ -173,11 +173,6 @@ struct fastopen_queue { * %syn_wait_lock is necessary only to avoid proc interface having to grab the main * lock sock while browsing the listening hash (otherwise it's deadlock prone). * - * This lock is acquired in read mode only from listening_get_next() seq_file - * op and it's acquired in write mode _only_ from code that is actively - * changing rskq_accept_head. All readers that are holding the master sock lock - * don't need to grab this lock in read mode too as rskq_accept_head. writes - * are always protected from the main sock lock. */ struct request_sock_queue { struct request_sock *rskq_accept_head; @@ -192,7 +187,7 @@ struct request_sock_queue { */ /* temporary alignment, our goal is to get rid of this lock */ - rwlock_t syn_wait_lock ____cacheline_aligned_in_smp; + spinlock_t syn_wait_lock ____cacheline_aligned_in_smp; }; int reqsk_queue_alloc(struct request_sock_queue *queue, @@ -223,14 +218,14 @@ static inline void reqsk_queue_unlink(struct request_sock_queue *queue, struct listen_sock *lopt = queue->listen_opt; struct request_sock **prev; - write_lock(&queue->syn_wait_lock); + spin_lock(&queue->syn_wait_lock); prev = &lopt->syn_table[req->rsk_hash]; while (*prev != req) prev = &(*prev)->dl_next; *prev = req->dl_next; - write_unlock(&queue->syn_wait_lock); + spin_unlock(&queue->syn_wait_lock); if (del_timer(&req->rsk_timer)) reqsk_put(req); } diff --git a/net/core/request_sock.c b/net/core/request_sock.c index cdc0ddd9ac9f..87b22c0bc08c 100644 --- a/net/core/request_sock.c +++ b/net/core/request_sock.c @@ -58,14 +58,14 @@ int reqsk_queue_alloc(struct request_sock_queue *queue, return -ENOMEM; get_random_bytes(&lopt->hash_rnd, sizeof(lopt->hash_rnd)); - rwlock_init(&queue->syn_wait_lock); + spin_lock_init(&queue->syn_wait_lock); queue->rskq_accept_head = NULL; lopt->nr_table_entries = nr_table_entries; lopt->max_qlen_log = ilog2(nr_table_entries); - write_lock_bh(&queue->syn_wait_lock); + spin_lock_bh(&queue->syn_wait_lock); queue->listen_opt = lopt; - write_unlock_bh(&queue->syn_wait_lock); + spin_unlock_bh(&queue->syn_wait_lock); return 0; } @@ -81,10 +81,10 @@ static inline struct listen_sock *reqsk_queue_yank_listen_sk( { struct listen_sock *lopt; - write_lock_bh(&queue->syn_wait_lock); + spin_lock_bh(&queue->syn_wait_lock); lopt = queue->listen_opt; queue->listen_opt = NULL; - write_unlock_bh(&queue->syn_wait_lock); + spin_unlock_bh(&queue->syn_wait_lock); return lopt; } @@ -100,7 +100,7 @@ void reqsk_queue_destroy(struct request_sock_queue *queue) for (i = 0; i < lopt->nr_table_entries; i++) { struct request_sock *req; - write_lock_bh(&queue->syn_wait_lock); + spin_lock_bh(&queue->syn_wait_lock); while ((req = lopt->syn_table[i]) != NULL) { lopt->syn_table[i] = req->dl_next; atomic_inc(&lopt->qlen_dec); @@ -108,7 +108,7 @@ void reqsk_queue_destroy(struct request_sock_queue *queue) reqsk_put(req); reqsk_put(req); } - write_unlock_bh(&queue->syn_wait_lock); + spin_unlock_bh(&queue->syn_wait_lock); } } diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 711ab143d4cb..79c0c9439fdc 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -495,7 +495,7 @@ struct request_sock *inet_csk_search_req(struct sock *sk, u32 hash = inet_synq_hash(raddr, rport, lopt->hash_rnd, lopt->nr_table_entries); - write_lock(&icsk->icsk_accept_queue.syn_wait_lock); + spin_lock(&icsk->icsk_accept_queue.syn_wait_lock); for (req = lopt->syn_table[hash]; req != NULL; req = req->dl_next) { const struct inet_request_sock *ireq = inet_rsk(req); @@ -508,7 +508,7 @@ struct request_sock *inet_csk_search_req(struct sock *sk, break; } } - write_unlock(&icsk->icsk_accept_queue.syn_wait_lock); + spin_unlock(&icsk->icsk_accept_queue.syn_wait_lock); return req; } @@ -650,10 +650,10 @@ void reqsk_queue_hash_req(struct request_sock_queue *queue, setup_timer(&req->rsk_timer, reqsk_timer_handler, (unsigned long)req); req->rsk_hash = hash; - write_lock(&queue->syn_wait_lock); + spin_lock(&queue->syn_wait_lock); req->dl_next = lopt->syn_table[hash]; lopt->syn_table[hash] = req; - write_unlock(&queue->syn_wait_lock); + spin_unlock(&queue->syn_wait_lock); mod_timer_pinned(&req->rsk_timer, jiffies + timeout); } diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index f984b2001d0a..76322c9867d5 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -728,7 +728,7 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk, entry.family = sk->sk_family; - read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock); + spin_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock); lopt = icsk->icsk_accept_queue.listen_opt; if (!lopt || !listen_sock_qlen(lopt)) @@ -776,7 +776,7 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk, } out: - read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); + spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); return err; } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 5554b8f33d41..8028ad5920a4 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1909,13 +1909,13 @@ static void *listening_get_next(struct seq_file *seq, void *cur) } sk = sk_nulls_next(st->syn_wait_sk); st->state = TCP_SEQ_STATE_LISTENING; - read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); + spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); } else { icsk = inet_csk(sk); - read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock); + spin_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock); if (reqsk_queue_len(&icsk->icsk_accept_queue)) goto start_req; - read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); + spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); sk = sk_nulls_next(sk); } get_sk: @@ -1927,7 +1927,7 @@ static void *listening_get_next(struct seq_file *seq, void *cur) goto out; } icsk = inet_csk(sk); - read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock); + spin_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock); if (reqsk_queue_len(&icsk->icsk_accept_queue)) { start_req: st->uid = sock_i_uid(sk); @@ -1936,7 +1936,7 @@ static void *listening_get_next(struct seq_file *seq, void *cur) st->sbucket = 0; goto get_req; } - read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); + spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); } spin_unlock_bh(&ilb->lock); st->offset = 0; @@ -2155,7 +2155,7 @@ static void tcp_seq_stop(struct seq_file *seq, void *v) case TCP_SEQ_STATE_OPENREQ: if (v) { struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk); - read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); + spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); } case TCP_SEQ_STATE_LISTENING: if (v != SEQ_START_TOKEN) diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index 2f3bbe569e8f..6927f3fb5597 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -124,7 +124,7 @@ struct request_sock *inet6_csk_search_req(struct sock *sk, u32 hash = inet6_synq_hash(raddr, rport, lopt->hash_rnd, lopt->nr_table_entries); - write_lock(&icsk->icsk_accept_queue.syn_wait_lock); + spin_lock(&icsk->icsk_accept_queue.syn_wait_lock); for (req = lopt->syn_table[hash]; req != NULL; req = req->dl_next) { const struct inet_request_sock *ireq = inet_rsk(req); @@ -138,7 +138,7 @@ struct request_sock *inet6_csk_search_req(struct sock *sk, break; } } - write_unlock(&icsk->icsk_accept_queue.syn_wait_lock); + spin_unlock(&icsk->icsk_accept_queue.syn_wait_lock); return req; } From 26e3736090e1037ac929787df21c05497479b77f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 22 Mar 2015 10:22:22 -0700 Subject: [PATCH 5/8] ipv4: tcp: handle ICMP messages on TCP_NEW_SYN_RECV request sockets tcp_v4_err() can restrict lookups to ehash table, and not to listeners. Note this patch creates the infrastructure, but this means that ICMP messages for request sockets are ignored until complete conversion. New tcp_req_err() helper is exported so that we can use it in IPv6 in following patch. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 1 + net/ipv4/tcp_ipv4.c | 69 ++++++++++++++++++++++----------------------- 2 files changed, 35 insertions(+), 35 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index 1876262afd59..fe60e00e1919 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -447,6 +447,7 @@ const u8 *tcp_parse_md5sig_option(const struct tcphdr *th); void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb); void tcp_v4_mtu_reduced(struct sock *sk); +void tcp_req_err(struct sock *sk, u32 seq); int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb); struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 8028ad5920a4..a57615062b66 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -310,6 +310,34 @@ static void do_redirect(struct sk_buff *skb, struct sock *sk) dst->ops->redirect(dst, sk, skb); } + +/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */ +void tcp_req_err(struct sock *sk, u32 seq) +{ + struct request_sock *req = inet_reqsk(sk); + struct net *net = sock_net(sk); + + /* ICMPs are not backlogged, hence we cannot get + * an established socket here. + */ + WARN_ON(req->sk); + + if (seq != tcp_rsk(req)->snt_isn) { + NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); + } else { + /* + * Still in SYN_RECV, just remove it silently. + * There is no good way to pass the error to the newly + * created socket, and POSIX does not want network + * errors returned from accept(). + */ + inet_csk_reqsk_queue_drop(req->rsk_listener, req); + NET_INC_STATS_BH(net, LINUX_MIB_LISTENDROPS); + } + reqsk_put(req); +} +EXPORT_SYMBOL(tcp_req_err); + /* * This routine is called by the ICMP module when it gets some * sort of error condition. If err < 0 then the socket should @@ -343,8 +371,9 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) int err; struct net *net = dev_net(icmp_skb->dev); - sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest, - iph->saddr, th->source, inet_iif(icmp_skb)); + sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr, + th->dest, iph->saddr, ntohs(th->source), + inet_iif(icmp_skb)); if (!sk) { ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS); return; @@ -353,6 +382,9 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) inet_twsk_put(inet_twsk(sk)); return; } + seq = ntohl(th->seq); + if (sk->sk_state == TCP_NEW_SYN_RECV) + return tcp_req_err(sk, seq); bh_lock_sock(sk); /* If too many ICMPs get dropped on busy @@ -374,7 +406,6 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) icsk = inet_csk(sk); tp = tcp_sk(sk); - seq = ntohl(th->seq); /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */ fastopen = tp->fastopen_rsk; snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una; @@ -458,38 +489,6 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) } switch (sk->sk_state) { - struct request_sock *req; - case TCP_LISTEN: - if (sock_owned_by_user(sk)) - goto out; - - req = inet_csk_search_req(sk, th->dest, - iph->daddr, iph->saddr); - if (!req) - goto out; - - /* ICMPs are not backlogged, hence we cannot get - an established socket here. - */ - WARN_ON(req->sk); - - if (seq != tcp_rsk(req)->snt_isn) { - NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); - reqsk_put(req); - goto out; - } - - /* - * Still in SYN_RECV, just remove it silently. - * There is no good way to pass the error to the newly - * created socket, and POSIX does not want network - * errors returned from accept(). - */ - inet_csk_reqsk_queue_drop(sk, req); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); - reqsk_put(req); - goto out; - case TCP_SYN_SENT: case TCP_SYN_RECV: /* Only in fast or simultaneous open. If a fast open socket is From 2215089b224412bfb28c5ae823b2a5d4e28a49d7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 22 Mar 2015 10:22:23 -0700 Subject: [PATCH 6/8] ipv6: tcp: handle ICMP messages on TCP_NEW_SYN_RECV request sockets tcp_v6_err() can restrict lookups to ehash table, and not to listeners. Note this patch creates the infrastructure, but this means that ICMP messages for request sockets are ignored until complete conversion. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/tcp_ipv6.c | 47 ++++++++++++--------------------------------- 1 file changed, 12 insertions(+), 35 deletions(-) diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 6e3f90db038c..4a4e6d30c448 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -324,18 +324,20 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, { const struct ipv6hdr *hdr = (const struct ipv6hdr *)skb->data; const struct tcphdr *th = (struct tcphdr *)(skb->data+offset); + struct net *net = dev_net(skb->dev); + struct request_sock *fastopen; struct ipv6_pinfo *np; + struct tcp_sock *tp; + __u32 seq, snd_una; struct sock *sk; int err; - struct tcp_sock *tp; - struct request_sock *fastopen; - __u32 seq, snd_una; - struct net *net = dev_net(skb->dev); - sk = inet6_lookup(net, &tcp_hashinfo, &hdr->daddr, - th->dest, &hdr->saddr, th->source, skb->dev->ifindex); + sk = __inet6_lookup_established(net, &tcp_hashinfo, + &hdr->daddr, th->dest, + &hdr->saddr, ntohs(th->source), + skb->dev->ifindex); - if (sk == NULL) { + if (!sk) { ICMP6_INC_STATS_BH(net, __in6_dev_get(skb->dev), ICMP6_MIB_INERRORS); return; @@ -345,6 +347,9 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, inet_twsk_put(inet_twsk(sk)); return; } + seq = ntohl(th->seq); + if (sk->sk_state == TCP_NEW_SYN_RECV) + return tcp_req_err(sk, seq); bh_lock_sock(sk); if (sock_owned_by_user(sk) && type != ICMPV6_PKT_TOOBIG) @@ -359,7 +364,6 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, } tp = tcp_sk(sk); - seq = ntohl(th->seq); /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */ fastopen = tp->fastopen_rsk; snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una; @@ -403,33 +407,6 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, /* Might be for an request_sock */ switch (sk->sk_state) { - struct request_sock *req; - case TCP_LISTEN: - if (sock_owned_by_user(sk)) - goto out; - - /* Note : We use inet6_iif() here, not tcp_v6_iif() */ - req = inet6_csk_search_req(sk, th->dest, &hdr->daddr, - &hdr->saddr, inet6_iif(skb)); - if (!req) - goto out; - - /* ICMPs are not backlogged, hence we cannot get - * an established socket here. - */ - WARN_ON(req->sk != NULL); - - if (seq != tcp_rsk(req)->snt_isn) { - NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); - reqsk_put(req); - goto out; - } - - inet_csk_reqsk_queue_drop(sk, req); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); - reqsk_put(req); - goto out; - case TCP_SYN_SENT: case TCP_SYN_RECV: /* Only in fast or simultaneous open. If a fast open socket is From 85645bab57bfc6b0b43bb96a301c4ef83925c07d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 22 Mar 2015 10:22:24 -0700 Subject: [PATCH 7/8] ipv4: dccp: handle ICMP messages on DCCP_NEW_SYN_RECV request sockets dccp_v4_err() can restrict lookups to ehash table, and not to listeners. Note this patch creates the infrastructure, but this means that ICMP messages for request sockets are ignored until complete conversion. New dccp_req_err() helper is exported so that we can use it in IPv6 in following patch. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/dccp.h | 2 ++ net/dccp/dccp.h | 1 + net/dccp/ipv4.c | 70 +++++++++++++++++++++----------------------- 3 files changed, 37 insertions(+), 36 deletions(-) diff --git a/include/linux/dccp.h b/include/linux/dccp.h index 3dca24d3ac67..221025423e6c 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -43,6 +43,7 @@ enum dccp_state { DCCP_CLOSING = TCP_CLOSING, DCCP_TIME_WAIT = TCP_TIME_WAIT, DCCP_CLOSED = TCP_CLOSE, + DCCP_NEW_SYN_RECV = TCP_NEW_SYN_RECV, DCCP_PARTOPEN = TCP_MAX_STATES, DCCP_PASSIVE_CLOSEREQ, /* clients receiving CloseReq */ DCCP_MAX_STATES @@ -57,6 +58,7 @@ enum { DCCPF_CLOSING = TCPF_CLOSING, DCCPF_TIME_WAIT = TCPF_TIME_WAIT, DCCPF_CLOSED = TCPF_CLOSE, + DCCPF_NEW_SYN_RECV = TCPF_NEW_SYN_RECV, DCCPF_PARTOPEN = (1 << DCCP_PARTOPEN), }; diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index 2396f50c5b04..bebc735f5afc 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -317,6 +317,7 @@ int inet_dccp_listen(struct socket *sock, int backlog); unsigned int dccp_poll(struct file *file, struct socket *sock, poll_table *wait); int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len); +void dccp_req_err(struct sock *sk, u64 seq); struct sk_buff *dccp_ctl_make_reset(struct sock *sk, struct sk_buff *skb); int dccp_send_reset(struct sock *sk, enum dccp_reset_codes code); diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 1f7161e05403..6310b8b19598 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -195,6 +195,32 @@ static void dccp_do_redirect(struct sk_buff *skb, struct sock *sk) dst->ops->redirect(dst, sk, skb); } +void dccp_req_err(struct sock *sk, u64 seq) + { + struct request_sock *req = inet_reqsk(sk); + struct net *net = sock_net(sk); + + /* + * ICMPs are not backlogged, hence we cannot get an established + * socket here. + */ + WARN_ON(req->sk); + + if (!between48(seq, dccp_rsk(req)->dreq_iss, dccp_rsk(req)->dreq_gss)) { + NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); + } else { + /* + * Still in RESPOND, just remove it silently. + * There is no good way to pass the error to the newly + * created socket, and POSIX does not want network + * errors returned from accept(). + */ + inet_csk_reqsk_queue_drop(req->rsk_listener, req); + } + reqsk_put(req); +} +EXPORT_SYMBOL(dccp_req_err); + /* * This routine is called by the ICMP module when it gets some sort of error * condition. If err < 0 then the socket should be closed and the error @@ -227,10 +253,11 @@ static void dccp_v4_err(struct sk_buff *skb, u32 info) return; } - sk = inet_lookup(net, &dccp_hashinfo, - iph->daddr, dh->dccph_dport, - iph->saddr, dh->dccph_sport, inet_iif(skb)); - if (sk == NULL) { + sk = __inet_lookup_established(net, &dccp_hashinfo, + iph->daddr, dh->dccph_dport, + iph->saddr, ntohs(dh->dccph_sport), + inet_iif(skb)); + if (!sk) { ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS); return; } @@ -239,6 +266,9 @@ static void dccp_v4_err(struct sk_buff *skb, u32 info) inet_twsk_put(inet_twsk(sk)); return; } + seq = dccp_hdr_seq(dh); + if (sk->sk_state == DCCP_NEW_SYN_RECV) + return dccp_req_err(sk, seq); bh_lock_sock(sk); /* If too many ICMPs get dropped on busy @@ -251,7 +281,6 @@ static void dccp_v4_err(struct sk_buff *skb, u32 info) goto out; dp = dccp_sk(sk); - seq = dccp_hdr_seq(dh); if ((1 << sk->sk_state) & ~(DCCPF_REQUESTING | DCCPF_LISTEN) && !between48(seq, dp->dccps_awl, dp->dccps_awh)) { NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); @@ -288,37 +317,6 @@ static void dccp_v4_err(struct sk_buff *skb, u32 info) } switch (sk->sk_state) { - struct request_sock *req; - case DCCP_LISTEN: - if (sock_owned_by_user(sk)) - goto out; - req = inet_csk_search_req(sk, dh->dccph_dport, - iph->daddr, iph->saddr); - if (!req) - goto out; - - /* - * ICMPs are not backlogged, hence we cannot get an established - * socket here. - */ - WARN_ON(req->sk); - - if (!between48(seq, dccp_rsk(req)->dreq_iss, - dccp_rsk(req)->dreq_gss)) { - NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); - reqsk_put(req); - goto out; - } - /* - * Still in RESPOND, just remove it silently. - * There is no good way to pass the error to the newly - * created socket, and POSIX does not want network - * errors returned from accept(). - */ - inet_csk_reqsk_queue_drop(sk, req); - reqsk_put(req); - goto out; - case DCCP_REQUESTING: case DCCP_RESPOND: if (!sock_owned_by_user(sk)) { From 52036a43055b3aae6659841c45a809af2ad4535e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 22 Mar 2015 10:22:25 -0700 Subject: [PATCH 8/8] ipv6: dccp: handle ICMP messages on DCCP_NEW_SYN_RECV request sockets dccp_v6_err() can restrict lookups to ehash table, and not to listeners. Note this patch creates the infrastructure, but this means that ICMP messages for request sockets are ignored until complete conversion. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/dccp/ipv6.c | 41 ++++++++--------------------------------- 1 file changed, 8 insertions(+), 33 deletions(-) diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 69d8f13895ba..9d0551092c6c 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -85,11 +85,12 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, return; } - sk = inet6_lookup(net, &dccp_hashinfo, - &hdr->daddr, dh->dccph_dport, - &hdr->saddr, dh->dccph_sport, inet6_iif(skb)); + sk = __inet6_lookup_established(net, &dccp_hashinfo, + &hdr->daddr, dh->dccph_dport, + &hdr->saddr, ntohs(dh->dccph_sport), + inet6_iif(skb)); - if (sk == NULL) { + if (!sk) { ICMP6_INC_STATS_BH(net, __in6_dev_get(skb->dev), ICMP6_MIB_INERRORS); return; @@ -99,6 +100,9 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, inet_twsk_put(inet_twsk(sk)); return; } + seq = dccp_hdr_seq(dh); + if (sk->sk_state == DCCP_NEW_SYN_RECV) + return dccp_req_err(sk, seq); bh_lock_sock(sk); if (sock_owned_by_user(sk)) @@ -108,7 +112,6 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, goto out; dp = dccp_sk(sk); - seq = dccp_hdr_seq(dh); if ((1 << sk->sk_state) & ~(DCCPF_REQUESTING | DCCPF_LISTEN) && !between48(seq, dp->dccps_awl, dp->dccps_awh)) { NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); @@ -149,34 +152,6 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, /* Might be for an request_sock */ switch (sk->sk_state) { - struct request_sock *req; - case DCCP_LISTEN: - if (sock_owned_by_user(sk)) - goto out; - - req = inet6_csk_search_req(sk, dh->dccph_dport, - &hdr->daddr, &hdr->saddr, - inet6_iif(skb)); - if (!req) - goto out; - - /* - * ICMPs are not backlogged, hence we cannot get an established - * socket here. - */ - WARN_ON(req->sk != NULL); - - if (!between48(seq, dccp_rsk(req)->dreq_iss, - dccp_rsk(req)->dreq_gss)) { - NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); - reqsk_put(req); - goto out; - } - - inet_csk_reqsk_queue_drop(sk, req); - reqsk_put(req); - goto out; - case DCCP_REQUESTING: case DCCP_RESPOND: /* Cannot happen. It can, it SYNs are crossed. --ANK */