From 64dd7e6376f01a50a9ce9aec1a0e11f5e69cf814 Mon Sep 17 00:00:00 2001 From: chao an Date: Tue, 10 Jan 2023 13:41:02 +0800 Subject: [PATCH] net/tcp: add Selective-ACK support Reference: https://datatracker.ietf.org/doc/html/rfc2018 Iperf2 client/server test on esp32c3: Drop(1/50): CONFIG_NET_TCP_DEBUG_DROP_SEND=y CONFIG_NET_TCP_DEBUG_DROP_SEND_PROBABILITY=50 // Drop probability: 1/50 CONFIG_NET_TCP_DEBUG_DROP_RECV=y CONFIG_NET_TCP_DEBUG_DROP_RECV_PROBABILITY=50 // Drop probability: 1/50 Drop(1/50) + OFO/SACK: CONFIG_NET_TCP_DEBUG_DROP_SEND=y CONFIG_NET_TCP_DEBUG_DROP_SEND_PROBABILITY=50 // Drop probability: 1/50 CONFIG_NET_TCP_DEBUG_DROP_RECV=y CONFIG_NET_TCP_DEBUG_DROP_RECV_PROBABILITY=50 // Drop probability: 1/50 CONFIG_NET_TCP_OUT_OF_ORDER=y CONFIG_NET_TCP_SELECTIVE_ACK=y --------------------------------------------------------- | TCP Config | Server | Client | | |-------------------------------------------------------| | Original | 12 | 9 | Mbits/sec | | Drop(1/50) | 0.6 | 0.3 | Mbits/sec | | Drop(1/50) + OFO/SACK | 8 | 8 | Mbits/sec | --------------------------------------------------------- Signed-off-by: chao an --- include/nuttx/net/tcp.h | 9 +- net/tcp/Kconfig | 12 ++ net/tcp/tcp.h | 29 ++++ net/tcp/tcp_input.c | 99 +++++------ net/tcp/tcp_send.c | 53 +++++- net/tcp/tcp_send_buffered.c | 318 +++++++++++++++++++++++++++--------- 6 files changed, 392 insertions(+), 128 deletions(-) diff --git a/include/nuttx/net/tcp.h b/include/nuttx/net/tcp.h index 1a983f3473a..83e4fa1b05b 100644 --- a/include/nuttx/net/tcp.h +++ b/include/nuttx/net/tcp.h @@ -77,10 +77,13 @@ #define TCP_OPT_NOOP 1 /* "No-operation" TCP option */ #define TCP_OPT_MSS 2 /* Maximum segment size TCP option */ #define TCP_OPT_WS 3 /* Window size scaling factor */ +#define TCP_OPT_SACK_PERM 4 /* Selective-ACK Permitted option */ +#define TCP_OPT_SACK 5 /* Selective-ACK Block option */ -#define TCP_OPT_NOOP_LEN 1 /* Length of TCP NOOP option. 
*/ -#define TCP_OPT_MSS_LEN 4 /* Length of TCP MSS option. */ -#define TCP_OPT_WS_LEN 3 /* Length of TCP WS option. */ +#define TCP_OPT_NOOP_LEN 1 /* Length of TCP NOOP option. */ +#define TCP_OPT_MSS_LEN 4 /* Length of TCP MSS option. */ +#define TCP_OPT_WS_LEN 3 /* Length of TCP WS option. */ +#define TCP_OPT_SACK_PERM_LEN 2 /* Length of TCP SACK-Permitted option. */ /* The TCP states used in the struct tcp_conn_s tcpstateflags field */ diff --git a/net/tcp/Kconfig b/net/tcp/Kconfig index 64208e76b2c..b53c20da74a 100644 --- a/net/tcp/Kconfig +++ b/net/tcp/Kconfig @@ -151,6 +151,18 @@ config NET_TCP_OUT_OF_ORDER_BUFSIZE endif # NET_TCP_OUT_OF_ORDER +config NET_TCP_SELECTIVE_ACK + bool "Enable TCP/IP Selective Acknowledgment Options" + default n + select NET_TCP_OUT_OF_ORDER + ---help--- + Enable RFC 2018 (TCP Selective Acknowledgment Options): + Selective Acknowledgment (SACK) improves loss recovery when + multiple segments are dropped from one window. With selective + acknowledgments, the data receiver can inform the sender about all + segments that have arrived successfully, so the sender need + retransmit only the segments that have actually been lost. + config NET_TCP_NOTIFIER bool "Support TCP notifications" default n diff --git a/net/tcp/tcp.h b/net/tcp/tcp.h index bac6bc94dea..999b001ea2e 100644 --- a/net/tcp/tcp.h +++ b/net/tcp/tcp.h @@ -105,6 +105,7 @@ /* The TCP options flags */ #define TCP_WSCALE 0x01U /* Window Scale option enabled */ +#define TCP_SACK 0x02U /* Selective ACKs enabled */ /* The Max Range count of TCP Selective ACKs */ @@ -157,6 +158,14 @@ struct tcp_ofoseg_s FAR struct iob_s *data; /* Out-of-order buffering */ }; +/* SACK ranges to include in ACK packets. */ + +struct tcp_sack_s +{ + uint32_t left; /* Left edge of the SACK */ + uint32_t right; /* Right edge of the SACK */ +}; + struct tcp_conn_s { /* Common prologue of all connection structures. 
*/ @@ -2143,6 +2152,26 @@ uint16_t tcpip_hdrsize(FAR struct tcp_conn_s *conn); int tcp_ofoseg_bufsize(FAR struct tcp_conn_s *conn); +/**************************************************************************** + * Name: tcp_reorder_ofosegs + * + * Description: + * Sort out-of-order segments by left edge + * + * Input Parameters: + * nofosegs - Number of out-of-order segments + * ofosegs - Pointer to out-of-order segments + * + * Returned Value: + * True if re-order occurs + * + * Assumptions: + * The network is locked. + * + ****************************************************************************/ + +bool tcp_reorder_ofosegs(int nofosegs, FAR struct tcp_ofoseg_s *ofosegs); + #ifdef __cplusplus } #endif diff --git a/net/tcp/tcp_input.c b/net/tcp/tcp_input.c index b4b7b052928..001dea54d66 100644 --- a/net/tcp/tcp_input.c +++ b/net/tcp/tcp_input.c @@ -396,52 +396,6 @@ static bool tcp_rebuild_ofosegs(FAR struct tcp_conn_s *conn, return (ofoseg->data == NULL); } -/**************************************************************************** - * Name: tcp_reorder_ofosegs - * - * Description: - * Sort out-of-order segments by left edge - * - * Input Parameters: - * nofosegs - Number of out-of-order semgnets - * ofosegs - Pointer to out-of-order segments - * - * Returned Value: - * True if re-order occurs - * - * Assumptions: - * The network is locked. 
- * - ****************************************************************************/ - -static bool tcp_reorder_ofosegs(int nofosegs, - FAR struct tcp_ofoseg_s *ofosegs) -{ - struct tcp_ofoseg_s segs; - bool reordered = false; - int i; - int j; - - /* Sort out-of-order segments by left edge */ - - for (i = 0; i < nofosegs - 1; i++) - { - for (j = 0; j < nofosegs - 1 - i; j++) - { - if (TCP_SEQ_GT(ofosegs[j].left, - ofosegs[j + 1].left)) - { - segs = ofosegs[j]; - ofosegs[j] = ofosegs[j + 1]; - ofosegs[j + 1] = segs; - reordered = true; - } - } - } - - return reordered; -} - /**************************************************************************** * Name: tcp_input_ofosegs * @@ -637,6 +591,14 @@ static void tcp_parse_option(FAR struct net_driver_s *dev, conn->rcv_scale = CONFIG_NET_TCP_WINDOW_SCALE_FACTOR; conn->flags |= TCP_WSCALE; } +#endif +#ifdef CONFIG_NET_TCP_SELECTIVE_ACK + else if (opt == TCP_OPT_SACK_PERM && + IPDATA(tcpiplen + 1 + i) == + TCP_OPT_SACK_PERM_LEN) + { + conn->flags |= TCP_SACK; + } #endif else { @@ -1627,6 +1589,51 @@ drop: * Public Functions ****************************************************************************/ +/**************************************************************************** + * Name: tcp_reorder_ofosegs + * + * Description: + * Sort out-of-order segments by left edge + * + * Input Parameters: + * nofosegs - Number of out-of-order segments + * ofosegs - Pointer to out-of-order segments + * + * Returned Value: + * True if re-order occurs + * + * Assumptions: + * The network is locked. 
+ * + ****************************************************************************/ + +bool tcp_reorder_ofosegs(int nofosegs, FAR struct tcp_ofoseg_s *ofosegs) +{ + struct tcp_ofoseg_s segs; + bool reordered = false; + int i; + int j; + + /* Sort out-of-order segments by left edge */ + + for (i = 0; i < nofosegs - 1; i++) + { + for (j = 0; j < nofosegs - 1 - i; j++) + { + if (TCP_SEQ_GT(ofosegs[j].left, + ofosegs[j + 1].left)) + { + segs = ofosegs[j]; + ofosegs[j] = ofosegs[j + 1]; + ofosegs[j + 1] = segs; + reordered = true; + } + } + } + + return reordered; +} + /**************************************************************************** * Name: tcp_ipv4_input * diff --git a/net/tcp/tcp_send.c b/net/tcp/tcp_send.c index db3d38835e1..0c57c32e8e5 100644 --- a/net/tcp/tcp_send.c +++ b/net/tcp/tcp_send.c @@ -274,10 +274,44 @@ void tcp_send(FAR struct net_driver_s *dev, FAR struct tcp_conn_s *conn, return; } - tcp = tcp_header(dev); - tcp->flags = flags; - dev->d_len = len; - tcp->tcpoffset = (TCP_HDRLEN / 4) << 4; + tcp = tcp_header(dev); + tcp->flags = flags; + dev->d_len = len; + +#ifdef CONFIG_NET_TCP_SELECTIVE_ACK + if ((conn->flags & TCP_SACK) && (flags == TCP_ACK) && conn->nofosegs > 0) + { + int optlen = conn->nofosegs * sizeof(struct tcp_sack_s); + int i; + + tcp->optdata[0] = TCP_OPT_NOOP; + tcp->optdata[1] = TCP_OPT_NOOP; + tcp->optdata[2] = TCP_OPT_SACK; + tcp->optdata[3] = TCP_OPT_SACK_PERM_LEN + optlen; + + optlen += 4; + + for (i = 0; i < conn->nofosegs; i++) + { + ninfo("TCP SACK [%d]" + "[%" PRIu32 " : %" PRIu32 " : %" PRIu32 "]\n", i, + conn->ofosegs[i].left, conn->ofosegs[i].right, + TCP_SEQ_SUB(conn->ofosegs[i].right, conn->ofosegs[i].left)); + tcp_setsequence(&tcp->optdata[4 + i * 2 * sizeof(uint32_t)], + conn->ofosegs[i].left); + tcp_setsequence(&tcp->optdata[4 + (i * 2 + 1) * sizeof(uint32_t)], + conn->ofosegs[i].right); + } + + dev->d_len += optlen; + tcp->tcpoffset = ((TCP_HDRLEN + optlen) / 4) << 4; + } + else +#endif /* 
CONFIG_NET_TCP_SELECTIVE_ACK */ + { + tcp->tcpoffset = (TCP_HDRLEN / 4) << 4; + } + tcp_sendcommon(dev, conn, tcp); #if defined(CONFIG_NET_STATISTICS) && \ @@ -597,6 +631,17 @@ void tcp_synack(FAR struct net_driver_s *dev, FAR struct tcp_conn_s *conn, } #endif +#ifdef CONFIG_NET_TCP_SELECTIVE_ACK + if (tcp->flags == TCP_SYN || + ((tcp->flags == (TCP_ACK | TCP_SYN)) && (conn->flags & TCP_SACK))) + { + tcp->optdata[optlen++] = TCP_OPT_NOOP; + tcp->optdata[optlen++] = TCP_OPT_NOOP; + tcp->optdata[optlen++] = TCP_OPT_SACK_PERM; + tcp->optdata[optlen++] = TCP_OPT_SACK_PERM_LEN; + } +#endif + tcp->tcpoffset = ((TCP_HDRLEN + optlen) / 4) << 4; dev->d_len += optlen; diff --git a/net/tcp/tcp_send_buffered.c b/net/tcp/tcp_send_buffered.c index 925d08bb12f..62651169c50 100644 --- a/net/tcp/tcp_send_buffered.c +++ b/net/tcp/tcp_send_buffered.c @@ -170,6 +170,80 @@ static void psock_writebuffer_notify(FAR struct tcp_conn_s *conn) # define psock_writebuffer_notify(conn) #endif +static void retransmit_segment(FAR struct tcp_conn_s *conn, + FAR struct tcp_wrbuffer_s *wrb) +{ + uint16_t sent; + + /* Reset the number of bytes sent from the write buffer */ + + sent = TCP_WBSENT(wrb); + if (conn->tx_unacked > sent) + { + conn->tx_unacked -= sent; + } + else + { + conn->tx_unacked = 0; + } + + if (conn->sent > sent) + { + conn->sent -= sent; + } + else + { + conn->sent = 0; + } + + TCP_WBSENT(wrb) = 0; + ninfo("REXMIT: wrb=%p sent=%u, " + "conn tx_unacked=%" PRId32 " sent=%" PRId32 "\n", + wrb, TCP_WBSENT(wrb), conn->tx_unacked, conn->sent); + + /* Free any write buffers that have exceeded the retry count */ + + if (++TCP_WBNRTX(wrb) >= TCP_MAXRTX) + { + nwarn("WARNING: Expiring wrb=%p nrtx=%u\n", + wrb, TCP_WBNRTX(wrb)); + + /* Return the write buffer to the free list */ + + tcp_wrbuffer_release(wrb); + + /* Notify any waiters if the write buffers have been + * drained. 
+ */ + + psock_writebuffer_notify(conn); + + /* NOTE expired is different from un-ACKed, it is designed + * to represent the number of segments that have been sent, + * retransmitted, and un-ACKed, if expired is not zero, the + * connection will be closed. + * + * field expired can only be updated at TCP_ESTABLISHED + * state + */ + + conn->expired++; + } + else + { + /* Insert the write buffer into the write_q (in sequence + * number order). The retransmission will occur below + * when the write buffer with the lowest sequence number + * is pulled from the write_q again. + */ + + ninfo("REXMIT: Moving wrb=%p nrtx=%u\n", + wrb, TCP_WBNRTX(wrb)); + + psock_insert_segment(wrb, &conn->write_q); + } +} + /**************************************************************************** * Name: psock_lost_connection * @@ -285,6 +359,97 @@ static inline void send_ipselect(FAR struct net_driver_s *dev, } #endif +/**************************************************************************** + * Name: parse_sack + * + * Description: + * Parse sack from incoming TCP options + * + * Input Parameters: + * conn - The TCP connection of interest + * tcp - Header of tcp structure + * segs - Receives the left/right edges of each SACK block + * + * Returned Value: + * Number of sacks + * + * Assumptions: + * The network is locked. + * + ****************************************************************************/ + +#ifdef CONFIG_NET_TCP_SELECTIVE_ACK +static int parse_sack(FAR struct tcp_conn_s *conn, FAR struct tcp_hdr_s *tcp, + FAR struct tcp_ofoseg_s *segs) +{ + FAR struct tcp_sack_s *sacks; + int nsack = 0; + uint8_t opt; + int i; + + /* Walk the TCP option bytes that follow + * the fixed part of the TCP header + */ + + for (i = 0; i < ((tcp->tcpoffset >> 4) - 5) << 2 ; ) + { + opt = *(tcp->optdata + i); + if (opt == TCP_OPT_END) + { + /* End of options. */ + + break; + } + else if (opt == TCP_OPT_NOOP) + { + /* NOP option. 
*/ + + ++i; + continue; + } + else if (opt == TCP_OPT_SACK) + { + nsack = (*(tcp->optdata + 1 + i) - + TCP_OPT_SACK_PERM_LEN) / + (sizeof(uint32_t) * 2); + sacks = (FAR struct tcp_sack_s *) + (tcp->optdata + i + + TCP_OPT_SACK_PERM_LEN); + + for (i = 0; i < nsack; i++) + { + segs[i].left = tcp_getsequence((uint8_t *)&sacks[i].left); + segs[i].right = tcp_getsequence((uint8_t *)&sacks[i].right); + } + + tcp_reorder_ofosegs(nsack, segs); + + break; + } + else + { + /* All other options have a length field, + * so that we easily can skip past them. + */ + + if (*(tcp->optdata + 1 + i) == 0) + { + /* If the length field is zero, + * the options are malformed and + * we don't process them further. + */ + + break; + } + } + + i += *(tcp->optdata + 1 + i); + } + + return nsack; +} +#endif /* CONFIG_NET_TCP_SELECTIVE_ACK */ + /**************************************************************************** * Name: psock_send_eventhandler * @@ -309,6 +474,10 @@ static uint16_t psock_send_eventhandler(FAR struct net_driver_s *dev, FAR void *pvpriv, uint16_t flags) { FAR struct tcp_conn_s *conn = pvpriv; +#ifdef CONFIG_NET_TCP_SELECTIVE_ACK + struct tcp_ofoseg_s ofosegs[TCP_SACK_RANGES_MAX]; + uint8_t nsacks = 0; +#endif #ifdef CONFIG_NET_TCP_FAST_RETRANSMIT uint32_t rexmitno = 0; #endif @@ -458,7 +627,6 @@ static uint16_t psock_send_eventhandler(FAR struct net_driver_s *dev, wrb, TCP_WBSEQNO(wrb), TCP_WBPKTLEN(wrb)); } } -#ifdef CONFIG_NET_TCP_FAST_RETRANSMIT else if (ackno == TCP_WBSEQNO(wrb)) { /* Reset the duplicate ack counter */ @@ -472,16 +640,33 @@ static uint16_t psock_send_eventhandler(FAR struct net_driver_s *dev, if (++TCP_WBNACK(wrb) == TCP_FAST_RETRANSMISSION_THRESH) { - /* Do fast retransmit */ +#ifdef CONFIG_NET_TCP_SELECTIVE_ACK + if ((conn->flags & TCP_SACK) && + (tcp->tcpoffset & 0xf0) > 0x50) + { + /* Parse s-ack from tcp options */ - rexmitno = ackno; + nsacks = parse_sack(conn, tcp, ofosegs); - /* Reset counter */ + flags |= TCP_REXMIT; + } +#ifdef 
CONFIG_NET_TCP_FAST_RETRANSMIT + else +#endif +#endif + { +#ifdef CONFIG_NET_TCP_FAST_RETRANSMIT + /* Do fast retransmit */ - TCP_WBNACK(wrb) = 0; + rexmitno = ackno; +#endif + + /* Reset counter */ + + TCP_WBNACK(wrb) = 0; + } } } -#endif } /* A special case is the head of the write_q which may be partially @@ -613,6 +798,57 @@ static uint16_t psock_send_eventhandler(FAR struct net_driver_s *dev, } #endif +#ifdef CONFIG_NET_TCP_SELECTIVE_ACK + + /* Check if we are being asked to retransmit s-ack data */ + + if (nsacks > 0) + { + FAR struct tcp_wrbuffer_s *wrb; + FAR sq_entry_t *entry; + FAR sq_entry_t *next; + uint32_t right; + int i; + + /* Dump s-ack edge */ + + for (i = 0, right = 0; i < nsacks; i++) + { + ninfo("TCP SACK [%d]" + "[%" PRIu32 " : %" PRIu32 " : %" PRIu32 "]\n", + i, ofosegs[i].left, ofosegs[i].right, + TCP_SEQ_SUB(ofosegs[i].right, ofosegs[i].left)); + } + + for (entry = sq_peek(&conn->unacked_q); entry; entry = next) + { + wrb = (FAR struct tcp_wrbuffer_s *)entry; + next = sq_next(entry); + + for (i = 0, right = 0; i < nsacks; i++) + { + /* Wrb seqno out of s-ack edge ? do retransmit ! 
*/ + + if (TCP_SEQ_LT(TCP_WBSEQNO(wrb), ofosegs[i].left) && + TCP_SEQ_GTE(TCP_WBSEQNO(wrb), right)) + { + ninfo("TCP REXMIT " + "[%" PRIu32 " : %" PRIu32 " : %d]\n", + TCP_WBSEQNO(wrb), + TCP_SEQ_ADD(TCP_WBSEQNO(wrb), TCP_WBPKTLEN(wrb)), + TCP_WBPKTLEN(wrb)); + sq_rem(entry, &conn->unacked_q); + retransmit_segment(conn, (FAR void *)entry); + break; + } + + right = ofosegs[i].right; + } + } + } + else +#endif + /* Check if we are being asked to retransmit data */ if ((flags & TCP_REXMIT) != 0) @@ -706,75 +942,7 @@ static uint16_t psock_send_eventhandler(FAR struct net_driver_s *dev, while ((entry = sq_remlast(&conn->unacked_q)) != NULL) { - wrb = (FAR struct tcp_wrbuffer_s *)entry; - uint16_t sent; - - /* Reset the number of bytes sent sent from the write buffer */ - - sent = TCP_WBSENT(wrb); - if (conn->tx_unacked > sent) - { - conn->tx_unacked -= sent; - } - else - { - conn->tx_unacked = 0; - } - - if (conn->sent > sent) - { - conn->sent -= sent; - } - else - { - conn->sent = 0; - } - - TCP_WBSENT(wrb) = 0; - ninfo("REXMIT: wrb=%p sent=%u, " - "conn tx_unacked=%" PRId32 " sent=%" PRId32 "\n", - wrb, TCP_WBSENT(wrb), conn->tx_unacked, conn->sent); - - /* Free any write buffers that have exceed the retry count */ - - if (++TCP_WBNRTX(wrb) >= TCP_MAXRTX) - { - nwarn("WARNING: Expiring wrb=%p nrtx=%u\n", - wrb, TCP_WBNRTX(wrb)); - - /* Return the write buffer to the free list */ - - tcp_wrbuffer_release(wrb); - - /* Notify any waiters if the write buffers have been - * drained. - */ - - psock_writebuffer_notify(conn); - - /* NOTE expired is different from un-ACKed, it is designed to - * represent the number of segments that have been sent, - * retransmitted, and un-ACKed, if expired is not zero, the - * connection will be closed. - * - * field expired can only be updated at TCP_ESTABLISHED state - */ - - conn->expired++; - continue; - } - else - { - /* Insert the write buffer into the write_q (in sequence - * number order). 
The retransmission will occur below - * when the write buffer with the lowest sequence number - * is pulled from the write_q again. - */ - - ninfo("REXMIT: Moving wrb=%p nrtx=%u\n", wrb, TCP_WBNRTX(wrb)); - - psock_insert_segment(wrb, &conn->write_q); - } + retransmit_segment(conn, (FAR void *)entry); } }