net/tcp: add Selective-ACK support

Reference:
https://datatracker.ietf.org/doc/html/rfc2018

Iperf2 client/server test on esp32c3:

Drop(1/50):
CONFIG_NET_TCP_DEBUG_DROP_SEND=y
CONFIG_NET_TCP_DEBUG_DROP_SEND_PROBABILITY=50  // Drop probability: 1/50
CONFIG_NET_TCP_DEBUG_DROP_RECV=y
CONFIG_NET_TCP_DEBUG_DROP_RECV_PROBABILITY=50  // Drop probability: 1/50

Drop(1/50) + OFO/SACK:
CONFIG_NET_TCP_DEBUG_DROP_SEND=y
CONFIG_NET_TCP_DEBUG_DROP_SEND_PROBABILITY=50  // Drop probability: 1/50
CONFIG_NET_TCP_DEBUG_DROP_RECV=y
CONFIG_NET_TCP_DEBUG_DROP_RECV_PROBABILITY=50  // Drop probability: 1/50

CONFIG_NET_TCP_OUT_OF_ORDER=y
CONFIG_NET_TCP_SELECTIVE_ACK=y

---------------------------------------------------------
|  TCP Config            | Server | Client |            |
|-------------------------------------------------------|
|  Original              |   12   |     9  |  Mbits/sec |
|  Drop(1/50)            |  0.6   |   0.3  |  Mbits/sec |
|  Drop(1/50) + OFO/SACK |    8   |     8  |  Mbits/sec |
---------------------------------------------------------

Signed-off-by: chao an <anchao@xiaomi.com>
This commit is contained in:
chao an 2023-01-10 13:41:02 +08:00 committed by Xiang Xiao
parent c581cc5f9b
commit 64dd7e6376
6 changed files with 392 additions and 128 deletions

View File

@ -77,10 +77,13 @@
#define TCP_OPT_NOOP 1 /* "No-operation" TCP option */
#define TCP_OPT_MSS 2 /* Maximum segment size TCP option */
#define TCP_OPT_WS 3 /* Window size scaling factor */
#define TCP_OPT_SACK_PERM 4 /* Selective-ACK Permitted option */
#define TCP_OPT_SACK 5 /* Selective-ACK Block option */
#define TCP_OPT_NOOP_LEN 1 /* Length of TCP NOOP option. */
#define TCP_OPT_MSS_LEN 4 /* Length of TCP MSS option. */
#define TCP_OPT_WS_LEN 3 /* Length of TCP WS option. */
#define TCP_OPT_SACK_PERM_LEN 2 /* Length of TCP SACK-Permitted option. */
/* The TCP states used in the struct tcp_conn_s tcpstateflags field */

View File

@ -151,6 +151,18 @@ config NET_TCP_OUT_OF_ORDER_BUFSIZE
endif # NET_TCP_OUT_OF_ORDER
config NET_TCP_SELECTIVE_ACK
bool "Enable TCP/IP Selective Acknowledgment Options"
default n
select NET_TCP_OUT_OF_ORDER
---help---
Enable RFC 2018 (TCP Selective Acknowledgment Options):
TCP may experience poor performance when multiple segments are lost
from one window of data. Selective Acknowledgment (SACK) is a
strategy which corrects this behavior in the face of multiple dropped
segments. With selective acknowledgments, the data receiver can inform
the sender about all segments that have arrived successfully, so the
sender need retransmit only the segments that have actually been lost.
config NET_TCP_NOTIFIER
bool "Support TCP notifications"
default n

View File

@ -105,6 +105,7 @@
/* The TCP options flags */
#define TCP_WSCALE 0x01U /* Window Scale option enabled */
#define TCP_SACK 0x02U /* Selective ACKs enabled */
/* The Max Range count of TCP Selective ACKs */
@ -157,6 +158,14 @@ struct tcp_ofoseg_s
FAR struct iob_s *data; /* Out-of-order buffering */
};
/* SACK ranges to include in ACK packets.
 *
 * One entry holds the pair of 32-bit sequence numbers that delimit a
 * single SACK block.
 */
struct tcp_sack_s
{
uint32_t left; /* Left edge (sequence number) of the SACK block */
uint32_t right; /* Right edge (sequence number) of the SACK block */
};
struct tcp_conn_s
{
/* Common prologue of all connection structures. */
@ -2143,6 +2152,26 @@ uint16_t tcpip_hdrsize(FAR struct tcp_conn_s *conn);
int tcp_ofoseg_bufsize(FAR struct tcp_conn_s *conn);
/****************************************************************************
* Name: tcp_reorder_ofosegs
*
* Description:
* Sort an array of out-of-order segments in place by their left edge
*
* Input Parameters:
* nofosegs - Number of out-of-order segments in the array
* ofosegs - Pointer to the array of out-of-order segments
*
* Returned Value:
* True if at least one pair of segments was re-ordered
*
* Assumptions:
* The network is locked.
*
****************************************************************************/
bool tcp_reorder_ofosegs(int nofosegs, FAR struct tcp_ofoseg_s *ofosegs);
#ifdef __cplusplus
}
#endif

View File

@ -396,52 +396,6 @@ static bool tcp_rebuild_ofosegs(FAR struct tcp_conn_s *conn,
return (ofoseg->data == NULL);
}
/****************************************************************************
 * Name: tcp_reorder_ofosegs
 *
 * Description:
 *   Bubble-sort the out-of-order segment array in place so that the left
 *   edges are in ascending sequence-number order.
 *
 * Input Parameters:
 *   nofosegs - Number of out-of-order segments
 *   ofosegs  - Pointer to out-of-order segments
 *
 * Returned Value:
 *   True if at least one pair of segments was swapped
 *
 * Assumptions:
 *   The network is locked.
 *
 ****************************************************************************/

static bool tcp_reorder_ofosegs(int nofosegs,
                                FAR struct tcp_ofoseg_s *ofosegs)
{
  struct tcp_ofoseg_s tmp;
  bool swapped = false;
  int last;
  int k;

  /* After each pass the largest remaining left edge has bubbled up to
   * index 'last', so the following pass can stop one element earlier.
   */

  for (last = nofosegs - 1; last > 0; last--)
    {
      for (k = 0; k < last; k++)
        {
          if (!TCP_SEQ_GT(ofosegs[k].left, ofosegs[k + 1].left))
            {
              continue;
            }

          tmp            = ofosegs[k];
          ofosegs[k]     = ofosegs[k + 1];
          ofosegs[k + 1] = tmp;
          swapped        = true;
        }
    }

  return swapped;
}
/****************************************************************************
* Name: tcp_input_ofosegs
*
@ -637,6 +591,14 @@ static void tcp_parse_option(FAR struct net_driver_s *dev,
conn->rcv_scale = CONFIG_NET_TCP_WINDOW_SCALE_FACTOR;
conn->flags |= TCP_WSCALE;
}
#endif
#ifdef CONFIG_NET_TCP_SELECTIVE_ACK
else if (opt == TCP_OPT_SACK_PERM &&
IPDATA(tcpiplen + 1 + i) ==
TCP_OPT_SACK_PERM_LEN)
{
conn->flags |= TCP_SACK;
}
#endif
else
{
@ -1627,6 +1589,51 @@ drop:
* Public Functions
****************************************************************************/
/****************************************************************************
 * Name: tcp_reorder_ofosegs
 *
 * Description:
 *   Bubble-sort the out-of-order segment array in place so that the left
 *   edges are in ascending sequence-number order.
 *
 * Input Parameters:
 *   nofosegs - Number of out-of-order segments
 *   ofosegs  - Pointer to out-of-order segments
 *
 * Returned Value:
 *   True if at least one pair of segments was swapped
 *
 * Assumptions:
 *   The network is locked.
 *
 ****************************************************************************/

bool tcp_reorder_ofosegs(int nofosegs, FAR struct tcp_ofoseg_s *ofosegs)
{
  struct tcp_ofoseg_s tmp;
  bool swapped = false;
  int last;
  int k;

  /* After each pass the largest remaining left edge has bubbled up to
   * index 'last', so the following pass can stop one element earlier.
   */

  for (last = nofosegs - 1; last > 0; last--)
    {
      for (k = 0; k < last; k++)
        {
          if (!TCP_SEQ_GT(ofosegs[k].left, ofosegs[k + 1].left))
            {
              continue;
            }

          tmp            = ofosegs[k];
          ofosegs[k]     = ofosegs[k + 1];
          ofosegs[k + 1] = tmp;
          swapped        = true;
        }
    }

  return swapped;
}
/****************************************************************************
* Name: tcp_ipv4_input
*

View File

@ -277,7 +277,41 @@ void tcp_send(FAR struct net_driver_s *dev, FAR struct tcp_conn_s *conn,
tcp = tcp_header(dev);
tcp->flags = flags;
dev->d_len = len;
#ifdef CONFIG_NET_TCP_SELECTIVE_ACK
if ((conn->flags & TCP_SACK) && (flags == TCP_ACK) && conn->nofosegs > 0)
{
int optlen = conn->nofosegs * sizeof(struct tcp_sack_s);
int i;
tcp->optdata[0] = TCP_OPT_NOOP;
tcp->optdata[1] = TCP_OPT_NOOP;
tcp->optdata[2] = TCP_OPT_SACK;
tcp->optdata[3] = TCP_OPT_SACK_PERM_LEN + optlen;
optlen += 4;
for (i = 0; i < conn->nofosegs; i++)
{
ninfo("TCP SACK [%d]"
"[%" PRIu32 " : %" PRIu32 " : %" PRIu32 "]\n", i,
conn->ofosegs[i].left, conn->ofosegs[i].right,
TCP_SEQ_SUB(conn->ofosegs[i].right, conn->ofosegs[i].left));
tcp_setsequence(&tcp->optdata[4 + i * 2 * sizeof(uint32_t)],
conn->ofosegs[i].left);
tcp_setsequence(&tcp->optdata[4 + (i * 2 + 1) * sizeof(uint32_t)],
conn->ofosegs[i].right);
}
dev->d_len += optlen;
tcp->tcpoffset = ((TCP_HDRLEN + optlen) / 4) << 4;
}
else
#endif /* CONFIG_NET_TCP_SELECTIVE_ACK */
{
tcp->tcpoffset = (TCP_HDRLEN / 4) << 4;
}
tcp_sendcommon(dev, conn, tcp);
#if defined(CONFIG_NET_STATISTICS) && \
@ -597,6 +631,17 @@ void tcp_synack(FAR struct net_driver_s *dev, FAR struct tcp_conn_s *conn,
}
#endif
#ifdef CONFIG_NET_TCP_SELECTIVE_ACK
if (tcp->flags == TCP_SYN ||
((tcp->flags == (TCP_ACK | TCP_SYN)) && (conn->flags & TCP_SACK)))
{
tcp->optdata[optlen++] = TCP_OPT_NOOP;
tcp->optdata[optlen++] = TCP_OPT_NOOP;
tcp->optdata[optlen++] = TCP_OPT_SACK_PERM;
tcp->optdata[optlen++] = TCP_OPT_SACK_PERM_LEN;
}
#endif
tcp->tcpoffset = ((TCP_HDRLEN + optlen) / 4) << 4;
dev->d_len += optlen;

View File

@ -170,6 +170,80 @@ static void psock_writebuffer_notify(FAR struct tcp_conn_s *conn)
# define psock_writebuffer_notify(conn)
#endif
/****************************************************************************
 * Name: retransmit_segment
 *
 * Description:
 *   Undo the "sent" accounting of one write buffer and schedule it for
 *   retransmission.  If the buffer has reached TCP_MAXRTX retransmissions
 *   it is released instead and counted in conn->expired.
 *
 * Input Parameters:
 *   conn - The TCP connection of interest
 *   wrb  - The write buffer to retransmit (the callers in this file
 *          remove it from unacked_q before calling)
 *
 * Returned Value:
 *   None
 *
 ****************************************************************************/

static void retransmit_segment(FAR struct tcp_conn_s *conn,
                               FAR struct tcp_wrbuffer_s *wrb)
{
  uint16_t nbytes = TCP_WBSENT(wrb);

  /* Give back the bytes this buffer had contributed to the connection
   * counters, clamping at zero instead of wrapping.
   */

  conn->tx_unacked = (conn->tx_unacked > nbytes) ?
                     (conn->tx_unacked - nbytes) : 0;
  conn->sent       = (conn->sent > nbytes) ?
                     (conn->sent - nbytes) : 0;

  /* Nothing from this write buffer is considered sent any more */

  TCP_WBSENT(wrb) = 0;
  ninfo("REXMIT: wrb=%p sent=%u, "
        "conn tx_unacked=%" PRId32 " sent=%" PRId32 "\n",
        wrb, TCP_WBSENT(wrb), conn->tx_unacked, conn->sent);

  /* Account for this retransmission attempt */

  ++TCP_WBNRTX(wrb);

  if (TCP_WBNRTX(wrb) < TCP_MAXRTX)
    {
      /* Retry budget not exhausted: put the buffer back on write_q (kept
       * in sequence number order).  The actual retransmission happens
       * when the buffer with the lowest sequence number is pulled from
       * write_q again.
       */

      ninfo("REXMIT: Moving wrb=%p nrtx=%u\n",
            wrb, TCP_WBNRTX(wrb));

      psock_insert_segment(wrb, &conn->write_q);
      return;
    }

  /* The write buffer has exceeded the retry count: drop it */

  nwarn("WARNING: Expiring wrb=%p nrtx=%u\n",
        wrb, TCP_WBNRTX(wrb));

  /* Return the write buffer to the free list */

  tcp_wrbuffer_release(wrb);

  /* Notify any waiters if the write buffers have been drained. */

  psock_writebuffer_notify(conn);

  /* NOTE expired is different from un-ACKed, it is designed to represent
   * the number of segments that have been sent, retransmitted, and
   * un-ACKed, if expired is not zero, the connection will be closed.
   *
   * field expired can only be updated at TCP_ESTABLISHED state
   */

  conn->expired++;
}
/****************************************************************************
* Name: psock_lost_connection
*
@ -285,6 +359,97 @@ static inline void send_ipselect(FAR struct net_driver_s *dev,
}
#endif
/****************************************************************************
 * Name: parse_sack
 *
 * Description:
 *   Walk the TCP options of an incoming segment, extract the blocks of the
 *   first SACK option found and return them sorted by left edge.
 *
 *   The options come from the peer and are treated as untrusted: every
 *   option length is validated against the option area announced by the
 *   TCP data offset, and the number of blocks is clamped to
 *   TCP_SACK_RANGES_MAX so that 'segs' can never be overrun.
 *
 * Input Parameters:
 *   conn - The TCP connection of interest (currently unused)
 *   tcp  - Header of tcp structure
 *   segs - Array that receives the left/right edges of the SACK blocks;
 *          must have room for TCP_SACK_RANGES_MAX entries
 *
 * Returned Value:
 *   Number of SACK blocks stored in 'segs' (0 if there is no valid SACK
 *   option)
 *
 * Assumptions:
 *   The network is locked.
 *
 ****************************************************************************/

#ifdef CONFIG_NET_TCP_SELECTIVE_ACK
static int parse_sack(FAR struct tcp_conn_s *conn, FAR struct tcp_hdr_s *tcp,
                      FAR struct tcp_ofoseg_s *segs)
{
  FAR uint8_t *block;
  int optbytes;
  int optlen;
  int nsack = 0;
  uint8_t opt;
  int i;
  int j;

  /* Number of option bytes that follow the fixed 20-byte TCP header.  The
   * upper nibble of tcpoffset is the header length in 32-bit words.
   */

  optbytes = ((tcp->tcpoffset >> 4) - 5) * 4;

  for (i = 0; i < optbytes; )
    {
      opt = tcp->optdata[i];
      if (opt == TCP_OPT_END)
        {
          /* End of options. */

          break;
        }
      else if (opt == TCP_OPT_NOOP)
        {
          /* NOP option: a single byte without a length field. */

          ++i;
          continue;
        }

      /* All other options have a length field that covers the kind and
       * length bytes themselves.  Stop if the length byte is missing, is
       * too small to be valid, or points past the end of the option area:
       * the options are malformed and must not be processed further.
       */

      if (i + 1 >= optbytes)
        {
          break;
        }

      optlen = tcp->optdata[i + 1];
      if (optlen < 2 || i + optlen > optbytes)
        {
          break;
        }

      if (opt == TCP_OPT_SACK)
        {
          /* Each block is a pair of 32-bit sequence numbers that follow
           * the two-byte kind/length header.  Never accept more blocks
           * than the caller's array can hold.
           */

          nsack = (optlen - TCP_OPT_SACK_PERM_LEN) /
                  (int)(2 * sizeof(uint32_t));
          if (nsack > TCP_SACK_RANGES_MAX)
            {
              nsack = TCP_SACK_RANGES_MAX;
            }

          /* Read the edges byte-wise; the option data is not guaranteed
           * to be 32-bit aligned.
           */

          block = tcp->optdata + i + TCP_OPT_SACK_PERM_LEN;
          for (j = 0; j < nsack; j++)
            {
              segs[j].left  = tcp_getsequence(block);
              segs[j].right = tcp_getsequence(block + sizeof(uint32_t));
              block        += 2 * sizeof(uint32_t);
            }

          tcp_reorder_ofosegs(nsack, segs);
          break;
        }

      /* Skip over any option we are not interested in */

      i += optlen;
    }

  return nsack;
}
#endif /* CONFIG_NET_TCP_SELECTIVE_ACK */
/****************************************************************************
* Name: psock_send_eventhandler
*
@ -309,6 +474,10 @@ static uint16_t psock_send_eventhandler(FAR struct net_driver_s *dev,
FAR void *pvpriv, uint16_t flags)
{
FAR struct tcp_conn_s *conn = pvpriv;
#ifdef CONFIG_NET_TCP_SELECTIVE_ACK
struct tcp_ofoseg_s ofosegs[TCP_SACK_RANGES_MAX];
uint8_t nsacks = 0;
#endif
#ifdef CONFIG_NET_TCP_FAST_RETRANSMIT
uint32_t rexmitno = 0;
#endif
@ -458,7 +627,6 @@ static uint16_t psock_send_eventhandler(FAR struct net_driver_s *dev,
wrb, TCP_WBSEQNO(wrb), TCP_WBPKTLEN(wrb));
}
}
#ifdef CONFIG_NET_TCP_FAST_RETRANSMIT
else if (ackno == TCP_WBSEQNO(wrb))
{
/* Reset the duplicate ack counter */
@ -472,16 +640,33 @@ static uint16_t psock_send_eventhandler(FAR struct net_driver_s *dev,
if (++TCP_WBNACK(wrb) == TCP_FAST_RETRANSMISSION_THRESH)
{
#ifdef CONFIG_NET_TCP_SELECTIVE_ACK
if ((conn->flags & TCP_SACK) &&
(tcp->tcpoffset & 0xf0) > 0x50)
{
/* Parse s-ack from tcp options */
nsacks = parse_sack(conn, tcp, ofosegs);
flags |= TCP_REXMIT;
}
#ifdef CONFIG_NET_TCP_FAST_RETRANSMIT
else
#endif
#endif
{
#ifdef CONFIG_NET_TCP_FAST_RETRANSMIT
/* Do fast retransmit */
rexmitno = ackno;
#endif
/* Reset counter */
TCP_WBNACK(wrb) = 0;
}
}
#endif
}
}
/* A special case is the head of the write_q which may be partially
@ -613,6 +798,57 @@ static uint16_t psock_send_eventhandler(FAR struct net_driver_s *dev,
}
#endif
#ifdef CONFIG_NET_TCP_SELECTIVE_ACK
/* Check if we are being asked to retransmit s-ack data */
if (nsacks > 0)
{
FAR struct tcp_wrbuffer_s *wrb;
FAR sq_entry_t *entry;
FAR sq_entry_t *next;
uint32_t right;
int i;
/* Dump s-ack edge */
for (i = 0, right = 0; i < nsacks; i++)
{
ninfo("TCP SACK [%d]"
"[%" PRIu32 " : %" PRIu32 " : %" PRIu32 "]\n",
i, ofosegs[i].left, ofosegs[i].right,
TCP_SEQ_SUB(ofosegs[i].right, ofosegs[i].left));
}
for (entry = sq_peek(&conn->unacked_q); entry; entry = next)
{
wrb = (FAR struct tcp_wrbuffer_s *)entry;
next = sq_next(entry);
for (i = 0, right = 0; i < nsacks; i++)
{
/* Wrb seqno out of s-ack edge ? do retransmit ! */
if (TCP_SEQ_LT(TCP_WBSEQNO(wrb), ofosegs[i].left) &&
TCP_SEQ_GTE(TCP_WBSEQNO(wrb), right))
{
ninfo("TCP REXMIT "
"[%" PRIu32 " : %" PRIu32 " : %d]\n",
TCP_WBSEQNO(wrb),
TCP_SEQ_ADD(TCP_WBSEQNO(wrb), TCP_WBPKTLEN(wrb)),
TCP_WBPKTLEN(wrb));
sq_rem(entry, &conn->unacked_q);
retransmit_segment(conn, (FAR void *)entry);
break;
}
right = ofosegs[i].right;
}
}
}
else
#endif
/* Check if we are being asked to retransmit data */
if ((flags & TCP_REXMIT) != 0)
@ -706,75 +942,7 @@ static uint16_t psock_send_eventhandler(FAR struct net_driver_s *dev,
while ((entry = sq_remlast(&conn->unacked_q)) != NULL)
{
wrb = (FAR struct tcp_wrbuffer_s *)entry;
uint16_t sent;
/* Reset the number of bytes sent sent from the write buffer */
sent = TCP_WBSENT(wrb);
if (conn->tx_unacked > sent)
{
conn->tx_unacked -= sent;
}
else
{
conn->tx_unacked = 0;
}
if (conn->sent > sent)
{
conn->sent -= sent;
}
else
{
conn->sent = 0;
}
TCP_WBSENT(wrb) = 0;
ninfo("REXMIT: wrb=%p sent=%u, "
"conn tx_unacked=%" PRId32 " sent=%" PRId32 "\n",
wrb, TCP_WBSENT(wrb), conn->tx_unacked, conn->sent);
/* Free any write buffers that have exceed the retry count */
if (++TCP_WBNRTX(wrb) >= TCP_MAXRTX)
{
nwarn("WARNING: Expiring wrb=%p nrtx=%u\n",
wrb, TCP_WBNRTX(wrb));
/* Return the write buffer to the free list */
tcp_wrbuffer_release(wrb);
/* Notify any waiters if the write buffers have been
* drained.
*/
psock_writebuffer_notify(conn);
/* NOTE expired is different from un-ACKed, it is designed to
* represent the number of segments that have been sent,
* retransmitted, and un-ACKed, if expired is not zero, the
* connection will be closed.
*
* field expired can only be updated at TCP_ESTABLISHED state
*/
conn->expired++;
continue;
}
else
{
/* Insert the write buffer into the write_q (in sequence
* number order). The retransmission will occur below
* when the write buffer with the lowest sequence number
* is pulled from the write_q again.
*/
ninfo("REXMIT: Moving wrb=%p nrtx=%u\n", wrb, TCP_WBNRTX(wrb));
psock_insert_segment(wrb, &conn->write_q);
}
retransmit_segment(conn, (FAR void *)entry);
}
}