tcp: persist timer re-work (bug #50837)

This re-works the persist timer to have the following behavior:

  1) Only start persist timer when a buffered segment doesn't fit within
     the current window and there is no in-fligh data.  Previously, the
     persist timer was always started when the window went to zero even
     if there was no buffered data (since timer was managed in receive
     pathway rather than transmit pathway)
  2) Upon first fire of persist timer, fill the remaining window if
     non-zero by splitting the unsent segment.  If split segment is sent,
     persist timer is stopped, RTO timer is now ensuring reliable window
     updates
  3) If window is already zero when persist timer fires, send 1 byte probe
  4) Persist timer and zero window probe should only be active when the
     following are true:
       * no in-flight data (pcb->unacked == NULL)
       * when there is buffered data (pcb->unsent != NULL)
       * when pcb->unsent->len > pcb->snd_wnd
This commit is contained in:
Joel Cunningham 2017-08-04 12:50:47 -05:00
parent 2e4867fcde
commit f582c88339
6 changed files with 360 additions and 48 deletions

View File

@ -79,6 +79,9 @@ HISTORY
to make parameter order consistent with other ip*_route*() functions
Same also applies to LWIP_HOOK_IP4_ROUTE_SRC() parameter order.
2017-08-04: Joel Cunningham
* tcp: re-work persist timer to fully close window (details in bug #50837)
2017-07-26: Simon Goldschmidt
* snmp_msg.c: fix bug #51578 (SNMP failed to decode some values on non 32bit platforms)

View File

@ -1075,17 +1075,32 @@ tcp_slowtmr_start:
LWIP_DEBUGF(TCP_DEBUG, ("tcp_slowtmr: max DATA retries reached\n"));
} else {
if (pcb->persist_backoff > 0) {
LWIP_ASSERT("tcp_slowtimr: persist ticking with in-flight data", pcb->unacked == NULL);
LWIP_ASSERT("tcp_slowtimr: persist ticking with empty send buffer", pcb->unsent != NULL);
if (pcb->persist_probe >= TCP_MAXRTX) {
++pcb_remove; /* max probes reached */
} else {
/* If snd_wnd is zero, use persist timer to send 1 byte probes
* instead of using the standard retransmission mechanism. */
u8_t backoff_cnt = tcp_persist_backoff[pcb->persist_backoff-1];
if (pcb->persist_cnt < backoff_cnt) {
pcb->persist_cnt++;
}
if (pcb->persist_cnt >= backoff_cnt) {
if (tcp_zero_window_probe(pcb) == ERR_OK) {
int next_slot = 1; /* increment timer to next slot */
/* If snd_wnd is zero, send 1 byte probes */
if (pcb->snd_wnd == 0) {
if (tcp_zero_window_probe(pcb) != ERR_OK) {
next_slot = 0; /* try probe again with current slot */
}
/* snd_wnd not fully closed, split unsent head and fill window */
} else {
if (tcp_split_unsent_seg(pcb, (u16_t)pcb->snd_wnd) == ERR_OK) {
if (tcp_output(pcb) == ERR_OK) {
/* sending will cancel persist timer, else retry with current slot */
next_slot = 0;
}
}
}
if (next_slot) {
pcb->persist_cnt = 0;
if (pcb->persist_backoff < sizeof(tcp_persist_backoff)) {
pcb->persist_backoff++;

View File

@ -1108,17 +1108,6 @@ tcp_receive(struct tcp_pcb *pcb)
}
pcb->snd_wl1 = seqno;
pcb->snd_wl2 = ackno;
if (pcb->snd_wnd == 0) {
if (pcb->persist_backoff == 0) {
/* start persist timer */
pcb->persist_cnt = 0;
pcb->persist_backoff = 1;
pcb->persist_probe = 0;
}
} else if (pcb->persist_backoff > 0) {
/* stop persist timer */
pcb->persist_backoff = 0;
}
LWIP_DEBUGF(TCP_WND_DEBUG, ("tcp_receive: window update %"TCPWNDSIZE_F"\n", pcb->snd_wnd));
#if TCP_WND_DEBUG
} else {

View File

@ -1127,18 +1127,6 @@ tcp_output(struct tcp_pcb *pcb)
seg = pcb->unsent;
/* If the TF_ACK_NOW flag is set and no data will be sent (either
* because the ->unsent queue is empty or because the window does
* not allow it), construct an empty ACK segment and send it.
*
* If data is to be sent, we will just piggyback the ACK (see below).
*/
if ((pcb->flags & TF_ACK_NOW) &&
(seg == NULL ||
lwip_ntohl(seg->tcphdr->seqno) - pcb->lastack + seg->len > wnd)) {
return tcp_send_empty_ack(pcb);
}
if (seg == NULL) {
LWIP_DEBUGF(TCP_OUTPUT_DEBUG, ("tcp_output: nothing to send (%p)\n",
(void*)pcb->unsent));
@ -1146,6 +1134,12 @@ tcp_output(struct tcp_pcb *pcb)
", cwnd %"TCPWNDSIZE_F", wnd %"U32_F
", seg == NULL, ack %"U32_F"\n",
pcb->snd_wnd, pcb->cwnd, wnd, pcb->lastack));
/* If the TF_ACK_NOW flag is set and the ->unsent queue is empty, construct
* an empty ACK segment and send it. */
if (pcb->flags & TF_ACK_NOW) {
return tcp_send_empty_ack(pcb);
}
/* nothing to send: shortcut out of here */
goto output_done;
} else {
@ -1177,24 +1171,27 @@ tcp_output(struct tcp_pcb *pcb)
ip_addr_copy(pcb->local_ip, *local_ip);
}
/* Check if we need to start the persistent timer when the next unsent segment
* does not fit within the remaining send window and RTO timer is not running (we
* have no in-flight data). A traditional approach would fill the remaining window
* with part of the unsent segment (which will engage zero-window probing upon
* reception of the zero window update from the receiver). This ensures the
* subsequent window update is reliably received. With the goal of being lightweight,
* we avoid splitting the unsent segment and treat the window as already zero.
*/
if (lwip_ntohl(seg->tcphdr->seqno) - pcb->lastack + seg->len > wnd &&
wnd > 0 && wnd == pcb->snd_wnd && pcb->unacked == NULL) {
/* Start the persist timer */
if (pcb->persist_backoff == 0) {
/* Handle the current segment not fitting within the window */
if (lwip_ntohl(seg->tcphdr->seqno) - pcb->lastack + seg->len > wnd) {
/* We need to start the persistent timer when the next unsent segment does not fit
* within the remaining (could be 0) send window and RTO timer is not running (we
* have no in-flight data). If window is still too small after persist timer fires,
* then we split the segment. We don't consider the congestion window since a cwnd
* smaller than 1 SMSS implies in-flight data
*/
if (wnd == pcb->snd_wnd && pcb->unacked == NULL && pcb->persist_backoff == 0) {
pcb->persist_cnt = 0;
pcb->persist_backoff = 1;
pcb->persist_probe = 0;
}
/* We need an ACK, but can't send data now, so send an empty ACK */
if (pcb->flags & TF_ACK_NOW) {
return tcp_send_empty_ack(pcb);
}
goto output_done;
}
/* Stop persist timer, above conditions are not active */
pcb->persist_backoff = 0;
/* data available and window allows it to be sent? */
while (seg != NULL &&
lwip_ntohl(seg->tcphdr->seqno) - pcb->lastack + seg->len <= wnd) {
@ -1767,6 +1764,165 @@ tcp_keepalive(struct tcp_pcb *pcb)
return err;
}
/**
* Split segment on the head of the unsent queue. If return is not
* ERR_OK, existing head remains intact
*
* The split is accomplished by creating a new TCP segment and pbuf
* which holds the remainder payload after the split. The original
* pbuf is trimmed to new length. This allows splitting of read-only
* pbufs
*
* @param pcb the tcp_pcb for which to split the unsent head
* @param split the amount of payload to remain in the head
*/
err_t
tcp_split_unsent_seg(struct tcp_pcb *pcb, u16_t split)
{
struct tcp_seg *seg = NULL, *useg = NULL;
struct pbuf *p = NULL;
u8_t optlen;
u8_t optflags;
u8_t split_flags;
u8_t remainder_flags;
u16_t remainder;
u16_t offset;
#if TCP_CHECKSUM_ON_COPY
u16_t chksum = 0;
u8_t chksum_swapped = 0;
struct pbuf *q;
#endif /* TCP_CHECKSUM_ON_COPY */
useg = pcb->unsent;
if (useg == NULL) {
return ERR_MEM;
}
if (split == 0) {
LWIP_ASSERT("Can't split segment into length 0", 0);
return ERR_VAL;
}
if (useg->len <= split) {
return ERR_OK;
}
LWIP_ASSERT("split <= mss", split <= pcb->mss);
LWIP_ASSERT("useg->len > 0", useg->len > 0);
/* We should check that we don't exceed TCP_SND_QUEUELEN but we need
* to split this packet so we may actually exceed the max value by
* one!
*/
LWIP_DEBUGF(TCP_QLEN_DEBUG, ("tcp_enqueue: split_unsent_seg: %u\n", (unsigned int)pcb->snd_queuelen));
optflags = useg->flags;
#if TCP_CHECKSUM_ON_COPY
/* Remove since checksum is not stored until after tcp_create_segment() */
optflags &= ~TF_SEG_DATA_CHECKSUMMED;
#endif /* TCP_CHECKSUM_ON_COPY */
optlen = LWIP_TCP_OPT_LENGTH(optflags);
remainder = useg->len - split;
/* Create new pbuf for the remainder of the split */
p = pbuf_alloc(PBUF_TRANSPORT, remainder + optlen, PBUF_RAM);
if (p == NULL) {
LWIP_DEBUGF(TCP_OUTPUT_DEBUG | LWIP_DBG_LEVEL_SERIOUS,
("tcp_split_unsent_seg: could not allocate memory for pbuf remainder %u\n", remainder));
goto memerr;
}
/* Offset into the original pbuf is past TCP/IP headers, options, and split amount */
offset = useg->p->tot_len - useg->len + split;
/* Copy remainder into new pbuf, headers and options will not be filled out */
if (pbuf_copy_partial(useg->p, (u8_t*)p->payload + optlen, remainder, offset ) != remainder) {
LWIP_DEBUGF(TCP_OUTPUT_DEBUG | LWIP_DBG_LEVEL_SERIOUS,
("tcp_split_unsent_seg: could not copy pbuf remainder %u\n", remainder));
goto memerr;
}
#if TCP_CHECKSUM_ON_COPY
/* calculate the checksum on remainder data */
tcp_seg_add_chksum(~inet_chksum((const u8_t*)p->payload + optlen, remainder), remainder,
&chksum, &chksum_swapped);
#endif /* TCP_CHECKSUM_ON_COPY */
/* Options are created when calling tcp_output() */
/* Migrate flags from original segment */
split_flags = TCPH_FLAGS(useg->tcphdr);
remainder_flags = 0; /* ACK added in tcp_output() */
if (split_flags & TCP_PSH) {
split_flags &= ~TCP_PSH;
remainder_flags |= TCP_PSH;
}
if (split_flags & TCP_FIN) {
split_flags &= ~TCP_FIN;
remainder |= TCP_FIN;
}
/* SYN should be left on split, RST should not be present with data */
seg = tcp_create_segment(pcb, p, remainder_flags, lwip_ntohl(useg->tcphdr->seqno) + split, optflags);
if (seg == NULL) {
LWIP_DEBUGF(TCP_OUTPUT_DEBUG | LWIP_DBG_LEVEL_SERIOUS,
("tcp_split_unsent_seg: could not create new TCP segment\n"));
goto memerr;
}
#if TCP_CHECKSUM_ON_COPY
seg->chksum = chksum;
seg->chksum_swapped = chksum_swapped;
seg->flags |= TF_SEG_DATA_CHECKSUMMED;
#endif /* TCP_CHECKSUM_ON_COPY */
/* Trim the original pbuf into our split size. At this point our remainder segment must be setup
successfully because we are modifying the original segment */
pbuf_realloc(useg->p, useg->p->tot_len - remainder);
useg->len -= remainder;
TCPH_SET_FLAG(useg->tcphdr, split_flags);
#if TCP_CHECKSUM_ON_COPY
/* The checksum on the split segment is now incorrect. We need to re-run it over the split */
useg->chksum = 0;
useg->chksum_swapped = 0;
q = useg->p;
offset = q->tot_len - useg->len; /* Offset due to exposed headers */
/* Advance to the pbuf where the offset ends */
while (q != NULL && offset > q->len) {
offset -= q->len;
q = q->next;
}
LWIP_ASSERT("Found start of payload pbuf", q != NULL);
/* Checksum the first payload pbuf accounting for offset, then other pbufs are all payload */
for (; q != NULL; offset = 0, q = q->next) {
tcp_seg_add_chksum(~inet_chksum((const u8_t*)q->payload + offset, q->len - offset), q->len - offset,
&useg->chksum, &useg->chksum_swapped);
}
#endif /* TCP_CHECKSUM_ON_COPY */
/* Update number of segments on the queues. Note that length now may
* exceed TCP_SND_QUEUELEN! We don't have to touch pcb->snd_buf
* because the total amount of data is constant when packet is split */
pcb->snd_queuelen++;
/* Finally insert remainder into queue after split (which stays head) */
seg->next = useg->next;
useg->next = seg;
return ERR_OK;
memerr:
TCP_STATS_INC(tcp.memerr);
if (seg != NULL) {
tcp_segs_free(seg);
}
if (p != NULL) {
pbuf_free(p);
}
return ERR_MEM;
}
/**
* Send persist timer zero-window probes to keep a connection active
@ -1797,13 +1953,10 @@ tcp_zero_window_probe(struct tcp_pcb *pcb)
" pcb->tmr %"U32_F" pcb->keep_cnt_sent %"U16_F"\n",
tcp_ticks, pcb->tmr, (u16_t)pcb->keep_cnt_sent));
seg = pcb->unacked;
/* Only consider unsent, persist timer should be off when there data is in-flight */
seg = pcb->unsent;
if (seg == NULL) {
seg = pcb->unsent;
}
if (seg == NULL) {
/* nothing to send, zero window probe not needed */
/* Not expected, persist timer should be off when the send buffer is empty */
return ERR_OK;
}

View File

@ -468,6 +468,7 @@ void tcp_rst(const struct tcp_pcb* pcb, u32_t seqno, u32_t ackno,
u32_t tcp_next_iss(struct tcp_pcb *pcb);
err_t tcp_keepalive(struct tcp_pcb *pcb);
err_t tcp_split_unsent_seg(struct tcp_pcb *pcb, u16_t split);
err_t tcp_zero_window_probe(struct tcp_pcb *pcb);
void tcp_trigger_input_pcb_close(void);

View File

@ -687,7 +687,9 @@ static void test_tcp_tx_full_window_lost(u8_t zero_window_probe_from_unsent)
/* ensure this didn't trigger any transmission */
EXPECT(txcounters.num_tx_calls == 0);
EXPECT(txcounters.num_tx_bytes == 0);
EXPECT(pcb->persist_backoff == 1);
/* window is completely full, but persist timer is off since send buffer is empty */
EXPECT(pcb->snd_wnd == 0);
EXPECT(pcb->persist_backoff == 0);
}
/* send one byte more (out of window) -> persist timer starts */
@ -991,7 +993,8 @@ START_TEST(test_tcp_zwp_timeout)
test_tcp_input(p, &netif);
EXPECT(pcb->unacked == NULL);
EXPECT(pcb->unsent == NULL);
EXPECT(pcb->persist_backoff == 1);
/* send buffer empty, persist should be off */
EXPECT(pcb->persist_backoff == 0);
EXPECT(pcb->snd_wnd == 0);
/* send second segment, should be buffered */
@ -1000,11 +1003,12 @@ START_TEST(test_tcp_zwp_timeout)
err = tcp_output(pcb);
EXPECT(err == ERR_OK);
/* ensure it is buffered */
/* ensure it is buffered and persist timer started */
EXPECT(pcb->unacked == NULL);
check_seqnos(pcb->unsent, 1, &seqnos[1]);
EXPECT(txcounters.num_tx_calls == 0);
EXPECT(txcounters.num_tx_bytes == 0);
EXPECT(pcb->persist_backoff == 1);
/* ensure no errors have been recorded */
EXPECT(counters.err_calls == 0);
@ -1051,6 +1055,152 @@ START_TEST(test_tcp_zwp_timeout)
}
END_TEST
START_TEST(test_tcp_persist_split)
{
struct netif netif;
struct test_tcp_txcounters txcounters;
struct test_tcp_counters counters;
struct tcp_pcb *pcb;
struct pbuf* p;
err_t err;
u16_t i;
LWIP_UNUSED_ARG(_i);
/* Setup data for four segments */
for (i = 0; i < 4 * TCP_MSS; i++) {
tx_data[i] = (u8_t)i;
}
/* initialize local vars */
test_tcp_init_netif(&netif, &txcounters, &test_local_ip, &test_netmask);
memset(&counters, 0, sizeof(counters));
/* create and initialize the pcb */
tcp_ticks = SEQNO1 - ISS;
pcb = test_tcp_new_counters_pcb(&counters);
EXPECT_RET(pcb != NULL);
tcp_set_state(pcb, ESTABLISHED, &test_local_ip, &test_remote_ip, TEST_LOCAL_PORT, TEST_REMOTE_PORT);
pcb->mss = TCP_MSS;
/* set window to three segments */
pcb->cwnd = 3 * TCP_MSS;
pcb->snd_wnd = 3 * TCP_MSS;
pcb->snd_wnd_max = 3 * TCP_MSS;
/* send three segments */
err = tcp_write(pcb, &tx_data[0], 3 * TCP_MSS, TCP_WRITE_FLAG_COPY);
EXPECT(err == ERR_OK);
err = tcp_output(pcb);
EXPECT(err == ERR_OK);
/* verify segments are in-flight */
EXPECT(pcb->unsent == NULL);
EXPECT(pcb->unacked != NULL);
check_seqnos(pcb->unacked, 3, seqnos);
EXPECT(txcounters.num_tx_calls == 3);
EXPECT(txcounters.num_tx_bytes == 3 * (TCP_MSS + 40U));
memset(&txcounters, 0, sizeof(txcounters));
/* ACK the segments and update the window to only 1/2 TCP_MSS */
p = tcp_create_rx_segment_wnd(pcb, NULL, 0, 0, 3 * TCP_MSS, TCP_ACK, TCP_MSS / 2);
test_tcp_input(p, &netif);
EXPECT(pcb->unacked == NULL);
EXPECT(pcb->unsent == NULL);
EXPECT(pcb->persist_backoff == 0);
EXPECT(pcb->snd_wnd == TCP_MSS / 2);
/* send fourth segment, which is larger than snd_wnd */
err = tcp_write(pcb, &tx_data[3 * TCP_MSS], TCP_MSS, TCP_WRITE_FLAG_COPY);
EXPECT(err == ERR_OK);
err = tcp_output(pcb);
EXPECT(err == ERR_OK);
/* ensure it is buffered and persist timer started */
EXPECT(pcb->unacked == NULL);
EXPECT(pcb->unsent != NULL);
check_seqnos(pcb->unsent, 1, &seqnos[3]);
EXPECT(txcounters.num_tx_calls == 0);
EXPECT(txcounters.num_tx_bytes == 0);
EXPECT(pcb->persist_backoff == 1);
/* ensure no errors have been recorded */
EXPECT(counters.err_calls == 0);
EXPECT(counters.last_err == ERR_OK);
/* call tcp_timer some more times to let persist timer count up */
for (i = 0; i < 4; i++) {
test_tcp_tmr();
EXPECT(txcounters.num_tx_calls == 0);
EXPECT(txcounters.num_tx_bytes == 0);
}
/* this should be the first timer shot, which should split the
* segment and send a runt (of the remaining window size) */
txcounters.copy_tx_packets = 1;
test_tcp_tmr();
txcounters.copy_tx_packets = 0;
/* persist will be disabled as RTO timer takes over */
EXPECT(pcb->persist_backoff == 0);
EXPECT(txcounters.num_tx_calls == 1);
EXPECT(txcounters.num_tx_bytes == ((TCP_MSS /2) + 40U));
/* verify half segment sent, half still buffered */
EXPECT(pcb->unsent != NULL);
EXPECT(pcb->unsent->len == TCP_MSS / 2);
EXPECT(pcb->unacked != NULL);
EXPECT(pcb->unacked->len == TCP_MSS / 2);
/* verify first half segment */
EXPECT(txcounters.tx_packets != NULL);
if (txcounters.tx_packets != NULL) {
u8_t sent[TCP_MSS / 2];
u16_t ret;
ret = pbuf_copy_partial(txcounters.tx_packets, &sent, TCP_MSS / 2, 40U);
EXPECT(ret == TCP_MSS / 2);
EXPECT(memcmp(sent, &tx_data[3 * TCP_MSS], TCP_MSS / 2) == 0);
}
if (txcounters.tx_packets != NULL) {
pbuf_free(txcounters.tx_packets);
txcounters.tx_packets = NULL;
}
memset(&txcounters, 0, sizeof(txcounters));
/* ACK the half segment, leave window at half segment */
p = tcp_create_rx_segment_wnd(pcb, NULL, 0, 0, TCP_MSS / 2, TCP_ACK, TCP_MSS / 2);
txcounters.copy_tx_packets = 1;
test_tcp_input(p, &netif);
txcounters.copy_tx_packets = 0;
/* ensure remaining half segment was sent */
EXPECT(txcounters.num_tx_calls == 1);
EXPECT(txcounters.num_tx_bytes == ((TCP_MSS /2 ) + 40U));
EXPECT(pcb->unsent == NULL);
EXPECT(pcb->unacked != NULL);
EXPECT(pcb->unacked->len == TCP_MSS / 2);
EXPECT(pcb->snd_wnd == TCP_MSS / 2);
/* verify second half segment */
EXPECT(txcounters.tx_packets != NULL);
if (txcounters.tx_packets != NULL) {
u8_t sent[TCP_MSS / 2];
u16_t ret;
ret = pbuf_copy_partial(txcounters.tx_packets, &sent, TCP_MSS / 2, 40U);
EXPECT(ret == TCP_MSS / 2);
EXPECT(memcmp(sent, &tx_data[(3 * TCP_MSS) + TCP_MSS / 2], TCP_MSS / 2) == 0);
}
if (txcounters.tx_packets != NULL) {
pbuf_free(txcounters.tx_packets);
txcounters.tx_packets = NULL;
}
/* ensure no errors have been recorded */
EXPECT(counters.err_calls == 0);
EXPECT(counters.last_err == ERR_OK);
/* make sure the pcb is freed */
EXPECT_RET(MEMP_STATS_GET(used, MEMP_TCP_PCB) == 1);
tcp_abort(pcb);
EXPECT_RET(MEMP_STATS_GET(used, MEMP_TCP_PCB) == 0);
}
END_TEST
/** Create the suite including all tests for this module */
Suite *
tcp_suite(void)
@ -1067,7 +1217,8 @@ tcp_suite(void)
TESTFUNC(test_tcp_tx_full_window_lost_from_unsent),
TESTFUNC(test_tcp_rto_tracking),
TESTFUNC(test_tcp_rto_timeout),
TESTFUNC(test_tcp_zwp_timeout)
TESTFUNC(test_tcp_zwp_timeout),
TESTFUNC(test_tcp_persist_split)
};
return create_suite("TCP", tests, sizeof(tests)/sizeof(testfunc), tcp_setup, tcp_teardown);
}