diff --git a/CHANGELOG b/CHANGELOG index 29ec34bc..e968fd19 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -79,6 +79,9 @@ HISTORY to make parameter order consistent with other ip*_route*() functions Same also applies to LWIP_HOOK_IP4_ROUTE_SRC() parameter order. + 2017-08-04: Joel Cunningham + * tcp: re-work persist timer to fully close window (details in bug #50837) + 2017-07-26: Simon Goldschmidt * snmp_msg.c: fix bug #51578 (SNMP failed to decode some values on non 32bit platforms) diff --git a/src/core/tcp.c b/src/core/tcp.c index b8490a51..87c03453 100644 --- a/src/core/tcp.c +++ b/src/core/tcp.c @@ -1075,17 +1075,32 @@ tcp_slowtmr_start: LWIP_DEBUGF(TCP_DEBUG, ("tcp_slowtmr: max DATA retries reached\n")); } else { if (pcb->persist_backoff > 0) { + LWIP_ASSERT("tcp_slowtimr: persist ticking with in-flight data", pcb->unacked == NULL); + LWIP_ASSERT("tcp_slowtimr: persist ticking with empty send buffer", pcb->unsent != NULL); if (pcb->persist_probe >= TCP_MAXRTX) { ++pcb_remove; /* max probes reached */ } else { - /* If snd_wnd is zero, use persist timer to send 1 byte probes - * instead of using the standard retransmission mechanism. */ u8_t backoff_cnt = tcp_persist_backoff[pcb->persist_backoff-1]; if (pcb->persist_cnt < backoff_cnt) { pcb->persist_cnt++; } if (pcb->persist_cnt >= backoff_cnt) { - if (tcp_zero_window_probe(pcb) == ERR_OK) { + int next_slot = 1; /* increment timer to next slot */ + /* If snd_wnd is zero, send 1 byte probes */ + if (pcb->snd_wnd == 0) { + if (tcp_zero_window_probe(pcb) != ERR_OK) { + next_slot = 0; /* try probe again with current slot */ + } + /* snd_wnd not fully closed, split unsent head and fill window */ + } else { + if (tcp_split_unsent_seg(pcb, (u16_t)pcb->snd_wnd) == ERR_OK) { + if (tcp_output(pcb) == ERR_OK) { + /* sending will cancel persist timer, else retry with current slot */ + next_slot = 0; + } + } + } + if (next_slot) { pcb->persist_cnt = 0; if (pcb->persist_backoff < sizeof(tcp_persist_backoff)) { pcb->persist_backoff++; diff --git a/src/core/tcp_in.c b/src/core/tcp_in.c index 11fb1519..53c61b14 100644 --- a/src/core/tcp_in.c +++ b/src/core/tcp_in.c @@ -1108,17 +1108,6 @@ tcp_receive(struct tcp_pcb *pcb) } pcb->snd_wl1 = seqno; pcb->snd_wl2 = ackno; - if (pcb->snd_wnd == 0) { - if (pcb->persist_backoff == 0) { - /* start persist timer */ - pcb->persist_cnt = 0; - pcb->persist_backoff = 1; - pcb->persist_probe = 0; - } - } else if (pcb->persist_backoff > 0) { - /* stop persist timer */ - pcb->persist_backoff = 0; - } LWIP_DEBUGF(TCP_WND_DEBUG, ("tcp_receive: window update %"TCPWNDSIZE_F"\n", pcb->snd_wnd)); #if TCP_WND_DEBUG } else { diff --git a/src/core/tcp_out.c b/src/core/tcp_out.c index 63b0ee34..2f1fab74 100644 --- a/src/core/tcp_out.c +++ b/src/core/tcp_out.c @@ -1127,18 +1127,6 @@ tcp_output(struct tcp_pcb *pcb) seg = pcb->unsent; - /* If the TF_ACK_NOW flag is set and no data will be sent (either - * because the ->unsent queue is empty or because the window does - * not allow it), construct an empty ACK segment and send it. - * - * If data is to be sent, we will just piggyback the ACK (see below). - */ - if ((pcb->flags & TF_ACK_NOW) && - (seg == NULL || - lwip_ntohl(seg->tcphdr->seqno) - pcb->lastack + seg->len > wnd)) { - return tcp_send_empty_ack(pcb); - } - if (seg == NULL) { LWIP_DEBUGF(TCP_OUTPUT_DEBUG, ("tcp_output: nothing to send (%p)\n", (void*)pcb->unsent)); @@ -1146,6 +1134,12 @@ tcp_output(struct tcp_pcb *pcb) ", cwnd %"TCPWNDSIZE_F", wnd %"U32_F ", seg == NULL, ack %"U32_F"\n", pcb->snd_wnd, pcb->cwnd, wnd, pcb->lastack)); + + /* If the TF_ACK_NOW flag is set and the ->unsent queue is empty, construct + * an empty ACK segment and send it. */ + if (pcb->flags & TF_ACK_NOW) { + return tcp_send_empty_ack(pcb); + } /* nothing to send: shortcut out of here */ goto output_done; } else { @@ -1177,24 +1171,27 @@ tcp_output(struct tcp_pcb *pcb) ip_addr_copy(pcb->local_ip, *local_ip); } - /* Check if we need to start the persistent timer when the next unsent segment - * does not fit within the remaining send window and RTO timer is not running (we - * have no in-flight data). A traditional approach would fill the remaining window - * with part of the unsent segment (which will engage zero-window probing upon - * reception of the zero window update from the receiver). This ensures the - * subsequent window update is reliably received. With the goal of being lightweight, - * we avoid splitting the unsent segment and treat the window as already zero. - */ - if (lwip_ntohl(seg->tcphdr->seqno) - pcb->lastack + seg->len > wnd && - wnd > 0 && wnd == pcb->snd_wnd && pcb->unacked == NULL) { - /* Start the persist timer */ - if (pcb->persist_backoff == 0) { + /* Handle the current segment not fitting within the window */ + if (lwip_ntohl(seg->tcphdr->seqno) - pcb->lastack + seg->len > wnd) { + /* We need to start the persistent timer when the next unsent segment does not fit + * within the remaining (could be 0) send window and RTO timer is not running (we + * have no in-flight data). If window is still too small after persist timer fires, + * then we split the segment. We don't consider the congestion window since a cwnd + * smaller than 1 SMSS implies in-flight data + */ + if (wnd == pcb->snd_wnd && pcb->unacked == NULL && pcb->persist_backoff == 0) { pcb->persist_cnt = 0; pcb->persist_backoff = 1; pcb->persist_probe = 0; } + /* We need an ACK, but can't send data now, so send an empty ACK */ + if (pcb->flags & TF_ACK_NOW) { + return tcp_send_empty_ack(pcb); + } goto output_done; } + /* Stop persist timer, above conditions are not active */ + pcb->persist_backoff = 0; /* data available and window allows it to be sent? */ while (seg != NULL && lwip_ntohl(seg->tcphdr->seqno) - pcb->lastack + seg->len <= wnd) { @@ -1767,6 +1764,165 @@ tcp_keepalive(struct tcp_pcb *pcb) return err; } +/** + * Split segment on the head of the unsent queue. If return is not + * ERR_OK, existing head remains intact + * + * The split is accomplished by creating a new TCP segment and pbuf + * which holds the remainder payload after the split. The original + * pbuf is trimmed to new length. This allows splitting of read-only + * pbufs + * + * @param pcb the tcp_pcb for which to split the unsent head + * @param split the amount of payload to remain in the head + */ +err_t +tcp_split_unsent_seg(struct tcp_pcb *pcb, u16_t split) +{ + struct tcp_seg *seg = NULL, *useg = NULL; + struct pbuf *p = NULL; + u8_t optlen; + u8_t optflags; + u8_t split_flags; + u8_t remainder_flags; + u16_t remainder; + u16_t offset; +#if TCP_CHECKSUM_ON_COPY + u16_t chksum = 0; + u8_t chksum_swapped = 0; + struct pbuf *q; +#endif /* TCP_CHECKSUM_ON_COPY */ + + useg = pcb->unsent; + if (useg == NULL) { + return ERR_MEM; + } + + if (split == 0) { + LWIP_ASSERT("Can't split segment into length 0", 0); + return ERR_VAL; + } + + if (useg->len <= split) { + return ERR_OK; + } + + LWIP_ASSERT("split <= mss", split <= pcb->mss); + LWIP_ASSERT("useg->len > 0", useg->len > 0); + + /* We should check that we don't exceed TCP_SND_QUEUELEN but we need + * to split this packet so we may actually exceed the max value by + * one! + */ + LWIP_DEBUGF(TCP_QLEN_DEBUG, ("tcp_enqueue: split_unsent_seg: %u\n", (unsigned int)pcb->snd_queuelen)); + + optflags = useg->flags; +#if TCP_CHECKSUM_ON_COPY + /* Remove since checksum is not stored until after tcp_create_segment() */ + optflags &= ~TF_SEG_DATA_CHECKSUMMED; +#endif /* TCP_CHECKSUM_ON_COPY */ + optlen = LWIP_TCP_OPT_LENGTH(optflags); + remainder = useg->len - split; + + /* Create new pbuf for the remainder of the split */ + p = pbuf_alloc(PBUF_TRANSPORT, remainder + optlen, PBUF_RAM); + if (p == NULL) { + LWIP_DEBUGF(TCP_OUTPUT_DEBUG | LWIP_DBG_LEVEL_SERIOUS, + ("tcp_split_unsent_seg: could not allocate memory for pbuf remainder %u\n", remainder)); + goto memerr; + } + + /* Offset into the original pbuf is past TCP/IP headers, options, and split amount */ + offset = useg->p->tot_len - useg->len + split; + /* Copy remainder into new pbuf, headers and options will not be filled out */ + if (pbuf_copy_partial(useg->p, (u8_t*)p->payload + optlen, remainder, offset ) != remainder) { + LWIP_DEBUGF(TCP_OUTPUT_DEBUG | LWIP_DBG_LEVEL_SERIOUS, + ("tcp_split_unsent_seg: could not copy pbuf remainder %u\n", remainder)); + goto memerr; + } +#if TCP_CHECKSUM_ON_COPY + /* calculate the checksum on remainder data */ + tcp_seg_add_chksum(~inet_chksum((const u8_t*)p->payload + optlen, remainder), remainder, + &chksum, &chksum_swapped); +#endif /* TCP_CHECKSUM_ON_COPY */ + + /* Options are created when calling tcp_output() */ + + /* Migrate flags from original segment */ + split_flags = TCPH_FLAGS(useg->tcphdr); + remainder_flags = 0; /* ACK added in tcp_output() */ + + if (split_flags & TCP_PSH) { + split_flags &= ~TCP_PSH; + remainder_flags |= TCP_PSH; + } + if (split_flags & TCP_FIN) { + split_flags &= ~TCP_FIN; + remainder |= TCP_FIN; + } + /* SYN should be left on split, RST should not be present with data */ + + seg = tcp_create_segment(pcb, p, remainder_flags, lwip_ntohl(useg->tcphdr->seqno) + split, optflags); + if (seg == NULL) { + LWIP_DEBUGF(TCP_OUTPUT_DEBUG | LWIP_DBG_LEVEL_SERIOUS, + ("tcp_split_unsent_seg: could not create new TCP segment\n")); + goto memerr; + } + +#if TCP_CHECKSUM_ON_COPY + seg->chksum = chksum; + seg->chksum_swapped = chksum_swapped; + seg->flags |= TF_SEG_DATA_CHECKSUMMED; +#endif /* TCP_CHECKSUM_ON_COPY */ + + /* Trim the original pbuf into our split size. At this point our remainder segment must be setup + successfully because we are modifying the original segment */ + pbuf_realloc(useg->p, useg->p->tot_len - remainder); + useg->len -= remainder; + TCPH_SET_FLAG(useg->tcphdr, split_flags); + +#if TCP_CHECKSUM_ON_COPY + /* The checksum on the split segment is now incorrect. We need to re-run it over the split */ + useg->chksum = 0; + useg->chksum_swapped = 0; + q = useg->p; + offset = q->tot_len - useg->len; /* Offset due to exposed headers */ + + /* Advance to the pbuf where the offset ends */ + while (q != NULL && offset > q->len) { + offset -= q->len; + q = q->next; + } + LWIP_ASSERT("Found start of payload pbuf", q != NULL); + /* Checksum the first payload pbuf accounting for offset, then other pbufs are all payload */ + for (; q != NULL; offset = 0, q = q->next) { + tcp_seg_add_chksum(~inet_chksum((const u8_t*)q->payload + offset, q->len - offset), q->len - offset, + &useg->chksum, &useg->chksum_swapped); + } +#endif /* TCP_CHECKSUM_ON_COPY */ + + /* Update number of segments on the queues. Note that length now may + * exceed TCP_SND_QUEUELEN! We don't have to touch pcb->snd_buf + * because the total amount of data is constant when packet is split */ + pcb->snd_queuelen++; + + /* Finally insert remainder into queue after split (which stays head) */ + seg->next = useg->next; + useg->next = seg; + + return ERR_OK; +memerr: + TCP_STATS_INC(tcp.memerr); + + if (seg != NULL) { + tcp_segs_free(seg); + } + if (p != NULL) { + pbuf_free(p); + } + + return ERR_MEM; +} /** * Send persist timer zero-window probes to keep a connection active @@ -1797,13 +1953,10 @@ tcp_zero_window_probe(struct tcp_pcb *pcb) " pcb->tmr %"U32_F" pcb->keep_cnt_sent %"U16_F"\n", tcp_ticks, pcb->tmr, (u16_t)pcb->keep_cnt_sent)); - seg = pcb->unacked; - + /* Only consider unsent, persist timer should be off when there data is in-flight */ + seg = pcb->unsent; if (seg == NULL) { - seg = pcb->unsent; - } - if (seg == NULL) { - /* nothing to send, zero window probe not needed */ + /* Not expected, persist timer should be off when the send buffer is empty */ return ERR_OK; } diff --git a/src/include/lwip/priv/tcp_priv.h b/src/include/lwip/priv/tcp_priv.h index a4319326..9987657a 100644 --- a/src/include/lwip/priv/tcp_priv.h +++ b/src/include/lwip/priv/tcp_priv.h @@ -468,6 +468,7 @@ void tcp_rst(const struct tcp_pcb* pcb, u32_t seqno, u32_t ackno, u32_t tcp_next_iss(struct tcp_pcb *pcb); err_t tcp_keepalive(struct tcp_pcb *pcb); +err_t tcp_split_unsent_seg(struct tcp_pcb *pcb, u16_t split); err_t tcp_zero_window_probe(struct tcp_pcb *pcb); void tcp_trigger_input_pcb_close(void); diff --git a/test/unit/tcp/test_tcp.c b/test/unit/tcp/test_tcp.c index cac925e6..49e7e347 100644 --- a/test/unit/tcp/test_tcp.c +++ b/test/unit/tcp/test_tcp.c @@ -687,7 +687,9 @@ static void test_tcp_tx_full_window_lost(u8_t zero_window_probe_from_unsent) /* ensure this didn't trigger any transmission */ EXPECT(txcounters.num_tx_calls == 0); EXPECT(txcounters.num_tx_bytes == 0); - EXPECT(pcb->persist_backoff == 1); + /* window is completely full, but persist timer is off since send buffer is empty */ + EXPECT(pcb->snd_wnd == 0); + EXPECT(pcb->persist_backoff == 0); } /* send one byte more (out of window) -> persist timer starts */ @@ -991,7 +993,8 @@ START_TEST(test_tcp_zwp_timeout) test_tcp_input(p, &netif); EXPECT(pcb->unacked == NULL); EXPECT(pcb->unsent == NULL); - EXPECT(pcb->persist_backoff == 1); + /* send buffer empty, persist should be off */ + EXPECT(pcb->persist_backoff == 0); EXPECT(pcb->snd_wnd == 0); /* send second segment, should be buffered */ @@ -1000,11 +1003,12 @@ START_TEST(test_tcp_zwp_timeout) err = tcp_output(pcb); EXPECT(err == ERR_OK); - /* ensure it is buffered */ + /* ensure it is buffered and persist timer started */ EXPECT(pcb->unacked == NULL); check_seqnos(pcb->unsent, 1, &seqnos[1]); EXPECT(txcounters.num_tx_calls == 0); EXPECT(txcounters.num_tx_bytes == 0); + EXPECT(pcb->persist_backoff == 1); /* ensure no errors have been recorded */ EXPECT(counters.err_calls == 0); @@ -1051,6 +1055,152 @@ START_TEST(test_tcp_zwp_timeout) } END_TEST +START_TEST(test_tcp_persist_split) +{ + struct netif netif; + struct test_tcp_txcounters txcounters; + struct test_tcp_counters counters; + struct tcp_pcb *pcb; + struct pbuf* p; + err_t err; + u16_t i; + LWIP_UNUSED_ARG(_i); + + /* Setup data for four segments */ + for (i = 0; i < 4 * TCP_MSS; i++) { + tx_data[i] = (u8_t)i; + } + + /* initialize local vars */ + test_tcp_init_netif(&netif, &txcounters, &test_local_ip, &test_netmask); + memset(&counters, 0, sizeof(counters)); + + /* create and initialize the pcb */ + tcp_ticks = SEQNO1 - ISS; + pcb = test_tcp_new_counters_pcb(&counters); + EXPECT_RET(pcb != NULL); + tcp_set_state(pcb, ESTABLISHED, &test_local_ip, &test_remote_ip, TEST_LOCAL_PORT, TEST_REMOTE_PORT); + pcb->mss = TCP_MSS; + /* set window to three segments */ + pcb->cwnd = 3 * TCP_MSS; + pcb->snd_wnd = 3 * TCP_MSS; + pcb->snd_wnd_max = 3 * TCP_MSS; + + /* send three segments */ + err = tcp_write(pcb, &tx_data[0], 3 * TCP_MSS, TCP_WRITE_FLAG_COPY); + EXPECT(err == ERR_OK); + err = tcp_output(pcb); + EXPECT(err == ERR_OK); + + /* verify segments are in-flight */ + EXPECT(pcb->unsent == NULL); + EXPECT(pcb->unacked != NULL); + check_seqnos(pcb->unacked, 3, seqnos); + EXPECT(txcounters.num_tx_calls == 3); + EXPECT(txcounters.num_tx_bytes == 3 * (TCP_MSS + 40U)); + memset(&txcounters, 0, sizeof(txcounters)); + + /* ACK the segments and update the window to only 1/2 TCP_MSS */ + p = tcp_create_rx_segment_wnd(pcb, NULL, 0, 0, 3 * TCP_MSS, TCP_ACK, TCP_MSS / 2); + test_tcp_input(p, &netif); + EXPECT(pcb->unacked == NULL); + EXPECT(pcb->unsent == NULL); + EXPECT(pcb->persist_backoff == 0); + EXPECT(pcb->snd_wnd == TCP_MSS / 2); + + /* send fourth segment, which is larger than snd_wnd */ + err = tcp_write(pcb, &tx_data[3 * TCP_MSS], TCP_MSS, TCP_WRITE_FLAG_COPY); + EXPECT(err == ERR_OK); + err = tcp_output(pcb); + EXPECT(err == ERR_OK); + + /* ensure it is buffered and persist timer started */ + EXPECT(pcb->unacked == NULL); + EXPECT(pcb->unsent != NULL); + check_seqnos(pcb->unsent, 1, &seqnos[3]); + EXPECT(txcounters.num_tx_calls == 0); + EXPECT(txcounters.num_tx_bytes == 0); + EXPECT(pcb->persist_backoff == 1); + + /* ensure no errors have been recorded */ + EXPECT(counters.err_calls == 0); + EXPECT(counters.last_err == ERR_OK); + + /* call tcp_timer some more times to let persist timer count up */ + for (i = 0; i < 4; i++) { + test_tcp_tmr(); + EXPECT(txcounters.num_tx_calls == 0); + EXPECT(txcounters.num_tx_bytes == 0); + } + + /* this should be the first timer shot, which should split the + * segment and send a runt (of the remaining window size) */ + txcounters.copy_tx_packets = 1; + test_tcp_tmr(); + txcounters.copy_tx_packets = 0; + /* persist will be disabled as RTO timer takes over */ + EXPECT(pcb->persist_backoff == 0); + EXPECT(txcounters.num_tx_calls == 1); + EXPECT(txcounters.num_tx_bytes == ((TCP_MSS /2) + 40U)); + /* verify half segment sent, half still buffered */ + EXPECT(pcb->unsent != NULL); + EXPECT(pcb->unsent->len == TCP_MSS / 2); + EXPECT(pcb->unacked != NULL); + EXPECT(pcb->unacked->len == TCP_MSS / 2); + + /* verify first half segment */ + EXPECT(txcounters.tx_packets != NULL); + if (txcounters.tx_packets != NULL) { + u8_t sent[TCP_MSS / 2]; + u16_t ret; + ret = pbuf_copy_partial(txcounters.tx_packets, &sent, TCP_MSS / 2, 40U); + EXPECT(ret == TCP_MSS / 2); + EXPECT(memcmp(sent, &tx_data[3 * TCP_MSS], TCP_MSS / 2) == 0); + } + if (txcounters.tx_packets != NULL) { + pbuf_free(txcounters.tx_packets); + txcounters.tx_packets = NULL; + } + memset(&txcounters, 0, sizeof(txcounters)); + + /* ACK the half segment, leave window at half segment */ + p = tcp_create_rx_segment_wnd(pcb, NULL, 0, 0, TCP_MSS / 2, TCP_ACK, TCP_MSS / 2); + txcounters.copy_tx_packets = 1; + test_tcp_input(p, &netif); + txcounters.copy_tx_packets = 0; + /* ensure remaining half segment was sent */ + EXPECT(txcounters.num_tx_calls == 1); + EXPECT(txcounters.num_tx_bytes == ((TCP_MSS /2 ) + 40U)); + EXPECT(pcb->unsent == NULL); + EXPECT(pcb->unacked != NULL); + EXPECT(pcb->unacked->len == TCP_MSS / 2); + EXPECT(pcb->snd_wnd == TCP_MSS / 2); + + /* verify second half segment */ + EXPECT(txcounters.tx_packets != NULL); + if (txcounters.tx_packets != NULL) { + u8_t sent[TCP_MSS / 2]; + u16_t ret; + ret = pbuf_copy_partial(txcounters.tx_packets, &sent, TCP_MSS / 2, 40U); + EXPECT(ret == TCP_MSS / 2); + EXPECT(memcmp(sent, &tx_data[(3 * TCP_MSS) + TCP_MSS / 2], TCP_MSS / 2) == 0); + } + if (txcounters.tx_packets != NULL) { + pbuf_free(txcounters.tx_packets); + txcounters.tx_packets = NULL; + } + + /* ensure no errors have been recorded */ + EXPECT(counters.err_calls == 0); + EXPECT(counters.last_err == ERR_OK); + + /* make sure the pcb is freed */ + EXPECT_RET(MEMP_STATS_GET(used, MEMP_TCP_PCB) == 1); + tcp_abort(pcb); + EXPECT_RET(MEMP_STATS_GET(used, MEMP_TCP_PCB) == 0); +} +END_TEST + /** Create the suite including all tests for this module */ Suite * tcp_suite(void) @@ -1067,7 +1217,8 @@ tcp_suite(void) TESTFUNC(test_tcp_tx_full_window_lost_from_unsent), TESTFUNC(test_tcp_rto_tracking), TESTFUNC(test_tcp_rto_timeout), - TESTFUNC(test_tcp_zwp_timeout) + TESTFUNC(test_tcp_zwp_timeout), + TESTFUNC(test_tcp_persist_split) }; return create_suite("TCP", tests, sizeof(tests)/sizeof(testfunc), tcp_setup, tcp_teardown); }