From b6542b977e73ef1fe2bd3b9aa0230b18412658d9 Mon Sep 17 00:00:00 2001 From: goldsimon Date: Fri, 5 Mar 2010 11:14:31 +0000 Subject: [PATCH] task #7040 (Work on tcp_enqueue): Don't waste memory when chaining segments, added option TCP_OVERSIZE to prevent creating many small pbufs when calling tcp_write with many small blocks of data. Instead, pbufs are allocated larger than needed and the space is used for later calls to tcp_write. --- CHANGELOG | 8 + src/core/tcp.c | 16 +- src/core/tcp_in.c | 12 +- src/core/tcp_out.c | 757 +++++++++++++++++++++++------------- src/include/lwip/opt.h | 18 + src/include/lwip/tcp.h | 10 +- src/include/lwip/tcp_impl.h | 7 +- 7 files changed, 538 insertions(+), 290 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 2e4c3be0..6d6f6b07 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -13,6 +13,14 @@ HISTORY ++ New features: + 2010-03-05: Jakob Ole Stoklundsen / Simon Goldschmidt + * opt.h, tcp.h, tcp_impl.h, tcp.c, tcp_in.c, tcp_out.c: task #7040: + Work on tcp_enqueue: Don't waste memory when chaining segments, + added option TCP_OVERSIZE to prevent creating many small pbufs when + calling tcp_write with many small blocks of data. Instead, pbufs are + allocated larger than needed and the space is used for later calls to + tcp_write. + 2010-02-21: Simon Goldschmidt * stats.c/.h: Added const char* name to mem- and memp-stats for easier debugging. diff --git a/src/core/tcp.c b/src/core/tcp.c index f86c704f..0fc70262 100644 --- a/src/core/tcp.c +++ b/src/core/tcp.c @@ -170,21 +170,21 @@ tcp_close_shutdown(struct tcp_pcb *pcb, u8_t rst_on_unacked_data) snmp_inc_tcpattemptfails(); break; case SYN_RCVD: - err = tcp_send_ctrl(pcb, TCP_FIN); + err = tcp_send_fin(pcb); if (err == ERR_OK) { snmp_inc_tcpattemptfails(); pcb->state = FIN_WAIT_1; } break; case ESTABLISHED: - err = tcp_send_ctrl(pcb, TCP_FIN); + err = tcp_send_fin(pcb); if (err == ERR_OK) { snmp_inc_tcpestabresets(); pcb->state = FIN_WAIT_1; } break; case CLOSE_WAIT: - err = tcp_send_ctrl(pcb, TCP_FIN); + err = tcp_send_fin(pcb); if (err == ERR_OK) { snmp_inc_tcpestabresets(); pcb->state = LAST_ACK; @@ -659,12 +659,7 @@ tcp_connect(struct tcp_pcb *pcb, ip_addr_t *ipaddr, u16_t port, #endif /* LWIP_CALLBACK_API */ /* Send a SYN together with the MSS option. */ - ret = tcp_enqueue(pcb, NULL, 0, TCP_SYN, 0, TF_SEG_OPTS_MSS -#if LWIP_TCP_TIMESTAMPS - /* and maybe include the TIMESTAMP option */ - | (pcb->flags & TF_TIMESTAMP ? TF_SEG_OPTS_TS : 0) -#endif /* LWIP_TCP_TIMESTAMPS */ - ); + ret = tcp_enqueue_flags(pcb, TCP_SYN); if (ret == ERR_OK) { /* SYN segment was enqueued, changed the pcbs state now */ pcb->state = SYN_SENT; @@ -1417,8 +1412,7 @@ tcp_eff_send_mss(u16_t sendmss, ip_addr_t *addr) mss_s = outif->mtu - IP_HLEN - TCP_HLEN; /* RFC 1122, chap 4.2.2.6: * Eff.snd.MSS = min(SendMSS+20, MMS_S) - TCPhdrsize - IPoptionsize - * We correct for TCP options in tcp_enqueue(), and don't support - * IP options + * We correct for TCP options in tcp_write(), and don't support IP options. */ sendmss = LWIP_MIN(sendmss, mss_s); } diff --git a/src/core/tcp_in.c b/src/core/tcp_in.c index 478cb8f7..fcba86f2 100644 --- a/src/core/tcp_in.c +++ b/src/core/tcp_in.c @@ -467,12 +467,7 @@ tcp_listen_input(struct tcp_pcb_listen *pcb) snmp_inc_tcppassiveopens(); /* Send a SYN|ACK together with the MSS option. */ - rc = tcp_enqueue(npcb, NULL, 0, TCP_SYN | TCP_ACK, 0, TF_SEG_OPTS_MSS -#if LWIP_TCP_TIMESTAMPS - /* and maybe include the TIMESTAMP option */ - | (npcb->flags & TF_TIMESTAMP ? TF_SEG_OPTS_TS : 0) -#endif /* LWIP_TCP_TIMESTAMPS */ - ); + rc = tcp_enqueue_flags(npcb, TCP_SYN | TCP_ACK); if (rc != ERR_OK) { tcp_abandon(npcb, 0); return rc; @@ -780,7 +775,7 @@ tcp_oos_insert_segment(struct tcp_seg *cseg, struct tcp_seg *next) (next->tcphdr->seqno + next->len))) { /* cseg with FIN already processed */ if (TCPH_FLAGS(next->tcphdr) & TCP_FIN) { - TCPH_FLAGS_SET(cseg->tcphdr, TCPH_FLAGS(cseg->tcphdr) | TCP_FIN); + TCPH_SET_FLAG(cseg->tcphdr, TCP_FIN); } old_seg = next; next = next->next; @@ -1209,8 +1204,7 @@ tcp_receive(struct tcp_pcb *pcb) /* inseg cannot have FIN here (already processed above) */ if (TCPH_FLAGS(next->tcphdr) & TCP_FIN && (TCPH_FLAGS(inseg.tcphdr) & TCP_SYN) == 0) { - TCPH_FLAGS_SET(inseg.tcphdr, - TCPH_FLAGS(inseg.tcphdr) | TCP_FIN); + TCPH_SET_FLAG(inseg.tcphdr, TCP_FIN); tcplen = TCP_TCPLEN(&inseg); } prev = next; diff --git a/src/core/tcp_out.c b/src/core/tcp_out.c index 33dd36d3..25ec95b2 100644 --- a/src/core/tcp_out.c +++ b/src/core/tcp_out.c @@ -80,19 +80,137 @@ tcp_output_set_header(struct tcp_pcb *pcb, struct pbuf *p, u16_t optlen, } /** - * Called by tcp_close() to send a segment including flags but not data. + * Called by tcp_close() to send a segment including FIN flag but not data. * * @param pcb the tcp_pcb over which to send a segment - * @param flags the flags to set in the segment header * @return ERR_OK if sent, another err_t otherwise */ err_t -tcp_send_ctrl(struct tcp_pcb *pcb, u8_t flags) +tcp_send_fin(struct tcp_pcb *pcb) { + /* first, try to add the fin to the last unsent segment */ + if (pcb->unsent != NULL) { + struct tcp_seg *last_unsent; + for (last_unsent = pcb->unsent; last_unsent->next != NULL; + last_unsent = last_unsent->next); + + if ((TCPH_FLAGS(last_unsent->tcphdr) & (TCP_SYN | TCP_FIN | TCP_RST)) == 0) { + /* no SYN/FIN/RST flag in the header, we can add the FIN flag */ + TCPH_SET_FLAG(last_unsent->tcphdr, TCP_FIN); + return ERR_OK; + } + } /* no data, no length, flags, copy=1, no optdata */ - return tcp_enqueue(pcb, NULL, 0, flags, TCP_WRITE_FLAG_COPY, 0); + return tcp_enqueue_flags(pcb, TCP_FIN); } +/** + * Create a TCP segment with prefilled header. + * + * Called by tcp_write and tcp_enqueue_flags. + * + * @param pcb Protocol control block for the TCP connection. + * @param p pbuf that is used to hold the TCP header. + * @param flags TCP flags for header. + * @param seqno TCP sequence number of this packet + * @param optflags options to include in TCP header + * @return a new tcp_seg pointing to p, or NULL. + * The TCP header is filled in except ackno and wnd. + * p is freed on failure. + */ +static struct tcp_seg * +tcp_create_segment(struct tcp_pcb *pcb, struct pbuf *p, u8_t flags, u32_t seqno, u8_t optflags) +{ + struct tcp_seg *seg; + u8_t optlen = LWIP_TCP_OPT_LENGTH(optflags); + + if ((seg = memp_malloc(MEMP_TCP_SEG)) == NULL) { + LWIP_DEBUGF(TCP_OUTPUT_DEBUG | 2, ("tcp_create_segment: no memory.\n")); + pbuf_free(p); + return NULL; + } + seg->flags = optflags; + seg->next = NULL; + seg->p = p; + seg->dataptr = p->payload; + seg->len = p->tot_len - optlen; + + /* build TCP header */ + if (pbuf_header(p, TCP_HLEN)) { + LWIP_DEBUGF(TCP_OUTPUT_DEBUG | 2, ("tcp_create_segment: no room for TCP header in pbuf.\n")); + TCP_STATS_INC(tcp.err); + tcp_seg_free(seg); + return NULL; + } + seg->tcphdr = seg->p->payload; + seg->tcphdr->src = htons(pcb->local_port); + seg->tcphdr->dest = htons(pcb->remote_port); + seg->tcphdr->seqno = htonl(seqno); + /* ackno is set in tcp_output */ + TCPH_HDRLEN_FLAGS_SET(seg->tcphdr, (5 + optlen / 4), flags); + /* wnd and chksum are set in tcp_output */ + seg->tcphdr->urgp = 0; + return seg; +} + +/** + * Allocate a PBUF_RAM pbuf, perhaps with extra space at the end. + * + * This function is like pbuf_alloc(layer, length, PBUF_RAM) except + * there may be extra bytes available at the end. + * + * @param layer flag to define header size. + * @param length size of the pbuf's payload. + * @param max_length maximum usable size of payload+oversize. + * @param oversize pointer to a u16_t that will receive the number of usable tail bytes. + * @param pcb The TCP connection that willo enqueue the pbuf. + * @param apiflags API flags given to tcp_write. + * @param first_seg true when this pbuf will be used in the first enqueued segment. + * @param + */ +#if TCP_OVERSIZE +static struct pbuf * +tcp_pbuf_prealloc(pbuf_layer layer, u16_t length, u16_t max_length, + u16_t *oversize, struct tcp_pcb *pcb, u8_t apiflags, + u8_t first_seg) +{ + struct pbuf *p; + u16_t alloc = length; + + if (length < max_length) { + /* Should we allocate an oversized pbuf, or just the minimum + * length required? If tcp_write is going to be called again + * before this segment is transmitted, we want the oversized + * buffer. If the segment will be transmitted immediately, we can + * save memory by allocating only length. We use a simple + * heuristic based on the following information: + * + * Did the user set TCP_WRITE_FLAG_MORE? + * + * Will the Nagle algorithm defer transmission of this segment? + */ + if ((apiflags & TCP_WRITE_FLAG_MORE) || + (!(pcb->flags & TF_NODELAY) && + (!first_seg || + pcb->unsent != NULL || + pcb->unacked != NULL))) { + alloc = LWIP_MIN(max_length, LWIP_MEM_ALIGN_SIZE(length + TCP_OVERSIZE)); + } + } + p = pbuf_alloc(layer, alloc, PBUF_RAM); + if (p == NULL) { + return NULL; + } + LWIP_ASSERT("need unchained pbuf", p->next == NULL); + *oversize = p->len - length; + /* trim p->len to the currently used size */ + p->len = p->tot_len = length; + return p; +} +#else /* TCP_OVERSIZE */ +#define tcp_pbuf_prealloc(layer, length, mx, os, pcb, api, fst) pbuf_alloc((layer), (length), PBUF_RAM) +#endif /* TCP_OVERSIZE */ + /** * Write data for sending (but does not send it immediately). * @@ -100,323 +218,321 @@ tcp_send_ctrl(struct tcp_pcb *pcb, u8_t flags) * it can send them more efficiently by combining them together). * To prompt the system to send data now, call tcp_output() after * calling tcp_write(). - * - * @param pcb Protocol control block of the TCP connection to enqueue data for. - * @param data pointer to the data to send - * @param len length (in bytes) of the data to send - * @param apiflags combination of following flags : - * - TCP_WRITE_FLAG_COPY (0x01) data will be copied into memory belonging to the stack - * - TCP_WRITE_FLAG_MORE (0x02) for TCP connection, PSH flag will be set on last segment sent, - * @return ERR_OK if enqueued, another err_t on error - * - * @see tcp_write() - */ -err_t -tcp_write(struct tcp_pcb *pcb, const void *data, u16_t len, u8_t apiflags) -{ - LWIP_DEBUGF(TCP_OUTPUT_DEBUG, ("tcp_write(pcb=%p, data=%p, len=%"U16_F", apiflags=%"U16_F")\n", (void *)pcb, - data, len, (u16_t)apiflags)); - /* connection is in valid state for data transmission? */ - if (pcb->state == ESTABLISHED || - pcb->state == CLOSE_WAIT || - pcb->state == SYN_SENT || - pcb->state == SYN_RCVD) { - if (len > 0) { -#if LWIP_TCP_TIMESTAMPS - return tcp_enqueue(pcb, (void *)data, len, 0, apiflags, - pcb->flags & TF_TIMESTAMP ? TF_SEG_OPTS_TS : 0); -#else - return tcp_enqueue(pcb, (void *)data, len, 0, apiflags, 0); -#endif - } - return ERR_OK; - } else { - LWIP_DEBUGF(TCP_OUTPUT_DEBUG | LWIP_DBG_STATE | LWIP_DBG_LEVEL_SEVERE, ("tcp_write() called in invalid state\n")); - return ERR_CONN; - } -} - -/** - * Enqueue data and/or TCP options for transmission - * - * Called by tcp_connect(), tcp_listen_input(), tcp_send_ctrl() and tcp_write(). * * @param pcb Protocol control block for the TCP connection to enqueue data for. * @param arg Pointer to the data to be enqueued for sending. * @param len Data length in bytes - * @param flags tcp header flags to set in the outgoing segment * @param apiflags combination of following flags : * - TCP_WRITE_FLAG_COPY (0x01) data will be copied into memory belonging to the stack * - TCP_WRITE_FLAG_MORE (0x02) for TCP connection, PSH flag will be set on last segment sent, - * @param optflags options to include in segment later on (see definition of struct tcp_seg) + * @return ERR_OK if enqueued, another err_t on error */ err_t -tcp_enqueue(struct tcp_pcb *pcb, void *arg, u16_t len, - u8_t flags, u8_t apiflags, u8_t optflags) +tcp_write(struct tcp_pcb *pcb, const void *arg, u16_t len, u8_t apiflags) { - struct pbuf *p; - struct tcp_seg *seg, *useg, *queue; - u32_t seqno; - u16_t left, seglen; - void *ptr; + struct pbuf *concat_p = NULL; + struct tcp_seg *last_unsent = NULL, *seg = NULL, *prev_seg = NULL, *queue = NULL; + u16_t pos = 0; /* position in 'arg' data */ u16_t queuelen; - u8_t optlen; + u8_t optlen = 0; + u8_t optflags = 0; +#if TCP_OVERSIZE + u16_t oversize = 0; + u16_t oversize_used = 0; +#endif /* TCP_OVERSIZE */ - LWIP_DEBUGF(TCP_OUTPUT_DEBUG, - ("tcp_enqueue(pcb=%p, arg=%p, len=%"U16_F", flags=%"X16_F", apiflags=%"U16_F")\n", - (void *)pcb, arg, len, (u16_t)flags, (u16_t)apiflags)); - LWIP_ERROR("tcp_enqueue: packet needs payload, options, or SYN/FIN (programmer violates API)", - ((len != 0) || (optflags != 0) || ((flags & (TCP_SYN | TCP_FIN)) != 0)), - return ERR_ARG;); - LWIP_ERROR("tcp_enqueue: len != 0 || arg == NULL (programmer violates API)", - ((len != 0) || (arg == NULL)), return ERR_ARG;); + LWIP_DEBUGF(TCP_OUTPUT_DEBUG, ("tcp_write(pcb=%p, data=%p, len=%"U16_F", apiflags=%"U16_F")\n", + (void *)pcb, arg, len, (u16_t)apiflags)); + /* connection is in invalid state for data transmission? */ + if ((pcb->state != ESTABLISHED) && + (pcb->state != CLOSE_WAIT) && + (pcb->state != SYN_SENT) && + (pcb->state != SYN_RCVD)) { + LWIP_DEBUGF(TCP_OUTPUT_DEBUG | LWIP_DBG_STATE | LWIP_DBG_LEVEL_SEVERE, ("tcp_write() called in invalid state\n")); + return ERR_CONN; + } else if (len == 0) { + return ERR_OK; + } + + LWIP_DEBUGF(TCP_OUTPUT_DEBUG, ("tcp_write(pcb=%p, arg=%p, len=%"U16_F", apiflags=%"U16_F")\n", + (void *)pcb, arg, len, (u16_t)apiflags)); + LWIP_ERROR("tcp_write: arg == NULL (programmer violates API)", + arg != NULL, return ERR_ARG;); /* fail on too much data */ if (len > pcb->snd_buf) { - LWIP_DEBUGF(TCP_OUTPUT_DEBUG | LWIP_DBG_LEVEL_WARNING, - ("tcp_enqueue: too much data (len=%"U16_F" > snd_buf=%"U16_F")\n", len, pcb->snd_buf)); + LWIP_DEBUGF(TCP_OUTPUT_DEBUG | 3, ("tcp_write: too much data (len=%"U16_F" > snd_buf=%"U16_F")\n", + len, pcb->snd_buf)); pcb->flags |= TF_NAGLEMEMERR; return ERR_MEM; } - left = len; - ptr = arg; - optlen = LWIP_TCP_OPT_LENGTH(optflags); - - /* seqno will be the sequence number of the first segment enqueued - * by the call to this function. */ - seqno = pcb->snd_lbb; - - LWIP_DEBUGF(TCP_QLEN_DEBUG, ("tcp_enqueue: queuelen: %"U16_F"\n", (u16_t)pcb->snd_queuelen)); + LWIP_DEBUGF(TCP_QLEN_DEBUG, ("tcp_write: queuelen: %"U16_F"\n", (u16_t)pcb->snd_queuelen)); /* If total number of pbufs on the unsent/unacked queues exceeds the * configured maximum, return an error */ queuelen = pcb->snd_queuelen; /* check for configured max queuelen and possible overflow */ if ((queuelen >= TCP_SND_QUEUELEN) || (queuelen > TCP_SNDQUEUELEN_OVERFLOW)) { - LWIP_DEBUGF(TCP_OUTPUT_DEBUG | LWIP_DBG_LEVEL_WARNING, - ("tcp_enqueue: too long queue %"U16_F" (max %"U16_F")\n", queuelen, TCP_SND_QUEUELEN)); + LWIP_DEBUGF(TCP_OUTPUT_DEBUG | 3, ("tcp_write: too long queue %"U16_F" (max %"U16_F")\n", + queuelen, TCP_SND_QUEUELEN)); TCP_STATS_INC(tcp.memerr); pcb->flags |= TF_NAGLEMEMERR; return ERR_MEM; } if (queuelen != 0) { - LWIP_ASSERT("tcp_enqueue: pbufs on queue => at least one queue non-empty", + LWIP_ASSERT("tcp_write: pbufs on queue => at least one queue non-empty", pcb->unacked != NULL || pcb->unsent != NULL); } else { - LWIP_ASSERT("tcp_enqueue: no pbufs on queue => both queues empty", + LWIP_ASSERT("tcp_write: no pbufs on queue => both queues empty", pcb->unacked == NULL && pcb->unsent == NULL); } - /* First, break up the data into segments and tuck them together in - * the local "queue" variable. */ - useg = queue = seg = NULL; - seglen = 0; - while (queue == NULL || left > 0) { - /* The segment length (including options) should be at most the MSS */ - seglen = left > (pcb->mss - optlen) ? (pcb->mss - optlen) : left; +#if LWIP_TCP_TIMESTAMPS + if ((pcb->flags & TF_TIMESTAMP)) { + optflags = TF_SEG_OPTS_TS; + optlen = LWIP_TCP_OPT_LENGTH(TF_SEG_OPTS_TS); + } +#endif /* LWIP_TCP_TIMESTAMPS */ - /* Allocate memory for tcp_seg, and fill in fields. */ - seg = (struct tcp_seg *)memp_malloc(MEMP_TCP_SEG); - if (seg == NULL) { - LWIP_DEBUGF(TCP_OUTPUT_DEBUG | LWIP_DBG_LEVEL_SERIOUS, - ("tcp_enqueue: could not allocate memory for tcp_seg\n")); + + /* + * TCP segmentation is done in three phases with increasing complexity: + * + * 1. Copy data directly into an oversized pbuf. + * 2. Chain a new pbuf to the end of pcb->unsent. + * 3. Create new segments. + * + * We may run out of memory at any point. In that case we must + * return ERR_MEM and not change anything in pcb. Therefore, all + * changes are recorded in local variables and committed at the end + * of the function. Some pcb fields are maintained in local copies: + * + * queuelen = pcb->snd_queuelen + * oversize = pcb->unsent_oversize + * + * These variables are set consistently by the phases: + * + * seg points to the last segment tampered with. + * + * pos records progress as data is segmented. + */ + + /* Find the tail of the unsent queue. */ + if (pcb->unsent != NULL) { + u16_t space; + + /* @todo: this could be sped up by keeping last_unsent in the pcb */ + for (last_unsent = pcb->unsent; last_unsent->next != NULL; + last_unsent = last_unsent->next); + + /* Usable space at the end of the last unsent segment */ + space = pcb->mss - last_unsent->len; + + /* + * Phase 1: Copy data directly into an oversized pbuf. + * + * The number of bytes copied is recorded in the oversize_used + * variable. The actual copying is done at the bottom of the + * function. + */ +#if TCP_OVERSIZE + oversize = pcb->unsent_oversize; + if (oversize > 0) { + seg = last_unsent; + oversize_used = oversize < len ? oversize : len; + LWIP_ASSERT("inconsistent oversize vs. space", oversize_used <= space); + pos += oversize_used; + oversize -= oversize_used; + space -= oversize_used; + } + /* now we are either finished or oversize is zero */ + LWIP_ASSERT("inconsistend oversize vs. len", (oversize == 0) || (pos == len)); +#endif /* TCP_OVERSIZE */ + + /* + * Phase 2: Chain a new pbuf to the end of pcb->unsent. + * + * We don't extend segments containing SYN/FIN flags or options + * (len==0). The new pbuf is kept in concat_p and pbuf_cat'ed at + * the end. + */ + if ((pos < len) && (space > 0) && (last_unsent->len > 0)) { + u16_t seglen = space < len - pos ? space : len - pos; + seg = last_unsent; + + /* Create a pbuf with a copy or reference to seglen bytes. We + * can use PBUF_RAW here since the data appears in the middle of + * a segment. A header will never be prepended. */ + if (apiflags & TCP_WRITE_FLAG_COPY) { + /* Data is copied */ + if ((concat_p = tcp_pbuf_prealloc(PBUF_RAW, seglen, space, &oversize, pcb, apiflags, 1)) == NULL) { + LWIP_DEBUGF(TCP_OUTPUT_DEBUG | 2, + ("tcp_write : could not allocate memory for pbuf copy size %"U16_F"\n", + seglen)); + goto memerr; + } + MEMCPY(concat_p->payload, (u8_t*)arg + pos, seglen); + } else { + /* Data is not copied */ + if ((concat_p = pbuf_alloc(PBUF_RAW, seglen, PBUF_ROM)) == NULL) { + LWIP_DEBUGF(TCP_OUTPUT_DEBUG | 2, + ("tcp_write: could not allocate memory for zero-copy pbuf\n")); + goto memerr; + } + concat_p->payload = (u8_t*)arg + pos; + } + + pos += seglen; + queuelen += pbuf_clen(concat_p); + } + } + + /* + * Phase 3: Create new segments. + * + * The new segments are chained together in the local 'queue' + * variable, ready to be appended to pcb->unsent. + */ + while (pos < len) { + struct pbuf *p; + u16_t left = len - pos; + u16_t max_len = pcb->mss - optlen; + u16_t seglen = left > max_len ? max_len : left; + + if (apiflags & TCP_WRITE_FLAG_COPY) { + /* If copy is set, memory should be allocated and data copied + * into pbuf */ + if ((p = tcp_pbuf_prealloc(PBUF_TRANSPORT, seglen + optlen, pcb->mss, &oversize, pcb, apiflags, queue == NULL)) == NULL) { + LWIP_DEBUGF(TCP_OUTPUT_DEBUG | 2, ("tcp_write : could not allocate memory for pbuf copy size %"U16_F"\n", seglen)); + goto memerr; + } + LWIP_ASSERT("tcp_write: check that first pbuf can hold the complete seglen", + (p->len >= seglen)); + MEMCPY((char *)p->payload + optlen, (u8_t*)arg + pos, seglen); + } else { + /* Copy is not set: First allocate a pbuf for holding the data. + * Since the referenced data is available at least until it is + * sent out on the link (as it has to be ACKed by the remote + * party) we can safely use PBUF_ROM instead of PBUF_REF here. + */ + struct pbuf *p2; + if ((p2 = pbuf_alloc(PBUF_TRANSPORT, seglen, PBUF_ROM)) == NULL) { + LWIP_DEBUGF(TCP_OUTPUT_DEBUG | 2, ("tcp_write: could not allocate memory for zero-copy pbuf\n")); + goto memerr; + } + /* reference the non-volatile payload data */ + p2->payload = (u8_t*)arg + pos; + + /* Second, allocate a pbuf for the headers. */ + if ((p = pbuf_alloc(PBUF_TRANSPORT, optlen, PBUF_RAM)) == NULL) { + /* If allocation fails, we have to deallocate the data pbuf as + * well. */ + pbuf_free(p2); + LWIP_DEBUGF(TCP_OUTPUT_DEBUG | 2, ("tcp_write: could not allocate memory for header pbuf\n")); + goto memerr; + } + /* Concatenate the headers and data pbufs together. */ + pbuf_cat(p/*header*/, p2/*data*/); + } + + queuelen += pbuf_clen(p); + + /* Now that there are more segments queued, we check again if the + * length of the queue exceeds the configured maximum or + * overflows. */ + if ((queuelen > TCP_SND_QUEUELEN) || (queuelen > TCP_SNDQUEUELEN_OVERFLOW)) { + LWIP_DEBUGF(TCP_OUTPUT_DEBUG | 2, ("tcp_write: queue too long %"U16_F" (%"U16_F")\n", queuelen, TCP_SND_QUEUELEN)); goto memerr; } - seg->next = NULL; - seg->p = NULL; + + if ((seg = tcp_create_segment(pcb, p, 0, pcb->snd_lbb + pos, optflags)) == NULL) { + goto memerr; + } + /* Fix dataptr for the nocopy case */ + if ((apiflags & TCP_WRITE_FLAG_COPY) == 0) { + seg->dataptr = (u8_t*)arg + pos; + } /* first segment of to-be-queued data? */ if (queue == NULL) { queue = seg; - } - /* subsequent segments of to-be-queued data */ - else { + } else { /* Attach the segment to the end of the queued segments */ - LWIP_ASSERT("useg != NULL", useg != NULL); - useg->next = seg; + LWIP_ASSERT("prev_seg != NULL", prev_seg != NULL); + prev_seg->next = seg; } /* remember last segment of to-be-queued data for next iteration */ - useg = seg; + prev_seg = seg; - /* If copy is set, memory should be allocated - * and data copied into pbuf, otherwise data comes from - * ROM or other static memory, and need not be copied. */ - if (apiflags & TCP_WRITE_FLAG_COPY) { - if ((seg->p = pbuf_alloc(PBUF_TRANSPORT, seglen + optlen, PBUF_RAM)) == NULL) { - LWIP_DEBUGF(TCP_OUTPUT_DEBUG | LWIP_DBG_LEVEL_SERIOUS, - ("tcp_enqueue : could not allocate memory for pbuf copy size %"U16_F"\n", seglen)); - goto memerr; - } - LWIP_ASSERT("check that first pbuf can hold the complete seglen", - (seg->p->len >= seglen + optlen)); - queuelen += pbuf_clen(seg->p); - if (arg != NULL) { - MEMCPY((char *)seg->p->payload + optlen, ptr, seglen); - } - seg->dataptr = seg->p->payload; - } - /* do not copy data */ - else { - /* First, allocate a pbuf for the headers. */ - if ((seg->p = pbuf_alloc(PBUF_TRANSPORT, optlen, PBUF_RAM)) == NULL) { - LWIP_DEBUGF(TCP_OUTPUT_DEBUG | LWIP_DBG_LEVEL_SERIOUS, - ("tcp_enqueue: could not allocate memory for header pbuf\n")); - goto memerr; - } - queuelen += pbuf_clen(seg->p); - - /* Second, allocate a pbuf for holding the data. - * since the referenced data is available at least until it is sent out on the - * link (as it has to be ACKed by the remote party) we can safely use PBUF_ROM - * instead of PBUF_REF here. - */ - if (left > 0) { - if ((p = pbuf_alloc(PBUF_RAW, seglen, PBUF_ROM)) == NULL) { - /* If allocation fails, we have to deallocate the header pbuf as well. */ - pbuf_free(seg->p); - seg->p = NULL; - LWIP_DEBUGF(TCP_OUTPUT_DEBUG | LWIP_DBG_LEVEL_SERIOUS, - ("tcp_enqueue: could not allocate memory for zero-copy pbuf\n")); - goto memerr; - } - ++queuelen; - /* reference the non-volatile payload data */ - p->payload = ptr; - seg->dataptr = ptr; - - /* Concatenate the headers and data pbufs together. */ - pbuf_cat(seg->p/*header*/, p/*data*/); - p = NULL; - } - } - - /* Now that there are more segments queued, we check again if the - length of the queue exceeds the configured maximum or overflows. */ - if ((queuelen > TCP_SND_QUEUELEN) || (queuelen > TCP_SNDQUEUELEN_OVERFLOW)) { - LWIP_DEBUGF(TCP_OUTPUT_DEBUG | LWIP_DBG_LEVEL_SERIOUS, - ("tcp_enqueue: queue too long %"U16_F" (%"U16_F")\n", queuelen, TCP_SND_QUEUELEN)); - goto memerr; - } - - seg->len = seglen; - - /* build TCP header */ - if (pbuf_header(seg->p, TCP_HLEN)) { - LWIP_DEBUGF(TCP_OUTPUT_DEBUG | LWIP_DBG_LEVEL_SERIOUS, ("tcp_enqueue: no room for TCP header in pbuf.\n")); - TCP_STATS_INC(tcp.err); - goto memerr; - } - seg->tcphdr = (struct tcp_hdr *)seg->p->payload; - seg->tcphdr->src = htons(pcb->local_port); - seg->tcphdr->dest = htons(pcb->remote_port); - seg->tcphdr->seqno = htonl(seqno); - seg->tcphdr->urgp = 0; - TCPH_FLAGS_SET(seg->tcphdr, flags); - /* don't fill in tcphdr->ackno and tcphdr->wnd until later */ - - seg->flags = optflags; - - /* Set the length of the header */ - TCPH_HDRLEN_SET(seg->tcphdr, (5 + optlen / 4)); - LWIP_DEBUGF(TCP_OUTPUT_DEBUG | LWIP_DBG_TRACE, ("tcp_enqueue: queueing %"U32_F":%"U32_F" (0x%"X16_F")\n", + LWIP_DEBUGF(TCP_OUTPUT_DEBUG | LWIP_DBG_TRACE, ("tcp_write: queueing %"U32_F":%"U32_F"\n", ntohl(seg->tcphdr->seqno), - ntohl(seg->tcphdr->seqno) + TCP_TCPLEN(seg), - (u16_t)flags)); + ntohl(seg->tcphdr->seqno) + TCP_TCPLEN(seg))); - left -= seglen; - seqno += seglen; - ptr = (void *)((u8_t *)ptr + seglen); + pos += seglen; } - /* Now that the data to be enqueued has been broken up into TCP - segments in the queue variable, we add them to the end of the - pcb->unsent queue. */ - if (pcb->unsent == NULL) { - useg = NULL; - } - else { - for (useg = pcb->unsent; useg->next != NULL; useg = useg->next); - } - /* { useg is last segment on the unsent queue, NULL if list is empty } */ + /* + * All three segmentation phases were successful. We can commit the + * transaction. + */ - /* If there is room in the last pbuf on the unsent queue, - chain the first pbuf on the queue together with that. */ - if (useg != NULL && - TCP_TCPLEN(useg) != 0 && - !(TCPH_FLAGS(useg->tcphdr) & (TCP_SYN | TCP_FIN)) && - (!(flags & (TCP_SYN | TCP_FIN)) || (flags == TCP_FIN)) && - /* fit within max seg size */ - (useg->len + queue->len <= pcb->mss) && - /* only concatenate segments with the same options */ - (useg->flags == queue->flags) && - /* segments are consecutive */ - (ntohl(useg->tcphdr->seqno) + useg->len == ntohl(queue->tcphdr->seqno)) ) { - /* Remove TCP header from first segment of our to-be-queued list */ - if(pbuf_header(queue->p, -(TCP_HLEN + optlen))) { - /* Can we cope with this failing? Just assert for now */ - LWIP_ASSERT("pbuf_header failed\n", 0); - TCP_STATS_INC(tcp.err); - goto memerr; - } - if (queue->p->len == 0) { - /* free the first (header-only) pbuf if it is now empty (contained only headers) */ - struct pbuf *old_q = queue->p; - queue->p = queue->p->next; - old_q->next = NULL; - queuelen--; - pbuf_free(old_q); - } - if (flags & TCP_FIN) { - /* the new segment contains only FIN, no data -> put the FIN into the last segment */ - LWIP_ASSERT("FIN enqueued together with data", queue->p == NULL && queue->len == 0); - TCPH_SET_FLAG(useg->tcphdr, TCP_FIN); - } else { - LWIP_ASSERT("zero-length pbuf", (queue->p != NULL) && (queue->p->len > 0)); - pbuf_cat(useg->p, queue->p); - useg->len += queue->len; - useg->next = queue->next; + /* + * Phase 1: If data has been added to the preallocated tail of + * last_unsent, we update the length fields of the pbuf chain. + */ +#if TCP_OVERSIZE + if (oversize_used > 0) { + struct pbuf *p; + /* Bump tot_len of whole chain, len of tail */ + for (p = last_unsent->p; p; p = p->next) { + p->tot_len += oversize_used; + if (p->next == NULL) { + MEMCPY((u8_t *)p->payload + p->len, arg, oversize_used); + p->len += oversize_used; + } } + last_unsent->len += oversize_used; + } + pcb->unsent_oversize = oversize; +#endif /* TCP_OVERSIZE */ - LWIP_DEBUGF(TCP_OUTPUT_DEBUG | LWIP_DBG_TRACE | LWIP_DBG_STATE, ("tcp_enqueue: chaining segments, new len %"U16_F"\n", useg->len)); - if (seg == queue) { - seg = useg; - seglen = useg->len; - } - memp_free(MEMP_TCP_SEG, queue); + /* + * Phase 2: concat_p can be concatenated onto last_unsent->p + */ + if (concat_p != NULL) { + LWIP_ASSERT("tcp_write: cannot concatenate when pcb->unsent is empty", + (last_unsent != NULL)); + pbuf_cat(last_unsent->p, concat_p); + last_unsent->len += concat_p->tot_len; } - else { - /* empty list */ - if (useg == NULL) { - /* initialize list with this segment */ - pcb->unsent = queue; - } - /* enqueue segment */ - else { - useg->next = queue; - } - } - if ((flags & TCP_SYN) || (flags & TCP_FIN)) { - ++len; - } - if (flags & TCP_FIN) { - pcb->flags |= TF_FIN; + + /* + * Phase 3: Append queue to pcb->unsent. Queue may be NULL, but that + * is harmless + */ + if (last_unsent == NULL) { + pcb->unsent = queue; + } else { + last_unsent->next = queue; } + + /* + * Finally update the pcb state. + */ pcb->snd_lbb += len; - pcb->snd_buf -= len; - - /* update number of segments on the queues */ pcb->snd_queuelen = queuelen; - LWIP_DEBUGF(TCP_QLEN_DEBUG, ("tcp_enqueue: %"S16_F" (after enqueued)\n", pcb->snd_queuelen)); + + LWIP_DEBUGF(TCP_QLEN_DEBUG, ("tcp_write: %"S16_F" (after enqueued)\n", + pcb->snd_queuelen)); if (pcb->snd_queuelen != 0) { - LWIP_ASSERT("tcp_enqueue: valid queue length", - pcb->unacked != NULL || pcb->unsent != NULL); + LWIP_ASSERT("tcp_write: valid queue length", + pcb->unacked != NULL || pcb->unsent != NULL); } - /* Set the PSH flag in the last segment that we enqueued, but only - if the segment has data (indicated by seglen > 0). */ - if (seg != NULL && seglen > 0 && seg->tcphdr != NULL && ((apiflags & TCP_WRITE_FLAG_MORE)==0)) { + /* Set the PSH flag in the last segment that we enqueued. */ + if (seg != NULL && seg->tcphdr != NULL && ((apiflags & TCP_WRITE_FLAG_MORE)==0)) { TCPH_SET_FLAG(seg->tcphdr, TCP_PSH); } @@ -425,17 +541,129 @@ memerr: pcb->flags |= TF_NAGLEMEMERR; TCP_STATS_INC(tcp.memerr); + if (concat_p != NULL) { + pbuf_free(concat_p); + } if (queue != NULL) { tcp_segs_free(queue); } if (pcb->snd_queuelen != 0) { - LWIP_ASSERT("tcp_enqueue: valid queue length", pcb->unacked != NULL || + LWIP_ASSERT("tcp_write: valid queue length", pcb->unacked != NULL || pcb->unsent != NULL); } - LWIP_DEBUGF(TCP_QLEN_DEBUG | LWIP_DBG_STATE, ("tcp_enqueue: %"S16_F" (with mem err)\n", pcb->snd_queuelen)); + LWIP_DEBUGF(TCP_QLEN_DEBUG | LWIP_DBG_STATE, ("tcp_write: %"S16_F" (with mem err)\n", pcb->snd_queuelen)); return ERR_MEM; } +/** + * Enqueue TCP options for transmission. + * + * Called by tcp_connect(), tcp_listen_input(), and tcp_send_ctrl(). + * + * @param pcb Protocol control block for the TCP connection. + * @param flags TCP header flags to set in the outgoing segment. + * @param optdata pointer to TCP options, or NULL. + * @param optlen length of TCP options in bytes. + */ +err_t +tcp_enqueue_flags(struct tcp_pcb *pcb, u8_t flags) +{ + struct pbuf *p; + struct tcp_seg *seg; + u8_t optflags = 0; + u8_t optlen = 0; + + LWIP_DEBUGF(TCP_QLEN_DEBUG, ("tcp_enqueue_flags: queuelen: %"U16_F"\n", (u16_t)pcb->snd_queuelen)); + + LWIP_ASSERT("tcp_enqueue_flags: need either TCP_SYN or TCP_FIN in flags (programmer violates API)", + (flags & (TCP_SYN | TCP_FIN)) != 0); + + /* check for configured max queuelen and possible overflow */ + if ((pcb->snd_queuelen >= TCP_SND_QUEUELEN) || (pcb->snd_queuelen > TCP_SNDQUEUELEN_OVERFLOW)) { + LWIP_DEBUGF(TCP_OUTPUT_DEBUG | 3, ("tcp_enqueue_flags: too long queue %"U16_F" (max %"U16_F")\n", + pcb->snd_queuelen, TCP_SND_QUEUELEN)); + TCP_STATS_INC(tcp.memerr); + pcb->flags |= TF_NAGLEMEMERR; + return ERR_MEM; + } + + if (flags & TCP_SYN) { + optflags = TF_SEG_OPTS_MSS; + } +#if LWIP_TCP_TIMESTAMPS + if ((pcb->flags & TF_TIMESTAMP)) { + optflags |= TF_SEG_OPTS_TS; + } +#endif /* LWIP_TCP_TIMESTAMPS */ + optlen = LWIP_TCP_OPT_LENGTH(optflags); + + /* tcp_enqueue_flags is always called with either SYN or FIN in flags. + * We need one available snd_buf byte to do that. + * This means we can't send FIN while snd_buf==0. A better fix would be to + * not include SYN and FIN sequence numbers in the snd_buf count. */ + if (pcb->snd_buf == 0) { + LWIP_DEBUGF(TCP_OUTPUT_DEBUG | 3, ("tcp_enqueue_flags: no send buffer available\n")); + TCP_STATS_INC(tcp.memerr); + return ERR_MEM; + } + + /* Allocate pbuf with room for TCP header + options */ + if ((p = pbuf_alloc(PBUF_TRANSPORT, optlen, PBUF_RAM)) == NULL) { + pcb->flags |= TF_NAGLEMEMERR; + TCP_STATS_INC(tcp.memerr); + return ERR_MEM; + } + LWIP_ASSERT("tcp_enqueue_flags: check that first pbuf can hold optlen", + (p->len >= optlen)); + + /* Allocate memory for tcp_seg, and fill in fields. */ + if ((seg = tcp_create_segment(pcb, p, flags, pcb->snd_lbb, optflags)) == NULL) { + pcb->flags |= TF_NAGLEMEMERR; + TCP_STATS_INC(tcp.memerr); + return ERR_MEM; + } + LWIP_ASSERT("tcp_enqueue_flags: invalid segment length", seg->len == 0); + + LWIP_DEBUGF(TCP_OUTPUT_DEBUG | LWIP_DBG_TRACE, + ("tcp_enqueue_flags: queueing %"U32_F":%"U32_F" (0x%"X16_F")\n", + ntohl(seg->tcphdr->seqno), + ntohl(seg->tcphdr->seqno) + TCP_TCPLEN(seg), + (u16_t)flags)); + + /* Now append seg to pcb->unsent queue */ + if (pcb->unsent == NULL) { + pcb->unsent = seg; + } else { + struct tcp_seg *useg; + for (useg = pcb->unsent; useg->next != NULL; useg = useg->next); + useg->next = seg; + } +#if TCP_OVERSIZE + /* The new unsent tail has no space */ + pcb->unsent_oversize = 0; +#endif /* TCP_OVERSIZE */ + + /* SYN and FIN bump the sequence number */ + if ((flags & TCP_SYN) || (flags & TCP_FIN)) { + pcb->snd_lbb++; + /* optlen does not influence snd_buf */ + pcb->snd_buf--; + } + if (flags & TCP_FIN) { + pcb->flags |= TF_FIN; + } + + /* update number of segments on the queues */ + pcb->snd_queuelen += pbuf_clen(seg->p); + LWIP_DEBUGF(TCP_QLEN_DEBUG, ("tcp_enqueue_flags: %"S16_F" (after enqueued)\n", pcb->snd_queuelen)); + if (pcb->snd_queuelen != 0) { + LWIP_ASSERT("tcp_enqueue_flags: invalid queue length", + pcb->unacked != NULL || pcb->unsent != NULL); + } + + return ERR_OK; +} + #if LWIP_TCP_TIMESTAMPS /* Build a timestamp option (12 bytes long) at the specified options pointer) @@ -580,10 +808,10 @@ tcp_output(struct tcp_pcb *pcb) (TCPH_FLAGS(seg->tcphdr) & TCP_RST) == 0); /* Stop sending if the nagle algorithm would prevent it * Don't stop: - * - if tcp_enqueue had a memory error before (prevent delayed ACK timeout) or + * - if tcp_write had a memory error before (prevent delayed ACK timeout) or * - if FIN was already enqueued for this PCB (SYN is always alone in a segment - * either seg->next != NULL or pcb->unacked == NULL; - * RST is no sent using tcp_enqueue/tcp_output. + * RST is no sent using tcp_write/tcp_output. */ if((tcp_do_output_nagle(pcb) == 0) && ((pcb->flags & (TF_NAGLEMEMERR | TF_FIN)) == 0)){ @@ -1010,6 +1238,7 @@ tcp_zero_window_probe(struct tcp_pcb *pcb) return; is_fin = ((TCPH_FLAGS(seg->tcphdr) & TCP_FIN) != 0) && (seg->len == 0); + /* @todo: is this correct? p->len should not depend on the FIN flag! */ len = is_fin ? TCP_HLEN : TCP_HLEN + 1; p = pbuf_alloc(PBUF_IP, len, PBUF_RAM); diff --git a/src/include/lwip/opt.h b/src/include/lwip/opt.h index 7bb9aa22..c7cf3585 100644 --- a/src/include/lwip/opt.h +++ b/src/include/lwip/opt.h @@ -937,6 +937,24 @@ #define TCP_DEFAULT_LISTEN_BACKLOG 0xff #endif +/** + * TCP_OVERSIZE: The maximum number of bytes that tcp_write may + * allocate ahead of time in an attempt to create shorter pbuf chains + * for transmission. The meaningful range is 0 to TCP_MSS. Some + * suggested values are: + * + * 0: Disable oversized allocation. Each tcp_write() allocates a new + pbuf (old behaviour). + * 1: Allocate size-aligned pbufs with minimal excess. Use this if your + * scatter-gather DMA requires aligned fragments. + * 128: Limit the pbuf/memory overhead to 20%. + * TCP_MSS: Try to create unfragmented TCP packets. + * TCP_MSS/4: Try to create 4 fragments or less per TCP packet. + */ +#ifndef TCP_OVERSIZE +#define TCP_OVERSIZE TCP_MSS +#endif + /** * LWIP_TCP_TIMESTAMPS==1: support the TCP timestamp option. */ diff --git a/src/include/lwip/tcp.h b/src/include/lwip/tcp.h index bd736c80..5fcc7a3f 100644 --- a/src/include/lwip/tcp.h +++ b/src/include/lwip/tcp.h @@ -230,8 +230,12 @@ struct tcp_pcb { u16_t snd_buf; /* Available buffer space for sending (in bytes). */ #define TCP_SNDQUEUELEN_OVERFLOW (0xffff-3) u16_t snd_queuelen; /* Available buffer space for sending (in tcp_segs). */ - - + +#if TCP_OVERSIZE + /* Extra bytes available at the end of the last pbuf in unsent. */ + u16_t unsent_oversize; +#endif /* TCP_OVERSIZE */ + /* These are ordered by sequence number: */ struct tcp_seg *unsent; /* Unsent (queued) segments. */ struct tcp_seg *unacked; /* Sent but unacknowledged segments. */ @@ -342,7 +346,7 @@ void tcp_abort (struct tcp_pcb *pcb); err_t tcp_close (struct tcp_pcb *pcb); err_t tcp_shutdown(struct tcp_pcb *pcb, int shut_rx, int shut_tx); -/* Flags for "apiflags" parameter in tcp_write and tcp_enqueue */ +/* Flags for "apiflags" parameter in tcp_write */ #define TCP_WRITE_FLAG_COPY 0x01 #define TCP_WRITE_FLAG_MORE 0x02 diff --git a/src/include/lwip/tcp_impl.h b/src/include/lwip/tcp_impl.h index e6a9a842..4283ab8b 100644 --- a/src/include/lwip/tcp_impl.h +++ b/src/include/lwip/tcp_impl.h @@ -177,6 +177,8 @@ PACK_STRUCT_END #define TCPH_OFFSET_SET(phdr, offset) (phdr)->_hdrlen_rsvd_flags = htons(((offset) << 8) | TCPH_FLAGS(phdr)) #define TCPH_HDRLEN_SET(phdr, len) (phdr)->_hdrlen_rsvd_flags = htons(((len) << 12) | TCPH_FLAGS(phdr)) #define TCPH_FLAGS_SET(phdr, flags) (phdr)->_hdrlen_rsvd_flags = (((phdr)->_hdrlen_rsvd_flags & htons((u16_t)(~(u16_t)(TCP_FLAGS)))) | htons(flags)) +#define TCPH_HDRLEN_FLAGS_SET(phdr, len, flags) (phdr)->_hdrlen_rsvd_flags = htons(((len) << 12) | (flags)) + #define TCPH_SET_FLAG(phdr, flags ) (phdr)->_hdrlen_rsvd_flags = ((phdr)->_hdrlen_rsvd_flags | htons(flags)) #define TCPH_UNSET_FLAG(phdr, flags) (phdr)->_hdrlen_rsvd_flags = htons(ntohs((phdr)->_hdrlen_rsvd_flags) | (TCPH_FLAGS(phdr) & ~(flags)) ) @@ -384,9 +386,8 @@ struct tcp_seg *tcp_seg_copy(struct tcp_seg *seg); (pcb)->flags |= TF_ACK_NOW; \ } while (0) -err_t tcp_send_ctrl(struct tcp_pcb *pcb, u8_t flags); -err_t tcp_enqueue(struct tcp_pcb *pcb, void *dataptr, u16_t len, - u8_t flags, u8_t apiflags, u8_t optflags); +err_t tcp_send_fin(struct tcp_pcb *pcb); +err_t tcp_enqueue_flags(struct tcp_pcb *pcb, u8_t flags); void tcp_rexmit_seg(struct tcp_pcb *pcb, struct tcp_seg *seg);