31 #include <rte_config.h> 33 #include <rte_hash_crc.h> 36 #include <packet_defs.h> 38 #include <utils_rng.h> 42 #define TCP_HTSIZE 4096 44 #define PORT_MAX ((1u << 16) - 1) 45 #define PORT_FIRST_EPH 8192 47 #define PORT_TYPE_UNUSED 0x0ULL 48 #define PORT_TYPE_LISTEN 0x1ULL 49 #define PORT_TYPE_LMULTI 0x2ULL 50 #define PORT_TYPE_CONN 0x3ULL 51 #define PORT_TYPE_MASK 0x3ULL 54 #define LISTEN_MULTI_MAX 32 56 #define CONN_DEBUG(c, f, x...) do { } while (0) 57 #define CONN_DEBUG0(c, f) do { } while (0) 63 struct listener *ls[LISTEN_MULTI_MAX];
76 static int conn_arp_done(
struct connection *conn);
78 const struct tcp_opts *opts, uint32_t fn_core, uint16_t flow_group);
79 static inline struct connection *conn_alloc(
void);
80 static inline void conn_free(
struct connection *conn);
81 static void conn_register(
struct connection *conn);
82 static void conn_unregister(
struct connection *conn);
86 static int conn_reg_synack(
struct connection *c);
88 static void conn_timeout_arm(
struct connection *c,
int type);
89 static void conn_timeout_disarm(
struct connection *c);
90 static void conn_close_timeout(
struct connection *c);
93 static void listener_packet(
struct listener *l,
const struct pkt_tcp *p,
94 const struct tcp_opts *opts, uint32_t fn_core, uint16_t flow_group);
95 static void listener_accept(
struct listener *l);
97 static inline uint16_t port_alloc(
void);
98 static inline int send_control(
const struct connection *conn, uint16_t
flags,
99 int ts_opt, uint32_t ts_echo, uint16_t mss_opt);
100 static inline int send_reset(
const struct pkt_tcp *p,
102 static inline int parse_options(
const struct pkt_tcp *p, uint16_t len,
105 static uintptr_t ports[PORT_MAX + 1];
106 static uint16_t port_eph_hint = PORT_FIRST_EPH;
107 static struct nbqueue conn_async_q;
113 nbqueue_init(&conn_async_q);
116 port_eph_hint = utils_rng_gen32(&rng) % ((1 << 16) - 1 - PORT_FIRST_EPH);
117 if ((tcp_hashtable = calloc(TCP_HTSIZE,
sizeof(*tcp_hashtable))) == NULL) {
129 while ((p = nbqueue_deq(&conn_async_q)) != NULL) {
132 if ((ret = conn->
comp.status) != 0 || (ret = conn_arp_done(conn)) != 0) {
133 conn_failed(conn, ret);
136 if ((ret = conn->
comp.status) != 0 ||
137 (ret = conn_reg_synack(conn)) != 0)
139 conn_failed(conn, ret);
142 fprintf(stderr,
"tcp_poll: unexpected conn state %u\n", conn->
status);
155 if ((conn = conn_alloc()) == NULL) {
156 fprintf(stderr,
"tcp_open: malloc failed\n");
160 CONN_DEBUG(conn,
"opening connection (ctx=%p, op=%"PRIx64
", rip=%x, rp=%u, " 161 "db=%u)\n", ctx, opaque, remote_ip, remote_port, db_id);
164 if ((local_port = port_alloc()) == 0) {
165 fprintf(stderr,
"tcp_open: port_alloc failed\n");
183 conn->
comp.q = &conn_async_q;
184 conn->
comp.notify_fd = -1;
185 conn->
comp.status = 0;
191 fprintf(stderr,
"tcp_open: nicif_arp failed\n");
194 }
else if (ret == 0) {
195 CONN_DEBUG0(conn,
"routing_resolve succeeded immediately\n");
198 ret = conn_arp_done(conn);
200 CONN_DEBUG0(conn,
"routing_resolve pending\n");
205 ports[
local_port] = (uintptr_t) conn | PORT_TYPE_CONN;
212 uint32_t backlog,
int reuseport,
struct listener **listen)
221 type = ports[local_port] & PORT_TYPE_MASK;
222 if (type != PORT_TYPE_UNUSED && reuseport == 0) {
223 fprintf(stderr,
"tcp_listen: port not unused\n");
225 }
else if (reuseport != 0 && type != PORT_TYPE_UNUSED &&
226 type != PORT_TYPE_LMULTI)
228 fprintf(stderr,
"tcp_listen: port not unused or multi listener\n");
233 if (reuseport != 0 && type == PORT_TYPE_UNUSED) {
234 if ((lm_new = calloc(1,
sizeof(*lm_new))) == NULL) {
235 fprintf(stderr,
"tcp_listen: calloc listen_multi failed\n");
239 }
else if (reuseport != 0) {
240 lm = (
struct listen_multi *) (ports[local_port] & ~PORT_TYPE_MASK);
241 if (lm->num == LISTEN_MULTI_MAX) {
242 fprintf(stderr,
"tcp_listen: no more additional listeners supported\n");
248 if ((lst = calloc(1,
sizeof(*lst))) == NULL) {
249 fprintf(stderr,
"tcp_listen: malloc failed\n");
255 if ((lst->
backlog_ptrs = calloc(backlog,
sizeof(
void *))) == NULL) {
256 fprintf(stderr,
"tcp_listen: malloc backlog_ptrss failed\n");
264 fprintf(stderr,
"tcp_listen: malloc backlog_cores failed\n");
273 fprintf(stderr,
"tcp_listen: malloc backlog_fgs failed\n");
282 if ((bls = malloc(
sizeof(*bls) * backlog)) == NULL) {
283 fprintf(stderr,
"tcp_listen: malloc backlog bufs failed\n");
291 for (i = 0; i < backlog; i++) {
298 lst->
port = local_port;
306 if (reuseport == 0) {
307 ports[local_port] = (uintptr_t) lst | PORT_TYPE_LISTEN;
309 lm->ls[lm->num] = lst;
311 if (lm_new != NULL) {
313 ports[local_port] = (uintptr_t) lm | PORT_TYPE_LMULTI;
323 struct listener *listen, uint32_t db_id)
328 if ((conn = conn_alloc()) == NULL) {
329 fprintf(stderr,
"tcp_accept: conn_alloc failed\n");
345 listener_accept(listen);
359 if (len <
sizeof(*p)) {
360 fprintf(stderr,
"tcp_packet: incomplete TCP receive (%u received, " 361 "%u expected)\n", len, (
unsigned)
sizeof(*p));
365 if (f_beui32(p->ip.dest) != config.
ip) {
366 fprintf(stderr,
"tcp_packet: unexpected destination IP (%x received, " 367 "%x expected)\n", f_beui32(p->ip.dest), config.
ip);
371 if (parse_options(p, len, &opts) != 0) {
372 fprintf(stderr,
"tcp_packet: parsing TCP options failed\n");
376 if ((c = conn_lookup(p)) != NULL) {
377 conn_packet(c, p, &opts, fn_core, flow_group);
378 }
else if ((l = listener_lookup(p)) != NULL) {
379 listener_packet(l, p, &opts, fn_core, flow_group);
384 if (!(TCPH_FLAGS(&p->tcp) & TCP_RST) &&
386 send_reset(p, &opts);
394 uint32_t tx_seq, rx_seq;
398 fprintf(stderr,
"tcp_close: currently no support for non-opened conns.\n");
406 fprintf(stderr,
"tcp_close: nicif_connection_disable failed unexpected\n");
413 if (!tx_c || !rx_c) {
414 send_control(conn, TCP_RST, 0, 0, 0);
437 ((uintptr_t) to - offsetof(
struct connection, to));
443 if (type == TO_TCP_CLOSED) {
444 conn_close_timeout(c);
446 }
else if (type != TO_TCP_HANDSHAKE) {
447 fprintf(stderr,
"tcp_timeout: unexpected timeout type (%u)\n", type);
451 fprintf(stderr,
"tcp_timeout: unexpected connection state (%u)\n", c->
status);
457 fprintf(stderr,
"tcp_timeout: giving up because of too many retries\n");
464 conn_timeout_arm(c, TO_TCP_HANDSHAKE);
467 send_control(c, TCP_SYN | TCP_ECE | TCP_CWR, 1, 0, TCP_MSS);
474 uint32_t ecn_flags = 0;
480 if ((ret = conn_syn_sent_packet(c, p, opts)) != 0) {
484 (TCPH_FLAGS(&p->tcp) & ~ecn_flags) == TCP_SYN)
489 if (opts->
ts == NULL) {
490 fprintf(stderr,
"conn_packet: re-transmitted SYN does not have TS " 501 send_control(c, TCP_SYN | TCP_ACK | ecn_flags, 1,
502 f_beui32(opts->
ts->ts_val), TCP_MSS);
504 (TCPH_FLAGS(&p->tcp) & TCP_SYN) == TCP_SYN)
508 (TCPH_FLAGS(&p->tcp) & TCP_FIN) == TCP_FIN)
512 send_control(c, TCP_ACK, 1, 0, 0);
514 fprintf(stderr,
"tcp_packet: unexpected connection state %u\n", c->
status);
518 static int conn_arp_done(
struct connection *conn)
520 CONN_DEBUG0(conn,
"arp resolution done\n");
527 conn_timeout_arm(conn, TO_TCP_HANDSHAKE);
530 send_control(conn, TCP_SYN | TCP_ECE | TCP_CWR, 1, 0, TCP_MSS);
532 CONN_DEBUG0(conn,
"SYN SENT\n");
539 uint32_t ecn_flags = TCPH_FLAGS(&p->tcp) & (TCP_ECE | TCP_CWR);
542 conn_timeout_disarm(c);
544 if ((TCPH_FLAGS(&p->tcp) & (TCP_SYN | TCP_ACK)) != (TCP_SYN | TCP_ACK)) {
545 fprintf(stderr,
"conn_syn_sent_packet: unexpected flags %x\n",
546 TCPH_FLAGS(&p->tcp));
549 if (opts->
ts == NULL) {
550 fprintf(stderr,
"conn_syn_sent_packet: no timestamp option received\n");
554 CONN_DEBUG0(c,
"conn_syn_sent_packet: syn-ack received\n");
558 c->
syn_ts = f_beui32(opts->
ts->ts_val);
561 if (ecn_flags == TCP_ECE) {
567 c->
comp.q = &conn_async_q;
568 c->
comp.notify_fd = -1;
578 fprintf(stderr,
"conn_syn_sent_packet: nicif_connection_add failed\n");
582 CONN_DEBUG0(c,
"conn_syn_sent_packet: connection registered\n");
587 send_control(c, TCP_ACK, 1, c->
syn_ts, 0);
589 CONN_DEBUG0(c,
"conn_syn_sent_packet: ACK sent\n");
596 static int conn_reg_synack(
struct connection *c)
598 uint32_t ecn_flags = 0;
607 send_control(c, TCP_SYN | TCP_ACK | ecn_flags, 1, c->
syn_ts, TCP_MSS);
614 static inline uint16_t port_alloc(
void)
616 uint16_t p, p_start, p_next;
618 p = p_start = port_eph_hint;
620 p_next = (((uint16_t) (p + 1)) < (uint16_t) PORT_FIRST_EPH ?
621 PORT_FIRST_EPH : p + 1);
623 if ((ports[p] & PORT_TYPE_MASK) == PORT_TYPE_UNUSED) {
624 port_eph_hint = p_next;
629 }
while (p != p_start);
634 static inline struct connection *conn_alloc(
void)
637 uintptr_t off_rx, off_tx;
639 if ((conn = malloc(
sizeof(*conn))) == NULL) {
640 fprintf(stderr,
"conn_alloc: malloc failed\n");
645 fprintf(stderr,
"conn_alloc: packetmem_alloc rx failed\n");
651 fprintf(stderr,
"conn_alloc: packetmem_alloc tx failed\n");
657 conn->
rx_buf = (uint8_t *) tas_shm + off_rx;
659 conn->
tx_buf = (uint8_t *) tas_shm + off_tx;
666 static inline void conn_free(
struct connection *conn)
673 static inline uint32_t conn_hash(uint32_t l_ip, uint32_t r_ip, uint16_t l_po,
676 return crc32c_sse42_u32(l_po | (((uint32_t) r_po) << 16),
677 crc32c_sse42_u64(l_ip | (((uint64_t) r_ip) << 32), 0));
680 static void conn_register(
struct connection *conn)
687 conn->
ht_next = tcp_hashtable[h];
688 tcp_hashtable[h] = conn;
691 static void conn_unregister(
struct connection *conn)
698 if (tcp_hashtable[h] == conn) {
699 tcp_hashtable[h] = conn->
ht_next;
701 for (cp = tcp_hashtable[h]; cp != NULL && cp->
ht_next != conn;
704 fprintf(stderr,
"conn_unregister: connection not found in ht\n");
717 h = conn_hash(f_beui32(p->ip.dest), f_beui32(p->ip.src),
718 f_beui16(p->tcp.dest), f_beui16(p->tcp.src)) % TCP_HTSIZE;
720 for (c = tcp_hashtable[h]; c != NULL; c = c->
ht_next) {
721 if (f_beui32(p->ip.src) == c->
remote_ip &&
735 conn_timeout_disarm(c);
743 static void conn_timeout_arm(
struct connection *c,
int type)
755 static void conn_timeout_disarm(
struct connection *c)
763 static void conn_close_timeout(
struct connection *c)
766 if ((ports[c->
local_port] & PORT_TYPE_MASK) == PORT_TYPE_CONN) {
781 if ((ports[c->
local_port] & PORT_TYPE_MASK) == PORT_TYPE_CONN) {
792 static inline uint32_t hash_64_to_32(uint64_t key)
794 key = (~key) + (key << 18);
795 key = key ^ (key >> 31);
797 key = key ^ (key >> 11);
798 key = key + (key << 6);
799 key = key ^ (key >> 22);
800 return (uint32_t) key;
805 uint16_t local_port = f_beui16(p->tcp.dest);
810 type = ports[local_port] & PORT_TYPE_MASK;
811 if (type == PORT_TYPE_LISTEN) {
813 return (
struct listener *) (ports[local_port] & ~PORT_TYPE_MASK);
814 }
else if (type == PORT_TYPE_LMULTI) {
816 lm = (
struct listen_multi *) (ports[local_port] & ~PORT_TYPE_MASK);
817 hash = hash_64_to_32(((uint64_t) f_beui32(p->ip.src) << 32) |
818 ((uint32_t) f_beui16(p->tcp.src) << 16) | local_port);
819 return lm->ls[hash % lm->num];
824 return (
struct listener *) (ports[local_port] & ~PORT_TYPE_MASK);
827 static void listener_packet(
struct listener *l,
const struct pkt_tcp *p,
828 const struct tcp_opts *opts, uint32_t fn_core, uint16_t flow_group)
835 if ((TCPH_FLAGS(&p->tcp) & ~(TCP_ECE | TCP_CWR)) != TCP_SYN) {
836 fprintf(stderr,
"listener_packet: Not a SYN (flags %x)\n",
837 TCPH_FLAGS(&p->tcp));
843 len =
sizeof(p->eth) + f_beui16(p->ip.len);
844 if (len >
sizeof(bls->buf)) {
845 fprintf(stderr,
"listener_packet: SYN larger than backlog buffer, " 851 for (n = 0, bp = l->
backlog_pos; n < l->backlog_used;
855 bl_p = (
struct pkt_tcp *) bls->buf;
856 if (f_beui32(p->ip.src) == f_beui32(bl_p->ip.src) &&
857 f_beui32(p->ip.dest) == f_beui32(bl_p->ip.dest) &&
858 f_beui16(p->tcp.src) == f_beui16(bl_p->tcp.src) &&
859 f_beui16(p->tcp.dest) == f_beui16(bl_p->tcp.dest))
866 fprintf(stderr,
"listener_packet: backlog queue full\n");
880 memcpy(bls->buf, p, len);
893 static void listener_accept(
struct listener *l)
899 uint32_t ecn_flags, fn_core;
909 p = (
const struct pkt_tcp *) bls->buf;
910 ret = parse_options(p, bls->len, &opts);
911 if (ret != 0 || opts.
ts == NULL) {
912 fprintf(stderr,
"listener_packet: parsing options failed or no timestamp " 920 memcpy(&c->
remote_mac, &p->eth.src, ETH_ADDR_LEN);
928 c->
syn_ts = f_beui32(opts.
ts->ts_val);
931 ecn_flags = TCPH_FLAGS(&p->tcp) & (TCP_ECE | TCP_CWR);
932 if (ecn_flags == (TCP_ECE | TCP_CWR)) {
940 c->
comp.q = &conn_async_q;
941 c->
comp.notify_fd = -1;
951 fprintf(stderr,
"listener_packet: nicif_connection_add failed\n");
957 nbqueue_enq(&conn_async_q, &c->
comp.el);
967 static inline int send_control_raw(uint64_t remote_mac, uint32_t remote_ip,
968 uint16_t remote_port, uint16_t local_port, uint32_t local_seq,
969 uint32_t remote_seq, uint16_t flags,
int ts_opt, uint32_t ts_echo,
977 uint16_t len, off_ts, off_mss;
982 optlen += (mss_opt ?
sizeof(*opt_mss) : 0);
984 optlen += (ts_opt ?
sizeof(*opt_ts) : 0);
985 optlen = (optlen + 3) & ~3;
986 len =
sizeof(*p) + optlen;
990 fprintf(stderr,
"send_control failed\n");
995 memcpy(&p->eth.dest, &remote_mac, ETH_ADDR_LEN);
996 memcpy(&p->eth.src, &
eth_addr, ETH_ADDR_LEN);
997 p->eth.type = t_beui16(ETH_TYPE_IP);
1000 IPH_VHL_SET(&p->ip, 4, 5);
1002 p->ip.len = t_beui16(len - offsetof(
struct pkt_tcp, ip));
1003 p->ip.id = t_beui16(3);
1004 p->ip.offset = t_beui16(0);
1006 p->ip.proto = IP_PROTO_TCP;
1008 p->ip.src = t_beui32(config.
ip);
1009 p->ip.dest = t_beui32(remote_ip);
1012 p->tcp.src = t_beui16(local_port);
1013 p->tcp.dest = t_beui16(remote_port);
1014 p->tcp.seqno = t_beui32(local_seq);
1015 p->tcp.ackno = t_beui32(remote_seq);
1016 TCPH_HDRLEN_FLAGS_SET(&p->tcp, 5 + optlen / 4, flags);
1017 p->tcp.wnd = t_beui16(11680);
1019 p->tcp.urgp = t_beui16(0);
1023 opt_mss = (
struct tcp_mss_opt *) ((uint8_t *) (p + 1) + off_mss);
1024 opt_mss->kind = TCP_OPT_MSS;
1025 opt_mss->length =
sizeof(*opt_mss);
1026 opt_mss->mss = t_beui16(mss_opt);
1032 memset(opt_ts, 0, optlen);
1033 opt_ts->kind = TCP_OPT_TIMESTAMP;
1034 opt_ts->length =
sizeof(*opt_ts);
1035 opt_ts->ts_val = t_beui32(0);
1036 opt_ts->ts_ecr = t_beui32(ts_echo);
1040 p->ip.chksum = rte_ipv4_cksum((
void *) &p->ip);
1041 p->tcp.chksum = rte_ipv4_udptcp_cksum((
void *) &p->ip, (
void *) &p->tcp);
1048 static inline int send_control(
const struct connection *conn, uint16_t flags,
1049 int ts_opt, uint32_t ts_echo, uint16_t mss_opt)
1056 static inline int send_reset(
const struct pkt_tcp *p,
1061 uint64_t remote_mac = 0;
1063 if (opts->
ts != NULL) {
1065 ts_val = f_beui32(opts->
ts->ts_val);
1068 memcpy(&remote_mac, &p->eth.src, ETH_ADDR_LEN);
1069 return send_control_raw(remote_mac, f_beui32(p->ip.src), f_beui16(p->tcp.src),
1070 f_beui16(p->tcp.dest), f_beui32(p->tcp.ackno), f_beui32(p->tcp.seqno) + 1,
1071 TCP_RST | TCP_ACK, ts_opt, ts_val, 0);
1074 static inline int parse_options(
const struct pkt_tcp *p, uint16_t len,
1077 uint8_t *opt = (uint8_t *) (p + 1);
1078 uint16_t opts_len = TCPH_HDRLEN(&p->tcp) * 4 - 20;
1080 uint8_t opt_kind, opt_len, opt_avail;
1086 if (TCPH_HDRLEN(&p->tcp) < 5 || opts_len > (len -
sizeof(*p))) {
1087 fprintf(stderr,
"hdrlen=%u opts_len=%u len=%u so=%zu\n", TCPH_HDRLEN(&p->tcp), opts_len, len,
sizeof(*p));
1091 while (off < opts_len) {
1092 opt_kind = opt[off];
1093 opt_avail = opts_len - off;
1094 if (opt_kind == TCP_OPT_END_OF_OPTIONS) {
1096 }
else if (opt_kind == TCP_OPT_NO_OP) {
1099 if (opt_avail < 2) {
1100 fprintf(stderr,
"parse_options: opt_avail=%u kind=%u off=%u\n", opt_avail, opt_kind, off);
1104 opt_len = opt[off + 1];
1105 if (opt_kind == TCP_OPT_MSS) {
1107 fprintf(stderr,
"parse_options: mss option size wrong (expect %zu " 1108 "got %u)\n",
sizeof(
struct tcp_mss_opt), opt_len);
1113 }
else if (opt_kind == TCP_OPT_TIMESTAMP) {
1115 fprintf(stderr,
"parse_options: opt_len=%u so=%zu\n", opt_len,
sizeof(
struct tcp_timestamp_opt));
void tcp_destroy(struct connection *conn)
struct packetmem_handle * rx_handle
void appif_conn_closed(struct connection *c, int status)
uint32_t tcp_handshake_to
struct nicif_completion comp
int nicif_connection_add(uint32_t db, uint64_t mac_remote, uint32_t ip_local, uint16_t port_local, uint32_t ip_remote, uint16_t port_remote, uint64_t rx_base, uint32_t rx_len, uint64_t tx_base, uint32_t tx_len, uint32_t remote_seq, uint32_t local_seq, uint64_t app_opaque, uint32_t flags, uint32_t rate, uint32_t fn_core, uint16_t flow_group, uint32_t *pf_id)
void nicif_connection_free(uint32_t f_id)
int tcp_packet(const void *pkt, uint16_t len, uint32_t fn_core, uint16_t flow_group)
void tcp_timeout(struct timeout *to, enum timeout_type type)
int tcp_listen(struct app_context *ctx, uint64_t opaque, uint16_t local_port, uint32_t backlog, int reuseport, struct listener **listen)
uint32_t util_timeout_time_us(void)
enum connection_status status
struct tcp_timestamp_opt * ts
void appif_listen_newconn(struct listener *l, uint32_t remote_ip, uint16_t remote_port)
int tcp_accept(struct app_context *ctx, uint64_t opaque, struct listener *listen, uint32_t db_id)
int packetmem_alloc(size_t length, uintptr_t *off, struct packetmem_handle **handle)
struct connection * ht_next
int tcp_open(struct app_context *ctx, uint64_t opaque, uint32_t remote_ip, uint16_t remote_port, uint32_t db_id, struct connection **pconn)
void util_timeout_disarm(struct timeout_manager *mgr, struct timeout *to)
void packetmem_free(struct packetmem_handle *handle)
int tcp_close(struct connection *conn)
void cc_conn_init(struct connection *conn)
int nicif_tx_alloc(uint16_t len, void **buf, uint32_t *opaque)
void appif_conn_opened(struct connection *c, int status)
int nicif_connection_disable(uint32_t f_id, uint32_t *tx_seq, uint32_t *rx_seq, int *tx_closed, int *rx_closed)
struct packetmem_handle * tx_handle
void appif_accept_conn(struct connection *c, int status)
void cc_conn_remove(struct connection *conn)
uint32_t tcp_handshake_retries
struct connection * wait_conns
void nicif_tx_send(uint32_t opaque, int no_ts)
void util_timeout_arm(struct timeout_manager *mgr, struct timeout *to, uint32_t us, uint8_t type)
int routing_resolve(struct nicif_completion *comp, uint32_t ip, uint64_t *mac)