TAS
TCP Acceleration as an OS Service
network.c
/*
 * Copyright 2019 University of Washington, Max Planck Institute for
 * Software Systems, and The University of Texas at Austin
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include <stdio.h>
#include <inttypes.h> /* PRIu64, used in network_dump_stats */
#include <assert.h>

#include <rte_config.h>
#include <rte_memcpy.h>
#include <rte_malloc.h>
#include <rte_lcore.h>
#include <rte_ether.h>
#include <rte_ethdev.h>
#include <rte_mempool.h>
#include <rte_mbuf.h>
#include <rte_ip.h>
#include <rte_version.h>
#include <rte_spinlock.h>

#include <utils.h>
#include <utils_rng.h>
#include <tas_memif.h>
#include "internal.h"

#define PERTHREAD_MBUFS 2048
#define MBUF_SIZE (BUFFER_SIZE + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM)
#define RX_DESCRIPTORS 256
#define TX_DESCRIPTORS 128

uint8_t net_port_id = 0;
static struct rte_eth_conf port_conf = {
  .rxmode = {
    .mq_mode = ETH_MQ_RX_RSS,
    .offloads = 0,
#if RTE_VER_YEAR < 18
    .ignore_offload_bitfield = 1,
#endif
  },
  .txmode = {
    .mq_mode = ETH_MQ_TX_NONE,
    .offloads = 0,
  },
  .rx_adv_conf = {
    .rss_conf = {
      .rss_hf = ETH_RSS_NONFRAG_IPV4_TCP,
    },
  },
  .intr_conf = {
    .rxq = 1,
  },
};

static unsigned num_threads;
static struct network_rx_thread **net_threads;

static struct rte_eth_dev_info eth_devinfo;
#if RTE_VER_YEAR < 19
struct ether_addr eth_addr;
#else
struct rte_ether_addr eth_addr;
#endif

uint16_t rss_reta_size;
static struct rte_eth_rss_reta_entry64 *rss_reta = NULL;
static uint16_t *rss_core_buckets = NULL;

static struct rte_mempool *mempool_alloc(void);
static int reta_setup(void);
static int reta_mlx5_resize(void);
static rte_spinlock_t initlock = RTE_SPINLOCK_INITIALIZER;

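/* Initialize the network subsystem: ensure exactly one ethernet port is
 * present, check that the NIC exposes enough hardware queues for the
 * requested number of fast-path cores, and configure the port (RSS,
 * checksum offload, rx interrupts). Expected call order, as inferred from
 * this file: the main thread calls network_init(), each fast-path thread
 * then calls network_thread_init() on its own context, and
 * network_cleanup() runs at shutdown. */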
int network_init(unsigned n_threads)
{
  uint8_t count;
  int ret;
  uint16_t p;

  num_threads = n_threads;

  /* allocate thread pointer arrays */
  net_threads = rte_calloc("net thread ptrs", n_threads, sizeof(*net_threads), 0);
  if (net_threads == NULL) {
    goto error_exit;
  }

  /* make sure there is only one port */
#if RTE_VER_YEAR < 18
  count = rte_eth_dev_count();
#else
  count = rte_eth_dev_count_avail();
#endif
  if (count == 0) {
    fprintf(stderr, "No ethernet devices\n");
    goto error_exit;
  } else if (count > 1) {
    fprintf(stderr, "Multiple ethernet devices\n");
    goto error_exit;
  }

  RTE_ETH_FOREACH_DEV(p) {
    net_port_id = p;
  }

  /* get mac address and device info */
  rte_eth_macaddr_get(net_port_id, &eth_addr);
  rte_eth_dev_info_get(net_port_id, &eth_devinfo);

  if (eth_devinfo.max_rx_queues < n_threads ||
      eth_devinfo.max_tx_queues < n_threads)
  {
    fprintf(stderr, "Error: NIC does not support enough hw queues (rx=%u tx=%u)"
        " for the requested number of cores (%u)\n", eth_devinfo.max_rx_queues,
        eth_devinfo.max_tx_queues, n_threads);
    goto error_exit;
  }

  /* mask unsupported RSS hash functions */
  if ((port_conf.rx_adv_conf.rss_conf.rss_hf &
       eth_devinfo.flow_type_rss_offloads) !=
      port_conf.rx_adv_conf.rss_conf.rss_hf)
  {
    fprintf(stderr, "Warning: NIC does not support all requested RSS "
        "hash functions.\n");
    port_conf.rx_adv_conf.rss_conf.rss_hf &= eth_devinfo.flow_type_rss_offloads;
  }

  /* enable per-port checksum offload if requested */
  if (config.fp_xsumoffload)
    port_conf.txmode.offloads =
        DEV_TX_OFFLOAD_IPV4_CKSUM | DEV_TX_OFFLOAD_TCP_CKSUM;

  /* disable rx interrupts if requested */
  if (!config.fp_interrupts)
    port_conf.intr_conf.rxq = 0;

  /* initialize port */
  ret = rte_eth_dev_configure(net_port_id, n_threads, n_threads, &port_conf);
  if (ret < 0) {
    fprintf(stderr, "rte_eth_dev_configure failed\n");
    goto error_exit;
  }

  /* workaround for mlx5: grow the RETA before reta_setup() programs it */
  if (config.fp_autoscale) {
    if (reta_mlx5_resize() != 0) {
      goto error_exit;
    }
  }

#if RTE_VER_YEAR < 18
  eth_devinfo.default_txconf.txq_flags = ETH_TXQ_FLAGS_IGNORE;
#endif
  eth_devinfo.default_rxconf.offloads = 0;

  /* enable per-queue checksum offload if requested */
  eth_devinfo.default_txconf.offloads = 0;
  if (config.fp_xsumoffload)
    eth_devinfo.default_txconf.offloads =
        DEV_TX_OFFLOAD_IPV4_CKSUM | DEV_TX_OFFLOAD_TCP_CKSUM;

  /* export the NIC's 48-bit MAC address */
  memcpy(&tas_info->mac_address, &eth_addr, 6);

  return 0;

error_exit:
  rte_free(net_threads);
  return -1;
}

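/* Stop the ethernet device and release the thread pointer array. */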
void network_cleanup(void)
{
  rte_eth_dev_stop(net_port_id);
  rte_free(net_threads);
}

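/* Print NIC packet/byte/drop/error counters to stderr. */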
void network_dump_stats(void)
{
  struct rte_eth_stats stats;
  if (rte_eth_stats_get(net_port_id, &stats) == 0) {
    fprintf(stderr, "network stats: ipackets=%"PRIu64" opackets=%"PRIu64
        " ibytes=%"PRIu64" obytes=%"PRIu64" imissed=%"PRIu64" ierrors=%"PRIu64
        " oerrors=%"PRIu64" rx_nombuf=%"PRIu64"\n", stats.ipackets,
        stats.opackets, stats.ibytes, stats.obytes, stats.imissed,
        stats.ierrors, stats.oerrors, stats.rx_nombuf);
  } else {
    fprintf(stderr, "failed to get stats\n");
  }
}

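/* Per-thread data path initialization: allocate an mbuf pool and set up this
 * thread's tx and rx queue. The static counters below implement simple spin
 * barriers: all tx queues are configured before any rx queue, all rx queues
 * before the device starts, and only thread 0 starts the device and, if
 * configured, enables VLAN stripping and programs the initial RETA. */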
int network_thread_init(struct dataplane_context *ctx)
{
  static volatile uint32_t tx_init_done = 0;
  static volatile uint32_t rx_init_done = 0;
  static volatile uint32_t start_done = 0;

  struct network_thread *t = &ctx->net;
  int ret;

  /* allocate mempool */
  if ((t->pool = mempool_alloc()) == NULL) {
    goto error_mpool;
  }

  /* initialize tx queue */
  t->queue_id = ctx->id;
  rte_spinlock_lock(&initlock);
  ret = rte_eth_tx_queue_setup(net_port_id, t->queue_id, TX_DESCRIPTORS,
      rte_socket_id(), &eth_devinfo.default_txconf);
  rte_spinlock_unlock(&initlock);
  if (ret != 0) {
    fprintf(stderr, "network_thread_init: rte_eth_tx_queue_setup failed\n");
    goto error_tx_queue;
  }

  /* barrier to make sure all tx queues are initialized first */
  __sync_add_and_fetch(&tx_init_done, 1);
  while (tx_init_done < num_threads);

  /* initialize rx queue */
  t->queue_id = ctx->id;
  rte_spinlock_lock(&initlock);
  ret = rte_eth_rx_queue_setup(net_port_id, t->queue_id, RX_DESCRIPTORS,
      rte_socket_id(), &eth_devinfo.default_rxconf, t->pool);
  rte_spinlock_unlock(&initlock);
  if (ret != 0) {
    fprintf(stderr, "network_thread_init: rte_eth_rx_queue_setup failed\n");
    goto error_rx_queue;
  }

  /* barrier to make sure all rx queues are initialized before the device is
   * started */
  __sync_add_and_fetch(&rx_init_done, 1);
  while (rx_init_done < num_threads);

  /* start device if this is core 0 */
  if (ctx->id == 0) {
    if (rte_eth_dev_start(net_port_id) != 0) {
      fprintf(stderr, "rte_eth_dev_start failed\n");
      goto error_tx_queue;
    }

    /* enable vlan stripping if configured */
    if (config.fp_vlan_strip) {
      ret = rte_eth_dev_get_vlan_offload(net_port_id);
      ret |= ETH_VLAN_STRIP_OFFLOAD;
      if (rte_eth_dev_set_vlan_offload(net_port_id, ret)) {
        fprintf(stderr, "network_thread_init: vlan offload set failed\n");
        goto error_tx_queue;
      }
    }

    /* set up RETA for autoscaling if enabled */
    if (config.fp_autoscale) {
      if (reta_setup() != 0) {
        fprintf(stderr, "RETA setup failed\n");
        goto error_tx_queue;
      }
    }
    start_done = 1;
  }

  /* barrier: wait for core 0 to start the device */
  while (!start_done);

  if (config.fp_interrupts) {
    /* setup rx queue interrupt */
    rte_spinlock_lock(&initlock);
    ret = rte_eth_dev_rx_intr_ctl_q(net_port_id, t->queue_id,
        RTE_EPOLL_PER_THREAD, RTE_INTR_EVENT_ADD, NULL);
    rte_spinlock_unlock(&initlock);
    if (ret != 0) {
      fprintf(stderr, "network_thread_init: rte_eth_dev_rx_intr_ctl_q failed "
          "(%d)\n", rte_errno);
      goto error_int_queue;
    }
  }

  return 0;

error_int_queue:
  /* TODO: destroy rx queue */
error_rx_queue:
  /* TODO: destroy tx queue */
error_tx_queue:
  /* TODO: free mempool */
error_mpool:
  /* note: t points into ctx and is not separately allocated, so there is
   * nothing to free here */
  return -1;
}

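/* Enable or disable the rx interrupt for this thread's queue. */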
int network_rx_interrupt_ctl(struct network_thread *t, int turnon)
{
  if (turnon) {
    return rte_eth_dev_rx_intr_enable(net_port_id, t->queue_id);
  } else {
    return rte_eth_dev_rx_intr_disable(net_port_id, t->queue_id);
  }
}

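/* Allocate a packet mbuf pool with a unique per-thread name. */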
static struct rte_mempool *mempool_alloc(void)
{
  static unsigned pool_id = 0;
  unsigned n;
  char name[32];
  n = __sync_fetch_and_add(&pool_id, 1);
  snprintf(name, 32, "mbuf_pool_%u", n);
  return rte_mempool_create(name, PERTHREAD_MBUFS, MBUF_SIZE, 32,
      sizeof(struct rte_pktmbuf_pool_private), rte_pktmbuf_pool_init, NULL,
      rte_pktmbuf_init, NULL, rte_socket_id(), 0);
}

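/* Return the index of the core with the fewest RSS buckets among the first
 * num cores. */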
static inline uint16_t core_min(uint16_t num)
{
  uint16_t i, i_min = 0, v_min = UINT16_MAX;

  for (i = 0; i < num; i++) {
    if (rss_core_buckets[i] < v_min) {
      v_min = rss_core_buckets[i];
      i_min = i;
    }
  }

  return i_min;
}

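/* Return the index of the core with the most RSS buckets among the first
 * num cores. */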
static inline uint16_t core_max(uint16_t num)
{
  uint16_t i, i_max = 0, v_max = 0;

  for (i = 0; i < num; i++) {
    if (rss_core_buckets[i] >= v_max) {
      v_max = rss_core_buckets[i];
      i_max = i;
    }
  }

  return i_max;
}

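/* Grow from old to new active cores: each new core j takes an equal share
 * (rss_reta_size / new) of RETA buckets, stealing one bucket at a time from
 * the currently most loaded old core, then the updated table is pushed to
 * the NIC. */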
int network_scale_up(uint16_t old, uint16_t new)
{
  uint16_t i, j, k, c, share = rss_reta_size / new;
  uint16_t outer, inner;

  /* clear mask */
  for (k = 0; k < rss_reta_size; k += RTE_RETA_GROUP_SIZE) {
    rss_reta[k / RTE_RETA_GROUP_SIZE].mask = 0;
  }

  k = 0;
  for (j = old; j < new; j++) {
    for (i = 0; i < share; i++) {
      c = core_max(old);

      /* scan for the next RETA entry that still points at c, and move it
       * to the new core j */
      for (; ; k = (k + 1) % rss_reta_size) {
        outer = k / RTE_RETA_GROUP_SIZE;
        inner = k % RTE_RETA_GROUP_SIZE;
        if (rss_reta[outer].reta[inner] == c) {
          rss_reta[outer].mask |= 1ULL << inner;
          rss_reta[outer].reta[inner] = j;
          fp_state->flow_group_steering[k] = j;
          break;
        }
      }

      rss_core_buckets[c]--;
      rss_core_buckets[j]++;
    }
  }

  if (rte_eth_dev_rss_reta_update(net_port_id, rss_reta, rss_reta_size) != 0) {
    fprintf(stderr, "network_scale_up: rte_eth_dev_rss_reta_update failed\n");
    return -1;
  }

  return 0;
}

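/* Shrink to new active cores: every RETA bucket that still points at a
 * disabled core (index >= new) is reassigned to the currently least loaded
 * remaining core, then the updated table is pushed to the NIC. */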
int network_scale_down(uint16_t old, uint16_t new)
{
  uint16_t i, o_c, n_c, outer, inner;

  /* clear mask */
  for (i = 0; i < rss_reta_size; i += RTE_RETA_GROUP_SIZE) {
    rss_reta[i / RTE_RETA_GROUP_SIZE].mask = 0;
  }

  for (i = 0; i < rss_reta_size; i++) {
    outer = i / RTE_RETA_GROUP_SIZE;
    inner = i % RTE_RETA_GROUP_SIZE;

    o_c = rss_reta[outer].reta[inner];
    if (o_c >= new) {
      n_c = core_min(new);

      rss_reta[outer].reta[inner] = n_c;
      rss_reta[outer].mask |= 1ULL << inner;

      fp_state->flow_group_steering[i] = n_c;

      rss_core_buckets[o_c]--;
      rss_core_buckets[n_c]++;
    }
  }

  if (rte_eth_dev_rss_reta_update(net_port_id, rss_reta, rss_reta_size) != 0) {
    fprintf(stderr, "network_scale_down: rte_eth_dev_rss_reta_update failed\n");
    return -1;
  }

  return 0;
}

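/* Allocate the RSS redirection table and per-core bucket counters, spread
 * the buckets round-robin across the currently active cores, mirror the
 * assignment into fp_state->flow_group_steering, and program the NIC. */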
static int reta_setup(void)
{
  uint16_t i, c;

  /* allocate RSS redirection table and core-bucket count table */
  rss_reta_size = eth_devinfo.reta_size;
  rss_reta = rte_calloc("rss reta", ((rss_reta_size + RTE_RETA_GROUP_SIZE - 1) /
      RTE_RETA_GROUP_SIZE), sizeof(*rss_reta), 0);
  rss_core_buckets = rte_calloc("rss core buckets", fp_cores_max,
      sizeof(*rss_core_buckets), 0);

  if (rss_reta == NULL || rss_core_buckets == NULL) {
    fprintf(stderr, "reta_setup: allocation failed\n");
    goto error_exit;
  }

  if (rss_reta_size > FLEXNIC_PL_MAX_FLOWGROUPS) {
    fprintf(stderr, "reta_setup: reta size (%u) greater than maximum supported"
        " (%u)\n", rss_reta_size, FLEXNIC_PL_MAX_FLOWGROUPS);
    abort();
  }

  /* initialize reta: assign buckets round-robin over the active cores */
  for (i = 0, c = 0; i < rss_reta_size; i++) {
    rss_core_buckets[c]++;
    rss_reta[i / RTE_RETA_GROUP_SIZE].mask = -1ULL;
    rss_reta[i / RTE_RETA_GROUP_SIZE].reta[i % RTE_RETA_GROUP_SIZE] = c;
    fp_state->flow_group_steering[i] = c;
    c = (c + 1) % fp_cores_cur;
  }

  if (rte_eth_dev_rss_reta_update(net_port_id, rss_reta, rss_reta_size) != 0) {
    fprintf(stderr, "reta_setup: rte_eth_dev_rss_reta_update failed\n");
    return -1;
  }

  return 0;

error_exit:
  rte_free(rss_core_buckets);
  rte_free(rss_reta);
  return -1;
}

/* The mlx5 driver by default picks a RETA size equal to the number of queues,
 * which is not enough for scaling up and down with balanced load. However,
 * when the RETA is updated with a larger size, the mlx5 driver resizes it.
 */
static int reta_mlx5_resize(void)
{
  if (!strcmp(eth_devinfo.driver_name, "net_mlx5")) {
    /* for mlx5 we can increase the size with a call to
     * rte_eth_dev_rss_reta_update with the target size, so just raise
     * reta_size in devinfo so that the reta_setup() call increases it.
     */
    eth_devinfo.reta_size = 512;
  }

  /* warn if reta is too small */
  if (eth_devinfo.reta_size < 128) {
    fprintf(stderr, "net: RSS redirection table is small (%u), this results in"
        " bad load balancing when scaling down\n", eth_devinfo.reta_size);
  }

  return 0;
}