diff options
author | gallatin <gallatin@ccf9f872-aa2e-dd11-9fc8-001c23d0bc1f> | 2020-12-19 21:46:09 +0000 |
---|---|---|
committer | gallatin <gallatin@ccf9f872-aa2e-dd11-9fc8-001c23d0bc1f> | 2020-12-19 21:46:09 +0000 |
commit | c1f5bee70f94c413b460a0561416e60932a43f80 (patch) | |
tree | 05fb1928d02f84e6d047a78e405e9caee69bdbb8 | |
parent | e764aa1ce8e425cf3a6e737a4a5ab9af59725b35 (diff) | |
download | freebsd-c1f5bee70f94c413b460a0561416e60932a43f80.tar.gz freebsd-c1f5bee70f94c413b460a0561416e60932a43f80.tar.bz2 |
Optionally bind ktls threads to NUMA domains
When ktls_bind_threads is 2, we pick a ktls worker thread that is
bound to the same domain as the TCP connection associated with
the socket. We use roughly the same code as netinet/tcp_hpts.c to
do this. This allows crypto to run on the same domain that the TCP
connection is associated with. Assuming TCP_REUSPORT_LB_NUMA
(D21636) is in place & in use, this ensures that the crypto source
and destination buffers are local to the same NUMA domain as we're
running crypto on.
This change (when TCP_REUSPORT_LB_NUMA, D21636, is used) reduces
cross-domain traffic from over 37% down to about 13% as measured
by pcm.x on a dual-socket Xeon using nginx and a Netflix workload.
Reviewed by: jhb
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D21648
git-svn-id: http://svn.freebsd.org/base/head@368818 ccf9f872-aa2e-dd11-9fc8-001c23d0bc1f
-rw-r--r-- | sys/kern/uipc_ktls.c | 42 |
1 file changed, 39 insertions, 3 deletions
diff --git a/sys/kern/uipc_ktls.c b/sys/kern/uipc_ktls.c index 2776d6fcd19..6ea14e8aa9e 100644 --- a/sys/kern/uipc_ktls.c +++ b/sys/kern/uipc_ktls.c @@ -34,6 +34,7 @@ __FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/kernel.h> +#include <sys/domainset.h> #include <sys/ktls.h> #include <sys/lock.h> #include <sys/mbuf.h> @@ -83,6 +84,12 @@ struct ktls_wq { bool running; } __aligned(CACHE_LINE_SIZE); +struct ktls_domain_info { + int count; + int cpu[MAXCPU]; +}; + +struct ktls_domain_info ktls_domains[MAXMEMDOM]; static struct ktls_wq *ktls_wq; static struct proc *ktls_proc; LIST_HEAD(, ktls_crypto_backend) ktls_backends; @@ -316,6 +323,9 @@ static u_int ktls_get_cpu(struct socket *so) { struct inpcb *inp; +#ifdef NUMA + struct ktls_domain_info *di; +#endif u_int cpuid; inp = sotoinpcb(so); @@ -330,7 +340,13 @@ ktls_get_cpu(struct socket *so) * serialization provided by having the same connection use * the same queue. */ - cpuid = ktls_cpuid_lookup[inp->inp_flowid % ktls_number_threads]; +#ifdef NUMA + if (ktls_bind_threads > 1 && inp->inp_numa_domain != M_NODOM) { + di = &ktls_domains[inp->inp_numa_domain]; + cpuid = di->cpu[inp->inp_flowid % di->count]; + } else +#endif + cpuid = ktls_cpuid_lookup[inp->inp_flowid % ktls_number_threads]; return (cpuid); } #endif @@ -341,7 +357,7 @@ ktls_init(void *dummy __unused) struct thread *td; struct pcpu *pc; cpuset_t mask; - int error, i; + int count, domain, error, i; ktls_tasks_active = counter_u64_alloc(M_WAITOK); ktls_cnt_tx_queued = counter_u64_alloc(M_WAITOK); @@ -397,7 +413,11 @@ ktls_init(void *dummy __unused) if (ktls_bind_threads) { if (ktls_bind_threads > 1) { pc = pcpu_find(i); - CPU_COPY(&cpuset_domain[pc->pc_domain], &mask); + domain = pc->pc_domain; + CPU_COPY(&cpuset_domain[domain], &mask); + count = ktls_domains[domain].count; + ktls_domains[domain].cpu[count] = i; + ktls_domains[domain].count++; } else { CPU_SETOF(i, &mask); } @@ -410,6 +430,18 @@ ktls_init(void *dummy __unused) 
ktls_cpuid_lookup[ktls_number_threads] = i; ktls_number_threads++; } + + /* + * If we somehow have an empty domain, fall back to choosing + * among all KTLS threads. + */ + for (i = 0; i < vm_ndomains; i++) { + if (ktls_domains[i].count == 0) { + ktls_bind_threads = 0; + break; + } + } + printf("KTLS: Initialized %d threads\n", ktls_number_threads); } SYSINIT(ktls, SI_SUB_SMP + 1, SI_ORDER_ANY, ktls_init, NULL); @@ -2093,6 +2125,10 @@ ktls_work_thread(void *ctx) STAILQ_HEAD(, mbuf) local_m_head; STAILQ_HEAD(, socket) local_so_head; + if (ktls_bind_threads > 1) { + curthread->td_domain.dr_policy = + DOMAINSET_PREF(PCPU_GET(domain)); + } #if defined(__aarch64__) || defined(__amd64__) || defined(__i386__) fpu_kern_thread(0); #endif |