﻿id	summary	reporter	owner	description	type	status	priority	milestone	component	version	resolution	keywords	cc	uname	nginx_version
1210	connections not distributed evenly after restart under high load with accept_mutex enabled	gallatin@…		"On a heavily loaded server (90,000 long lived connections) restarting nginx results in a few nginx workers with the connection limit, and others with very few connections.

conf file snippets:


{{{
worker_processes     64;
worker_rlimit_nofile 32768;
events {
    worker_connections  4096;

    accept_mutex on;

}

}}}

Before service nginx restart, server is serving 90+ Gb/s across ~90,000 connections:

{{{

            input        (Total)           output
   packets  errs idrops      bytes    packets  errs      bytes colls
   2237651     0     0  167011176    7789342     0 11633619064     0
   2214132     0     0  165215742    7719175     0 11531815268     0
   2255186     0     0  168221493    7816878     0 11679556820     0
<...>
}}}

nginx restarted.  Now top shows uneven load in new nginx workers:


{{{
last pid:  4557;  load averages: 16.89, 19.71, 17.60                    up 0+00:52:43  20:00:44
227 processes: 5 running, 222 sleeping
CPU:  4.8% user,  0.0% nice, 10.8% system, 21.1% interrupt, 63.2% idle
Mem: 2547M Active, 91G Inact, 3260K Laundry, 27G Wired, 1585M Buf, 2710M Free
Swap: 32G Total, 79M Used, 32G Free

  PID USERNAME    THR PRI NICE   SIZE    RES STATE   C   TIME    WCPU COMMAND
 4364 www           1  48   -4   270M   177M CPU18  18   0:50  40.25% nginx
 4371 www           1  39   -4   246M   183M CPU4    4   0:56  33.52% nginx
 4347 www           1  35   -4   236M   170M kqread  4   0:54  26.62% nginx
 4398 www           1  32   -4   230M   167M kqread 18   0:55  26.00% nginx
 4391 www           1  30   -4   216M   151M kqread 15   0:53  24.04% nginx
 4381 www           1  31   -4   214M   150M kqread  7   0:50  23.07% nginx
 4372 www           1  27   -4   220M   157M kqread 20   0:38  22.09% nginx
 4346 www           1  26   -4   228M   166M kqread  5   0:37  19.00% nginx
 4380 www           1  26   -4   232M   158M CPU26  26   0:48  18.59% nginx
 4363 www           1  25   -4   206M   144M kqread 20   0:31  17.16% nginx
 4396 www           1  26   -4   210M   147M kqread 14   0:34  16.60% nginx
 4402 www           1  24   -4   186M   123M kqread  3   0:27  14.66% nginx
 4345 www           1  24   -4   198M   136M kqread  6   0:30  14.58% nginx
 4349 www           1  24   -4   200M   137M kqread  6   0:31  14.30% nginx
 4365 www           1  20   -4   152M 91584K kqread  5   0:14  10.41% nginx
 4355 www           1  20   -4   138M 76796K kqread  7   0:12   7.28% nginx
 4351 www           1  20   -4   146M 84196K kqread 15   0:14   6.81% nginx
 4384 www           1  20   -4   120M 56260K kqread 21   0:06   4.76% nginx
 4400 www           1  20   -4   126M 63348K CPU17  17   0:08   4.49% nginx
 4383 www           1  20   -4   116M 53096K kqread  9   0:04   3.80% nginx
 4376 www           1  20   -4   116M 52824K kqread 20   0:04   3.76% nginx
 4397 www           1  20   -4   118M 53928K kqread 22   0:05   3.45% nginx
 4367 www           1  20   -4   112M 46252K kqread  9   0:03   3.32% nginx
 4368 www           1  20   -4   110M 43836K kqread  8   0:02   3.30% nginx
 4352 www           1  20   -4   112M 48280K kqread 22   0:03   2.84% nginx
 4373 www           1  20   -4   110M 42596K kqread 28   0:02   2.33% nginx
 4392 www           1  20   -4   112M 45620K kqread  5   0:03   2.14% nginx
 4377 www           1  20   -4   104M 31236K kqread 22   0:01   1.85% nginx
 4379 www           1  20   -4   108M 39072K kqread 30   0:02   1.68% nginx
 4361 www           1  20   -4   106M 36536K kqread 30   0:01   1.65% nginx
 4369 www           1  20   -4   108M 41188K kqread 20   0:02   1.63% nginx
 4388 www           1  20   -4   106M 36644K kqread  7   0:01   1.52% nginx
 4405 www           1  20   -4   106M 35744K kqread 13   0:01   1.32% nginx
 4395 www           1  20   -4   106M 37380K kqread 17   0:02   1.31% nginx
 4378 www           1  20   -4   104M 34352K kqread 13   0:01   1.28% nginx
 4407 www           1  20   -4   106M 39596K kqread  1   0:02   1.18% nginx
 4354 www           1  20   -4   106M 37312K kqread 29   0:02   1.17% nginx
 4382 www           1  20   -4   104M 31644K kqread  4   0:01   1.17% nginx
 4350 www           1  20   -4   104M 33508K kqread  8   0:01   0.94% nginx
 4401 www           1  20   -4   104M 30320K kqread 14   0:01   0.93% nginx
 4406 www           1  20   -4   104M 32572K kqread 12   0:01   0.92% nginx
 4408 www           1  20   -4   104M 31300K kqread 10   0:01   0.85% nginx
 4360 www           1  20   -4   106M 35236K kqread  1   0:01   0.72% nginx
 4362 www           1  20   -4   104M 32712K kqread 26   0:01   0.68% nginx
 4366 www           1  20   -4   102M 28800K kqread  3   0:00   0.65% nginx
 4353 www           1  20   -4   104M 30888K kqread 28   0:01   0.58% nginx
 4375 www           1  20   -4   104M 32060K kqread 18   0:01   0.57% nginx
 4403 www           1  20   -4   104M 29636K kqread  6   0:01   0.52% nginx
  568 root          1  20    0 10496K  2588K select 17   0:09   0.45% syslogd
 4385 www           1  20   -4   104M 29536K kqread  5   0:01   0.45% nginx
 4404 www           1  20   -4   102M 26208K kqread 22   0:00   0.44% nginx
 4387 www           1  20   -4   102M 22384K kqread  6   0:00   0.41% nginx
 4370 www           1  20   -4   102M 22328K kqread 10   0:00   0.41% nginx
 4399 www           1  20   -4   102M 23440K kqread 10   0:00   0.40% nginx
 4358 www           1  20   -4   102M 26216K kqread 12   0:00   0.38% nginx
 4390 www           1  20   -4   102M 26612K kqread 17   0:00   0.34% nginx
 4356 www           1  20   -4   102M 24788K kqread  7   0:00   0.33% nginx

}}}


The most popular workers are near the connection limit; the least popular have almost no connections:


{{{
c099.ord001.dev# lsof -n -p 4364 | grep TCP | wc -l
    4058
c099.ord001.dev# lsof -n -p 4380 | grep TCP | wc -l
    4058
c099.ord001.dev# lsof -n -p 4389 | grep TCP | wc -l
      77
c099.ord001.dev# ps augxw | grep 4389
www     4389    0.0  0.0 104392  27436  -  S<   19:57      0:00.60 nginx: worker process (nginx)
root    4618    0.0  0.0  14812   2484  0  S+   20:01      0:00.00 grep 4389

}}}

The uneven loading seems to persist indefinitely.  E.g., the connection count never seems to even out.

As a workaround, disabling the accept mutex helps, but causes other problems (kernel lock contention in kqueue).

"	defect	closed	minor		other	1.10.x	wontfix			FreeBSD c099.ord001.dev.ix.nflxvideo.net 12.0-CURRENT FreeBSD 12.0-CURRENT #0: Fri Feb 24 13:40:24 EST 2017     root@viserion:/data/NCD-3545/.obj/nano.nflx/data/NCD-3545/FreeBSD/sys/RED1  amd64	"nginx version: nginx/1.10.0
built with OpenSSL 1.0.2k-freebsd  26 Jan 2017
TLS SNI support enabled
configure arguments: --prefix=/usr/local/etc/nginx --with-cc-opt='-I /usr/local/include' --with-ld-opt='-L /usr/local/lib' --conf-path=/usr/local/etc/nginx/nginx.conf --sbin-path=/usr/local/sbin/nginx --pid-path=/var/run/nginx.pid --error-log-path=/var/log/nginx-error.log --user=www --group=www --add-module=/var/tmp/portswork/_.p/netflix/nginx-1.10/work/nginx-1.10.0/nflx_smooth --add-module=/var/tmp/portswork/_.p/netflix/nginx-1.10/work/nginx-1.10.0/nflx_geo_check --add-module=/var/tmp/portswork/_.p/netflix/nginx-1.10/work/nginx-1.10.0/nflx_signed_uri --add-module=/var/tmp/portswork/_.p/netflix/nginx-1.10/work/nginx-1.10.0/nflx_range_path --add-module=/var/tmp/portswork/_.p/netflix/nginx-1.10/work/nginx-1.10.0/nflx_variables --add-module=/var/tmp/portswork/_.p/netflix/nginx-1.10/work/nginx-1.10.0/nflx_nccp_conf --add-module=/var/tmp/portswork/_.p/netflix/nginx-1.10/work/nginx-1.10.0/nflx_logstat --add-module=/var/tmp/portswork/_.p/netflix/nginx-1.10/work/nginx-1.10.0/nflx_loghot --add-module=/var/tmp/portswork/_.p/netflix/nginx-1.10/work/nginx-1.10.0/nflx_status --add-module=/var/tmp/portswork/_.p/netflix/nginx-1.10/work/nginx-1.10.0/nflx_upstream_jdomain --add-module=/var/tmp/portswork/_.p/netflix/nginx-1.10/work/nginx-1.10.0/nflx_rorder --add-module=/var/tmp/portswork/_.p/netflix/nginx-1.10/work/nginx-1.10.0/nflx_delay --add-module=/var/tmp/portswork/_.p/netflix/nginx-1.10/work/nginx-1.10.0/nflx_sendfile_flags --with-debug --with-file-aio --with-ipv6 --http-client-body-temp-path=/var/tmp/nginx/client_body_temp --http-fastcgi-temp-path=/var/tmp/nginx/fastcgi_temp --http-proxy-temp-path=/var/tmp/nginx/proxy_temp --http-scgi-temp-path=/var/tmp/nginx/scgi_temp --http-uwsgi-temp-path=/var/tmp/nginx/uwsgi_temp --http-log-path=/var/log/nginx-access.log --with-http_v2_module --with-pcre --with-http_ssl_module --with-threads
"
