Opened 6 years ago

Closed 6 years ago

#1673 closed defect (invalid)

Nginx don't read upstream response in case upstream close connection.

Reported by: akayunov@… Owned by:
Priority: minor Milestone:
Component: nginx-core Version: 1.15.x
Keywords: Cc:
uname -a: Linux nginx-rj4gd 4.9.0-8-amd64 #1 SMP Debian 4.9.110-3+deb9u4 (2018-08-21) x86_64 GNU/Linux
nginx -V: nginx version: nginx/1.13.4
built by gcc 6.3.0 20170516 (Debian 6.3.0-18)
built with OpenSSL 1.1.0f 25 May 2017
TLS SNI support enabled
configure arguments: --prefix=/etc/nginx --sbin-path=/usr/sbin/nginx --modules-path=/usr/lib/nginx/modules --conf-path=/etc/nginx/nginx.conf --error-log-path=/var/log/nginx/error.log --http-log-path=/var/log/nginx/access.log --pid-path=/var/run/nginx.pid --lock-path=/var/run/nginx.lock --http-client-body-temp-path=/var/cache/nginx/client_temp --http-proxy-temp-path=/var/cache/nginx/proxy_temp --http-fastcgi-temp-path=/var/cache/nginx/fastcgi_temp --http-uwsgi-temp-path=/var/cache/nginx/uwsgi_temp --http-scgi-temp-path=/var/cache/nginx/scgi_temp --user=nginx --group=nginx --with-compat --with-file-aio --with-threads --with-http_addition_module --with-http_auth_request_module --with-http_dav_module --with-http_flv_module --with-http_gunzip_module --with-http_gzip_static_module --with-http_mp4_module --with-http_random_index_module --with-http_realip_module --with-http_secure_link_module --with-http_slice_module --with-http_ssl_module --with-http_stub_status_module --with-http_sub_module --with-http_v2_module --with-mail --with-mail_ssl_module --with-stream --with-stream_realip_module --with-stream_ssl_module --with-stream_ssl_preread_module --with-cc-opt='-g -O2 -fdebug-prefix-map=/data/builder/debuild/nginx-1.13.4/debian/debuild-base/nginx-1.13.4=. -specs=/usr/share/dpkg/no-pie-compile.specs -fstack-protector-strong -Wformat -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -fPIC' --with-ld-opt='-specs=/usr/share/dpkg/no-pie-link.specs -Wl,-z,relro -Wl,-z,now -Wl,--as-needed -pie'

Description

Hi,
we have very usual configuration nginx + uwsgi, and out tests sometimes failed by next reason.
We upload file with helps our application, and do some checks of these files in process of uploading. In case invalid file our application generate error and stop processing upload request. Below I show you strace output in case successful test:

epoll_wait(7, [{EPOLLIN|EPOLLOUT, {u32=1360470513, u64=139763891249649}}], 512, 75000) = 1
recvfrom(3, "PUT /u/c74e5ad22fd7467dbcbd14db1"..., 1024, 0, NULL, NULL) = 1024
recvfrom(3, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 8192, 0, NULL, NULL) = 8192
socket(AF_INET, SOCK_STREAM, IPPROTO_IP) = 12
ioctl(12, FIONBIO, [1])                 = 0
epoll_ctl(7, EPOLL_CTL_ADD, 12, {EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLET, {u32=1360470753, u64=139763891249889}}) = 0
connect(12, {sa_family=AF_INET, sin_port=htons(8080), sin_addr=inet_addr("192.168.127.91")}, 16) = -1 EINPROGRESS (Operation now in progress)
epoll_wait(7, [{EPOLLIN|EPOLLOUT, {u32=1360470513, u64=139763891249649}}, {EPOLLOUT, {u32=1360470753, u64=139763891249889}}], 512, 60000) = 2
getsockopt(12, SOL_SOCKET, SO_ERROR, [0], [4]) = 0
setsockopt(12, SOL_TCP, TCP_NODELAY, [1], 4) = 0
writev(12, [{iov_base="PUT /u/c74e5ad22fd7467dbcbd14db1"..., iov_len=378}, {iov_base="\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., iov_len=625}, {iov_base="\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., iov_len=8192}], 3) = 9195
recvfrom(3, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 8192, 0, NULL, NULL) = 8192
writev(12, [{iov_base="\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., iov_len=8192}], 1) = 8192
recvfrom(3, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 8192, 0, NULL, NULL) = 8192
...
----many such rows----
...
writev(12, [{iov_base="\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., iov_len=8192}], 1) = 8192
recvfrom(3, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 8192, 0, NULL, NULL) = 8192
writev(12, [{iov_base="\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., iov_len=8192}], 1) = 8192
recvfrom(3, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 8192, 0, NULL, NULL) = 8192
writev(12, [{iov_base="\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., iov_len=8192}], 1) = 2966
writev(12, [{iov_base="\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., iov_len=5226}], 1) = -1 EAGAIN (Resource temporarily unavailable)
epoll_wait(7, [{EPOLLIN|EPOLLOUT, {u32=1360470513, u64=139763891249649}}], 512, 60000) = 1
epoll_wait(7, [{EPOLLOUT, {u32=1360470753, u64=139763891249889}}], 512, 59993) = 1
writev(12, [{iov_base="\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., iov_len=5226}], 1) = 5226
recvfrom(3, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 8192, 0, NULL, NULL) = 8192
writev(12, [{iov_base="\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., iov_len=8192}], 1) = 8192
recvfrom(3, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 8192, 0, NULL, NULL) = 8192
...
----many such rows----
...
writev(12, [{iov_base="\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., iov_len=8192}], 1) = 8192
recvfrom(3, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 8192, 0, NULL, NULL) = 8192
writev(12, [{iov_base="\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., iov_len=8192}], 1) = 1908
writev(12, [{iov_base="\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., iov_len=6284}], 1) = -1 EAGAIN (Resource temporarily unavailable)
epoll_wait(7, [{EPOLLIN|EPOLLOUT, {u32=1360470513, u64=139763891249649}}], 512, 59910) = 1
epoll_wait(7, [{EPOLLIN|EPOLLOUT|EPOLLERR|EPOLLHUP|EPOLLRDHUP, {u32=1360470753, u64=139763891249889}}], 512, 59907) = 1
recvfrom(12, "HTTP/1.1 400 BAD REQUEST\r\nX-Offs"..., 4096, 0, NULL, NULL) = 171
readv(12, [{iov_base=0x556392a64f1b, iov_len=3925}], 1) = -1 ECONNRESET (Connection reset by peer)
gettid()                                = 7
sendto(13, "<171>Nov 14 01:30:56 nginx-rj4gd"..., 355, 0, NULL, 0) = 355
close(12)                               = 0
writev(3, [{iov_base="HTTP/1.1 400 BAD REQUEST\r\nServer"..., iov_len=247}], 1) = 247
shutdown(3, SHUT_WR)                    = 0
recvfrom(3, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 4096, 0, NULL, NULL) = 4096
...
----many such rows----
...
recvfrom(3, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 4096, 0, NULL, NULL) = 4096
recvfrom(3, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 4096, 0, NULL, NULL) = 3471
epoll_wait(7, [{EPOLLOUT, {u32=1360470513, u64=139763891249649}}], 512, 5000) = 1
epoll_wait(7, [{EPOLLIN|EPOLLOUT|EPOLLHUP|EPOLLRDHUP, {u32=1360470513, u64=139763891249649}}], 512, 4906) = 1
recvfrom(3, "", 4096, 0, NULL, NULL)    = 0
sendto(11, "<180>Nov 14 01:30:56 nginx-rj4gd"..., 184, 0, NULL, 0) = 184
close(3)                                = 0
epoll_wait(7,

As you can see nginx accepted connection and started process content, after he got second EAGAIN (Resource temporarily unavailable) he did epoll_wait and got READ event on socket, so he read upstream response and understood that request finished. It good!

But sometime our test failed and we saw next strace output:

epoll_wait(7, [{EPOLLIN|EPOLLOUT, {u32=1360470512, u64=139763891249648}}], 512, 75000) = 1
recvfrom(3, "PUT /u/13fc38b378ee4350a48b384fc"..., 1024, 0, NULL, NULL) = 1024
recvfrom(3, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 8192, 0, NULL, NULL) = 8192
socket(AF_INET, SOCK_STREAM, IPPROTO_IP) = 12
ioctl(12, FIONBIO, [1])                 = 0
epoll_ctl(7, EPOLL_CTL_ADD, 12, {EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLET, {u32=1360470753, u64=139763891249889}}) = 0
connect(12, {sa_family=AF_INET, sin_port=htons(8080), sin_addr=inet_addr("192.168.127.91")}, 16) = -1 EINPROGRESS (Operation now in progress)
epoll_wait(7, [{EPOLLIN|EPOLLOUT, {u32=1360470512, u64=139763891249648}}, {EPOLLOUT, {u32=1360470753, u64=139763891249889}}], 512, 60000) = 2
getsockopt(12, SOL_SOCKET, SO_ERROR, [0], [4]) = 0
setsockopt(12, SOL_TCP, TCP_NODELAY, [1], 4) = 0
writev(12, [{iov_base="PUT /u/13fc38b378ee4350a48b384fc"..., iov_len=378}, {iov_base="\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., iov_len=625}, {iov_base="\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., iov_len=8192}], 3) = 9195
recvfrom(3, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 8192, 0, NULL, NULL) = 8192
writev(12, [{iov_base="\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., iov_len=8192}], 1) = 8192
...
----many such rows----
...
writev(12, [{iov_base="\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., iov_len=8192}], 1) = 8192
recvfrom(3, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 8192, 0, NULL, NULL) = 8192
writev(12, [{iov_base="\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., iov_len=8192}], 1) = 2966
writev(12, [{iov_base="\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., iov_len=5226}], 1) = -1 EAGAIN (Resource temporarily unavailable)
epoll_wait(7, [{EPOLLIN|EPOLLOUT, {u32=1360470512, u64=139763891249648}}], 512, 59998) = 1
epoll_wait(7, [{EPOLLOUT, {u32=1360470753, u64=139763891249889}}], 512, 59980) = 1
writev(12, [{iov_base="\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., iov_len=5226}], 1) = 5226
recvfrom(3, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 8192, 0, NULL, NULL) = 8192
...
----many such rows----
...
writev(12, [{iov_base="\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., iov_len=8192}], 1) = 8192
recvfrom(3, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 8192, 0, NULL, NULL) = 8192
writev(12, [{iov_base="\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., iov_len=8192}], 1) = -1 ECONNRESET (Connection reset by peer)
gettid()                                = 7
socket(AF_INET, SOCK_DGRAM, IPPROTO_IP) = 13
ioctl(13, FIONBIO, [1])                 = 0
connect(13, {sa_family=AF_INET, sin_port=htons(514), sin_addr=inet_addr("192.168.127.132")}, 16) = 0
sendto(13, "<171>Nov 14 01:18:23 nginx-rj4gd"..., 367, 0, NULL, 0) = 367
close(12)                               = 0
writev(3, [{iov_base="HTTP/1.1 502 Bad Gateway\r\nServer"..., iov_len=165}, {iov_base="<html>\r\n<head><title>502 Bad Gat"..., iov_len=120}, {iov_base="<hr><center>nginx</center>\r\n</bo"..., iov_len=46}], 3) = 331
shutdown(3, SHUT_WR)                    = 0
recvfrom(3, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 4096, 0, NULL, NULL) = 4096
...
----many such rows----
...
recvfrom(3, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 4096, 0, NULL, NULL) = 4096
recvfrom(3, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 4096, 0, NULL, NULL) = 3471
epoll_wait(7, [{EPOLLOUT, {u32=1360470512, u64=139763891249648}}], 512, 5000) = 1
epoll_wait(7, [{EPOLLIN|EPOLLOUT|EPOLLHUP|EPOLLRDHUP, {u32=1360470512, u64=139763891249648}}], 512, 4654) = 1
recvfrom(3, "", 4096, 0, NULL, NULL)    = 0
sendto(11, "<180>Nov 14 01:18:23 nginx-rj4gd"..., 186, 0, NULL, 0) = 186
close(3) 

As you can see in this case nginx get ECONNRESET (Connection reset by peer), and didn't try to read response from upstream. He desided that connection was close abruptly and generate 502 error, but application in this case also generate same 400 BAD REQUEST error as in previous example. That error describe reason by that connection was closed, but didn't get this response on client side. So how could you explaine this behavior and how we can do some influence to stabilize system behavior?

Attachments (1)

nginx.conf (6.6 KB ) - added by akayunov@… 6 years ago.
nginx.conf

Download all attachments as: .zip

Change History (2)

by akayunov@…, 6 years ago

Attachment: nginx.conf added

nginx.conf

comment:1 by Maxim Dounin, 6 years ago

Resolution: invalid
Status: newclosed

Once the connection was reset, it is not possible to read anything from the socket. To avoid connections being reset, connections have to be closed in stages when further data are expected to come from the client. Quoting RFC 7230:

   If a server performs an immediate close of a TCP connection, there is
   a significant risk that the client will not be able to read the last
   HTTP response.  If the server receives additional data from the
   client on a fully closed connection, such as another request that was
   sent by the client before receiving the server's response, the
   server's TCP stack will send a reset packet to the client;
   unfortunately, the reset packet might erase the client's
   unacknowledged input buffers before they can be read and interpreted
   by the client's HTTP parser.

   To avoid the TCP reset problem, servers typically close a connection
   in stages.  First, the server performs a half-close by closing only
   the write side of the read/write connection.  The server then
   continues to read from the connection until it receives a
   corresponding close by the client, or until the server is reasonably
   certain that its own TCP stack has received the client's
   acknowledgement of the packet(s) containing the server's last
   response.  Finally, the server fully closes the connection.

To make it possible for nginx to receive the response, backend must either read the whole body, or implement proper connection teardown (in nginx, this is called lingering_close).

Note: See TracTickets for help on using tickets.