Joseph Jones
2018-10-11 12:30:15 UTC
I'm trying to find the root cause of failed workers. We have three Squid instances that act as transparent forward proxies, limiting internet connectivity for our network by doing URL whitelisting. Current throughput per instance is about 90MB/s. After a restart of Squid all workers seem to be working just fine, but after about an hour some of the workers fail, and they never come back until we do a complete restart. These are EC2 instances in AWS (c5.4xlarge), so we have 16 vCPUs to work with, but that's really 8 cores with 2 threads per core.
CPU and memory loads are small. Disk I/O could be a concern. We've run some load tests on a different instance with logging turned off, and we were able to get a higher throughput without worker failure. We don't have caching enabled, as most of our traffic is SSL anyway. I was hoping someone could point us in a direction we should take our testing, or, from the information I've given, tell us any obvious thing we may be doing wrong.
$ uptime
18:28:30 up 6 days, 2:13, 1 user, load average: 0.88, 1.10, 1.09
$ free -m
total used free shared buff/cache available
Mem: 30987 1728 25264 1156 3994 27523
Swap: 0 0 0
2018/10/10 18:19:42 kid2| Squid Cache (Version 4.1): Terminated abnormally.
CPU Usage: 0.036 seconds = 0.022 user + 0.014 sys
Maximum Resident Size: 92544 KB
Page faults with physical i/o: 0
2018/10/10 18:19:42 kid1| Closing HTTP(S) port [::]:3129
2018/10/10 18:19:42 kid1| Closing HTTP(S) port [::]:3128
2018/10/10 18:19:42 kid1| Closing HTTP(S) port [::]:3130
2018/10/10 18:19:42 kid1| storeDirWriteCleanLogs: Starting...
2018/10/10 18:19:42 kid1| Finished. Wrote 0 entries.
2018/10/10 18:19:42 kid1| Took 0.00 seconds ( 0.00 entries/sec).
2018/10/10 18:19:42 kid1| FATAL: kid1 registration timed out
2018/10/10 18:19:42 kid1| Squid Cache (Version 4.1): Terminated abnormally.
CPU Usage: 0.034 seconds = 0.021 user + 0.013 sys
Maximum Resident Size: 92544 KB
Page faults with physical i/o: 0
$ cat /etc/redhat-release
CentOS Linux release 7.5.1804 (Core)
$ squid -v
Squid Cache: Version 4.1
Service Name: squid
This binary uses OpenSSL 1.0.2k-fips 26 Jan 2017. For legal restrictions on distribution see https://www.openssl.org/source/license.html
configure options: '--build=x86_64-redhat-linux-gnu' '--host=x86_64-redhat-linux-gnu' '--program-prefix=' '--prefix=/usr' '--exec-prefix=/usr' '--bindir=/usr/bin' '--sbindir=/usr/sbin' '--sysconfdir=/etc' '--datadir=/usr/share' '--includedir=/usr/include' '--libdir=/usr/lib64' '--libexecdir=/usr/libexec' '--sharedstatedir=/var/lib' '--mandir=/usr/share/man' '--infodir=/usr/share/info' '--exec_prefix=/usr' '--libexecdir=/usr/lib64/squid' '--localstatedir=/var' '--datadir=/usr/share/squid' '--sysconfdir=/etc/squid' '--with-logdir=$(localstatedir)/log/squid' '--with-pidfile=$(localstatedir)/run/squid.pid' '--disable-dependency-tracking' '--enable-follow-x-forwarded-for' '--enable-auth' '--enable-auth-basic=DB,LDAP,NCSA,NIS,PAM,POP3,RADIUS,SASL,SMB,getpwnam,fake' '--enable-auth-ntlm=fake' '--enable-auth-digest=file,LDAP,eDirectory' '--enable-auth-negotiate=kerberos,wrapper' '--enable-external-acl-helpers=wbinfo_group,kerberos_ldap_group,LDAP_group,delayer,file_userip,SQL_session,unix_group,session,time_quota' '--enable-cache-digests' '--enable-cachemgr-hostname=localhost' '--enable-delay-pools' '--enable-epoll' '--enable-smp' '--enable-icap-client' '--enable-ident-lookups' '--enable-linux-netfilter' '--enable-removal-policies=heap,lru' '--enable-snmp' '--enable-storeio=aufs,diskd,ufs,rock' '--enable-wccpv2' '--enable-esi' '--enable-security-cert-generators' '--enable-security-cert-validators' '--enable-icmp' '--with-aio' '--with-default-user=squid' '--with-filedescriptors=16384' '--with-dl' '--with-openssl' '--enable-ssl-crtd' '--with-pthreads' '--with-included-ltdl' '--disable-arch-native' '--enable-ecap' '--without-nettle' 'build_alias=x86_64-redhat-linux-gnu' 'host_alias=x86_64-redhat-linux-gnu' 'CFLAGS=-O2 -g -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -fexceptions -fstack-protector-strong --param=ssp-buffer-size=4 -grecord-gcc-switches -m64 -mtune=generic' 'LDFLAGS=-Wl,-z,relro ' 'CXXFLAGS=-O2 -g -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -fexceptions -fstack-protector-strong 
--param=ssp-buffer-size=4 -grecord-gcc-switches -m64 -mtune=generic -fPIC' 'PKG_CONFIG_PATH=:/usr/lib64/pkgconfig:/usr/share/pkgconfig' --enable-ltdl-convenience
$ cat /etc/squid/squid.conf
workers 12
hopeless_kid_revival_delay 5 minute
# Default 'squid' logformat with request size and TLS SNI added
logformat ean_squid %ts.%03tu %6tr %>a %Ss/%03>Hs %>st %<st %rm %ru %ssl::>sni %[un %Sh/%<a %mt
logfile_rotate 0
access_log daemon:/var/log/squid/access.log logformat=ean_squid
debug_options ALL,1
# Only allow cachemgr access from localhost
http_access allow localhost manager
http_access deny manager
acl localnet src 10.26.128.0/21
acl SSL_ports port 443
acl Safe_ports port 80 # http
acl Safe_ports port 443 # https
acl CONNECT method CONNECT
#
# Recommended minimum Access Permission configuration:
#
# Deny requests to certain unsafe ports
http_access deny !Safe_ports
# Deny CONNECT to other than secure SSL ports
http_access deny CONNECT !SSL_ports
# Allow requests from the local network (see acl at the top)
http_access allow localnet
#
# INSERT YOUR OWN RULE(S) HERE TO ALLOW ACCESS FROM YOUR CLIENTS
#
# Just for debugging
# debug_options ALL,1 33,2 rotate=0
acl https_whitelist ssl::server_name "/etc/squid/whitelist.txt"
acl http_whitelist dstdomain "/etc/squid/whitelist.txt"
acl step1 at_step SslBump1
acl step2 at_step SslBump2
acl step3 at_step SslBump3
http_access allow http_whitelist
ssl_bump peek step1 all
ssl_bump peek step2 https_whitelist
ssl_bump splice step3 https_whitelist
ssl_bump terminate step2 all
# disable caching
cache deny all
# And finally deny all other access to this proxy
http_access deny all
# Squid normally listens to port 3128
http_port 3129 intercept
http_port 3128
https_port 3130 cert=/etc/pki/tls/certs/squid.pem key=/etc/pki/tls/private/squid.key ssl-bump intercept
visible_hostname squid
# Uncomment and adjust the following to add a disk cache directory.
#cache_dir ufs /var/spool/squid 100 16 256
# Leave coredumps in the first cache dir
coredump_dir /var/spool/squid
#
# Add any of your own refresh_pattern entries above these.
#
refresh_pattern ^ftp: 1440 20% 10080
refresh_pattern ^gopher: 1440 0% 1440
refresh_pattern -i (/cgi-bin/|\?) 0 0% 0
refresh_pattern . 0 20% 4320
--
Joseph M Jones
Senior Application Engineer
Expedia Partner Solutions
CPU and memory loads are small. Disk I/O could be a concern. We've run some load tests on a different instance with logging turned off, and we were able to get a higher throughput without worker failure. We don't have caching enabled, as most of our traffic is SSL anyway. I was hoping someone could point us in a direction we should take our testing, or, from the information I've given, tell us any obvious thing we may be doing wrong.
$ uptime
18:28:30 up 6 days, 2:13, 1 user, load average: 0.88, 1.10, 1.09
$ free -m
total used free shared buff/cache available
Mem: 30987 1728 25264 1156 3994 27523
Swap: 0 0 0
2018/10/10 18:19:42 kid2| Squid Cache (Version 4.1): Terminated abnormally.
CPU Usage: 0.036 seconds = 0.022 user + 0.014 sys
Maximum Resident Size: 92544 KB
Page faults with physical i/o: 0
2018/10/10 18:19:42 kid1| Closing HTTP(S) port [::]:3129
2018/10/10 18:19:42 kid1| Closing HTTP(S) port [::]:3128
2018/10/10 18:19:42 kid1| Closing HTTP(S) port [::]:3130
2018/10/10 18:19:42 kid1| storeDirWriteCleanLogs: Starting...
2018/10/10 18:19:42 kid1| Finished. Wrote 0 entries.
2018/10/10 18:19:42 kid1| Took 0.00 seconds ( 0.00 entries/sec).
2018/10/10 18:19:42 kid1| FATAL: kid1 registration timed out
2018/10/10 18:19:42 kid1| Squid Cache (Version 4.1): Terminated abnormally.
CPU Usage: 0.034 seconds = 0.021 user + 0.013 sys
Maximum Resident Size: 92544 KB
Page faults with physical i/o: 0
$ cat /etc/redhat-release
CentOS Linux release 7.5.1804 (Core)
$ squid -v
Squid Cache: Version 4.1
Service Name: squid
This binary uses OpenSSL 1.0.2k-fips 26 Jan 2017. For legal restrictions on distribution see https://www.openssl.org/source/license.html
configure options: '--build=x86_64-redhat-linux-gnu' '--host=x86_64-redhat-linux-gnu' '--program-prefix=' '--prefix=/usr' '--exec-prefix=/usr' '--bindir=/usr/bin' '--sbindir=/usr/sbin' '--sysconfdir=/etc' '--datadir=/usr/share' '--includedir=/usr/include' '--libdir=/usr/lib64' '--libexecdir=/usr/libexec' '--sharedstatedir=/var/lib' '--mandir=/usr/share/man' '--infodir=/usr/share/info' '--exec_prefix=/usr' '--libexecdir=/usr/lib64/squid' '--localstatedir=/var' '--datadir=/usr/share/squid' '--sysconfdir=/etc/squid' '--with-logdir=$(localstatedir)/log/squid' '--with-pidfile=$(localstatedir)/run/squid.pid' '--disable-dependency-tracking' '--enable-follow-x-forwarded-for' '--enable-auth' '--enable-auth-basic=DB,LDAP,NCSA,NIS,PAM,POP3,RADIUS,SASL,SMB,getpwnam,fake' '--enable-auth-ntlm=fake' '--enable-auth-digest=file,LDAP,eDirectory' '--enable-auth-negotiate=kerberos,wrapper' '--enable-external-acl-helpers=wbinfo_group,kerberos_ldap_group,LDAP_group,delayer,file_userip,SQL_session,unix_group,session,time_quota' '--enable-cache-digests' '--enable-cachemgr-hostname=localhost' '--enable-delay-pools' '--enable-epoll' '--enable-smp' '--enable-icap-client' '--enable-ident-lookups' '--enable-linux-netfilter' '--enable-removal-policies=heap,lru' '--enable-snmp' '--enable-storeio=aufs,diskd,ufs,rock' '--enable-wccpv2' '--enable-esi' '--enable-security-cert-generators' '--enable-security-cert-validators' '--enable-icmp' '--with-aio' '--with-default-user=squid' '--with-filedescriptors=16384' '--with-dl' '--with-openssl' '--enable-ssl-crtd' '--with-pthreads' '--with-included-ltdl' '--disable-arch-native' '--enable-ecap' '--without-nettle' 'build_alias=x86_64-redhat-linux-gnu' 'host_alias=x86_64-redhat-linux-gnu' 'CFLAGS=-O2 -g -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -fexceptions -fstack-protector-strong --param=ssp-buffer-size=4 -grecord-gcc-switches -m64 -mtune=generic' 'LDFLAGS=-Wl,-z,relro ' 'CXXFLAGS=-O2 -g -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -fexceptions -fstack-protector-strong 
--param=ssp-buffer-size=4 -grecord-gcc-switches -m64 -mtune=generic -fPIC' 'PKG_CONFIG_PATH=:/usr/lib64/pkgconfig:/usr/share/pkgconfig' --enable-ltdl-convenience
$ cat /etc/squid/squid.conf
workers 12
hopeless_kid_revival_delay 5 minute
# Default 'squid' logformat with request size and TLS SNI added
logformat ean_squid %ts.%03tu %6tr %>a %Ss/%03>Hs %>st %<st %rm %ru %ssl::>sni %[un %Sh/%<a %mt
logfile_rotate 0
access_log daemon:/var/log/squid/access.log logformat=ean_squid
debug_options ALL,1
# Only allow cachemgr access from localhost
http_access allow localhost manager
http_access deny manager
acl localnet src 10.26.128.0/21
acl SSL_ports port 443
acl Safe_ports port 80 # http
acl Safe_ports port 443 # https
acl CONNECT method CONNECT
#
# Recommended minimum Access Permission configuration:
#
# Deny requests to certain unsafe ports
http_access deny !Safe_ports
# Deny CONNECT to other than secure SSL ports
http_access deny CONNECT !SSL_ports
# Allow requests from the local network (see acl at the top)
http_access allow localnet
#
# INSERT YOUR OWN RULE(S) HERE TO ALLOW ACCESS FROM YOUR CLIENTS
#
# Just for debugging
# debug_options ALL,1 33,2 rotate=0
acl https_whitelist ssl::server_name "/etc/squid/whitelist.txt"
acl http_whitelist dstdomain "/etc/squid/whitelist.txt"
acl step1 at_step SslBump1
acl step2 at_step SslBump2
acl step3 at_step SslBump3
http_access allow http_whitelist
ssl_bump peek step1 all
ssl_bump peek step2 https_whitelist
ssl_bump splice step3 https_whitelist
ssl_bump terminate step2 all
# disable caching
cache deny all
# And finally deny all other access to this proxy
http_access deny all
# Squid normally listens to port 3128
http_port 3129 intercept
http_port 3128
https_port 3130 cert=/etc/pki/tls/certs/squid.pem key=/etc/pki/tls/private/squid.key ssl-bump intercept
visible_hostname squid
# Uncomment and adjust the following to add a disk cache directory.
#cache_dir ufs /var/spool/squid 100 16 256
# Leave coredumps in the first cache dir
coredump_dir /var/spool/squid
#
# Add any of your own refresh_pattern entries above these.
#
refresh_pattern ^ftp: 1440 20% 10080
refresh_pattern ^gopher: 1440 0% 1440
refresh_pattern -i (/cgi-bin/|\?) 0 0% 0
refresh_pattern . 0 20% 4320
--
Joseph M Jones
Senior Application Engineer
Expedia Partner Solutions